Diffstat
-rw-r--r--  src/os/CMakeLists.txt | 111
-rw-r--r--  src/os/FuseStore.cc | 1286
-rw-r--r--  src/os/FuseStore.h | 54
-rw-r--r--  src/os/ObjectMap.h | 172
-rw-r--r--  src/os/ObjectStore.cc | 133
-rw-r--r--  src/os/ObjectStore.h | 777
-rw-r--r--  src/os/Transaction.cc | 582
-rw-r--r--  src/os/Transaction.h | 1297
-rw-r--r--  src/os/bluestore/Allocator.cc | 212
-rw-r--r--  src/os/bluestore/Allocator.h | 98
-rw-r--r--  src/os/bluestore/AvlAllocator.cc | 474
-rw-r--r--  src/os/bluestore/AvlAllocator.h | 280
-rw-r--r--  src/os/bluestore/BitmapAllocator.cc | 104
-rw-r--r--  src/os/bluestore/BitmapAllocator.h | 60
-rw-r--r--  src/os/bluestore/BitmapFreelistManager.cc | 606
-rw-r--r--  src/os/bluestore/BitmapFreelistManager.h | 99
-rw-r--r--  src/os/bluestore/BlueFS.cc | 3903
-rw-r--r--  src/os/bluestore/BlueFS.h | 707
-rw-r--r--  src/os/bluestore/BlueRocksEnv.cc | 594
-rw-r--r--  src/os/bluestore/BlueRocksEnv.h | 156
-rw-r--r--  src/os/bluestore/BlueStore.cc | 16955
-rw-r--r--  src/os/bluestore/BlueStore.h | 3932
-rw-r--r--  src/os/bluestore/FreelistManager.cc | 46
-rw-r--r--  src/os/bluestore/FreelistManager.h | 61
-rw-r--r--  src/os/bluestore/HybridAllocator.cc | 226
-rw-r--r--  src/os/bluestore/HybridAllocator.h | 53
-rw-r--r--  src/os/bluestore/StupidAllocator.cc | 371
-rw-r--r--  src/os/bluestore/StupidAllocator.h | 73
-rw-r--r--  src/os/bluestore/ZonedAllocator.cc | 176
-rw-r--r--  src/os/bluestore/ZonedAllocator.h | 91
-rw-r--r--  src/os/bluestore/ZonedFreelistManager.cc | 315
-rw-r--r--  src/os/bluestore/ZonedFreelistManager.h | 106
-rw-r--r--  src/os/bluestore/bluefs_types.cc | 285
-rw-r--r--  src/os/bluestore/bluefs_types.h | 320
-rwxr-xr-x  src/os/bluestore/bluestore_common.h | 65
-rw-r--r--  src/os/bluestore/bluestore_tool.cc | 1064
-rw-r--r--  src/os/bluestore/bluestore_types.cc | 1279
-rw-r--r--  src/os/bluestore/bluestore_types.h | 1368
-rw-r--r--  src/os/bluestore/fastbmap_allocator_impl.cc | 717
-rw-r--r--  src/os/bluestore/fastbmap_allocator_impl.h | 845
-rw-r--r--  src/os/bluestore/zoned_types.cc | 24
-rw-r--r--  src/os/bluestore/zoned_types.h | 44
-rw-r--r--  src/os/filestore/BtrfsFileStoreBackend.cc | 590
-rw-r--r--  src/os/filestore/BtrfsFileStoreBackend.h | 49
-rw-r--r--  src/os/filestore/CollectionIndex.h | 208
-rw-r--r--  src/os/filestore/DBObjectMap.cc | 1424
-rw-r--r--  src/os/filestore/DBObjectMap.h | 584
-rw-r--r--  src/os/filestore/FDCache.h | 110
-rw-r--r--  src/os/filestore/FileJournal.cc | 2234
-rw-r--r--  src/os/filestore/FileJournal.h | 546
-rw-r--r--  src/os/filestore/FileStore.cc | 6449
-rw-r--r--  src/os/filestore/FileStore.h | 944
-rw-r--r--  src/os/filestore/GenericFileStoreBackend.cc | 475
-rw-r--r--  src/os/filestore/GenericFileStoreBackend.h | 75
-rw-r--r--  src/os/filestore/HashIndex.cc | 1226
-rw-r--r--  src/os/filestore/HashIndex.h | 460
-rw-r--r--  src/os/filestore/IndexManager.cc | 157
-rw-r--r--  src/os/filestore/IndexManager.h | 99
-rw-r--r--  src/os/filestore/Journal.h | 94
-rw-r--r--  src/os/filestore/JournalThrottle.cc | 67
-rw-r--r--  src/os/filestore/JournalThrottle.h | 102
-rw-r--r--  src/os/filestore/JournalingObjectStore.cc | 279
-rw-r--r--  src/os/filestore/JournalingObjectStore.h | 145
-rw-r--r--  src/os/filestore/LFNIndex.cc | 1438
-rw-r--r--  src/os/filestore/LFNIndex.h | 614
-rw-r--r--  src/os/filestore/SequencerPosition.h | 59
-rw-r--r--  src/os/filestore/WBThrottle.cc | 276
-rw-r--r--  src/os/filestore/WBThrottle.h | 188
-rw-r--r--  src/os/filestore/XfsFileStoreBackend.cc | 149
-rw-r--r--  src/os/filestore/XfsFileStoreBackend.h | 36
-rw-r--r--  src/os/filestore/ZFSFileStoreBackend.cc | 258
-rw-r--r--  src/os/filestore/ZFSFileStoreBackend.h | 33
-rw-r--r--  src/os/filestore/chain_xattr.cc | 415
-rw-r--r--  src/os/filestore/chain_xattr.h | 182
-rw-r--r--  src/os/filestore/os_xattr.c | 278
-rw-r--r--  src/os/filestore/os_xattr.h | 46
-rw-r--r--  src/os/fs/FS.cc | 186
-rw-r--r--  src/os/fs/FS.h | 50
-rw-r--r--  src/os/fs/XFS.cc | 55
-rw-r--r--  src/os/fs/XFS.h | 31
-rw-r--r--  src/os/fs/ZFS.cc | 83
-rw-r--r--  src/os/fs/ZFS.h | 39
-rw-r--r--  src/os/fs/btrfs_ioctl.h | 201
-rw-r--r--  src/os/kstore/KStore.cc | 3408
-rw-r--r--  src/os/kstore/KStore.h | 698
-rw-r--r--  src/os/kstore/kstore_types.cc | 106
-rw-r--r--  src/os/kstore/kstore_types.h | 68
-rw-r--r--  src/os/kv.h | 76
-rw-r--r--  src/os/memstore/MemStore.cc | 1800
-rw-r--r--  src/os/memstore/MemStore.h | 406
-rw-r--r--  src/os/memstore/PageSet.h | 232
-rw-r--r--  src/osd/CMakeLists.txt | 75
-rw-r--r--  src/osd/ClassHandler.cc | 350
-rw-r--r--  src/osd/ClassHandler.h | 126
-rw-r--r--  src/osd/DynamicPerfStats.h | 267
-rw-r--r--  src/osd/ECBackend.cc | 2637
-rw-r--r--  src/osd/ECBackend.h | 686
-rw-r--r--  src/osd/ECMsgTypes.cc | 393
-rw-r--r--  src/osd/ECMsgTypes.h | 140
-rw-r--r--  src/osd/ECTransaction.cc | 670
-rw-r--r--  src/osd/ECTransaction.h | 200
-rw-r--r--  src/osd/ECUtil.cc | 248
-rw-r--r--  src/osd/ECUtil.h | 169
-rw-r--r--  src/osd/ExtentCache.cc | 245
-rw-r--r--  src/osd/ExtentCache.h | 489
-rw-r--r--  src/osd/HitSet.cc | 256
-rw-r--r--  src/osd/HitSet.h | 455
-rw-r--r--  src/osd/MissingLoc.cc | 226
-rw-r--r--  src/osd/MissingLoc.h | 353
-rw-r--r--  src/osd/OSD.cc | 11378
-rw-r--r--  src/osd/OSD.h | 2152
-rw-r--r--  src/osd/OSDCap.cc | 532
-rw-r--r--  src/osd/OSDCap.h | 261
-rw-r--r--  src/osd/OSDMap.cc | 6412
-rw-r--r--  src/osd/OSDMap.h | 1600
-rw-r--r--  src/osd/OSDMapMapping.cc | 207
-rw-r--r--  src/osd/OSDMapMapping.h | 352
-rw-r--r--  src/osd/ObjectVersioner.h | 35
-rw-r--r--  src/osd/OpRequest.cc | 170
-rw-r--r--  src/osd/OpRequest.h | 200
-rw-r--r--  src/osd/PG.cc | 2753
-rw-r--r--  src/osd/PG.h | 1341
-rw-r--r--  src/osd/PGBackend.cc | 1324
-rw-r--r--  src/osd/PGBackend.h | 641
-rw-r--r--  src/osd/PGLog.cc | 1189
-rw-r--r--  src/osd/PGLog.h | 1697
-rw-r--r--  src/osd/PGPeeringEvent.cc | 17
-rw-r--r--  src/osd/PGPeeringEvent.h | 220
-rw-r--r--  src/osd/PGStateUtils.cc | 57
-rw-r--r--  src/osd/PGStateUtils.h | 85
-rw-r--r--  src/osd/PGTransaction.h | 601
-rw-r--r--  src/osd/PeeringState.cc | 7607
-rw-r--r--  src/osd/PeeringState.h | 2442
-rw-r--r--  src/osd/PrimaryLogPG.cc | 15470
-rw-r--r--  src/osd/PrimaryLogPG.h | 1969
-rw-r--r--  src/osd/PrimaryLogScrub.cc | 589
-rw-r--r--  src/osd/PrimaryLogScrub.h | 71
-rw-r--r--  src/osd/ReplicatedBackend.cc | 2425
-rw-r--r--  src/osd/ReplicatedBackend.h | 437
-rw-r--r--  src/osd/ScrubStore.cc | 198
-rw-r--r--  src/osd/ScrubStore.h | 52
-rw-r--r--  src/osd/Session.cc | 106
-rw-r--r--  src/osd/Session.h | 240
-rw-r--r--  src/osd/SnapMapper.cc | 752
-rw-r--r--  src/osd/SnapMapper.h | 338
-rw-r--r--  src/osd/TierAgentState.h | 128
-rw-r--r--  src/osd/Watch.cc | 550
-rw-r--r--  src/osd/Watch.h | 291
-rw-r--r--  src/osd/error_code.cc | 105
-rw-r--r--  src/osd/error_code.h | 53
-rw-r--r--  src/osd/objclass.cc | 702
-rw-r--r--  src/osd/object_state.h | 190
-rw-r--r--  src/osd/osd_internal_types.h | 320
-rw-r--r--  src/osd/osd_op_util.cc | 263
-rw-r--r--  src/osd/osd_op_util.h | 83
-rw-r--r--  src/osd/osd_perf_counters.cc | 321
-rw-r--r--  src/osd/osd_perf_counters.h | 163
-rw-r--r--  src/osd/osd_types.cc | 7212
-rw-r--r--  src/osd/osd_types.h | 6568
-rw-r--r--  src/osd/pg_scrubber.cc | 2384
-rw-r--r--  src/osd/pg_scrubber.h | 821
-rw-r--r--  src/osd/recovery_types.cc | 16
-rw-r--r--  src/osd/recovery_types.h | 95
-rw-r--r--  src/osd/scheduler/OpScheduler.cc | 56
-rw-r--r--  src/osd/scheduler/OpScheduler.h | 147
-rw-r--r--  src/osd/scheduler/OpSchedulerItem.cc | 259
-rw-r--r--  src/osd/scheduler/OpSchedulerItem.h | 629
-rw-r--r--  src/osd/scheduler/mClockScheduler.cc | 514
-rw-r--r--  src/osd/scheduler/mClockScheduler.h | 204
-rw-r--r--  src/osd/scrub_machine.cc | 534
-rw-r--r--  src/osd/scrub_machine.h | 344
-rw-r--r--  src/osd/scrub_machine_lstnr.h | 164
-rw-r--r--  src/osd/scrubber_common.h | 299
-rw-r--r--  src/osdc/CMakeLists.txt | 11
-rw-r--r--  src/osdc/Filer.cc | 487
-rw-r--r--  src/osdc/Filer.h | 300
-rw-r--r--  src/osdc/Journaler.cc | 1607
-rw-r--r--  src/osdc/Journaler.h | 542
-rw-r--r--  src/osdc/ObjectCacher.cc | 2807
-rw-r--r--  src/osdc/ObjectCacher.h | 781
-rw-r--r--  src/osdc/Objecter.cc | 5344
-rw-r--r--  src/osdc/Objecter.h | 3910
-rw-r--r--  src/osdc/Striper.cc | 537
-rw-r--r--  src/osdc/Striper.h | 132
-rw-r--r--  src/osdc/StriperTypes.h | 48
-rw-r--r--  src/osdc/WritebackHandler.h | 57
-rw-r--r--  src/osdc/error_code.cc | 159
-rw-r--r--  src/osdc/error_code.h | 55
188 files changed, 181946 insertions, 0 deletions
diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt
new file mode 100644
index 000000000..9008f2ed8
--- /dev/null
+++ b/src/os/CMakeLists.txt
@@ -0,0 +1,111 @@
+set(libos_srcs
+ ObjectStore.cc
+ Transaction.cc
+ filestore/chain_xattr.cc
+ filestore/BtrfsFileStoreBackend.cc
+ filestore/DBObjectMap.cc
+ filestore/FileJournal.cc
+ filestore/FileStore.cc
+ filestore/JournalThrottle.cc
+ filestore/GenericFileStoreBackend.cc
+ filestore/JournalingObjectStore.cc
+ filestore/HashIndex.cc
+ filestore/IndexManager.cc
+ filestore/LFNIndex.cc
+ filestore/WBThrottle.cc
+ filestore/os_xattr.c
+ memstore/MemStore.cc
+ kstore/KStore.cc
+ kstore/kstore_types.cc
+ fs/FS.cc)
+
+if(WITH_BLUESTORE)
+ list(APPEND libos_srcs
+ bluestore/Allocator.cc
+ bluestore/BitmapFreelistManager.cc
+ bluestore/BlueFS.cc
+ bluestore/bluefs_types.cc
+ bluestore/BlueRocksEnv.cc
+ bluestore/BlueStore.cc
+ bluestore/bluestore_types.cc
+ bluestore/fastbmap_allocator_impl.cc
+ bluestore/FreelistManager.cc
+ bluestore/StupidAllocator.cc
+ bluestore/BitmapAllocator.cc
+ bluestore/AvlAllocator.cc
+ bluestore/HybridAllocator.cc
+ )
+endif(WITH_BLUESTORE)
+
+if(WITH_ZBD)
+ list(APPEND libos_srcs
+ bluestore/zoned_types.cc
+ bluestore/ZonedFreelistManager.cc
+ bluestore/ZonedAllocator.cc)
+endif()
+
+if(WITH_FUSE)
+ list(APPEND libos_srcs
+ FuseStore.cc)
+endif(WITH_FUSE)
+
+if(HAVE_LIBXFS)
+ list(APPEND libos_srcs
+ filestore/XfsFileStoreBackend.cc
+ fs/XFS.cc)
+endif()
+
+if(HAVE_LIBZFS)
+ add_library(os_zfs_objs OBJECT
+ filestore/ZFSFileStoreBackend.cc
+ fs/ZFS.cc)
+ target_include_directories(os_zfs_objs SYSTEM PRIVATE
+ ${ZFS_INCLUDE_DIRS})
+ list(APPEND libos_srcs $<TARGET_OBJECTS:os_zfs_objs>)
+endif()
+
+add_library(os STATIC ${libos_srcs})
+target_link_libraries(os blk)
+
+target_link_libraries(os heap_profiler kv)
+
+if(WITH_BLUEFS)
+ add_library(bluefs SHARED
+ bluestore/BlueRocksEnv.cc)
+ target_include_directories(bluefs SYSTEM PUBLIC
+ $<TARGET_PROPERTY:RocksDB::RocksDB,INTERFACE_INCLUDE_DIRECTORIES>)
+ target_link_libraries(bluefs global)
+ install(TARGETS bluefs DESTINATION lib)
+endif(WITH_BLUEFS)
+
+if(WITH_FUSE)
+ target_link_libraries(os FUSE::FUSE)
+endif()
+
+if(HAVE_LIBZFS)
+ target_link_libraries(os ${ZFS_LIBRARIES})
+endif()
+
+if(WITH_LTTNG)
+ add_dependencies(os objectstore-tp)
+ add_dependencies(os bluestore-tp)
+endif()
+
+if(WITH_JAEGER)
+ target_link_libraries(os jaeger-base)
+endif()
+
+target_link_libraries(os kv)
+
+add_dependencies(os compressor_plugins)
+add_dependencies(os crypto_plugins)
+
+
+if(WITH_BLUESTORE)
+ add_executable(ceph-bluestore-tool
+ bluestore/bluestore_tool.cc)
+ target_link_libraries(ceph-bluestore-tool
+ os global)
+ install(TARGETS ceph-bluestore-tool
+ DESTINATION bin)
+endif()
diff --git a/src/os/FuseStore.cc b/src/os/FuseStore.cc
new file mode 100644
index 000000000..e06131ce3
--- /dev/null
+++ b/src/os/FuseStore.cc
@@ -0,0 +1,1286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "include/ceph_fuse.h"
+#include "FuseStore.h"
+#include "os/ObjectStore.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+
+#include <fuse_lowlevel.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h> /* Definition of AT_* constants */
+#include <sys/stat.h>
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#define dout_context store->cct
+#define dout_subsys ceph_subsys_fuse
+#include "common/debug.h"
+#undef dout_prefix
+#define dout_prefix *_dout << "fuse "
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+
+// some fuse-y bits of state
+struct fs_info {
+ struct fuse_args args;
+ struct fuse *f;
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ struct fuse_chan *ch;
+#endif
+ char *mountpoint;
+};
+
+int FuseStore::open_file(string p, struct fuse_file_info *fi,
+ std::function<int(bufferlist *bl)> f)
+{
+ if (open_files.count(p)) {
+ OpenFile *o = open_files[p];
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ ++o->ref;
+ return 0;
+ }
+ bufferlist bl;
+ int r = f(&bl);
+ if (r < 0) {
+ return r;
+ }
+ OpenFile *o = new OpenFile;
+ o->path = p;
+ o->bl = std::move(bl);
+ open_files[p] = o;
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ ++o->ref;
+ return 0;
+}
+
+FuseStore::FuseStore(ObjectStore *s, string p)
+ : store(s),
+ mount_point(p),
+ fuse_thread(this)
+{
+ info = new fs_info();
+}
+
+FuseStore::~FuseStore()
+{
+ delete info;
+}
+
+/*
+ * / - root directory
+ * $cid/
+ * $cid/type - objectstore type
+ * $cid/bitwise_hash_start = lowest hash value
+ * $cid/bitwise_hash_end = highest hash value
+ * $cid/bitwise_hash_bits - how many bits are significant
+ * $cid/pgmeta/ - pgmeta object
+ * $cid/all/ - all objects
+ * $cid/all/$obj/
+ * $cid/all/$obj/bitwise_hash
+ * $cid/all/$obj/data
+ * $cid/all/$obj/omap/$key
+ * $cid/all/$obj/attr/$name
+ * $cid/by_bitwise_hash/$hash/$bits/$obj - all objects with this (bitwise) hash (prefix)
+ */
+enum {
+ FN_ROOT = 1,
+ FN_TYPE,
+ FN_COLLECTION,
+ FN_HASH_START,
+ FN_HASH_END,
+ FN_HASH_BITS,
+ FN_OBJECT,
+ FN_OBJECT_HASH,
+ FN_OBJECT_DATA,
+ FN_OBJECT_OMAP_HEADER,
+ FN_OBJECT_OMAP,
+ FN_OBJECT_OMAP_VAL,
+ FN_OBJECT_ATTR,
+ FN_OBJECT_ATTR_VAL,
+ FN_ALL,
+ FN_HASH_DIR,
+ FN_HASH_VAL,
+};
+
+static int parse_fn(CephContext* cct, const char *path, coll_t *cid,
+ ghobject_t *oid, string *key,
+ uint32_t *hash, uint32_t *hash_bits)
+{
+ list<string> v;
+ for (const char *p = path; *p; ++p) {
+ if (*p == '/')
+ continue;
+ const char *e;
+ for (e = p + 1; *e && *e != '/'; e++) ;
+ string c(p, e-p);
+ v.push_back(c);
+ p = e;
+ if (!*p)
+ break;
+ }
+ ldout(cct, 10) << __func__ << " path " << path << " -> " << v << dendl;
+
+ if (v.empty())
+ return FN_ROOT;
+
+ if (v.front() == "type")
+ return FN_TYPE;
+
+ if (!cid->parse(v.front())) {
+ return -ENOENT;
+ }
+ if (v.size() == 1)
+ return FN_COLLECTION;
+ v.pop_front();
+
+ if (v.front() == "bitwise_hash_start")
+ return FN_HASH_START;
+ if (v.front() == "bitwise_hash_end")
+ return FN_HASH_END;
+ if (v.front() == "bitwise_hash_bits")
+ return FN_HASH_BITS;
+ if (v.front() == "pgmeta") {
+ spg_t pgid;
+ if (cid->is_pg(&pgid)) {
+ *oid = pgid.make_pgmeta_oid();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT;
+ goto do_object;
+ }
+ return -ENOENT;
+ }
+ if (v.front() == "all") {
+ v.pop_front();
+ if (v.empty())
+ return FN_ALL;
+ goto do_dir;
+ }
+ if (v.front() == "by_bitwise_hash") {
+ v.pop_front();
+ if (v.empty())
+ return FN_HASH_DIR;
+ unsigned long hv, hm;
+ int r = sscanf(v.front().c_str(), "%lx", &hv);
+ if (r != 1)
+ return -ENOENT;
+ int shift = 32 - v.front().length() * 4;
+ v.pop_front();
+ if (v.empty())
+ return FN_HASH_DIR;
+ r = sscanf(v.front().c_str(), "%ld", &hm);
+ if (r != 1)
+ return -ENOENT;
+ if (hm < 1 || hm > 32)
+ return -ENOENT;
+ v.pop_front();
+ *hash = hv << shift;//hobject_t::_reverse_bits(hv << shift);
+ *hash_bits = hm;
+ if (v.empty())
+ return FN_HASH_VAL;
+ goto do_dir;
+ }
+ return -ENOENT;
+
+ do_dir:
+ {
+ string o = v.front();
+ if (!oid->parse(o)) {
+ return -ENOENT;
+ }
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT;
+ }
+
+ do_object:
+ if (v.front() == "data")
+ return FN_OBJECT_DATA;
+ if (v.front() == "omap_header")
+ return FN_OBJECT_OMAP_HEADER;
+ if (v.front() == "omap") {
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_OMAP;
+ *key = v.front();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_OMAP_VAL;
+ return -ENOENT;
+ }
+ if (v.front() == "attr") {
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_ATTR;
+ *key = v.front();
+ v.pop_front();
+ if (v.empty())
+ return FN_OBJECT_ATTR_VAL;
+ return -ENOENT;
+ }
+ if (v.front() == "bitwise_hash")
+ return FN_OBJECT_HASH;
+ return -ENOENT;
+}
+
+
+static int os_getattr(const char *path, struct stat *stbuf
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ stbuf->st_size = 0;
+ stbuf->st_uid = 0;
+ stbuf->st_gid = 0;
+ stbuf->st_mode = S_IFREG | 0700;
+
+ auto ch = fs->store->open_collection(cid);
+
+ switch (t) {
+ case FN_OBJECT_OMAP:
+ case FN_OBJECT_ATTR:
+ case FN_OBJECT:
+ case FN_OBJECT_DATA:
+ case FN_OBJECT_OMAP_HEADER:
+ case FN_OBJECT_OMAP_VAL:
+ {
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!ch) {
+ return -ENOENT;
+ }
+ int bits = fs->store->collection_bits(ch);
+ if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+ // sorry, not part of this PG
+ return -ENOENT;
+ }
+ }
+ }
+ break;
+ }
+
+ switch (t) {
+ case FN_OBJECT_OMAP:
+ case FN_OBJECT_ATTR:
+ case FN_OBJECT:
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ // fall-thru
+ case FN_ALL:
+ case FN_HASH_DIR:
+ case FN_HASH_VAL:
+ case FN_COLLECTION:
+ if (!fs->store->collection_exists(cid))
+ return -ENOENT;
+ // fall-thru
+ case FN_ROOT:
+ stbuf->st_mode = S_IFDIR | 0700;
+ return 0;
+
+ case FN_TYPE:
+ stbuf->st_size = fs->store->get_type().length() + 1;
+ break;
+
+ case FN_OBJECT_HASH:
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ stbuf->st_size = 9;
+ return 0;
+
+ case FN_HASH_END:
+ if (!ch)
+ return -ENOENT;
+ if (fs->store->collection_bits(ch) < 0)
+ return -ENOENT;
+ // fall-thru
+ case FN_HASH_START:
+ stbuf->st_size = 9;
+ return 0;
+
+ case FN_HASH_BITS:
+ {
+ if (!ch)
+ return -ENOENT;
+ int bits = fs->store->collection_bits(ch);
+ if (bits < 0)
+ return -ENOENT;
+ char buf[12];
+ snprintf(buf, sizeof(buf), "%d\n", bits);
+ stbuf->st_size = strlen(buf);
+ }
+ return 0;
+
+ case FN_OBJECT_DATA:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ int r = fs->store->stat(ch, oid, stbuf);
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ bufferlist bl;
+ fs->store->omap_get_header(ch, oid, &bl);
+ stbuf->st_size = bl.length();
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ fs->store->omap_get_values(ch, oid, k, &v);
+ if (!v.count(key)) {
+ return -ENOENT;
+ }
+ stbuf->st_size = v[key].length();
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ if (!fs->store->exists(ch, oid))
+ return -ENOENT;
+ bufferptr v;
+ int r = fs->store->getattr(ch, oid, key.c_str(), v);
+ if (r == -ENODATA)
+ r = -ENOENT;
+ if (r < 0)
+ return r;
+ stbuf->st_size = v.length();
+ }
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int os_readdir(const char *path,
+ void *buf,
+ fuse_fill_dir_t filler,
+ off_t offset,
+ struct fuse_file_info *fi
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , enum fuse_readdir_flags
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ auto ch = fs->store->open_collection(cid);
+
+ // we can't shift 32 bits or else off_t will go negative
+ const int hash_shift = 31;
+
+ switch (t) {
+ case FN_ROOT:
+ {
+ filler_compat(filler, buf, "type", NULL, 0);
+ vector<coll_t> cls;
+ fs->store->list_collections(cls);
+ for (auto c : cls) {
+ int r = filler_compat(filler, buf, stringify(c).c_str(), NULL, 0);
+ if (r > 0)
+ break;
+ }
+ }
+ break;
+
+ case FN_COLLECTION:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ filler_compat(filler, buf, "bitwise_hash_start", NULL, 0);
+ if (fs->store->collection_bits(ch) >= 0) {
+ filler_compat(filler, buf, "bitwise_hash_end", NULL, 0);
+ filler_compat(filler, buf, "bitwise_hash_bits", NULL, 0);
+ }
+ filler_compat(filler, buf, "all", NULL, 0);
+ filler_compat(filler, buf, "by_bitwise_hash", NULL, 0);
+ spg_t pgid;
+ if (cid.is_pg(&pgid) &&
+ fs->store->exists(ch, pgid.make_pgmeta_oid())) {
+ filler_compat(filler, buf, "pgmeta", NULL, 0);
+ }
+ }
+ break;
+
+ case FN_OBJECT:
+ {
+ filler_compat(filler, buf, "bitwise_hash", NULL, 0);
+ filler_compat(filler, buf, "data", NULL, 0);
+ filler_compat(filler, buf, "omap", NULL, 0);
+ filler_compat(filler, buf, "attr", NULL, 0);
+ filler_compat(filler, buf, "omap_header", NULL, 0);
+ }
+ break;
+
+ case FN_HASH_VAL:
+ case FN_ALL:
+ {
+ uint32_t bitwise_hash = (offset >> hash_shift) & 0xffffffff;
+ uint32_t hashoff = offset - (bitwise_hash << hash_shift);
+ int skip = hashoff;
+ ghobject_t next = cid.get_min_hobj();
+ if (offset) {
+ // obey the offset
+ next.hobj.set_hash(hobject_t::_reverse_bits(bitwise_hash));
+ } else if (t == FN_HASH_VAL) {
+ next.hobj.set_hash(hobject_t::_reverse_bits(hash_value));
+ }
+ ghobject_t last;
+ if (t == FN_HASH_VAL) {
+ last = next;
+ uint64_t rev_end = (hash_value | (0xffffffff >> hash_bits)) + 1;
+ if (rev_end >= 0x100000000)
+ last = ghobject_t::get_max();
+ else
+ last.hobj.set_hash(hobject_t::_reverse_bits(rev_end));
+ } else {
+ last = ghobject_t::get_max();
+ }
+ ldout(fs->store->cct, 10) << __func__ << std::hex
+ << " offset " << offset << " hash "
+ << hobject_t::_reverse_bits(hash_value)
+ << std::dec
+ << "/" << hash_bits
+ << " first " << next << " last " << last
+ << dendl;
+ while (true) {
+ vector<ghobject_t> ls;
+ int r = fs->store->collection_list(
+ ch, next, last, 1000, &ls, &next);
+ if (r < 0)
+ return r;
+ for (auto p : ls) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ uint32_t cur_bitwise_hash = p.hobj.get_bitwise_key_u32();
+ if (cur_bitwise_hash != bitwise_hash) {
+ bitwise_hash = cur_bitwise_hash;
+ hashoff = 0;
+ }
+ ++hashoff;
+ uint64_t cur_off = ((uint64_t)bitwise_hash << hash_shift) |
+ (uint64_t)hashoff;
+ string s = stringify(p);
+ r = filler_compat(filler, buf, s.c_str(), NULL, cur_off);
+ if (r)
+ break;
+ }
+ if (r)
+ break;
+ if (next == ghobject_t::get_max() || next == last)
+ break;
+ }
+ }
+ break;
+
+ case FN_OBJECT_OMAP:
+ {
+ set<string> keys;
+ fs->store->omap_get_keys(ch, oid, &keys);
+ unsigned skip = offset;
+ for (auto k : keys) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ ++offset;
+ int r = filler_compat(filler, buf, k.c_str(), NULL, offset);
+ if (r)
+ break;
+ }
+ }
+ break;
+
+ case FN_OBJECT_ATTR:
+ {
+ map<string,bufferptr> aset;
+ fs->store->getattrs(ch, oid, aset);
+ unsigned skip = offset;
+ for (auto a : aset) {
+ if (skip) {
+ --skip;
+ continue;
+ }
+ ++offset;
+ int r = filler_compat(filler, buf, a.first.c_str(), NULL, offset);
+ if (r)
+ break;
+ }
+ }
+ break;
+ }
+ return 0;
+}
+
+static int os_open(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (t < 0)
+ return t;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ auto ch = fs->store->open_collection(cid);
+
+ bufferlist *pbl = 0;
+ switch (t) {
+ case FN_TYPE:
+ pbl = new bufferlist;
+ pbl->append(fs->store->get_type());
+ pbl->append("\n");
+ break;
+
+ case FN_HASH_START:
+ {
+ pbl = new bufferlist;
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ unsigned long h;
+ h = hobject_t::_reverse_bits(pgid.ps());
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08lx\n", h);
+ pbl->append(buf);
+ } else {
+ pbl->append("00000000\n");
+ }
+ }
+ break;
+
+ case FN_HASH_END:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ spg_t pgid;
+ unsigned long h;
+ if (cid.is_pg(&pgid)) {
+ int hash_bits = fs->store->collection_bits(ch);
+ if (hash_bits >= 0) {
+ uint64_t rev_start = hobject_t::_reverse_bits(pgid.ps());
+ uint64_t rev_end = (rev_start | (0xffffffff >> hash_bits));
+ h = rev_end;
+ } else {
+ return -ENOENT;
+ }
+ } else {
+ h = 0xffffffff;
+ }
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08lx\n", h);
+ pbl = new bufferlist;
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_HASH_BITS:
+ {
+ if (!ch) {
+ return -ENOENT;
+ }
+ int r = fs->store->collection_bits(ch);
+ if (r < 0)
+ return r;
+ char buf[12];
+ snprintf(buf, sizeof(buf), "%d\n", r);
+ pbl = new bufferlist;
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_OBJECT_HASH:
+ {
+ pbl = new bufferlist;
+ char buf[10];
+ snprintf(buf, sizeof(buf), "%08x\n",
+ (unsigned)oid.hobj.get_bitwise_key_u32());
+ pbl->append(buf);
+ }
+ break;
+
+ case FN_OBJECT_DATA:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ return fs->store->read(ch, oid, 0, 0, *pbl);
+ });
+ if (r < 0) {
+ return r;
+ }
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ bufferptr bp;
+ int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+ if (r < 0)
+ return r;
+ pbl->append(bp);
+ return 0;
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ int r = fs->store->omap_get_values(ch, oid, k, &v);
+ if (r < 0)
+ return r;
+ *pbl = v[key];
+ return 0;
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ int r = fs->open_file(
+ path, fi,
+ [&](bufferlist *pbl) {
+ return fs->store->omap_get_header(ch, oid, pbl);
+ });
+ if (r < 0)
+ return r;
+ }
+ break;
+ }
+
+ if (pbl) {
+ FuseStore::OpenFile *o = new FuseStore::OpenFile;
+ o->bl = std::move(*pbl);
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ }
+ return 0;
+}
+
+static int os_mkdir(const char *path, mode_t mode)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch;
+
+ ObjectStore::Transaction t;
+ switch (f) {
+ case FN_OBJECT:
+ {
+ ch = fs->store->open_collection(cid);
+ if (!ch) {
+ return -ENOENT;
+ }
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ int bits = fs->store->collection_bits(ch);
+ if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+ // sorry, not part of this PG
+ return -EINVAL;
+ }
+ }
+ t.touch(cid, oid);
+ ch = fs->store->open_collection(cid);
+ }
+ break;
+
+ case FN_COLLECTION:
+ if (cid.is_pg()) {
+ // use the mode for the bit count. e.g., mkdir --mode=0003
+ // mnt/0.7_head will create 0.7 with bits = 3.
+ mode &= 0777;
+ if (mode >= 32)
+ return -EINVAL;
+ } else {
+ mode = 0;
+ }
+ t.create_collection(cid, mode);
+ ch = fs->store->create_new_collection(cid);
+ break;
+
+ default:
+ return -EPERM;
+ }
+
+ if (!t.empty()) {
+ fs->store->queue_transaction(ch, std::move(t));
+ }
+
+ return 0;
+}
+
+static int os_chmod(const char *path, mode_t mode
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ return 0;
+}
+
+static int os_create(const char *path, mode_t mode, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+ ObjectStore::Transaction t;
+ bufferlist *pbl = 0;
+ switch (f) {
+ case FN_OBJECT_DATA:
+ {
+ pbl = new bufferlist;
+ fs->store->read(ch, oid, 0, 0, *pbl);
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ {
+ pbl = new bufferlist;
+ bufferptr bp;
+ int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+ if (r == -ENODATA) {
+ bufferlist empty;
+ t.setattr(cid, oid, key.c_str(), empty);
+ }
+ pbl->append(bp);
+ }
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ pbl = new bufferlist;
+ set<string> k;
+ k.insert(key);
+ map<string,bufferlist> v;
+ fs->store->omap_get_values(ch, oid, k, &v);
+ if (v.count(key) == 0) {
+ map<string,bufferlist> aset;
+ aset[key] = bufferlist();
+ t.omap_setkeys(cid, oid, aset);
+ } else {
+ *pbl = v[key];
+ }
+ }
+ break;
+ }
+
+ if (!t.empty()) {
+ fs->store->queue_transaction(ch, std::move(t));
+ }
+
+ if (pbl) {
+ FuseStore::OpenFile *o = new FuseStore::OpenFile;
+ o->bl = std::move(*pbl);
+ o->dirty = true;
+ fi->fh = reinterpret_cast<uint64_t>(o);
+ }
+ return 0;
+}
+
+static int os_release(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (--o->ref == 0) {
+ ldout(fs->store->cct, 10) << __func__ << " closing last " << o->path << dendl;
+ fs->open_files.erase(o->path);
+ delete o;
+ }
+ return 0;
+}
+
+static int os_read(const char *path, char *buf, size_t size, off_t offset,
+ struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << " size " << size << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+ if (offset >= o->bl.length())
+ return 0;
+ if (offset + size > o->bl.length())
+ size = o->bl.length() - offset;
+ bufferlist r;
+ r.substr_of(o->bl, offset, size);
+ memcpy(buf, r.c_str(), r.length());
+ return r.length();
+}
+
+static int os_write(const char *path, const char *buf, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+ << " size " << size << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+
+ bufferlist final;
+ if (offset) {
+ if (offset > o->bl.length()) {
+ final.substr_of(o->bl, 0, offset);
+ } else {
+ final.claim_append(o->bl);
+ size_t zlen = offset - final.length();
+ final.append_zero(zlen);
+ }
+ }
+ final.append(buf, size);
+ if (offset + size < o->bl.length()) {
+ bufferlist rest;
+ rest.substr_of(o->bl, offset + size, o->bl.length() - offset - size);
+ final.claim_append(rest);
+ }
+ o->bl = final;
+ o->dirty = true;
+ return size;
+}
+
+int os_flush(const char *path, struct fuse_file_info *fi)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+ if (!o)
+ return 0;
+ if (!o->dirty)
+ return 0;
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+ ObjectStore::Transaction t;
+
+ switch (f) {
+ case FN_OBJECT_DATA:
+ t.write(cid, oid, 0, o->bl.length(), o->bl);
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ t.setattr(cid, oid, key.c_str(), o->bl);
+ break;
+
+ case FN_OBJECT_OMAP_VAL:
+ {
+ map<string,bufferlist> aset;
+ aset[key] = o->bl;
+ t.omap_setkeys(cid, oid, aset);
+ break;
+ }
+
+ case FN_OBJECT_OMAP_HEADER:
+ t.omap_setheader(cid, oid, o->bl);
+ break;
+
+ default:
+ return 0;
+ }
+
+ fs->store->queue_transaction(ch, std::move(t));
+
+ return 0;
+}
+
+static int os_unlink(const char *path)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+ ObjectStore::Transaction t;
+
+ switch (f) {
+ case FN_OBJECT_OMAP_VAL:
+ {
+ t.omap_rmkey(cid, oid, key);
+ }
+ break;
+
+ case FN_OBJECT_ATTR_VAL:
+ t.rmattr(cid, oid, key.c_str());
+ break;
+
+ case FN_OBJECT_OMAP_HEADER:
+ {
+ bufferlist empty;
+ t.omap_setheader(cid, oid, empty);
+ }
+ break;
+
+ case FN_OBJECT:
+ t.remove(cid, oid);
+ break;
+
+ case FN_COLLECTION:
+ {
+ bool empty;
+ int r = fs->store->collection_empty(ch, &empty);
+ if (r < 0)
+ return r;
+ if (!empty)
+ return -ENOTEMPTY;
+ t.remove_collection(cid);
+ }
+ break;
+
+ case FN_OBJECT_DATA:
+ t.truncate(cid, oid, 0);
+ break;
+
+ default:
+ return -EPERM;
+ }
+
+ fs->store->queue_transaction(ch, std::move(t));
+
+ return 0;
+}
+
+static int os_truncate(const char *path, off_t size
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ , struct fuse_file_info *fi
+#endif
+ )
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << " size " << size << dendl;
+ coll_t cid;
+ ghobject_t oid;
+ string key;
+ uint32_t hash_value, hash_bits;
+ int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+ &hash_bits);
+ if (f < 0)
+ return f;
+
+ if (f == FN_OBJECT_OMAP_VAL ||
+ f == FN_OBJECT_ATTR_VAL ||
+ f == FN_OBJECT_OMAP_HEADER) {
+ if (size)
+ return -EPERM;
+ return 0;
+ }
+ if (f != FN_OBJECT_DATA)
+ return -EPERM;
+
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ if (fs->open_files.count(path)) {
+ FuseStore::OpenFile *o = fs->open_files[path];
+ if (o->bl.length() > size) {
+ bufferlist t;
+ t.substr_of(o->bl, 0, size);
+ o->bl.swap(t);
+ }
+ }
+
+ ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+ ObjectStore::Transaction t;
+ t.truncate(cid, oid, size);
+ fs->store->queue_transaction(ch, std::move(t));
+ return 0;
+}
+
+static int os_statfs(const char *path, struct statvfs *stbuf)
+{
+ fuse_context *fc = fuse_get_context();
+ FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+ ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+ std::lock_guard<std::mutex> l(fs->lock);
+
+ struct store_statfs_t s;
+ int r = fs->store->statfs(&s);
+ if (r < 0)
+ return r;
+ stbuf->f_bsize = 4096; // LIES!
+ stbuf->f_blocks = s.total / 4096;
+ stbuf->f_bavail = s.available / 4096;
+ stbuf->f_bfree = stbuf->f_bavail;
+
+ ldout(fs->store->cct, 10) << __func__ << " " << path << ": "
+ << stbuf->f_bavail << "/" << stbuf->f_blocks << dendl;
+ return 0;
+}
+
+static struct fuse_operations fs_oper = {
+ getattr: os_getattr,
+ readlink: 0,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ getdir: 0,
+#endif
+ mknod: 0,
+ mkdir: os_mkdir,
+ unlink: os_unlink,
+ rmdir: os_unlink,
+ symlink: 0,
+ rename: 0,
+ link: 0,
+ chmod: os_chmod,
+ chown: 0,
+ truncate: os_truncate,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+ utime: 0,
+#endif
+ open: os_open,
+ read: os_read,
+ write: os_write,
+ statfs: os_statfs,
+ flush: os_flush,
+ release: os_release,
+ fsync: 0,
+ setxattr: 0,
+ getxattr: 0,
+ listxattr: 0,
+ removexattr: 0,
+ opendir: 0,
+ readdir: os_readdir,
+ releasedir: 0,
+ fsyncdir: 0,
+ init: 0,
+ destroy: 0,
+ access: 0,
+ create: os_create,
+};
+
+int FuseStore::main()
+{
+ const char *v[] = {
+ "foo",
+ mount_point.c_str(),
+ "-f",
+ "-d", // debug
+ };
+ int c = 3;
+ auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+ if (fuse_debug)
+ ++c;
+ return fuse_main(c, (char**)v, &fs_oper, (void*)this);
+}
+
+int FuseStore::start()
+{
+ dout(10) << __func__ << dendl;
+
+ memset(&info->args, 0, sizeof(info->args));
+ const char *v[] = {
+ "foo",
+ mount_point.c_str(),
+ "-f", // foreground
+ "-d", // debug
+ };
+ int c = 3;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ int rc;
+ struct fuse_cmdline_opts opts = {};
+#endif
+ auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+ if (fuse_debug)
+ ++c;
+ fuse_args a = FUSE_ARGS_INIT(c, (char**)v);
+ info->args = a;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ if (fuse_parse_cmdline(&info->args, &opts) == -1) {
+#else
+ if (fuse_parse_cmdline(&info->args, &info->mountpoint, NULL, NULL) == -1) {
+#endif
+ derr << __func__ << " failed to parse args" << dendl;
+ return -EINVAL;
+ }
+
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ info->mountpoint = opts.mountpoint;
+ info->f = fuse_new(&info->args, &fs_oper, sizeof(fs_oper), (void*)this);
+ if (!info->f) {
+ derr << __func__ << " fuse_new failed" << dendl;
+ return -EIO;
+ }
+
+ rc = fuse_mount(info->f, info->mountpoint);
+ if (rc != 0) {
+ derr << __func__ << " fuse_mount failed" << dendl;
+ return -EIO;
+ }
+#else
+ info->ch = fuse_mount(info->mountpoint, &info->args);
+ if (!info->ch) {
+ derr << __func__ << " fuse_mount failed" << dendl;
+ return -EIO;
+ }
+
+ info->f = fuse_new(info->ch, &info->args, &fs_oper, sizeof(fs_oper),
+ (void*)this);
+ if (!info->f) {
+ fuse_unmount(info->mountpoint, info->ch);
+ derr << __func__ << " fuse_new failed" << dendl;
+ return -EIO;
+ }
+#endif
+
+ fuse_thread.create("fusestore");
+ dout(10) << __func__ << " done" << dendl;
+ return 0;
+}
+
+int FuseStore::loop()
+{
+ dout(10) << __func__ << " enter" << dendl;
+ int r = fuse_loop(info->f);
+ if (r)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ dout(10) << __func__ << " exit" << dendl;
+ return r;
+}
+
+int FuseStore::stop()
+{
+ dout(10) << __func__ << " enter" << dendl;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+ fuse_unmount(info->f);
+#else
+ fuse_unmount(info->mountpoint, info->ch);
+#endif
+ fuse_thread.join();
+ fuse_destroy(info->f);
+ dout(10) << __func__ << " exit" << dendl;
+ return 0;
+}
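
The FN_ALL/FN_HASH_VAL branch of os_readdir above resumes a listing by packing the object's 32-bit bitwise hash plus an ordinal within that hash into the readdir offset, shifting by only 31 bits so the signed off_t never goes negative. A minimal standalone sketch of that encoding, in plain C++ with no Ceph dependencies (the helper names are ours, not part of this patch):

#include <cassert>
#include <cstdint>
#include <iostream>

// Mirror the scheme in os_readdir: the high bits of the readdir offset carry
// the bitwise hash, the low bits an ordinal within that hash. Only 31 bits
// are used for the shift so the signed off_t stays positive.
constexpr int kHashShift = 31;

int64_t encode_pos(uint32_t bitwise_hash, uint32_t ordinal) {
  return (static_cast<int64_t>(bitwise_hash) << kHashShift) |
         static_cast<int64_t>(ordinal);
}

void decode_pos(int64_t off, uint32_t *bitwise_hash, uint32_t *ordinal) {
  *bitwise_hash = static_cast<uint32_t>(off >> kHashShift);
  *ordinal = static_cast<uint32_t>(
      off - (static_cast<int64_t>(*bitwise_hash) << kHashShift));
}

int main() {
  int64_t off = encode_pos(0x9e3779b9u, 42);  // hash with the top bit set
  uint32_t h, n;
  decode_pos(off, &h, &n);
  assert(h == 0x9e3779b9u && n == 42);
  std::cout << std::hex << h << " +" << std::dec << n << "\n";
  return 0;
}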
diff --git a/src/os/FuseStore.h b/src/os/FuseStore.h
new file mode 100644
index 000000000..a3000d89d
--- /dev/null
+++ b/src/os/FuseStore.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_FUSESTORE_H
+#define CEPH_OS_FUSESTORE_H
+
+#include <string>
+#include <map>
+#include <mutex>
+#include <functional>
+
+#include "common/Thread.h"
+#include "include/buffer.h"
+
+class ObjectStore;
+
+class FuseStore {
+public:
+ ObjectStore *store;
+ std::string mount_point;
+ struct fs_info *info;
+ std::mutex lock;
+
+ struct OpenFile {
+ std::string path;
+ ceph::buffer::list bl;
+ bool dirty = false;
+ int ref = 0;
+ };
+ std::map<std::string,OpenFile*> open_files;
+
+ int open_file(std::string p, struct fuse_file_info *fi,
+ std::function<int(ceph::buffer::list *bl)> f);
+
+ class FuseThread : public Thread {
+ FuseStore *fs;
+ public:
+ explicit FuseThread(FuseStore *f) : fs(f) {}
+ void *entry() override {
+ fs->loop();
+ return NULL;
+ }
+ } fuse_thread;
+
+ FuseStore(ObjectStore *s, std::string p);
+ ~FuseStore();
+
+ int main();
+ int start();
+ int loop();
+ int stop();
+};
+
+#endif
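
Taken together with the implementation above, FuseStore's public surface is small: construct it with an ObjectStore and a mount point, then start() mounts the filesystem and spawns fuse_thread, and stop() unmounts and joins it. A rough usage sketch follows; the store type, data path and mount point are placeholders and error handling is abbreviated:

#include "os/ObjectStore.h"
#include "os/FuseStore.h"

// Sketch only: expose an existing object store under a FUSE mount point so it
// can be browsed as $cid/all/$obj/{data,attr,omap}. Paths are placeholders.
int run_fuse_export(CephContext *cct)
{
  ObjectStore *store = ObjectStore::create(cct, "bluestore",
                                           "/var/lib/ceph/osd/ceph-0",
                                           /*journal=*/"", /*flags=*/0);
  if (!store || store->mount() < 0)
    return -1;

  FuseStore fuse(store, "/mnt/fusestore");
  int r = fuse.start();          // mounts and spawns fuse_thread
  if (r == 0) {
    // ... inspect the store through the mount point ...
    fuse.stop();                 // unmounts and joins the thread
  }
  store->umount();
  delete store;
  return r;
}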
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
new file mode 100644
index 000000000..517d0ca98
--- /dev/null
+++ b/src/os/ObjectMap.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_KEYVALUESTORE_H
+#define OS_KEYVALUESTORE_H
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "kv/KeyValueDB.h"
+#include "common/hobject.h"
+
+class SequencerPosition;
+
+/**
+ * Encapsulates the FileStore key value store
+ *
+ * Implementations of this interface will be used to implement TMAP
+ */
+class ObjectMap {
+public:
+ CephContext* cct;
+ boost::scoped_ptr<KeyValueDB> db;
+ /// Set keys and values from the specified map
+ virtual int set_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const std::map<std::string, ceph::buffer::list> &set, ///< [in] key to value map to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Set header
+ virtual int set_header(
+ const ghobject_t &oid, ///< [in] object containing map
+ const ceph::buffer::list &bl, ///< [in] header to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Retrieve header
+ virtual int get_header(
+ const ghobject_t &oid, ///< [in] object containing map
+ ceph::buffer::list *bl ///< [out] header to set
+ ) = 0;
+
+ /// Clear all map keys and values from oid
+ virtual int clear(
+ const ghobject_t &oid, ///< [in] object containing map
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Clear all map keys and values in to_clear from oid
+ virtual int rm_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const std::set<std::string> &to_clear, ///< [in] Keys to clear
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Clear all omap keys and the header
+ virtual int clear_keys_header(
+ const ghobject_t &oid, ///< [in] oid to clear
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// Get all keys and values
+ virtual int get(
+ const ghobject_t &oid, ///< [in] object containing map
+ ceph::buffer::list *header, ///< [out] Returned Header
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) = 0;
+
+ /// Get values for supplied keys
+ virtual int get_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ ) = 0;
+
+ /// Get values for supplied keys
+ virtual int get_values(
+ const ghobject_t &oid, ///< [in] object containing map
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) = 0;
+
+ /// Check key existence
+ virtual int check_keys(
+ const ghobject_t &oid, ///< [in] object containing map
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ ) = 0;
+
+ /// Get xattrs
+ virtual int get_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const std::set<std::string> &to_get, ///< [in] keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] subset of attrs/vals defined
+ ) = 0;
+
+ /// Get all xattrs
+ virtual int get_all_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ std::set<std::string> *out ///< [out] attrs and values
+ ) = 0;
+
+ /// Set xattrs in to_set
+ virtual int set_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const std::map<std::string, ceph::buffer::list> &to_set,///< [in] attrs/values to set
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+ /// remove xattrs in to_remove
+ virtual int remove_xattrs(
+ const ghobject_t &oid, ///< [in] object
+ const std::set<std::string> &to_remove, ///< [in] attrs to remove
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) = 0;
+
+
+ /// Clone keys from oid map to target map
+ virtual int clone(
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// Rename map because of name change
+ virtual int rename(
+ const ghobject_t &from, ///< [in] object containing map
+ const ghobject_t &to, ///< [in] new name
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// For testing clone keys from oid map to target map using faster but more complex method
+ virtual int legacy_clone(
+ const ghobject_t &oid, ///< [in] object containing map
+ const ghobject_t &target, ///< [in] target of clone
+ const SequencerPosition *spos=0 ///< [in] sequencer position
+ ) { return 0; }
+
+ /// Ensure all previous writes are durable
+ virtual int sync(
+ const ghobject_t *oid=0, ///< [in] object
+ const SequencerPosition *spos=0 ///< [in] Sequencer
+ ) { return 0; }
+
+ virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; }
+
+ virtual void compact() {}
+
+ typedef KeyValueDB::SimplestIteratorImpl ObjectMapIteratorImpl;
+ typedef std::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
+ virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
+ return ObjectMapIterator();
+ }
+
+ virtual KeyValueDB *get_db() { return nullptr; }
+
+ ObjectMap(CephContext* cct, KeyValueDB *db) : cct(cct), db(db) {}
+ virtual ~ObjectMap() {}
+};
+
+#endif
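
ObjectMap is a pure interface over a KeyValueDB: callers address everything by ghobject_t and pass key/value maps, with an optional SequencerPosition for ordering. A rough sketch of the calling convention; the reference to a concrete implementation such as DBObjectMap (listed in the diffstat above) being constructed elsewhere is an assumption, and nothing here adds new API:

#include <cerrno>
#include <map>
#include <set>
#include <string>

#include "os/ObjectMap.h"

// Sketch only: write one omap key, read it back, then remove it. 'om' would
// be a concrete implementation (e.g. DBObjectMap) constructed elsewhere.
int round_trip_key(ObjectMap &om, const ghobject_t &oid)
{
  std::map<std::string, ceph::buffer::list> to_set;
  to_set["mykey"].append("myvalue");
  int r = om.set_keys(oid, to_set);            // SequencerPosition omitted
  if (r < 0)
    return r;

  std::map<std::string, ceph::buffer::list> out;
  r = om.get_values(oid, {"mykey"}, &out);     // fetch only the key we wrote
  if (r < 0 || !out.count("mykey"))
    return r < 0 ? r : -ENOENT;

  return om.rm_keys(oid, {"mykey"});           // clean up
}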
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
new file mode 100644
index 000000000..dc606bd2c
--- /dev/null
+++ b/src/os/ObjectStore.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include <ctype.h>
+#include <sstream>
+#include "ObjectStore.h"
+#include "common/Formatter.h"
+#include "common/safe_io.h"
+
+#ifndef WITH_SEASTAR
+#include "filestore/FileStore.h"
+#include "memstore/MemStore.h"
+#endif
+#if defined(WITH_BLUESTORE)
+#include "bluestore/BlueStore.h"
+#endif
+#ifndef WITH_SEASTAR
+#include "kstore/KStore.h"
+#endif
+
+using std::string;
+
+ObjectStore *ObjectStore::create(CephContext *cct,
+ const string& type,
+ const string& data,
+ const string& journal,
+ osflagbits_t flags)
+{
+#ifndef WITH_SEASTAR
+ if (type == "filestore") {
+ return new FileStore(cct, data, journal, flags);
+ }
+ if (type == "memstore") {
+ return new MemStore(cct, data);
+ }
+#endif
+#if defined(WITH_BLUESTORE)
+ if (type == "bluestore") {
+ return new BlueStore(cct, data);
+ }
+#ifndef WITH_SEASTAR
+ if (type == "random") {
+ if (rand() % 2) {
+ return new FileStore(cct, data, journal, flags);
+ } else {
+ return new BlueStore(cct, data);
+ }
+ }
+#endif
+#else
+#ifndef WITH_SEASTAR
+ if (type == "random") {
+ return new FileStore(cct, data, journal, flags);
+ }
+#endif
+#endif
+#ifndef WITH_SEASTAR
+ if (type == "kstore" &&
+ cct->check_experimental_feature_enabled("kstore")) {
+ return new KStore(cct, data);
+ }
+#endif
+ return NULL;
+}
+
+int ObjectStore::probe_block_device_fsid(
+ CephContext *cct,
+ const string& path,
+ uuid_d *fsid)
+{
+ int r;
+
+#if defined(WITH_BLUESTORE)
+ // first try bluestore -- it has a crc on its header and will fail
+ // reliably.
+ r = BlueStore::get_block_device_fsid(cct, path, fsid);
+ if (r == 0) {
+ lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
+ << *fsid << dendl;
+ return r;
+ }
+#endif
+
+#ifndef WITH_SEASTAR
+ // okay, try FileStore (journal).
+ r = FileStore::get_block_device_fsid(cct, path, fsid);
+ if (r == 0) {
+ lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
+ << *fsid << dendl;
+ return r;
+ }
+#endif
+
+ return -EINVAL;
+}
+
+int ObjectStore::write_meta(const std::string& key,
+ const std::string& value)
+{
+ string v = value;
+ v += "\n";
+ int r = safe_write_file(path.c_str(), key.c_str(),
+ v.c_str(), v.length(), 0600);
+ if (r < 0)
+ return r;
+ return 0;
+}
+
+int ObjectStore::read_meta(const std::string& key,
+ std::string *value)
+{
+ char buf[4096];
+ int r = safe_read_file(path.c_str(), key.c_str(),
+ buf, sizeof(buf));
+ if (r <= 0)
+ return r;
+ // drop trailing newlines
+ while (r && isspace(buf[r-1])) {
+ --r;
+ }
+ *value = string(buf, r);
+ return 0;
+}
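
write_meta() and read_meta() above round-trip small plaintext key/value files under the store path: write_meta() appends a newline and writes with mode 0600, read_meta() strips trailing whitespace. A short sketch of how a caller might use them together with the create() factory; the store type and data path are placeholders:

#include <string>
#include "os/ObjectStore.h"

// Sketch only: create a store handle, record its fsid out-of-band, and read
// it back. The type and data path are placeholders, not part of this patch.
int label_store(CephContext *cct, const std::string &fsid)
{
  ObjectStore *store = ObjectStore::create(cct, "memstore",
                                           "/tmp/memstore-dev", "", 0);
  if (!store)
    return -EINVAL;

  int r = store->write_meta("fsid", fsid);   // stored as "<fsid>\n"
  if (r == 0) {
    std::string readback;
    r = store->read_meta("fsid", &readback); // trailing whitespace stripped
    if (r == 0 && readback != fsid)
      r = -EIO;
  }
  delete store;
  return r;
}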
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
new file mode 100644
index 000000000..bba8627f5
--- /dev/null
+++ b/src/os/ObjectStore.h
@@ -0,0 +1,777 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_OBJECTSTORE_H
+#define CEPH_OBJECTSTORE_H
+
+#include "include/buffer.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/stringify.h"
+#include "include/types.h"
+
+#include "osd/osd_types.h"
+#include "common/TrackedOp.h"
+#include "common/WorkQueue.h"
+#include "ObjectMap.h"
+#include "os/Transaction.h"
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <vector>
+#include <map>
+
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) || defined(_WIN32)
+#include <sys/statvfs.h>
+#else
+#include <sys/vfs.h> /* or <sys/statfs.h> */
+#endif
+
+namespace ceph {
+ class Formatter;
+}
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+class Logger;
+class ContextQueue;
+
+static inline void encode(const std::map<std::string,ceph::buffer::ptr> *attrset, ceph::buffer::list &bl) {
+ using ceph::encode;
+ encode(*attrset, bl);
+}
+
+// Flag bits
+typedef uint32_t osflagbits_t;
+const int SKIP_JOURNAL_REPLAY = 1 << 0;
+const int SKIP_MOUNT_OMAP = 1 << 1;
+
+class ObjectStore {
+protected:
+ std::string path;
+
+public:
+ using Transaction = ceph::os::Transaction;
+
+ CephContext* cct;
+ /**
+ * create - create an ObjectStore instance.
+ *
+ * This is invoked once at initialization time.
+ *
+ * @param type type of store. This is a std::string from the configuration file.
+ * @param data path (or other descriptor) for data
+ * @param journal path (or other descriptor) for journal (optional)
+ * @param flags which filestores should check if applicable
+ */
+ static ObjectStore *create(CephContext *cct,
+ const std::string& type,
+ const std::string& data,
+ const std::string& journal,
+ osflagbits_t flags = 0);
+
+ /**
+ * probe a block device to learn the uuid of the owning OSD
+ *
+ * @param cct cct
+ * @param path path to device
+ * @param fsid [out] osd uuid
+ */
+ static int probe_block_device_fsid(
+ CephContext *cct,
+ const std::string& path,
+ uuid_d *fsid);
+
+ /**
+ * Fetch Object Store statistics.
+ *
+ * Currently only latency of write and apply times are measured.
+ *
+ * This appears to be called with nothing locked.
+ */
+ virtual objectstore_perf_stat_t get_cur_stats() = 0;
+
+ /**
+ * Fetch Object Store performance counters.
+ *
+ *
+ * This appears to be called with nothing locked.
+ */
+ virtual const PerfCounters* get_perf_counters() const = 0;
+
+ /**
+ * a collection also orders transactions
+ *
+ * Any transactions queued under a given collection will be applied in
+ * sequence. Transactions queued under different collections may run
+ * in parallel.
+ *
+ * ObjectStore users may get collection handles with open_collection() (or,
+ * for bootstrapping a new collection, create_new_collection()).
+ */
+ struct CollectionImpl : public RefCountedObject {
+ const coll_t cid;
+
+ /// wait for any queued transactions to apply
+ // block until any previous transactions are visible. specifically,
+ // collection_list and collection_empty need to reflect prior operations.
+ virtual void flush() = 0;
+
+ /**
+ * Async flush_commit
+ *
+ * There are two cases:
+ * 1) collection is currently idle: the method returns true. c is
+ * not touched.
+ * 2) collection is not idle: the method returns false and c is
+ * called asynchronously with a value of 0 once all transactions
+ * queued on this collection prior to the call have been applied
+ * and committed.
+ */
+ virtual bool flush_commit(Context *c) = 0;
+
+ const coll_t &get_cid() {
+ return cid;
+ }
+ protected:
+ CollectionImpl() = delete;
+ CollectionImpl(CephContext* cct, const coll_t& c) : RefCountedObject(cct), cid(c) {}
+ ~CollectionImpl() = default;
+ };
+ using CollectionHandle = ceph::ref_t<CollectionImpl>;
+
+
+ /*********************************
+ *
+ * Object Contents and semantics
+ *
+ * All ObjectStore objects are identified as a named object
+ * (ghobject_t and hobject_t) in a named collection (coll_t).
+ * ObjectStore operations support the creation, mutation, deletion
+ * and enumeration of objects within a collection. Enumeration is
+ * in sorted key order (where keys are sorted by hash). Object names
+ * are globally unique.
+ *
+ * Each object has four distinct parts: byte data, xattrs, omap_header
+ * and omap entries.
+ *
+ * The data portion of an object is conceptually equivalent to a
+ * file in a file system. Random and Partial access for both read
+ * and write operations is required. The ability to have a sparse
+ * implementation of the data portion of an object is beneficial for
+ * some workloads, but not required. There is a system-wide limit on
+ * the maximum size of an object, which is typically around 100 MB.
+ *
+ * Xattrs are equivalent to the extended attributes of file
+ * systems. Xattrs are a set of key/value pairs. Sub-value access
+ * is not required. It is possible to enumerate the set of xattrs in
+ * key order. At the implementation level, xattrs are used
+ * exclusively internal to Ceph and the implementer can expect the
+ * total size of all of the xattrs on an object to be relatively
+ * small, i.e., less than 64KB. Much of Ceph assumes that accessing
+ * xattrs on temporally adjacent object accesses (recent past or
+ * near future) is inexpensive.
+ *
+ * omap_header is a single blob of data. It can be read or written
+ * in total.
+ *
+ * Omap entries are conceptually the same as xattrs
+ * but in a different address space. In other words, you can have
+ * the same key as an xattr and an omap entry and they have distinct
+ * values. Enumeration of xattrs doesn't include omap entries and
+ * vice versa. The size and access characteristics of omap entries
+ * are very different from xattrs. In particular, the value portion
+ * of an omap entry can be quite large (MBs). More importantly, the
+ * interface must support efficient range queries on omap entries even
+ * when there are a large numbers of entries.
+ *
+ *********************************/
+
+ /*******************************
+ *
+ * Collections
+ *
+ * A collection is simply a grouping of objects. Collections have
+ * names (coll_t) and can be enumerated in order. Like an
+ * individual object, a collection also has a set of xattrs.
+ *
+ *
+ */
+
+
+ int queue_transaction(CollectionHandle& ch,
+ Transaction&& t,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) {
+ std::vector<Transaction> tls;
+ tls.push_back(std::move(t));
+ return queue_transactions(ch, tls, op, handle);
+ }
+
+ virtual int queue_transactions(
+ CollectionHandle& ch, std::vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) = 0;
+
+
+ public:
+ ObjectStore(CephContext* cct,
+ const std::string& path_) : path(path_), cct(cct) {}
+ virtual ~ObjectStore() {}
+
+ // no copying
+ explicit ObjectStore(const ObjectStore& o) = delete;
+ const ObjectStore& operator=(const ObjectStore& o) = delete;
+
+ // versioning
+ virtual int upgrade() {
+ return 0;
+ }
+
+ virtual void get_db_statistics(ceph::Formatter *f) { }
+ virtual void generate_db_histogram(ceph::Formatter *f) { }
+ virtual int flush_cache(std::ostream *os = NULL) { return -1; }
+ virtual void dump_perf_counters(ceph::Formatter *f) {}
+ virtual void dump_cache_stats(ceph::Formatter *f) {}
+ virtual void dump_cache_stats(std::ostream& os) {}
+
+ virtual std::string get_type() = 0;
+
+ // mgmt
+ virtual bool test_mount_in_use() = 0;
+ virtual int mount() = 0;
+ virtual int umount() = 0;
+ virtual int fsck(bool deep) {
+ return -EOPNOTSUPP;
+ }
+ virtual int repair(bool deep) {
+ return -EOPNOTSUPP;
+ }
+ virtual int quick_fix() {
+ return -EOPNOTSUPP;
+ }
+
+ virtual void set_cache_shards(unsigned num) { }
+
+ /**
+ * Returns 0 if the hobject is valid, -error otherwise
+ *
+ * Errors:
+ * -ENAMETOOLONG: locator/namespace/name too large
+ */
+ virtual int validate_hobject_key(const hobject_t &obj) const = 0;
+
+ virtual unsigned get_max_attr_name_length() = 0;
+ virtual int mkfs() = 0; // wipe
+ virtual int mkjournal() = 0; // journal only
+ virtual bool needs_journal() = 0; ///< requires a journal
+ virtual bool wants_journal() = 0; ///< prefers a journal
+ virtual bool allows_journal() = 0; ///< allows a journal
+
+ // return store min allocation size, if applicable
+ virtual uint64_t get_min_alloc_size() const {
+ return 0;
+ }
+
+ /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda)
+ virtual int get_devices(std::set<std::string> *devls) {
+ return -EOPNOTSUPP;
+ }
+
+ /// true if a txn is readable immediately after it is queued.
+ virtual bool is_sync_onreadable() const {
+ return true;
+ }
+
+ /**
+ * is_rotational
+ *
+ * Check whether store is backed by a rotational (HDD) or non-rotational
+ * (SSD) device.
+ *
+ * This must be usable *before* the store is mounted.
+ *
+ * @return true for HDD, false for SSD
+ */
+ virtual bool is_rotational() {
+ return true;
+ }
+
+ /**
+ * is_journal_rotational
+ *
+ * Check whether journal is backed by a rotational (HDD) or non-rotational
+ * (SSD) device.
+ *
+ *
+ * @return true for HDD, false for SSD
+ */
+ virtual bool is_journal_rotational() {
+ return true;
+ }
+
+ virtual std::string get_default_device_class() {
+ return is_rotational() ? "hdd" : "ssd";
+ }
+
+ virtual int get_numa_node(
+ int *numa_node,
+ std::set<int> *nodes,
+ std::set<std::string> *failed) {
+ return -EOPNOTSUPP;
+ }
+
+
+ virtual bool can_sort_nibblewise() {
+ return false; // assume a backend cannot, unless it says otherwise
+ }
+
+ virtual int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) = 0;
+ virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap) = 0;
+
+ virtual void collect_metadata(std::map<std::string,std::string> *pm) { }
+
+ /**
+ * write_meta - write a simple configuration key out-of-band
+ *
+ * Write a simple key/value pair for basic store configuration
+ * (e.g., a uuid or magic number) to an unopened/unmounted store.
+ * The default implementation writes this to a plaintext file in the
+ * path.
+ *
+ * A newline is appended.
+ *
+ * @param key key name (e.g., "fsid")
+ * @param value value (e.g., a uuid rendered as a std::string)
+ * @returns 0 for success, or an error code
+ */
+ virtual int write_meta(const std::string& key,
+ const std::string& value);
+
+ /**
+ * read_meta - read a simple configuration key out-of-band
+ *
+ * Read a simple key/value pair from an unopened/unmounted store.
+ *
+ * Trailing whitespace is stripped off.
+ *
+ * @param key key name
+ * @param value pointer to value std::string
+ * @returns 0 for success, or an error code
+ */
+ virtual int read_meta(const std::string& key,
+ std::string *value);
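+
+ // Illustrative sketch: out-of-band configuration keys are plain
+ // strings and are usable before mount(). `store` and `fsid_str` are
+ // assumptions for this example.
+ //
+ //   store->write_meta("fsid", fsid_str);   // stored as a file in `path`
+ //   std::string v;
+ //   int r = store->read_meta("fsid", &v);  // trailing whitespace stripped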
+
+ /**
+ * get ideal max value for collection_list()
+ *
+ * default to some arbitrary values; the implementation will override.
+ */
+ virtual int get_ideal_list_max() { return 64; }
+
+
+ /**
+ * get a collection handle
+ *
+ * Provide a trivial handle as a default to avoid converting legacy
+ * implementations.
+ */
+ virtual CollectionHandle open_collection(const coll_t &cid) = 0;
+
+ /**
+ * get a collection handle for a soon-to-be-created collection
+ *
+ * This handle must be used with a queue_transaction call that includes a
+ * create_collection op in order to become valid. It then becomes the
+ * reference to the created collection.
+ */
+ virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;
+
+ /**
+ * set the ContextQueue for a collection
+ *
+ * Once set, the on_commit contexts of Transactions queued on this
+ * collection are placed on commit_queue and invoked by the OSD shard thread.
+ */
+ virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;
+
+ /**
+ * Synchronous read operations
+ */
+
+ /**
+ * exists -- Test for existence of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @returns true if object exists, false otherwise
+ */
+ virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;
+ /**
+ * set_collection_opts -- set pool options for a collection
+ *
+ * @param cid collection
+ * @param opts new collection options
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) = 0;
+
+ /**
+ * stat -- get information for an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param st output information for the object
+ * @param allow_eio if false, assert on -EIO operation failure
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int stat(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) = 0;
+ /**
+ * read -- read a byte range of data from an object
+ *
+ * Note: if reading from an offset past the end of the object, we
+ * return 0 (not, say, -EINVAL).
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be read
+ * @param len number of bytes to be read
+ * @param bl output ceph::buffer::list
+ * @param op_flags is CEPH_OSD_OP_FLAG_*
+ * @returns number of bytes read on success, or negative error code on failure.
+ */
+ virtual int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) = 0;
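+
+ // Illustrative sketch: a bounded synchronous read through an open
+ // collection handle. `store`, `ch` and `oid` are assumptions.
+ //
+ //   ceph::buffer::list bl;
+ //   int r = store->read(ch, oid, 0, 4096, bl);
+ //   if (r >= 0) {
+ //     // r is the number of bytes actually read; it may be short if
+ //     // the object ends before offset + len
+ //   }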
+
+ /**
+ * fiemap -- get the extent map of an object's data
+ *
+ * Returns an encoded map of the extents of an object's data portion
+ * (map<offset,size>).
+ *
+ * A non-enlightened implementation is free to return the extent (offset, len)
+ * as the sole extent.
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be read
+ * @param len number of bytes to be read
+ * @param bl output ceph::buffer::list for extent map information.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, ceph::buffer::list& bl) = 0;
+ virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) = 0;
+
+ /**
+ * readv -- read specific intervals from an object;
+ * caller must call fiemap to fill in the extent-map first.
+ *
+ * Note: if reading from an offset past the end of the object, we
+ * return 0 (not, say, -EINVAL). Also, the default version of readv
+ * reads each extent separately and synchronously, which can become
+ * horribly inefficient if the physical layout of the object being read
+ * is massively fragmented; it should therefore be overridden by any
+ * real ObjectStore implementation that cares about performance.
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param m intervals to be read
+ * @param bl output ceph::buffer::list
+ * @param op_flags is CEPH_OSD_OP_FLAG_*
+ * @returns number of bytes read on success, or negative error code on failure.
+ */
+ virtual int readv(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) {
+ int total = 0;
+ for (auto p = m.begin(); p != m.end(); p++) {
+ ceph::buffer::list t;
+ int r = read(c, oid, p.get_start(), p.get_len(), t, op_flags);
+ if (r < 0)
+ return r;
+ total += r;
+ // prune fiemap, if necessary
+ if (p.get_len() != t.length()) {
+ auto save = p++;
+ if (t.length() == 0) {
+ m.erase(save); // Remove this empty interval
+ } else {
+ save.set_len(t.length()); // fix interval length
+ bl.claim_append(t);
+ }
+ // Remove any other follow-up intervals present too
+ while (p != m.end()) {
+ save = p++;
+ m.erase(save);
+ }
+ break;
+ }
+ bl.claim_append(t);
+ }
+ return total;
+ }
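+
+ // Illustrative sketch: build the interval_set from fiemap, then readv.
+ // Assumes interval_set<uint64_t>::insert(offset, length); `store`,
+ // `ch`, `oid` and `object_len` are assumptions for this example.
+ //
+ //   std::map<uint64_t, uint64_t> extents;
+ //   store->fiemap(ch, oid, 0, object_len, extents);
+ //   interval_set<uint64_t> m;
+ //   for (auto& [off, len] : extents)
+ //     m.insert(off, len);
+ //   ceph::buffer::list bl;
+ //   int r = store->readv(ch, oid, m, bl);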
+
+ /**
+ * dump_onode -- dump onode metadata in human-readable form,
+ * intended primarily for debugging
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param section_name section name to create and print under
+ * @param f Formatter class instance to print to
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int dump_onode(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ const std::string& section_name,
+ ceph::Formatter *f) {
+ return -ENOTSUP;
+ }
+
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
+ const char *name, ceph::buffer::ptr& value) = 0;
+
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ int getattr(
+ CollectionHandle &c, const ghobject_t& oid,
+ const std::string& name, ceph::buffer::list& value) {
+ ceph::buffer::ptr bp;
+ int r = getattr(c, oid, name.c_str(), bp);
+ value.push_back(bp);
+ return r;
+ }
+
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::ptr>& aset) = 0;
+
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::list>& aset) {
+ std::map<std::string,ceph::buffer::ptr> bmap;
+ int r = getattrs(c, oid, bmap);
+ for (auto i = bmap.begin(); i != bmap.end(); ++i) {
+ aset[i->first].append(i->second);
+ }
+ return r;
+ }
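+
+ // Illustrative sketch: read one xattr and then the full xattr set.
+ // `store`, `ch` and `oid` are assumptions; "my_attr" is an arbitrary
+ // example key.
+ //
+ //   ceph::buffer::list v;
+ //   if (store->getattr(ch, oid, "my_attr", v) == 0) {
+ //     // v holds the xattr value
+ //   }
+ //   std::map<std::string, ceph::buffer::ptr> aset;
+ //   store->getattrs(ch, oid, aset);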
+
+
+ // collections
+
+ /**
+ * list_collections -- get all of the collections known to this ObjectStore
+ *
+ * @param ls list of the collections in sorted order.
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int list_collections(std::vector<coll_t>& ls) = 0;
+
+ /**
+ * does a collection exist?
+ *
+ * @param c collection
+ * @returns true if it exists, false otherwise
+ */
+ virtual bool collection_exists(const coll_t& c) = 0;
+
+ /**
+ * is a collection empty?
+ *
+ * @param c collection
+ * @param empty true if the specified collection is empty, false otherwise
+ * @returns 0 on success, negative error code on failure.
+ */
+ virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;
+
+ /**
+ * return the number of significant bits of the coll_t::pgid.
+ *
+ * This should return what the last create_collection or split_collection
+ * set. A legacy backend may return -EAGAIN if the value is unavailable
+ * (because we upgraded from an older version, e.g., FileStore).
+ */
+ virtual int collection_bits(CollectionHandle& c) = 0;
+
+
+ /**
+ * list the contents of a collection that fall in the range [start, end),
+ * returning no more than the specified number of results
+ *
+ * @param c collection
+ * @param start list objects that sort >= this value
+ * @param end list objects that sort < this value
+ * @param max return no more than this many results
+ * @param ls [out] result
+ * @param next [out] next item sorts >= this value
+ * @return zero on success, or negative error
+ */
+ virtual int collection_list(CollectionHandle &c,
+ const ghobject_t& start, const ghobject_t& end,
+ int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next) = 0;
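+
+ // Illustrative sketch: paginate through a collection using `next` as a
+ // resume cursor. Assumes ghobject_t::get_max() as the end sentinel;
+ // `store` and `ch` are assumptions for this example.
+ //
+ //   std::vector<ghobject_t> ls;
+ //   ghobject_t cursor, next;
+ //   do {
+ //     ls.clear();
+ //     int r = store->collection_list(ch, cursor, ghobject_t::get_max(),
+ //                                    store->get_ideal_list_max(), &ls, &next);
+ //     if (r < 0)
+ //       break;
+ //     // ... process ls ...
+ //     cursor = next;
+ //   } while (!ls.empty());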
+
+ virtual int collection_list_legacy(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end, int max,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ return collection_list(c, start, end, max, ls, next);
+ }
+
+ /// OMAP
+ /// Get omap contents
+ virtual int omap_get(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ ) = 0;
+
+ /// Get omap header
+ virtual int omap_get_header(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) = 0;
+
+ /// Get keys defined on oid
+ virtual int omap_get_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ ) = 0;
+
+ /// Get key values
+ virtual int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) = 0;
+
+#ifdef WITH_SEASTAR
+ virtual int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::optional<std::string> &start_after, ///< [in] return keys after this one, if set
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) = 0;
+#endif
+
+ /// Filters keys into out which are defined on oid
+ virtual int omap_check_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ ) = 0;
+
+ /**
+ * Returns an object map iterator
+ *
+ * Warning! The returned iterator is an implicit lock on filestore
+ * operations in c. Do not use filestore methods on c while the returned
+ * iterator is live. (Filling in a transaction is no problem).
+ *
+ * @return iterator, null on error
+ */
+ virtual ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) = 0;
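+
+ // Illustrative sketch: fetch selected omap values, then walk all
+ // entries with the iterator (assuming the usual seek_to_first(),
+ // valid(), key(), value() and next() methods from ObjectMap.h).
+ // `store`, `ch` and `oid` are assumptions; "my_key" is an arbitrary
+ // example key.
+ //
+ //   std::map<std::string, ceph::buffer::list> out;
+ //   store->omap_get_values(ch, oid, {"my_key"}, &out);
+ //
+ //   auto it = store->get_omap_iterator(ch, oid);
+ //   for (it->seek_to_first(); it->valid(); it->next()) {
+ //     // it->key(), it->value()
+ //   }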
+
+ virtual int flush_journal() { return -EOPNOTSUPP; }
+
+ virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }
+
+ virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; }
+
+ /**
+ * Set and get internal fsid for this instance. No external data is modified
+ */
+ virtual void set_fsid(uuid_d u) = 0;
+ virtual uuid_d get_fsid() = 0;
+
+ /**
+ * Estimate the additional disk space used by the specified number of objects,
+ * caused by file allocation granularity and metadata storage.
+ *
+ * @param num_objects total object count (including whiteouts) to measure used space for
+ */
+ virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;
+
+
+ // DEBUG
+ virtual void inject_data_error(const ghobject_t &oid) {}
+ virtual void inject_mdata_error(const ghobject_t &oid) {}
+
+ virtual void compact() {}
+ virtual bool has_builtin_csum() const {
+ return false;
+ }
+};
+
+#endif
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc
new file mode 100644
index 000000000..af82da340
--- /dev/null
+++ b/src/os/Transaction.cc
@@ -0,0 +1,582 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "os/Transaction.h"
+#include "common/Formatter.h"
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
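+// Helpers for re-extracting encoded containers without materializing
+// them: each walks the length-prefixed entries of an encoded
+// map<string,string> (or set<string>) to compute its total encoded
+// size, copies that many bytes into *out, and leaves `p` positioned
+// just past the structure.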
+void decode_str_str_map_to_bl(bufferlist::const_iterator& p,
+ bufferlist *out)
+{
+ auto start = p;
+ __u32 n;
+ decode(n, p);
+ unsigned len = 4;
+ while (n--) {
+ __u32 l;
+ decode(l, p);
+ p += l;
+ len += 4 + l;
+ decode(l, p);
+ p += l;
+ len += 4 + l;
+ }
+ start.copy(len, *out);
+}
+
+void decode_str_set_to_bl(bufferlist::const_iterator& p,
+ bufferlist *out)
+{
+ auto start = p;
+ __u32 n;
+ decode(n, p);
+ unsigned len = 4;
+ while (n--) {
+ __u32 l;
+ decode(l, p);
+ p += l;
+ len += 4 + l;
+ }
+ start.copy(len, *out);
+}
+
+namespace ceph::os {
+
+void Transaction::dump(ceph::Formatter *f)
+{
+ f->open_array_section("ops");
+ iterator i = begin();
+ int op_num = 0;
+ bool stop_looping = false;
+ while (i.have_op() && !stop_looping) {
+ Transaction::Op *op = i.decode_op();
+ f->open_object_section("op");
+ f->dump_int("op_num", op_num);
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ f->dump_string("op_name", "nop");
+ break;
+ case Transaction::OP_CREATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "create");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_TOUCH:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "touch");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "write");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("length", len);
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("bufferlist length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ f->dump_string("op_name", "zero");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("length", len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ f->dump_string("op_name", "trim_cache");
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ f->dump_string("op_name", "truncate");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_unsigned("offset", off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "remove");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "setattr");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("name", name);
+ f->dump_unsigned("length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ f->dump_string("op_name", "setattrs");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_object_section("attr_lens");
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ f->dump_unsigned(p->first.c_str(), p->second.length());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string name = i.decode_string();
+ f->dump_string("op_name", "rmattr");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("name", name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "rmattrs");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "clone");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ f->dump_string("op_name", "clonerange");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ f->dump_unsigned("offset", off);
+ f->dump_unsigned("len", len);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ f->dump_string("op_name", "clonerange2");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("src_oid") << oid;
+ f->dump_stream("dst_oid") << noid;
+ f->dump_unsigned("src_offset", srcoff);
+ f->dump_unsigned("len", len);
+ f->dump_unsigned("dst_offset", dstoff);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "mkcoll");
+ f->dump_stream("collection") << cid;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ using ceph::decode;
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t type = op->hint;
+ f->dump_string("op_name", "coll_hint");
+ f->dump_stream("collection") << cid;
+ f->dump_unsigned("type", type);
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ f->dump_unsigned("pg_num", pg_num);
+ f->dump_unsigned("expected_num_objects", num_objs);
+ }
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "coll_set_bits");
+ f->dump_stream("collection") << cid;
+ f->dump_unsigned("bits", op->split_bits);
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ f->dump_string("op_name", "rmcoll");
+ f->dump_stream("collection") << cid;
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "collection_add");
+ f->dump_stream("src_collection") << ocid;
+ f->dump_stream("dst_collection") << ncid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "collection_remove");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->open_object_section("collection_move");
+ f->dump_stream("src_collection") << ocid;
+ f->dump_stream("dst_collection") << ncid;
+ f->dump_stream("oid") << oid;
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "collection_setattr");
+ f->dump_stream("collection") << cid;
+ f->dump_string("name", name);
+ f->dump_unsigned("length", bl.length());
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ string name = i.decode_string();
+ f->dump_string("op_name", "collection_rmattr");
+ f->dump_stream("collection") << cid;
+ f->dump_string("name", name);
+ }
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ f->dump_string("op_name", "collection_rename");
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ f->dump_string("op_name", "omap_clear");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ }
+ break;
+
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ map<string, bufferlist> aset;
+ i.decode_attrset(aset);
+ f->dump_string("op_name", "omap_setkeys");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_object_section("attr_lens");
+ for (map<string, bufferlist>::iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ f->dump_unsigned(p->first.c_str(), p->second.length());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ set<string> keys;
+ i.decode_keyset(keys);
+ f->dump_string("op_name", "omap_rmkeys");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->open_array_section("attrs");
+ for (auto& k : keys) {
+ f->dump_string("", k.c_str());
+ }
+ f->close_section();
+ }
+ break;
+
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ bufferlist bl;
+ i.decode_bl(bl);
+ f->dump_string("op_name", "omap_setheader");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_stream("header_length") << bl.length();
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_split_collection_create");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("bits") << bits;
+ f->dump_stream("rem") << rem;
+ f->dump_stream("dest") << dest;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_split_collection");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("bits") << bits;
+ f->dump_stream("rem") << rem;
+ f->dump_stream("dest") << dest;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ f->dump_string("op_name", "op_merge_collection");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("dest") << dest;
+ f->dump_stream("bits") << bits;
+ }
+ break;
+
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ f->dump_string("op_name", "op_omap_rmkeyrange");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_string("first", first);
+ f->dump_string("last", last);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t old_cid = i.get_cid(op->cid);
+ ghobject_t old_oid = i.get_oid(op->oid);
+ coll_t new_cid = i.get_cid(op->dest_cid);
+ ghobject_t new_oid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "op_coll_move_rename");
+ f->dump_stream("old_collection") << old_cid;
+ f->dump_stream("old_oid") << old_oid;
+ f->dump_stream("new_collection") << new_cid;
+ f->dump_stream("new_oid") << new_oid;
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t old_oid = i.get_oid(op->oid);
+ ghobject_t new_oid = i.get_oid(op->dest_oid);
+ f->dump_string("op_name", "op_coll_move_rename");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("old_oid") << old_oid;
+ f->dump_stream("new_oid") << new_oid;
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ uint32_t alloc_hint_flags = op->hint;
+ f->dump_string("op_name", "op_setallochint");
+ f->dump_stream("collection") << cid;
+ f->dump_stream("oid") << oid;
+ f->dump_stream("expected_object_size") << expected_object_size;
+ f->dump_stream("expected_write_size") << expected_write_size;
+ f->dump_string("alloc_hint_flags", ceph_osd_alloc_hint_flag_string(alloc_hint_flags));
+ }
+ break;
+
+ default:
+ f->dump_string("op_name", "unknown");
+ f->dump_unsigned("op_code", op->op);
+ stop_looping = true;
+ break;
+ }
+ f->close_section();
+ op_num++;
+ }
+ f->close_section();
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void Transaction::generate_test_instances(list<Transaction*>& o)
+{
+ o.push_back(new Transaction);
+
+ Transaction *t = new Transaction;
+ t->nop();
+ o.push_back(t);
+
+ t = new Transaction;
+ coll_t c(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+ coll_t c2(spg_t(pg_t(4,5), shard_id_t::NO_SHARD));
+ ghobject_t o1(hobject_t("obj", "", 123, 456, -1, ""));
+ ghobject_t o2(hobject_t("obj2", "", 123, 456, -1, ""));
+ ghobject_t o3(hobject_t("obj3", "", 123, 456, -1, ""));
+ t->touch(c, o1);
+ bufferlist bl;
+ bl.append("some data");
+ t->write(c, o1, 1, bl.length(), bl);
+ t->zero(c, o1, 22, 33);
+ t->truncate(c, o1, 99);
+ t->remove(c, o1);
+ o.push_back(t);
+
+ t = new Transaction;
+ t->setattr(c, o1, "key", bl);
+ map<string,bufferptr> m;
+ m["a"] = buffer::copy("this", 4);
+ m["b"] = buffer::copy("that", 4);
+ t->setattrs(c, o1, m);
+ t->rmattr(c, o1, "b");
+ t->rmattrs(c, o1);
+
+ t->clone(c, o1, o2);
+ t->clone(c, o1, o3);
+ t->clone_range(c, o1, o2, 1, 12, 99);
+
+ t->create_collection(c, 12);
+ t->collection_move_rename(c, o2, c2, o3);
+ t->remove_collection(c);
+ o.push_back(t);
+}
+
+ostream& operator<<(ostream& out, const Transaction& tx) {
+
+ return out << "Transaction(" << &tx << ")";
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+}
diff --git a/src/os/Transaction.h b/src/os/Transaction.h
new file mode 100644
index 000000000..8ad164088
--- /dev/null
+++ b/src/os/Transaction.h
@@ -0,0 +1,1297 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+
+#include "include/Context.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include "osd/osd_types.h"
+
+#define OPS_PER_PTR 32
+
+void decode_str_str_map_to_bl(ceph::buffer::list::const_iterator& p, ceph::buffer::list *out);
+void decode_str_set_to_bl(ceph::buffer::list::const_iterator& p, ceph::buffer::list *out);
+
+
+/*********************************
+ * transaction
+ *
+ * A Transaction represents a sequence of primitive mutation
+ * operations.
+ *
+ * Three events in the life of a Transaction result in
+ * callbacks. Any Transaction can contain any number of callback
+ * objects (Context) for any combination of the three classes of
+ * callbacks:
+ *
+ * on_applied_sync, on_applied, and on_commit.
+ *
+ * The "on_applied" and "on_applied_sync" callbacks are invoked when
+ * the modifications requested by the Transaction are visible to
+ * subsequent ObjectStore operations, i.e., the results are
+ * readable. The only conceptual difference between on_applied and
+ * on_applied_sync is the specific thread and locking environment in
+ * which the callbacks operate. "on_applied_sync" is called
+ * directly by an ObjectStore execution thread. It is expected to
+ * execute quickly and must not acquire any locks of the calling
+ * environment. Conversely, "on_applied" is called from the separate
+ * Finisher thread, meaning that it can contend for calling
+ * environment locks. NB, on_applied and on_applied_sync are
+ * sometimes called on_readable and on_readable_sync.
+ *
+ * The "on_commit" callback is also called from the Finisher thread
+ * and indicates that all of the mutations have been durably
+ * committed to stable storage (i.e., are now software/hardware
+ * crashproof).
+ *
+ * At the implementation level, each mutation primitive (and its
+ * associated data) can be serialized to a single buffer. That
+ * serialization, however, does not copy any data, but (using the
+ * ceph::buffer::list library) will reference the original buffers. This
+ * implies that the buffer that contains the data being submitted
+ * must remain stable until the on_commit callback completes. In
+ * practice, ceph::buffer::list handles all of this for you and this
+ * subtlety is only relevant if you are referencing an existing
+ * buffer via buffer::raw_static.
+ *
+ * Some implementations of ObjectStore choose to implement their own
+ * form of journaling that uses the serialized form of a
+ * Transaction. This requires that the encode/decode logic properly
+ * version itself and handle version upgrades that might change the
+ * format of the encoded Transaction. This has already happened a
+ * couple of times and the Transaction object contains some helper
+ * variables that aid in this legacy decoding:
+ *
+ * sobject_encoding detects an older/simpler version of oid
+ * present in pre-bobtail versions of ceph. use_pool_override
+ * also detects a situation where the pool of an oid can be
+ * overridden for legacy operations/buffers. For non-legacy
+ * implementations of ObjectStore, neither of these fields are
+ * relevant.
+ *
+ *
+ * TRANSACTION ISOLATION
+ *
+ * Except as noted above, isolation is the responsibility of the
+ * caller. In other words, if any storage element (storage element
+ * == any of the four portions of an object as described above) is
+ * altered by a transaction (including deletion), the caller
+ * promises not to attempt to read that element while the
+ * transaction is pending (here pending means from the time of
+ * issuance until the "on_applied_sync" callback has been
+ * received). Violations of isolation need not be detected by
+ * ObjectStore and there is no corresponding error mechanism for
+ * reporting an isolation violation (crashing would be the
+ * appropriate way to report an isolation violation if detected).
+ *
+ * Enumeration operations may violate transaction isolation as
+ * described above when a storage element is being created or
+ * deleted as part of a transaction. In this case, ObjectStore is
+ * allowed to consider the enumeration operation to either precede
+ * or follow the violating transaction element. In other words, the
+ * presence/absence of the mutated element in the enumeration is
+ * entirely at the discretion of ObjectStore. The arbitrary ordering
+ * applies independently to each transaction element. For example,
+ * if a transaction contains two mutating elements, "create A" and
+ * "delete B", and an enumeration operation is performed while this
+ * transaction is pending, it is permissible for ObjectStore to
+ * report any of the four possible combinations of the existence of
+ * A and B.
+ *
+ */
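+
+// Illustrative sketch of the three callback classes described above
+// (the Context subclass names are assumptions for this example):
+//
+//   ceph::os::Transaction t;
+//   t.register_on_applied_sync(new C_FastNotify);   // readable; store thread
+//   t.register_on_applied(new C_NotifyReadable);    // readable; Finisher thread
+//   t.register_on_commit(new C_NotifyDurable);      // durable on stable storage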
+namespace ceph::os {
+class Transaction {
+public:
+ enum {
+ OP_NOP = 0,
+ OP_CREATE = 7, // cid, oid
+ OP_TOUCH = 9, // cid, oid
+ OP_WRITE = 10, // cid, oid, offset, len, bl
+ OP_ZERO = 11, // cid, oid, offset, len
+ OP_TRUNCATE = 12, // cid, oid, len
+ OP_REMOVE = 13, // cid, oid
+ OP_SETATTR = 14, // cid, oid, attrname, bl
+ OP_SETATTRS = 15, // cid, oid, attrset
+ OP_RMATTR = 16, // cid, oid, attrname
+ OP_CLONE = 17, // cid, oid, newoid
+ OP_CLONERANGE = 18, // cid, oid, newoid, offset, len
+ OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff
+
+ OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED**
+
+ OP_MKCOLL = 20, // cid
+ OP_RMCOLL = 21, // cid
+ OP_COLL_ADD = 22, // cid, oldcid, oid
+ OP_COLL_REMOVE = 23, // cid, oid
+ OP_COLL_SETATTR = 24, // cid, attrname, bl
+ OP_COLL_RMATTR = 25, // cid, attrname
+ OP_COLL_SETATTRS = 26, // cid, attrset
+ OP_COLL_MOVE = 8, // newcid, oldcid, oid
+
+ OP_RMATTRS = 28, // cid, oid
+ OP_COLL_RENAME = 29, // cid, newcid
+
+ OP_OMAP_CLEAR = 31, // cid
+ OP_OMAP_SETKEYS = 32, // cid, attrset
+ OP_OMAP_RMKEYS = 33, // cid, keyset
+ OP_OMAP_SETHEADER = 34, // cid, header
+ OP_SPLIT_COLLECTION = 35, // cid, bits, destination
+ OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination
+ doesn't create the destination */
+ OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey
+ OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid
+
+ OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size
+ OP_COLL_HINT = 40, // cid, type, bl
+
+ OP_TRY_RENAME = 41, // oldcid, oldoid, newoid
+
+ OP_COLL_SET_BITS = 42, // cid, bits
+
+ OP_MERGE_COLLECTION = 43, // cid, destination
+ };
+
+ // Transaction hint type
+ enum {
+ COLL_HINT_EXPECTED_NUM_OBJECTS = 1,
+ };
+
+ struct Op {
+ ceph_le32 op;
+ ceph_le32 cid;
+ ceph_le32 oid;
+ ceph_le64 off;
+ ceph_le64 len;
+ ceph_le32 dest_cid;
+ ceph_le32 dest_oid; //OP_CLONE, OP_CLONERANGE
+ ceph_le64 dest_off; //OP_CLONERANGE
+ ceph_le32 hint; //OP_COLL_HINT,OP_SETALLOCHINT
+ ceph_le64 expected_object_size; //OP_SETALLOCHINT
+ ceph_le64 expected_write_size; //OP_SETALLOCHINT
+ ceph_le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS,
+ //OP_MKCOLL
+ ceph_le32 split_rem; //OP_SPLIT_COLLECTION2
+ } __attribute__ ((packed)) ;
+
+ struct TransactionData {
+ ceph_le64 ops;
+ ceph_le32 largest_data_len;
+ ceph_le32 largest_data_off;
+ ceph_le32 largest_data_off_in_data_bl;
+ ceph_le32 fadvise_flags;
+
+ TransactionData() noexcept :
+ ops(init_le64(0)),
+ largest_data_len(init_le32(0)),
+ largest_data_off(init_le32(0)),
+ largest_data_off_in_data_bl(init_le32(0)),
+ fadvise_flags(init_le32(0)) { }
+
+ // override default move operations to reset default values
+ TransactionData(TransactionData&& other) noexcept :
+ ops(other.ops),
+ largest_data_len(other.largest_data_len),
+ largest_data_off(other.largest_data_off),
+ largest_data_off_in_data_bl(other.largest_data_off_in_data_bl),
+ fadvise_flags(other.fadvise_flags) {
+ other.ops = 0;
+ other.largest_data_len = 0;
+ other.largest_data_off = 0;
+ other.largest_data_off_in_data_bl = 0;
+ other.fadvise_flags = 0;
+ }
+ TransactionData& operator=(TransactionData&& other) noexcept {
+ ops = other.ops;
+ largest_data_len = other.largest_data_len;
+ largest_data_off = other.largest_data_off;
+ largest_data_off_in_data_bl = other.largest_data_off_in_data_bl;
+ fadvise_flags = other.fadvise_flags;
+ other.ops = 0;
+ other.largest_data_len = 0;
+ other.largest_data_off = 0;
+ other.largest_data_off_in_data_bl = 0;
+ other.fadvise_flags = 0;
+ return *this;
+ }
+
+ TransactionData(const TransactionData& other) = default;
+ TransactionData& operator=(const TransactionData& other) = default;
+
+ void encode(ceph::buffer::list& bl) const {
+ bl.append((char*)this, sizeof(TransactionData));
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ bl.copy(sizeof(TransactionData), (char*)this);
+ }
+ } __attribute__ ((packed)) ;
+
+private:
+ TransactionData data;
+
+ std::map<coll_t, uint32_t> coll_index;
+ std::map<ghobject_t, uint32_t> object_index;
+
+ uint32_t coll_id = 0;
+ uint32_t object_id = 0;
+
+ ceph::buffer::list data_bl;
+ ceph::buffer::list op_bl;
+
+ std::list<Context *> on_applied;
+ std::list<Context *> on_commit;
+ std::list<Context *> on_applied_sync;
+
+public:
+ Transaction() = default;
+
+ explicit Transaction(ceph::buffer::list::const_iterator &dp) {
+ decode(dp);
+ }
+ explicit Transaction(ceph::buffer::list &nbl) {
+ auto dp = nbl.cbegin();
+ decode(dp);
+ }
+
+ // override default move operations to reset default values
+ Transaction(Transaction&& other) noexcept :
+ data(std::move(other.data)),
+ coll_index(std::move(other.coll_index)),
+ object_index(std::move(other.object_index)),
+ coll_id(other.coll_id),
+ object_id(other.object_id),
+ data_bl(std::move(other.data_bl)),
+ op_bl(std::move(other.op_bl)),
+ on_applied(std::move(other.on_applied)),
+ on_commit(std::move(other.on_commit)),
+ on_applied_sync(std::move(other.on_applied_sync)) {
+ other.coll_id = 0;
+ other.object_id = 0;
+ }
+
+ Transaction& operator=(Transaction&& other) noexcept {
+ data = std::move(other.data);
+ coll_index = std::move(other.coll_index);
+ object_index = std::move(other.object_index);
+ coll_id = other.coll_id;
+ object_id = other.object_id;
+ data_bl = std::move(other.data_bl);
+ op_bl = std::move(other.op_bl);
+ on_applied = std::move(other.on_applied);
+ on_commit = std::move(other.on_commit);
+ on_applied_sync = std::move(other.on_applied_sync);
+ other.coll_id = 0;
+ other.object_id = 0;
+ return *this;
+ }
+
+ Transaction(const Transaction& other) = default;
+ Transaction& operator=(const Transaction& other) = default;
+
+ // expose object_index for FileStore::Op's benefit
+ const std::map<ghobject_t, uint32_t>& get_object_index() const {
+ return object_index;
+ }
+
+ /* Operations on callback contexts */
+ void register_on_applied(Context *c) {
+ if (!c) return;
+ on_applied.push_back(c);
+ }
+ void register_on_commit(Context *c) {
+ if (!c) return;
+ on_commit.push_back(c);
+ }
+ void register_on_applied_sync(Context *c) {
+ if (!c) return;
+ on_applied_sync.push_back(c);
+ }
+ void register_on_complete(Context *c) {
+ if (!c) return;
+ RunOnDeleteRef _complete (std::make_shared<RunOnDelete>(c));
+ register_on_applied(new ContainerContext<RunOnDeleteRef>(_complete));
+ register_on_commit(new ContainerContext<RunOnDeleteRef>(_complete));
+ }
+ bool has_contexts() const {
+ return
+ !on_commit.empty() ||
+ !on_applied.empty() ||
+ !on_applied_sync.empty();
+ }
+
+ static void collect_contexts(
+ std::vector<Transaction>& t,
+ Context **out_on_applied,
+ Context **out_on_commit,
+ Context **out_on_applied_sync) {
+ ceph_assert(out_on_applied);
+ ceph_assert(out_on_commit);
+ ceph_assert(out_on_applied_sync);
+ std::list<Context *> on_applied, on_commit, on_applied_sync;
+ for (auto& i : t) {
+ on_applied.splice(on_applied.end(), i.on_applied);
+ on_commit.splice(on_commit.end(), i.on_commit);
+ on_applied_sync.splice(on_applied_sync.end(), i.on_applied_sync);
+ }
+ *out_on_applied = C_Contexts::list_to_context(on_applied);
+ *out_on_commit = C_Contexts::list_to_context(on_commit);
+ *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync);
+ }
+ static void collect_contexts(
+ std::vector<Transaction>& t,
+ std::list<Context*> *out_on_applied,
+ std::list<Context*> *out_on_commit,
+ std::list<Context*> *out_on_applied_sync) {
+ ceph_assert(out_on_applied);
+ ceph_assert(out_on_commit);
+ ceph_assert(out_on_applied_sync);
+ for (auto& i : t) {
+ out_on_applied->splice(out_on_applied->end(), i.on_applied);
+ out_on_commit->splice(out_on_commit->end(), i.on_commit);
+ out_on_applied_sync->splice(out_on_applied_sync->end(),
+ i.on_applied_sync);
+ }
+ }
+ static Context *collect_all_contexts(
+ Transaction& t) {
+ std::list<Context*> contexts;
+ contexts.splice(contexts.end(), t.on_applied);
+ contexts.splice(contexts.end(), t.on_commit);
+ contexts.splice(contexts.end(), t.on_applied_sync);
+ return C_Contexts::list_to_context(contexts);
+ }
+
+ Context *get_on_applied() {
+ return C_Contexts::list_to_context(on_applied);
+ }
+ Context *get_on_commit() {
+ return C_Contexts::list_to_context(on_commit);
+ }
+ Context *get_on_applied_sync() {
+ return C_Contexts::list_to_context(on_applied_sync);
+ }
+
+ void set_fadvise_flags(uint32_t flags) {
+ data.fadvise_flags = flags;
+ }
+ void set_fadvise_flag(uint32_t flag) {
+ data.fadvise_flags = data.fadvise_flags | flag;
+ }
+ uint32_t get_fadvise_flags() { return data.fadvise_flags; }
+
+ void swap(Transaction& other) noexcept {
+ std::swap(data, other.data);
+ std::swap(on_applied, other.on_applied);
+ std::swap(on_commit, other.on_commit);
+ std::swap(on_applied_sync, other.on_applied_sync);
+
+ std::swap(coll_index, other.coll_index);
+ std::swap(object_index, other.object_index);
+ std::swap(coll_id, other.coll_id);
+ std::swap(object_id, other.object_id);
+ op_bl.swap(other.op_bl);
+ data_bl.swap(other.data_bl);
+ }
+
+ void _update_op(Op* op,
+ std::vector<uint32_t> &cm,
+ std::vector<uint32_t> &om) {
+
+ switch (op->op) {
+ case OP_NOP:
+ break;
+
+ case OP_CREATE:
+ case OP_TOUCH:
+ case OP_REMOVE:
+ case OP_SETATTR:
+ case OP_SETATTRS:
+ case OP_RMATTR:
+ case OP_RMATTRS:
+ case OP_COLL_REMOVE:
+ case OP_OMAP_CLEAR:
+ case OP_OMAP_SETKEYS:
+ case OP_OMAP_RMKEYS:
+ case OP_OMAP_RMKEYRANGE:
+ case OP_OMAP_SETHEADER:
+ case OP_WRITE:
+ case OP_ZERO:
+ case OP_TRUNCATE:
+ case OP_SETALLOCHINT:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ break;
+
+ case OP_CLONERANGE2:
+ case OP_CLONE:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_MKCOLL:
+ case OP_RMCOLL:
+ case OP_COLL_SETATTR:
+ case OP_COLL_RMATTR:
+ case OP_COLL_SETATTRS:
+ case OP_COLL_HINT:
+ case OP_COLL_SET_BITS:
+ ceph_assert(op->cid < cm.size());
+ op->cid = cm[op->cid];
+ break;
+
+ case OP_COLL_ADD:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_cid < om.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ op->oid = om[op->oid];
+ break;
+
+ case OP_COLL_MOVE_RENAME:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_cid < cm.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_cid = cm[op->dest_cid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_TRY_RENAME:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->oid < om.size());
+ ceph_assert(op->dest_oid < om.size());
+ op->cid = cm[op->cid];
+ op->oid = om[op->oid];
+ op->dest_oid = om[op->dest_oid];
+ break;
+
+ case OP_SPLIT_COLLECTION2:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->dest_cid < cm.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ break;
+
+ case OP_MERGE_COLLECTION:
+ ceph_assert(op->cid < cm.size());
+ ceph_assert(op->dest_cid < cm.size());
+ op->cid = cm[op->cid];
+ op->dest_cid = cm[op->dest_cid];
+ break;
+
+ default:
+ ceph_abort_msg("Unknown OP");
+ }
+ }
+ void _update_op_bl(
+ ceph::buffer::list& bl,
+ std::vector<uint32_t> &cm,
+ std::vector<uint32_t> &om) {
+ for (auto& bp : bl.buffers()) {
+ ceph_assert(bp.length() % sizeof(Op) == 0);
+
+ char* raw_p = const_cast<char*>(bp.c_str());
+ char* raw_end = raw_p + bp.length();
+ while (raw_p < raw_end) {
+ _update_op(reinterpret_cast<Op*>(raw_p), cm, om);
+ raw_p += sizeof(Op);
+ }
+ }
+ }
+ /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction
+ void append(Transaction& other) {
+
+ data.ops = data.ops + other.data.ops;
+ if (other.data.largest_data_len > data.largest_data_len) {
+ data.largest_data_len = other.data.largest_data_len;
+ data.largest_data_off = other.data.largest_data_off;
+ data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl;
+ }
+ data.fadvise_flags = data.fadvise_flags | other.data.fadvise_flags;
+ on_applied.splice(on_applied.end(), other.on_applied);
+ on_commit.splice(on_commit.end(), other.on_commit);
+ on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync);
+
+ //append coll_index & object_index
+ std::vector<uint32_t> cm(other.coll_index.size());
+ std::map<coll_t, uint32_t>::iterator coll_index_p;
+ for (coll_index_p = other.coll_index.begin();
+ coll_index_p != other.coll_index.end();
+ ++coll_index_p) {
+ cm[coll_index_p->second] = _get_coll_id(coll_index_p->first);
+ }
+
+ std::vector<uint32_t> om(other.object_index.size());
+ std::map<ghobject_t, uint32_t>::iterator object_index_p;
+ for (object_index_p = other.object_index.begin();
+ object_index_p != other.object_index.end();
+ ++object_index_p) {
+ om[object_index_p->second] = _get_object_id(object_index_p->first);
+ }
+
+ //the other.op_bl SHOULD NOT be changed during the append operation,
+ //so we use an additional ceph::buffer::list to avoid this problem
+ ceph::buffer::list other_op_bl;
+ {
+ ceph::buffer::ptr other_op_bl_ptr(other.op_bl.length());
+ other.op_bl.begin().copy(other.op_bl.length(), other_op_bl_ptr.c_str());
+ other_op_bl.append(std::move(other_op_bl_ptr));
+ }
+
+ //update other_op_bl with cm & om
+ //When the other is appended to the current transaction, all coll_index and
+ //object_index entries in other's op buffer must be remapped to the new
+ //indices of the combined transaction
+ _update_op_bl(other_op_bl, cm, om);
+
+ //append op_bl
+ op_bl.append(other_op_bl);
+ //append data_bl
+ data_bl.append(other.data_bl);
+ }
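+
+ // Illustrative use of append(): combine per-op transactions before
+ // queueing (`t1` and `t2` are assumptions for this example). Note that
+ // the callback lists of the appended transaction are spliced out of it.
+ //
+ //   Transaction combined;
+ //   combined.append(t1);
+ //   combined.append(t2);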
+
+ /** Inquires about the Transaction as a whole. */
+
+ /// How big is the encoded Transaction buffer?
+ uint64_t get_encoded_bytes() {
+ //layout: data_bl + op_bl + coll_index + object_index + data
+
+ // coll_index size, object_index size and sizeof(transaction_data) are
+ // all accounted for here, so they may be computed at compile-time
+ size_t final_size = sizeof(__u32) * 2 + sizeof(data);
+
+ // coll_index second and object_index second
+ final_size += (coll_index.size() + object_index.size()) * sizeof(__u32);
+
+ // coll_index first
+ for (auto p = coll_index.begin(); p != coll_index.end(); ++p) {
+ final_size += p->first.encoded_size();
+ }
+
+ // object_index first
+ for (auto p = object_index.begin(); p != object_index.end(); ++p) {
+ final_size += p->first.encoded_size();
+ }
+
+ return data_bl.length() +
+ op_bl.length() +
+ final_size;
+ }
+
+ /// Retain old version for regression testing purposes
+ uint64_t get_encoded_bytes_test() {
+ using ceph::encode;
+ //layout: data_bl + op_bl + coll_index + object_index + data
+ ceph::buffer::list bl;
+ encode(coll_index, bl);
+ encode(object_index, bl);
+
+ return data_bl.length() +
+ op_bl.length() +
+ bl.length() +
+ sizeof(data);
+ }
+
+ uint64_t get_num_bytes() {
+ return get_encoded_bytes();
+ }
+ /// Size of the largest data buffer passed to a "write" operation encountered so far
+ uint32_t get_data_length() {
+ return data.largest_data_len;
+ }
+ /// offset within the encoded buffer to the start of the largest data buffer that's encoded
+ uint32_t get_data_offset() {
+ if (data.largest_data_off_in_data_bl) {
+ return data.largest_data_off_in_data_bl +
+ sizeof(__u8) + // encode struct_v
+ sizeof(__u8) + // encode compat_v
+ sizeof(__u32) + // encode len
+ sizeof(__u32); // data_bl len
+ }
+ return 0; // none
+ }
+ /// offset of buffer as aligned to destination within object.
+ int get_data_alignment() {
+ if (!data.largest_data_len)
+ return 0;
+ return (0 - get_data_offset()) & ~CEPH_PAGE_MASK;
+ }
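+ // Worked example for get_data_alignment() above, assuming
+ // CEPH_PAGE_MASK == ~(CEPH_PAGE_SIZE - 1) and a 4096-byte page: if
+ // get_data_offset() returns 22, the result is (0 - 22) & 4095 == 4074,
+ // i.e. starting the encoded buffer 4074 bytes into a page puts the
+ // largest data blob on a page boundary (4074 + 22 == 4096).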
+ /// Is the Transaction empty (no operations)
+ bool empty() {
+ return !data.ops;
+ }
+ /// Number of operations in the transaction
+ int get_num_ops() {
+ return data.ops;
+ }
+
+ /**
+ * iterator
+ *
+ * Helper object to parse Transactions.
+ *
+ * ObjectStore instances use this object to step down the encoded
+ * buffer decoding operation codes and parameters as we go.
+ *
+ */
+ class iterator {
+ Transaction *t;
+
+ uint64_t ops;
+ char* op_buffer_p;
+
+ ceph::buffer::list::const_iterator data_bl_p;
+
+ public:
+ std::vector<coll_t> colls;
+ std::vector<ghobject_t> objects;
+
+ private:
+ explicit iterator(Transaction *t)
+ : t(t),
+ data_bl_p(t->data_bl.cbegin()),
+ colls(t->coll_index.size()),
+ objects(t->object_index.size()) {
+
+ ops = t->data.ops;
+ op_buffer_p = t->op_bl.c_str();
+
+ std::map<coll_t, uint32_t>::iterator coll_index_p;
+ for (coll_index_p = t->coll_index.begin();
+ coll_index_p != t->coll_index.end();
+ ++coll_index_p) {
+ colls[coll_index_p->second] = coll_index_p->first;
+ }
+
+ std::map<ghobject_t, uint32_t>::iterator object_index_p;
+ for (object_index_p = t->object_index.begin();
+ object_index_p != t->object_index.end();
+ ++object_index_p) {
+ objects[object_index_p->second] = object_index_p->first;
+ }
+ }
+
+ friend class Transaction;
+
+ public:
+
+ bool have_op() {
+ return ops > 0;
+ }
+ Op* decode_op() {
+ ceph_assert(ops > 0);
+
+ Op* op = reinterpret_cast<Op*>(op_buffer_p);
+ op_buffer_p += sizeof(Op);
+ ops--;
+
+ return op;
+ }
+ std::string decode_string() {
+ using ceph::decode;
+ std::string s;
+ decode(s, data_bl_p);
+ return s;
+ }
+ void decode_bp(ceph::buffer::ptr& bp) {
+ using ceph::decode;
+ decode(bp, data_bl_p);
+ }
+ void decode_bl(ceph::buffer::list& bl) {
+ using ceph::decode;
+ decode(bl, data_bl_p);
+ }
+ void decode_attrset(std::map<std::string,ceph::buffer::ptr>& aset) {
+ using ceph::decode;
+ decode(aset, data_bl_p);
+ }
+ void decode_attrset(std::map<std::string,ceph::buffer::list>& aset) {
+ using ceph::decode;
+ decode(aset, data_bl_p);
+ }
+ void decode_attrset_bl(ceph::buffer::list *pbl) {
+ decode_str_str_map_to_bl(data_bl_p, pbl);
+ }
+ void decode_keyset(std::set<std::string> &keys){
+ using ceph::decode;
+ decode(keys, data_bl_p);
+ }
+ void decode_keyset_bl(ceph::buffer::list *pbl){
+ decode_str_set_to_bl(data_bl_p, pbl);
+ }
+
+ const ghobject_t &get_oid(uint32_t oid_id) {
+ ceph_assert(oid_id < objects.size());
+ return objects[oid_id];
+ }
+ const coll_t &get_cid(uint32_t cid_id) {
+ ceph_assert(cid_id < colls.size());
+ return colls[cid_id];
+ }
+ uint32_t get_fadvise_flags() const {
+ return t->get_fadvise_flags();
+ }
+
+ const std::vector<ghobject_t> &get_objects() const {
+ return objects;
+ }
+ };
+
+ iterator begin() {
+ return iterator(this);
+ }
+
+private:
+ void _build_actions_from_tbl();
+
+ /**
+ * Helper functions to encode the various mutation elements of a
+ * transaction. These are 1:1 with the operation codes (see
+ * enumeration above). These routines ensure that the
+ * encoder/creator of a transaction gets the right data in the
+ * right place. Sadly, there's no corresponding version nor any
+ * form of seat belts for the decoder.
+ */
+ Op* _get_next_op() {
+ if (op_bl.get_append_buffer_unused_tail_length() < sizeof(Op)) {
+ op_bl.reserve(sizeof(Op) * OPS_PER_PTR);
+ }
+ // append_hole ensures bptr merging. Even a huge number of ops
+ // shouldn't result in overpopulating bl::_buffers.
+ char* const p = op_bl.append_hole(sizeof(Op)).c_str();
+ memset(p, 0, sizeof(Op));
+ return reinterpret_cast<Op*>(p);
+ }
+ uint32_t _get_coll_id(const coll_t& coll) {
+ std::map<coll_t, uint32_t>::iterator c = coll_index.find(coll);
+ if (c != coll_index.end())
+ return c->second;
+
+ uint32_t index_id = coll_id++;
+ coll_index[coll] = index_id;
+ return index_id;
+ }
+ uint32_t _get_object_id(const ghobject_t& oid) {
+ std::map<ghobject_t, uint32_t>::iterator o = object_index.find(oid);
+ if (o != object_index.end())
+ return o->second;
+
+ uint32_t index_id = object_id++;
+ object_index[oid] = index_id;
+ return index_id;
+ }
+
+public:
+ /// noop. 'nuf said
+ void nop() {
+ Op* _op = _get_next_op();
+ _op->op = OP_NOP;
+ data.ops = data.ops + 1;
+ }
+ /**
+ * create
+ *
+ * create an object that does not yet exist
+ * (behavior is undefined if the object already exists)
+ */
+ void create(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_CREATE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * touch
+ *
+ * Ensure the existence of an object in a collection. Create an
+ * empty object if necessary
+ */
+ void touch(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TOUCH;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Write data to an offset within an object. If the object is too
+ * small, it is expanded as needed. It is possible to specify an
+ * offset beyond the current end of an object and it will be
+ * expanded as needed. Simple implementations of ObjectStore will
+ * just zero the data between the old end of the object and the
+ * newly provided data. More sophisticated implementations of
+ * ObjectStore will omit the untouched data and store it as a
+ * "hole" in the file.
+ *
+ * Note that a 0-length write does not affect the size of the object.
+ */
+ void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len,
+ const ceph::buffer::list& write_data, uint32_t flags = 0) {
+ using ceph::encode;
+ uint32_t orig_len = data_bl.length();
+ Op* _op = _get_next_op();
+ _op->op = OP_WRITE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ _op->len = len;
+ encode(write_data, data_bl);
+
+ ceph_assert(len == write_data.length());
+ data.fadvise_flags = data.fadvise_flags | flags;
+ if (write_data.length() > data.largest_data_len) {
+ data.largest_data_len = write_data.length();
+ data.largest_data_off = off;
+ data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to encode the bl, which is prefixed by a __u32 length
+ }
+ data.ops = data.ops + 1;
+ }
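+ // Illustrative example (a sketch, not part of this change): assuming a
+ // Transaction t, a collection cid, an object oid and a 4 KiB bufferlist bl,
+ //
+ //   t.write(cid, oid, 1 << 20, bl.length(), bl);
+ //
+ // extends an empty object to 1 MiB + 4 KiB; the untouched bytes in [0, 1 MiB)
+ // read back as zeros and may be stored as a hole by the backend.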
+ /**
+ * zero out the indicated byte range within an object. Some
+ * ObjectStore instances may optimize this to release the
+ * underlying storage space.
+ *
+ * If the zero range extends beyond the end of the object, the object
+ * size is extended, just as if we were writing a buffer full of zeros.
+ * EXCEPT if the length is 0, in which case (just like a 0-length write)
+ * we do not adjust the object size.
+ */
+ void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
+ Op* _op = _get_next_op();
+ _op->op = OP_ZERO;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ _op->len = len;
+ data.ops = data.ops + 1;
+ }
+ /// Discard all data in the object beyond the specified size.
+ void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TRUNCATE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->off = off;
+ data.ops = data.ops + 1;
+ }
+ /// Remove an object. All four parts of the object are removed.
+ void remove(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_REMOVE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /// Set an xattr of an object
+ void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, ceph::buffer::list& val) {
+ std::string n(name);
+ setattr(cid, oid, n, val);
+ }
+ /// Set an xattr of an object
+ void setattr(const coll_t& cid, const ghobject_t& oid, const std::string& s, ceph::buffer::list& val) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(s, data_bl);
+ encode(val, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// Set multiple xattrs of an object
+ void setattrs(const coll_t& cid, const ghobject_t& oid, const std::map<std::string,ceph::buffer::ptr>& attrset) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// Set multiple xattrs of an object
+ void setattrs(const coll_t& cid, const ghobject_t& oid, const std::map<std::string,ceph::buffer::list>& attrset) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_SETATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// remove an xattr from an object
+ void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) {
+ std::string n(name);
+ rmattr(cid, oid, n);
+ }
+ /// remove an xattr from an object
+ void rmattr(const coll_t& cid, const ghobject_t& oid, const std::string& s) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_RMATTR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(s, data_bl);
+ data.ops = data.ops + 1;
+ }
+ /// remove all xattrs from an object
+ void rmattrs(const coll_t& cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_RMATTRS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Clone an object into another object.
+ *
+ * Low-cost (e.g., O(1)) cloning (if supported) is best, but a
+ * fallback to an O(n) copy is allowed. All four parts of the
+ * object are cloned (data, xattrs, omap header, omap
+ * entries).
+ *
+ * The destination named object may already exist, in
+ * which case its previous contents are discarded.
+ */
+ void clone(const coll_t& cid, const ghobject_t& oid,
+ const ghobject_t& noid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_CLONE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_oid = _get_object_id(noid);
+ data.ops = data.ops + 1;
+ }
+ /**
+ * Clone a byte range from one object to another.
+ *
+ * The data portion of the destination object receives a copy of a
+ * portion of the data from the source object. None of the other
+ * three parts of an object is copied from the source.
+ *
+ * The destination object size may be extended to dstoff + len.
+ *
+ * The source range *must* overlap with the source object data. If it does
+ * not, the result is undefined.
+ */
+ void clone_range(const coll_t& cid, const ghobject_t& oid,
+ const ghobject_t& noid,
+ uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
+ Op* _op = _get_next_op();
+ _op->op = OP_CLONERANGE2;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_oid = _get_object_id(noid);
+ _op->off = srcoff;
+ _op->len = srclen;
+ _op->dest_off = dstoff;
+ data.ops = data.ops + 1;
+ }
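+ // Illustrative example (a sketch, not part of this change): assuming objects
+ // src and dst in collection cid, copy the first 64 KiB of src into dst
+ // starting at offset 128 KiB; dst's xattrs, omap header and omap entries are
+ // left untouched:
+ //
+ //   t.clone_range(cid, src, dst, 0, 64 << 10, 128 << 10);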
+
+ /// Create the collection
+ void create_collection(const coll_t& cid, int bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_MKCOLL;
+ _op->cid = _get_coll_id(cid);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ /**
+ * Give the collection a hint.
+ *
+ * @param cid - collection id.
+ * @param type - hint type.
+ * @param hint - the hint payload, which contains the customized
+ * data along with the hint type.
+ */
+ void collection_hint(const coll_t& cid, uint32_t type, const ceph::buffer::list& hint) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_HINT;
+ _op->cid = _get_coll_id(cid);
+ _op->hint = type;
+ encode(hint, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// remove the collection, the collection must be empty
+ void remove_collection(const coll_t& cid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_RMCOLL;
+ _op->cid = _get_coll_id(cid);
+ data.ops = data.ops + 1;
+ }
+ void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid)
+ __attribute__ ((deprecated)) {
+ // NOTE: we encode this as a fixed combo of ADD + REMOVE. They
+ // always appear together, so this is effectively a single MOVE.
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_ADD;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oid);
+ _op->dest_cid = _get_coll_id(cid);
+ data.ops = data.ops + 1;
+
+ _op = _get_next_op();
+ _op->op = OP_COLL_REMOVE;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ const coll_t &cid, const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_MOVE_RENAME;
+ _op->cid = _get_coll_id(oldcid);
+ _op->oid = _get_object_id(oldoid);
+ _op->dest_cid = _get_coll_id(cid);
+ _op->dest_oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ void try_rename(const coll_t &cid, const ghobject_t& oldoid,
+ const ghobject_t& oid) {
+ Op* _op = _get_next_op();
+ _op->op = OP_TRY_RENAME;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oldoid);
+ _op->dest_oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove omap from oid
+ void omap_clear(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid ///< [in] Object from which to remove omap
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_CLEAR;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data.ops = data.ops + 1;
+ }
+ /// Set keys on oid omap. Replaces duplicate keys.
+ void omap_setkeys(
+ const coll_t& cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
+ const std::map<std::string, ceph::buffer::list> &attrset ///< [in] Replacement keys and values
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(attrset, data_bl);
+ data.ops = data.ops + 1;
+ }
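+ // Illustrative example (a sketch, not part of this change):
+ //
+ //   std::map<std::string, ceph::buffer::list> kv;
+ //   kv["mykey"] = value_bl;   // hypothetical key and value
+ //   t.omap_setkeys(cid, oid, kv);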
+
+ /// Set keys on an oid omap (ceph::buffer::list variant).
+ void omap_setkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object to update
+ const ceph::buffer::list &attrset_bl ///< [in] Replacement keys and values
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data_bl.append(attrset_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove keys from oid omap
+ void omap_rmkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const std::set<std::string> &keys ///< [in] Keys to clear
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(keys, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove key from oid omap
+ void omap_rmkey(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const std::string& key ///< [in] Key to clear
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ using ceph::encode;
+ encode((uint32_t)1, data_bl);
+ encode(key, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove keys from oid omap
+ void omap_rmkeys(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const ceph::buffer::list &keys_bl ///< [in] Keys to clear
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYS;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data_bl.append(keys_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove key range from oid omap
+ void omap_rmkeyrange(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap keys
+ const std::string& first, ///< [in] first key in range
+ const std::string& last ///< [in] first key past range, range is [first,last)
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYRANGE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(first, data_bl);
+ encode(last, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Remove key range from oid omap
+ void omap_rmkeyrange(
+ const coll_t cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap keys
+ const bufferlist &keys_bl ///< [in] range of keys to clear
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_RMKEYRANGE;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ data_bl.append(keys_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Set omap header
+ void omap_setheader(
+ const coll_t &cid, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object
+ const ceph::buffer::list &bl ///< [in] Header value
+ ) {
+ using ceph::encode;
+ Op* _op = _get_next_op();
+ _op->op = OP_OMAP_SETHEADER;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ encode(bl, data_bl);
+ data.ops = data.ops + 1;
+ }
+
+ /// Split a collection based on the hash prefix: objects matching the
+ /// specified bits/rem are moved to the destination collection.
+ void split_collection(
+ const coll_t &cid,
+ uint32_t bits,
+ uint32_t rem,
+ const coll_t &destination) {
+ Op* _op = _get_next_op();
+ _op->op = OP_SPLIT_COLLECTION2;
+ _op->cid = _get_coll_id(cid);
+ _op->dest_cid = _get_coll_id(destination);
+ _op->split_bits = bits;
+ _op->split_rem = rem;
+ data.ops = data.ops + 1;
+ }
+
+ /// Merge collection into another.
+ void merge_collection(
+ coll_t cid,
+ coll_t destination,
+ uint32_t bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_MERGE_COLLECTION;
+ _op->cid = _get_coll_id(cid);
+ _op->dest_cid = _get_coll_id(destination);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ void collection_set_bits(
+ const coll_t &cid,
+ int bits) {
+ Op* _op = _get_next_op();
+ _op->op = OP_COLL_SET_BITS;
+ _op->cid = _get_coll_id(cid);
+ _op->split_bits = bits;
+ data.ops = data.ops + 1;
+ }
+
+ /// Set an allocation hint for an object.
+ /// Zero values for expected_object_size and expected_write_size are no-ops in all implementations.
+ void set_alloc_hint(
+ const coll_t &cid,
+ const ghobject_t &oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags
+ ) {
+ Op* _op = _get_next_op();
+ _op->op = OP_SETALLOCHINT;
+ _op->cid = _get_coll_id(cid);
+ _op->oid = _get_object_id(oid);
+ _op->expected_object_size = expected_object_size;
+ _op->expected_write_size = expected_write_size;
+ _op->hint = flags;
+ data.ops = data.ops + 1;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ //layout: data_bl + op_bl + coll_index + object_index + data
+ ENCODE_START(9, 9, bl);
+ encode(data_bl, bl);
+ encode(op_bl, bl);
+ encode(coll_index, bl);
+ encode(object_index, bl);
+ data.encode(bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(9, bl);
+ DECODE_OLDEST(9);
+
+ decode(data_bl, bl);
+ decode(op_bl, bl);
+ decode(coll_index, bl);
+ decode(object_index, bl);
+ data.decode(bl);
+ coll_id = coll_index.size();
+ object_id = object_index.size();
+
+ DECODE_FINISH(bl);
+ }
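+ // Illustrative round trip (a sketch, not part of this change): the five
+ // sections are encoded and decoded in the same order, so
+ //
+ //   ceph::buffer::list bl;
+ //   t.encode(bl);
+ //   Transaction t2;
+ //   auto p = bl.cbegin();
+ //   t2.decode(p);
+ //
+ // rebuilds an equivalent transaction; coll_id and object_id are restored from
+ // the sizes of the decoded index maps.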
+
+ void dump(ceph::Formatter *f);
+ static void generate_test_instances(std::list<Transaction*>& o);
+};
+WRITE_CLASS_ENCODER(Transaction)
+WRITE_CLASS_ENCODER(Transaction::TransactionData)
+
+std::ostream& operator<<(std::ostream& out, const Transaction& tx);
+
+}
diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc
new file mode 100644
index 000000000..f92821f0c
--- /dev/null
+++ b/src/os/bluestore/Allocator.cc
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Allocator.h"
+#include "StupidAllocator.h"
+#include "BitmapAllocator.h"
+#include "AvlAllocator.h"
+#include "HybridAllocator.h"
+#ifdef HAVE_LIBZBD
+#include "ZonedAllocator.h"
+#endif
+#include "common/debug.h"
+#include "common/admin_socket.h"
+#define dout_subsys ceph_subsys_bluestore
+
+using std::string;
+using std::to_string;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+class Allocator::SocketHook : public AdminSocketHook {
+ Allocator *alloc;
+
+ friend class Allocator;
+ std::string name;
+public:
+ explicit SocketHook(Allocator *alloc,
+ const std::string& _name) :
+ alloc(alloc), name(_name)
+ {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ if (name.empty()) {
+ name = to_string((uintptr_t)this);
+ }
+ if (admin_socket) {
+ int r = admin_socket->register_command(
+ ("bluestore allocator dump " + name).c_str(),
+ this,
+ "dump allocator free regions");
+ if (r != 0)
+ alloc = nullptr; //some collision, disable
+ if (alloc) {
+ r = admin_socket->register_command(
+ ("bluestore allocator score " + name).c_str(),
+ this,
+ "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ ("bluestore allocator fragmentation " + name).c_str(),
+ this,
+ "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+ ceph_assert(r == 0);
+ }
+ }
+ }
+ ~SocketHook()
+ {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ if (admin_socket && alloc) {
+ admin_socket->unregister_commands(this);
+ }
+ }
+
+ int call(std::string_view command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override {
+ int r = 0;
+ if (command == "bluestore allocator dump " + name) {
+ f->open_object_section("allocator_dump");
+ f->dump_unsigned("capacity", alloc->get_capacity());
+ f->dump_unsigned("alloc_unit", alloc->get_block_size());
+ f->dump_string("alloc_type", alloc->get_type());
+ f->dump_string("alloc_name", name);
+
+ f->open_array_section("extents");
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ ceph_assert(len > 0);
+ f->open_object_section("free");
+ char off_hex[30];
+ char len_hex[30];
+ snprintf(off_hex, sizeof(off_hex) - 1, "0x%lx", off);
+ snprintf(len_hex, sizeof(len_hex) - 1, "0x%lx", len);
+ f->dump_string("offset", off_hex);
+ f->dump_string("length", len_hex);
+ f->close_section();
+ };
+ alloc->foreach(iterated_allocation);
+ f->close_section();
+ f->close_section();
+ } else if (command == "bluestore allocator score " + name) {
+ f->open_object_section("fragmentation_score");
+ f->dump_float("fragmentation_rating", alloc->get_fragmentation_score());
+ f->close_section();
+ } else if (command == "bluestore allocator fragmentation " + name) {
+ f->open_object_section("fragmentation");
+ f->dump_float("fragmentation_rating", alloc->get_fragmentation());
+ f->close_section();
+ } else {
+ ss << "Invalid command" << std::endl;
+ r = -ENOSYS;
+ }
+ return r;
+ }
+
+};
+Allocator::Allocator(const std::string& name,
+ int64_t _capacity,
+ int64_t _block_size)
+ : capacity(_capacity), block_size(_block_size)
+{
+ asok_hook = new SocketHook(this, name);
+}
+
+
+Allocator::~Allocator()
+{
+ delete asok_hook;
+}
+
+const string& Allocator::get_name() const {
+ return asok_hook->name;
+}
+
+Allocator *Allocator::create(CephContext* cct, string type,
+ int64_t size, int64_t block_size, const std::string& name)
+{
+ Allocator* alloc = nullptr;
+ if (type == "stupid") {
+ alloc = new StupidAllocator(cct, name, size, block_size);
+ } else if (type == "bitmap") {
+ alloc = new BitmapAllocator(cct, size, block_size, name);
+ } else if (type == "avl") {
+ return new AvlAllocator(cct, size, block_size, name);
+ } else if (type == "hybrid") {
+ return new HybridAllocator(cct, size, block_size,
+ cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"),
+ name);
+#ifdef HAVE_LIBZBD
+ } else if (type == "zoned") {
+ return new ZonedAllocator(cct, size, block_size, name);
+#endif
+ }
+ if (alloc == nullptr) {
+ lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
+ << type << dendl;
+ }
+ return alloc;
+}
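+// Illustrative usage (a sketch, not part of this change):
+//
+//   Allocator* a = Allocator::create(cct, "avl", device_size, block_size, "osd0");
+//   // an unrecognized type logs an error and returns nullptr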
+
+void Allocator::release(const PExtentVector& release_vec)
+{
+ interval_set<uint64_t> release_set;
+ for (auto e : release_vec) {
+ release_set.insert(e.offset, e.length);
+ }
+ release(release_set);
+}
+
+/**
+ * Gives fragmentation a numeric value.
+ *
+ * The following algorithm assigns a value to every free (unallocated) block.
+ * The value of a single block is the product of its size and a per-byte value;
+ * the per-byte value grows with block size: if a block of size X has per-byte
+ * value p, a block of size 2*X has per-byte value 1.1*p.
+ *
+ * This could be expressed with logarithms, but for speed it is linearly
+ * interpolated within power-of-two ranges:
+ *   [1] -> 1.1, [2..3] -> 1.1^2, [4..7] -> 1.1^3, [8..15] -> 1.1^4, ...
+ *
+ * The final score is the position of the actual total between the score that
+ * would be obtained under absolute fragmentation and the score with no
+ * fragmentation at all (0 = no fragmentation, 1 = absolute fragmentation).
+ */
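+// Worked example (illustrative): a free extent of length 6 falls into the
+// [4..7] bucket (sc = 2, x = (6 - 4) / 4 = 0.5), so its score is
+//   4 * 1.1^2 * 0.5 + 8 * 1.1^3 * 0.5 = 2.42 + 5.324 = 7.744.
+// With that single extent free, score_sum == ideal and the result is 0 (no
+// fragmentation); if the same 6 units were six length-1 extents, score_sum
+// would equal terrible and the result would be 1.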
+double Allocator::get_fragmentation_score()
+{
+ // how much more 2X bytes in one chunk are worth than in two X-byte chunks
+ static const double double_size_worth = 1.1;
+ std::vector<double> scales{1};
+ double score_sum = 0;
+ size_t sum = 0;
+
+ auto get_score = [&](size_t v) -> double {
+ size_t sc = sizeof(v) * 8 - clz(v) - 1; //assign to grade depending on log2(len)
+ while (scales.size() <= sc + 1) {
+ // unlikely: expand the scales vector as needed
+ scales.push_back(scales[scales.size() - 1] * double_size_worth);
+ }
+
+ size_t sc_shifted = size_t(1) << sc;
+ double x = double(v - sc_shifted) / sc_shifted; //x is <0,1) in its scale grade
+ // linear extrapolation in its scale grade
+ double score = (sc_shifted ) * scales[sc] * (1-x) +
+ (sc_shifted * 2) * scales[sc+1] * x;
+ return score;
+ };
+
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ ceph_assert(len > 0);
+ score_sum += get_score(len);
+ sum += len;
+ };
+ foreach(iterated_allocation);
+
+
+ double ideal = get_score(sum);
+ double terrible = sum * get_score(1);
+ return (ideal - score_sum) / (ideal - terrible);
+}
diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h
new file mode 100644
index 000000000..38ea4f997
--- /dev/null
+++ b/src/os/bluestore/Allocator.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_OS_BLUESTORE_ALLOCATOR_H
+#define CEPH_OS_BLUESTORE_ALLOCATOR_H
+
+#include <functional>
+#include <ostream>
+#include "include/ceph_assert.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+
+class Allocator {
+public:
+ explicit Allocator(const std::string& name,
+ int64_t _capacity,
+ int64_t _block_size);
+ virtual ~Allocator();
+
+ /*
+ * returns the allocator type name, as used in the config
+ */
+ virtual const char* get_type() const = 0;
+
+ /*
+ * Allocate the requested space as some number of extents.
+ * The minimum and maximum extent sizes are bounded by:
+ * a. the allocation unit, and
+ * b. max_alloc_size,
+ * since no extent can be smaller than block_size or larger than max_alloc_size.
+ * Within these limits, extent sizes vary according to the free-block search
+ * algorithm and the availability of contiguous space.
+ */
+ virtual int64_t allocate(uint64_t want_size, uint64_t block_size,
+ uint64_t max_alloc_size, int64_t hint,
+ PExtentVector *extents) = 0;
+
+ int64_t allocate(uint64_t want_size, uint64_t block_size,
+ int64_t hint, PExtentVector *extents) {
+ return allocate(want_size, block_size, want_size, hint, extents);
+ }
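+ // Illustrative usage (a sketch, not part of this change):
+ //
+ //   PExtentVector extents;
+ //   int64_t got = alloc->allocate(0x100000, 0x10000, 0 /* hint */, &extents);
+ //   if (got < 0) { /* typically -ENOSPC */ }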
+
+ /* Bulk release. Implementations may override this method to handle the whole
+ * set at once. This could avoid, e.g., an unnecessary mutex dance. */
+ virtual void release(const interval_set<uint64_t>& release_set) = 0;
+ void release(const PExtentVector& release_set);
+
+ virtual void dump() = 0;
+ virtual void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) = 0;
+
+ virtual void zoned_set_zone_states(std::vector<zone_state_t> &&_zone_states) {}
+ virtual bool zoned_get_zones_to_clean(std::deque<uint64_t> *zones_to_clean) {
+ return false;
+ }
+
+ virtual void init_add_free(uint64_t offset, uint64_t length) = 0;
+ virtual void init_rm_free(uint64_t offset, uint64_t length) = 0;
+
+ virtual uint64_t get_free() = 0;
+ virtual double get_fragmentation()
+ {
+ return 0.0;
+ }
+ virtual double get_fragmentation_score();
+ virtual void shutdown() = 0;
+
+ static Allocator *create(CephContext* cct, std::string type, int64_t size,
+ int64_t block_size, const std::string& name = "");
+
+
+ const std::string& get_name() const;
+ int64_t get_capacity() const
+ {
+ return capacity;
+ }
+ int64_t get_block_size() const
+ {
+ return block_size;
+ }
+
+private:
+ class SocketHook;
+ SocketHook* asok_hook = nullptr;
+
+ int64_t capacity = 0;
+ int64_t block_size = 0;
+};
+
+#endif
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc
new file mode 100644
index 000000000..5f17a3689
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.cc
@@ -0,0 +1,474 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AvlAllocator.h"
+
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "AvlAllocator "
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);
+
+namespace {
+ // a light-weight "range_seg_t", which is only used as the key when searching
+ // in range_tree and range_size_tree
+ struct range_t {
+ uint64_t start;
+ uint64_t end;
+ };
+}
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
+ uint64_t size,
+ uint64_t align)
+{
+ const auto compare = range_tree.key_comp();
+ uint32_t search_count = 0;
+ uint64_t search_bytes = 0;
+ auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
+ for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
+ uint64_t offset = p2roundup(rs->start, align);
+ *cursor = offset + size;
+ if (offset + size <= rs->end) {
+ return offset;
+ }
+ if (max_search_count > 0 && ++search_count > max_search_count) {
+ return -1ULL;
+ }
+ if (search_bytes = rs->start - rs_start->start;
+ max_search_bytes > 0 && search_bytes > max_search_bytes) {
+ return -1ULL;
+ }
+ }
+
+ if (*cursor == 0) {
+ // If we already started from the beginning, don't bother searching it again
+ return -1ULL;
+ }
+ // If we reached end, start from beginning till cursor.
+ for (auto rs = range_tree.begin(); rs != rs_start; ++rs) {
+ uint64_t offset = p2roundup(rs->start, align);
+ *cursor = offset + size;
+ if (offset + size <= rs->end) {
+ return offset;
+ }
+ if (max_search_count > 0 && ++search_count > max_search_count) {
+ return -1ULL;
+ }
+ if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) {
+ return -1ULL;
+ }
+ }
+ return -1ULL;
+}
+
+uint64_t AvlAllocator::_pick_block_fits(uint64_t size,
+ uint64_t align)
+{
+ // instead of searching from cursor, just pick the smallest range which fits
+ // the needs
+ const auto compare = range_size_tree.key_comp();
+ auto rs_start = range_size_tree.lower_bound(range_t{0, size}, compare);
+ for (auto rs = rs_start; rs != range_size_tree.end(); ++rs) {
+ uint64_t offset = p2roundup(rs->start, align);
+ if (offset + size <= rs->end) {
+ return offset;
+ }
+ }
+ return -1ULL;
+}
+
+void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+ ceph_assert(size != 0);
+
+ uint64_t end = start + size;
+
+ auto rs_after = range_tree.upper_bound(range_t{start, end},
+ range_tree.key_comp());
+
+ /* Make sure we don't overlap with either of our neighbors */
+ auto rs_before = range_tree.end();
+ if (rs_after != range_tree.begin()) {
+ rs_before = std::prev(rs_after);
+ }
+
+ bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
+ bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
+
+ if (merge_before && merge_after) {
+ _range_size_tree_rm(*rs_before);
+ _range_size_tree_rm(*rs_after);
+ rs_after->start = rs_before->start;
+ range_tree.erase_and_dispose(rs_before, dispose_rs{});
+ _range_size_tree_try_insert(*rs_after);
+ } else if (merge_before) {
+ _range_size_tree_rm(*rs_before);
+ rs_before->end = end;
+ _range_size_tree_try_insert(*rs_before);
+ } else if (merge_after) {
+ _range_size_tree_rm(*rs_after);
+ rs_after->start = start;
+ _range_size_tree_try_insert(*rs_after);
+ } else {
+ _try_insert_range(start, end, &rs_after);
+ }
+}
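+// Illustrative (not part of this change): with free segments [0, 0x1000) and
+// [0x2000, 0x3000) in the tree, _add_to_tree(0x1000, 0x1000) merges all three
+// ranges into a single segment [0, 0x3000) in both the offset and size trees.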
+
+void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
+ AvlAllocator::range_tree_t::iterator& rs)
+{
+ bool left_over = (rs->start != start);
+ bool right_over = (rs->end != end);
+
+ _range_size_tree_rm(*rs);
+
+ if (left_over && right_over) {
+ auto old_right_end = rs->end;
+ auto insert_pos = rs;
+ ceph_assert(insert_pos != range_tree.end());
+ ++insert_pos;
+ rs->end = start;
+
+ // Insert the tail first to be sure insert_pos hasn't been disposed.
+ // This wouldn't dispose rs though, since it's out of range_size_tree.
+ // We don't care about the small chance of a 'not-the-best-choice-for-removal'
+ // case, which might happen if rs has the lowest size.
+ _try_insert_range(end, old_right_end, &insert_pos);
+ _range_size_tree_try_insert(*rs);
+
+ } else if (left_over) {
+ rs->end = start;
+ _range_size_tree_try_insert(*rs);
+ } else if (right_over) {
+ rs->start = end;
+ _range_size_tree_try_insert(*rs);
+ } else {
+ range_tree.erase_and_dispose(rs, dispose_rs{});
+ }
+}
+
+void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
+{
+ uint64_t end = start + size;
+
+ ceph_assert(size != 0);
+ ceph_assert(size <= num_free);
+
+ auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
+ /* Make sure we completely overlap with someone */
+ ceph_assert(rs != range_tree.end());
+ ceph_assert(rs->start <= start);
+ ceph_assert(rs->end >= end);
+
+ _process_range_removal(start, end, rs);
+}
+
+void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t, uint64_t, bool)> cb)
+{
+ uint64_t end = start + size;
+
+ ceph_assert(size != 0);
+
+ auto rs = range_tree.find(range_t{ start, end },
+ range_tree.key_comp());
+
+ if (rs == range_tree.end() || rs->start >= end) {
+ cb(start, size, false);
+ return;
+ }
+
+ do {
+
+ auto next_rs = rs;
+ ++next_rs;
+
+ if (start < rs->start) {
+ cb(start, rs->start - start, false);
+ start = rs->start;
+ }
+ auto range_end = std::min(rs->end, end);
+ _process_range_removal(start, range_end, rs);
+ cb(start, range_end - start, true);
+ start = range_end;
+
+ rs = next_rs;
+ } while (rs != range_tree.end() && rs->start < end && start < end);
+ if (start < end) {
+ cb(start, end - start, false);
+ }
+}
+
+int64_t AvlAllocator::_allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ uint64_t allocated = 0;
+ while (allocated < want) {
+ uint64_t offset, length;
+ int r = _allocate(std::min(max_alloc_size, want - allocated),
+ unit, &offset, &length);
+ if (r < 0) {
+ // Allocation failed.
+ break;
+ }
+ extents->emplace_back(offset, length);
+ allocated += length;
+ }
+ return allocated ? allocated : -ENOSPC;
+}
+
+int AvlAllocator::_allocate(
+ uint64_t size,
+ uint64_t unit,
+ uint64_t *offset,
+ uint64_t *length)
+{
+ uint64_t max_size = 0;
+ if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
+ max_size = p->end - p->start;
+ }
+
+ bool force_range_size_alloc = false;
+ if (max_size < size) {
+ if (max_size < unit) {
+ return -ENOSPC;
+ }
+ size = p2align(max_size, unit);
+ ceph_assert(size > 0);
+ force_range_size_alloc = true;
+ }
+
+ const int free_pct = num_free * 100 / num_total;
+ uint64_t start = 0;
+ // If we're running low on space, find a range by size by looking up in the size
+ // sorted tree (best-fit), instead of searching in the area pointed by cursor
+ if (force_range_size_alloc ||
+ max_size < range_size_alloc_threshold ||
+ free_pct < range_size_alloc_free_pct) {
+ start = -1ULL;
+ } else {
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area (i.e. same cursor bucket) but it does
+ * not guarantee that other allocations sizes may exist in the same
+ * region.
+ */
+ uint64_t align = size & -size;
+ ceph_assert(align != 0);
+ uint64_t* cursor = &lbas[cbits(align) - 1];
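+ // e.g. (illustrative): for size = 0x30000 the largest power-of-2 divisor is
+ // align = size & -size = 0x10000, so the cursor used is
+ // lbas[cbits(0x10000) - 1], i.e. lbas[16].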
+ start = _pick_block_after(cursor, size, unit);
+ dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
+ }
+ if (start == -1ULL) {
+ do {
+ start = _pick_block_fits(size, unit);
+ dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
+ if (start != uint64_t(-1ULL)) {
+ break;
+ }
+ // try to collect smaller extents as we could fail to retrieve
+ // that large block due to misaligned extents
+ size = p2align(size >> 1, unit);
+ } while (size >= unit);
+ }
+ if (start == -1ULL) {
+ return -ENOSPC;
+ }
+
+ _remove_from_tree(start, size);
+
+ *offset = start;
+ *length = size;
+ return 0;
+}
+
+void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
+{
+ for (auto p = release_set.begin(); p != release_set.end(); ++p) {
+ const auto offset = p.get_start();
+ const auto length = p.get_len();
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _add_to_tree(offset, length);
+ }
+}
+
+void AvlAllocator::_release(const PExtentVector& release_set) {
+ for (auto& e : release_set) {
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << e.offset
+ << " length 0x" << e.length
+ << std::dec << dendl;
+ _add_to_tree(e.offset, e.length);
+ }
+}
+
+void AvlAllocator::_shutdown()
+{
+ range_size_tree.clear();
+ range_tree.clear_and_dispose(dispose_rs{});
+}
+
+AvlAllocator::AvlAllocator(CephContext* cct,
+ int64_t device_size,
+ int64_t block_size,
+ uint64_t max_mem,
+ const std::string& name) :
+ Allocator(name, device_size, block_size),
+ num_total(device_size),
+ block_size(block_size),
+ range_size_alloc_threshold(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+ range_size_alloc_free_pct(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+ max_search_count(
+ cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
+ max_search_bytes(
+ cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")),
+ range_count_cap(max_mem / sizeof(range_seg_t)),
+ cct(cct)
+{}
+
+AvlAllocator::AvlAllocator(CephContext* cct,
+ int64_t device_size,
+ int64_t block_size,
+ const std::string& name) :
+ AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
+{}
+
+AvlAllocator::~AvlAllocator()
+{
+ shutdown();
+}
+
+int64_t AvlAllocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint, // unused, for now!
+ PExtentVector* extents)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(isp2(unit));
+ ceph_assert(want % unit == 0);
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want;
+ }
+ if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+ max_alloc_size >= cap) {
+ max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
+ }
+ std::lock_guard l(lock);
+ return _allocate(want, unit, max_alloc_size, hint, extents);
+}
+
+void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
+ std::lock_guard l(lock);
+ _release(release_set);
+}
+
+uint64_t AvlAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return num_free;
+}
+
+double AvlAllocator::get_fragmentation()
+{
+ std::lock_guard l(lock);
+ return _get_fragmentation();
+}
+
+void AvlAllocator::dump()
+{
+ std::lock_guard l(lock);
+ _dump();
+}
+
+void AvlAllocator::_dump() const
+{
+ ldout(cct, 0) << __func__ << " range_tree: " << dendl;
+ for (auto& rs : range_tree) {
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.start << "~" << rs.end
+ << std::dec
+ << dendl;
+ }
+ ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
+ for (auto& rs : range_size_tree) {
+ ldout(cct, 0) << std::hex
+ << "0x" << rs.start << "~" << rs.end
+ << std::dec
+ << dendl;
+ }
+}
+
+void AvlAllocator::foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ std::lock_guard l(lock);
+ _foreach(notify);
+}
+
+void AvlAllocator::_foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) const
+{
+ for (auto& rs : range_tree) {
+ notify(rs.start, rs.end - rs.start);
+ }
+}
+
+void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _add_to_tree(offset, length);
+}
+
+void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _remove_from_tree(offset, length);
+}
+
+void AvlAllocator::shutdown()
+{
+ std::lock_guard l(lock);
+ _shutdown();
+}
diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h
new file mode 100644
index 000000000..f47ee5be0
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.h
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+#include <boost/intrusive/avl_set.hpp>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+
+struct range_seg_t {
+ MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
+ uint64_t start; ///< starting offset of this segment
+ uint64_t end; ///< ending offset (non-inclusive)
+
+ range_seg_t(uint64_t start, uint64_t end)
+ : start{start},
+ end{end}
+ {}
+ // Tree is sorted by offset, greater offsets at the end of the tree.
+ struct before_t {
+ template<typename KeyLeft, typename KeyRight>
+ bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
+ return lhs.end <= rhs.start;
+ }
+ };
+ boost::intrusive::avl_set_member_hook<> offset_hook;
+
+ // Tree is sorted by size, larger sizes at the end of the tree.
+ struct shorter_t {
+ template<typename KeyType>
+ bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
+ auto lhs_size = lhs.end - lhs.start;
+ auto rhs_size = rhs.end - rhs.start;
+ if (lhs_size < rhs_size) {
+ return true;
+ } else if (lhs_size > rhs_size) {
+ return false;
+ } else {
+ return lhs.start < rhs.start;
+ }
+ }
+ };
+ inline uint64_t length() const {
+ return end - start;
+ }
+ boost::intrusive::avl_set_member_hook<> size_hook;
+};
+
+class AvlAllocator : public Allocator {
+ struct dispose_rs {
+ void operator()(range_seg_t* p)
+ {
+ delete p;
+ }
+ };
+
+protected:
+ /*
+ * ctor intended for use by descendant class(es) that provide
+ * handling for spilled-over entries
+ * (when entry count >= max_entries)
+ */
+ AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ uint64_t max_mem,
+ const std::string& name);
+
+public:
+ AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+ const std::string& name);
+ ~AvlAllocator();
+ const char* get_type() const override
+ {
+ return "avl";
+ }
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents) override;
+ void release(const interval_set<uint64_t>& release_set) override;
+ int64_t get_capacity() const {
+ return num_total;
+ }
+
+ uint64_t get_block_size() const {
+ return block_size;
+ }
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) override;
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+ void shutdown() override;
+
+private:
+ // pick a range by search from cursor forward
+ uint64_t _pick_block_after(
+ uint64_t *cursor,
+ uint64_t size,
+ uint64_t align);
+ // pick a range with exactly the same size or larger
+ uint64_t _pick_block_fits(
+ uint64_t size,
+ uint64_t align);
+ int _allocate(
+ uint64_t size,
+ uint64_t unit,
+ uint64_t *offset,
+ uint64_t *length);
+
+ using range_tree_t =
+ boost::intrusive::avl_set<
+ range_seg_t,
+ boost::intrusive::compare<range_seg_t::before_t>,
+ boost::intrusive::member_hook<
+ range_seg_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &range_seg_t::offset_hook>>;
+ range_tree_t range_tree; ///< main range tree
+ /*
+ * The range_size_tree should always contain the
+ * same number of segments as the range_tree.
+ * The only difference is that the range_size_tree
+ * is ordered by segment sizes.
+ */
+ using range_size_tree_t =
+ boost::intrusive::avl_multiset<
+ range_seg_t,
+ boost::intrusive::compare<range_seg_t::shorter_t>,
+ boost::intrusive::member_hook<
+ range_seg_t,
+ boost::intrusive::avl_set_member_hook<>,
+ &range_seg_t::size_hook>,
+ boost::intrusive::constant_time_size<true>>;
+ range_size_tree_t range_size_tree;
+
+ const int64_t num_total; ///< device size
+ const uint64_t block_size; ///< block size
+ uint64_t num_free = 0; ///< total bytes in freelist
+
+ /*
+ * This value defines the number of elements in the lbas array.
+ * The value of 64 was chosen as it covers all power-of-2 buckets
+ * up to UINT64_MAX.
+ * This is the equivalent of the highest bit of UINT64_MAX.
+ */
+ static constexpr unsigned MAX_LBAS = 64;
+ uint64_t lbas[MAX_LBAS] = {0};
+
+ /*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the allocator cannot satisfy
+ * an allocation of this size, it switches to a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+ uint64_t range_size_alloc_threshold = 0;
+ /*
+ * The minimum free space, in percent, which must be available
+ * in allocator to continue allocations in a first-fit fashion.
+ * in the allocator to continue allocations in a first-fit fashion.
+ * switch to using best-fit allocations.
+ */
+ int range_size_alloc_free_pct = 0;
+ /*
+ * Maximum number of segments to check in first-fit mode. Without this
+ * limit, a fragmented device can see lots of iterations and _block_picker()
+ * becomes the performance limiting factor on high-performance storage.
+ */
+ const uint32_t max_search_count;
+ /*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, a fragmented device can see lots of iterations and _block_picker()
+ * becomes the performance limiting factor on high-performance storage.
+ */
+ const uint32_t max_search_bytes;
+ /*
+ * Maximum number of range entries allowed; 0 means unlimited.
+ */
+ uint64_t range_count_cap = 0;
+
+ void _range_size_tree_rm(range_seg_t& r) {
+ ceph_assert(num_free >= r.length());
+ num_free -= r.length();
+ range_size_tree.erase(r);
+
+ }
+ void _range_size_tree_try_insert(range_seg_t& r) {
+ if (_try_insert_range(r.start, r.end)) {
+ range_size_tree.insert(r);
+ num_free += r.length();
+ } else {
+ range_tree.erase_and_dispose(r, dispose_rs{});
+ }
+ }
+ bool _try_insert_range(uint64_t start,
+ uint64_t end,
+ range_tree_t::iterator* insert_pos = nullptr) {
+ bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
+ bool remove_lowest = false;
+ if (!res) {
+ if (end - start > _lowest_size_available()) {
+ remove_lowest = true;
+ res = true;
+ }
+ }
+ if (!res) {
+ _spillover_range(start, end);
+ } else {
+ // NB: we should do insertion before the following removal
+ // to avoid potential iterator disposal insertion might depend on.
+ if (insert_pos) {
+ auto new_rs = new range_seg_t{ start, end };
+ range_tree.insert_before(*insert_pos, *new_rs);
+ range_size_tree.insert(*new_rs);
+ num_free += new_rs->length();
+ }
+ if (remove_lowest) {
+ auto r = range_size_tree.begin();
+ _range_size_tree_rm(*r);
+ _spillover_range(r->start, r->end);
+ range_tree.erase_and_dispose(*r, dispose_rs{});
+ }
+ }
+ return res;
+ }
+ virtual void _spillover_range(uint64_t start, uint64_t end) {
+ // this should be overridden when a range count cap is present,
+ // i.e. (range_count_cap > 0)
+ ceph_assert(false);
+ }
+protected:
+ // called when extent to be released/marked free
+ virtual void _add_to_tree(uint64_t start, uint64_t size);
+
+protected:
+ CephContext* cct;
+ std::mutex lock;
+
+ double _get_fragmentation() const {
+ auto free_blocks = p2align(num_free, block_size) / block_size;
+ if (free_blocks <= 1) {
+ return .0;
+ }
+ return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
+ }
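+ // e.g. (illustrative): 4 free segments covering 16 free blocks in total give
+ // (4 - 1) / (16 - 1) = 0.2; a single contiguous free segment always scores 0.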
+ void _dump() const;
+ void _foreach(std::function<void(uint64_t offset, uint64_t length)>) const;
+
+ uint64_t _lowest_size_available() {
+ auto rs = range_size_tree.begin();
+ return rs != range_size_tree.end() ? rs->length() : 0;
+ }
+
+ int64_t _allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents);
+
+ void _release(const interval_set<uint64_t>& release_set);
+ void _release(const PExtentVector& release_set);
+ void _shutdown();
+
+ void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
+ void _remove_from_tree(uint64_t start, uint64_t size);
+ void _try_remove_from_tree(uint64_t start, uint64_t size,
+ std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
+
+ uint64_t _get_free() const {
+ return num_free;
+ }
+};
diff --git a/src/os/bluestore/BitmapAllocator.cc b/src/os/bluestore/BitmapAllocator.cc
new file mode 100644
index 000000000..9304cb3f2
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapAllocator.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "fbmap_alloc " << this << " "
+
+BitmapAllocator::BitmapAllocator(CephContext* _cct,
+ int64_t capacity,
+ int64_t alloc_unit,
+ const std::string& name) :
+ Allocator(name, capacity, alloc_unit),
+ cct(_cct)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/"
+ << alloc_unit << std::dec << dendl;
+ _init(capacity, alloc_unit, false);
+}
+
+int64_t BitmapAllocator::allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents)
+{
+ uint64_t allocated = 0;
+ size_t old_size = extents->size();
+ ldout(cct, 10) << __func__ << std::hex << " 0x" << want_size
+ << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+ << std::dec << dendl;
+
+
+ _allocate_l2(want_size, alloc_unit, max_alloc_size, hint,
+ &allocated, extents);
+ if (!allocated) {
+ return -ENOSPC;
+ }
+ for (auto i = old_size; i < extents->size(); ++i) {
+ auto& e = (*extents)[i];
+ ldout(cct, 10) << __func__
+ << " extent: 0x" << std::hex << e.offset << "~" << e.length
+ << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+ << std::dec << dendl;
+ }
+ return int64_t(allocated);
+}
+
+void BitmapAllocator::release(
+ const interval_set<uint64_t>& release_set)
+{
+ for (auto r : release_set) {
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << r.first << "~" << r.second
+ << std::dec << dendl;
+ }
+ _free_l2(release_set);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+
+void BitmapAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+
+ auto mas = get_min_alloc_size();
+ uint64_t offs = round_up_to(offset, mas);
+ uint64_t l = p2align(offset + length - offs, mas);
+
+ _mark_free(offs, l);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+void BitmapAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto mas = get_min_alloc_size();
+ uint64_t offs = round_up_to(offset, mas);
+ uint64_t l = p2align(offset + length - offs, mas);
+ _mark_allocated(offs, l);
+ ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+void BitmapAllocator::shutdown()
+{
+ ldout(cct, 1) << __func__ << dendl;
+ _shutdown();
+}
+
+void BitmapAllocator::dump()
+{
+ // bin -> interval count
+ std::map<size_t, size_t> bins_overall;
+ collect_stats(bins_overall);
+ auto it = bins_overall.begin();
+ while (it != bins_overall.end()) {
+ ldout(cct, 0) << __func__
+ << " bin " << it->first
+ << "(< " << byte_u_t((1 << (it->first + 1)) * get_min_alloc_size()) << ")"
+ << " : " << it->second << " extents"
+ << dendl;
+ ++it;
+ }
+}
diff --git a/src/os/bluestore/BitmapAllocator.h b/src/os/bluestore/BitmapAllocator.h
new file mode 100644
index 000000000..bb6fa73a1
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "fastbmap_allocator_impl.h"
+#include "include/mempool.h"
+#include "common/debug.h"
+
+class BitmapAllocator : public Allocator,
+ public AllocatorLevel02<AllocatorLevel01Loose> {
+ CephContext* cct;
+
+public:
+ BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit, const std::string& name);
+ ~BitmapAllocator() override
+ {
+ }
+
+ const char* get_type() const override
+ {
+ return "bitmap";
+ }
+ int64_t allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents) override;
+
+ void release(
+ const interval_set<uint64_t>& release_set) override;
+
+ using Allocator::release;
+
+ uint64_t get_free() override
+ {
+ return get_available();
+ }
+
+ void dump() override;
+ void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) override
+ {
+ foreach_internal(notify);
+ }
+ double get_fragmentation() override
+ {
+ return get_fragmentation_internal();
+ }
+
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc
new file mode 100644
index 000000000..315a6bb8c
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.cc
@@ -0,0 +1,606 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapFreelistManager.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+#include "include/stringify.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "freelist "
+
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+
+void make_offset_key(uint64_t offset, std::string *key)
+{
+ key->reserve(10);
+ _key_encode_u64(offset, key);
+}
+
+struct XorMergeOperator : public KeyValueDB::MergeOperator {
+ void merge_nonexistent(
+ const char *rdata, size_t rlen, std::string *new_value) override {
+ *new_value = std::string(rdata, rlen);
+ }
+ void merge(
+ const char *ldata, size_t llen,
+ const char *rdata, size_t rlen,
+ std::string *new_value) override {
+ ceph_assert(llen == rlen);
+ *new_value = std::string(ldata, llen);
+ for (size_t i = 0; i < rlen; ++i) {
+ (*new_value)[i] ^= rdata[i];
+ }
+ }
+ // We use each operator name and each prefix to construct the
+ // overall RocksDB operator name for a consistency check at open time.
+ const char *name() const override {
+ return "bitwise_xor";
+ }
+};
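+// Illustrative merge (not part of this change): XOR-ing the stored value
+// 0b00110000 with the operand 0b00011000 yields 0b00101000. Both allocating
+// and freeing a block flip (XOR) its bit, so merge operands commute and can be
+// accumulated lazily by the KV store.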
+
+void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
+{
+ std::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
+ db->set_merge_operator(prefix, merge_op);
+}
+
+BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
+ string meta_prefix,
+ string bitmap_prefix)
+ : FreelistManager(cct),
+ meta_prefix(meta_prefix),
+ bitmap_prefix(bitmap_prefix),
+ enumerate_bl_pos(0)
+{
+}
+
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
+ KeyValueDB::Transaction txn)
+{
+ bytes_per_block = granularity;
+ ceph_assert(isp2(bytes_per_block));
+ size = p2align(new_size, bytes_per_block);
+ blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
+
+ _init_misc();
+
+ blocks = size_2_block_count(size);
+ if (blocks * bytes_per_block > size) {
+ dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
+ << " to 0x" << (blocks * bytes_per_block)
+ << " (0x" << blocks << " blocks)" << std::dec << dendl;
+ // set past-eof blocks as allocated
+ _xor(size, blocks * bytes_per_block - size, txn);
+ }
+ dout(1) << __func__
+ << " size 0x" << std::hex << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ {
+ bufferlist bl;
+ encode(bytes_per_block, bl);
+ txn->set(meta_prefix, "bytes_per_block", bl);
+ }
+ {
+ bufferlist bl;
+ encode(blocks_per_key, bl);
+ txn->set(meta_prefix, "blocks_per_key", bl);
+ }
+ {
+ bufferlist bl;
+ encode(blocks, bl);
+ txn->set(meta_prefix, "blocks", bl);
+ }
+ {
+ bufferlist bl;
+ encode(size, bl);
+ txn->set(meta_prefix, "size", bl);
+ }
+ return 0;
+}
+
+int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db)
+{
+ assert(old_size < size);
+ ceph_assert(isp2(bytes_per_block));
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ auto blocks0 = size_2_block_count(old_size);
+ if (blocks0 * bytes_per_block > old_size) {
+ dout(10) << __func__ << " rounding1 blocks up from 0x" << std::hex
+ << old_size << " to 0x" << (blocks0 * bytes_per_block)
+ << " (0x" << blocks0 << " blocks)" << std::dec << dendl;
+ // reset past-eof blocks to unallocated
+ _xor(old_size, blocks0 * bytes_per_block - old_size, txn);
+ }
+
+ size = p2align(size, bytes_per_block);
+ blocks = size_2_block_count(size);
+
+ if (blocks * bytes_per_block > size) {
+ dout(10) << __func__ << " rounding2 blocks up from 0x" << std::hex
+ << size << " to 0x" << (blocks * bytes_per_block)
+ << " (0x" << blocks << " blocks)" << std::dec << dendl;
+ // set past-eof blocks as allocated
+ _xor(size, blocks * bytes_per_block - size, txn);
+ }
+
+ dout(10) << __func__
+ << " size 0x" << std::hex << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ {
+ bufferlist bl;
+ encode(blocks, bl);
+ txn->set(meta_prefix, "blocks", bl);
+ }
+ {
+ bufferlist bl;
+ encode(size, bl);
+ txn->set(meta_prefix, "size", bl);
+ }
+ db->submit_transaction_sync(txn);
+
+ return 0;
+}
+
+int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB* kvdb,
+ uint64_t* res)
+{
+ bufferlist v;
+ int r = kvdb->get(meta_prefix, "size", &v);
+ if (r < 0) {
+ derr << __func__ << " missing size meta in DB" << dendl;
+ return -ENOENT;
+ } else {
+ auto p = v.cbegin();
+ decode(*res, p);
+ r = 0;
+ }
+ return r;
+}
+
+void BitmapFreelistManager::_load_from_db(KeyValueDB* kvdb)
+{
+ KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
+ it->lower_bound(string());
+
+ // load meta
+ while (it->valid()) {
+ string k = it->key();
+ if (k == "bytes_per_block") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(bytes_per_block, p);
+ dout(10) << __func__ << " bytes_per_block 0x" << std::hex
+ << bytes_per_block << std::dec << dendl;
+ } else if (k == "blocks") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(blocks, p);
+ dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
+ << dendl;
+ } else if (k == "size") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(size, p);
+ dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
+ << dendl;
+ } else if (k == "blocks_per_key") {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ decode(blocks_per_key, p);
+ dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
+ << std::dec << dendl;
+ } else {
+ derr << __func__ << " unrecognized meta " << k << dendl;
+ }
+ it->next();
+ }
+}
+
+
+int BitmapFreelistManager::init(KeyValueDB *kvdb, bool db_in_read_only,
+ std::function<int(const std::string&, std::string*)> cfg_reader)
+{
+ dout(1) << __func__ << dendl;
+ int r = _read_cfg(cfg_reader);
+ if (r != 0) {
+ dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+ _load_from_db(kvdb);
+ }
+ _sync(kvdb, db_in_read_only);
+
+ dout(10) << __func__ << std::hex
+ << " size 0x" << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " blocks 0x" << blocks
+ << " blocks_per_key 0x" << blocks_per_key
+ << std::dec << dendl;
+ _init_misc();
+ return 0;
+}
+
+int BitmapFreelistManager::_read_cfg(
+ std::function<int(const std::string&, std::string*)> cfg_reader)
+{
+ dout(1) << __func__ << dendl;
+
+ string err;
+
+ const size_t key_count = 4;
+ string keys[key_count] = {
+ "bfm_size",
+ "bfm_blocks",
+ "bfm_bytes_per_block",
+ "bfm_blocks_per_key"};
+ uint64_t* vals[key_count] = {
+ &size,
+ &blocks,
+ &bytes_per_block,
+ &blocks_per_key};
+
+ for (size_t i = 0; i < key_count; i++) {
+ string val;
+ int r = cfg_reader(keys[i], &val);
+ if (r == 0) {
+ *(vals[i]) = strict_iecstrtoll(val.c_str(), &err);
+ if (!err.empty()) {
+ derr << __func__ << " Failed to parse - "
+ << keys[i] << ":" << val
+ << ", error: " << err << dendl;
+ return -EINVAL;
+ }
+ } else {
+ // this is expected for legacy deployed OSDs
+ dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+void BitmapFreelistManager::_init_misc()
+{
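+  // all_set_bl caches one key's worth of fully-set bits (blocks_per_key bits,
+  // i.e. blocks_per_key/8 bytes); _xor() merges it in for keys that an
+  // allocate/release covers completely.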
+ bufferptr z(blocks_per_key >> 3);
+ memset(z.c_str(), 0xff, z.length());
+ all_set_bl.clear();
+ all_set_bl.append(z);
+
+ block_mask = ~(bytes_per_block - 1);
+
+ bytes_per_key = bytes_per_block * blocks_per_key;
+ key_mask = ~(bytes_per_key - 1);
+ dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
+ << ", key_mask 0x" << key_mask << std::dec
+ << dendl;
+}
+
+void BitmapFreelistManager::sync(KeyValueDB* kvdb)
+{
+ _sync(kvdb, true);
+}
+
+void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only)
+{
+ dout(10) << __func__ << " checks if size sync is needed" << dendl;
+ uint64_t size_db = 0;
+ int r = read_size_meta_from_db(kvdb, &size_db);
+ ceph_assert(r >= 0);
+ if (!read_only && size_db < size) {
+ dout(1) << __func__ << " committing new size 0x" << std::hex << size
+ << std::dec << dendl;
+ r = _expand(size_db, kvdb);
+ ceph_assert(r == 0);
+ } else if (size_db > size) {
+    // this might happen when the OSD went through the following sequence:
+    // upgrade -> downgrade -> expand -> upgrade
+    // one needs to run expand once again to sync up
+ dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+ _load_from_db(kvdb);
+ }
+}
+
+void BitmapFreelistManager::shutdown()
+{
+ dout(1) << __func__ << dendl;
+}
+
+void BitmapFreelistManager::enumerate_reset()
+{
+ std::lock_guard l(lock);
+ enumerate_offset = 0;
+ enumerate_bl_pos = 0;
+ enumerate_bl.clear();
+ enumerate_p.reset();
+}
+
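+// Bit-scan helpers over a single bitmap key value: return the index of the
+// next clear/set bit at or after 'start', or -1 if there is none before the
+// end of the buffer.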
+int get_next_clear_bit(bufferlist& bl, int start)
+{
+ const char *p = bl.c_str();
+ int bits = bl.length() << 3;
+ while (start < bits) {
+ // byte = start / 8 (or start >> 3)
+ // bit = start % 8 (or start & 7)
+ unsigned char byte_mask = 1 << (start & 7);
+ if ((p[start >> 3] & byte_mask) == 0) {
+ return start;
+ }
+ ++start;
+ }
+ return -1; // not found
+}
+
+int get_next_set_bit(bufferlist& bl, int start)
+{
+ const char *p = bl.c_str();
+ int bits = bl.length() << 3;
+ while (start < bits) {
+ int which_byte = start / 8;
+ int which_bit = start % 8;
+ unsigned char byte_mask = 1 << which_bit;
+ if (p[which_byte] & byte_mask) {
+ return start;
+ }
+ ++start;
+ }
+ return -1; // not found
+}
+
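+// Walk the bitmap keys in offset order and report free extents: a clear bit
+// means a free block, a set bit an allocated one. Each call returns the next
+// run of clear bits as (offset, length).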
+bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length)
+{
+ std::lock_guard l(lock);
+
+ // initial base case is a bit awkward
+ if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
+ dout(10) << __func__ << " start" << dendl;
+ enumerate_p = kvdb->get_iterator(bitmap_prefix);
+ enumerate_p->lower_bound(string());
+ // we assert that the first block is always allocated; it's true,
+ // and it simplifies our lives a bit.
+ ceph_assert(enumerate_p->valid());
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ ceph_assert(enumerate_offset == 0);
+ ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0);
+ }
+
+ if (enumerate_offset >= size) {
+ dout(10) << __func__ << " end" << dendl;
+ return false;
+ }
+
+ // skip set bits to find offset
+ while (true) {
+ enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
+ if (enumerate_bl_pos >= 0) {
+ *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+ dout(30) << __func__ << " found clear bit, key 0x" << std::hex
+ << enumerate_offset << " bit 0x" << enumerate_bl_pos
+ << " offset 0x" << *offset
+ << std::dec << dendl;
+ break;
+ }
+ dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
+ << std::dec << dendl;
+ enumerate_p->next();
+ enumerate_bl.clear();
+ if (!enumerate_p->valid()) {
+ enumerate_offset += bytes_per_key;
+ enumerate_bl_pos = 0;
+ *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+ break;
+ }
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ uint64_t next = enumerate_offset + bytes_per_key;
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ enumerate_bl_pos = 0;
+ if (enumerate_offset > next) {
+ dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
+ << enumerate_offset << std::dec << dendl;
+ *offset = next;
+ break;
+ }
+ }
+
+ // skip clear bits to find the end
+ uint64_t end = 0;
+ if (enumerate_p->valid()) {
+ while (true) {
+ enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
+ if (enumerate_bl_pos >= 0) {
+ end = _get_offset(enumerate_offset, enumerate_bl_pos);
+ dout(30) << __func__ << " found set bit, key 0x" << std::hex
+ << enumerate_offset << " bit 0x" << enumerate_bl_pos
+ << " offset 0x" << end << std::dec
+ << dendl;
+ end = std::min(get_alloc_units() * bytes_per_block, end);
+ *length = end - *offset;
+ dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+ << std::dec << dendl;
+ return true;
+ }
+ dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
+ << std::dec << dendl;
+ enumerate_p->next();
+ enumerate_bl.clear();
+ enumerate_bl_pos = 0;
+ if (!enumerate_p->valid()) {
+ break;
+ }
+ string k = enumerate_p->key();
+ const char *p = k.c_str();
+ _key_decode_u64(p, &enumerate_offset);
+ enumerate_bl = enumerate_p->value();
+ }
+ }
+
+ if (enumerate_offset < size) {
+ end = get_alloc_units() * bytes_per_block;
+ *length = end - *offset;
+ dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+ << std::dec << dendl;
+ enumerate_offset = size;
+ enumerate_bl_pos = blocks_per_key;
+ return true;
+ }
+
+ dout(10) << __func__ << " end" << dendl;
+ return false;
+}
+
+void BitmapFreelistManager::dump(KeyValueDB *kvdb)
+{
+ enumerate_reset();
+ uint64_t offset, length;
+ while (enumerate_next(kvdb, &offset, &length)) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+}
+
+void BitmapFreelistManager::allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _xor(offset, length, txn);
+}
+
+void BitmapFreelistManager::release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _xor(offset, length, txn);
+}
+
+void BitmapFreelistManager::_xor(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)
+{
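+  // Both allocate() and release() funnel here: values under bitmap_prefix are
+  // combined with a bitwise-XOR merge operator, so flipping a block's bit
+  // toggles it between free and allocated without a read-modify-write.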
+ // must be block aligned
+ ceph_assert((offset & block_mask) == offset);
+ ceph_assert((length & block_mask) == length);
+
+ uint64_t first_key = offset & key_mask;
+ uint64_t last_key = (offset + length - 1) & key_mask;
+ dout(20) << __func__ << " first_key 0x" << std::hex << first_key
+ << " last_key 0x" << last_key << std::dec << dendl;
+
+ if (first_key == last_key) {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = s; i <= e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ } else {
+ // first key
+ {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned s = (offset & ~key_mask) / bytes_per_block;
+ unsigned e = blocks_per_key;
+ for (unsigned i = s; i < e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ first_key += bytes_per_key;
+ }
+ // middle keys
+ while (first_key < last_key) {
+ string k;
+ make_offset_key(first_key, &k);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
+ << ": ";
+ all_set_bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, all_set_bl);
+ first_key += bytes_per_key;
+ }
+ ceph_assert(first_key == last_key);
+ {
+ bufferptr p(blocks_per_key >> 3);
+ p.zero();
+ unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+ for (unsigned i = 0; i <= e; ++i) {
+ p[i >> 3] ^= 1ull << (i & 7);
+ }
+ string k;
+ make_offset_key(first_key, &k);
+ bufferlist bl;
+ bl.append(p);
+ dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+ bl.hexdump(*_dout, false);
+ *_dout << dendl;
+ txn->merge(bitmap_prefix, k, bl);
+ }
+ }
+}
+
+uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size) const
+{
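+  // round the block count up to a whole number of keys, i.e. to a multiple
+  // of blocks_per_key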
+ auto target_blocks = target_size / bytes_per_block;
+ if (target_blocks / blocks_per_key * blocks_per_key != target_blocks) {
+ target_blocks = (target_blocks / blocks_per_key + 1) * blocks_per_key;
+ }
+ return target_blocks;
+}
+
+void BitmapFreelistManager::get_meta(
+ uint64_t target_size,
+ std::vector<std::pair<string, string>>* res) const
+{
+ if (target_size == 0) {
+ res->emplace_back("bfm_blocks", stringify(blocks));
+ res->emplace_back("bfm_size", stringify(size));
+ } else {
+ target_size = p2align(target_size, bytes_per_block);
+ auto target_blocks = size_2_block_count(target_size);
+
+ res->emplace_back("bfm_blocks", stringify(target_blocks));
+ res->emplace_back("bfm_size", stringify(target_size));
+ }
+ res->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block));
+ res->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key));
+}
diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h
new file mode 100644
index 000000000..c6bfe469f
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+
+#include "FreelistManager.h"
+
+#include <string>
+#include <mutex>
+
+#include "common/ceph_mutex.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+class BitmapFreelistManager : public FreelistManager {
+ std::string meta_prefix, bitmap_prefix;
+ std::shared_ptr<KeyValueDB::MergeOperator> merge_op;
+ ceph::mutex lock = ceph::make_mutex("BitmapFreelistManager::lock");
+
+ uint64_t size; ///< size of device (bytes)
+ uint64_t bytes_per_block; ///< bytes per block (bdev_block_size)
+ uint64_t blocks_per_key; ///< blocks (bits) per key/value pair
+ uint64_t bytes_per_key; ///< bytes per key/value pair
+ uint64_t blocks; ///< size of device (blocks, size rounded up)
+
+ uint64_t block_mask; ///< mask to convert byte offset to block offset
+ uint64_t key_mask; ///< mask to convert offset to key offset
+
+ ceph::buffer::list all_set_bl;
+
+ KeyValueDB::Iterator enumerate_p;
+ uint64_t enumerate_offset; ///< logical offset; position
+ ceph::buffer::list enumerate_bl; ///< current key at enumerate_offset
+ int enumerate_bl_pos; ///< bit position in enumerate_bl
+
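+  /// convert (key offset, bit index) back to a byte offset on the device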
+ uint64_t _get_offset(uint64_t key_off, int bit) {
+ return key_off + bit * bytes_per_block;
+ }
+
+ void _init_misc();
+
+ void _xor(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn);
+
+ int _read_cfg(
+ std::function<int(const std::string&, std::string*)> cfg_reader);
+
+ int _expand(uint64_t new_size, KeyValueDB* db);
+
+ uint64_t size_2_block_count(uint64_t target_size) const;
+
+ int read_size_meta_from_db(KeyValueDB* kvdb, uint64_t* res);
+ void _sync(KeyValueDB* kvdb, bool read_only);
+
+ void _load_from_db(KeyValueDB* kvdb);
+
+public:
+ BitmapFreelistManager(CephContext* cct, std::string meta_prefix,
+ std::string bitmap_prefix);
+
+ static void setup_merge_operator(KeyValueDB *db, std::string prefix);
+
+ int create(uint64_t size, uint64_t granularity,
+ KeyValueDB::Transaction txn) override;
+
+ int init(KeyValueDB *kvdb, bool db_in_read_only,
+ std::function<int(const std::string&, std::string*)> cfg_reader) override;
+
+ void shutdown() override;
+ void sync(KeyValueDB* kvdb) override;
+
+ void dump(KeyValueDB *kvdb) override;
+
+ void enumerate_reset() override;
+ bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) override;
+
+ void allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) override;
+ void release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) override;
+
+ inline uint64_t get_size() const override {
+ return size;
+ }
+ inline uint64_t get_alloc_units() const override {
+ return size / bytes_per_block;
+ }
+ inline uint64_t get_alloc_size() const override {
+ return bytes_per_block;
+ }
+ void get_meta(uint64_t target_size,
+ std::vector<std::pair<string, string>>*) const override;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
new file mode 100644
index 000000000..f462fa886
--- /dev/null
+++ b/src/os/bluestore/BlueFS.cc
@@ -0,0 +1,3903 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "boost/algorithm/string.hpp"
+#include "bluestore_common.h"
+#include "BlueFS.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "Allocator.h"
+#include "include/ceph_assert.h"
+#include "common/admin_socket.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluefs
+#undef dout_prefix
+#define dout_prefix *_dout << "bluefs "
+using TOPNSPC::common::cmd_getval;
+
+using std::byte;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::to_string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
+ bluefs_file_reader_buffer, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
+
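+// Discard completion trampolines: the block devices invoke these C-style
+// callbacks, which forward the released extents to BlueFS::handle_discard()
+// for the corresponding bdev.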
+static void wal_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
+}
+
+static void db_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
+}
+
+static void slow_discard_cb(void *priv, void* priv2) {
+ BlueFS *bluefs = static_cast<BlueFS*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
+}
+
+class BlueFS::SocketHook : public AdminSocketHook {
+ BlueFS* bluefs;
+public:
+ static BlueFS::SocketHook* create(BlueFS* bluefs)
+ {
+ BlueFS::SocketHook* hook = nullptr;
+ AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+ if (admin_socket) {
+ hook = new BlueFS::SocketHook(bluefs);
+ int r = admin_socket->register_command("bluestore bluefs device info "
+ "name=alloc_size,type=CephInt,req=false",
+ hook,
+ "Shows space report for bluefs devices. "
+ "This also includes an estimation for space "
+ "available to bluefs at main device. "
+ "alloc_size, if set, specifies the custom bluefs "
+ "allocation unit size for the estimation above.");
+ if (r != 0) {
+ ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
+ delete hook;
+ hook = nullptr;
+ } else {
+ r = admin_socket->register_command("bluefs stats",
+ hook,
+                                         "Dump internal statistics for bluefs.");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("bluefs files list", hook,
+ "print files in bluefs");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
+ "Injects 8K zeros into next BlueFS read. Debug only.");
+ ceph_assert(r == 0);
+ }
+ }
+ return hook;
+ }
+
+ ~SocketHook() {
+ AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+ admin_socket->unregister_commands(this);
+ }
+private:
+ SocketHook(BlueFS* bluefs) :
+ bluefs(bluefs) {}
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) override {
+ if (command == "bluestore bluefs device info") {
+ int64_t alloc_size = 0;
+ cmd_getval(cmdmap, "alloc_size", alloc_size);
+ if ((alloc_size & (alloc_size - 1)) != 0) {
+        errss << "Invalid allocation size: '" << alloc_size << "'" << std::endl;
+ return -EINVAL;
+ }
+ if (alloc_size == 0)
+ alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
+ f->open_object_section("bluefs_device_info");
+ for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
+ if (bluefs->bdev[dev]) {
+ f->open_object_section("dev");
+ f->dump_string("device", bluefs->get_device_name(dev));
+ ceph_assert(bluefs->alloc[dev]);
+ auto total = bluefs->get_total(dev);
+ auto free = bluefs->get_free(dev);
+ auto used = bluefs->get_used(dev);
+
+ f->dump_int("total", total);
+ f->dump_int("free", free);
+ f->dump_int("bluefs_used", used);
+ if (bluefs->is_shared_alloc(dev)) {
+ size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
+ f->dump_int("bluefs max available", avail);
+ }
+ f->close_section();
+ }
+ }
+
+ f->close_section();
+ } else if (command == "bluefs stats") {
+ std::stringstream ss;
+ bluefs->dump_block_extents(ss);
+ bluefs->dump_volume_selector(ss);
+ out.append(ss);
+ } else if (command == "bluefs files list") {
+ const char* devnames[3] = {"wal","db","slow"};
+ std::lock_guard l(bluefs->lock);
+ f->open_array_section("files");
+ for (auto &d : bluefs->dir_map) {
+ std::string dir = d.first;
+ for (auto &r : d.second->file_map) {
+ f->open_object_section("file");
+ f->dump_string("name", (dir + "/" + r.first).c_str());
+ std::vector<size_t> sizes;
+ sizes.resize(bluefs->bdev.size());
+ for(auto& i : r.second->fnode.extents) {
+ sizes[i.bdev] += i.length;
+ }
+ for (size_t i = 0; i < sizes.size(); i++) {
+ if (sizes[i]>0) {
+ if (i < sizeof(devnames) / sizeof(*devnames))
+ f->dump_int(devnames[i], sizes[i]);
+ else
+ f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
+ }
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ f->flush(out);
+ } else if (command == "bluefs debug_inject_read_zeros") {
+ bluefs->inject_read_zeros++;
+ } else {
+ errss << "Invalid command" << std::endl;
+ return -ENOSYS;
+ }
+ return 0;
+ }
+};
+
+BlueFS::BlueFS(CephContext* cct)
+ : cct(cct),
+ bdev(MAX_BDEV),
+ ioc(MAX_BDEV),
+ block_reserved(MAX_BDEV),
+ alloc(MAX_BDEV),
+ alloc_size(MAX_BDEV, 0),
+ pending_release(MAX_BDEV)
+{
+ discard_cb[BDEV_WAL] = wal_discard_cb;
+ discard_cb[BDEV_DB] = db_discard_cb;
+ discard_cb[BDEV_SLOW] = slow_discard_cb;
+ asok_hook = SocketHook::create(this);
+
+}
+
+BlueFS::~BlueFS()
+{
+ delete asok_hook;
+ for (auto p : ioc) {
+ if (p)
+ p->aio_wait();
+ }
+ for (auto p : bdev) {
+ if (p) {
+ p->close();
+ delete p;
+ }
+ }
+ for (auto p : ioc) {
+ delete p;
+ }
+}
+
+void BlueFS::_init_logger()
+{
+ PerfCountersBuilder b(cct, "bluefs",
+ l_bluefs_first, l_bluefs_last);
+ b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
+ "Total bytes (main db device)",
+ "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
+ "Used bytes (main db device)",
+ "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
+ "Total bytes (wal device)",
+ "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
+ "Used bytes (wal device)",
+ "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
+ "Total bytes (slow device)",
+ "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
+ "Used bytes (slow device)",
+ "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_num_files, "num_files", "File count",
+ "f", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
+ "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
+ "Compactions of the metadata log");
+ b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
+ "Bytes written to the metadata log", "j",
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
+ "Files written to WAL");
+ b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
+ "Files written to SSTs");
+ b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
+ "Bytes written to WAL", "wal",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
+ "Bytes written to SSTs", "sst",
+ PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
+ "Bytes written to WAL/SSTs at slow device", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
+ "Maximum bytes allocated from WAL");
+ b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
+ "Maximum bytes allocated from DB");
+ b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
+ "Maximum bytes allocated from SLOW");
+
+ b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
+ "random read requests processed");
+ b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
+ "Bytes requested in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
+ "random reads requests going to disk");
+ b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
+ "Bytes read from disk in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
+ "random read requests processed using prefetch buffer");
+ b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
+ "Bytes read from prefetch buffer in random read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluefs_read_count, "read_count",
+ "buffered read requests processed");
+ b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
+ "Bytes requested in buffered read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
+ "prefetch read requests processed");
+ b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
+ "Bytes requested in prefetch read mode", NULL,
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
+ "How many times bluefs read found page with all 0s");
+ b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
+ "How many times bluefs read found transient page with all 0s");
+
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+void BlueFS::_shutdown_logger()
+{
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+void BlueFS::_update_logger_stats()
+{
+ // we must be holding the lock
+ logger->set(l_bluefs_num_files, file_map.size());
+ logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
+
+ if (alloc[BDEV_WAL]) {
+ logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
+ logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
+ }
+ if (alloc[BDEV_DB]) {
+ logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
+ logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
+ }
+ if (alloc[BDEV_SLOW]) {
+ logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
+ logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
+ }
+}
+
+int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
+ uint64_t reserved,
+ bluefs_shared_alloc_context_t* _shared_alloc)
+{
+ dout(10) << __func__ << " bdev " << id << " path " << path << " "
+ << reserved << dendl;
+ ceph_assert(id < bdev.size());
+ ceph_assert(bdev[id] == NULL);
+ BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
+ discard_cb[id], static_cast<void*>(this));
+ block_reserved[id] = reserved;
+ if (_shared_alloc) {
+ b->set_no_exclusive_lock();
+ }
+ int r = b->open(path);
+ if (r < 0) {
+ delete b;
+ return r;
+ }
+ if (trim) {
+ b->discard(0, b->get_size());
+ }
+
+ dout(1) << __func__ << " bdev " << id << " path " << path
+ << " size " << byte_u_t(b->get_size()) << dendl;
+ bdev[id] = b;
+ ioc[id] = new IOContext(cct, NULL);
+ if (_shared_alloc) {
+ ceph_assert(!shared_alloc);
+ shared_alloc = _shared_alloc;
+ alloc[id] = shared_alloc->a;
+ shared_alloc_id = id;
+ }
+ return 0;
+}
+
+bool BlueFS::bdev_support_label(unsigned id)
+{
+ ceph_assert(id < bdev.size());
+ ceph_assert(bdev[id]);
+ return bdev[id]->supported_bdev_label();
+}
+
+uint64_t BlueFS::get_block_device_size(unsigned id) const
+{
+ if (id < bdev.size() && bdev[id])
+ return bdev[id]->get_size();
+ return 0;
+}
+
+void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << " bdev " << id << dendl;
+ ceph_assert(alloc[id]);
+ alloc[id]->release(to_release);
+ if (is_shared_alloc(id)) {
+ shared_alloc->bluefs_used -= to_release.size();
+ }
+}
+
+uint64_t BlueFS::get_used()
+{
+ std::lock_guard l(lock);
+ uint64_t used = 0;
+ for (unsigned id = 0; id < MAX_BDEV; ++id) {
+ used += _get_used(id);
+ }
+ return used;
+}
+
+uint64_t BlueFS::_get_used(unsigned id) const
+{
+ uint64_t used = 0;
+ if (!alloc[id])
+ return 0;
+
+ if (is_shared_alloc(id)) {
+ used = shared_alloc->bluefs_used;
+ } else {
+ used = _get_total(id) - alloc[id]->get_free();
+ }
+ return used;
+}
+
+uint64_t BlueFS::get_used(unsigned id)
+{
+ ceph_assert(id < alloc.size());
+ ceph_assert(alloc[id]);
+ std::lock_guard l(lock);
+ return _get_used(id);
+}
+
+uint64_t BlueFS::_get_total(unsigned id) const
+{
+ ceph_assert(id < bdev.size());
+ ceph_assert(id < block_reserved.size());
+ return get_block_device_size(id) - block_reserved[id];
+}
+
+uint64_t BlueFS::get_total(unsigned id)
+{
+ std::lock_guard l(lock);
+ return _get_total(id);
+}
+
+uint64_t BlueFS::get_free(unsigned id)
+{
+ std::lock_guard l(lock);
+ ceph_assert(id < alloc.size());
+ return alloc[id]->get_free();
+}
+
+void BlueFS::dump_perf_counters(Formatter *f)
+{
+ f->open_object_section("bluefs_perf_counters");
+ logger->dump_formatted(f,0);
+ f->close_section();
+}
+
+void BlueFS::dump_block_extents(ostream& out)
+{
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (!bdev[i]) {
+ continue;
+ }
+ auto total = get_total(i);
+ auto free = get_free(i);
+
+ out << i << " : device size 0x" << std::hex << total
+ << " : using 0x" << total - free
+ << std::dec << "(" << byte_u_t(total - free) << ")";
+ out << "\n";
+ }
+}
+
+int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " bdev " << id << dendl;
+ ceph_assert(id < alloc.size());
+ for (auto& p : file_map) {
+ for (auto& q : p.second->fnode.extents) {
+ if (q.bdev == id) {
+ extents->insert(q.offset, q.length);
+ }
+ }
+ }
+ return 0;
+}
+
+int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
+{
+ std::unique_lock l(lock);
+ dout(1) << __func__
+ << " osd_uuid " << osd_uuid
+ << dendl;
+
+ // set volume selector if not provided before/outside
+ if (vselector == nullptr) {
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
+ _init_alloc();
+ _init_logger();
+
+ super.version = 1;
+ super.block_size = bdev[BDEV_DB]->get_block_size();
+ super.osd_uuid = osd_uuid;
+ super.uuid.generate_random();
+ dout(1) << __func__ << " uuid " << super.uuid << dendl;
+
+ // init log
+ FileRef log_file = ceph::make_ref<File>();
+ log_file->fnode.ino = 1;
+ log_file->vselector_hint = vselector->get_hint_for_log();
+ int r = _allocate(
+ vselector->select_prefer_bdev(log_file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_file->fnode);
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+ ceph_assert(r == 0);
+ log_writer = _create_writer(log_file);
+
+ // initial txn
+ log_t.op_init();
+ _flush_and_sync_log(l);
+
+ // write supers
+ super.log_fnode = log_file->fnode;
+ super.memorized_layout = layout;
+ _write_super(BDEV_DB);
+ flush_bdev();
+
+ // clean up
+ super = bluefs_super_t();
+ _close_writer(log_writer);
+ log_writer = NULL;
+ vselector.reset(nullptr);
+ _stop_alloc();
+ _shutdown_logger();
+ if (shared_alloc) {
+ ceph_assert(shared_alloc->need_init);
+ shared_alloc->need_init = false;
+ }
+
+ dout(10) << __func__ << " success" << dendl;
+ return 0;
+}
+
+void BlueFS::_init_alloc()
+{
+ dout(20) << __func__ << dendl;
+
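+  // Pick per-device allocation units: dedicated WAL/DB/NEW* devices use
+  // bluefs_alloc_size, while the device shared with BlueStore (the slow
+  // device, or DB when there is no slow device) uses bluefs_shared_alloc_size.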
+ if (bdev[BDEV_WAL]) {
+ alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
+ }
+ if (bdev[BDEV_SLOW]) {
+ alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
+ alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
+ } else {
+ alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
+ }
+ // new wal and db devices are never shared
+ if (bdev[BDEV_NEWWAL]) {
+ alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
+ }
+ if (bdev[BDEV_NEWDB]) {
+ alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
+ }
+
+ for (unsigned id = 0; id < bdev.size(); ++id) {
+ if (!bdev[id]) {
+ continue;
+ }
+ ceph_assert(bdev[id]->get_size());
+ ceph_assert(alloc_size[id]);
+ if (is_shared_alloc(id)) {
+ dout(1) << __func__ << " shared, id " << id << std::hex
+ << ", capacity 0x" << bdev[id]->get_size()
+ << ", block size 0x" << alloc_size[id]
+ << std::dec << dendl;
+ } else {
+ std::string name = "bluefs-";
+ const char* devnames[] = { "wal","db","slow" };
+ if (id <= BDEV_SLOW)
+ name += devnames[id];
+ else
+ name += to_string(uintptr_t(this));
+ dout(1) << __func__ << " new, id " << id << std::hex
+ << ", allocator name " << name
+ << ", allocator type " << cct->_conf->bluefs_allocator
+ << ", capacity 0x" << bdev[id]->get_size()
+ << ", block size 0x" << alloc_size[id]
+ << std::dec << dendl;
+ alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
+ bdev[id]->get_size(),
+ alloc_size[id], name);
+ alloc[id]->init_add_free(
+ block_reserved[id],
+ _get_total(id));
+ }
+ }
+}
+
+void BlueFS::_stop_alloc()
+{
+ dout(20) << __func__ << dendl;
+ for (auto p : bdev) {
+ if (p)
+ p->discard_drain();
+ }
+
+ for (size_t i = 0; i < alloc.size(); ++i) {
+ if (alloc[i] && !is_shared_alloc(i)) {
+ alloc[i]->shutdown();
+ delete alloc[i];
+ alloc[i] = nullptr;
+ }
+ }
+}
+
+int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
+ ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
+{
+ dout(10) << __func__ << " dev " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " buffered" : "")
+ << dendl;
+ int r;
+ bufferlist bl;
+ r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
+ if (r != 0) {
+ return r;
+ }
+ uint64_t block_size = bdev[ndev]->get_block_size();
+ if (inject_read_zeros) {
+ if (len >= block_size * 2) {
+ derr << __func__ << " injecting error, zeros at "
+ << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+ << "~" << (block_size * 2) << std::dec << dendl;
+      // keep the head, replace two blocks in the middle with zeros, keep the tail
+ bufferlist temp;
+ bl.splice(0, len / 2 - block_size, &temp);
+ temp.append(buffer::create(block_size * 2, 0));
+ bl.splice(block_size * 2, len / 2 - block_size, &temp);
+ bl = temp;
+ inject_read_zeros--;
+ }
+ }
+  // check whether any whole block within the read is all zeros
+ uint64_t to_check_len = len;
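+  // p2nphase() gives the distance from 'off' to the next block boundary;
+  // only whole, block-aligned chunks are scanned for zeros.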
+ uint64_t skip = p2nphase(off, block_size);
+ if (skip >= to_check_len) {
+ return r;
+ }
+ auto it = bl.begin(skip);
+ to_check_len -= skip;
+ bool all_zeros = false;
+ while (all_zeros == false && to_check_len >= block_size) {
+ // checking 0s step
+ unsigned block_left = block_size;
+ unsigned avail;
+ const char* data;
+ all_zeros = true;
+ while (all_zeros && block_left > 0) {
+ avail = it.get_ptr_and_advance(block_left, &data);
+ block_left -= avail;
+ all_zeros = mem_is_zero(data, avail);
+ }
+ // skipping step
+ while (block_left > 0) {
+ avail = it.get_ptr_and_advance(block_left, &data);
+ block_left -= avail;
+ }
+ to_check_len -= block_size;
+ }
+ if (all_zeros) {
+ logger->inc(l_bluefs_read_zeros_candidate, 1);
+ bufferlist bl_reread;
+ r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
+ if (r != 0) {
+ return r;
+ }
+    // check whether both reads returned the same data
+    if (!bl.contents_equal(bl_reread)) {
+      // report the problem to the log, but continue; the re-read may be good
+ derr << __func__ << " initial read of " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len
+           << std::dec << ": different than re-read" << dendl;
+ logger->inc(l_bluefs_read_zeros_errors, 1);
+ }
+    // use the second read; if the two differ it is more likely to be correct
+ pbl->append(bl_reread);
+ } else {
+ pbl->append(bl);
+ }
+ return r;
+}
+
+int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+ dout(10) << __func__ << " dev " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len << std::dec
+ << (buffered ? " buffered" : "")
+ << dendl;
+ int r;
+ r = bdev[ndev]->read_random(off, len, buf, buffered);
+ if (r != 0) {
+ return r;
+ }
+ uint64_t block_size = bdev[ndev]->get_block_size();
+ if (inject_read_zeros) {
+ if (len >= block_size * 2) {
+ derr << __func__ << " injecting error, zeros at "
+ << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+ << "~" << (block_size * 2) << std::dec << dendl;
+ //zero middle 8K
+ memset(buf + len / 2 - block_size, 0, block_size * 2);
+ inject_read_zeros--;
+ }
+ }
+  // check whether any whole block within the read is all zeros
+ uint64_t to_check_len = len;
+ const char* data = buf;
+ uint64_t skip = p2nphase(off, block_size);
+ if (skip >= to_check_len) {
+ return r;
+ }
+ to_check_len -= skip;
+ data += skip;
+
+ bool all_zeros = false;
+ while (all_zeros == false && to_check_len >= block_size) {
+ if (mem_is_zero(data, block_size)) {
+ // at least one block is all zeros
+ all_zeros = true;
+ break;
+ }
+ data += block_size;
+ to_check_len -= block_size;
+ }
+ if (all_zeros) {
+ logger->inc(l_bluefs_read_zeros_candidate, 1);
+ std::unique_ptr<char[]> data_reread(new char[len]);
+ r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
+ if (r != 0) {
+ return r;
+ }
+    // check whether both reads returned the same data
+ if (memcmp(buf, &data_reread[0], len) != 0) {
+ derr << __func__ << " initial read of " << int(ndev)
+ << ": 0x" << std::hex << off << "~" << len
+           << std::dec << ": different than re-read" << dendl;
+ logger->inc(l_bluefs_read_zeros_errors, 1);
+ // second read is probably better
+ memcpy(buf, &data_reread[0], len);
+ }
+ }
+ return r;
+}
+
+int BlueFS::mount()
+{
+ dout(1) << __func__ << dendl;
+
+ int r = _open_super();
+ if (r < 0) {
+ derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // set volume selector if not provided before/outside
+ if (vselector == nullptr) {
+ vselector.reset(
+ new OriginalVolumeSelector(
+ get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+ }
+
+ _init_alloc();
+ _init_logger();
+
+ r = _replay(false, false);
+ if (r < 0) {
+ derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+ _stop_alloc();
+ goto out;
+ }
+
+ // init freelist
+ for (auto& p : file_map) {
+ dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
+ for (auto& q : p.second->fnode.extents) {
+ bool is_shared = is_shared_alloc(q.bdev);
+ ceph_assert(!is_shared || (is_shared && shared_alloc));
+ if (is_shared && shared_alloc->need_init && shared_alloc->a) {
+ shared_alloc->bluefs_used += q.length;
+ alloc[q.bdev]->init_rm_free(q.offset, q.length);
+ } else if (!is_shared) {
+ alloc[q.bdev]->init_rm_free(q.offset, q.length);
+ }
+ }
+ }
+ if (shared_alloc) {
+ shared_alloc->need_init = false;
+ dout(1) << __func__ << " shared_bdev_used = "
+ << shared_alloc->bluefs_used << dendl;
+ } else {
+ dout(1) << __func__ << " shared bdev not used"
+ << dendl;
+ }
+
+ // set up the log for future writes
+ log_writer = _create_writer(_get_file(1));
+ ceph_assert(log_writer->file->fnode.ino == 1);
+ log_writer->pos = log_writer->file->fnode.size;
+ log_writer->file->fnode.reset_delta();
+ dout(10) << __func__ << " log write pos set to 0x"
+ << std::hex << log_writer->pos << std::dec
+ << dendl;
+
+ return 0;
+
+ out:
+ super = bluefs_super_t();
+ return r;
+}
+
+int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
+{
+ if (super.memorized_layout) {
+ if (layout == *super.memorized_layout) {
+ dout(10) << __func__ << " bluefs layout verified positively" << dendl;
+ } else {
+ derr << __func__ << " memorized layout doesn't fit current one" << dendl;
+ return -EIO;
+ }
+ } else {
+ dout(10) << __func__ << " no memorized_layout in bluefs superblock"
+ << dendl;
+ }
+
+ return 0;
+}
+
+void BlueFS::umount(bool avoid_compact)
+{
+ dout(1) << __func__ << dendl;
+
+ sync_metadata(avoid_compact);
+
+ _close_writer(log_writer);
+ log_writer = NULL;
+
+ vselector.reset(nullptr);
+ _stop_alloc();
+ file_map.clear();
+ dir_map.clear();
+ super = bluefs_super_t();
+ log_t.clear();
+ _shutdown_logger();
+}
+
+int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
+{
+ dout(1) << __func__ << dendl;
+
+ if(id == BDEV_NEWDB) {
+ int new_log_dev_cur = BDEV_WAL;
+ int new_log_dev_next = BDEV_WAL;
+ if (!bdev[BDEV_WAL]) {
+ new_log_dev_cur = BDEV_NEWDB;
+ new_log_dev_next = BDEV_DB;
+ }
+ _rewrite_log_and_layout_sync(false,
+ BDEV_NEWDB,
+ new_log_dev_cur,
+ new_log_dev_next,
+ RENAME_DB2SLOW,
+ layout);
+ } else if(id == BDEV_NEWWAL) {
+ _rewrite_log_and_layout_sync(false,
+ BDEV_DB,
+ BDEV_NEWWAL,
+ BDEV_WAL,
+ REMOVE_WAL,
+ layout);
+ } else {
+ assert(false);
+ }
+ return 0;
+}
+
+void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
+{
+ if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
+ bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
+ if (bdev[BDEV_WAL])
+ bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
+}
+
+void BlueFS::get_devices(set<string> *ls)
+{
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ bdev[i]->get_devices(ls);
+ }
+ }
+}
+
+int BlueFS::fsck()
+{
+ std::lock_guard l(lock);
+ dout(1) << __func__ << dendl;
+ // hrm, i think we check everything on mount...
+ return 0;
+}
+
+int BlueFS::_write_super(int dev)
+{
+ // build superblock
+ bufferlist bl;
+ encode(super, bl);
+ uint32_t crc = bl.crc32c(-1);
+ encode(crc, bl);
+  dout(10) << __func__ << " super block length (encoded): " << bl.length() << dendl;
+ dout(10) << __func__ << " superblock " << super.version << dendl;
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ ceph_assert_always(bl.length() <= get_super_length());
+ bl.append_zero(get_super_length() - bl.length());
+
+ bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
+ dout(20) << __func__ << " v " << super.version
+ << " crc 0x" << std::hex << crc
+ << " offset 0x" << get_super_offset() << std::dec
+ << dendl;
+ return 0;
+}
+
+int BlueFS::_open_super()
+{
+ dout(10) << __func__ << dendl;
+
+ bufferlist bl;
+ uint32_t expected_crc, crc;
+ int r;
+
+ // always the second block
+ r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
+ &bl, ioc[BDEV_DB], false);
+ if (r < 0)
+ return r;
+
+ auto p = bl.cbegin();
+ decode(super, p);
+ {
+ bufferlist t;
+ t.substr_of(bl, 0, p.get_off());
+ crc = t.crc32c(-1);
+ }
+ decode(expected_crc, p);
+ if (crc != expected_crc) {
+ derr << __func__ << " bad crc on superblock, expected 0x"
+ << std::hex << expected_crc << " != actual 0x" << crc << std::dec
+ << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " superblock " << super.version << dendl;
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ return 0;
+}
+
+int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
+ boost::dynamic_bitset<uint64_t>* used_blocks,
+ bool is_alloc, //true when allocating, false when deallocating
+ const char* op_name)
+{
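+  // Flip the bit for every allocation unit covered by the fnode's extents in
+  // used_blocks; finding a bit already in the target state means a duplicate
+  // reference (when allocating) or a double free (when deallocating).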
+ auto& fnode_extents = fnode.extents;
+ for (auto e : fnode_extents) {
+ auto id = e.bdev;
+ bool fail = false;
+ ceph_assert(id < MAX_BDEV);
+ if (int r = _verify_alloc_granularity(id, e.offset, e.length,
+ op_name); r < 0) {
+ return r;
+ }
+
+ apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
+ [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+ if (is_alloc == bs.test(pos)) {
+ fail = true;
+ } else {
+ bs.flip(pos);
+ }
+ }
+ );
+ if (fail) {
+ derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
+ << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
+ << (is_alloc == true ?
+ ": duplicate reference, ino " : ": double free, ino ")
+ << fnode.ino << dendl;
+ return -EFAULT;
+ }
+ }
+ return 0;
+}
+
+int BlueFS::_verify_alloc_granularity(
+ __u8 id, uint64_t offset, uint64_t length, const char *op)
+{
+ if ((offset & (alloc_size[id] - 1)) ||
+ (length & (alloc_size[id] - 1))) {
+ derr << __func__ << " " << op << " of " << (int)id
+ << ":0x" << std::hex << offset << "~" << length << std::dec
+ << " does not align to alloc_size 0x"
+ << std::hex << alloc_size[id] << std::dec << dendl;
+    // be helpful: find the largest power-of-two unit both offset and length align to
+ auto need = alloc_size[id];
+ while (need && ((offset & (need - 1)) ||
+ (length & (need - 1)))) {
+ need >>= 1;
+ }
+ if (need) {
+ const char *which;
+ if (id == BDEV_SLOW ||
+ (id == BDEV_DB && !bdev[BDEV_SLOW])) {
+ which = "bluefs_shared_alloc_size";
+ } else {
+ which = "bluefs_alloc_size";
+ }
+ derr << "work-around by setting " << which << " = " << need
+ << " for this OSD" << dendl;
+ }
+ return -EFAULT;
+ }
+ return 0;
+}
+
+int BlueFS::_replay(bool noop, bool to_stdout)
+{
+ dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
+ ino_last = 1; // by the log
+ log_seq = 0;
+
+ FileRef log_file;
+ log_file = _get_file(1);
+
+ log_file->fnode = super.log_fnode;
+ if (!noop) {
+ log_file->vselector_hint =
+ vselector->get_hint_for_log();
+ } else {
+ // do not use fnode from superblock in 'noop' mode - log_file's one should
+ // be fine and up-to-date
+ ceph_assert(log_file->fnode.ino == 1);
+ ceph_assert(log_file->fnode.extents.size() != 0);
+ }
+ dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " log_fnode " << super.log_fnode << std::endl;
+ }
+
+ FileReader *log_reader = new FileReader(
+ log_file, cct->_conf->bluefs_max_prefetch,
+ false, // !random
+ true); // ignore eof
+
+ bool seen_recs = false;
+
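+  // With bluefs_log_replay_check_allocations enabled, used_blocks tracks one
+  // bit per allocation unit per device so replay can detect duplicate
+  // references and double frees.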
+ boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
+
+ if (!noop) {
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ for (size_t i = 0; i < MAX_BDEV; ++i) {
+ if (alloc_size[i] != 0 && bdev[i] != nullptr) {
+ used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
+ }
+ }
+ // check initial log layout
+ int r = _check_allocations(log_file->fnode,
+ used_blocks, true, "Log from super");
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ while (true) {
+ ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
+ uint64_t pos = log_reader->buf.pos;
+ uint64_t read_pos = pos;
+ bufferlist bl;
+ {
+ int r = _read(log_reader, read_pos, super.block_size,
+ &bl, NULL);
+ if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
+ r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
+ }
+ assert(r == (int)super.block_size);
+ read_pos += r;
+ }
+ uint64_t more = 0;
+ uint64_t seq;
+ uuid_d uuid;
+ {
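+      // peek at the record envelope (encoding version/compat and length) plus
+      // the transaction's uuid and seq; 'more' is how much of the record, if
+      // any, extends beyond the block we just read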
+ auto p = bl.cbegin();
+ __u8 a, b;
+ uint32_t len;
+ decode(a, p);
+ decode(b, p);
+ decode(len, p);
+ decode(uuid, p);
+ decode(seq, p);
+ if (len + 6 > bl.length()) {
+ more = round_up_to(len + 6 - bl.length(), super.block_size);
+ }
+ }
+ if (uuid != super.uuid) {
+ if (seen_recs) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
+ << dendl;
+ } else {
+ derr << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
+ << ", block dump: \n";
+ bufferlist t;
+ t.substr_of(bl, 0, super.block_size);
+ t.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ break;
+ }
+ if (seq != log_seq + 1) {
+ if (seen_recs) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: seq " << seq << " != expected " << log_seq + 1
+             << dendl;
+ } else {
+ derr << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: seq " << seq << " != expected " << log_seq + 1
+             << dendl;
+ }
+ break;
+ }
+ if (more) {
+ dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
+ << " more bytes" << dendl;
+ bufferlist t;
+ int r = _read(log_reader, read_pos, more, &t, NULL);
+ if (r < (int)more) {
+ dout(10) << __func__ << " 0x" << std::hex << pos
+ << ": stop: len is 0x" << bl.length() + more << std::dec
+ << ", which is past eof" << dendl;
+ if (cct->_conf->bluefs_replay_recovery) {
+ //try to search for more data
+ r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
+ if (r < (int)more) {
+ //in normal mode we must read r==more, for recovery it is too strict
+ break;
+ }
+ }
+ }
+ ceph_assert(r == (int)more);
+ bl.claim_append(t);
+ read_pos += r;
+ }
+ bluefs_transaction_t t;
+ try {
+ auto p = bl.cbegin();
+ decode(t, p);
+ seen_recs = true;
+ }
+ catch (ceph::buffer::error& e) {
+ // Multi-block transactions might be incomplete due to unexpected
+ // power off. Hence let's treat that as a regular stop condition.
+ if (seen_recs && more) {
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: failed to decode: " << e.what()
+ << dendl;
+ } else {
+ derr << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: failed to decode: " << e.what()
+ << dendl;
+ delete log_reader;
+ return -EIO;
+ }
+ break;
+ }
+ ceph_assert(seq == t.seq);
+ dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": " << t << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": " << t << std::endl;
+ }
+
+ auto p = t.op_bl.cbegin();
+ while (!p.end()) {
+ __u8 op;
+ decode(op, p);
+ switch (op) {
+
+ case bluefs_transaction_t::OP_INIT:
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_init" << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_init" << std::endl;
+ }
+
+ ceph_assert(t.seq == 1);
+ break;
+
+ case bluefs_transaction_t::OP_JUMP:
+ {
+ uint64_t next_seq;
+ uint64_t offset;
+ decode(next_seq, p);
+ decode(offset, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_jump seq " << next_seq
+ << " offset 0x" << std::hex << offset << std::dec << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_jump seq " << next_seq
+ << " offset 0x" << std::hex << offset << std::dec
+ << std::endl;
+ }
+
+ ceph_assert(next_seq >= log_seq);
+ log_seq = next_seq - 1; // we will increment it below
+ uint64_t skip = offset - read_pos;
+ if (skip) {
+ bufferlist junk;
+ int r = _read(log_reader, read_pos, skip, &junk,
+ NULL);
+ if (r != (int)skip) {
+ dout(10) << __func__ << " 0x" << std::hex << read_pos
+ << ": stop: failed to skip to " << offset
+ << std::dec << dendl;
+ ceph_abort_msg("problem with op_jump");
+ }
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_JUMP_SEQ:
+ {
+ uint64_t next_seq;
+ decode(next_seq, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_jump_seq " << next_seq << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_jump_seq " << next_seq << std::endl;
+ }
+
+ ceph_assert(next_seq >= log_seq);
+ log_seq = next_seq - 1; // we will increment it below
+ }
+ break;
+
+ case bluefs_transaction_t::OP_ALLOC_ADD:
+ // LEGACY, do nothing but read params
+ {
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ }
+ break;
+
+ case bluefs_transaction_t::OP_ALLOC_RM:
+ // LEGACY, do nothing but read params
+ {
+ __u8 id;
+ uint64_t offset, length;
+ decode(id, p);
+ decode(offset, p);
+ decode(length, p);
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_LINK:
+ {
+ string dirname, filename;
+ uint64_t ino;
+ decode(dirname, p);
+ decode(filename, p);
+ decode(ino, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_link " << " " << dirname << "/" << filename
+ << " to " << ino
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_link " << " " << dirname << "/" << filename
+ << " to " << ino
+ << std::endl;
+ }
+
+ if (!noop) {
+ FileRef file = _get_file(ino);
+ ceph_assert(file->fnode.ino);
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+ ceph_assert(r == q->second->file_map.end());
+
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ file->vselector_hint =
+ vselector->get_hint_by_dir(dirname);
+ vselector->add_usage(file->vselector_hint, file->fnode);
+
+ q->second->file_map[filename] = file;
+ ++file->refs;
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_UNLINK:
+ {
+ string dirname, filename;
+ decode(dirname, p);
+ decode(filename, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_unlink " << " " << dirname << "/" << filename
+ << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_unlink " << " " << dirname << "/" << filename
+ << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+ ceph_assert(r != q->second->file_map.end());
+ ceph_assert(r->second->refs > 0);
+ --r->second->refs;
+ q->second->file_map.erase(r);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_CREATE:
+ {
+ string dirname;
+ decode(dirname, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_create " << dirname << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_create " << dirname << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q == dir_map.end());
+ dir_map[dirname] = ceph::make_ref<Dir>();
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_DIR_REMOVE:
+ {
+ string dirname;
+ decode(dirname, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_remove " << dirname << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_dir_remove " << dirname << std::endl;
+ }
+
+ if (!noop) {
+ map<string,DirRef>::iterator q = dir_map.find(dirname);
+ ceph_assert(q != dir_map.end());
+ ceph_assert(q->second->file_map.empty());
+ dir_map.erase(q);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_FILE_UPDATE:
+ {
+ bluefs_fnode_t fnode;
+ decode(fnode, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update " << " " << fnode << " " << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update " << " " << fnode << std::endl;
+ }
+ if (!noop) {
+ FileRef f = _get_file(fnode.ino);
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ int r = _check_allocations(f->fnode,
+ used_blocks, false, "OP_FILE_UPDATE");
+ if (r < 0) {
+ return r;
+ }
+ }
+ if (fnode.ino != 1) {
+ vselector->sub_usage(f->vselector_hint, f->fnode);
+ }
+ f->fnode = fnode;
+ if (fnode.ino != 1) {
+ vselector->add_usage(f->vselector_hint, f->fnode);
+ }
+
+ if (fnode.ino > ino_last) {
+ ino_last = fnode.ino;
+ }
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ int r = _check_allocations(f->fnode,
+ used_blocks, true, "OP_FILE_UPDATE");
+ if (r < 0) {
+ return r;
+ }
+ }
+ } else if (noop && fnode.ino == 1) {
+ FileRef f = _get_file(fnode.ino);
+ f->fnode = fnode;
+ }
+ }
+ break;
+ case bluefs_transaction_t::OP_FILE_UPDATE_INC:
+ {
+ bluefs_fnode_delta_t delta;
+ decode(delta, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update_inc " << " " << delta << " " << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update_inc " << " " << delta << std::endl;
+ }
+ if (!noop) {
+ FileRef f = _get_file(delta.ino);
+ bluefs_fnode_t& fnode = f->fnode;
+ if (delta.offset != fnode.allocated) {
+ derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
+ << " fnode=" << fnode
+ << " delta=" << delta
+ << dendl;
+ ceph_assert(delta.offset == fnode.allocated);
+ }
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ int r = _check_allocations(fnode,
+ used_blocks, false, "OP_FILE_UPDATE_INC");
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ fnode.ino = delta.ino;
+ fnode.mtime = delta.mtime;
+ if (fnode.ino != 1) {
+ vselector->sub_usage(f->vselector_hint, fnode);
+ }
+ fnode.size = delta.size;
+ fnode.claim_extents(delta.extents);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_update_inc produced " << " " << fnode << " " << dendl;
+
+ if (fnode.ino != 1) {
+ vselector->add_usage(f->vselector_hint, fnode);
+ }
+
+ if (fnode.ino > ino_last) {
+ ino_last = fnode.ino;
+ }
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ int r = _check_allocations(f->fnode,
+ used_blocks, true, "OP_FILE_UPDATE_INC");
+ if (r < 0) {
+ return r;
+ }
+ }
+ } else if (noop && delta.ino == 1) {
+ // we need to track bluefs log, even in noop mode
+ FileRef f = _get_file(1);
+ bluefs_fnode_t& fnode = f->fnode;
+ fnode.ino = delta.ino;
+ fnode.mtime = delta.mtime;
+ fnode.size = delta.size;
+ fnode.claim_extents(delta.extents);
+ }
+ }
+ break;
+
+ case bluefs_transaction_t::OP_FILE_REMOVE:
+ {
+ uint64_t ino;
+ decode(ino, p);
+ dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": op_file_remove " << ino << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " 0x" << std::hex << pos << std::dec
+ << ": op_file_remove " << ino << std::endl;
+ }
+
+ if (!noop) {
+ auto p = file_map.find(ino);
+ ceph_assert(p != file_map.end());
+ vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
+ if (cct->_conf->bluefs_log_replay_check_allocations) {
+ int r = _check_allocations(p->second->fnode,
+ used_blocks, false, "OP_FILE_REMOVE");
+ if (r < 0) {
+ return r;
+ }
+ }
+ file_map.erase(p);
+ }
+ }
+ break;
+
+ default:
+ derr << __func__ << " 0x" << std::hex << pos << std::dec
+ << ": stop: unrecognized op " << (int)op << dendl;
+ delete log_reader;
+ return -EIO;
+ }
+ }
+ ceph_assert(p.end());
+
+ // we successfully replayed the transaction; bump the seq and log size
+ ++log_seq;
+ log_file->fnode.size = log_reader->buf.pos;
+ }
+ if (!noop) {
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+ }
+
+ dout(10) << __func__ << " log file size was 0x"
+ << std::hex << log_file->fnode.size << std::dec << dendl;
+ if (unlikely(to_stdout)) {
+ std::cout << " log file size was 0x"
+ << std::hex << log_file->fnode.size << std::dec << std::endl;
+ }
+
+ delete log_reader;
+
+ if (!noop) {
+ // verify file link counts are all >0
+ for (auto& p : file_map) {
+ if (p.second->refs == 0 &&
+ p.second->fnode.ino > 1) {
+ derr << __func__ << " file with link count 0: " << p.second->fnode
+ << dendl;
+ return -EIO;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+ return 0;
+}
+
+int BlueFS::log_dump()
+{
+ // only dump log file's content
+ ceph_assert(log_writer == nullptr && "cannot log_dump on mounted BlueFS");
+ int r = _open_super();
+ if (r < 0) {
+ derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ _init_logger();
+ r = _replay(true, true);
+ if (r < 0) {
+ derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+ }
+ _shutdown_logger();
+ super = bluefs_super_t();
+ return r;
+}
+
+int BlueFS::device_migrate_to_existing(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target,
+ const bluefs_layout_t& layout)
+{
+ vector<byte> buf;
+ bool buffered = cct->_conf->bluefs_buffered_io;
+
+ dout(10) << __func__ << " devs_source " << devs_source
+ << " dev_target " << dev_target << dendl;
+ assert(dev_target < (int)MAX_BDEV);
+
+ int flags = 0;
+ flags |= devs_source.count(BDEV_DB) ?
+ (REMOVE_DB | RENAME_SLOW2DB) : 0;
+ flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+ int dev_target_new = dev_target;
+
+  // A slow device without a separate DB device is addressed via BDEV_DB,
+  // hence the renaming below is needed.
+ if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
+ dev_target_new = BDEV_DB;
+ dout(0) << __func__ << " super to be written to " << dev_target << dendl;
+ }
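+  // Illustrative example (assumed scenario): when REMOVE_DB is requested and
+  // the migration target is the slow device, the copied extents physically
+  // live on the slow device but are recorded as BDEV_DB in the new layout,
+  // which is why dev_target_new differs from dev_target here.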
+
+ for (auto& [ino, file_ref] : file_map) {
+ //do not copy log
+ if (file_ref->fnode.ino == 1) {
+ continue;
+ }
+ dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
+
+ auto& fnode_extents = file_ref->fnode.extents;
+
+ bool rewrite = std::any_of(
+ fnode_extents.begin(),
+ fnode_extents.end(),
+ [=](auto& ext) {
+ return ext.bdev != dev_target && devs_source.count(ext.bdev);
+ });
+ if (rewrite) {
+ dout(10) << __func__ << " migrating" << dendl;
+
+ // read entire file
+ bufferlist bl;
+ for (auto old_ext : fnode_extents) {
+ buf.resize(old_ext.length);
+ int r = bdev[old_ext.bdev]->read_random(
+ old_ext.offset,
+ old_ext.length,
+ (char*)&buf.at(0),
+ buffered);
+ if (r != 0) {
+ derr << __func__ << " failed to read 0x" << std::hex
+ << old_ext.offset << "~" << old_ext.length << std::dec
+	       << " from " << (int)old_ext.bdev << dendl;
+ return -EIO;
+ }
+ bl.append((char*)&buf[0], old_ext.length);
+ }
+
+ // write entire file
+ PExtentVector extents;
+ auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+ if (l < 0) {
+ derr << __func__ << " unable to allocate len 0x" << std::hex
+ << bl.length() << std::dec << " from " << (int)dev_target
+ << ": " << cpp_strerror(l) << dendl;
+ return -ENOSPC;
+ }
+
+ uint64_t off = 0;
+ for (auto& i : extents) {
+ bufferlist cur;
+ uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+ ceph_assert(cur_len > 0);
+ cur.substr_of(bl, off, cur_len);
+ int r = bdev[dev_target]->write(i.offset, cur, buffered);
+ ceph_assert(r == 0);
+ off += cur_len;
+ }
+
+ // release old extents
+ for (auto old_ext : fnode_extents) {
+ PExtentVector to_release;
+ to_release.emplace_back(old_ext.offset, old_ext.length);
+ alloc[old_ext.bdev]->release(to_release);
+ if (is_shared_alloc(old_ext.bdev)) {
+	  shared_alloc->bluefs_used -= old_ext.length;
+ }
+ }
+
+ // update fnode
+ fnode_extents.clear();
+ for (auto& i : extents) {
+ fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+ }
+ } else {
+ for (auto& ext : fnode_extents) {
+ if (dev_target != dev_target_new && ext.bdev == dev_target) {
+ dout(20) << __func__ << " " << " ... adjusting extent 0x"
+ << std::hex << ext.offset << std::dec
+ << " bdev " << dev_target << " -> " << dev_target_new
+ << dendl;
+ ext.bdev = dev_target_new;
+ }
+ }
+ }
+ }
+ // new logging device in the current naming scheme
+ int new_log_dev_cur = bdev[BDEV_WAL] ?
+ BDEV_WAL :
+ bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
+
+ // new logging device in new naming scheme
+ int new_log_dev_next = new_log_dev_cur;
+
+ if (devs_source.count(new_log_dev_cur)) {
+ // SLOW device is addressed via BDEV_DB too hence either WAL or DB
+ new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
+ BDEV_DB :
+ BDEV_WAL;
+
+ dout(0) << __func__ << " log moved from " << new_log_dev_cur
+ << " to " << new_log_dev_next << dendl;
+
+ new_log_dev_cur =
+ (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
+ BDEV_SLOW :
+ new_log_dev_next;
+ }
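+  // Illustrative example (assumed scenario): when only a standalone WAL device
+  // is being removed, the log currently lives on BDEV_WAL and is rewritten to
+  // BDEV_DB, which is its name under both the current and the new scheme.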
+
+ _rewrite_log_and_layout_sync(
+ false,
+ (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
+ new_log_dev_cur,
+ new_log_dev_next,
+ flags,
+ layout);
+ return 0;
+}
+
+int BlueFS::device_migrate_to_new(
+ CephContext *cct,
+ const set<int>& devs_source,
+ int dev_target,
+ const bluefs_layout_t& layout)
+{
+ vector<byte> buf;
+ bool buffered = cct->_conf->bluefs_buffered_io;
+
+ dout(10) << __func__ << " devs_source " << devs_source
+ << " dev_target " << dev_target << dendl;
+  assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
+
+ int flags = 0;
+
+ flags |= devs_source.count(BDEV_DB) ?
+ (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
+ 0;
+ flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+ int dev_target_new = dev_target; //FIXME: remove, makes no sense
+
+ for (auto& p : file_map) {
+ //do not copy log
+ if (p.second->fnode.ino == 1) {
+ continue;
+ }
+ dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
+
+ auto& fnode_extents = p.second->fnode.extents;
+
+ bool rewrite = false;
+ for (auto ext_it = fnode_extents.begin();
+ ext_it != p.second->fnode.extents.end();
+ ++ext_it) {
+ if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
+ rewrite = true;
+ break;
+ }
+ }
+ if (rewrite) {
+ dout(10) << __func__ << " migrating" << dendl;
+
+ // read entire file
+ bufferlist bl;
+ for (auto old_ext : fnode_extents) {
+ buf.resize(old_ext.length);
+ int r = bdev[old_ext.bdev]->read_random(
+ old_ext.offset,
+ old_ext.length,
+ (char*)&buf.at(0),
+ buffered);
+ if (r != 0) {
+ derr << __func__ << " failed to read 0x" << std::hex
+ << old_ext.offset << "~" << old_ext.length << std::dec
+	      << " from " << (int)old_ext.bdev << dendl;
+ return -EIO;
+ }
+ bl.append((char*)&buf[0], old_ext.length);
+ }
+
+ // write entire file
+ PExtentVector extents;
+ auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+ if (l < 0) {
+ derr << __func__ << " unable to allocate len 0x" << std::hex
+ << bl.length() << std::dec << " from " << (int)dev_target
+ << ": " << cpp_strerror(l) << dendl;
+ return -ENOSPC;
+ }
+
+ uint64_t off = 0;
+ for (auto& i : extents) {
+ bufferlist cur;
+ uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+ ceph_assert(cur_len > 0);
+ cur.substr_of(bl, off, cur_len);
+ int r = bdev[dev_target]->write(i.offset, cur, buffered);
+ ceph_assert(r == 0);
+ off += cur_len;
+ }
+
+ // release old extents
+ for (auto old_ext : fnode_extents) {
+ PExtentVector to_release;
+ to_release.emplace_back(old_ext.offset, old_ext.length);
+ alloc[old_ext.bdev]->release(to_release);
+ if (is_shared_alloc(old_ext.bdev)) {
+	  shared_alloc->bluefs_used -= old_ext.length;
+ }
+ }
+
+ // update fnode
+ fnode_extents.clear();
+ for (auto& i : extents) {
+ fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+ }
+ }
+ }
+ // new logging device in the current naming scheme
+ int new_log_dev_cur =
+ bdev[BDEV_NEWWAL] ?
+ BDEV_NEWWAL :
+ bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
+ BDEV_WAL :
+ bdev[BDEV_NEWDB] ?
+ BDEV_NEWDB :
+ bdev[BDEV_DB] && !(flags & REMOVE_DB)?
+ BDEV_DB :
+ BDEV_SLOW;
+
+ // new logging device in new naming scheme
+ int new_log_dev_next =
+ new_log_dev_cur == BDEV_NEWWAL ?
+ BDEV_WAL :
+ new_log_dev_cur == BDEV_NEWDB ?
+ BDEV_DB :
+ new_log_dev_cur;
+
+ int super_dev =
+ dev_target == BDEV_NEWDB ?
+ BDEV_NEWDB :
+ bdev[BDEV_DB] ?
+ BDEV_DB :
+ BDEV_SLOW;
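+  // Illustrative example (assumed scenario): migrating the DB to a fresh
+  // device (dev_target == BDEV_NEWDB) with no WAL present: the log is written
+  // to BDEV_NEWDB now, referred to as BDEV_DB after the rename, and the
+  // superblock goes to BDEV_NEWDB as well.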
+
+ _rewrite_log_and_layout_sync(
+ false,
+ super_dev,
+ new_log_dev_cur,
+ new_log_dev_next,
+ flags,
+ layout);
+ return 0;
+}
+
+BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
+{
+ auto p = file_map.find(ino);
+ if (p == file_map.end()) {
+ FileRef f = ceph::make_ref<File>();
+ file_map[ino] = f;
+ dout(30) << __func__ << " ino " << ino << " = " << f
+ << " (new)" << dendl;
+ return f;
+ } else {
+ dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
+ return p->second;
+ }
+}
+
+void BlueFS::_drop_link(FileRef file)
+{
+ dout(20) << __func__ << " had refs " << file->refs
+ << " on " << file->fnode << dendl;
+ ceph_assert(file->refs > 0);
+ --file->refs;
+ if (file->refs == 0) {
+ dout(20) << __func__ << " destroying " << file->fnode << dendl;
+ ceph_assert(file->num_reading.load() == 0);
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ log_t.op_file_remove(file->fnode.ino);
+ for (auto& r : file->fnode.extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+ file_map.erase(file->fnode.ino);
+ file->deleted = true;
+
+ if (file->dirty_seq) {
+ ceph_assert(file->dirty_seq > log_seq_stable);
+ ceph_assert(dirty_files.count(file->dirty_seq));
+ auto it = dirty_files[file->dirty_seq].iterator_to(*file);
+ dirty_files[file->dirty_seq].erase(it);
+ file->dirty_seq = 0;
+ }
+ }
+}
+
+int64_t BlueFS::_read_random(
+ FileReader *h, ///< [in] read from here
+ uint64_t off, ///< [in] offset
+ uint64_t len, ///< [in] this many bytes
+ char *out) ///< [out] copy it here
+{
+ auto* buf = &h->buf;
+
+ int64_t ret = 0;
+ dout(10) << __func__ << " h " << h
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " from " << h->file->fnode << dendl;
+
+ ++h->file->num_reading;
+
+ if (!h->ignore_eof &&
+ off + len > h->file->fnode.size) {
+ if (off > h->file->fnode.size)
+ len = 0;
+ else
+ len = h->file->fnode.size - off;
+ dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+ << std::hex << len << std::dec << dendl;
+ }
+ logger->inc(l_bluefs_read_random_count, 1);
+ logger->inc(l_bluefs_read_random_bytes, len);
+
+ std::shared_lock s_lock(h->lock);
+ buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+ while (len > 0) {
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+ s_lock.unlock();
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(off, &x_off);
+ ceph_assert(p != h->file->fnode.extents.end());
+ uint64_t l = std::min(p->length - x_off, len);
+ //hard cap to 1GB
+ l = std::min(l, uint64_t(1) << 30);
+ dout(20) << __func__ << " read random 0x"
+ << std::hex << x_off << "~" << l << std::dec
+ << " of " << *p << dendl;
+ int r;
+ if (!cct->_conf->bluefs_check_for_zeros) {
+ r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
+ cct->_conf->bluefs_buffered_io);
+ } else {
+ r = read_random(p->bdev, p->offset + x_off, l, out,
+ cct->_conf->bluefs_buffered_io);
+ }
+ ceph_assert(r == 0);
+ off += l;
+ len -= l;
+ ret += l;
+ out += l;
+
+ logger->inc(l_bluefs_read_random_disk_count, 1);
+ logger->inc(l_bluefs_read_random_disk_bytes, l);
+ if (len > 0) {
+ s_lock.lock();
+ }
+ } else {
+ auto left = buf->get_buf_remaining(off);
+ int64_t r = std::min(len, left);
+ logger->inc(l_bluefs_read_random_buffer_count, 1);
+ logger->inc(l_bluefs_read_random_buffer_bytes, r);
+ dout(20) << __func__ << " left 0x" << std::hex << left
+ << " 0x" << off << "~" << len << std::dec
+ << dendl;
+
+ auto p = buf->bl.begin();
+ p.seek(off - buf->bl_off);
+ p.copy(r, out);
+ out += r;
+
+ dout(30) << __func__ << " result chunk (0x"
+ << std::hex << r << std::dec << " bytes):\n";
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ t.hexdump(*_dout);
+ *_dout << dendl;
+
+ off += r;
+ len -= r;
+ ret += r;
+ buf->pos += r;
+ }
+ }
+ dout(20) << __func__ << " got " << ret << dendl;
+ --h->file->num_reading;
+ return ret;
+}
+
+int64_t BlueFS::_read(
+ FileReader *h, ///< [in] read from here
+ uint64_t off, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ bufferlist *outbl, ///< [out] optional: reference the result here
+ char *out) ///< [out] optional: or copy it here
+{
+ FileReaderBuffer *buf = &(h->buf);
+
+ bool prefetch = !outbl && !out;
+ dout(10) << __func__ << " h " << h
+ << " 0x" << std::hex << off << "~" << len << std::dec
+ << " from " << h->file->fnode
+ << (prefetch ? " prefetch" : "")
+ << dendl;
+
+ ++h->file->num_reading;
+
+ if (!h->ignore_eof &&
+ off + len > h->file->fnode.size) {
+ if (off > h->file->fnode.size)
+ len = 0;
+ else
+ len = h->file->fnode.size - off;
+ dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+ << std::hex << len << std::dec << dendl;
+ }
+ logger->inc(l_bluefs_read_count, 1);
+ logger->inc(l_bluefs_read_bytes, len);
+ if (prefetch) {
+ logger->inc(l_bluefs_read_prefetch_count, 1);
+ logger->inc(l_bluefs_read_prefetch_bytes, len);
+ }
+
+ if (outbl)
+ outbl->clear();
+
+ int64_t ret = 0;
+ std::shared_lock s_lock(h->lock);
+ while (len > 0) {
+ size_t left;
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+ s_lock.unlock();
+ std::unique_lock u_lock(h->lock);
+ buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+ if (off < buf->bl_off || off >= buf->get_buf_end()) {
+        // only refill if the buffer still misses this offset, i.e. the
+        // condition did not change while upgrading the lock.
+ buf->bl.clear();
+ buf->bl_off = off & super.block_mask();
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(buf->bl_off, &x_off);
+ if (p == h->file->fnode.extents.end()) {
+	dout(5) << __func__ << " reading less than required "
+ << ret << "<" << ret + len << dendl;
+ break;
+ }
+
+ uint64_t want = round_up_to(len + (off & ~super.block_mask()),
+ super.block_size);
+ want = std::max(want, buf->max_prefetch);
+ uint64_t l = std::min(p->length - x_off, want);
+ //hard cap to 1GB
+ l = std::min(l, uint64_t(1) << 30);
+ uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
+ if (!h->ignore_eof &&
+ buf->bl_off + l > eof_offset) {
+ l = eof_offset - buf->bl_off;
+ }
+ dout(20) << __func__ << " fetching 0x"
+ << std::hex << x_off << "~" << l << std::dec
+ << " of " << *p << dendl;
+ int r;
+	// when reading the BlueFS log (which only happens on startup) use
+	// non-buffered io to stay in sync with the logic in _flush_range()
+ bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io;
+ if (!cct->_conf->bluefs_check_for_zeros) {
+ r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+ use_buffered_io);
+ } else {
+ r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+ use_buffered_io);
+ }
+ ceph_assert(r == 0);
+ }
+ u_lock.unlock();
+ s_lock.lock();
+ // we should recheck if buffer is valid after lock downgrade
+ continue;
+ }
+ left = buf->get_buf_remaining(off);
+ dout(20) << __func__ << " left 0x" << std::hex << left
+ << " len 0x" << len << std::dec << dendl;
+
+ int64_t r = std::min(len, left);
+ if (outbl) {
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ outbl->claim_append(t);
+ }
+ if (out) {
+ auto p = buf->bl.begin();
+ p.seek(off - buf->bl_off);
+ p.copy(r, out);
+ out += r;
+ }
+
+ dout(30) << __func__ << " result chunk (0x"
+ << std::hex << r << std::dec << " bytes):\n";
+ bufferlist t;
+ t.substr_of(buf->bl, off - buf->bl_off, r);
+ t.hexdump(*_dout);
+ *_dout << dendl;
+
+ off += r;
+ len -= r;
+ ret += r;
+ buf->pos += r;
+ }
+
+ dout(20) << __func__ << " got " << ret << dendl;
+ ceph_assert(!outbl || (int)outbl->length() == ret);
+ --h->file->num_reading;
+ return ret;
+}
+
+void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
+{
+ dout(10) << __func__ << " file " << f->fnode
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (offset & ~super.block_mask()) {
+ offset &= super.block_mask();
+ length = round_up_to(length, super.block_size);
+ }
+ uint64_t x_off = 0;
+ auto p = f->fnode.seek(offset, &x_off);
+ while (length > 0 && p != f->fnode.extents.end()) {
+ uint64_t x_len = std::min(p->length - x_off, length);
+ bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
+ dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
+	     << std::dec << " of " << *p << dendl;
+ offset += x_len;
+ length -= x_len;
+ }
+}
+
+uint64_t BlueFS::_estimate_log_size()
+{
+ int avg_dir_size = 40; // fixme
+ int avg_file_size = 12;
+ uint64_t size = 4096 * 2;
+ size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
+  size += dir_map.size() * (1 + avg_dir_size);
+ size += file_map.size() * (1 + avg_dir_size + avg_file_size);
+ return round_up_to(size, super.block_size);
+}
+
+void BlueFS::compact_log()
+{
+ std::unique_lock<ceph::mutex> l(lock);
+ if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
+ if (cct->_conf->bluefs_compact_log_sync) {
+ _compact_log_sync();
+ } else {
+ _compact_log_async(l);
+ }
+ }
+}
+
+bool BlueFS::_should_compact_log()
+{
+ uint64_t current = log_writer->file->fnode.size;
+ uint64_t expected = _estimate_log_size();
+ float ratio = (float)current / (float)expected;
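+  // Illustrative numbers: a 50 MiB log against a 10 MiB estimate gives a
+  // ratio of 5.0; compaction is requested only when both the absolute size
+  // and the ratio exceed their configured minimums and no async compaction
+  // is already in flight (new_log set).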
+ dout(10) << __func__ << " current 0x" << std::hex << current
+ << " expected " << expected << std::dec
+ << " ratio " << ratio
+ << (new_log ? " (async compaction in progress)" : "")
+ << dendl;
+ if (new_log ||
+ current < cct->_conf->bluefs_log_compact_min_size ||
+ ratio < cct->_conf->bluefs_log_compact_min_ratio) {
+ return false;
+ }
+ return true;
+}
+
+void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags)
+{
+ t->seq = 1;
+ t->uuid = super.uuid;
+ dout(20) << __func__ << " op_init" << dendl;
+
+ t->op_init();
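+  // Illustrative flag combination for the re-labelling below: with
+  // RENAME_SLOW2DB set, an extent recorded on BDEV_SLOW is dumped as
+  // BDEV_DB; with REMOVE_WAL set, any extent still on BDEV_WAL would be a
+  // bug and trips the assert.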
+ for (auto& [ino, file_ref] : file_map) {
+ if (ino == 1)
+ continue;
+ ceph_assert(ino > 1);
+
+ for(auto& e : file_ref->fnode.extents) {
+ auto bdev = e.bdev;
+ auto bdev_new = bdev;
+ ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
+ if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+ bdev_new = BDEV_DB;
+ }
+ if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+ bdev_new = BDEV_SLOW;
+ }
+ if (bdev == BDEV_NEWDB) {
+	// REMOVE_DB xor RENAME_DB2SLOW
+ ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
+ ceph_assert(!(flags & RENAME_SLOW2DB));
+ bdev_new = BDEV_DB;
+ }
+ if (bdev == BDEV_NEWWAL) {
+ ceph_assert(flags & REMOVE_WAL);
+ bdev_new = BDEV_WAL;
+ }
+ e.bdev = bdev_new;
+ }
+ dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
+ t->op_file_update(file_ref->fnode);
+ }
+ for (auto& [path, dir_ref] : dir_map) {
+ dout(20) << __func__ << " op_dir_create " << path << dendl;
+ t->op_dir_create(path);
+ for (auto& [fname, file_ref] : dir_ref->file_map) {
+ dout(20) << __func__ << " op_dir_link " << path << "/" << fname
+ << " to " << file_ref->fnode.ino << dendl;
+ t->op_dir_link(path, fname, file_ref->fnode.ino);
+ }
+ }
+}
+
+void BlueFS::_compact_log_sync()
+{
+ dout(10) << __func__ << dendl;
+ auto prefer_bdev =
+ vselector->select_prefer_bdev(log_writer->file->vselector_hint);
+ _rewrite_log_and_layout_sync(true,
+ BDEV_DB,
+ prefer_bdev,
+ prefer_bdev,
+ 0,
+ super.memorized_layout);
+ logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int log_dev_new,
+ int flags,
+ std::optional<bluefs_layout_t> layout)
+{
+ File *log_file = log_writer->file.get();
+
+ // clear out log (be careful who calls us!!!)
+ log_t.clear();
+
+ dout(20) << __func__ << " super_dev:" << super_dev
+ << " log_dev:" << log_dev
+ << " log_dev_new:" << log_dev_new
+ << " flags:" << flags
+ << dendl;
+ bluefs_transaction_t t;
+ _compact_log_dump_metadata(&t, flags);
+
+ dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
+ t.op_jump_seq(log_seq);
+
+ bufferlist bl;
+ encode(t, bl);
+ _pad_bl(bl);
+
+ uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
+ dout(20) << __func__ << " need " << need << dendl;
+
+ bluefs_fnode_t old_fnode;
+ int r;
+ log_file->fnode.swap_extents(old_fnode);
+ if (allocate_with_fallback) {
+ r = _allocate(log_dev, need, &log_file->fnode);
+ ceph_assert(r == 0);
+ } else {
+ PExtentVector extents;
+ r = _allocate_without_fallback(log_dev,
+ need,
+ &extents);
+ ceph_assert(r == 0);
+ for (auto& p : extents) {
+ log_file->fnode.append_extent(
+ bluefs_extent_t(log_dev, p.offset, p.length));
+ }
+ }
+
+ _close_writer(log_writer);
+
+ // we will write it to super
+ log_file->fnode.reset_delta();
+ log_file->fnode.size = bl.length();
+ vselector->sub_usage(log_file->vselector_hint, old_fnode);
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+ log_writer = _create_writer(log_file);
+ log_writer->append(bl);
+ r = _flush(log_writer, true);
+ ceph_assert(r == 0);
+#ifdef HAVE_LIBAIO
+ if (!cct->_conf->bluefs_sync_write) {
+ list<aio_t> completed_ios;
+ _claim_completed_aios(log_writer, &completed_ios);
+ wait_for_aio(log_writer);
+ completed_ios.clear();
+ }
+#endif
+ flush_bdev();
+
+ super.memorized_layout = layout;
+ super.log_fnode = log_file->fnode;
+ // rename device if needed
+ if (log_dev != log_dev_new) {
+ dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
+ for (auto& p : super.log_fnode.extents) {
+ p.bdev = log_dev_new;
+ }
+ }
+ dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+
+ ++super.version;
+ _write_super(super_dev);
+ flush_bdev();
+
+ dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
+ for (auto& r : old_fnode.extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+}
+
+/*
+ * 1. Allocate a new extent to continue the log, and then log an event
+ * that jumps the log write position to the new extent. At this point, the
+ * old extent(s) won't be written to again, and they reflect everything we
+ * need to compact.
+ * New events will be written to the new region that we'll keep.
+ *
+ * 2. While still holding the lock, encode a bufferlist that dumps all of the
+ * in-memory fnodes and names. This will become the new beginning of the
+ * log. The last event will jump to the log continuation extent from #1.
+ *
+ * 3. Queue a write to a new extent for the new beginning of the log.
+ *
+ * 4. Drop lock and wait
+ *
+ * 5. Retake the lock.
+ *
+ * 6. Update the log_fnode to splice in the new beginning.
+ *
+ * 7. Write the new superblock.
+ *
+ * 8. Release the old log space. Clean up.
+ */
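+/*
+ * Illustrative sketch (not normative): if the old log occupied extents
+ * [A][B] (together covered by old_log_jump_to) and step 1 appends a runway
+ * extent [R], the compacted metadata dump from steps 2-3 lands in freshly
+ * allocated extents [N]; after step 6 the live log fnode is [N][R], and
+ * [A][B] are released in step 8.
+ */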
+void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
+{
+ dout(10) << __func__ << dendl;
+ File *log_file = log_writer->file.get();
+ ceph_assert(!new_log);
+ ceph_assert(!new_log_writer);
+
+ // create a new log [writer] so that we know compaction is in progress
+ // (see _should_compact_log)
+ new_log = ceph::make_ref<File>();
+ new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
+
+ // 0. wait for any racing flushes to complete. (We do not want to block
+  // in _flush_and_sync_log with jump_to set or else a racing thread might flush
+ // our entries and our jump_to update won't be correct.)
+ while (log_flushing) {
+ dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
+ log_cond.wait(l);
+ }
+
+ vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+
+ // 1. allocate new log space and jump to it.
+ old_log_jump_to = log_file->fnode.get_allocated();
+ dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
+ << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
+ int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_file->fnode);
+ ceph_assert(r == 0);
+ //adjust usage as flush below will need it
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+ dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+
+ // update the log file change and log a jump to the offset where we want to
+ // write the new entries
+ log_t.op_file_update(log_file->fnode);
+ log_t.op_jump(log_seq, old_log_jump_to);
+
+ flush_bdev(); // FIXME?
+
+ _flush_and_sync_log(l, 0, old_log_jump_to);
+
+ // 2. prepare compacted log
+ bluefs_transaction_t t;
+  // avoid recording the same metadata twice, in both log_t and
+  // _compact_log_dump_metadata.
+ log_t.clear();
+ _compact_log_dump_metadata(&t, 0);
+
+ uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
+ std::max(alloc_size[BDEV_DB],
+ alloc_size[BDEV_SLOW]));
+
+ // conservative estimate for final encoded size
+ new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
+ max_alloc_size);
+ t.op_jump(log_seq, new_log_jump_to);
+
+ // allocate
+ //FIXME: check if we want DB here?
+ r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
+ &new_log->fnode);
+ ceph_assert(r == 0);
+
+ // we might have some more ops in log_t due to _allocate call
+ t.claim_ops(log_t);
+
+ bufferlist bl;
+ encode(t, bl);
+ _pad_bl(bl);
+
+ dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
+ << std::dec << dendl;
+
+ new_log_writer = _create_writer(new_log);
+ new_log_writer->append(bl);
+
+ // 3. flush
+ r = _flush(new_log_writer, true);
+ ceph_assert(r == 0);
+
+ // 4. wait
+ _flush_bdev_safely(new_log_writer);
+
+ // 5. update our log fnode
+ // discard first old_log_jump_to extents
+
+ dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
+ << " of " << log_file->fnode.extents << dendl;
+
+ vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+
+ uint64_t discarded = 0;
+ mempool::bluefs::vector<bluefs_extent_t> old_extents;
+ while (discarded < old_log_jump_to) {
+ ceph_assert(!log_file->fnode.extents.empty());
+ bluefs_extent_t& e = log_file->fnode.extents.front();
+ bluefs_extent_t temp = e;
+ if (discarded + e.length <= old_log_jump_to) {
+ dout(10) << __func__ << " remove old log extent " << e << dendl;
+ discarded += e.length;
+ log_file->fnode.pop_front_extent();
+ } else {
+ dout(10) << __func__ << " remove front of old log extent " << e << dendl;
+ uint64_t drop = old_log_jump_to - discarded;
+ temp.length = drop;
+ e.offset += drop;
+ e.length -= drop;
+ discarded += drop;
+ dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
+ }
+ old_extents.push_back(temp);
+ }
+ auto from = log_file->fnode.extents.begin();
+ auto to = log_file->fnode.extents.end();
+ while (from != to) {
+ new_log->fnode.append_extent(*from);
+ ++from;
+ }
+ // we will write it to super
+ new_log->fnode.reset_delta();
+
+ // clear the extents from old log file, they are added to new log
+ log_file->fnode.clear_extents();
+ // swap the log files. New log file is the log file now.
+ new_log->fnode.swap_extents(log_file->fnode);
+
+ log_writer->pos = log_writer->file->fnode.size =
+ log_writer->pos - old_log_jump_to + new_log_jump_to;
+
+ vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+ // 6. write the super block to reflect the changes
+ dout(10) << __func__ << " writing super" << dendl;
+ super.log_fnode = log_file->fnode;
+ ++super.version;
+ _write_super(BDEV_DB);
+
+ lock.unlock();
+ flush_bdev();
+ lock.lock();
+
+ // 7. release old space
+ dout(10) << __func__ << " release old log extents " << old_extents << dendl;
+ for (auto& r : old_extents) {
+ pending_release[r.bdev].insert(r.offset, r.length);
+ }
+
+ // delete the new log, remove from the dirty files list
+ _close_writer(new_log_writer);
+ if (new_log->dirty_seq) {
+ ceph_assert(dirty_files.count(new_log->dirty_seq));
+ auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
+ dirty_files[new_log->dirty_seq].erase(it);
+ }
+ new_log_writer = nullptr;
+ new_log = nullptr;
+ log_cond.notify_all();
+
+ dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+ logger->inc(l_bluefs_log_compactions);
+}
+
+void BlueFS::_pad_bl(bufferlist& bl)
+{
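+  // e.g. (illustrative, assuming a 4096-byte block size) a 5000-byte bl has
+  // partial == 904, so 3192 zero bytes are appended to reach 8192 bytes.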
+ uint64_t partial = bl.length() % super.block_size;
+ if (partial) {
+ dout(10) << __func__ << " padding with 0x" << std::hex
+ << super.block_size - partial << " zeros" << std::dec << dendl;
+ bl.append_zero(super.block_size - partial);
+ }
+}
+
+
+int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
+ uint64_t want_seq,
+ uint64_t jump_to)
+{
+ while (log_flushing) {
+ dout(10) << __func__ << " want_seq " << want_seq
+ << " log is currently flushing, waiting" << dendl;
+ ceph_assert(!jump_to);
+ log_cond.wait(l);
+ }
+ if (want_seq && want_seq <= log_seq_stable) {
+ dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
+ << log_seq_stable << ", done" << dendl;
+ ceph_assert(!jump_to);
+ return 0;
+ }
+ if (log_t.empty() && dirty_files.empty()) {
+ dout(10) << __func__ << " want_seq " << want_seq
+ << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
+ ceph_assert(!jump_to);
+ return 0;
+ }
+
+ vector<interval_set<uint64_t>> to_release(pending_release.size());
+ to_release.swap(pending_release);
+
+ uint64_t seq = log_t.seq = ++log_seq;
+ ceph_assert(want_seq == 0 || want_seq <= seq);
+ log_t.uuid = super.uuid;
+
+ // log dirty files
+ auto lsi = dirty_files.find(seq);
+ if (lsi != dirty_files.end()) {
+ dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
+ for (auto &f : lsi->second) {
+ dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
+ log_t.op_file_update_inc(f.fnode);
+ }
+ }
+
+ dout(10) << __func__ << " " << log_t << dendl;
+ ceph_assert(!log_t.empty());
+
+ // allocate some more space (before we run out)?
+ // BTW: this triggers `flush()` in the `page_aligned_appender` of `log_writer`.
+ int64_t runway = log_writer->file->fnode.get_allocated() -
+ log_writer->get_effective_write_pos();
+ bool just_expanded_log = false;
+ if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
+ dout(10) << __func__ << " allocating more log runway (0x"
+ << std::hex << runway << std::dec << " remaining)" << dendl;
+ while (new_log_writer) {
+ dout(10) << __func__ << " waiting for async compaction" << dendl;
+ log_cond.wait(l);
+ }
+ vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
+ int r = _allocate(
+ vselector->select_prefer_bdev(log_writer->file->vselector_hint),
+ cct->_conf->bluefs_max_log_runway,
+ &log_writer->file->fnode);
+ ceph_assert(r == 0);
+ vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
+ log_t.op_file_update_inc(log_writer->file->fnode);
+ just_expanded_log = true;
+ }
+
+ bufferlist bl;
+ bl.reserve(super.block_size);
+ encode(log_t, bl);
+ // pad to block boundary
+ size_t realign = super.block_size - (bl.length() % super.block_size);
+ if (realign && realign != super.block_size)
+ bl.append_zero(realign);
+
+ logger->inc(l_bluefs_logged_bytes, bl.length());
+
+ if (just_expanded_log) {
+ ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
+ }
+
+ log_writer->append(bl);
+
+ log_t.clear();
+ log_t.seq = 0; // just so debug output is less confusing
+ log_flushing = true;
+
+ int r = _flush(log_writer, true);
+ ceph_assert(r == 0);
+
+ if (jump_to) {
+ dout(10) << __func__ << " jumping log offset from 0x" << std::hex
+ << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
+ log_writer->pos = jump_to;
+ vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
+ log_writer->file->fnode.size = jump_to;
+ vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
+ }
+
+ _flush_bdev_safely(log_writer);
+
+ log_flushing = false;
+ log_cond.notify_all();
+
+ // clean dirty files
+ if (seq > log_seq_stable) {
+ log_seq_stable = seq;
+ dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
+
+ auto p = dirty_files.begin();
+ while (p != dirty_files.end()) {
+ if (p->first > log_seq_stable) {
+ dout(20) << __func__ << " done cleaning up dirty files" << dendl;
+ break;
+ }
+
+ auto l = p->second.begin();
+ while (l != p->second.end()) {
+ File *file = &*l;
+ ceph_assert(file->dirty_seq > 0);
+ ceph_assert(file->dirty_seq <= log_seq_stable);
+ dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
+ file->dirty_seq = 0;
+ p->second.erase(l++);
+ }
+
+ ceph_assert(p->second.empty());
+ dirty_files.erase(p++);
+ }
+ } else {
+ dout(20) << __func__ << " log_seq_stable " << log_seq_stable
+ << " already >= out seq " << seq
+ << ", we lost a race against another log flush, done" << dendl;
+ }
+
+ for (unsigned i = 0; i < to_release.size(); ++i) {
+ if (!to_release[i].empty()) {
+ /* OK, now we have the guarantee alloc[i] won't be null. */
+ int r = 0;
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev[i]->queue_discard(to_release[i]);
+ if (r == 0)
+ continue;
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
+ bdev[i]->discard(p.get_start(), p.get_len());
+ }
+ }
+ alloc[i]->release(to_release[i]);
+ if (is_shared_alloc(i)) {
+ shared_alloc->bluefs_used -= to_release[i].size();
+ }
+ }
+ }
+
+ _update_logger_stats();
+
+ return 0;
+}
+
+ceph::bufferlist BlueFS::FileWriter::flush_buffer(
+ CephContext* const cct,
+ const bool partial,
+ const unsigned length,
+ const bluefs_super_t& super)
+{
+ ceph::bufferlist bl;
+ if (partial) {
+ tail_block.splice(0, tail_block.length(), &bl);
+ }
+ const auto remaining_len = length - bl.length();
+ buffer.splice(0, remaining_len, &bl);
+ if (buffer.length()) {
+ dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
+ << " unflushed" << dendl;
+ }
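+  // Illustrative example (assuming a 4096-byte block size and no cached
+  // tail): flushing length == 6000 splices 6000 bytes into bl, pads it with
+  // 2192 zeros up to an aligned 8192 bytes, and caches the trailing 1904
+  // data bytes in tail_block so the next partial flush can rewrite that
+  // final block with real data in front.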
+ if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
+ const auto padding_len = super.block_size - tail;
+ dout(20) << __func__ << " caching tail of 0x"
+ << std::hex << tail
+ << " and padding block with 0x" << padding_len
+ << " buffer.length() " << buffer.length()
+ << std::dec << dendl;
+ // We need to go through the `buffer_appender` to get a chance to
+ // preserve in-memory contiguity and not mess with the alignment.
+ // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
+ buffer_appender.append_zero(padding_len);
+ buffer.splice(buffer.length() - padding_len, padding_len, &bl);
+ // Deep copy the tail here. This allows to avoid costlier copy on
+ // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
+ // of memory allocations.
+ // The alternative approach would be to place the entire tail and
+ // padding on a dedicated, 4 KB long memory chunk. This shouldn't
+ // trigger the rebuild while still being less expensive.
+ buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
+ buffer.splice(buffer.length() - tail, tail, &tail_block);
+ } else {
+ tail_block.clear();
+ }
+ return bl;
+}
+
+int BlueFS::_signal_dirty_to_log(FileWriter *h)
+{
+ if (h->file->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+
+ h->file->fnode.mtime = ceph_clock_now();
+ ceph_assert(h->file->fnode.ino >= 1);
+ if (h->file->dirty_seq == 0) {
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was clean)" << dendl;
+ } else {
+ if (h->file->dirty_seq != log_seq + 1) {
+ // need re-dirty, erase from list first
+ ceph_assert(dirty_files.count(h->file->dirty_seq));
+ auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
+ dirty_files[h->file->dirty_seq].erase(it);
+      dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+	       << " (was " << h->file->dirty_seq << ")" << dendl;
+      h->file->dirty_seq = log_seq + 1;
+      dirty_files[h->file->dirty_seq].push_back(*h->file);
+ } else {
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (unchanged, do nothing) " << dendl;
+ }
+ }
+ return 0;
+}
+
+int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
+{
+ dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
+ << " 0x" << offset << "~" << length << std::dec
+ << " to " << h->file->fnode << dendl;
+ if (h->file->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+
+ ceph_assert(h->file->num_readers.load() == 0);
+
+ bool buffered;
+ if (h->file->fnode.ino == 1)
+ buffered = false;
+ else
+ buffered = cct->_conf->bluefs_buffered_io;
+
+ if (offset + length <= h->pos)
+ return 0;
+ if (offset < h->pos) {
+ length -= h->pos - offset;
+ offset = h->pos;
+ dout(10) << " still need 0x"
+ << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ }
+ ceph_assert(offset <= h->file->fnode.size);
+
+ uint64_t allocated = h->file->fnode.get_allocated();
+ vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
+ // do not bother to dirty the file if we are overwriting
+ // previously allocated extents.
+
+ if (allocated < offset + length) {
+ // we should never run out of log space here; see the min runway check
+ // in _flush_and_sync_log.
+ ceph_assert(h->file->fnode.ino != 1);
+ int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
+ offset + length - allocated,
+ &h->file->fnode);
+ if (r < 0) {
+ derr << __func__ << " allocated: 0x" << std::hex << allocated
+ << " offset: 0x" << offset << " length: 0x" << length << std::dec
+ << dendl;
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
+ ceph_abort_msg("bluefs enospc");
+ return r;
+ }
+ h->file->is_dirty = true;
+ }
+ if (h->file->fnode.size < offset + length) {
+ h->file->fnode.size = offset + length;
+ if (h->file->fnode.ino > 1) {
+      // we do not need to dirty the log file (or its compacting
+      // replacement) when the file size changes, because replay is
+      // smart enough to discover it on its own.
+ h->file->is_dirty = true;
+ }
+ }
+ dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
+
+ uint64_t x_off = 0;
+ auto p = h->file->fnode.seek(offset, &x_off);
+ ceph_assert(p != h->file->fnode.extents.end());
+ dout(20) << __func__ << " in " << *p << " x_off 0x"
+ << std::hex << x_off << std::dec << dendl;
+
+ unsigned partial = x_off & ~super.block_mask();
+ if (partial) {
+ dout(20) << __func__ << " using partial tail 0x"
+ << std::hex << partial << std::dec << dendl;
+ x_off -= partial;
+ offset -= partial;
+ length += partial;
+ dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
+ for (auto p : h->iocv) {
+ if (p) {
+ p->aio_wait();
+ }
+ }
+ }
+
+ auto bl = h->flush_buffer(cct, partial, length, super);
+ ceph_assert(bl.length() >= length);
+ h->pos = offset + length;
+ length = bl.length();
+
+ switch (h->writer_type) {
+ case WRITER_WAL:
+ logger->inc(l_bluefs_bytes_written_wal, length);
+ break;
+ case WRITER_SST:
+ logger->inc(l_bluefs_bytes_written_sst, length);
+ break;
+ }
+
+ dout(30) << "dump:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ uint64_t bloff = 0;
+ uint64_t bytes_written_slow = 0;
+ while (length > 0) {
+ uint64_t x_len = std::min(p->length - x_off, length);
+ bufferlist t;
+ t.substr_of(bl, bloff, x_len);
+ if (cct->_conf->bluefs_sync_write) {
+ bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
+ } else {
+ bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
+ }
+ h->dirty_devs[p->bdev] = true;
+ if (p->bdev == BDEV_SLOW) {
+ bytes_written_slow += t.length();
+ }
+
+ bloff += x_len;
+ length -= x_len;
+ ++p;
+ x_off = 0;
+ }
+ if (bytes_written_slow) {
+ logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
+ }
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
+ bdev[i]->aio_submit(h->iocv[i]);
+ }
+ }
+ }
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode);
+ dout(20) << __func__ << " h " << h << " pos now 0x"
+ << std::hex << h->pos << std::dec << dendl;
+ return 0;
+}
+
+#ifdef HAVE_LIBAIO
+// we need to retire old completed aios so they don't stick around in
+// memory indefinitely (along with their bufferlist refs).
+void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
+{
+ for (auto p : h->iocv) {
+ if (p) {
+ ls->splice(ls->end(), p->running_aios);
+ }
+ }
+ dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
+}
+
+void BlueFS::wait_for_aio(FileWriter *h)
+{
+ // NOTE: this is safe to call without a lock, as long as our reference is
+ // stable.
+ utime_t start;
+ lgeneric_subdout(cct, bluefs, 10) << __func__;
+ start = ceph_clock_now();
+ *_dout << " " << h << dendl;
+ for (auto p : h->iocv) {
+ if (p) {
+ p->aio_wait();
+ }
+ }
+ dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
+}
+#endif
+
+int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
+{
+ bool flushed = false;
+ int r = _flush(h, force, &flushed);
+ if (r == 0 && flushed) {
+ _maybe_compact_log(l);
+ }
+ return r;
+}
+
+int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
+{
+ uint64_t length = h->get_buffer_length();
+ uint64_t offset = h->pos;
+ if (flushed) {
+ *flushed = false;
+ }
+ if (!force &&
+ length < cct->_conf->bluefs_min_flush_size) {
+ dout(10) << __func__ << " " << h << " ignoring, length " << length
+ << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
+ << dendl;
+ return 0;
+ }
+ if (length == 0) {
+ dout(10) << __func__ << " " << h << " no dirty data on "
+ << h->file->fnode << dendl;
+ return 0;
+ }
+ dout(10) << __func__ << " " << h << " 0x"
+ << std::hex << offset << "~" << length << std::dec
+ << " to " << h->file->fnode << dendl;
+ ceph_assert(h->pos <= h->file->fnode.size);
+ int r = _flush_range(h, offset, length);
+ if (flushed) {
+ *flushed = true;
+ }
+ return r;
+}
+
+int BlueFS::_truncate(FileWriter *h, uint64_t offset)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
+ << " file " << h->file->fnode << dendl;
+ if (h->file->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+
+ // we never truncate internal log files
+ ceph_assert(h->file->fnode.ino > 1);
+
+ // truncate off unflushed data?
+ if (h->pos < offset &&
+ h->pos + h->get_buffer_length() > offset) {
+ dout(20) << __func__ << " tossing out last " << offset - h->pos
+ << " unflushed bytes" << dendl;
+ ceph_abort_msg("actually this shouldn't happen");
+ }
+ if (h->get_buffer_length()) {
+ int r = _flush(h, true);
+ if (r < 0)
+ return r;
+ }
+ if (offset == h->file->fnode.size) {
+ return 0; // no-op!
+ }
+ if (offset > h->file->fnode.size) {
+ ceph_abort_msg("truncate up not supported");
+ }
+ ceph_assert(h->file->fnode.size >= offset);
+ _flush_bdev_safely(h);
+ vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
+ h->file->fnode.size = offset;
+ vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
+
+ log_t.op_file_update_inc(h->file->fnode);
+ return 0;
+}
+
+int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
+{
+ dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
+ int r = _flush(h, true);
+ if (r < 0)
+ return r;
+ if (h->file->is_dirty) {
+ _signal_dirty_to_log(h);
+ h->file->is_dirty = false;
+ }
+ uint64_t old_dirty_seq = h->file->dirty_seq;
+
+ _flush_bdev_safely(h);
+
+ if (old_dirty_seq) {
+ uint64_t s = log_seq;
+ dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
+ << ") on " << h->file->fnode << ", flushing log" << dendl;
+ _flush_and_sync_log(l, old_dirty_seq);
+ ceph_assert(h->file->dirty_seq == 0 || // cleaned
+ h->file->dirty_seq > s); // or redirtied by someone else
+ }
+ return 0;
+}
+
+void BlueFS::_flush_bdev_safely(FileWriter *h)
+{
+ std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
+ h->dirty_devs.fill(false);
+#ifdef HAVE_LIBAIO
+ if (!cct->_conf->bluefs_sync_write) {
+ list<aio_t> completed_ios;
+ _claim_completed_aios(h, &completed_ios);
+ lock.unlock();
+ wait_for_aio(h);
+ completed_ios.clear();
+ flush_bdev(flush_devs);
+ lock.lock();
+ } else
+#endif
+ {
+ lock.unlock();
+ flush_bdev(flush_devs);
+ lock.lock();
+ }
+}
+
+void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
+{
+ // NOTE: this is safe to call without a lock.
+ dout(20) << __func__ << dendl;
+ for (unsigned i = 0; i < MAX_BDEV; i++) {
+ if (dirty_bdevs[i])
+ bdev[i]->flush();
+ }
+}
+
+void BlueFS::flush_bdev()
+{
+ // NOTE: this is safe to call without a lock.
+ dout(20) << __func__ << dendl;
+ for (unsigned i = 0; i < MAX_BDEV; i++) {
+    // allocating space from BDEV_SLOW is unexpected, so in most cases nothing
+    // lives there; skip flushing a device that is not in use.
+ if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
+ bdev[i]->flush();
+ }
+ }
+}
+
+const char* BlueFS::get_device_name(unsigned id)
+{
+ if (id >= MAX_BDEV) return "BDEV_INV";
+ const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
+ return names[id];
+}
+
+int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
+ PExtentVector* extents)
+{
+ dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+ << " from " << (int)id << dendl;
+ assert(id < alloc.size());
+ if (!alloc[id]) {
+ return -ENOENT;
+ }
+ extents->reserve(4); // 4 should be (more than) enough for most allocations
+ int64_t need = round_up_to(len, alloc_size[id]);
+ int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
+ if (alloc_len < 0 || alloc_len < need) {
+ if (alloc_len > 0) {
+ alloc[id]->release(*extents);
+ }
+ derr << __func__ << " unable to allocate 0x" << std::hex << need
+ << " on bdev " << (int)id
+ << ", allocator name " << alloc[id]->get_name()
+ << ", allocator type " << alloc[id]->get_type()
+ << ", capacity 0x" << alloc[id]->get_capacity()
+ << ", block size 0x" << alloc[id]->get_block_size()
+ << ", free 0x" << alloc[id]->get_free()
+ << ", fragmentation " << alloc[id]->get_fragmentation()
+ << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
+ << std::dec << dendl;
+ alloc[id]->dump();
+ return -ENOSPC;
+ }
+ if (is_shared_alloc(id)) {
+ shared_alloc->bluefs_used += alloc_len;
+ }
+
+ return 0;
+}
+
+int BlueFS::_allocate(uint8_t id, uint64_t len,
+ bluefs_fnode_t* node)
+{
+ dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+ << " from " << (int)id << dendl;
+ ceph_assert(id < alloc.size());
+ int64_t alloc_len = 0;
+ PExtentVector extents;
+ uint64_t hint = 0;
+ int64_t need = len;
+ if (alloc[id]) {
+ need = round_up_to(len, alloc_size[id]);
+ if (!node->extents.empty() && node->extents.back().bdev == id) {
+ hint = node->extents.back().end();
+ }
+ extents.reserve(4); // 4 should be (more than) enough for most allocations
+ alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
+ }
+ if (alloc_len < 0 || alloc_len < need) {
+ if (alloc[id]) {
+ if (alloc_len > 0) {
+ alloc[id]->release(extents);
+ }
+ dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
+ << " on bdev " << (int)id
+ << ", allocator name " << alloc[id]->get_name()
+ << ", allocator type " << alloc[id]->get_type()
+ << ", capacity 0x" << alloc[id]->get_capacity()
+ << ", block size 0x" << alloc[id]->get_block_size()
+ << ", free 0x" << alloc[id]->get_free()
+ << ", fragmentation " << alloc[id]->get_fragmentation()
+ << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
+ << std::dec << dendl;
+ }
+
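+    // fall back to the next device id; per get_device_name() the order is
+    // BDEV_WAL -> BDEV_DB -> BDEV_SLOW, so the slow device is the last resort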
+ if (id != BDEV_SLOW) {
+ dout(20) << __func__ << " fallback to bdev "
+ << (int)id + 1
+ << dendl;
+ return _allocate(id + 1, len, node);
+ } else {
+ derr << __func__ << " allocation failed, needed 0x" << std::hex << need
+ << dendl;
+ }
+ return -ENOSPC;
+ } else {
+ uint64_t used = _get_used(id);
+ if (max_bytes[id] < used) {
+ logger->set(max_bytes_pcounters[id], used);
+ max_bytes[id] = used;
+ }
+ if (is_shared_alloc(id)) {
+ shared_alloc->bluefs_used += alloc_len;
+ }
+ }
+
+ for (auto& p : extents) {
+ node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+ }
+
+ return 0;
+}
+
+int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
+{
+ dout(10) << __func__ << " file " << f->fnode << " 0x"
+ << std::hex << off << "~" << len << std::dec << dendl;
+ if (f->deleted) {
+ dout(10) << __func__ << " deleted, no-op" << dendl;
+ return 0;
+ }
+ ceph_assert(f->fnode.ino > 1);
+ uint64_t allocated = f->fnode.get_allocated();
+ if (off + len > allocated) {
+ uint64_t want = off + len - allocated;
+
+ vselector->sub_usage(f->vselector_hint, f->fnode);
+ int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
+ want,
+ &f->fnode);
+ vselector->add_usage(f->vselector_hint, f->fnode);
+ if (r < 0)
+ return r;
+ log_t.op_file_update_inc(f->fnode);
+ }
+ return 0;
+}
+
+void BlueFS::sync_metadata(bool avoid_compact)
+{
+ std::unique_lock l(lock);
+ if (log_t.empty() && dirty_files.empty()) {
+ dout(10) << __func__ << " - no pending log events" << dendl;
+ } else {
+ utime_t start;
+ lgeneric_subdout(cct, bluefs, 10) << __func__;
+ start = ceph_clock_now();
+ *_dout << dendl;
+ flush_bdev(); // FIXME?
+ _flush_and_sync_log(l);
+ dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
+ }
+
+ if (!avoid_compact) {
+ _maybe_compact_log(l);
+ }
+}
+
+void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
+{
+ if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
+ _should_compact_log()) {
+ if (cct->_conf->bluefs_compact_log_sync) {
+ _compact_log_sync();
+ } else {
+ _compact_log_async(l);
+ }
+ }
+}
+
+int BlueFS::open_for_write(
+ std::string_view dirname,
+ std::string_view filename,
+ FileWriter **h,
+ bool overwrite)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ DirRef dir;
+ if (p == dir_map.end()) {
+    // the dir must already exist; we do not implicitly create it
+ dout(20) << __func__ << " dir " << dirname
+ << " does not exist" << dendl;
+ return -ENOENT;
+ } else {
+ dir = p->second;
+ }
+
+ FileRef file;
+ bool create = false;
+ bool truncate = false;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ if (overwrite) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " does not exist" << dendl;
+ return -ENOENT;
+ }
+ file = ceph::make_ref<File>();
+ file->fnode.ino = ++ino_last;
+ file_map[ino_last] = file;
+ dir->file_map[string{filename}] = file;
+ ++file->refs;
+ create = true;
+ } else {
+ // overwrite existing file?
+ file = q->second;
+ if (overwrite) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " already exists, overwrite in place" << dendl;
+ } else {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " already exists, truncate + overwrite" << dendl;
+ vselector->sub_usage(file->vselector_hint, file->fnode);
+ file->fnode.size = 0;
+ for (auto& p : file->fnode.extents) {
+ pending_release[p.bdev].insert(p.offset, p.length);
+ }
+ truncate = true;
+
+ file->fnode.clear_extents();
+ }
+ }
+ ceph_assert(file->fnode.ino > 1);
+
+ file->fnode.mtime = ceph_clock_now();
+ file->vselector_hint = vselector->get_hint_by_dir(dirname);
+ if (create || truncate) {
+ vselector->add_usage(file->vselector_hint, file->fnode); // update file count
+ }
+
+ dout(20) << __func__ << " mapping " << dirname << "/" << filename
+ << " vsel_hint " << file->vselector_hint
+ << dendl;
+
+ log_t.op_file_update(file->fnode);
+ if (create)
+ log_t.op_dir_link(dirname, filename, file->fnode.ino);
+
+ *h = _create_writer(file);
+
+ if (boost::algorithm::ends_with(filename, ".log")) {
+ (*h)->writer_type = BlueFS::WRITER_WAL;
+ if (logger && !overwrite) {
+ logger->inc(l_bluefs_files_written_wal);
+ }
+ } else if (boost::algorithm::ends_with(filename, ".sst")) {
+ (*h)->writer_type = BlueFS::WRITER_SST;
+ if (logger) {
+ logger->inc(l_bluefs_files_written_sst);
+ }
+ }
+
+ dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
+ return 0;
+}
+
+BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
+{
+ FileWriter *w = new FileWriter(f);
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ w->iocv[i] = new IOContext(cct, NULL);
+ }
+ }
+ return w;
+}
+
+void BlueFS::_close_writer(FileWriter *h)
+{
+ dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
+ //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
+ for (unsigned i=0; i<MAX_BDEV; ++i) {
+ if (bdev[i]) {
+ if (h->iocv[i]) {
+ h->iocv[i]->aio_wait();
+ bdev[i]->queue_reap_ioc(h->iocv[i]);
+ }
+ }
+ }
+ // sanity
+ if (h->file->fnode.size >= (1ull << 30)) {
+    dout(10) << __func__ << " file is unexpectedly large: " << h->file->fnode << dendl;
+ }
+ delete h;
+}
+
+uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
+{
+ std::lock_guard l(lock);
+ return h->file->dirty_seq;
+}
+
+bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
+{
+ std::lock_guard l(lock);
+ return h->dirty_devs[dev];
+}
+
+int BlueFS::open_for_read(
+ std::string_view dirname,
+ std::string_view filename,
+ FileReader **h,
+ bool random)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename
+ << (random ? " (random)":" (sequential)") << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ File *file = q->second.get();
+
+ *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
+ random, false);
+ dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
+ return 0;
+}
+
+int BlueFS::rename(
+ std::string_view old_dirname, std::string_view old_filename,
+ std::string_view new_dirname, std::string_view new_filename)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << old_dirname << "/" << old_filename
+ << " -> " << new_dirname << "/" << new_filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(old_dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef old_dir = p->second;
+ map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
+ if (q == old_dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
+ << ") file " << old_filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ FileRef file = q->second;
+
+ p = dir_map.find(new_dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef new_dir = p->second;
+ q = new_dir->file_map.find(new_filename);
+ if (q != new_dir->file_map.end()) {
+    dout(20) << __func__ << " dir " << new_dirname << " (" << new_dir
+ << ") file " << new_filename
+ << " already exists, unlinking" << dendl;
+ ceph_assert(q->second != file);
+ log_t.op_dir_unlink(new_dirname, new_filename);
+ _drop_link(q->second);
+ }
+
+ dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
+	   << file->fnode << dendl;
+
+ new_dir->file_map[string{new_filename}] = file;
+ old_dir->file_map.erase(string{old_filename});
+
+ log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
+ log_t.op_dir_unlink(old_dirname, old_filename);
+ return 0;
+}
+
+int BlueFS::mkdir(std::string_view dirname)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p != dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
+ return -EEXIST;
+ }
+ dir_map[string{dirname}] = ceph::make_ref<Dir>();
+ log_t.op_dir_create(dirname);
+ return 0;
+}
+
+int BlueFS::rmdir(std::string_view dirname)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ auto p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ if (!dir->file_map.empty()) {
+ dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
+ return -ENOTEMPTY;
+ }
+ dir_map.erase(string{dirname});
+ log_t.op_dir_remove(dirname);
+ return 0;
+}
+
+bool BlueFS::dir_exists(std::string_view dirname)
+{
+ std::lock_guard l(lock);
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ bool exists = p != dir_map.end();
+ dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
+ return exists;
+}
+
+int BlueFS::stat(std::string_view dirname, std::string_view filename,
+ uint64_t *size, utime_t *mtime)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ File *file = q->second.get();
+ dout(10) << __func__ << " " << dirname << "/" << filename
+ << " " << file->fnode << dendl;
+ if (size)
+ *size = file->fnode.size;
+ if (mtime)
+ *mtime = file->fnode.mtime;
+ return 0;
+}
+
+int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
+ FileLock **plock)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ auto q = dir->file_map.find(filename);
+ FileRef file;
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " (" << dir
+ << ") file " << filename
+ << " not found, creating" << dendl;
+ file = ceph::make_ref<File>();
+ file->fnode.ino = ++ino_last;
+ file->fnode.mtime = ceph_clock_now();
+ file_map[ino_last] = file;
+ dir->file_map[string{filename}] = file;
+ ++file->refs;
+ log_t.op_file_update(file->fnode);
+ log_t.op_dir_link(dirname, filename, file->fnode.ino);
+ } else {
+ file = q->second;
+ if (file->locked) {
+ dout(10) << __func__ << " already locked" << dendl;
+ return -ENOLCK;
+ }
+ }
+ file->locked = true;
+ *plock = new FileLock(file);
+ dout(10) << __func__ << " locked " << file->fnode
+ << " with " << *plock << dendl;
+ return 0;
+}
+
+int BlueFS::unlock_file(FileLock *fl)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
+ ceph_assert(fl->file->locked);
+ fl->file->locked = false;
+ delete fl;
+ return 0;
+}
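+
+/*
+ * Usage sketch (illustrative; names are hypothetical): callers pair
+ * lock_file() with unlock_file() to guard exclusive access, e.g. for a
+ * RocksDB-style LOCK file.  unlock_file() also deletes the FileLock.
+ *
+ *   BlueFS::FileLock *flock = nullptr;
+ *   if (fs.lock_file("db", "LOCK", &flock) == 0) {
+ *     // ... exclusive access to the db ...
+ *     fs.unlock_file(flock);
+ *   }
+ */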
+
+int BlueFS::readdir(std::string_view dirname, vector<string> *ls)
+{
+ // dirname may contain a trailing /
+ if (!dirname.empty() && dirname.back() == '/') {
+ dirname.remove_suffix(1);
+ }
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << dendl;
+ if (dirname.empty()) {
+ // list dirs
+ ls->reserve(dir_map.size() + 2);
+ for (auto& q : dir_map) {
+ ls->push_back(q.first);
+ }
+ } else {
+ // list files in dir
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ ls->reserve(dir->file_map.size() + 2);
+ for (auto& q : dir->file_map) {
+ ls->push_back(q.first);
+ }
+ }
+ ls->push_back(".");
+ ls->push_back("..");
+ return 0;
+}
+
+int BlueFS::unlink(std::string_view dirname, std::string_view filename)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+ map<string,DirRef>::iterator p = dir_map.find(dirname);
+ if (p == dir_map.end()) {
+ dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+ return -ENOENT;
+ }
+ DirRef dir = p->second;
+ map<string,FileRef>::iterator q = dir->file_map.find(filename);
+ if (q == dir->file_map.end()) {
+ dout(20) << __func__ << " file " << dirname << "/" << filename
+ << " not found" << dendl;
+ return -ENOENT;
+ }
+ FileRef file = q->second;
+ if (file->locked) {
+ dout(20) << __func__ << " file " << dirname << "/" << filename
+ << " is locked" << dendl;
+ return -EBUSY;
+ }
+ dir->file_map.erase(string{filename});
+ log_t.op_dir_unlink(dirname, filename);
+ _drop_link(file);
+ return 0;
+}
+
+bool BlueFS::wal_is_rotational()
+{
+ if (bdev[BDEV_WAL]) {
+ return bdev[BDEV_WAL]->is_rotational();
+ } else if (bdev[BDEV_DB]) {
+ return bdev[BDEV_DB]->is_rotational();
+ }
+ return bdev[BDEV_SLOW]->is_rotational();
+}
+
+/*
+  Algorithm.
+  do_replay_recovery_read is used when the bluefs log ends abruptly but it seems that more data should be there.
+  The idea is to search the disk for the definition of an extent that would have been appended to the bluefs log,
+  and check whether using it produces a healthy bluefs transaction.
+  We encode the already-known bluefs log extents and search the disk for these bytes.
+  When we find them, we decode the following bytes as an extent.
+  We then read that whole extent and check whether, merged with the existing log part, it gives a proper bluefs transaction.
+ */
+int BlueFS::do_replay_recovery_read(FileReader *log_reader,
+ size_t replay_pos,
+ size_t read_offset,
+ size_t read_len,
+ bufferlist* bl) {
+ dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
+ " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
+
+ bluefs_fnode_t& log_fnode = log_reader->file->fnode;
+ bufferlist bin_extents;
+ ::encode(log_fnode.extents, bin_extents);
+ dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
+
+ // cannot process if too small to effectively search
+ ceph_assert(bin_extents.length() >= 32);
+ bufferlist last_32;
+ last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
+
+ //read fixed part from replay_pos to end of bluefs_log extents
+ bufferlist fixed;
+ uint64_t e_off = 0;
+ auto e = log_fnode.seek(replay_pos, &e_off);
+ ceph_assert(e != log_fnode.extents.end());
+ int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+ //capture dev of last good extent
+ uint8_t last_e_dev = e->bdev;
+ uint64_t last_e_off = e->offset;
+ ++e;
+ while (e != log_fnode.extents.end()) {
+ r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+ last_e_dev = e->bdev;
+ ++e;
+ }
+ ceph_assert(replay_pos + fixed.length() == read_offset);
+
+ dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
+
+ struct compare {
+ bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
+      if (a.bdev != b.bdev) return a.bdev < b.bdev;
+      if (a.offset != b.offset) return a.offset < b.offset;
+ return a.length < b.length;
+ }
+ };
+ std::set<bluefs_extent_t, compare> extents_rejected;
+ for (int dcnt = 0; dcnt < 3; dcnt++) {
+ uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
+ if (bdev[dev] == nullptr) continue;
+ dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
+ interval_set<uint64_t> disk_regions;
+ disk_regions.insert(0, bdev[dev]->get_size());
+ for (auto f : file_map) {
+ auto& e = f.second->fnode.extents;
+ for (auto& p : e) {
+ if (p.bdev == dev) {
+ disk_regions.erase(p.offset, p.length);
+ }
+ }
+ }
+ size_t disk_regions_count = disk_regions.num_intervals();
+ dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
+
+ auto reg = disk_regions.lower_bound(last_e_off);
+ //for all except first, start from beginning
+ last_e_off = 0;
+ if (reg == disk_regions.end()) {
+ reg = disk_regions.begin();
+ }
+ const uint64_t chunk_size = 4 * 1024 * 1024;
+ const uint64_t page_size = 4096;
+ const uint64_t max_extent_size = 16;
+ uint64_t overlay_size = last_32.length() + max_extent_size;
+ for (size_t i = 0; i < disk_regions_count; reg++, i++) {
+ if (reg == disk_regions.end()) {
+ reg = disk_regions.begin();
+ }
+ uint64_t pos = reg.get_start();
+ uint64_t len = reg.get_len();
+
+ std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
+ char* raw_data = raw_data_p.get();
+ memset(raw_data, 0, page_size);
+
+ while (len > last_32.length()) {
+ uint64_t chunk_len = len > chunk_size ? chunk_size : len;
+ dout(5) << __func__ << " read "
+ << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
+ r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+
+      //search for last_32 (tail of the encoded log extents)
+ char* chunk_b = raw_data + page_size;
+ char* chunk_e = chunk_b + chunk_len;
+
+ char* search_b = chunk_b - overlay_size;
+ char* search_e = chunk_e;
+
+ for (char* sp = search_b; ; sp += last_32.length()) {
+ sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
+ if (sp == nullptr) {
+ break;
+ }
+
+ char* n = sp + last_32.length();
+ dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
+ bufferlist test;
+ test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
+ bluefs_extent_t ne;
+ try {
+ bufferlist::const_iterator p = test.begin();
+ ::decode(ne, p);
+ } catch (buffer::error& e) {
+ continue;
+ }
+ if (extents_rejected.count(ne) != 0) {
+	  dout(5) << __func__ << " extent " << ne << " already rejected" << dendl;
+ continue;
+ }
+	//insert as rejected already; if we succeed, it won't make any difference.
+ extents_rejected.insert(ne);
+
+ if (ne.bdev >= MAX_BDEV ||
+ bdev[ne.bdev] == nullptr ||
+ ne.length > 16 * 1024 * 1024 ||
+ (ne.length & 4095) != 0 ||
+ ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
+ (ne.offset & 4095) != 0) {
+ dout(5) << __func__ << " refusing extent " << ne << dendl;
+ continue;
+ }
+ dout(5) << __func__ << " checking extent " << ne << dendl;
+
+ //read candidate extent - whole
+ bufferlist candidate;
+ candidate.append(fixed);
+ r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
+ cct->_conf->bluefs_buffered_io);
+ ceph_assert(r == 0);
+
+ //check if transaction & crc is ok
+ bluefs_transaction_t t;
+ try {
+ bufferlist::const_iterator p = candidate.begin();
+ ::decode(t, p);
+ }
+ catch (buffer::error& e) {
+ dout(5) << __func__ << " failed match" << dendl;
+ continue;
+ }
+
+ //success, it seems a probable candidate
+ uint64_t l = std::min<uint64_t>(ne.length, read_len);
+ //trim to required size
+ bufferlist requested_read;
+ requested_read.substr_of(candidate, fixed.length(), l);
+ bl->append(requested_read);
+ dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
+ log_fnode.append_extent(ne);
+ log_fnode.recalc_allocated();
+ log_reader->buf.pos += l;
+ return l;
+ }
+ //save overlay for next search
+ memcpy(search_b, chunk_e - overlay_size, overlay_size);
+ pos += chunk_len;
+ len -= chunk_len;
+ }
+ }
+ }
+ return 0;
+}
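+
+/*
+ * Worked example of the recovery above (all numbers hypothetical): if the
+ * known log fnode ends with extent 1:0x2000~0x10000 and replay stops short,
+ * the last 32 bytes of the encoded extent list are searched for in every
+ * disk region not owned by a file.  A hit that is followed by a decodable,
+ * 4 KiB-aligned extent such as 1:0x80000~0x100000 is read in full, appended
+ * to the bytes already recovered from the log, and accepted only if the
+ * combined buffer decodes as a valid bluefs_transaction_t.
+ */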
+
+size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
+{
+ size_t total = 0;
+ auto iterated_allocation = [&](size_t off, size_t len) {
+ //only count in size that is alloc_size aligned
+ size_t dist_to_alignment;
+ size_t offset_in_block = off & (alloc_size - 1);
+ if (offset_in_block == 0)
+ dist_to_alignment = 0;
+ else
+ dist_to_alignment = alloc_size - offset_in_block;
+ if (dist_to_alignment >= len)
+ return;
+ len -= dist_to_alignment;
+ total += p2align(len, alloc_size);
+ };
+ if (alloc[dev]) {
+ alloc[dev]->foreach(iterated_allocation);
+ }
+ return total;
+}
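+
+/*
+ * Example of the accounting above (hypothetical values): with
+ * alloc_size = 0x10000 (64 KiB), a free chunk at off = 0x18000 with
+ * len = 0x30000 yields
+ *
+ *   dist_to_alignment = 0x10000 - 0x8000 = 0x8000
+ *   p2align(0x30000 - 0x8000, 0x10000) = 0x20000
+ *
+ * i.e. only the two fully aligned 64 KiB units are counted as available.
+ */
+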
+// ===============================================
+// OriginalVolumeSelector
+
+void* OriginalVolumeSelector::get_hint_for_log() const {
+ return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
+}
+void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+ uint8_t res = BlueFS::BDEV_DB;
+ if (dirname.length() > 5) {
+    // the "db.slow" and "db.wal" directory names are hard-coded to
+    // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
+ res = BlueFS::BDEV_SLOW;
+ } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
+ res = BlueFS::BDEV_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
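+
+/*
+ * For example, assuming non-zero wal_total and slow_total:
+ *
+ *   select_prefer_bdev(get_hint_by_dir("db"))      == BlueFS::BDEV_DB
+ *   select_prefer_bdev(get_hint_by_dir("db.wal"))  == BlueFS::BDEV_WAL
+ *   select_prefer_bdev(get_hint_by_dir("db.slow")) == BlueFS::BDEV_SLOW
+ *
+ * If the matching device is absent (zero total) the hint falls back to
+ * BDEV_DB.
+ */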
+
+uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
+{
+ return (uint8_t)(reinterpret_cast<uint64_t>(hint));
+}
+
+void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ res.emplace_back(base, db_total);
+ res.emplace_back(base + ".slow",
+		   slow_total ? slow_total : db_total); // use a fake non-zero value if needed
+                                                        // to keep RocksDB from complaining
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "OriginalVolumeSelector: "
+
+void OriginalVolumeSelector::dump(ostream& sout) {
+ sout<< "wal_total:" << wal_total
+ << ", db_total:" << db_total
+ << ", slow_total:" << slow_total
+ << std::endl;
+}
+
+// ===============================================
+// FitToFastVolumeSelector
+
+void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
+ res.emplace_back(base, 1); // size of the last db_path has no effect
+}
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
new file mode 100644
index 000000000..988c7e37f
--- /dev/null
+++ b/src/os/bluestore/BlueFS.h
@@ -0,0 +1,707 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
+#define CEPH_OS_BLUESTORE_BLUEFS_H
+
+#include <atomic>
+#include <mutex>
+#include <limits>
+
+#include "bluefs_types.h"
+#include "blk/BlockDevice.h"
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "include/common_fwd.h"
+
+#include "boost/intrusive/list.hpp"
+#include "boost/dynamic_bitset.hpp"
+
+class Allocator;
+
+enum {
+ l_bluefs_first = 732600,
+ l_bluefs_db_total_bytes,
+ l_bluefs_db_used_bytes,
+ l_bluefs_wal_total_bytes,
+ l_bluefs_wal_used_bytes,
+ l_bluefs_slow_total_bytes,
+ l_bluefs_slow_used_bytes,
+ l_bluefs_num_files,
+ l_bluefs_log_bytes,
+ l_bluefs_log_compactions,
+ l_bluefs_logged_bytes,
+ l_bluefs_files_written_wal,
+ l_bluefs_files_written_sst,
+ l_bluefs_bytes_written_wal,
+ l_bluefs_bytes_written_sst,
+ l_bluefs_bytes_written_slow,
+ l_bluefs_max_bytes_wal,
+ l_bluefs_max_bytes_db,
+ l_bluefs_max_bytes_slow,
+ l_bluefs_read_random_count,
+ l_bluefs_read_random_bytes,
+ l_bluefs_read_random_disk_count,
+ l_bluefs_read_random_disk_bytes,
+ l_bluefs_read_random_buffer_count,
+ l_bluefs_read_random_buffer_bytes,
+ l_bluefs_read_count,
+ l_bluefs_read_bytes,
+ l_bluefs_read_prefetch_count,
+ l_bluefs_read_prefetch_bytes,
+ l_bluefs_read_zeros_candidate,
+ l_bluefs_read_zeros_errors,
+
+ l_bluefs_last,
+};
+
+class BlueFSVolumeSelector {
+public:
+ typedef std::vector<std::pair<std::string, uint64_t>> paths;
+
+ virtual ~BlueFSVolumeSelector() {
+ }
+ virtual void* get_hint_for_log() const = 0;
+ virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
+
+ virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+ virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+ virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
+ virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
+ virtual uint8_t select_prefer_bdev(void* hint) = 0;
+ virtual void get_paths(const std::string& base, paths& res) const = 0;
+ virtual void dump(std::ostream& sout) = 0;
+};
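+
+/*
+ * A selector only has to translate directory names into device hints and
+ * report sizes back to RocksDB; a minimal (hypothetical) implementation
+ * that always prefers the DB device could look like:
+ *
+ *   struct DbOnlySelector : public BlueFSVolumeSelector {
+ *     void* get_hint_for_log() const override {
+ *       return reinterpret_cast<void*>(BlueFS::BDEV_DB);
+ *     }
+ *     void* get_hint_by_dir(std::string_view) const override {
+ *       return reinterpret_cast<void*>(BlueFS::BDEV_DB);
+ *     }
+ *     void add_usage(void*, const bluefs_fnode_t&) override {}
+ *     void sub_usage(void*, const bluefs_fnode_t&) override {}
+ *     void add_usage(void*, uint64_t) override {}
+ *     void sub_usage(void*, uint64_t) override {}
+ *     uint8_t select_prefer_bdev(void* h) override {
+ *       return (uint8_t)reinterpret_cast<uint64_t>(h);
+ *     }
+ *     void get_paths(const std::string& base, paths& res) const override {
+ *       res.emplace_back(base, 1); // size of the last path has no effect
+ *     }
+ *     void dump(std::ostream&) override {}
+ *   };
+ */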
+
+struct bluefs_shared_alloc_context_t {
+ bool need_init = false;
+ Allocator* a = nullptr;
+
+ std::atomic<uint64_t> bluefs_used = 0;
+
+ void set(Allocator* _a) {
+ a = _a;
+ need_init = true;
+ bluefs_used = 0;
+ }
+ void reset() {
+ a = nullptr;
+ }
+};
+
+class BlueFS {
+public:
+ CephContext* cct;
+ static constexpr unsigned MAX_BDEV = 5;
+ static constexpr unsigned BDEV_WAL = 0;
+ static constexpr unsigned BDEV_DB = 1;
+ static constexpr unsigned BDEV_SLOW = 2;
+ static constexpr unsigned BDEV_NEWWAL = 3;
+ static constexpr unsigned BDEV_NEWDB = 4;
+
+ enum {
+ WRITER_UNKNOWN,
+ WRITER_WAL,
+ WRITER_SST,
+ };
+
+ struct File : public RefCountedObject {
+ MEMPOOL_CLASS_HELPERS();
+
+ bluefs_fnode_t fnode;
+ int refs;
+ uint64_t dirty_seq;
+ bool locked;
+ bool deleted;
+ bool is_dirty;
+ boost::intrusive::list_member_hook<> dirty_item;
+
+ std::atomic_int num_readers, num_writers;
+ std::atomic_int num_reading;
+
+ void* vselector_hint = nullptr;
+
+ private:
+ FRIEND_MAKE_REF(File);
+ File()
+ :
+ refs(0),
+ dirty_seq(0),
+ locked(false),
+ deleted(false),
+ is_dirty(false),
+ num_readers(0),
+ num_writers(0),
+ num_reading(0),
+ vselector_hint(nullptr)
+ {}
+ ~File() override {
+ ceph_assert(num_readers.load() == 0);
+ ceph_assert(num_writers.load() == 0);
+ ceph_assert(num_reading.load() == 0);
+ ceph_assert(!locked);
+ }
+ };
+ using FileRef = ceph::ref_t<File>;
+
+ typedef boost::intrusive::list<
+ File,
+ boost::intrusive::member_hook<
+ File,
+ boost::intrusive::list_member_hook<>,
+ &File::dirty_item> > dirty_file_list_t;
+
+ struct Dir : public RefCountedObject {
+ MEMPOOL_CLASS_HELPERS();
+
+ mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;
+
+ private:
+ FRIEND_MAKE_REF(Dir);
+ Dir() = default;
+ };
+ using DirRef = ceph::ref_t<Dir>;
+
+ struct FileWriter {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ uint64_t pos = 0; ///< start offset for buffer
+ private:
+ ceph::buffer::list buffer; ///< new data to write (at end of file)
+ ceph::buffer::list tail_block; ///< existing partial block at end of file, if any
+ public:
+ unsigned get_buffer_length() const {
+ return buffer.length();
+ }
+ ceph::bufferlist flush_buffer(
+ CephContext* cct,
+ const bool partial,
+ const unsigned length,
+ const bluefs_super_t& super);
+ ceph::buffer::list::page_aligned_appender buffer_appender; //< for const char* only
+ public:
+ int writer_type = 0; ///< WRITER_*
+ int write_hint = WRITE_LIFE_NOT_SET;
+
+ ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
+ std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
+ std::array<bool, MAX_BDEV> dirty_devs;
+
+ FileWriter(FileRef f)
+ : file(std::move(f)),
+ buffer_appender(buffer.get_page_aligned_appender(
+ g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
+ ++file->num_writers;
+ iocv.fill(nullptr);
+ dirty_devs.fill(false);
+ if (file->fnode.ino == 1) {
+ write_hint = WRITE_LIFE_MEDIUM;
+ }
+ }
+ // NOTE: caller must call BlueFS::close_writer()
+ ~FileWriter() {
+ --file->num_writers;
+ }
+
+ // note: BlueRocksEnv uses this append exclusively, so it's safe
+    // to use buffer_appender exclusively here (e.g., its notion of
+ // offset will remain accurate).
+ void append(const char *buf, size_t len) {
+ uint64_t l0 = get_buffer_length();
+ ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
+ buffer_appender.append(buf, len);
+ }
+
+ // note: used internally only, for ino 1 or 0.
+ void append(ceph::buffer::list& bl) {
+ uint64_t l0 = get_buffer_length();
+ ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
+ buffer.claim_append(bl);
+ }
+
+ void append_zero(size_t len) {
+ uint64_t l0 = get_buffer_length();
+ ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
+ buffer_appender.append_zero(len);
+ }
+
+ uint64_t get_effective_write_pos() {
+ return pos + buffer.length();
+ }
+ };
+
+ struct FileReaderBuffer {
+ MEMPOOL_CLASS_HELPERS();
+
+ uint64_t bl_off = 0; ///< prefetch buffer logical offset
+ ceph::buffer::list bl; ///< prefetch buffer
+ uint64_t pos = 0; ///< current logical offset
+ uint64_t max_prefetch; ///< max allowed prefetch
+
+ explicit FileReaderBuffer(uint64_t mpf)
+ : max_prefetch(mpf) {}
+
+ uint64_t get_buf_end() const {
+ return bl_off + bl.length();
+ }
+ uint64_t get_buf_remaining(uint64_t p) const {
+ if (p >= bl_off && p < bl_off + bl.length())
+ return bl_off + bl.length() - p;
+ return 0;
+ }
+
+ void skip(size_t n) {
+ pos += n;
+ }
+
+    // For the sake of simplicity, we invalidate the entire prefetch
+    // buffer rather than just the provided extent
+ void invalidate_cache(uint64_t offset, uint64_t length) {
+ if (offset >= bl_off && offset < get_buf_end()) {
+ bl.clear();
+ bl_off = 0;
+ }
+ }
+ };
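+
+  /*
+   * Example: with bl_off = 4096 and bl.length() = 8192 the prefetch buffer
+   * covers logical range [4096, 12288), so get_buf_remaining(10000) == 2288
+   * and a read at or beyond 12288 has to go back to the device.
+   */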
+
+ struct FileReader {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ FileReaderBuffer buf;
+ bool random;
+ bool ignore_eof; ///< used when reading our log file
+
+ ceph::shared_mutex lock {
+ ceph::make_shared_mutex(std::string(), false, false, false)
+ };
+
+
+ FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
+ : file(f),
+ buf(mpf),
+ random(rand),
+ ignore_eof(ie) {
+ ++file->num_readers;
+ }
+ ~FileReader() {
+ --file->num_readers;
+ }
+ };
+
+ struct FileLock {
+ MEMPOOL_CLASS_HELPERS();
+
+ FileRef file;
+ explicit FileLock(FileRef f) : file(std::move(f)) {}
+ };
+
+private:
+ ceph::mutex lock = ceph::make_mutex("BlueFS::lock");
+
+ PerfCounters *logger = nullptr;
+
+ uint64_t max_bytes[MAX_BDEV] = {0};
+ uint64_t max_bytes_pcounters[MAX_BDEV] = {
+ l_bluefs_max_bytes_wal,
+ l_bluefs_max_bytes_db,
+ l_bluefs_max_bytes_slow,
+ };
+
+ // cache
+ mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir
+ mempool::bluefs::unordered_map<uint64_t, FileRef> file_map; ///< ino -> File
+
+  // map of dirty files; files with the same dirty_seq are grouped into a list.
+ std::map<uint64_t, dirty_file_list_t> dirty_files;
+
+ bluefs_super_t super; ///< latest superblock (as last written)
+ uint64_t ino_last = 0; ///< last assigned ino (this one is in use)
+ uint64_t log_seq = 0; ///< last used log seq (by current pending log_t)
+ uint64_t log_seq_stable = 0; ///< last stable/synced log seq
+ FileWriter *log_writer = 0; ///< writer for the log
+ bluefs_transaction_t log_t; ///< pending, unwritten log transaction
+ bool log_flushing = false; ///< true while flushing the log
+ ceph::condition_variable log_cond;
+
+ uint64_t new_log_jump_to = 0;
+ uint64_t old_log_jump_to = 0;
+ FileRef new_log = nullptr;
+ FileWriter *new_log_writer = nullptr;
+
+ /*
+ * There are up to 3 block devices:
+ *
+ * BDEV_DB db/ - the primary db device
+ * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
+ * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
+ */
+ std::vector<BlockDevice*> bdev; ///< block devices we can use
+ std::vector<IOContext*> ioc; ///< IOContexts for bdevs
+ std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
+ std::vector<Allocator*> alloc; ///< allocators for bdevs
+ std::vector<uint64_t> alloc_size; ///< alloc size for each device
+ std::vector<interval_set<uint64_t>> pending_release; ///< extents to release
+ //std::vector<interval_set<uint64_t>> block_unused_too_granular;
+
+ BlockDevice::aio_callback_t discard_cb[3]; //discard callbacks for each dev
+
+ std::unique_ptr<BlueFSVolumeSelector> vselector;
+
+ bluefs_shared_alloc_context_t* shared_alloc = nullptr;
+ unsigned shared_alloc_id = unsigned(-1);
+ inline bool is_shared_alloc(unsigned id) const {
+ return id == shared_alloc_id;
+ }
+
+ class SocketHook;
+ SocketHook* asok_hook = nullptr;
+  // used to inject zeros into reads (debug / verify)
+ std::atomic<uint64_t> inject_read_zeros{0};
+
+ void _init_logger();
+ void _shutdown_logger();
+ void _update_logger_stats();
+
+ void _init_alloc();
+ void _stop_alloc();
+
+ void _pad_bl(ceph::buffer::list& bl); ///< pad ceph::buffer::list to block size w/ zeros
+
+ uint64_t _get_used(unsigned id) const;
+ uint64_t _get_total(unsigned id) const;
+
+
+ FileRef _get_file(uint64_t ino);
+ void _drop_link(FileRef f);
+
+ unsigned _get_slow_device_id() {
+ return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
+ }
+ const char* get_device_name(unsigned id);
+ int _allocate(uint8_t bdev, uint64_t len,
+ bluefs_fnode_t* node);
+ int _allocate_without_fallback(uint8_t id, uint64_t len,
+ PExtentVector* extents);
+
+ /* signal replay log to include h->file in nearest log flush */
+ int _signal_dirty_to_log(FileWriter *h);
+ int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
+ int _flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l);
+ int _flush(FileWriter *h, bool force, bool *flushed = nullptr);
+ int _fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l);
+
+#ifdef HAVE_LIBAIO
+ void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
+ void wait_for_aio(FileWriter *h); // safe to call without a lock
+#endif
+
+ int _flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
+ uint64_t want_seq = 0,
+ uint64_t jump_to = 0);
+ uint64_t _estimate_log_size();
+ bool _should_compact_log();
+
+ enum {
+ REMOVE_DB = 1,
+ REMOVE_WAL = 2,
+ RENAME_SLOW2DB = 4,
+ RENAME_DB2SLOW = 8,
+ };
+ void _compact_log_dump_metadata(bluefs_transaction_t *t,
+ int flags);
+ void _compact_log_sync();
+ void _compact_log_async(std::unique_lock<ceph::mutex>& l);
+
+ void _rewrite_log_and_layout_sync(bool allocate_with_fallback,
+ int super_dev,
+ int log_dev,
+ int new_log_dev,
+ int flags,
+ std::optional<bluefs_layout_t> layout);
+
+ //void _aio_finish(void *priv);
+
+ void _flush_bdev_safely(FileWriter *h);
+ void flush_bdev(); // this is safe to call without a lock
+ void flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs); // this is safe to call without a lock
+
+ int _preallocate(FileRef f, uint64_t off, uint64_t len);
+ int _truncate(FileWriter *h, uint64_t off);
+
+ int64_t _read(
+ FileReader *h, ///< [in] read from here
+ uint64_t offset, ///< [in] offset
+ size_t len, ///< [in] this many bytes
+ ceph::buffer::list *outbl, ///< [out] optional: reference the result here
+ char *out); ///< [out] optional: or copy it here
+ int64_t _read_random(
+ FileReader *h, ///< [in] read from here
+ uint64_t offset, ///< [in] offset
+ uint64_t len, ///< [in] this many bytes
+ char *out); ///< [out] optional: or copy it here
+
+ void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length);
+
+ int _open_super();
+ int _write_super(int dev);
+ int _check_allocations(const bluefs_fnode_t& fnode,
+ boost::dynamic_bitset<uint64_t>* used_blocks,
+ bool is_alloc, //true when allocating, false when deallocating
+ const char* op_name);
+ int _verify_alloc_granularity(
+ __u8 id, uint64_t offset, uint64_t length,
+ const char *op);
+ int _replay(bool noop, bool to_stdout = false); ///< replay journal
+
+ FileWriter *_create_writer(FileRef f);
+ void _close_writer(FileWriter *h);
+
+ // always put the super in the second 4k block. FIXME should this be
+ // block size independent?
+ unsigned get_super_offset() {
+ return 4096;
+ }
+ unsigned get_super_length() {
+ return 4096;
+ }
+
+public:
+ BlueFS(CephContext* cct);
+ ~BlueFS();
+
+ // the super is always stored on bdev 0
+ int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
+ int mount();
+ int maybe_verify_layout(const bluefs_layout_t& layout) const;
+ void umount(bool avoid_compact = false);
+ int prepare_new_device(int id, const bluefs_layout_t& layout);
+
+ int log_dump();
+
+ void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
+ void get_devices(std::set<std::string> *ls);
+ uint64_t get_alloc_size(int id) {
+ return alloc_size[id];
+ }
+ int fsck();
+
+ int device_migrate_to_new(
+ CephContext *cct,
+ const std::set<int>& devs_source,
+ int dev_target,
+ const bluefs_layout_t& layout);
+ int device_migrate_to_existing(
+ CephContext *cct,
+ const std::set<int>& devs_source,
+ int dev_target,
+ const bluefs_layout_t& layout);
+
+ uint64_t get_used();
+ uint64_t get_total(unsigned id);
+ uint64_t get_free(unsigned id);
+ uint64_t get_used(unsigned id);
+ void dump_perf_counters(ceph::Formatter *f);
+
+ void dump_block_extents(std::ostream& out);
+
+ /// get current extents that we own for given block device
+ int get_block_extents(unsigned id, interval_set<uint64_t> *extents);
+
+ int open_for_write(
+ std::string_view dir,
+ std::string_view file,
+ FileWriter **h,
+ bool overwrite);
+
+ int open_for_read(
+ std::string_view dir,
+ std::string_view file,
+ FileReader **h,
+ bool random = false);
+
+ void close_writer(FileWriter *h) {
+ std::lock_guard l(lock);
+ _close_writer(h);
+ }
+
+ int rename(std::string_view old_dir, std::string_view old_file,
+ std::string_view new_dir, std::string_view new_file);
+
+ int readdir(std::string_view dirname, std::vector<std::string> *ls);
+
+ int unlink(std::string_view dirname, std::string_view filename);
+ int mkdir(std::string_view dirname);
+ int rmdir(std::string_view dirname);
+ bool wal_is_rotational();
+
+ bool dir_exists(std::string_view dirname);
+ int stat(std::string_view dirname, std::string_view filename,
+ uint64_t *size, utime_t *mtime);
+
+ int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
+ int unlock_file(FileLock *l);
+
+ void compact_log();
+
+ /// sync any uncommitted state to disk
+ void sync_metadata(bool avoid_compact);
+ /// test and compact log, if necessary
+ void _maybe_compact_log(std::unique_lock<ceph::mutex>& l);
+
+ void set_volume_selector(BlueFSVolumeSelector* s) {
+ vselector.reset(s);
+ }
+ void dump_volume_selector(std::ostream& sout) {
+ vselector->dump(sout);
+ }
+ void get_vselector_paths(const std::string& base,
+ BlueFSVolumeSelector::paths& res) const {
+ return vselector->get_paths(base, res);
+ }
+
+ int add_block_device(unsigned bdev, const std::string& path, bool trim,
+ uint64_t reserved,
+ bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
+ bool bdev_support_label(unsigned id);
+ uint64_t get_block_device_size(unsigned bdev) const;
+
+ // handler for discard event
+ void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
+
+ void flush(FileWriter *h, bool force = false) {
+ std::unique_lock l(lock);
+ int r = _flush(h, force, l);
+ ceph_assert(r == 0);
+ }
+
+ void append_try_flush(FileWriter *h, const char* buf, size_t len) {
+ size_t max_size = 1ull << 30; // cap to 1GB
+ while (len > 0) {
+ bool need_flush = true;
+ auto l0 = h->get_buffer_length();
+ if (l0 < max_size) {
+ size_t l = std::min(len, max_size - l0);
+ h->append(buf, l);
+ buf += l;
+ len -= l;
+ need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
+ }
+ if (need_flush) {
+ flush(h, true);
+ // make sure we've made any progress with flush hence the
+ // loop doesn't iterate forever
+ ceph_assert(h->get_buffer_length() < max_size);
+ }
+ }
+ }
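+  /*
+   * Example of the chunking above (sizes hypothetical): appending 2.5 GiB to
+   * an empty writer proceeds as 1 GiB append + flush, 1 GiB append + flush,
+   * then a final 0.5 GiB append that is flushed immediately only if the
+   * buffer has reached bluefs_min_flush_size.
+   */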
+ void flush_range(FileWriter *h, uint64_t offset, uint64_t length) {
+ std::lock_guard l(lock);
+ _flush_range(h, offset, length);
+ }
+ int fsync(FileWriter *h) {
+ std::unique_lock l(lock);
+ int r = _fsync(h, l);
+ _maybe_compact_log(l);
+ return r;
+ }
+ int64_t read(FileReader *h, uint64_t offset, size_t len,
+ ceph::buffer::list *outbl, char *out) {
+ // no need to hold the global lock here; we only touch h and
+ // h->file, and read vs write or delete is already protected (via
+ // atomics and asserts).
+ return _read(h, offset, len, outbl, out);
+ }
+ int64_t read_random(FileReader *h, uint64_t offset, size_t len,
+ char *out) {
+ // no need to hold the global lock here; we only touch h and
+ // h->file, and read vs write or delete is already protected (via
+ // atomics and asserts).
+ return _read_random(h, offset, len, out);
+ }
+ void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) {
+ std::lock_guard l(lock);
+ _invalidate_cache(f, offset, len);
+ }
+ int preallocate(FileRef f, uint64_t offset, uint64_t len) {
+ std::lock_guard l(lock);
+ return _preallocate(f, offset, len);
+ }
+ int truncate(FileWriter *h, uint64_t offset) {
+ std::lock_guard l(lock);
+ return _truncate(h, offset);
+ }
+ int do_replay_recovery_read(FileReader *log,
+ size_t log_pos,
+ size_t read_offset,
+ size_t read_len,
+ bufferlist* bl);
+
+ size_t probe_alloc_avail(int dev, uint64_t alloc_size);
+
+ /// test purpose methods
+ const PerfCounters* get_perf_counters() const {
+ return logger;
+ }
+ uint64_t debug_get_dirty_seq(FileWriter *h);
+ bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);
+
+private:
+ // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
+ // They are used for checking if read values are all 0, and reread if so.
+ int read(uint8_t ndev, uint64_t off, uint64_t len,
+ ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
+ int read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);
+};
+
+class OriginalVolumeSelector : public BlueFSVolumeSelector {
+ uint64_t wal_total;
+ uint64_t db_total;
+ uint64_t slow_total;
+
+public:
+ OriginalVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total)
+ : wal_total(_wal_total), db_total(_db_total), slow_total(_slow_total) {}
+
+ void* get_hint_for_log() const override;
+ void* get_hint_by_dir(std::string_view dirname) const override;
+
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ // do nothing
+ return;
+ }
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ // do nothing
+ return;
+ }
+ void add_usage(void* hint, uint64_t fsize) override {
+ // do nothing
+ return;
+ }
+ void sub_usage(void* hint, uint64_t fsize) override {
+ // do nothing
+ return;
+ }
+
+ uint8_t select_prefer_bdev(void* hint) override;
+ void get_paths(const std::string& base, paths& res) const override;
+ void dump(std::ostream& sout) override;
+};
+
+class FitToFastVolumeSelector : public OriginalVolumeSelector {
+public:
+ FitToFastVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total)
+ : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}
+
+ void get_paths(const std::string& base, paths& res) const override;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
new file mode 100644
index 000000000..f8a2b7025
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -0,0 +1,594 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BlueRocksEnv.h"
+#include "BlueFS.h"
+#include "include/stringify.h"
+#include "kv/RocksDBStore.h"
+#include "string.h"
+
+namespace {
+
+rocksdb::Status err_to_status(int r)
+{
+ switch (r) {
+ case 0:
+ return rocksdb::Status::OK();
+ case -ENOENT:
+ return rocksdb::Status::NotFound(rocksdb::Status::kNone);
+ case -EINVAL:
+ return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
+ case -EIO:
+ case -EEXIST:
+ return rocksdb::Status::IOError(rocksdb::Status::kNone);
+ case -ENOLCK:
+ return rocksdb::Status::IOError(strerror(r));
+ default:
+ // FIXME :(
+ ceph_abort_msg("unrecognized error code");
+ return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
+ }
+}
+
+std::pair<std::string_view, std::string_view>
+split(const std::string &fn)
+{
+ size_t slash = fn.rfind('/');
+ assert(slash != fn.npos);
+ size_t file_begin = slash + 1;
+ while (slash && fn[slash - 1] == '/')
+ --slash;
+ return {string_view(fn.data(), slash),
+ string_view(fn.data() + file_begin,
+ fn.size() - file_begin)};
+}
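+
+/*
+ * e.g. split("db.wal/000003.log") yields {"db.wal", "000003.log"}, and
+ * split("db//CURRENT") yields {"db", "CURRENT"}: redundant slashes before
+ * the file name are trimmed from the directory part.
+ */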
+
+}
+
+// A file abstraction for reading sequentially through a file
+class BlueRocksSequentialFile : public rocksdb::SequentialFile {
+ BlueFS *fs;
+ BlueFS::FileReader *h;
+ public:
+ BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+ ~BlueRocksSequentialFile() override {
+ delete h;
+ }
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // May set "*result" to point at data in "scratch[0..n-1]", so
+ // "scratch[0..n-1]" must be live when "*result" is used.
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
+ int64_t r = fs->read(h, h->buf.pos, n, NULL, scratch);
+ ceph_assert(r >= 0);
+ *result = rocksdb::Slice(scratch, r);
+ return rocksdb::Status::OK();
+ }
+
+ // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower than reading the same data, but may be faster.
+ //
+ // If end of file is reached, skipping will stop at the end of the
+ // file, and Skip will return OK.
+ //
+ // REQUIRES: External synchronization
+ rocksdb::Status Skip(uint64_t n) override {
+ h->buf.skip(n);
+ return rocksdb::Status::OK();
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ h->buf.invalidate_cache(offset, length);
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
+ BlueFS *fs;
+ BlueFS::FileReader *h;
+ public:
+ BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+ ~BlueRocksRandomAccessFile() override {
+ delete h;
+ }
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). May set "*result" to point at data in
+ // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+ // "*result" is used. If an error was encountered, returns a non-OK
+ // status.
+ //
+ // Safe for concurrent use by multiple threads.
+ rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
+ char* scratch) const override {
+ int64_t r = fs->read_random(h, offset, n, scratch);
+ ceph_assert(r >= 0);
+ *result = rocksdb::Slice(scratch, r);
+ return rocksdb::Status::OK();
+ }
+
+  // Tries to get a unique ID for this file that will be the same each time
+ // the file is opened (and will stay the same while the file is open).
+ // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+ // ID can be created this function returns the length of the ID and places it
+ // in "id"; otherwise, this function returns 0, in which case "id"
+ // may not have been modified.
+ //
+ // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to each other by adding arbitrary bytes to one of
+ // them. That is, no unique ID is the prefix of another.
+ //
+ // This function guarantees that the returned ID will not be interpretable as
+ // a single varint.
+ //
+ // Note: these IDs are only valid for the duration of the process.
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return snprintf(id, max_size, "%016llx",
+ (unsigned long long)h->file->fnode.ino);
+ };
+
+ // Readahead the file starting from offset by n bytes for caching.
+ rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
+ fs->read(h, offset, n, nullptr, nullptr);
+ return rocksdb::Status::OK();
+ }
+
+ //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+ void Hint(AccessPattern pattern) override {
+ if (pattern == RANDOM)
+ h->buf.max_prefetch = 4096;
+ else if (pattern == SEQUENTIAL)
+ h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
+ }
+
+ bool use_direct_io() const override {
+ return !fs->cct->_conf->bluefs_buffered_io;
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ h->buf.invalidate_cache(offset, length);
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+};
+
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class BlueRocksWritableFile : public rocksdb::WritableFile {
+ BlueFS *fs;
+ BlueFS::FileWriter *h;
+ public:
+ BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
+ ~BlueRocksWritableFile() override {
+ fs->close_writer(h);
+ }
+
+ // Indicates if the class makes use of unbuffered I/O
+ /*bool UseOSBuffer() const {
+ return true;
+ }*/
+
+ // This is needed when you want to allocate
+ // AlignedBuffer for use with file I/O classes
+ // Used for unbuffered file I/O when UseOSBuffer() returns false
+ /*size_t GetRequiredBufferAlignment() const {
+ return c_DefaultPageSize;
+ }*/
+
+ rocksdb::Status Append(const rocksdb::Slice& data) override {
+ fs->append_try_flush(h, data.data(), data.size());
+ return rocksdb::Status::OK();
+ }
+
+  // Positioned write for unbuffered access; by default this forwards
+  // to a simple append, as most of the tests are buffered by default
+ rocksdb::Status PositionedAppend(
+ const rocksdb::Slice& /* data */,
+ uint64_t /* offset */) override {
+ return rocksdb::Status::NotSupported();
+ }
+
+ // Truncate is necessary to trim the file to the correct size
+ // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+ // with other writes to follow.
+ rocksdb::Status Truncate(uint64_t size) override {
+ // we mirror the posix env, which does nothing here; instead, it
+ // truncates to the final size on close. whatever!
+ return rocksdb::Status::OK();
+ //int r = fs->truncate(h, size);
+ // return err_to_status(r);
+ }
+
+ rocksdb::Status Close() override {
+ fs->fsync(h);
+
+ // mimic posix env, here. shrug.
+ size_t block_size;
+ size_t last_allocated_block;
+ GetPreallocationStatus(&block_size, &last_allocated_block);
+ if (last_allocated_block > 0) {
+ int r = fs->truncate(h, h->pos);
+ if (r < 0)
+ return err_to_status(r);
+ }
+
+ return rocksdb::Status::OK();
+ }
+
+ rocksdb::Status Flush() override {
+ fs->flush(h);
+ return rocksdb::Status::OK();
+ }
+
+ rocksdb::Status Sync() override { // sync data
+ fs->fsync(h);
+ return rocksdb::Status::OK();
+ }
+
+ // true if Sync() and Fsync() are safe to call concurrently with Append()
+ // and Flush().
+ bool IsSyncThreadSafe() const override {
+ return true;
+ }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct IO.
+ bool UseDirectIO() const {
+ return false;
+ }
+
+ void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
+ h->write_hint = (const int)hint;
+ }
+
+ /*
+ * Get the size of valid data in the file.
+ */
+ uint64_t GetFileSize() override {
+    return h->file->fnode.size + h->get_buffer_length();
+ }
+
+ // For documentation, refer to RandomAccessFile::GetUniqueId()
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return snprintf(id, max_size, "%016llx",
+ (unsigned long long)h->file->fnode.ino);
+ }
+
+ // Remove any kind of caching of data from the offset to offset+length
+ // of this file. If the length is 0, then it refers to the end of file.
+ // If the system is not caching the file contents, then this is a noop.
+ // This call has no effect on dirty pages in the cache.
+ rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+ fs->fsync(h);
+ fs->invalidate_cache(h->file, offset, length);
+ return rocksdb::Status::OK();
+ }
+
+ using rocksdb::WritableFile::RangeSync;
+ // Sync a file range with disk.
+ // offset is the starting byte of the file range to be synchronized.
+ // nbytes specifies the length of the range to be synchronized.
+ // This asks the OS to initiate flushing the cached data to disk,
+ // without waiting for completion.
+ // Default implementation does nothing.
+ rocksdb::Status RangeSync(off_t offset, off_t nbytes) {
+ // round down to page boundaries
+ int partial = offset & 4095;
+ offset -= partial;
+ nbytes += partial;
+ nbytes &= ~4095;
+ if (nbytes)
+ fs->flush_range(h, offset, nbytes);
+ return rocksdb::Status::OK();
+ }
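+
+  /*
+   * Rounding example: RangeSync(5000, 9000) computes partial = 5000 & 4095
+   * = 904, so the range actually flushed is offset 4096 with
+   * nbytes = (9000 + 904) & ~4095 = 8192.
+   */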
+
+ protected:
+ using rocksdb::WritableFile::Allocate;
+ /*
+ * Pre-allocate space for a file.
+ */
+ rocksdb::Status Allocate(off_t offset, off_t len) {
+ int r = fs->preallocate(h->file, offset, len);
+ return err_to_status(r);
+ }
+};
+
+
+// Directory object represents collection of files and implements
+// filesystem operations that can be executed on directories.
+class BlueRocksDirectory : public rocksdb::Directory {
+ BlueFS *fs;
+ public:
+ explicit BlueRocksDirectory(BlueFS *f) : fs(f) {}
+
+ // Fsync directory. Can be called concurrently from multiple threads.
+ rocksdb::Status Fsync() override {
+ // it is sufficient to flush the log.
+ fs->sync_metadata(false);
+ return rocksdb::Status::OK();
+ }
+};
+
+// Identifies a locked file.
+class BlueRocksFileLock : public rocksdb::FileLock {
+ public:
+ BlueFS *fs;
+ BlueFS::FileLock *lock;
+ BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
+ ~BlueRocksFileLock() override {
+ }
+};
+
+
+// --------------------
+// --- BlueRocksEnv ---
+// --------------------
+
+BlueRocksEnv::BlueRocksEnv(BlueFS *f)
+ : EnvWrapper(Env::Default()), // forward most of it to POSIX
+ fs(f)
+{
+
+}
+
+rocksdb::Status BlueRocksEnv::NewSequentialFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::SequentialFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ if (fname[0] == '/')
+ return target()->NewSequentialFile(fname, result, options);
+ auto [dir, file] = split(fname);
+ BlueFS::FileReader *h;
+ int r = fs->open_for_read(dir, file, &h, false);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksSequentialFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::RandomAccessFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ auto [dir, file] = split(fname);
+ BlueFS::FileReader *h;
+ int r = fs->open_for_read(dir, file, &h, true);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksRandomAccessFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewWritableFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ auto [dir, file] = split(fname);
+ BlueFS::FileWriter *h;
+ int r = fs->open_for_write(dir, file, &h, false);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksWritableFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::ReuseWritableFile(
+ const std::string& new_fname,
+ const std::string& old_fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options)
+{
+ auto [old_dir, old_file] = split(old_fname);
+ auto [new_dir, new_file] = split(new_fname);
+
+ int r = fs->rename(old_dir, old_file, new_dir, new_file);
+ if (r < 0)
+ return err_to_status(r);
+
+ BlueFS::FileWriter *h;
+ r = fs->open_for_write(new_dir, new_file, &h, true);
+ if (r < 0)
+ return err_to_status(r);
+ result->reset(new BlueRocksWritableFile(fs, h));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewDirectory(
+ const std::string& name,
+ std::unique_ptr<rocksdb::Directory>* result)
+{
+ if (!fs->dir_exists(name))
+ return rocksdb::Status::NotFound(name, strerror(ENOENT));
+ result->reset(new BlueRocksDirectory(fs));
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
+{
+ if (fname[0] == '/')
+ return target()->FileExists(fname);
+ auto [dir, file] = split(fname);
+ if (fs->stat(dir, file, NULL, NULL) == 0)
+ return rocksdb::Status::OK();
+ return err_to_status(-ENOENT);
+}
+
+rocksdb::Status BlueRocksEnv::GetChildren(
+ const std::string& dir,
+ std::vector<std::string>* result)
+{
+ result->clear();
+ int r = fs->readdir(dir, result);
+ if (r < 0)
+ return rocksdb::Status::NotFound(dir, strerror(ENOENT));// return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
+{
+ auto [dir, file] = split(fname);
+ int r = fs->unlink(dir, file);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
+{
+ int r = fs->mkdir(dirname);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
+{
+ int r = fs->mkdir(dirname);
+ if (r < 0 && r != -EEXIST)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
+{
+ int r = fs->rmdir(dirname);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetFileSize(
+ const std::string& fname,
+ uint64_t* file_size)
+{
+ auto [dir, file] = split(fname);
+ int r = fs->stat(dir, file, file_size, NULL);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime)
+{
+ auto [dir, file] = split(fname);
+ utime_t mtime;
+ int r = fs->stat(dir, file, NULL, &mtime);
+ if (r < 0)
+ return err_to_status(r);
+ *file_mtime = mtime.sec();
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::RenameFile(
+ const std::string& src,
+ const std::string& target)
+{
+ auto [old_dir, old_file] = split(src);
+ auto [new_dir, new_file] = split(target);
+
+ int r = fs->rename(old_dir, old_file, new_dir, new_file);
+ if (r < 0)
+ return err_to_status(r);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::LinkFile(
+ const std::string& src,
+ const std::string& target)
+{
+ ceph_abort();
+}
+
+rocksdb::Status BlueRocksEnv::AreFilesSame(
+ const std::string& first,
+ const std::string& second, bool* res)
+{
+ for (auto& path : {first, second}) {
+ if (fs->dir_exists(path)) {
+ continue;
+ }
+ auto [dir, file] = split(path);
+ int r = fs->stat(dir, file, nullptr, nullptr);
+ if (!r) {
+ continue;
+ } else if (r == -ENOENT) {
+ return rocksdb::Status::NotFound("AreFilesSame", path);
+ } else {
+ return err_to_status(r);
+ }
+ }
+ *res = (first == second);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::LockFile(
+ const std::string& fname,
+ rocksdb::FileLock** lock)
+{
+ auto [dir, file] = split(fname);
+ BlueFS::FileLock *l = NULL;
+ int r = fs->lock_file(dir, file, &l);
+ if (r < 0)
+ return err_to_status(r);
+ *lock = new BlueRocksFileLock(fs, l);
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
+{
+ BlueRocksFileLock *l = static_cast<BlueRocksFileLock*>(lock);
+ int r = fs->unlock_file(l->lock);
+ if (r < 0)
+ return err_to_status(r);
+ delete lock;
+ lock = nullptr;
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetAbsolutePath(
+ const std::string& db_path,
+ std::string* output_path)
+{
+ // this is a lie...
+ *output_path = "/" + db_path;
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::NewLogger(
+ const std::string& fname,
+ std::shared_ptr<rocksdb::Logger>* result)
+{
+ // ignore the filename :)
+ result->reset(create_rocksdb_ceph_logger());
+ return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
+{
+ static int foo = 0;
+ *path = "temp_" + stringify(++foo);
+ return rocksdb::Status::OK();
+}
diff --git a/src/os/bluestore/BlueRocksEnv.h b/src/os/bluestore/BlueRocksEnv.h
new file mode 100644
index 000000000..62bcddcf6
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+#define CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/env_mirror.h"
+
+#include "include/ceph_assert.h"
+#include "kv/RocksDBStore.h"
+
+class BlueFS;
+
+class BlueRocksEnv : public rocksdb::EnvWrapper {
+public:
+ // Create a brand new sequentially-readable file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure, stores nullptr in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ rocksdb::Status NewSequentialFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::SequentialFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create a brand new random access read-only file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure, stores nullptr in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ rocksdb::Status NewRandomAccessFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::RandomAccessFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure, stores nullptr in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ rocksdb::Status NewWritableFile(
+ const std::string& fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Reuse an existing file by renaming it and opening it as writable.
+ rocksdb::Status ReuseWritableFile(
+ const std::string& fname,
+ const std::string& old_fname,
+ std::unique_ptr<rocksdb::WritableFile>* result,
+ const rocksdb::EnvOptions& options) override;
+
+ // Create an object that represents a directory. Will fail if directory
+ // doesn't exist. If the directory exists, it will open the directory
+ // and create a new Directory object.
+ //
+ // On success, stores a pointer to the new Directory in
+ // *result and returns OK. On failure stores nullptr in *result and
+ // returns non-OK.
+ rocksdb::Status NewDirectory(
+ const std::string& name,
+ std::unique_ptr<rocksdb::Directory>* result) override;
+
+ // Returns OK if the named file exists.
+ // NotFound if the named file does not exist,
+ // the calling process does not have permission to determine
+ // whether this file exists, or if the path is invalid.
+ // IOError if an IO Error was encountered
+ rocksdb::Status FileExists(const std::string& fname) override;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *results are dropped.
+ rocksdb::Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) override;
+
+ // Delete the named file.
+ rocksdb::Status DeleteFile(const std::string& fname) override;
+
+ // Create the specified directory. Returns error if directory exists.
+ rocksdb::Status CreateDir(const std::string& dirname) override;
+
+  // Create directory if missing. Returns OK if it already exists or was
+  // successfully created.
+ rocksdb::Status CreateDirIfMissing(const std::string& dirname) override;
+
+ // Delete the specified directory.
+ rocksdb::Status DeleteDir(const std::string& dirname) override;
+
+ // Store the size of fname in *file_size.
+ rocksdb::Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
+
+ // Store the last modification time of fname in *file_mtime.
+ rocksdb::Status GetFileModificationTime(const std::string& fname,
+ uint64_t* file_mtime) override;
+ // Rename file src to target.
+ rocksdb::Status RenameFile(const std::string& src,
+ const std::string& target) override;
+ // Hard Link file src to target.
+ rocksdb::Status LinkFile(const std::string& src, const std::string& target) override;
+
+ // Tell if two files are identical
+ rocksdb::Status AreFilesSame(const std::string& first,
+ const std::string& second, bool* res) override;
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores nullptr in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ rocksdb::Status LockFile(const std::string& fname, rocksdb::FileLock** lock) override;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ rocksdb::Status UnlockFile(rocksdb::FileLock* lock) override;
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ rocksdb::Status GetTestDirectory(std::string* path) override;
+
+ // Create and return a log file for storing informational messages.
+ rocksdb::Status NewLogger(
+ const std::string& fname,
+ std::shared_ptr<rocksdb::Logger>* result) override;
+
+ // Get full directory name for this db.
+ rocksdb::Status GetAbsolutePath(const std::string& db_path,
+ std::string* output_path) override;
+
+ explicit BlueRocksEnv(BlueFS *f);
+private:
+ BlueFS *fs;
+};
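+
+// Typical wiring (sketch; assumes `bluefs` is an already-mounted BlueFS*):
+//
+//   rocksdb::Options opt;
+//   opt.env = new BlueRocksEnv(bluefs);
+//   // file names passed by RocksDB are then resolved as BlueFS dir/file
+//   // pairs, e.g. "db/CURRENT" -> ("db", "CURRENT").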
+
+#endif
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
new file mode 100644
index 000000000..751d26e1e
--- /dev/null
+++ b/src/os/bluestore/BlueStore.cc
@@ -0,0 +1,16955 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <algorithm>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "BlueStore.h"
+#include "bluestore_common.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/str_map.h"
+#include "include/util.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/PriorityCache.h"
+#include "common/RWLock.h"
+#include "Allocator.h"
+#include "FreelistManager.h"
+#include "BlueFS.h"
+#include "BlueRocksEnv.h"
+#include "auth/Crypto.h"
+#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+#include "common/pretty_binary.h"
+
+#if defined(WITH_LTTNG)
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/bluestore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+using bid_t = decltype(BlueStore::Blob::id);
+
+// bluestore_cache_onode
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+ bluestore_cache_onode);
+
+// bluestore_cache_other
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
+ bluestore_Buffer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
+ bluestore_Extent);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
+ bluestore_Blob);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
+ bluestore_SharedBlob);
+
+// bluestore_txc
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
+ bluestore_txc);
+using std::deque;
+using std::min;
+using std::make_pair;
+using std::numeric_limits;
+using std::pair;
+using std::list;
+using std::map;
+using std::max;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::coarse_mono_clock;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_timespan;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+// kv store prefixes
+const string PREFIX_SUPER = "S"; // field -> value
+const string PREFIX_STAT = "T"; // field -> value(int64 array)
+const string PREFIX_COLL = "C"; // collection name -> cnode_t
+const string PREFIX_OBJ = "O"; // object name -> onode_t
+const string PREFIX_OMAP = "M"; // u64 + keyname -> value
+const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
+const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
+const string PREFIX_PERPG_OMAP = "p"; // u64(pool) + u32(hash) + u64(id) + keyname -> value
+const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t
+const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
+const string PREFIX_ALLOC_BITMAP = "b"; // (see BitmapFreelistManager)
+const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
+const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
+
+const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
+
+// write a label in the first block. always use this size. note that
+// bluefs makes a matching assumption about the location of its
+// superblock (always the second block of the device).
+#define BDEV_LABEL_BLOCK_SIZE 4096
+
+// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
+#define SUPER_RESERVED 8192
+
+#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
+
+
+/*
+ * extent map blob encoding
+ *
+ * we use the low bits of the blobid field to indicate some common scenarios
+ * and spanning vs local ids. See ExtentMap::{encode,decode}_some().
+ */
+#define BLOBID_FLAG_CONTIGUOUS 0x1 // this extent starts at end of previous
+#define BLOBID_FLAG_ZEROOFFSET 0x2 // blob_offset is 0
+#define BLOBID_FLAG_SAMELENGTH 0x4 // length matches previous extent
+#define BLOBID_FLAG_SPANNING 0x8 // has spanning blob id
+#define BLOBID_SHIFT_BITS 4
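+// Illustrative packing only (the authoritative logic lives in
+// ExtentMap::{encode,decode}_some()): a spanning blob with id 5 whose extent
+// starts right at the end of the previous one would be encoded roughly as
+//   blobid = (5 << BLOBID_SHIFT_BITS) | BLOBID_FLAG_SPANNING | BLOBID_FLAG_CONTIGUOUS
+// i.e. the low 4 bits carry the flags and the remaining bits carry the id.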
+
+/*
+ * object name key structure
+ *
+ * encoded u8: shard + 2^7 (so that it sorts properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * escaped string: namespace
+ *
+ * escaped string: key or object name
+ * 1 char: '<', '=', or '>'. if =, then object key == object name, and
+ * we are done. otherwise, the object name follows.
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ * 'o'
+ */
+#define ONODE_KEY_SUFFIX 'o'
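+// A rough sketch of a complete onode key (illustrative, not byte-accurate):
+// for shard NO_SHARD, pool 1, an empty namespace and object name "foo" with
+// no separate key, the encoded key is approximately
+//   <shard+0x80> <pool+2^63> <rev-hash> "!" "foo!" '=' <snap> <gen> 'o'
+// where '!' terminates each escaped string and '=' means key == name.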
+
+/*
+ * extent shard key
+ *
+ * object prefix key
+ * u32
+ * 'x'
+ */
+#define EXTENT_SHARD_KEY_SUFFIX 'x'
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does. We do this by escaping anything <= '#' with # plus
+ * a 2 digit hex string, and anything >= '~' with ~ plus the two hex
+ * digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ * NOTE: There is a bug in this implementation: due to implicit
+ * character type conversion in comparison it may produce unexpected
+ * ordering. Unfortunately fixing the bug would mean invalidating the
+ * keys in existing deployments. Instead we do additional sorting
+ * where it is needed.
+ */
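+// Example (illustrative): the name "a#b" escapes to "a#23b" and is stored
+// with its terminator as "a#23b!"; likewise "x~y" becomes "x~7ey!".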
+template<typename S>
+static void append_escaped(const string &in, S *out)
+{
+ char hexbyte[in.length() * 3 + 1];
+ char* ptr = &hexbyte[0];
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i <= '#') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '#';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
+ *ptr++ = '~';
+ *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
+ *ptr++ = "0123456789abcdef"[*i & 0x0f];
+ } else {
+ *ptr++ = *i;
+ }
+ }
+ *ptr++ = '!';
+ out->append(hexbyte, ptr - &hexbyte[0]);
+}
+
+inline unsigned h2i(char c)
+{
+ if ((c >= '0') && (c <= '9')) {
+ return c - 0x30;
+ } else if ((c >= 'a') && (c <= 'f')) {
+ return c - 'a' + 10;
+ } else if ((c >= 'A') && (c <= 'F')) {
+ return c - 'A' + 10;
+ } else {
+ return 256; // make it always larger than 255
+ }
+}
+
+static int decode_escaped(const char *p, string *out)
+{
+ char buff[256];
+ char* ptr = &buff[0];
+ char* max = &buff[252];
+ const char *orig_p = p;
+ while (*p && *p != '!') {
+ if (*p == '#' || *p == '~') {
+ unsigned hex = 0;
+ p++;
+ hex = h2i(*p++) << 4;
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ hex |= h2i(*p++);
+ if (hex > 255) {
+ return -EINVAL;
+ }
+ *ptr++ = hex;
+ } else {
+ *ptr++ = *p++;
+ }
+ if (ptr > max) {
+ out->append(buff, ptr-buff);
+ ptr = &buff[0];
+ }
+ }
+ if (ptr != buff) {
+ out->append(buff, ptr-buff);
+ }
+ return p - orig_p;
+}
+
+template<typename T>
+static void _key_encode_shard(shard_id_t shard, T *key)
+{
+ key->push_back((char)((uint8_t)shard.id + (uint8_t)0x80));
+}
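+// Illustrative values (assuming shard_id_t::NO_SHARD == -1): shard 0 encodes
+// as 0x80, while NO_SHARD wraps around to 0x7f, so it still sorts before all
+// real shards.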
+
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+ pshard->id = (uint8_t)*key - (uint8_t)0x80;
+ return key + 1;
+}
+
+static void get_coll_range(const coll_t& cid, int bits,
+ ghobject_t *temp_start, ghobject_t *temp_end,
+ ghobject_t *start, ghobject_t *end, bool legacy)
+{
+ spg_t pgid;
+ constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
+  // use different nspaces because we use different schemes when encoding
+  // keys for listing objects
+ const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
+ if (cid.is_pg(&pgid)) {
+ start->shard_id = pgid.shard;
+ *temp_start = *start;
+
+ start->hobj.pool = pgid.pool();
+ temp_start->hobj.pool = -2ll - pgid.pool();
+
+ *end = *start;
+ *temp_end = *temp_start;
+
+ uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
+ start->hobj.set_bitwise_key_u32(reverse_hash);
+ temp_start->hobj.set_bitwise_key_u32(reverse_hash);
+
+ uint64_t end_hash = reverse_hash + (1ull << (32 - bits));
+ if (end_hash > MAX_HASH) {
+ // make sure end hobj is even greater than the maximum possible hobj
+ end->hobj.set_bitwise_key_u32(MAX_HASH);
+ temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
+ end->hobj.nspace = MAX_NSPACE;
+ } else {
+ end->hobj.set_bitwise_key_u32(end_hash);
+ temp_end->hobj.set_bitwise_key_u32(end_hash);
+ }
+ } else {
+ start->shard_id = shard_id_t::NO_SHARD;
+ start->hobj.pool = -1ull;
+
+ *end = *start;
+ start->hobj.set_bitwise_key_u32(0);
+ end->hobj.set_bitwise_key_u32(MAX_HASH);
+ end->hobj.nspace = MAX_NSPACE;
+ // no separate temp section
+ *temp_start = *end;
+ *temp_end = *end;
+ }
+
+ start->generation = 0;
+ end->generation = 0;
+ temp_start->generation = 0;
+ temp_end->generation = 0;
+}
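+// Worked example (illustrative): for pg collection 1.2 split to bits=3,
+// reverse_hash = _reverse_bits(2) = 0x40000000, so the non-temp range covers
+// bitwise hashes [0x40000000, 0x60000000) in pool 1, while the temp range
+// spans the same hash interval in pool -3 (i.e. -2 - pool).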
+
+static void get_shared_blob_key(uint64_t sbid, string *key)
+{
+ key->clear();
+ _key_encode_u64(sbid, key);
+}
+
+static int get_key_shared_blob(const string& key, uint64_t *sbid)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, sbid);
+ return 0;
+}
+
+template<typename S>
+static void _key_encode_prefix(const ghobject_t& oid, S *key)
+{
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+}
+
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
+ p = _key_decode_shard(p, &oid->shard_id);
+
+ uint64_t pool;
+ p = _key_decode_u64(p, &pool);
+ oid->hobj.pool = pool - 0x8000000000000000ull;
+
+ unsigned hash;
+ p = _key_decode_u32(p, &hash);
+
+ oid->hobj.set_bitwise_key_u32(hash);
+
+ return p;
+}
+
+#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
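+// 1 byte shard + 8 bytes pool + 4 bytes bit-reversed hash, matching
+// _key_encode_prefix() above.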
+
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ if (key.length() < ENCODED_KEY_PREFIX_LEN)
+ return -1;
+
+ p = _key_decode_prefix(p, oid);
+
+ if (key.length() == ENCODED_KEY_PREFIX_LEN)
+ return -2;
+
+ r = decode_escaped(p, &oid->hobj.nspace);
+ if (r < 0)
+ return -2;
+ p += r + 1;
+
+ string k;
+ r = decode_escaped(p, &k);
+ if (r < 0)
+ return -3;
+ p += r + 1;
+ if (*p == '=') {
+ // no key
+ ++p;
+ oid->hobj.oid.name = k;
+ } else if (*p == '<' || *p == '>') {
+ // key + name
+ ++p;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -5;
+ p += r + 1;
+ oid->hobj.set_key(k);
+ } else {
+ // malformed
+ return -6;
+ }
+
+ p = _key_decode_u64(p, &oid->hobj.snap.val);
+ p = _key_decode_u64(p, &oid->generation);
+
+ if (*p != ONODE_KEY_SUFFIX) {
+ return -7;
+ }
+ p++;
+ if (*p) {
+    // if we get something other than a null terminator here,
+    // something is wrong.
+ return -8;
+ }
+
+ return 0;
+}
+
+template<typename S>
+static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
+{
+ key->clear();
+
+ size_t max_len = ENCODED_KEY_PREFIX_LEN +
+ (oid.hobj.nspace.length() * 3 + 1) +
+ (oid.hobj.get_key().length() * 3 + 1) +
+ 1 + // for '<', '=', or '>'
+ (oid.hobj.oid.name.length() * 3 + 1) +
+ 8 + 8 + 1;
+ key->reserve(max_len);
+
+ _key_encode_prefix(oid, key);
+
+ append_escaped(oid.hobj.nspace, key);
+
+ if (oid.hobj.get_key().length()) {
+ // is a key... could be < = or >.
+ append_escaped(oid.hobj.get_key(), key);
+ // (ASCII chars < = and > sort in that order, yay)
+ int r = oid.hobj.get_key().compare(oid.hobj.oid.name);
+ if (r) {
+ key->append(r > 0 ? ">" : "<");
+ append_escaped(oid.hobj.oid.name, key);
+ } else {
+ // same as no key
+ key->append("=");
+ }
+ } else {
+ // no key
+ append_escaped(oid.hobj.oid.name, key);
+ key->append("=");
+ }
+
+ _key_encode_u64(oid.hobj.snap, key);
+ _key_encode_u64(oid.generation, key);
+
+ key->push_back(ONODE_KEY_SUFFIX);
+
+ // sanity check
+ if (true) {
+ ghobject_t t;
+ int r = get_key_object(*key, &t);
+ if (r || t != oid) {
+ derr << " r " << r << dendl;
+ derr << "key " << pretty_binary_string(*key) << dendl;
+ derr << "oid " << oid << dendl;
+ derr << " t " << t << dendl;
+ ceph_assert(r == 0 && t == oid);
+ }
+ }
+}
+
+// extent shard keys are the onode key, plus a u32, plus 'x'. the trailing
+// char lets us quickly test whether it is a shard key without decoding any
+// of the prefix bytes.
+template<typename S>
+static void get_extent_shard_key(const S& onode_key, uint32_t offset,
+ string *key)
+{
+ key->clear();
+ key->reserve(onode_key.length() + 4 + 1);
+ key->append(onode_key.c_str(), onode_key.size());
+ _key_encode_u32(offset, key);
+ key->push_back(EXTENT_SHARD_KEY_SUFFIX);
+}
+
+static void rewrite_extent_shard_key(uint32_t offset, string *key)
+{
+ ceph_assert(key->size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key->rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ _key_encode_u32(offset, key->size() - sizeof(uint32_t) - 1, key);
+}
+
+template<typename S>
+static void generate_extent_shard_key_and_apply(
+ const S& onode_key,
+ uint32_t offset,
+ string *key,
+ std::function<void(const string& final_key)> apply)
+{
+ if (key->empty()) { // make full key
+ ceph_assert(!onode_key.empty());
+ get_extent_shard_key(onode_key, offset, key);
+ } else {
+ rewrite_extent_shard_key(offset, key);
+ }
+ apply(*key);
+}
+
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
+{
+ ceph_assert(key.size() > sizeof(uint32_t) + 1);
+ ceph_assert(*key.rbegin() == EXTENT_SHARD_KEY_SUFFIX);
+ int okey_len = key.size() - sizeof(uint32_t) - 1;
+ *onode_key = key.substr(0, okey_len);
+ const char *p = key.data() + okey_len;
+ _key_decode_u32(p, offset);
+ return 0;
+}
+
+static bool is_extent_shard_key(const string& key)
+{
+ return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX;
+}
+
+static void get_deferred_key(uint64_t seq, string *out)
+{
+ _key_encode_u64(seq, out);
+}
+
+static void get_pool_stat_key(int64_t pool_id, string *key)
+{
+ key->clear();
+ _key_encode_u64(pool_id, key);
+}
+
+static int get_key_pool_stat(const string& key, uint64_t* pool_id)
+{
+ const char *p = key.c_str();
+ if (key.length() < sizeof(uint64_t))
+ return -1;
+ _key_decode_u64(p, pool_id);
+ return 0;
+}
+
+
+template <int LogLevelV>
+void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
+{
+ uint64_t pos = 0;
+ for (auto& s : em.shards) {
+ dout(LogLevelV) << __func__ << " shard " << *s.shard_info
+ << (s.loaded ? " (loaded)" : "")
+ << (s.dirty ? " (dirty)" : "")
+ << dendl;
+ }
+ for (auto& e : em.extent_map) {
+ dout(LogLevelV) << __func__ << " " << e << dendl;
+ ceph_assert(e.logical_offset >= pos);
+ pos = e.logical_offset + e.length;
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ if (blob.has_csum()) {
+ vector<uint64_t> v;
+ unsigned n = blob.get_csum_count();
+ for (unsigned i = 0; i < n; ++i)
+ v.push_back(blob.get_csum_item(i));
+ dout(LogLevelV) << __func__ << " csum: " << std::hex << v << std::dec
+ << dendl;
+ }
+ std::lock_guard l(e.blob->shared_blob->get_cache()->lock);
+ for (auto& i : e.blob->shared_blob->bc.buffer_map) {
+ dout(LogLevelV) << __func__ << " 0x" << std::hex << i.first
+ << "~" << i.second->length << std::dec
+ << " " << *i.second << dendl;
+ }
+ }
+}
+
+template <int LogLevelV>
+void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
+{
+ if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
+ return;
+ dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
+ << " nid " << o.onode.nid
+ << " size 0x" << std::hex << o.onode.size
+ << " (" << std::dec << o.onode.size << ")"
+ << " expected_object_size " << o.onode.expected_object_size
+ << " expected_write_size " << o.onode.expected_write_size
+ << " in " << o.onode.extent_map_shards.size() << " shards"
+ << ", " << o.extent_map.spanning_blob_map.size()
+ << " spanning blobs"
+ << dendl;
+ for (auto p = o.onode.attrs.begin();
+ p != o.onode.attrs.end();
+ ++p) {
+ dout(LogLevelV) << __func__ << " attr " << p->first
+ << " len " << p->second.length() << dendl;
+ }
+ _dump_extent_map<LogLevelV>(cct, o.extent_map);
+}
+
+template <int LogLevelV>
+void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
+{
+ dout(LogLevelV) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+}
+
+// Buffer
+
+ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
+{
+ out << "buffer(" << &b << " space " << b.space << " 0x" << std::hex
+ << b.offset << "~" << b.length << std::dec
+ << " " << BlueStore::Buffer::get_state_name(b.state);
+ if (b.flags)
+ out << " " << BlueStore::Buffer::get_flag_name(b.flags);
+ return out << ")";
+}
+
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see a comment for append_escaped)
+ * the KeyValueDB iterator does not lexicographically sort the same
+ * way that ghobject_t does: objects with the same hash may appear in the wrong order.
+ *
+ * This is the iterator wrapper that fixes the key order.
+ */
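+//
+// Rough sketch of the two implementations below: SimpleCollectionListIterator
+// trusts the raw KeyValueDB key order, while SortedCollectionListIterator
+// loads every onode key sharing the same (shard, pool, bitwise hash) prefix
+// into an in-memory std::map<ghobject_t, std::string> chunk and iterates
+// that, so objects with the same hash come back in true ghobject_t order.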
+
+class CollectionListIterator {
+public:
+ CollectionListIterator(const KeyValueDB::Iterator &it)
+ : m_it(it) {
+ }
+ virtual ~CollectionListIterator() {
+ }
+
+ virtual bool valid() const = 0;
+ virtual const ghobject_t &oid() const = 0;
+ virtual void lower_bound(const ghobject_t &oid) = 0;
+ virtual void upper_bound(const ghobject_t &oid) = 0;
+ virtual void next() = 0;
+
+ virtual int cmp(const ghobject_t &oid) const = 0;
+
+ bool is_ge(const ghobject_t &oid) const {
+ return cmp(oid) >= 0;
+ }
+
+ bool is_lt(const ghobject_t &oid) const {
+ return cmp(oid) < 0;
+ }
+
+protected:
+ KeyValueDB::Iterator m_it;
+};
+
+class SimpleCollectionListIterator : public CollectionListIterator {
+public:
+ SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_cct(cct) {
+ }
+
+ bool valid() const override {
+ return m_it->valid();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_oid;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->lower_bound(key);
+ get_oid();
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ m_it->upper_bound(key);
+ get_oid();
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_it->next();
+ get_oid();
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ string key;
+ get_object_key(m_cct, oid, &key);
+
+ return m_it->key().compare(key);
+ }
+
+private:
+ CephContext *m_cct;
+ ghobject_t m_oid;
+
+ void get_oid() {
+ m_oid = ghobject_t();
+ while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+ m_it->next();
+ }
+ if (!valid()) {
+ return;
+ }
+
+ int r = get_key_object(m_it->key(), &m_oid);
+ ceph_assert(r == 0);
+ }
+};
+
+class SortedCollectionListIterator : public CollectionListIterator {
+public:
+ SortedCollectionListIterator(const KeyValueDB::Iterator &it)
+ : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
+ }
+
+ bool valid() const override {
+ return m_chunk_iter != m_chunk.end();
+ }
+
+ const ghobject_t &oid() const override {
+ ceph_assert(valid());
+
+ return m_chunk_iter->first;
+ }
+
+ void lower_bound(const ghobject_t &oid) override {
+ std::string key;
+ _key_encode_prefix(oid, &key);
+
+ m_it->lower_bound(key);
+ m_chunk_iter = m_chunk.end();
+ if (!get_next_chunk()) {
+ return;
+ }
+
+ if (this->oid().shard_id != oid.shard_id ||
+ this->oid().hobj.pool != oid.hobj.pool ||
+ this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ return;
+ }
+
+ m_chunk_iter = m_chunk.lower_bound(oid);
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ void upper_bound(const ghobject_t &oid) override {
+ lower_bound(oid);
+
+ if (valid() && this->oid() == oid) {
+ next();
+ }
+ }
+
+ void next() override {
+ ceph_assert(valid());
+
+ m_chunk_iter++;
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ int cmp(const ghobject_t &oid) const override {
+ ceph_assert(valid());
+
+ if (this->oid() < oid) {
+ return -1;
+ }
+ if (this->oid() > oid) {
+ return 1;
+ }
+ return 0;
+ }
+
+private:
+ std::map<ghobject_t, std::string> m_chunk;
+ std::map<ghobject_t, std::string>::iterator m_chunk_iter;
+
+ bool get_next_chunk() {
+ while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+ m_it->next();
+ }
+
+ if (!m_it->valid()) {
+ return false;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(m_it->key(), &oid);
+ ceph_assert(r == 0);
+
+ m_chunk.clear();
+ while (true) {
+ m_chunk.insert({oid, m_it->key()});
+
+ do {
+ m_it->next();
+ } while (m_it->valid() && is_extent_shard_key(m_it->key()));
+
+ if (!m_it->valid()) {
+ break;
+ }
+
+ ghobject_t next;
+ r = get_key_object(m_it->key(), &next);
+ ceph_assert(r == 0);
+ if (next.shard_id != oid.shard_id ||
+ next.hobj.pool != oid.hobj.pool ||
+ next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ break;
+ }
+ oid = next;
+ }
+
+ m_chunk_iter = m_chunk.begin();
+ return true;
+ }
+};
+
+} // anonymous namespace
+
+// Garbage Collector
+
+void BlueStore::GarbageCollector::process_protrusive_extents(
+ const BlueStore::ExtentMap& extent_map,
+ uint64_t start_offset,
+ uint64_t end_offset,
+ uint64_t start_touch_offset,
+ uint64_t end_touch_offset,
+ uint64_t min_alloc_size)
+{
+  ceph_assert(start_offset <= start_touch_offset && end_offset >= end_touch_offset);
+
+ uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
+ uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
+
+ dout(30) << __func__ << " (hex): [" << std::hex
+ << lookup_start_offset << ", " << lookup_end_offset
+ << ")" << std::dec << dendl;
+
+ for (auto it = extent_map.seek_lextent(lookup_start_offset);
+ it != extent_map.extent_map.end() &&
+ it->logical_offset < lookup_end_offset;
+ ++it) {
+ uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
+ uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *it
+             << " alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
+ << dendl;
+
+ Blob* b = it->blob.get();
+
+    if (it->logical_offset >= start_touch_offset &&
+ it->logical_end() <= end_touch_offset) {
+ // Process extents within the range affected by
+ // the current write request.
+ // Need to take into account if existing extents
+ // can be merged with them (uncompressed case)
+ if (!b->get_blob().is_compressed()) {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ --blob_info_counted->expected_allocations; // don't need to allocate
+ // new AU for compressed
+ // data since another
+ // collocated uncompressed
+ // blob already exists
+ dout(30) << __func__ << " --expected:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ } else if (b->get_blob().is_compressed()) {
+
+ // additionally we take compressed blobs that were not impacted
+ // by the write into account too
+ BlobInfo& bi =
+ affected_blobs.emplace(
+ b, BlobInfo(b->get_referenced_bytes())).first->second;
+
+ int adjust =
+ (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
+ bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
+ dout(30) << __func__ << " expected_allocations="
+ << bi.expected_allocations << " end_au:"
+ << alloc_unit_end << dendl;
+
+ blob_info_counted = &bi;
+ used_alloc_unit = alloc_unit_end;
+
+ ceph_assert(it->length <= bi.referenced_bytes);
+ bi.referenced_bytes -= it->length;
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << it->length
+ << " referenced = 0x" << bi.referenced_bytes
+ << std::dec << dendl;
+      // NOTE: we can't move a specific blob to the resulting GC list here
+      // when its reference counter == 0 since subsequent extents might
+      // decrement its expected_allocation.
+      // Hence we need to enumerate all the extents first.
+ if (!bi.collect_candidate) {
+ bi.first_lextent = it;
+ bi.collect_candidate = true;
+ }
+ bi.last_lextent = it;
+ } else {
+ if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+ // don't need to allocate new AU for compressed data since another
+ // collocated uncompressed blob already exists
+ --blob_info_counted->expected_allocations;
+ dout(30) << __func__ << " --expected_allocations:"
+ << alloc_unit_start << dendl;
+ }
+ used_alloc_unit = alloc_unit_end;
+ blob_info_counted = nullptr;
+ }
+ }
+
+ for (auto b_it = affected_blobs.begin();
+ b_it != affected_blobs.end();
+ ++b_it) {
+ Blob* b = b_it->first;
+ BlobInfo& bi = b_it->second;
+ if (bi.referenced_bytes == 0) {
+ uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
+ int64_t blob_expected_for_release =
+ round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
+
+ dout(30) << __func__ << " " << *(b_it->first)
+ << " expected4release=" << blob_expected_for_release
+ << " expected_allocations=" << bi.expected_allocations
+ << dendl;
+ int64_t benefit = blob_expected_for_release - bi.expected_allocations;
+ if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
+ if (bi.collect_candidate) {
+ auto it = bi.first_lextent;
+ bool bExit = false;
+ do {
+ if (it->blob.get() == b) {
+ extents_to_collect.insert(it->logical_offset, it->length);
+ }
+ bExit = it == bi.last_lextent;
+ ++it;
+ } while (!bExit);
+ }
+ expected_for_release += blob_expected_for_release;
+ expected_allocations += bi.expected_allocations;
+ }
+ }
+ }
+}
+
+int64_t BlueStore::GarbageCollector::estimate(
+ uint64_t start_offset,
+ uint64_t length,
+ const BlueStore::ExtentMap& extent_map,
+ const BlueStore::old_extent_map_t& old_extents,
+ uint64_t min_alloc_size)
+{
+
+ affected_blobs.clear();
+ extents_to_collect.clear();
+  used_alloc_unit = boost::optional<uint64_t>();
+ blob_info_counted = nullptr;
+
+ uint64_t gc_start_offset = start_offset;
+ uint64_t gc_end_offset = start_offset + length;
+
+ uint64_t end_offset = start_offset + length;
+
+ for (auto it = old_extents.begin(); it != old_extents.end(); ++it) {
+ Blob* b = it->e.blob.get();
+ if (b->get_blob().is_compressed()) {
+
+ // update gc_start_offset/gc_end_offset if needed
+ gc_start_offset = min(gc_start_offset, (uint64_t)it->e.blob_start());
+ gc_end_offset = std::max(gc_end_offset, (uint64_t)it->e.blob_end());
+
+ auto o = it->e.logical_offset;
+ auto l = it->e.length;
+
+ uint64_t ref_bytes = b->get_referenced_bytes();
+ // micro optimization to bypass blobs that have no more references
+ if (ref_bytes != 0) {
+ dout(30) << __func__ << " affected_blob:" << *b
+ << " unref 0x" << std::hex << o << "~" << l
+ << std::dec << dendl;
+ affected_blobs.emplace(b, BlobInfo(ref_bytes));
+ }
+ }
+ }
+ dout(30) << __func__ << " gc range(hex): [" << std::hex
+ << gc_start_offset << ", " << gc_end_offset
+ << ")" << std::dec << dendl;
+
+  // enumerate preceding extents to check if they reference affected blobs
+ if (gc_start_offset < start_offset || gc_end_offset > end_offset) {
+ process_protrusive_extents(extent_map,
+ gc_start_offset,
+ gc_end_offset,
+ start_offset,
+ end_offset,
+ min_alloc_size);
+ }
+ return expected_for_release - expected_allocations;
+}
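+// Illustrative reading of the estimate (assumed numbers): if the affected
+// compressed blobs would free 4 allocation units once their live data is
+// rewritten, while rewriting that data is expected to consume 1 new unit,
+// estimate() returns 3; a positive balance suggests collection saves space.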
+
+// LruOnodeCacheShard
+struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Onode,
+ boost::intrusive::member_hook<
+ BlueStore::Onode,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Onode::lru_item> > list_t;
+
+ list_t lru;
+
+ explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
+
+ void _add(BlueStore::Onode* o, int level) override
+ {
+ if (o->put_cache()) {
+ (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+ } else {
+ ++num_pinned;
+ }
+ ++num; // we count both pinned and unpinned entries
+ dout(20) << __func__ << " " << this << " " << o->oid << " added, num=" << num << dendl;
+ }
+ void _rm(BlueStore::Onode* o) override
+ {
+ if (o->pop_cache()) {
+ lru.erase(lru.iterator_to(*o));
+ } else {
+ ceph_assert(num_pinned);
+ --num_pinned;
+ }
+ ceph_assert(num);
+ --num;
+    dout(20) << __func__ << " " << this << " " << o->oid << " removed, num=" << num << dendl;
+ }
+ void _pin(BlueStore::Onode* o) override
+ {
+ lru.erase(lru.iterator_to(*o));
+ ++num_pinned;
+    dout(20) << __func__ << " " << this << " " << o->oid << " pinned" << dendl;
+ }
+ void _unpin(BlueStore::Onode* o) override
+ {
+ lru.push_front(*o);
+ ceph_assert(num_pinned);
+ --num_pinned;
+    dout(20) << __func__ << " " << this << " " << o->oid << " unpinned" << dendl;
+ }
+ void _unpin_and_rm(BlueStore::Onode* o) override
+ {
+ o->pop_cache();
+ ceph_assert(num_pinned);
+ --num_pinned;
+ ceph_assert(num);
+ --num;
+ }
+ void _trim_to(uint64_t new_size) override
+ {
+ if (new_size >= lru.size()) {
+ return; // don't even try
+ }
+ uint64_t n = lru.size() - new_size;
+ auto p = lru.end();
+ ceph_assert(p != lru.begin());
+ --p;
+ ceph_assert(num >= n);
+ num -= n;
+ while (n-- > 0) {
+ BlueStore::Onode *o = &*p;
+ dout(20) << __func__ << " rm " << o->oid << " "
+ << o->nref << " " << o->cached << " " << o->pinned << dendl;
+ if (p != lru.begin()) {
+ lru.erase(p--);
+ } else {
+ ceph_assert(n == 0);
+ lru.erase(p);
+ }
+ auto pinned = !o->pop_cache();
+ ceph_assert(!pinned);
+ o->c->onode_map._remove(o->oid);
+ }
+ }
+ void move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
+ {
+ if (to == this) {
+ return;
+ }
+ ceph_assert(o->cached);
+ ceph_assert(o->pinned);
+ ceph_assert(num);
+ ceph_assert(num_pinned);
+ --num_pinned;
+ --num;
+ ++to->num_pinned;
+ ++to->num;
+ }
+ void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
+ {
+ *onodes += num;
+ *pinned_onodes += num_pinned;
+ }
+};
+
+// OnodeCacheShard
+BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BlueStore::OnodeCacheShard *c = nullptr;
+ // Currently we only implement an LRU cache for onodes
+ c = new LruOnodeCacheShard(cct);
+ c->logger = logger;
+ return c;
+}
+
+// LruBufferCacheShard
+struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t lru;
+
+ explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
+
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
+ if (near) {
+ auto q = lru.iterator_to(*near);
+ lru.insert(q, *b);
+ } else if (level > 0) {
+ lru.push_front(*b);
+ } else {
+ lru.push_back(*b);
+ }
+ buffer_bytes += b->length;
+ num = lru.size();
+ }
+ void _rm(BlueStore::Buffer *b) override {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ auto q = lru.iterator_to(*b);
+ lru.erase(q);
+ num = lru.size();
+ }
+ void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
+ src->_rm(b);
+ _add(b, 0, nullptr);
+ }
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ }
+ void _touch(BlueStore::Buffer *b) override {
+ auto p = lru.iterator_to(*b);
+ lru.erase(p);
+ lru.push_front(*b);
+ num = lru.size();
+ _audit("_touch_buffer end");
+ }
+
+ void _trim_to(uint64_t max) override
+ {
+ while (buffer_bytes > max) {
+ auto i = lru.rbegin();
+ if (i == lru.rend()) {
+ // stop if lru is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*i;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
+ num = lru.size();
+ }
+
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
+ }
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override
+ {
+    dout(10) << __func__ << " " << s << " start" << dendl;
+    uint64_t tot = 0;
+    for (auto i = lru.begin(); i != lru.end(); ++i) {
+      tot += i->length;
+    }
+    if (tot != buffer_bytes) {
+      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << tot
+           << dendl;
+      for (auto i = lru.begin(); i != lru.end(); ++i) {
+        derr << __func__ << " " << *i << dendl;
+      }
+      ceph_assert(tot == buffer_bytes);
+    }
+    dout(20) << __func__ << " " << s << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
+ }
+#endif
+};
+
+// TwoQBufferCacheShard
+
+struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
+ typedef boost::intrusive::list<
+ BlueStore::Buffer,
+ boost::intrusive::member_hook<
+ BlueStore::Buffer,
+ boost::intrusive::list_member_hook<>,
+ &BlueStore::Buffer::lru_item> > list_t;
+ list_t hot; ///< "Am" hot buffers
+ list_t warm_in; ///< "A1in" newly warm buffers
+ list_t warm_out; ///< "A1out" empty buffers we've evicted
+
+ enum {
+ BUFFER_NEW = 0,
+ BUFFER_WARM_IN, ///< in warm_in
+ BUFFER_WARM_OUT, ///< in warm_out
+ BUFFER_HOT, ///< in hot
+ BUFFER_TYPE_MAX
+ };
+
+ uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+
+public:
+ explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
+
+ void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
+ {
+ dout(20) << __func__ << " level " << level << " near " << near
+ << " on " << *b
+ << " which has cache_private " << b->cache_private << dendl;
+ if (near) {
+ b->cache_private = near->cache_private;
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ warm_in.insert(warm_in.iterator_to(*near), *b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ warm_out.insert(warm_out.iterator_to(*near), *b);
+ break;
+ case BUFFER_HOT:
+ hot.insert(hot.iterator_to(*near), *b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ } else if (b->cache_private == BUFFER_NEW) {
+ b->cache_private = BUFFER_WARM_IN;
+ if (level > 0) {
+ warm_in.push_front(*b);
+ } else {
+ // take caller hint to start at the back of the warm queue
+ warm_in.push_back(*b);
+ }
+ } else {
+ // we got a hint from discard
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // stay in warm_in. move to front, even though 2Q doesn't actually
+ // do this.
+ dout(20) << __func__ << " move to front of warm " << *b << dendl;
+ warm_in.push_front(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ b->cache_private = BUFFER_HOT;
+ // move to hot. fall-thru
+ case BUFFER_HOT:
+ dout(20) << __func__ << " move to front of hot " << *b << dendl;
+ hot.push_front(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _rm(BlueStore::Buffer *b) override
+ {
+ dout(20) << __func__ << " " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[b->cache_private] >= b->length);
+ list_bytes[b->cache_private] -= b->length;
+ }
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ warm_in.erase(warm_in.iterator_to(*b));
+ break;
+ case BUFFER_WARM_OUT:
+ warm_out.erase(warm_out.iterator_to(*b));
+ break;
+ case BUFFER_HOT:
+ hot.erase(hot.iterator_to(*b));
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
+ {
+ TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
+ src->_rm(b);
+
+ // preserve which list we're on (even if we can't preserve the order!)
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ ceph_assert(!b->is_empty());
+ warm_in.push_back(*b);
+ break;
+ case BUFFER_WARM_OUT:
+ ceph_assert(b->is_empty());
+ warm_out.push_back(*b);
+ break;
+ case BUFFER_HOT:
+ ceph_assert(!b->is_empty());
+ hot.push_back(*b);
+ break;
+ default:
+ ceph_abort_msg("bad cache_private");
+ }
+ if (!b->is_empty()) {
+ buffer_bytes += b->length;
+ list_bytes[b->cache_private] += b->length;
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
+ {
+ dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
+ if (!b->is_empty()) {
+ ceph_assert((int64_t)buffer_bytes + delta >= 0);
+ buffer_bytes += delta;
+ ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
+ list_bytes[b->cache_private] += delta;
+ }
+ }
+
+ void _touch(BlueStore::Buffer *b) override {
+ switch (b->cache_private) {
+ case BUFFER_WARM_IN:
+ // do nothing (somewhat counter-intuitively!)
+ break;
+ case BUFFER_WARM_OUT:
+ // move from warm_out to hot LRU
+ ceph_abort_msg("this happens via discard hint");
+ break;
+ case BUFFER_HOT:
+ // move to front of hot LRU
+ hot.erase(hot.iterator_to(*b));
+ hot.push_front(*b);
+ break;
+ }
+ num = hot.size() + warm_in.size();
+ _audit("_touch_buffer end");
+ }
+
+ void _trim_to(uint64_t max) override
+ {
+ if (buffer_bytes > max) {
+ uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
+ uint64_t khot = max - kin;
+
+      // pre-calculate kout based on the average buffer size too; this is
+      // typically representative (the warm_in and hot lists may change later)
+ uint64_t kout = 0;
+ uint64_t buffer_num = hot.size() + warm_in.size();
+ if (buffer_num) {
+ uint64_t avg_size = buffer_bytes / buffer_num;
+ ceph_assert(avg_size);
+ uint64_t calculated_num = max / avg_size;
+ kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
+ }
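+      // Worked example (illustrative; the ratios come from configuration):
+      // with max = 64 MiB, kin_ratio = kout_ratio = 0.5 and an average buffer
+      // of 64 KiB, kin = khot = 32 MiB and kout = (64 MiB / 64 KiB) * 0.5
+      // = 512 warm_out entries.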
+
+ if (list_bytes[BUFFER_HOT] < khot) {
+ // hot is small, give slack to warm_in
+ kin += khot - list_bytes[BUFFER_HOT];
+ } else if (list_bytes[BUFFER_WARM_IN] < kin) {
+ // warm_in is small, give slack to hot
+ khot += kin - list_bytes[BUFFER_WARM_IN];
+ }
+
+ // adjust warm_in list
+ int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
+ uint64_t evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = warm_in.rbegin();
+ if (p == warm_in.rend()) {
+ // stop if warm_in list is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*p;
+ ceph_assert(b->is_clean());
+ dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+ ceph_assert(buffer_bytes >= b->length);
+ buffer_bytes -= b->length;
+ ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
+ list_bytes[BUFFER_WARM_IN] -= b->length;
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->state = BlueStore::Buffer::STATE_EMPTY;
+ b->data.clear();
+ warm_in.erase(warm_in.iterator_to(*b));
+ warm_out.push_front(*b);
+ b->cache_private = BUFFER_WARM_OUT;
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from warm_in list, done evicting warm_in buffers"
+ << dendl;
+ }
+
+ // adjust hot list
+ to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
+ evicted = 0;
+
+ while (to_evict_bytes > 0) {
+ auto p = hot.rbegin();
+ if (p == hot.rend()) {
+ // stop if hot list is now empty
+ break;
+ }
+
+ BlueStore::Buffer *b = &*p;
+ dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+ ceph_assert(b->is_clean());
+ // adjust evict size before buffer goes invalid
+ to_evict_bytes -= b->length;
+ evicted += b->length;
+ b->space->_rm_buffer(this, b);
+ }
+
+ if (evicted > 0) {
+ dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+ << " from hot list, done evicting hot buffers"
+ << dendl;
+ }
+
+ // adjust warm out list too, if necessary
+ int64_t n = warm_out.size() - kout;
+ while (n-- > 0) {
+ BlueStore::Buffer *b = &*warm_out.rbegin();
+ ceph_assert(b->is_empty());
+ dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+ b->space->_rm_buffer(this, b);
+ }
+ }
+ num = hot.size() + warm_in.size();
+ }
+
+ void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) override {
+ *extents += num_extents;
+ *blobs += num_blobs;
+ *buffers += num;
+ *bytes += buffer_bytes;
+ }
+
+#ifdef DEBUG_CACHE
+ void _audit(const char *s) override
+ {
+    dout(10) << __func__ << " " << s << " start" << dendl;
+    uint64_t tot = 0;
+    for (auto i = hot.begin(); i != hot.end(); ++i) {
+      tot += i->length;
+    }
+
+    uint64_t hot_bytes = tot;
+    if (hot_bytes != list_bytes[BUFFER_HOT]) {
+      derr << __func__ << " hot_list_bytes "
+           << list_bytes[BUFFER_HOT]
+           << " != actual " << hot_bytes
+           << dendl;
+      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
+    }
+
+    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
+      tot += i->length;
+    }
+
+    uint64_t warm_in_bytes = tot - hot_bytes;
+    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
+      derr << __func__ << " warm_in_list_bytes "
+           << list_bytes[BUFFER_WARM_IN]
+           << " != actual " << warm_in_bytes
+           << dendl;
+      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
+    }
+
+    if (tot != buffer_bytes) {
+      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << tot
+           << dendl;
+      ceph_assert(tot == buffer_bytes);
+    }
+
+    dout(20) << __func__ << " " << s << " buffer_bytes " << buffer_bytes
+ << " ok" << dendl;
+ }
+#endif
+};
+
+// BufferCacheShard
+
+BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
+ CephContext* cct,
+ string type,
+ PerfCounters *logger)
+{
+ BufferCacheShard *c = nullptr;
+ if (type == "lru")
+ c = new LruBufferCacheShard(cct);
+ else if (type == "2q")
+ c = new TwoQBufferCacheShard(cct);
+ else
+ ceph_abort_msg("unrecognized cache type");
+ c->logger = logger;
+ return c;
+}
+
+// BufferSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
+
+void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << dendl;
+ while (!buffer_map.empty()) {
+ _rm_buffer(cache, buffer_map.begin());
+ }
+}
+
+int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
+{
+ // note: we already hold cache->lock
+ ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
+ << std::dec << dendl;
+ int cache_private = 0;
+ cache->_audit("discard start");
+ auto i = _data_lower_bound(offset);
+ uint32_t end = offset + length;
+ while (i != buffer_map.end()) {
+ Buffer *b = i->second.get();
+ if (b->offset >= end) {
+ break;
+ }
+ if (b->cache_private > cache_private) {
+ cache_private = b->cache_private;
+ }
+ if (b->offset < offset) {
+ int64_t front = offset - b->offset;
+ if (b->end() > end) {
+ // drop middle (split)
+ uint32_t tail = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - tail, tail);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
+ b->flags),
+ 0, b);
+ }
+ if (!b->is_writing()) {
+ cache->_adjust_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ cache->_audit("discard end 1");
+ break;
+ } else {
+ // drop tail
+ if (!b->is_writing()) {
+ cache->_adjust_size(b, front - (int64_t)b->length);
+ }
+ b->truncate(front);
+ b->maybe_rebuild();
+ ++i;
+ continue;
+ }
+ }
+ if (b->end() <= end) {
+ // drop entire buffer
+ _rm_buffer(cache, i++);
+ continue;
+ }
+ // drop front
+ uint32_t keep = b->end() - end;
+ if (b->data.length()) {
+ bufferlist bl;
+ bl.substr_of(b->data, b->length - keep, keep);
+ Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+ nb->maybe_rebuild();
+ _add_buffer(cache, nb, 0, b);
+ } else {
+ _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
+ b->flags),
+ 0, b);
+ }
+ _rm_buffer(cache, i);
+ cache->_audit("discard end 2");
+ break;
+ }
+ return cache_private;
+}
+
+void BlueStore::BufferSpace::read(
+ BufferCacheShard* cache,
+ uint32_t offset,
+ uint32_t length,
+ BlueStore::ready_regions_t& res,
+ interval_set<uint32_t>& res_intervals,
+ int flags)
+{
+ res.clear();
+ res_intervals.clear();
+ uint32_t want_bytes = length;
+ uint32_t end = offset + length;
+
+ {
+ std::lock_guard l(cache->lock);
+ for (auto i = _data_lower_bound(offset);
+ i != buffer_map.end() && offset < end && i->first < end;
+ ++i) {
+ Buffer *b = i->second.get();
+ ceph_assert(b->end() > offset);
+
+ bool val = false;
+ if (flags & BYPASS_CLEAN_CACHE)
+ val = b->is_writing();
+ else
+ val = b->is_writing() || b->is_clean();
+ if (val) {
+ if (b->offset < offset) {
+ uint32_t skip = offset - b->offset;
+ uint32_t l = min(length, b->length - skip);
+ res[offset].substr_of(b->data, skip, l);
+ res_intervals.insert(offset, l);
+ offset += l;
+ length -= l;
+ if (!b->is_writing()) {
+ cache->_touch(b);
+ }
+ continue;
+ }
+ if (b->offset > offset) {
+ uint32_t gap = b->offset - offset;
+ if (length <= gap) {
+ break;
+ }
+ offset += gap;
+ length -= gap;
+ }
+ if (!b->is_writing()) {
+ cache->_touch(b);
+ }
+ if (b->length > length) {
+ res[offset].substr_of(b->data, 0, length);
+ res_intervals.insert(offset, length);
+ break;
+ } else {
+ res[offset].append(b->data);
+ res_intervals.insert(offset, b->length);
+ if (b->length == length)
+ break;
+ offset += b->length;
+ length -= b->length;
+ }
+ }
+ }
+ }
+
+ uint64_t hit_bytes = res_intervals.size();
+ ceph_assert(hit_bytes <= want_bytes);
+ uint64_t miss_bytes = want_bytes - hit_bytes;
+ cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
+ cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
+}
+
+void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
+{
+ auto i = writing.begin();
+ while (i != writing.end()) {
+ if (i->seq > seq) {
+ break;
+ }
+ if (i->seq < seq) {
+ ++i;
+ continue;
+ }
+
+ Buffer *b = &*i;
+ ceph_assert(b->is_writing());
+
+ if (b->flags & Buffer::FLAG_NOCACHE) {
+ writing.erase(i++);
+ ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
+ buffer_map.erase(b->offset);
+ } else {
+ b->state = Buffer::STATE_CLEAN;
+ writing.erase(i++);
+ b->maybe_rebuild();
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+ cache->_add(b, 1, nullptr);
+ ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
+ }
+ }
+ cache->_trim();
+ cache->_audit("finish_write end");
+}
+
+void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
+{
+ std::lock_guard lk(cache->lock);
+ if (buffer_map.empty())
+ return;
+
+ auto p = --buffer_map.end();
+ while (true) {
+ if (p->second->end() <= pos)
+ break;
+
+ if (p->second->offset < pos) {
+ ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
+ size_t left = pos - p->second->offset;
+ size_t right = p->second->length - left;
+ if (p->second->data.length()) {
+ bufferlist bl;
+ bl.substr_of(p->second->data, left, right);
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ 0, bl, p->second->flags),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ 0, right, p->second->flags),
+ 0, p->second.get());
+ }
+ cache->_adjust_size(p->second.get(), -right);
+ p->second->truncate(left);
+ break;
+ }
+
+ ceph_assert(p->second->end() > pos);
+ ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
+ if (p->second->data.length()) {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->data, p->second->flags),
+ 0, p->second.get());
+ } else {
+ r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+ p->second->offset - pos, p->second->length, p->second->flags),
+ 0, p->second.get());
+ }
+ if (p == buffer_map.begin()) {
+ _rm_buffer(cache, p);
+ break;
+ } else {
+ _rm_buffer(cache, p--);
+ }
+ }
+ ceph_assert(writing.empty());
+ cache->_trim();
+}
+
+// OnodeSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid,
+ OnodeRef& o)
+{
+ std::lock_guard l(cache->lock);
+ auto p = onode_map.find(oid);
+ if (p != onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
+ << " raced, returning existing " << p->second
+ << dendl;
+ return p->second;
+ }
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
+ onode_map[oid] = o;
+ cache->_add(o.get(), 1);
+ cache->_trim();
+ return o;
+}
+
+void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
+{
+ ldout(cache->cct, 20) << __func__ << " " << oid << " " << dendl;
+ onode_map.erase(oid);
+}
+
+BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
+{
+ ldout(cache->cct, 30) << __func__ << dendl;
+ OnodeRef o;
+ bool hit = false;
+
+ {
+ std::lock_guard l(cache->lock);
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
+ if (p == onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
+ } else {
+ ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
+ << " " << p->second->nref
+ << " " << p->second->cached
+ << " " << p->second->pinned
+ << dendl;
+      // This will pin the onode and implicitly touch the cache when the
+      // Onode eventually becomes unpinned
+ o = p->second;
+ ceph_assert(!o->cached || o->pinned);
+
+ hit = true;
+ }
+ }
+
+ if (hit) {
+ cache->logger->inc(l_bluestore_onode_hits);
+ } else {
+ cache->logger->inc(l_bluestore_onode_misses);
+ }
+ return o;
+}
+
+void BlueStore::OnodeSpace::clear()
+{
+ std::lock_guard l(cache->lock);
+  ldout(cache->cct, 10) << __func__ << " " << onode_map.size() << dendl;
+ for (auto &p : onode_map) {
+ cache->_rm(p.second.get());
+ }
+ onode_map.clear();
+}
+
+bool BlueStore::OnodeSpace::empty()
+{
+ std::lock_guard l(cache->lock);
+ return onode_map.empty();
+}
+
+void BlueStore::OnodeSpace::rename(
+ OnodeRef& oldo,
+ const ghobject_t& old_oid,
+ const ghobject_t& new_oid,
+ const mempool::bluestore_cache_meta::string& new_okey)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
+ << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+ po = onode_map.find(old_oid);
+ pn = onode_map.find(new_oid);
+ ceph_assert(po != pn);
+
+ ceph_assert(po != onode_map.end());
+ if (pn != onode_map.end()) {
+ ldout(cache->cct, 30) << __func__ << " removing target " << pn->second
+ << dendl;
+ cache->_rm(pn->second.get());
+ onode_map.erase(pn);
+ }
+ OnodeRef o = po->second;
+
+ // install a non-existent onode at old location
+ oldo.reset(new Onode(o->c, old_oid, o->key));
+ po->second = oldo;
+ cache->_add(oldo.get(), 1);
+ // add at new position and fix oid, key.
+  // This will pin 'o' and implicitly touch the cache
+  // when it eventually becomes unpinned
+ onode_map.insert(make_pair(new_oid, o));
+ ceph_assert(o->pinned);
+
+ o->oid = new_oid;
+ o->key = new_okey;
+ cache->_trim();
+}
+
+bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
+{
+ std::lock_guard l(cache->lock);
+ ldout(cache->cct, 20) << __func__ << dendl;
+ for (auto& i : onode_map) {
+ if (f(i.second.get())) {
+ return true;
+ }
+ }
+ return false;
+}
+
+template <int LogLevelV = 30>
+void BlueStore::OnodeSpace::dump(CephContext *cct)
+{
+ for (auto& i : onode_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << i.second
+ << " " << i.second->nref
+ << " " << i.second->cached
+ << " " << i.second->pinned
+ << dendl;
+ }
+}
+
+// SharedBlob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
+#undef dout_context
+#define dout_context coll->store->cct
+
+void BlueStore::SharedBlob::dump(Formatter* f) const
+{
+ f->dump_bool("loaded", loaded);
+ if (loaded) {
+ persistent->dump(f);
+ } else {
+ f->dump_unsigned("sbid_unloaded", sbid_unloaded);
+ }
+}
+
+ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
+{
+ out << "SharedBlob(" << &sb;
+
+ if (sb.loaded) {
+ out << " loaded " << *sb.persistent;
+ } else {
+ out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
+ }
+ return out << ")";
+}
+
+BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
+ : coll(_coll), sbid_unloaded(i)
+{
+ ceph_assert(sbid_unloaded > 0);
+ if (get_cache()) {
+ get_cache()->add_blob();
+ }
+}
+
+BlueStore::SharedBlob::~SharedBlob()
+{
+ if (loaded && persistent) {
+ delete persistent;
+ }
+}
+
+void BlueStore::SharedBlob::put()
+{
+ if (--nref == 0) {
+ dout(20) << __func__ << " " << this
+ << " removing self from set " << get_parent()
+ << dendl;
+ again:
+ auto coll_snap = coll;
+ if (coll_snap) {
+ std::lock_guard l(coll_snap->cache->lock);
+ if (coll_snap != coll) {
+ goto again;
+ }
+ if (!coll_snap->shared_blob_set.remove(this, true)) {
+ // race with lookup
+ return;
+ }
+ bc._clear(coll_snap->cache);
+ coll_snap->cache->rm_blob();
+ }
+ delete this;
+ }
+}
+
+void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.get(offset, length);
+}
+
+void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
+ PExtentVector *r,
+ bool *unshare)
+{
+ ceph_assert(persistent);
+ persistent->ref_map.put(offset, length, r,
+ unshare && !*unshare ? unshare : nullptr);
+}
+
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+ while (true) {
+ BufferCacheShard *cache = coll->cache;
+ std::lock_guard l(cache->lock);
+ if (coll->cache != cache) {
+ dout(20) << __func__
+ << " raced with sb cache update, was " << cache
+ << ", now " << coll->cache << ", retrying"
+ << dendl;
+ continue;
+ }
+ bc._finish_write(cache, seq);
+ break;
+ }
+}
+
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+template <int LogLevelV = 30>
+void BlueStore::SharedBlobSet::dump(CephContext *cct)
+{
+ std::lock_guard l(lock);
+ for (auto& i : sb_map) {
+ ldout(cct, LogLevelV) << i.first << " : " << *i.second << dendl;
+ }
+}
+
+// Blob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
+
+void BlueStore::Blob::dump(Formatter* f) const
+{
+ if (is_spanning()) {
+ f->dump_unsigned("spanning_id ", id);
+ }
+ blob.dump(f);
+ if (shared_blob) {
+ f->dump_object("shared", *shared_blob);
+ }
+}
+
+ostream& operator<<(ostream& out, const BlueStore::Blob& b)
+{
+ out << "Blob(" << &b;
+ if (b.is_spanning()) {
+ out << " spanning " << b.id;
+ }
+ out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
+ if (b.shared_blob) {
+ out << " " << *b.shared_blob;
+ } else {
+ out << " (shared_blob=NULL)";
+ }
+ out << ")";
+ return out;
+}
+
+void BlueStore::Blob::discard_unallocated(Collection *coll)
+{
+ if (get_blob().is_shared()) {
+ return;
+ }
+ if (get_blob().is_compressed()) {
+ bool discard = false;
+ bool all_invalid = true;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ discard = true;
+ } else {
+ all_invalid = false;
+ }
+ }
+    ceph_assert(discard == all_invalid); // for a compressed blob either all
+                                         // pextents are invalid or none are
+ if (discard) {
+ shared_blob->bc.discard(shared_blob->get_cache(), 0,
+ get_blob().get_logical_length());
+ }
+ } else {
+ size_t pos = 0;
+ for (auto e : get_blob().get_extents()) {
+ if (!e.is_valid()) {
+ dout(20) << __func__ << " 0x" << std::hex << pos
+ << "~" << e.length
+ << std::dec << dendl;
+ shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
+ }
+ pos += e.length;
+ }
+ if (get_blob().can_prune_tail()) {
+ dirty_blob().prune_tail();
+ used_in_blob.prune_tail(get_blob().get_ondisk_length());
+ dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
+ }
+ }
+}
+
+void BlueStore::Blob::get_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length)
+{
+  // The caller has to initialize the Blob's logical length before incrementing
+  // references. Otherwise we can neither determine the required number of
+  // counters for per-au tracking nor obtain min_release_size for the
+  // single-counter mode.
+ ceph_assert(get_blob().get_logical_length() != 0);
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ if (used_in_blob.is_empty()) {
+ uint32_t min_release_size =
+ get_blob().get_release_size(coll->store->min_alloc_size);
+ uint64_t l = get_blob().get_logical_length();
+ dout(20) << __func__ << " init 0x" << std::hex << l << ", "
+ << min_release_size << std::dec << dendl;
+ used_in_blob.init(l, min_release_size);
+ }
+ used_in_blob.get(
+ offset,
+ length);
+}
+
+bool BlueStore::Blob::put_ref(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector *r)
+{
+ PExtentVector logical;
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " " << *this << dendl;
+
+ bool empty = used_in_blob.put(
+ offset,
+ length,
+ &logical);
+ r->clear();
+ // nothing to release
+ if (!empty && logical.empty()) {
+ return false;
+ }
+
+ bluestore_blob_t& b = dirty_blob();
+ return b.release_extents(empty, logical, r);
+}
+
+bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
+ uint32_t target_blob_size,
+ uint32_t b_offset,
+ uint32_t *length0) {
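+ // Descriptive summary: decide whether a write of *length0 bytes at b_offset
+ // can reuse this blob, possibly growing its tail up to target_blob_size;
+ // on success *length0 may be reduced to the portion that actually fits.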
+ ceph_assert(min_alloc_size);
+ ceph_assert(target_blob_size);
+ if (!get_blob().is_mutable()) {
+ return false;
+ }
+
+ uint32_t length = *length0;
+ uint32_t end = b_offset + length;
+
+ // For the sake of simplicity we currently skip blob reuse if the data is
+ // not aligned to the csum chunk. Padding could be added later if needed.
+ if (get_blob().has_csum() &&
+ ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
+ (end % get_blob().get_csum_chunk_size()) != 0)) {
+ return false;
+ }
+
+ auto blen = get_blob().get_logical_length();
+ uint32_t new_blen = blen;
+
+ // make sure target_blob_size isn't less than current blob len
+ target_blob_size = std::max(blen, target_blob_size);
+
+ if (b_offset >= blen) {
+ // new data totally stands out of the existing blob
+ new_blen = end;
+ } else {
+ // new data overlaps with the existing blob
+ new_blen = std::max(blen, end);
+
+ uint32_t overlap = 0;
+ if (new_blen > blen) {
+ overlap = blen - b_offset;
+ } else {
+ overlap = length;
+ }
+
+ if (!get_blob().is_unallocated(b_offset, overlap)) {
+ // abort if any piece of the overlap has already been allocated
+ return false;
+ }
+ }
+
+ if (new_blen > blen) {
+ int64_t overflow = int64_t(new_blen) - target_blob_size;
+ // Unable to shrink the provided length enough to fit into target_blob_size
+ if (overflow >= length) {
+ return false;
+ }
+
+ // FIXME: in some cases we could reduce unused resolution
+ if (get_blob().has_unused()) {
+ return false;
+ }
+
+ if (overflow > 0) {
+ new_blen -= overflow;
+ length -= overflow;
+ *length0 = length;
+ }
+
+ if (new_blen > blen) {
+ dirty_blob().add_tail(new_blen);
+ used_in_blob.add_tail(new_blen,
+ get_blob().get_release_size(min_alloc_size));
+ }
+ }
+ return true;
+}
+
+void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
+{
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " start " << *this << dendl;
+ ceph_assert(blob.can_split());
+ ceph_assert(used_in_blob.can_split());
+ bluestore_blob_t &lb = dirty_blob();
+ bluestore_blob_t &rb = r->dirty_blob();
+
+ used_in_blob.split(
+ blob_offset,
+ &(r->used_in_blob));
+
+ lb.split(blob_offset, rb);
+ shared_blob->bc.split(shared_blob->get_cache(), blob_offset, r->shared_blob->bc);
+
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " finish " << *this << dendl;
+ dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+ << " and " << *r << dendl;
+}
+
+#ifndef CACHE_BLOB_BL
+void BlueStore::Blob::decode(
+ Collection *coll,
+ bufferptr::const_iterator& p,
+ uint64_t struct_v,
+ uint64_t* sbid,
+ bool include_ref_map)
+{
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(*sbid, p);
+ }
+ if (include_ref_map) {
+ if (struct_v > 1) {
+ used_in_blob.decode(p);
+ } else {
+ used_in_blob.clear();
+ bluestore_extent_ref_map_t legacy_ref_map;
+ legacy_ref_map.decode(p);
+ for (auto r : legacy_ref_map.ref_map) {
+ get_ref(
+ coll,
+ r.first,
+ r.second.refs * r.second.length);
+ }
+ }
+ }
+}
+#endif
+
+// Extent
+
+void BlueStore::Extent::dump(Formatter* f) const
+{
+ f->dump_unsigned("logical_offset", logical_offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("blob_offset", blob_offset);
+ f->dump_object("blob", *blob);
+}
+
+ostream& operator<<(ostream& out, const BlueStore::Extent& e)
+{
+ return out << std::hex << "0x" << e.logical_offset << "~" << e.length
+ << ": 0x" << e.blob_offset << "~" << e.length << std::dec
+ << " " << *e.blob;
+}
+
+// OldExtent
+BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
+ uint32_t lo,
+ uint32_t o,
+ uint32_t l,
+ BlobRef& b) {
+ OldExtent* oe = new OldExtent(lo, o, l, b);
+ b->put_ref(c.get(), o, l, &(oe->r));
+ oe->blob_empty = !b->is_referenced();
+ return oe;
+}
+
+// ExtentMap
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
+#undef dout_context
+#define dout_context onode->c->store->cct
+
+BlueStore::ExtentMap::ExtentMap(Onode *o)
+ : onode(o),
+ inline_bl(
+ o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
+}
+
+void BlueStore::ExtentMap::dump(Formatter* f) const
+{
+ f->open_array_section("extents");
+
+ for (auto& e : extent_map) {
+ f->dump_object("extent", e);
+ }
+ f->close_section();
+}
+
+void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
+ CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
+ uint64_t& length, uint64_t& dstoff) {
+
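+ // Descriptive summary: clone the lextents in [srcoff, srcoff+length) from
+ // oldo into newo at dstoff without copying data; the underlying blobs are
+ // marked shared (getting a shared blob id) and their extent refs are bumped
+ // instead.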
+ auto cct = onode->c->store->cct;
+ bool inject_21040 =
+ cct->_conf->bluestore_debug_inject_bug21040;
+ vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
+ for (auto& e : oldo->extent_map.extent_map) {
+ e.blob->last_encoded_id = -1;
+ }
+
+ int n = 0;
+ uint64_t end = srcoff + length;
+ uint32_t dirty_range_begin = 0;
+ uint32_t dirty_range_end = 0;
+ bool src_dirty = false;
+ for (auto ep = oldo->extent_map.seek_lextent(srcoff);
+ ep != oldo->extent_map.extent_map.end();
+ ++ep) {
+ auto& e = *ep;
+ if (e.logical_offset >= end) {
+ break;
+ }
+ dout(20) << __func__ << " src " << e << dendl;
+ BlobRef cb;
+ bool blob_duped = true;
+ if (e.blob->last_encoded_id >= 0) {
+ cb = id_to_blob[e.blob->last_encoded_id];
+ blob_duped = false;
+ } else {
+ // dup the blob
+ const bluestore_blob_t& blob = e.blob->get_blob();
+ // make sure it is shared
+ if (!blob.is_shared()) {
+ c->make_blob_shared(b->_assign_blobid(txc), e.blob);
+ if (!inject_21040 && !src_dirty) {
+ src_dirty = true;
+ dirty_range_begin = e.logical_offset;
+ } else if (inject_21040 &&
+ dirty_range_begin == 0 && dirty_range_end == 0) {
+ dirty_range_begin = e.logical_offset;
+ }
+ ceph_assert(e.logical_end() > 0);
+ // -1 to exclude next potential shard
+ dirty_range_end = e.logical_end() - 1;
+ } else {
+ c->load_shared_blob(e.blob->shared_blob);
+ }
+ cb = new Blob();
+ e.blob->last_encoded_id = n;
+ id_to_blob[n] = cb;
+ e.blob->dup(*cb);
+ // bump the extent refs on the copied blob's extents
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ e.blob->shared_blob->get_ref(p.offset, p.length);
+ }
+ }
+ txc->write_shared_blob(e.blob->shared_blob);
+ dout(20) << __func__ << " new " << *cb << dendl;
+ }
+
+ int skip_front, skip_back;
+ if (e.logical_offset < srcoff) {
+ skip_front = srcoff - e.logical_offset;
+ } else {
+ skip_front = 0;
+ }
+ if (e.logical_end() > end) {
+ skip_back = e.logical_end() - end;
+ } else {
+ skip_back = 0;
+ }
+
+ Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
+ e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
+ newo->extent_map.extent_map.insert(*ne);
+ ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
+ // fixme: we may leave parts of new blob unreferenced that could
+ // be freed (relative to the shared_blob).
+ txc->statfs_delta.stored() += ne->length;
+ if (e.blob->get_blob().is_compressed()) {
+ txc->statfs_delta.compressed_original() += ne->length;
+ if (blob_duped) {
+ txc->statfs_delta.compressed() +=
+ cb->get_blob().get_compressed_payload_length();
+ }
+ }
+ dout(20) << __func__ << " dst " << *ne << dendl;
+ ++n;
+ }
+ if ((!inject_21040 && src_dirty) ||
+ (inject_21040 && dirty_range_end > dirty_range_begin)) {
+ oldo->extent_map.dirty_range(dirty_range_begin,
+ dirty_range_end - dirty_range_begin);
+ txc->write_onode(oldo);
+ }
+ txc->write_onode(newo);
+
+ if (dstoff + length > newo->onode.size) {
+ newo->onode.size = dstoff + length;
+ }
+ newo->extent_map.dirty_range(dstoff, length);
+}
+void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
+ bool force)
+{
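+ // Descriptive summary: persist the extent map. With no shards the inline
+ // encoding lives in the onode key; otherwise each dirty shard is re-encoded
+ // under its own key. A reshard is requested when the inline map or a shard
+ // grows too large, or a non-trailing shard becomes too small.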
+ auto cct = onode->c->store->cct; //used by dout
+ dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
+ if (onode->onode.extent_map_shards.empty()) {
+ if (inline_bl.length() == 0) {
+ unsigned n;
+ // we need to encode inline_bl to measure encoded length
+ bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+ inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
+ ceph_assert(!never_happen);
+ size_t len = inline_bl.length();
+ dout(20) << __func__ << " inline shard " << len << " bytes from " << n
+ << " extents" << dendl;
+ if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ request_reshard(0, OBJECT_MAX_SIZE);
+ return;
+ }
+ }
+ // will persist in the onode key.
+ } else {
+ // pending shard update
+ struct dirty_shard_t {
+ Shard *shard;
+ bufferlist bl;
+ dirty_shard_t(Shard *s) : shard(s) {}
+ };
+ vector<dirty_shard_t> encoded_shards;
+ // allocate slots for all shards in a single call instead of
+ // doing multiple allocations - one per dirty shard
+ encoded_shards.reserve(shards.size());
+
+ auto p = shards.begin();
+ auto prev_p = p;
+ while (p != shards.end()) {
+ ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
+ auto n = p;
+ ++n;
+ if (p->dirty) {
+ uint32_t endoff;
+ if (n == shards.end()) {
+ endoff = OBJECT_MAX_SIZE;
+ } else {
+ endoff = n->shard_info->offset;
+ }
+ encoded_shards.emplace_back(dirty_shard_t(&(*p)));
+ bufferlist& bl = encoded_shards.back().bl;
+ if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
+ bl, &p->extents)) {
+ if (force) {
+ derr << __func__ << " encode_some needs reshard" << dendl;
+ ceph_assert(!force);
+ }
+ }
+ size_t len = bl.length();
+
+ dout(20) << __func__ << " shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " is " << len
+ << " bytes (was " << p->shard_info->bytes << ") from "
+ << p->extents << " extents" << dendl;
+
+ if (!force) {
+ if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
+ // we are big; reshard ourselves
+ request_reshard(p->shard_info->offset, endoff);
+ }
+ // avoid resharding the trailing shard, even if it is small
+ else if (n != shards.end() &&
+ len < g_conf()->bluestore_extent_map_shard_min_size) {
+ ceph_assert(endoff != OBJECT_MAX_SIZE);
+ if (p == shards.begin()) {
+ // we are the first shard, combine with next shard
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ // combine either with the previous shard or the next,
+ // whichever is smaller
+ if (prev_p->shard_info->bytes > n->shard_info->bytes) {
+ request_reshard(p->shard_info->offset, endoff + 1);
+ } else {
+ request_reshard(prev_p->shard_info->offset, endoff);
+ }
+ }
+ }
+ }
+ }
+ prev_p = p;
+ p = n;
+ }
+ if (needs_reshard()) {
+ return;
+ }
+
+ // schedule DB update for dirty shards
+ string key;
+ for (auto& it : encoded_shards) {
+ it.shard->dirty = false;
+ it.shard->shard_info->bytes = it.bl.length();
+ generate_extent_shard_key_and_apply(
+ onode->key,
+ it.shard->shard_info->offset,
+ &key,
+ [&](const string& final_key) {
+ t->set(PREFIX_OBJ, final_key, it.bl);
+ }
+ );
+ }
+ }
+}
+
+bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
+{
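+ // Prefer (largest existing id + 1); if that overflows to a negative value,
+ // probe from a random starting point for any unused id and abort only if
+ // the whole id space is exhausted.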
+ if (spanning_blob_map.empty())
+ return 0;
+ bid_t bid = spanning_blob_map.rbegin()->first + 1;
+ // bid is valid and available.
+ if (bid >= 0)
+ return bid;
+ // Find the next unused bid.
+ bid = rand() % (numeric_limits<bid_t>::max() + 1);
+ const auto begin_bid = bid;
+ do {
+ if (!spanning_blob_map.count(bid))
+ return bid;
+ else {
+ bid++;
+ if (bid < 0) bid = 0;
+ }
+ } while (bid != begin_bid);
+ auto cct = onode->c->store->cct; // used by dout
+ _dump_onode<0>(cct, *onode);
+ ceph_abort_msg("no available blob id");
+}
+
+void BlueStore::ExtentMap::reshard(
+ KeyValueDB *db,
+ KeyValueDB::Transaction t)
+{
+ auto cct = onode->c->store->cct; // used by dout
+
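+ // Descriptive summary: widen the target range to shard boundaries, drop the
+ // old shard keys, pick new boundaries so each shard lands near
+ // bluestore_extent_map_shard_target_size, splice the new shard_info in, and
+ // finally re-evaluate which blobs still need to be spanning.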
+ dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec
+ << " of " << onode->onode.extent_map_shards.size()
+ << " shards on " << onode->oid << dendl;
+ for (auto& p : spanning_blob_map) {
+ dout(20) << __func__ << " spanning blob " << p.first << " " << *p.second
+ << dendl;
+ }
+ // determine shard index range
+ unsigned si_begin = 0, si_end = 0;
+ if (!shards.empty()) {
+ while (si_begin + 1 < shards.size() &&
+ shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
+ ++si_begin;
+ }
+ needs_reshard_begin = shards[si_begin].shard_info->offset;
+ for (si_end = si_begin; si_end < shards.size(); ++si_end) {
+ if (shards[si_end].shard_info->offset >= needs_reshard_end) {
+ needs_reshard_end = shards[si_end].shard_info->offset;
+ break;
+ }
+ }
+ if (si_end == shards.size()) {
+ needs_reshard_end = OBJECT_MAX_SIZE;
+ }
+ dout(20) << __func__ << " shards [" << si_begin << "," << si_end << ")"
+ << " over 0x[" << std::hex << needs_reshard_begin << ","
+ << needs_reshard_end << ")" << std::dec << dendl;
+ }
+
+ fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
+
+ // we may need to fault in a larger interval later; we must have all
+ // referring extents for spanning blobs loaded in order to have
+ // accurate use_tracker values.
+ uint32_t spanning_scan_begin = needs_reshard_begin;
+ uint32_t spanning_scan_end = needs_reshard_end;
+
+ // remove old keys
+ string key;
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ generate_extent_shard_key_and_apply(
+ onode->key, shards[i].shard_info->offset, &key,
+ [&](const string& final_key) {
+ t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+
+ // calculate average extent size
+ unsigned bytes = 0;
+ unsigned extents = 0;
+ if (onode->onode.extent_map_shards.empty()) {
+ bytes = inline_bl.length();
+ extents = extent_map.size();
+ } else {
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ bytes += shards[i].shard_info->bytes;
+ extents += shards[i].extents;
+ }
+ }
+ unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
+ unsigned slop = target *
+ cct->_conf->bluestore_extent_map_shard_target_size_slop;
+ unsigned extent_avg = bytes / std::max(1u, extents);
+ dout(20) << __func__ << " extent_avg " << extent_avg << ", target " << target
+ << ", slop " << slop << dendl;
+
+ // reshard
+ unsigned estimate = 0;
+ unsigned offset = needs_reshard_begin;
+ vector<bluestore_onode_t::shard_info> new_shard_info;
+ unsigned max_blob_end = 0;
+ Extent dummy(needs_reshard_begin);
+ for (auto e = extent_map.lower_bound(dummy);
+ e != extent_map.end();
+ ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+
+ // disfavor shard boundaries that span a blob
+ bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
+ if (estimate &&
+ estimate + extent_avg > target + (would_span ? slop : 0)) {
+ // new shard
+ if (offset == needs_reshard_begin) {
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ }
+ offset = e->logical_offset;
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = offset;
+ dout(20) << __func__ << " new shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ estimate = 0;
+ }
+ estimate += extent_avg;
+ unsigned bs = e->blob_start();
+ if (bs < spanning_scan_begin) {
+ spanning_scan_begin = bs;
+ }
+ uint32_t be = e->blob_end();
+ if (be > max_blob_end) {
+ max_blob_end = be;
+ }
+ if (be > spanning_scan_end) {
+ spanning_scan_end = be;
+ }
+ }
+ if (new_shard_info.empty() && (si_begin > 0 ||
+ si_end < shards.size())) {
+ // we resharded a partial range; we must produce at least one output
+ // shard
+ new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+ new_shard_info.back().offset = needs_reshard_begin;
+ dout(20) << __func__ << " new shard 0x" << std::hex << needs_reshard_begin
+ << std::dec << " (singleton degenerate case)" << dendl;
+ }
+
+ auto& sv = onode->onode.extent_map_shards;
+ dout(20) << __func__ << " new " << new_shard_info << dendl;
+ dout(20) << __func__ << " old " << sv << dendl;
+ if (sv.empty()) {
+ // no old shards to keep
+ sv.swap(new_shard_info);
+ init_shards(true, true);
+ } else {
+ // splice in new shards
+ sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
+ shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
+ sv.insert(
+ sv.begin() + si_begin,
+ new_shard_info.begin(),
+ new_shard_info.end());
+ shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
+ si_end = si_begin + new_shard_info.size();
+
+ ceph_assert(sv.size() == shards.size());
+
+ // note that we need to update every shard_info of shards here,
+ // as sv might have been totally re-allocated above
+ for (unsigned i = 0; i < shards.size(); i++) {
+ shards[i].shard_info = &sv[i];
+ }
+
+ // mark newly added shards as dirty
+ for (unsigned i = si_begin; i < si_end; ++i) {
+ shards[i].loaded = true;
+ shards[i].dirty = true;
+ }
+ }
+ dout(20) << __func__ << " fin " << sv << dendl;
+ inline_bl.clear();
+
+ if (sv.empty()) {
+ // no more shards; unspan all previously spanning blobs
+ auto p = spanning_blob_map.begin();
+ while (p != spanning_blob_map.end()) {
+ p->second->id = -1;
+ dout(30) << __func__ << " un-spanning " << *p->second << dendl;
+ p = spanning_blob_map.erase(p);
+ }
+ } else {
+ // identify new spanning blobs
+ dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
+ << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
+ if (spanning_scan_begin < needs_reshard_begin) {
+ fault_range(db, spanning_scan_begin,
+ needs_reshard_begin - spanning_scan_begin);
+ }
+ if (spanning_scan_end > needs_reshard_end) {
+ fault_range(db, needs_reshard_end,
+ spanning_scan_end - needs_reshard_end);
+ }
+ auto sp = sv.begin() + si_begin;
+ auto esp = sv.end();
+ unsigned shard_start = sp->offset;
+ unsigned shard_end;
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ Extent dummy(needs_reshard_begin);
+
+ bool was_too_many_blobs_check = false;
+ auto too_many_blobs_threshold =
+ g_conf()->bluestore_debug_too_many_blobs_threshold;
+ auto& dumped_onodes = onode->c->onode_map.cache->dumped_onodes;
+ decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oid_slot = nullptr;
+ decltype(onode->c->onode_map.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+
+ for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
+ if (e->logical_offset >= needs_reshard_end) {
+ break;
+ }
+ dout(30) << " extent " << *e << dendl;
+ while (e->logical_offset >= shard_end) {
+ shard_start = shard_end;
+ ceph_assert(sp != esp);
+ ++sp;
+ if (sp == esp) {
+ shard_end = OBJECT_MAX_SIZE;
+ } else {
+ shard_end = sp->offset;
+ }
+ dout(30) << __func__ << " shard 0x" << std::hex << shard_start
+ << " to 0x" << shard_end << std::dec << dendl;
+ }
+
+ if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
+ if (!e->blob->is_spanning()) {
+ // We have two options: (1) split the blob into pieces at the
+ // shard boundaries (and adjust extents accordingly), or (2)
+ // mark it spanning. We prefer to cut the blob if we can. Note that
+ // we may have to split it multiple times--potentially at every
+ // shard boundary.
+ bool must_span = false;
+ BlobRef b = e->blob;
+ if (b->can_split()) {
+ uint32_t bstart = e->blob_start();
+ uint32_t bend = e->blob_end();
+ for (const auto& sh : shards) {
+ if (bstart < sh.shard_info->offset &&
+ bend > sh.shard_info->offset) {
+ uint32_t blob_offset = sh.shard_info->offset - bstart;
+ if (b->can_split_at(blob_offset)) {
+ dout(20) << __func__ << " splitting blob, bstart 0x"
+ << std::hex << bstart << " blob_offset 0x"
+ << blob_offset << std::dec << " " << *b << dendl;
+ b = split_blob(b, blob_offset, sh.shard_info->offset);
+ // switch b to the new right-hand side, in case it
+ // *also* has to get split.
+ bstart += blob_offset;
+ onode->c->store->logger->inc(l_bluestore_blob_split);
+ } else {
+ must_span = true;
+ break;
+ }
+ }
+ }
+ } else {
+ must_span = true;
+ }
+ if (must_span) {
+ auto bid = allocate_spanning_blob_id();
+ b->id = bid;
+ spanning_blob_map[b->id] = b;
+ dout(20) << __func__ << " adding spanning " << *b << dendl;
+ if (!was_too_many_blobs_check &&
+ too_many_blobs_threshold &&
+ spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
+
+ was_too_many_blobs_check = true;
+ for (size_t i = 0; i < dumped_onodes.size(); ++i) {
+ if (dumped_onodes[i].first == onode->oid) {
+ oid_slot = &dumped_onodes[i];
+ break;
+ }
+ if (!oldest_slot || (oldest_slot &&
+ dumped_onodes[i].second < oldest_slot->second)) {
+ oldest_slot = &dumped_onodes[i];
+ }
+ }
+ }
+ }
+ }
+ } else {
+ if (e->blob->is_spanning()) {
+ spanning_blob_map.erase(e->blob->id);
+ e->blob->id = -1;
+ dout(30) << __func__ << " un-spanning " << *e->blob << dendl;
+ }
+ }
+ }
+ bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
+ (oid_slot &&
+ (mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
+ if (do_dump) {
+ dout(0) << __func__
+ << " spanning blob count exceeds threshold, "
+ << spanning_blob_map.size() << " spanning blobs"
+ << dendl;
+ _dump_onode<0>(cct, *onode);
+ if (oid_slot) {
+ oid_slot->second = mono_clock::now();
+ } else {
+ ceph_assert(oldest_slot);
+ oldest_slot->first = onode->oid;
+ oldest_slot->second = mono_clock::now();
+ }
+ }
+ }
+
+ clear_needs_reshard();
+}
+
+bool BlueStore::ExtentMap::encode_some(
+ uint32_t offset,
+ uint32_t length,
+ bufferlist& bl,
+ unsigned *pn)
+{
+ Extent dummy(offset);
+ auto start = extent_map.lower_bound(dummy);
+ uint32_t end = offset + length;
+
+ __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+
+ unsigned n = 0;
+ size_t bound = 0;
+ bool must_reshard = false;
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ ceph_assert(p->logical_offset >= offset);
+ p->blob->last_encoded_id = -1;
+ if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << " hit new spanning blob " << *p << dendl;
+ request_reshard(p->blob_start(), p->blob_end());
+ must_reshard = true;
+ }
+ if (!must_reshard) {
+ denc_varint(0, bound); // blobid
+ denc_varint(0, bound); // logical_offset
+ denc_varint(0, bound); // len
+ denc_varint(0, bound); // blob_offset
+
+ p->blob->bound_encode(
+ bound,
+ struct_v,
+ p->blob->shared_blob->get_sbid(),
+ false);
+ }
+ }
+ if (must_reshard) {
+ return true;
+ }
+
+ denc(struct_v, bound);
+ denc_varint(0, bound); // number of extents
+
+ {
+ auto app = bl.get_contiguous_appender(bound);
+ denc(struct_v, app);
+ denc_varint(n, app);
+ if (pn) {
+ *pn = n;
+ }
+
+ n = 0;
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
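+ // Second pass: the low bits of each encoded blobid are flags (spanning,
+ // contiguous, zero blob_offset, same length); fields the flags make
+ // implicit are omitted, and a non-spanning blob body is emitted inline
+ // only the first time it is referenced.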
+ for (auto p = start;
+ p != extent_map.end() && p->logical_offset < end;
+ ++p, ++n) {
+ unsigned blobid;
+ bool include_blob = false;
+ if (p->blob->is_spanning()) {
+ blobid = p->blob->id << BLOBID_SHIFT_BITS;
+ blobid |= BLOBID_FLAG_SPANNING;
+ } else if (p->blob->last_encoded_id < 0) {
+ p->blob->last_encoded_id = n + 1; // so it is always non-zero
+ include_blob = true;
+ blobid = 0; // the decoder will infer the id from n
+ } else {
+ blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
+ }
+ if (p->logical_offset == pos) {
+ blobid |= BLOBID_FLAG_CONTIGUOUS;
+ }
+ if (p->blob_offset == 0) {
+ blobid |= BLOBID_FLAG_ZEROOFFSET;
+ }
+ if (p->length == prev_len) {
+ blobid |= BLOBID_FLAG_SAMELENGTH;
+ } else {
+ prev_len = p->length;
+ }
+ denc_varint(blobid, app);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ denc_varint_lowz(p->logical_offset - pos, app);
+ }
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(p->blob_offset, app);
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(p->length, app);
+ }
+ pos = p->logical_end();
+ if (include_blob) {
+ p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
+ }
+ }
+ }
+ /*derr << __func__ << bl << dendl;
+ derr << __func__ << ":";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ */
+ return false;
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+ /*
+ derr << __func__ << ":";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ */
+
+ ceph_assert(bl.get_num_buffers() <= 1);
+ auto p = bl.front().begin_deep();
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level below.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ uint32_t num;
+ denc_varint(num, p);
+ vector<BlobRef> blobs(num);
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
+ unsigned n = 0;
+
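+ // Mirror of encode_some: flags in the low bits of blobid tell us which
+ // fields were omitted; non-spanning blob bodies appear inline on first use
+ // and later extents reference them by their 1-based encoding index.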
+ while (!p.end()) {
+ Extent *le = new Extent();
+ uint64_t blobid;
+ denc_varint(blobid, p);
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ uint64_t gap;
+ denc_varint_lowz(gap, p);
+ pos += gap;
+ }
+ le->logical_offset = pos;
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(le->blob_offset, p);
+ } else {
+ le->blob_offset = 0;
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(prev_len, p);
+ }
+ le->length = prev_len;
+
+ if (blobid & BLOBID_FLAG_SPANNING) {
+ dout(30) << __func__ << " getting spanning blob "
+ << (blobid >> BLOBID_SHIFT_BITS) << dendl;
+ le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
+ } else {
+ blobid >>= BLOBID_SHIFT_BITS;
+ if (blobid) {
+ le->assign_blob(blobs[blobid - 1]);
+ ceph_assert(le->blob);
+ } else {
+ Blob *b = new Blob();
+ uint64_t sbid = 0;
+ b->decode(onode->c, p, struct_v, &sbid, false);
+ blobs[n] = b;
+ onode->c->open_shared_blob(sbid, b);
+ le->assign_blob(b);
+ }
+ // we build ref_map dynamically for non-spanning blobs
+ le->blob->get_ref(
+ onode->c,
+ le->blob_offset,
+ le->length);
+ }
+ pos += prev_len;
+ ++n;
+ extent_map.insert(*le);
+ }
+
+ ceph_assert(n == num);
+ return num;
+}
+
+void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint((uint32_t)0, p);
+ size_t key_size = 0;
+ denc_varint((uint32_t)0, key_size);
+ p += spanning_blob_map.size() * key_size;
+ for (const auto& i : spanning_blob_map) {
+ i.second->bound_encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::encode_spanning_blobs(
+ bufferlist::contiguous_appender& p)
+{
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ __u8 struct_v = 2;
+
+ denc(struct_v, p);
+ denc_varint(spanning_blob_map.size(), p);
+ for (auto& i : spanning_blob_map) {
+ denc_varint(i.second->id, p);
+ i.second->encode(p, struct_v, i.second->shared_blob->get_sbid(), true);
+ }
+}
+
+void BlueStore::ExtentMap::decode_spanning_blobs(
+ bufferptr::const_iterator& p)
+{
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ unsigned n;
+ denc_varint(n, p);
+ while (n--) {
+ BlobRef b(new Blob());
+ denc_varint(b->id, p);
+ spanning_blob_map[b->id] = b;
+ uint64_t sbid = 0;
+ b->decode(onode->c, p, struct_v, &sbid, true);
+ onode->c->open_shared_blob(sbid, b);
+ }
+}
+
+void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
+{
+ shards.resize(onode->onode.extent_map_shards.size());
+ unsigned i = 0;
+ for (auto &s : onode->onode.extent_map_shards) {
+ shards[i].shard_info = &s;
+ shards[i].loaded = loaded;
+ shards[i].dirty = dirty;
+ ++i;
+ }
+}
+
+void BlueStore::ExtentMap::fault_range(
+ KeyValueDB *db,
+ uint32_t offset,
+ uint32_t length)
+{
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto start = seek_shard(offset);
+ auto last = seek_shard(offset + length);
+
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ string key;
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ dout(30) << __func__ << " opening shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << dendl;
+ bufferlist v;
+ generate_extent_shard_key_and_apply(
+ onode->key, p->shard_info->offset, &key,
+ [&](const string& final_key) {
+ int r = db->get(PREFIX_OBJ, final_key, &v);
+ if (r < 0) {
+ derr << __func__ << " missing shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " for " << onode->oid
+ << dendl;
+ ceph_assert(r >= 0);
+ }
+ }
+ );
+ p->extents = decode_some(v);
+ p->loaded = true;
+ dout(20) << __func__ << " open shard 0x" << std::hex
+ << p->shard_info->offset
+ << " for range 0x" << offset << "~" << length << std::dec
+ << " (" << v.length() << " bytes)" << dendl;
+ ceph_assert(p->dirty == false);
+ ceph_assert(v.length() == p->shard_info->bytes);
+ onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
+ } else {
+ onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
+ }
+ ++start;
+ }
+}
+
+void BlueStore::ExtentMap::dirty_range(
+ uint32_t offset,
+ uint32_t length)
+{
+ dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ if (shards.empty()) {
+ dout(20) << __func__ << " mark inline shard dirty" << dendl;
+ inline_bl.clear();
+ return;
+ }
+ auto start = seek_shard(offset);
+ if (length == 0) {
+ length = 1;
+ }
+ auto last = seek_shard(offset + length - 1);
+ if (start < 0)
+ return;
+
+ ceph_assert(last >= start);
+ while (start <= last) {
+ ceph_assert((size_t)start < shards.size());
+ auto p = &shards[start];
+ if (!p->loaded) {
+ derr << __func__ << "on write 0x" << std::hex << offset
+ << "~" << length << " shard 0x" << p->shard_info->offset
+ << std::dec << " is not loaded, can't mark dirty" << dendl;
+ ceph_abort_msg("can't mark unloaded shard dirty");
+ }
+ if (!p->dirty) {
+ dout(20) << __func__ << " mark shard 0x" << std::hex
+ << p->shard_info->offset << std::dec << " dirty" << dendl;
+ p->dirty = true;
+ }
+ ++start;
+ }
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ return extent_map.find(dummy);
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset)
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
+ uint64_t offset) const
+{
+ Extent dummy(offset);
+ auto fp = extent_map.lower_bound(dummy);
+ if (fp != extent_map.begin()) {
+ --fp;
+ if (fp->logical_end() <= offset) {
+ ++fp;
+ }
+ }
+ return fp;
+}
+
+bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
+{
+ auto fp = seek_lextent(offset);
+ if (fp == extent_map.end() || fp->logical_offset >= offset + length) {
+ return false;
+ }
+ return true;
+}
+
+int BlueStore::ExtentMap::compress_extent_map(
+ uint64_t offset,
+ uint64_t length)
+{
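+ // Merge adjacent lextents that are contiguous both logically and within
+ // the same blob, but never across a shard boundary; returns the number of
+ // extents removed.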
+ if (extent_map.empty())
+ return 0;
+ int removed = 0;
+ auto p = seek_lextent(offset);
+ if (p != extent_map.begin()) {
+ --p; // start to the left of offset
+ }
+ // the caller should have just written to this region
+ ceph_assert(p != extent_map.end());
+
+ // identify the *next* shard
+ auto pshard = shards.begin();
+ while (pshard != shards.end() &&
+ p->logical_offset >= pshard->shard_info->offset) {
+ ++pshard;
+ }
+ uint64_t shard_end;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+
+ auto n = p;
+ for (++n; n != extent_map.end(); p = n++) {
+ if (n->logical_offset > offset + length) {
+ break; // stop after end
+ }
+ while (n != extent_map.end() &&
+ p->logical_end() == n->logical_offset &&
+ p->blob == n->blob &&
+ p->blob_offset + p->length == n->blob_offset &&
+ n->logical_offset < shard_end) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " next shard 0x" << shard_end << std::dec
+ << " merging " << *p << " and " << *n << dendl;
+ p->length += n->length;
+ rm(n++);
+ ++removed;
+ }
+ if (n == extent_map.end()) {
+ break;
+ }
+ if (n->logical_offset >= shard_end) {
+ ceph_assert(pshard != shards.end());
+ ++pshard;
+ if (pshard != shards.end()) {
+ shard_end = pshard->shard_info->offset;
+ } else {
+ shard_end = OBJECT_MAX_SIZE;
+ }
+ }
+ }
+ if (removed) {
+ onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
+ }
+ return removed;
+}
+
+void BlueStore::ExtentMap::punch_hole(
+ CollectionRef &c,
+ uint64_t offset,
+ uint64_t length,
+ old_extent_map_t *old_extents)
+{
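+ // Remove [offset, offset+length) from the lextent map, splitting or
+ // trimming boundary extents as needed; every dereferenced piece is
+ // recorded in old_extents so the caller can release space later.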
+ auto p = seek_lextent(offset);
+ uint64_t end = offset + length;
+ while (p != extent_map.end()) {
+ if (p->logical_offset >= end) {
+ break;
+ }
+ if (p->logical_offset < offset) {
+ if (p->logical_end() > end) {
+ // split and deref middle
+ uint64_t front = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front,
+ length, p->blob);
+ old_extents->push_back(*oe);
+ add(end,
+ p->blob_offset + front + length,
+ p->length - front - length,
+ p->blob);
+ p->length = front;
+ break;
+ } else {
+ // deref tail
+ ceph_assert(p->logical_end() > offset); // else seek_lextent bug
+ uint64_t keep = offset - p->logical_offset;
+ OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+ p->length = keep;
+ ++p;
+ continue;
+ }
+ }
+ if (p->logical_offset + p->length <= end) {
+ // deref whole lextent
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length, p->blob);
+ old_extents->push_back(*oe);
+ rm(p++);
+ continue;
+ }
+ // deref head
+ uint64_t keep = p->logical_end() - end;
+ OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+ p->length - keep, p->blob);
+ old_extents->push_back(*oe);
+
+ add(end, p->blob_offset + p->length - keep, keep, p->blob);
+ rm(p);
+ break;
+ }
+}
+
+BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
+ CollectionRef &c,
+ uint64_t logical_offset,
+ uint64_t blob_offset, uint64_t length, BlobRef b,
+ old_extent_map_t *old_extents)
+{
+ // We need to have completely initialized Blob to increment its ref counters.
+ ceph_assert(b->get_blob().get_logical_length() != 0);
+
+ // Do get_ref prior to punch_hole to prevent putting a reused blob into
+ // the old_extents list if we overwrite the blob completely.
+ // This might happen during WAL overwrite.
+ b->get_ref(onode->c, blob_offset, length);
+
+ if (old_extents) {
+ punch_hole(c, logical_offset, length, old_extents);
+ }
+
+ Extent *le = new Extent(logical_offset, blob_offset, length, b);
+ extent_map.insert(*le);
+ if (spans_shard(logical_offset, length)) {
+ request_reshard(logical_offset, logical_offset + length);
+ }
+ return le;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+ BlobRef lb,
+ uint32_t blob_offset,
+ uint32_t pos)
+{
+ uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+ dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+ << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+ << dendl;
+ BlobRef rb = onode->c->new_blob();
+ lb->split(onode->c, blob_offset, rb.get());
+
+ for (auto ep = seek_lextent(pos);
+ ep != extent_map.end() && ep->logical_offset < end_pos;
+ ++ep) {
+ if (ep->blob != lb) {
+ continue;
+ }
+ if (ep->logical_offset < pos) {
+ // split extent
+ size_t left = pos - ep->logical_offset;
+ Extent *ne = new Extent(pos, 0, ep->length - left, rb);
+ extent_map.insert(*ne);
+ ep->length = left;
+ dout(30) << __func__ << " split " << *ep << dendl;
+ dout(30) << __func__ << " to " << *ne << dendl;
+ } else {
+ // switch blob
+ ceph_assert(ep->blob_offset >= blob_offset);
+
+ ep->blob = rb;
+ ep->blob_offset -= blob_offset;
+ dout(30) << __func__ << " adjusted " << *ep << dendl;
+ }
+ }
+ return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+
+void BlueStore::Onode::get() {
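+ // Once a second reference appears, pin the onode in its cache shard so it
+ // cannot be trimmed while in use.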
+ if (++nref >= 2 && !pinned) {
+ OnodeCacheShard* ocs = c->get_onode_cache();
+ ocs->lock.lock();
+ // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
+ while (ocs != c->get_onode_cache()) {
+ ocs->lock.unlock();
+ ocs = c->get_onode_cache();
+ ocs->lock.lock();
+ }
+ bool was_pinned = pinned;
+ pinned = nref >= 2;
+ bool r = !was_pinned && pinned;
+ if (cached && r) {
+ ocs->_pin(this);
+ }
+ ocs->lock.unlock();
+ }
+}
+void BlueStore::Onode::put() {
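+ // put_nref keeps the onode alive while another thread may still be running
+ // inside put(); when the reference count drops back to one (typically just
+ // the onode_map's reference) the onode is unpinned, or dropped from the
+ // cache entirely if it no longer exists on disk.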
+ ++put_nref;
+ int n = --nref;
+ if (n == 1) {
+ OnodeCacheShard* ocs = c->get_onode_cache();
+ ocs->lock.lock();
+ // It is possible that, while we were waiting, split_cache moved us to a different OnodeCacheShard.
+ while (ocs != c->get_onode_cache()) {
+ ocs->lock.unlock();
+ ocs = c->get_onode_cache();
+ ocs->lock.lock();
+ }
+ bool need_unpin = pinned;
+ pinned = pinned && nref >= 2;
+ need_unpin = need_unpin && !pinned;
+ if (cached && need_unpin) {
+ if (exists) {
+ ocs->_unpin(this);
+ } else {
+ ocs->_unpin_and_rm(this);
+ // remove will also decrement nref
+ c->onode_map._remove(oid);
+ }
+ }
+ ocs->lock.unlock();
+ }
+ auto pn = --put_nref;
+ if (nref == 0 && pn == 0) {
+ delete this;
+ }
+}
+
+BlueStore::Onode* BlueStore::Onode::decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& v)
+{
+ Onode* on = new Onode(c.get(), oid, key);
+ on->exists = true;
+ auto p = v.front().begin_deep();
+ on->onode.decode(p);
+ for (auto& i : on->onode.attrs) {
+ i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+
+ // initialize extent_map
+ on->extent_map.decode_spanning_blobs(p);
+ if (on->onode.extent_map_shards.empty()) {
+ denc(on->extent_map.inline_bl, p);
+ on->extent_map.decode_some(on->extent_map.inline_bl);
+ on->extent_map.inline_bl.reassign_to_mempool(
+ mempool::mempool_bluestore_cache_data);
+ }
+ else {
+ on->extent_map.init_shards(false, false);
+ }
+ return on;
+}
+
+void BlueStore::Onode::flush()
+{
+ if (flushing_count.load()) {
+ ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
+ waiting_count++;
+ std::unique_lock l(flush_lock);
+ while (flushing_count.load()) {
+ flush_cond.wait(l);
+ }
+ waiting_count--;
+ }
+ ldout(c->store->cct, 20) << __func__ << " done" << dendl;
+}
+
+void BlueStore::Onode::dump(Formatter* f) const
+{
+ onode.dump(f);
+ extent_map.dump(f);
+}
+
+const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
+{
+ if (bluestore_onode_t::is_pgmeta_omap(flags)) {
+ return PREFIX_PGMETA_OMAP;
+ }
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ return PREFIX_PERPG_OMAP;
+ }
+ if (bluestore_onode_t::is_perpool_omap(flags)) {
+ return PREFIX_PERPOOL_OMAP;
+ }
+ return PREFIX_OMAP;
+}
+
+// '-' < '.' < '~'
+void BlueStore::Onode::calc_omap_header(
+ uint8_t flags,
+ const Onode* o,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('-');
+}
+
+void BlueStore::Onode::calc_omap_key(uint8_t flags,
+ const Onode* o,
+ const std::string& key,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('.');
+ out->append(key);
+}
+
+void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
+{
+ if (!onode.is_pgmeta_omap()) {
+ if (onode.is_perpg_omap()) {
+ _key_encode_u64(c->pool(), out);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
+ } else if (onode.is_perpool_omap()) {
+ _key_encode_u64(c->pool(), out);
+ }
+ }
+ _key_encode_u64(onode.nid, out);
+ out->append(old.c_str() + out->length(), old.size() - out->length());
+}
+
+void BlueStore::Onode::calc_omap_tail(
+ uint8_t flags,
+ const Onode* o,
+ std::string* out)
+{
+ if (!bluestore_onode_t::is_pgmeta_omap(flags)) {
+ if (bluestore_onode_t::is_perpg_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+ } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+ _key_encode_u64(o->c->pool(), out);
+ }
+ }
+ _key_encode_u64(o->onode.nid, out);
+ out->push_back('~');
+}
+
+void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
+{
+ size_t pos = sizeof(uint64_t) + 1;
+ if (!onode.is_pgmeta_omap()) {
+ if (onode.is_perpg_omap()) {
+ pos += sizeof(uint64_t) + sizeof(uint32_t);
+ } else if (onode.is_perpool_omap()) {
+ pos += sizeof(uint64_t);
+ }
+ }
+ *user_key = key.substr(pos);
+}
+
+// =======================================================
+// WriteContext
+
+/// Checks for writes to the same pextent within a blob
+bool BlueStore::WriteContext::has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size)
+{
+ ceph_assert((loffs % min_alloc_size) == 0);
+ ceph_assert((loffs_end % min_alloc_size) == 0);
+ for (auto w : writes) {
+ if (b == w.b) {
+ auto loffs2 = p2align(w.logical_offset, min_alloc_size);
+ auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
+ if ((loffs <= loffs2 && loffs_end > loffs2) ||
+ (loffs >= loffs2 && loffs < loffs2_end)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// =======================================================
+
+// DeferredBatch
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
+#undef dout_context
+#define dout_context cct
+
+void BlueStore::DeferredBatch::prepare_write(
+ CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ bufferlist::const_iterator& blp)
+{
+ _discard(cct, offset, length);
+ auto i = iomap.insert(make_pair(offset, deferred_io()));
+ ceph_assert(i.second); // this should be a new insertion
+ i.first->second.seq = seq;
+ blp.copy(length, i.first->second.bl);
+ i.first->second.bl.reassign_to_mempool(
+ mempool::mempool_bluestore_writing_deferred);
+ dout(20) << __func__ << " seq " << seq
+ << " 0x" << std::hex << offset << "~" << length
+ << " crc " << i.first->second.bl.crc32c(-1)
+ << std::dec << dendl;
+ seq_bytes[seq] += length;
+#ifdef DEBUG_DEFERRED
+ _audit(cct);
+#endif
+}
+
+void BlueStore::DeferredBatch::_discard(
+ CephContext *cct, uint64_t offset, uint64_t length)
+{
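+ // Drop any previously queued deferred I/O overlapping [offset,
+ // offset+length): overlapped ranges are trimmed to their surviving head
+ // and/or tail fragments and the per-seq byte accounting is adjusted.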
+ generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ auto p = iomap.lower_bound(offset);
+ if (p != iomap.begin()) {
+ --p;
+ auto end = p->first + p->second.bl.length();
+ if (end > offset) {
+ bufferlist head;
+ head.substr_of(p->second.bl, 0, offset - p->first);
+ dout(20) << __func__ << " keep head " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << head.length() << std::dec << dendl;
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ if (end > offset + length) {
+ bufferlist tail;
+ tail.substr_of(p->second.bl, offset + length - p->first,
+ end - (offset + length));
+ dout(20) << __func__ << " keep tail " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " -> 0x" << tail.length() << std::dec << dendl;
+ auto &n = iomap[offset + length];
+ n.bl.swap(tail);
+ n.seq = p->second.seq;
+ i->second -= length;
+ } else {
+ i->second -= end - offset;
+ }
+ ceph_assert(i->second >= 0);
+ p->second.bl.swap(head);
+ }
+ ++p;
+ }
+ while (p != iomap.end()) {
+ if (p->first >= offset + length) {
+ break;
+ }
+ auto i = seq_bytes.find(p->second.seq);
+ ceph_assert(i != seq_bytes.end());
+ auto end = p->first + p->second.bl.length();
+ if (end > offset + length) {
+ unsigned drop_front = offset + length - p->first;
+ unsigned keep_tail = end - (offset + length);
+ dout(20) << __func__ << " truncate front " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
+ << " to 0x" << (offset + length) << "~" << keep_tail
+ << std::dec << dendl;
+ auto &s = iomap[offset + length];
+ s.seq = p->second.seq;
+ s.bl.substr_of(p->second.bl, drop_front, keep_tail);
+ i->second -= drop_front;
+ } else {
+ dout(20) << __func__ << " drop " << p->second.seq
+ << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+ << std::dec << dendl;
+ i->second -= p->second.bl.length();
+ }
+ ceph_assert(i->second >= 0);
+ p = iomap.erase(p);
+ }
+}
+
+void BlueStore::DeferredBatch::_audit(CephContext *cct)
+{
+ map<uint64_t,int> sb;
+ for (auto p : seq_bytes) {
+ sb[p.first] = 0; // make sure we have the same set of keys
+ }
+ uint64_t pos = 0;
+ for (auto& p : iomap) {
+ ceph_assert(p.first >= pos);
+ sb[p.second.seq] += p.second.bl.length();
+ pos = p.first + p.second.bl.length();
+ }
+ ceph_assert(sb == seq_bytes);
+}
+
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
+
+BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
+ : CollectionImpl(store_->cct, cid),
+ store(store_),
+ cache(bc),
+ exists(true),
+ onode_map(oc),
+ commit_queue(nullptr)
+{
+}
+
+bool BlueStore::Collection::flush_commit(Context *c)
+{
+ return osr->flush_commit(c);
+}
+
+void BlueStore::Collection::flush()
+{
+ osr->flush();
+}
+
+void BlueStore::Collection::flush_all_but_last()
+{
+ osr->flush_all_but_last();
+}
+
+void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
+{
+ ceph_assert(!b->shared_blob);
+ const bluestore_blob_t& blob = b->get_blob();
+ if (!blob.is_shared()) {
+ b->shared_blob = new SharedBlob(this);
+ return;
+ }
+
+ b->shared_blob = shared_blob_set.lookup(sbid);
+ if (b->shared_blob) {
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " had " << *b->shared_blob << dendl;
+ } else {
+ b->shared_blob = new SharedBlob(sbid, this);
+ shared_blob_set.add(this, b->shared_blob.get());
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " opened " << *b->shared_blob
+ << dendl;
+ }
+}
+
+void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
+{
+ if (!sb->is_loaded()) {
+
+ bufferlist v;
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
+ if (r < 0) {
+ lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " not found at key "
+ << pretty_binary_string(key) << dendl;
+ ceph_abort_msg("uh oh, missing shared_blob");
+ }
+
+ sb->loaded = true;
+ sb->persistent = new bluestore_shared_blob_t(sbid);
+ auto p = v.cbegin();
+ decode(*(sb->persistent), p);
+ ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+ << std::dec << " loaded shared_blob " << *sb << dendl;
+ }
+}
+
+void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
+{
+ ldout(store->cct, 10) << __func__ << " " << *b << dendl;
+ ceph_assert(!b->shared_blob->is_loaded());
+
+ // update blob
+ bluestore_blob_t& blob = b->dirty_blob();
+ blob.set_flag(bluestore_blob_t::FLAG_SHARED);
+
+ // update shared blob
+ b->shared_blob->loaded = true;
+ b->shared_blob->persistent = new bluestore_shared_blob_t(sbid);
+ shared_blob_set.add(this, b->shared_blob.get());
+ for (auto p : blob.get_extents()) {
+ if (p.is_valid()) {
+ b->shared_blob->get_ref(
+ p.offset,
+ p.length);
+ }
+ }
+ ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
+}
+
+uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
+{
+ ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
+ ceph_assert(sb->is_loaded());
+
+ uint64_t sbid = sb->get_sbid();
+ shared_blob_set.remove(sb);
+ sb->loaded = false;
+ delete sb->persistent;
+ sb->sbid_unloaded = 0;
+ ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
+ return sbid;
+}
+
+BlueStore::OnodeRef BlueStore::Collection::get_onode(
+ const ghobject_t& oid,
+ bool create,
+ bool is_createop)
+{
+ ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!oid.match(cnode.bits, pgid.ps())) {
+ lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+ << pgid << " bits " << cnode.bits << dendl;
+ ceph_abort();
+ }
+ }
+
+ OnodeRef o = onode_map.lookup(oid);
+ if (o)
+ return o;
+
+ string key;
+ get_object_key(store->cct, oid, &key);
+
+ ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+ << pretty_binary_string(key) << dendl;
+
+ bufferlist v;
+ int r = -ENOENT;
+ Onode *on;
+ if (!is_createop) {
+ r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
+ ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+ }
+ if (v.length() == 0) {
+ ceph_assert(r == -ENOENT);
+ if (!create)
+ return OnodeRef();
+
+ // new object, new onode
+ on = new Onode(this, oid, key);
+ } else {
+ // loaded
+ ceph_assert(r >= 0);
+ on = Onode::decode(this, oid, key, v);
+ }
+ o.reset(on);
+ return onode_map.add(oid, o);
+}
+
+void BlueStore::Collection::split_cache(
+ Collection *dest)
+{
+ ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
+
+ auto *ocache = get_onode_cache();
+ auto *ocache_dest = dest->get_onode_cache();
+
+ // lock cache shards
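+ // (std::lock acquires all four locks together without risking deadlock,
+ // then the lock_guards adopt them for scoped release)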
+ std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
+ std::lock_guard l(ocache->lock, std::adopt_lock);
+ std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
+ std::lock_guard l3(cache->lock, std::adopt_lock);
+ std::lock_guard l4(dest->cache->lock, std::adopt_lock);
+
+ int destbits = dest->cnode.bits;
+ spg_t destpg;
+ bool is_pg = dest->cid.is_pg(&destpg);
+ ceph_assert(is_pg);
+
+ auto p = onode_map.onode_map.begin();
+ while (p != onode_map.onode_map.end()) {
+ OnodeRef o = p->second;
+ if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
+ // onode does not belong to this child
+ ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
+ << dendl;
+ ++p;
+ } else {
+ ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
+ << dendl;
+
+ // ensuring that nref is always >= 2 and hence onode is pinned and
+ // physically out of cache during the transition
+ OnodeRef o_pin = o;
+ ceph_assert(o->pinned);
+
+ p = onode_map.onode_map.erase(p);
+ dest->onode_map.onode_map[o->oid] = o;
+ if (o->cached) {
+ get_onode_cache()->move_pinned(dest->get_onode_cache(), o.get());
+ }
+ o->c = dest;
+
+ // move over shared blobs and buffers. cover shared blobs from
+ // both extent map and spanning blob map (the full extent map
+ // may not be faulted in)
+ vector<SharedBlob*> sbvec;
+ for (auto& e : o->extent_map.extent_map) {
+ sbvec.push_back(e.blob->shared_blob.get());
+ }
+ for (auto& b : o->extent_map.spanning_blob_map) {
+ sbvec.push_back(b.second->shared_blob.get());
+ }
+ for (auto sb : sbvec) {
+ if (sb->coll == dest) {
+ ldout(store->cct, 20) << __func__ << " already moved " << *sb
+ << dendl;
+ continue;
+ }
+ ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl;
+ if (sb->get_sbid()) {
+ ldout(store->cct, 20) << __func__
+ << " moving registration " << *sb << dendl;
+ shared_blob_set.remove(sb);
+ dest->shared_blob_set.add(dest, sb);
+ }
+ sb->coll = dest;
+ if (dest->cache != cache) {
+ for (auto& i : sb->bc.buffer_map) {
+ if (!i.second->is_writing()) {
+ ldout(store->cct, 20) << __func__ << " moving " << *i.second
+ << dendl;
+ dest->cache->_move(cache, i.second.get());
+ }
+ }
+ }
+ }
+ }
+ }
+ dest->cache->_trim();
+}
+
+// =======================================================
+
+// MempoolThread
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
+#undef dout_context
+#define dout_context store->cct
+
+void *BlueStore::MempoolThread::entry()
+{
+ std::unique_lock l{lock};
+
+ uint32_t prev_config_change = store->config_changed.load();
+ uint64_t base = store->osd_memory_base;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+ uint64_t target = store->osd_memory_target;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+
+ // When setting the maximum amount of memory to use for cache, first
+ // assume some base amount of memory for the OSD and then fudge in
+ // some overhead for fragmentation that scales with cache usage.
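+ // Illustrative numbers only: with target = 4 GiB, base = 768 MiB and
+ // fragmentation = 0.15, ltarget = 0.85 * 4 GiB ~ 3.4 GiB; as long as
+ // base + min is below that, max ~ 3.4 GiB - 768 MiB ~ 2.65 GiB.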
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ binned_kv_cache = store->db->get_priority_cache();
+ binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
+ if (store->cache_autotune && binned_kv_cache != nullptr) {
+ pcm = std::make_shared<PriorityCache::Manager>(
+ store->cct, min, max, target, true, "bluestore-pricache");
+ pcm->insert("kv", binned_kv_cache, true);
+ pcm->insert("meta", meta_cache, true);
+ pcm->insert("data", data_cache, true);
+ if (binned_kv_onode_cache != nullptr) {
+ pcm->insert("kv_onode", binned_kv_onode_cache, true);
+ }
+ }
+
+ utime_t next_balance = ceph_clock_now();
+ utime_t next_resize = ceph_clock_now();
+ utime_t next_deferred_force_submit = ceph_clock_now();
+ utime_t alloc_stats_dump_clock = ceph_clock_now();
+
+ bool interval_stats_trim = false;
+ while (!stop) {
+ // Update pcm cache settings if related configuration was changed
+ uint32_t cur_config_change = store->config_changed.load();
+ if (cur_config_change != prev_config_change) {
+ _update_cache_settings();
+ prev_config_change = cur_config_change;
+ }
+
+ // Before we trim, check and see if it's time to rebalance/resize.
+ double autotune_interval = store->cache_autotune_interval;
+ double resize_interval = store->osd_memory_cache_resize_interval;
+ double max_defer_interval = store->max_defer_interval;
+
+ double alloc_stats_dump_interval =
+ store->cct->_conf->bluestore_alloc_stats_dump_interval;
+
+ if (alloc_stats_dump_interval > 0 &&
+ alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
+ store->_record_allocation_stats();
+ alloc_stats_dump_clock = ceph_clock_now();
+ }
+ if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+ _adjust_cache_settings();
+
+ // Log events at 5 instead of 20 when balance happens.
+ interval_stats_trim = true;
+
+ if (pcm != nullptr) {
+ pcm->balance();
+ }
+
+ next_balance = ceph_clock_now();
+ next_balance += autotune_interval;
+ }
+ if (resize_interval > 0 && next_resize < ceph_clock_now()) {
+ if (ceph_using_tcmalloc() && pcm != nullptr) {
+ pcm->tune_memory();
+ }
+ next_resize = ceph_clock_now();
+ next_resize += resize_interval;
+ }
+
+ if (max_defer_interval > 0 &&
+ next_deferred_force_submit < ceph_clock_now()) {
+ if (store->get_deferred_last_submitted() + max_defer_interval <
+ ceph_clock_now()) {
+ store->deferred_try_submit();
+ }
+ next_deferred_force_submit = ceph_clock_now();
+ next_deferred_force_submit += max_defer_interval/3;
+ }
+
+ // Now resize the shards
+ _resize_shards(interval_stats_trim);
+ interval_stats_trim = false;
+
+ store->_update_cache_logger();
+ auto wait = ceph::make_timespan(
+ store->cct->_conf->bluestore_cache_trim_interval);
+ cond.wait_for(l, wait);
+ }
+ // do final dump
+ store->_record_allocation_stats();
+ stop = false;
+ pcm = nullptr;
+ return NULL;
+}
+
+void BlueStore::MempoolThread::_adjust_cache_settings()
+{
+ if (binned_kv_cache != nullptr) {
+ binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+ }
+ if (binned_kv_onode_cache != nullptr) {
+ binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+ }
+ meta_cache->set_cache_ratio(store->cache_meta_ratio);
+ data_cache->set_cache_ratio(store->cache_data_ratio);
+}
+
+void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
+{
+ size_t onode_shards = store->onode_cache_shards.size();
+ size_t buffer_shards = store->buffer_cache_shards.size();
+ int64_t kv_used = store->db->get_cache_usage();
+ int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
+ int64_t meta_used = meta_cache->_get_used_bytes();
+ int64_t data_used = data_cache->_get_used_bytes();
+
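+ // Start from the static ratio-based allocations; these are overridden
+ // below when the priority cache manager is active.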
+ uint64_t cache_size = store->cache_size;
+ int64_t kv_alloc =
+ static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+ int64_t kv_onode_alloc =
+ static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
+ int64_t meta_alloc =
+ static_cast<int64_t>(store->cache_meta_ratio * cache_size);
+ int64_t data_alloc =
+ static_cast<int64_t>(store->cache_data_ratio * cache_size);
+
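+ // With the priority cache manager active, use its tuned total and each
+ // cache's committed size instead of the static ratios.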
+ if (pcm != nullptr && binned_kv_cache != nullptr) {
+ cache_size = pcm->get_tuned_mem();
+ kv_alloc = binned_kv_cache->get_committed_size();
+ meta_alloc = meta_cache->get_committed_size();
+ data_alloc = data_cache->get_committed_size();
+ if (binned_kv_onode_cache != nullptr) {
+ kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
+ }
+ }
+
+ if (interval_stats) {
+ dout(5) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ } else {
+ dout(20) << __func__ << " cache_size: " << cache_size
+ << " kv_alloc: " << kv_alloc
+ << " kv_used: " << kv_used
+ << " kv_onode_alloc: " << kv_onode_alloc
+ << " kv_onode_used: " << kv_onode_used
+ << " meta_alloc: " << meta_alloc
+ << " meta_used: " << meta_used
+ << " data_alloc: " << data_alloc
+ << " data_used: " << data_used << dendl;
+ }
+
+ uint64_t max_shard_onodes = static_cast<uint64_t>(
+ (meta_alloc / (double) onode_shards) / meta_cache->get_bytes_per_onode());
+ uint64_t max_shard_buffer = static_cast<uint64_t>(data_alloc / buffer_shards);
+
+ dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+ << " max_shard_buffer: " << max_shard_buffer << dendl;
+
+ for (auto i : store->onode_cache_shards) {
+ i->set_max(max_shard_onodes);
+ }
+ for (auto i : store->buffer_cache_shards) {
+ i->set_max(max_shard_buffer);
+ }
+}
+
+void BlueStore::MempoolThread::_update_cache_settings()
+{
+ // Nothing to do if pcm is not used.
+ if (pcm == nullptr) {
+ return;
+ }
+
+ uint64_t target = store->osd_memory_target;
+ uint64_t base = store->osd_memory_base;
+ uint64_t min = store->osd_memory_cache_min;
+ uint64_t max = min;
+ double fragmentation = store->osd_memory_expected_fragmentation;
+
+ uint64_t ltarget = (1.0 - fragmentation) * target;
+ if (ltarget > base + min) {
+ max = ltarget - base;
+ }
+
+ // set pcm cache levels
+ pcm->set_target_memory(target);
+ pcm->set_min_memory(min);
+ pcm->set_max_memory(max);
+
+ dout(5) << __func__ << " updated pcm target: " << target
+ << " pcm min: " << min
+ << " pcm max: " << max
+ << dendl;
+}
+
+// =======================================================
+
+// OmapIteratorImpl
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
+
+BlueStore::OmapIteratorImpl::OmapIteratorImpl(
+ CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+ : c(c), o(o), it(it)
+{
+ std::shared_lock l(c->lock);
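+ // head and tail bound this object's omap key range; position the
+ // iterator at the first omap key.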
+ if (o->onode.has_omap()) {
+ o->get_omap_key(string(), &head);
+ o->get_omap_tail(&tail);
+ it->lower_bound(head);
+ }
+}
+
+string BlueStore::OmapIteratorImpl::_stringify() const
+{
+ stringstream s;
+ s << " omap_iterator(cid = " << c->cid
+ <<", oid = " << o->oid << ")";
+ return s.str();
+}
+
+int BlueStore::OmapIteratorImpl::seek_to_first()
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->lower_bound(head);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_seek_to_first_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ o->get_omap_key(after, &key);
+ ldout(c->store->cct,20) << __func__ << " after " << after << " key "
+ << pretty_binary_string(key) << dendl;
+ it->upper_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_upper_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", after = " + after +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ string key;
+ o->get_omap_key(to, &key);
+ ldout(c->store->cct,20) << __func__ << " to " << to << " key "
+ << pretty_binary_string(key) << dendl;
+ it->lower_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ c->store->log_latency_fn(
+ __func__,
+ l_bluestore_omap_lower_bound_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age,
+ [&] (const ceph::timespan& lat) {
+ return ", to = " + to +
+ _stringify();
+ }
+ );
+ return 0;
+}
+
+bool BlueStore::OmapIteratorImpl::valid()
+{
+ std::shared_lock l(c->lock);
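+ // the iterator is only considered valid while its key stays below this
+ // object's omap tail key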
+ bool r = o->onode.has_omap() && it && it->valid() &&
+ it->raw_key().second < tail;
+ if (it && it->valid()) {
+ ldout(c->store->cct,20) << __func__ << " is at "
+ << pretty_binary_string(it->raw_key().second)
+ << dendl;
+ }
+ return r;
+}
+
+int BlueStore::OmapIteratorImpl::next()
+{
+ int r = -1;
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ if (o->onode.has_omap()) {
+ it->next();
+ r = 0;
+ }
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_next_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ return r;
+}
+
+string BlueStore::OmapIteratorImpl::key()
+{
+ std::shared_lock l(c->lock);
+ ceph_assert(it->valid());
+ string db_key = it->raw_key().second;
+ string user_key;
+ o->decode_omap_key(db_key, &user_key);
+
+ return user_key;
+}
+
+bufferlist BlueStore::OmapIteratorImpl::value()
+{
+ std::shared_lock l(c->lock);
+ ceph_assert(it->valid());
+ return it->value();
+}
+
+
+// =====================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << path << ") "
+#undef dout_context
+#define dout_context cct
+
+
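+// C-style callbacks handed to the BlockDevice: priv is the owning BlueStore,
+// priv2 is the per-operation context (an AioContext or the discarded
+// interval set).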
+static void aio_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ BlueStore::AioContext *c = static_cast<BlueStore::AioContext*>(priv2);
+ c->aio_finish(store);
+}
+
+static void discard_cb(void *priv, void *priv2)
+{
+ BlueStore *store = static_cast<BlueStore*>(priv);
+ interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
+ store->handle_discard(*tmp);
+}
+
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(shared_alloc.a);
+ shared_alloc.a->release(to_release);
+}
+
+BlueStore::BlueStore(CephContext *cct, const string& path)
+ : BlueStore(cct, path, 0) {}
+
+BlueStore::BlueStore(CephContext *cct,
+ const string& path,
+ uint64_t _min_alloc_size)
+ : ObjectStore(cct, path),
+ throttle(cct),
+ finisher(cct, "commit_finisher", "cfin"),
+ kv_sync_thread(this),
+ kv_finalize_thread(this),
+ zoned_cleaner_thread(this),
+ min_alloc_size(_min_alloc_size),
+ min_alloc_size_order(ctz(_min_alloc_size)),
+ mempool_thread(this)
+{
+ _init_logger();
+ cct->_conf.add_observer(this);
+ set_cache_shards(1);
+}
+
+BlueStore::~BlueStore()
+{
+ cct->_conf.remove_observer(this);
+ _shutdown_logger();
+ ceph_assert(!mounted);
+ ceph_assert(db == NULL);
+ ceph_assert(bluefs == NULL);
+ ceph_assert(fsid_fd < 0);
+ ceph_assert(path_fd < 0);
+ for (auto i : onode_cache_shards) {
+ delete i;
+ }
+ for (auto i : buffer_cache_shards) {
+ delete i;
+ }
+ onode_cache_shards.clear();
+ buffer_cache_shards.clear();
+}
+
+const char **BlueStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "bluestore_csum_type",
+ "bluestore_compression_mode",
+ "bluestore_compression_algorithm",
+ "bluestore_compression_min_blob_size",
+ "bluestore_compression_min_blob_size_ssd",
+ "bluestore_compression_min_blob_size_hdd",
+ "bluestore_compression_max_blob_size",
+ "bluestore_compression_max_blob_size_ssd",
+ "bluestore_compression_max_blob_size_hdd",
+ "bluestore_compression_required_ratio",
+ "bluestore_max_alloc_size",
+ "bluestore_prefer_deferred_size",
+ "bluestore_prefer_deferred_size_hdd",
+ "bluestore_prefer_deferred_size_ssd",
+ "bluestore_deferred_batch_ops",
+ "bluestore_deferred_batch_ops_hdd",
+ "bluestore_deferred_batch_ops_ssd",
+ "bluestore_throttle_bytes",
+ "bluestore_throttle_deferred_bytes",
+ "bluestore_throttle_cost_per_io_hdd",
+ "bluestore_throttle_cost_per_io_ssd",
+ "bluestore_throttle_cost_per_io",
+ "bluestore_max_blob_size",
+ "bluestore_max_blob_size_ssd",
+ "bluestore_max_blob_size_hdd",
+ "osd_memory_target",
+ "osd_memory_target_cgroup_limit_ratio",
+ "osd_memory_base",
+ "osd_memory_cache_min",
+ "osd_memory_expected_fragmentation",
+ "bluestore_cache_autotune",
+ "bluestore_cache_autotune_interval",
+ "bluestore_warn_on_legacy_statfs",
+ "bluestore_warn_on_no_per_pool_omap",
+ "bluestore_max_defer_interval",
+ NULL
+ };
+ return KEYS;
+}
+
+void BlueStore::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ if (changed.count("bluestore_warn_on_legacy_statfs")) {
+ _check_legacy_statfs_alert();
+ }
+ if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
+ changed.count("bluestore_warn_on_no_per_pg_omap")) {
+ _check_no_per_pg_or_pool_omap_alert();
+ }
+
+ if (changed.count("bluestore_csum_type")) {
+ _set_csum();
+ }
+ if (changed.count("bluestore_compression_mode") ||
+ changed.count("bluestore_compression_algorithm") ||
+ changed.count("bluestore_compression_min_blob_size") ||
+ changed.count("bluestore_compression_max_blob_size")) {
+ if (bdev) {
+ _set_compression();
+ }
+ }
+ if (changed.count("bluestore_max_blob_size") ||
+ changed.count("bluestore_max_blob_size_ssd") ||
+ changed.count("bluestore_max_blob_size_hdd")) {
+ if (bdev) {
+ // only after startup
+ _set_blob_size();
+ }
+ }
+ if (changed.count("bluestore_prefer_deferred_size") ||
+ changed.count("bluestore_prefer_deferred_size_hdd") ||
+ changed.count("bluestore_prefer_deferred_size_ssd") ||
+ changed.count("bluestore_max_alloc_size") ||
+ changed.count("bluestore_deferred_batch_ops") ||
+ changed.count("bluestore_deferred_batch_ops_hdd") ||
+ changed.count("bluestore_deferred_batch_ops_ssd")) {
+ if (bdev) {
+ // only after startup
+ _set_alloc_sizes();
+ }
+ }
+ if (changed.count("bluestore_throttle_cost_per_io") ||
+ changed.count("bluestore_throttle_cost_per_io_hdd") ||
+ changed.count("bluestore_throttle_cost_per_io_ssd")) {
+ if (bdev) {
+ _set_throttle_params();
+ }
+ }
+ if (changed.count("bluestore_throttle_bytes") ||
+ changed.count("bluestore_throttle_deferred_bytes") ||
+ changed.count("bluestore_throttle_trace_rate")) {
+ throttle.reset_throttle(conf);
+ }
+ if (changed.count("bluestore_max_defer_interval")) {
+ if (bdev) {
+ _set_max_defer_interval();
+ }
+ }
+ if (changed.count("osd_memory_target") ||
+ changed.count("osd_memory_base") ||
+ changed.count("osd_memory_cache_min") ||
+ changed.count("osd_memory_expected_fragmentation")) {
+ _update_osd_memory_options();
+ }
+}
+
+void BlueStore::_set_compression()
+{
+ auto m = Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
+ if (m) {
+ _clear_compression_alert();
+ comp_mode = *m;
+ } else {
+ derr << __func__ << " unrecognized value '"
+ << cct->_conf->bluestore_compression_mode
+ << "' for bluestore_compression_mode, reverting to 'none'"
+ << dendl;
+ comp_mode = Compressor::COMP_NONE;
+ string s("unknown mode: ");
+ s += cct->_conf->bluestore_compression_mode;
+ _set_compression_alert(true, s.c_str());
+ }
+
+ compressor = nullptr;
+
+ if (cct->_conf->bluestore_compression_min_blob_size) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
+ } else {
+ comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_compression_max_blob_size) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
+ } else {
+ comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
+ }
+ }
+
+ auto& alg_name = cct->_conf->bluestore_compression_algorithm;
+ if (!alg_name.empty()) {
+ compressor = Compressor::create(cct, alg_name);
+ if (!compressor) {
+ derr << __func__ << " unable to initialize " << alg_name.c_str() << " compressor"
+ << dendl;
+ _set_compression_alert(false, alg_name.c_str());
+ }
+ }
+
+ dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
+ << " alg " << (compressor ? compressor->get_type_name() : "(none)")
+ << " min_blob " << comp_min_blob_size
+ << " max_blob " << comp_max_blob_size
+ << dendl;
+}
+
+void BlueStore::_set_csum()
+{
+ csum_type = Checksummer::CSUM_NONE;
+ int t = Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
+ if (t > Checksummer::CSUM_NONE)
+ csum_type = t;
+
+ dout(10) << __func__ << " csum_type "
+ << Checksummer::get_csum_type_string(csum_type)
+ << dendl;
+}
+
+void BlueStore::_set_throttle_params()
+{
+ if (cct->_conf->bluestore_throttle_cost_per_io) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
+ } else {
+ throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
+ << dendl;
+}
+void BlueStore::_set_blob_size()
+{
+ if (cct->_conf->bluestore_max_blob_size) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
+ } else {
+ max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
+ }
+ }
+ dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
+ << std::dec << dendl;
+}
+
+void BlueStore::_update_osd_memory_options()
+{
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ config_changed++;
+ dout(10) << __func__
+ << " osd_memory_target " << osd_memory_target
+ << " osd_memory_base " << osd_memory_base
+ << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
+ << " osd_memory_cache_min " << osd_memory_cache_min
+ << dendl;
+}
+
+int BlueStore::_set_cache_sizes()
+{
+ ceph_assert(bdev);
+ cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
+ cache_autotune_interval =
+ cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+ osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+ osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+ osd_memory_expected_fragmentation =
+ cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+ osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+ osd_memory_cache_resize_interval =
+ cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
+
+ if (cct->_conf->bluestore_cache_size) {
+ cache_size = cct->_conf->bluestore_cache_size;
+ } else {
+ // choose global cache size based on backend type
+ if (_use_rotational_settings()) {
+ cache_size = cct->_conf->bluestore_cache_size_hdd;
+ } else {
+ cache_size = cct->_conf->bluestore_cache_size_ssd;
+ }
+ }
+
+ cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
+ if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
+ if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
+ if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
+ << ") must be in range [0,1.0]" << dendl;
+ return -EINVAL;
+ }
+
+ if (cache_meta_ratio + cache_kv_ratio > 1.0) {
+ derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+ << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
+ << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
+ << dendl;
+ return -EINVAL;
+ }
+
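+ // data gets whatever cache fraction remains after meta, kv and kv_onode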
+ cache_data_ratio = (double)1.0 -
+ (double)cache_meta_ratio -
+ (double)cache_kv_ratio -
+ (double)cache_kv_onode_ratio;
+ if (cache_data_ratio < 0) {
+ // deal with floating point imprecision
+ cache_data_ratio = 0;
+ }
+
+ dout(1) << __func__ << " cache_size " << cache_size
+ << " meta " << cache_meta_ratio
+ << " kv " << cache_kv_ratio
+ << " data " << cache_data_ratio
+ << dendl;
+ return 0;
+}
+
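+// Meta key/value pairs are mirrored into the block device label when one is
+// readable; reads prefer the label and fall back to the ObjectStore meta
+// files.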
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::write_meta(key, value);
+ }
+ label.meta[key] = value;
+ r = _write_bdev_label(cct, p, label);
+ ceph_assert(r == 0);
+ return ObjectStore::write_meta(key, value);
+}
+
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+ bluestore_bdev_label_t label;
+ string p = path + "/block";
+ int r = _read_bdev_label(cct, p, &label);
+ if (r < 0) {
+ return ObjectStore::read_meta(key, value);
+ }
+ auto i = label.meta.find(key);
+ if (i == label.meta.end()) {
+ return ObjectStore::read_meta(key, value);
+ }
+ *value = i->second;
+ return 0;
+}
+
+void BlueStore::_init_logger()
+{
+ PerfCountersBuilder b(cct, "bluestore",
+ l_bluestore_first, l_bluestore_last);
+ b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
+ "Average kv_thread flush latency",
+ "fl_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
+ "Average kv_thread commit latency");
+ b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
+ "Average kv_sync thread latency",
+ "ks_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
+ "Average kv_finalize thread latency",
+ "kf_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
+ "Average prepare state latency");
+ b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
+ "Average aio_wait state latency",
+ "io_l", PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
+ "Average io_done state latency");
+ b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
+ "Average kv_queued state latency");
+ b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
+ "Average kv_commiting state latency");
+ b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
+ "Average kv_done state latency");
+ b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
+ "Average deferred_queued state latency");
+ b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
+ "Average aio_wait state latency");
+ b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
+ "Average cleanup state latency");
+ b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
+ "Average finishing state latency");
+ b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
+ "Average done state latency");
+ b.add_time_avg(l_bluestore_throttle_lat, "throttle_lat",
+ "Average submit throttle latency",
+ "th_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_submit_lat, "submit_lat",
+ "Average submit latency",
+ "s_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_commit_lat, "commit_lat",
+ "Average commit latency",
+ "c_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_read_lat, "read_lat",
+ "Average read latency",
+ "r_l", PerfCountersBuilder::PRIO_CRITICAL);
+ b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
+ "Average read onode metadata latency");
+ b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
+ "Average read latency");
+ b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
+ "Average compress latency");
+ b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
+ "Average decompress latency");
+ b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
+ "Average checksum latency");
+ b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
+ "Sum for beneficial compress ops");
+ b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
+ "Sum for compress ops rejected due to low net gain of space");
+ b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
+ "Sum for write-op padded bytes", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_deferred_write_ops, "deferred_write_ops",
+ "Sum for deferred write op");
+ b.add_u64_counter(l_bluestore_deferred_write_bytes, "deferred_write_bytes",
+ "Sum for deferred write bytes", "def", 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
+ "Sum for write penalty read ops");
+ b.add_u64(l_bluestore_allocated, "bluestore_allocated",
+ "Sum for allocated bytes");
+ b.add_u64(l_bluestore_stored, "bluestore_stored",
+ "Sum for stored bytes");
+ b.add_u64(l_bluestore_compressed, "bluestore_compressed",
+ "Sum for stored compressed bytes",
+ "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_allocated, "bluestore_compressed_allocated",
+ "Sum for bytes allocated for compressed data",
+ "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_compressed_original, "bluestore_compressed_original",
+ "Sum for original bytes that were compressed",
+ "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_u64(l_bluestore_onodes, "bluestore_onodes",
+ "Number of onodes in cache");
+ b.add_u64(l_bluestore_pinned_onodes, "bluestore_pinned_onodes",
+ "Number of pinned onodes in cache");
+ b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
+ "Count of onode cache lookup hits",
+ "o_ht", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
+ "Count of onode cache lookup misses",
+ "o_ms", PerfCountersBuilder::PRIO_USEFUL);
+ b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
+ "Sum for onode-shard lookups hit in the cache");
+ b.add_u64_counter(l_bluestore_onode_shard_misses,
+ "bluestore_onode_shard_misses",
+ "Sum for onode-shard lookups missed in the cache");
+ b.add_u64(l_bluestore_extents, "bluestore_extents",
+ "Number of extents in cache");
+ b.add_u64(l_bluestore_blobs, "bluestore_blobs",
+ "Number of blobs in cache");
+ b.add_u64(l_bluestore_buffers, "bluestore_buffers",
+ "Number of buffers in cache");
+ b.add_u64(l_bluestore_buffer_bytes, "bluestore_buffer_bytes",
+ "Number of buffer bytes in cache", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_hit_bytes, "bluestore_buffer_hit_bytes",
+ "Sum for bytes of read hit in the cache", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_buffer_miss_bytes, "bluestore_buffer_miss_bytes",
+ "Sum for bytes of read missed in the cache", NULL, 0, unit_t(UNIT_BYTES));
+
+ b.add_u64_counter(l_bluestore_write_big, "bluestore_write_big",
+ "Large aligned writes into fresh blobs");
+ b.add_u64_counter(l_bluestore_write_big_bytes, "bluestore_write_big_bytes",
+ "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs",
+ "Large aligned writes into fresh blobs (blobs)");
+ b.add_u64_counter(l_bluestore_write_big_deferred,
+ "bluestore_write_big_deferred",
+ "Big overwrites using deferred");
+ b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small",
+ "Small writes into existing or sparse small blobs");
+ b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes",
+ "Small writes into existing or sparse small blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64_counter(l_bluestore_write_small_unused,
+ "bluestore_write_small_unused",
+ "Small writes into unused portion of existing blob");
+ b.add_u64_counter(l_bluestore_write_deferred,
+ "bluestore_write_deferred",
+ "Total deferred writes submitted");
+ b.add_u64_counter(l_bluestore_write_deferred_bytes,
+ "bluestore_write_deferred_bytes",
+ "Total bytes submitted as deferred writes");
+ b.add_u64_counter(l_bluestore_write_small_pre_read,
+ "bluestore_write_small_pre_read",
+ "Small writes that required we read some data (possibly "
+ "cached) to fill out the block");
+ b.add_u64_counter(l_bluestore_write_new, "bluestore_write_new",
+ "Write into new blob");
+
+ b.add_u64_counter(l_bluestore_txc, "bluestore_txc", "Transactions committed");
+ b.add_u64_counter(l_bluestore_onode_reshard, "bluestore_onode_reshard",
+ "Onode extent map reshard events");
+ b.add_u64_counter(l_bluestore_blob_split, "bluestore_blob_split",
+ "Sum for blob splitting due to resharding");
+ b.add_u64_counter(l_bluestore_extent_compress, "bluestore_extent_compress",
+ "Sum for extents that have been removed due to compression");
+ b.add_u64_counter(l_bluestore_gc_merged, "bluestore_gc_merged",
+ "Sum for extents that have been merged due to garbage "
+ "collection");
+ b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
+ "Read EIO errors propagated to high level callers");
+ b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
+ "Read operations that required at least one retry due to failed checksum validation");
+ b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
+ "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
+ b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
+ "Average omap iterator seek_to_first call latency");
+ b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
+ "Average omap iterator upper_bound call latency");
+ b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
+ "Average omap iterator lower_bound call latency");
+ b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
+ "Average omap iterator next call latency");
+ b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
+ "Average omap get_keys call latency");
+ b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
+ "Average omap get_values call latency");
+ b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
+ "Average collection listing latency");
+ b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
+ "Average removal latency");
+
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+int BlueStore::_reload_logger()
+{
+ struct store_statfs_t store_statfs;
+ int r = statfs(&store_statfs);
+ if (r >= 0) {
+ logger->set(l_bluestore_allocated, store_statfs.allocated);
+ logger->set(l_bluestore_stored, store_statfs.data_stored);
+ logger->set(l_bluestore_compressed, store_statfs.data_compressed);
+ logger->set(l_bluestore_compressed_allocated, store_statfs.data_compressed_allocated);
+ logger->set(l_bluestore_compressed_original, store_statfs.data_compressed_original);
+ }
+ return r;
+}
+
+void BlueStore::_shutdown_logger()
+{
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid)
+{
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ *fsid = label.osd_uuid;
+ return 0;
+}
+
+int BlueStore::_open_path()
+{
+ // sanity check(s)
+ ceph_assert(path_fd < 0);
+ path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
+ if (path_fd < 0) {
+ int r = -errno;
+ derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_path()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+ path_fd = -1;
+}
+
+int BlueStore::_write_bdev_label(CephContext *cct,
+ string path, bluestore_bdev_label_t label)
+{
+ dout(10) << __func__ << " path " << path << " label " << label << dendl;
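+ // The label is encoded, followed by a crc32c over the encoded bytes, and
+ // zero-padded to a full BDEV_LABEL_BLOCK_SIZE block so it can be written
+ // with O_DIRECT.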
+ bufferlist bl;
+ encode(label, bl);
+ uint32_t crc = bl.crc32c(-1);
+ encode(crc, bl);
+ ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
+ bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
+ z.zero();
+ bl.append(std::move(z));
+
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX);
+ int r = bl.write_fd(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to write to " << path
+ << ": " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " failed to fsync " << path
+ << ": " << cpp_strerror(r) << dendl;
+ }
+out:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+}
+
+int BlueStore::_read_bdev_label(CephContext* cct, string path,
+ bluestore_bdev_label_t *label)
+{
+ dout(10) << __func__ << dendl;
+ int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
+ if (fd < 0) {
+ fd = -errno;
+ derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+ << dendl;
+ return fd;
+ }
+ bufferlist bl;
+ int r = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ if (r < 0) {
+ derr << __func__ << " failed to read from " << path
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
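+ // the trailing crc32c must match the checksum computed over the encoded
+ // label bytes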
+ uint32_t crc, expected_crc;
+ auto p = bl.cbegin();
+ try {
+ decode(*label, p);
+ bufferlist t;
+ t.substr_of(bl, 0, p.get_off());
+ crc = t.crc32c(-1);
+ decode(expected_crc, p);
+ }
+ catch (ceph::buffer::error& e) {
+ dout(2) << __func__ << " unable to decode label at offset " << p.get_off()
+ << ": " << e.what()
+ << dendl;
+ return -ENOENT;
+ }
+ if (crc != expected_crc) {
+ derr << __func__ << " bad crc on label, expected " << expected_crc
+ << " != actual " << crc << dendl;
+ return -EIO;
+ }
+ dout(10) << __func__ << " got " << *label << dendl;
+ return 0;
+}
+
+int BlueStore::_check_or_set_bdev_label(
+ string path, uint64_t size, string desc, bool create)
+{
+ bluestore_bdev_label_t label;
+ if (create) {
+ label.osd_uuid = fsid;
+ label.size = size;
+ label.btime = ceph_clock_now();
+ label.description = desc;
+ int r = _write_bdev_label(cct, path, label);
+ if (r < 0)
+ return r;
+ } else {
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0)
+ return r;
+ if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
+ dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " and fsid " << fsid << " check bypassed" << dendl;
+ } else if (label.osd_uuid != fsid) {
+ derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+ << " does not match our fsid " << fsid << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+void BlueStore::_set_alloc_sizes(void)
+{
+ max_alloc_size = cct->_conf->bluestore_max_alloc_size;
+
+ if (cct->_conf->bluestore_prefer_deferred_size) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
+ } else {
+ prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
+ }
+ }
+
+ if (cct->_conf->bluestore_deferred_batch_ops) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
+ } else {
+ deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
+ }
+ }
+
+ dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << " order " << (int)min_alloc_size_order
+ << " max_alloc_size 0x" << std::hex << max_alloc_size
+ << " prefer_deferred_size 0x" << prefer_deferred_size
+ << std::dec
+ << " deferred_batch_ops " << deferred_batch_ops
+ << dendl;
+}
+
+int BlueStore::_open_bdev(bool create)
+{
+ ceph_assert(bdev == NULL);
+ string p = path + "/block";
+ bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
+ int r = bdev->open(p);
+ if (r < 0)
+ goto fail;
+
+ if (create && cct->_conf->bdev_enable_discard) {
+ bdev->discard(0, bdev->get_size());
+ }
+
+ if (bdev->supported_bdev_label()) {
+ r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
+ if (r < 0)
+ goto fail_close;
+ }
+
+ // initialize global block parameters
+ block_size = bdev->get_block_size();
+ block_mask = ~(block_size - 1);
+ block_size_order = ctz(block_size);
+ ceph_assert(block_size == 1u << block_size_order);
+ _set_max_defer_interval();
+ // and set cache_size based on device type
+ r = _set_cache_sizes();
+ if (r < 0) {
+ goto fail_close;
+ }
+
+ if (bdev->is_smr()) {
+ freelist_type = "zoned";
+ }
+ return 0;
+
+ fail_close:
+ bdev->close();
+ fail:
+ delete bdev;
+ bdev = NULL;
+ return r;
+}
+
+void BlueStore::_validate_bdev()
+{
+ ceph_assert(bdev);
+ uint64_t dev_size = bdev->get_size();
+ ceph_assert(dev_size > _get_ondisk_reserved());
+}
+
+void BlueStore::_close_bdev()
+{
+ ceph_assert(bdev);
+ bdev->close();
+ delete bdev;
+ bdev = NULL;
+}
+
+int BlueStore::_open_fm(KeyValueDB::Transaction t, bool read_only)
+{
+ int r;
+
+ ceph_assert(fm == NULL);
+ fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
+ ceph_assert(fm);
+ if (t) {
+ // create mode. initialize freespace
+ dout(20) << __func__ << " initializing freespace" << dendl;
+ {
+ bufferlist bl;
+ bl.append(freelist_type);
+ t->set(PREFIX_SUPER, "freelist_type", bl);
+ }
+ // being able to allocate in units less than bdev block size
+ // seems to be a bad idea.
+ ceph_assert( cct->_conf->bdev_block_size <= (int64_t)min_alloc_size);
+
+ uint64_t alloc_size = min_alloc_size;
+ if (bdev->is_smr()) {
+ alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
+ }
+
+ fm->create(bdev->get_size(), alloc_size, t);
+
+ // allocate superblock reserved space. note that we do not mark
+ // bluefs space as allocated in the freelist; we instead rely on
+ // bluefs doing that itself.
+ auto reserved = _get_ondisk_reserved();
+ fm->allocate(0, reserved, t);
+
+ if (cct->_conf->bluestore_debug_prefill > 0) {
+ uint64_t end = bdev->get_size() - reserved;
+ dout(1) << __func__ << " pre-fragmenting freespace, using "
+ << cct->_conf->bluestore_debug_prefill << " with max free extent "
+ << cct->_conf->bluestore_debug_prefragment_max << dendl;
+ uint64_t start = p2roundup(reserved, min_alloc_size);
+ uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
+ float r = cct->_conf->bluestore_debug_prefill;
+ r /= 1.0 - r;
+ bool stop = false;
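+ // alternate randomly-sized free extents (l) with allocated extents (u)
+ // chosen so the allocated fraction approaches bluestore_debug_prefill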
+
+ while (!stop && start < end) {
+ uint64_t l = (rand() % max_b + 1) * min_alloc_size;
+ if (start + l > end) {
+ l = end - start;
+ l = p2align(l, min_alloc_size);
+ }
+ ceph_assert(start + l <= end);
+
+ uint64_t u = 1 + (uint64_t)(r * (double)l);
+ u = p2roundup(u, min_alloc_size);
+ if (start + l + u > end) {
+ u = end - (start + l);
+ // trim to align so we don't overflow again
+ u = p2align(u, min_alloc_size);
+ stop = true;
+ }
+ ceph_assert(start + l + u <= end);
+
+ dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
+ << " use 0x" << u << std::dec << dendl;
+
+ if (u == 0) {
+ // break if u has been trimmed to nothing
+ break;
+ }
+
+ fm->allocate(start + l, u, t);
+ start += l + u;
+ }
+ }
+ r = _write_out_fm_meta(0);
+ ceph_assert(r == 0);
+ } else {
+ r = fm->init(db, read_only,
+ [&](const std::string& key, std::string* result) {
+ return read_meta(key, result);
+ });
+ if (r < 0) {
+ derr << __func__ << " freelist init failed: " << cpp_strerror(r) << dendl;
+ delete fm;
+ fm = NULL;
+ return r;
+ }
+ }
+ // If the space size tracked by the freelist manager is higher than the
+ // actual device size, one can hit an out-of-space allocation, which will
+ // result in data loss and/or assertions.
+ // Most likely the user altered the device size somehow.
+ // The only fix for now is to redeploy the OSD.
+ if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
+ ostringstream ss;
+ ss << "slow device size mismatch detected, "
+ << " fm size(" << fm->get_size()
+ << ") > slow device size(" << bdev->get_size()
+ << "), Please stop using this OSD as it might cause data loss.";
+ _set_disk_size_mismatch_alert(ss.str());
+ }
+ return 0;
+}
+
+void BlueStore::_close_fm()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(fm);
+ fm->shutdown();
+ delete fm;
+ fm = NULL;
+}
+
+int BlueStore::_write_out_fm_meta(uint64_t target_size)
+{
+ int r = 0;
+ string p = path + "/block";
+
+ std::vector<std::pair<string, string>> fm_meta;
+ fm->get_meta(target_size, &fm_meta);
+
+ for (auto& m : fm_meta) {
+ r = write_meta(m.first, m.second);
+ ceph_assert(r == 0);
+ }
+ return r;
+}
+
+int BlueStore::_create_alloc()
+{
+ ceph_assert(shared_alloc.a == NULL);
+ ceph_assert(bdev->get_size());
+
+ uint64_t alloc_size = min_alloc_size;
+ if (bdev->is_smr()) {
+ int r = _zoned_check_config_settings();
+ if (r < 0)
+ return r;
+ alloc_size = _zoned_piggyback_device_parameters_onto(alloc_size);
+ }
+
+ shared_alloc.set(Allocator::create(cct, cct->_conf->bluestore_allocator,
+ bdev->get_size(),
+ alloc_size, "block"));
+
+ if (!shared_alloc.a) {
+ lderr(cct) << __func__ << "Failed to create allocator:: "
+ << cct->_conf->bluestore_allocator
+ << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int BlueStore::_init_alloc()
+{
+ int r = _create_alloc();
+ if (r < 0) {
+ return r;
+ }
+ ceph_assert(shared_alloc.a != NULL);
+
+ if (bdev->is_smr()) {
+ shared_alloc.a->zoned_set_zone_states(fm->get_zone_states(db));
+ }
+
+ uint64_t num = 0, bytes = 0;
+
+ dout(1) << __func__ << " opening allocation metadata" << dendl;
+ // initialize from freelist
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ shared_alloc.a->init_add_free(offset, length);
+ ++num;
+ bytes += length;
+ }
+ fm->enumerate_reset();
+
+ dout(1) << __func__
+ << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
+ << std::hex
+ << ", allocator type " << shared_alloc.a->get_type()
+ << ", capacity 0x" << shared_alloc.a->get_capacity()
+ << ", block size 0x" << shared_alloc.a->get_block_size()
+ << ", free 0x" << shared_alloc.a->get_free()
+ << ", fragmentation " << shared_alloc.a->get_fragmentation()
+ << std::dec << dendl;
+
+ return 0;
+}
+
+void BlueStore::_close_alloc()
+{
+ ceph_assert(bdev);
+ bdev->discard_drain();
+
+ ceph_assert(shared_alloc.a);
+ shared_alloc.a->shutdown();
+ delete shared_alloc.a;
+ shared_alloc.reset();
+}
+
+int BlueStore::_open_fsid(bool create)
+{
+ ceph_assert(fsid_fd < 0);
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+ if (fsid_fd < 0) {
+ int err = -errno;
+ derr << __func__ << " " << cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int BlueStore::_read_fsid(uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str)) {
+ derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int BlueStore::_write_fsid()
+{
+ int r = ::ftruncate(fsid_fd, 0);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ string str = stringify(fsid) + "\n";
+ r = safe_write(fsid_fd, str.c_str(), str.length());
+ if (r < 0) {
+ derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = ::fsync(fsid_fd);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void BlueStore::_close_fsid()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+}
+
+int BlueStore::_lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ derr << __func__ << " failed to lock " << path << "/fsid"
+ << " (is another ceph-osd still running?)"
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool BlueStore::is_rotational()
+{
+ if (bdev) {
+ return bdev->is_rotational();
+ }
+
+ bool rotational = true;
+ int r = _open_path();
+ if (r < 0)
+ goto out;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ rotational = bdev->is_rotational();
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ out:
+ return rotational;
+}
+
+bool BlueStore::is_journal_rotational()
+{
+ if (!bluefs) {
+ dout(5) << __func__ << " bluefs disabled, default to store media type"
+ << dendl;
+ return is_rotational();
+ }
+ dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
+ return bluefs->wal_is_rotational();
+}
+
+bool BlueStore::_use_rotational_settings()
+{
+ if (cct->_conf->bluestore_debug_enforce_settings == "hdd") {
+ return true;
+ }
+ if (cct->_conf->bluestore_debug_enforce_settings == "ssd") {
+ return false;
+ }
+ return bdev->is_rotational();
+}
+
+bool BlueStore::test_mount_in_use()
+{
+ // most error conditions mean the mount is not in use (e.g., because
+ // it doesn't exist). only if we fail to lock do we conclude it is
+ // in use.
+ bool ret = false;
+ int r = _open_path();
+ if (r < 0)
+ return false;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _lock_fsid();
+ if (r < 0)
+ ret = true; // if we can't lock, it is in use
+ _close_fsid();
+ out_path:
+ _close_path();
+ return ret;
+}
+
+int BlueStore::_minimal_open_bluefs(bool create)
+{
+ int r;
+ bluefs = new BlueFS(cct);
+
+ string bfn;
+ struct stat st;
+
+ bfn = path + "/block.db";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(
+ BlueFS::BDEV_DB, bfn,
+ create && cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB),
+ "bluefs db", create);
+ if (r < 0) {
+ derr << __func__
+ << " check block device(" << bfn << ") label returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+ } else {
+ r = -errno;
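+ // stat() on block.db failed: either the symlink is simply absent (no
+ // dedicated DB device, so BlueFS shares the main device) or it exists but
+ // its target is unusable.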
+ if (::lstat(bfn.c_str(), &st) == -1) {
+ r = 0;
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ } else {
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ // shared device
+ bfn = path + "/block";
+ // never trim here
+ r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
+ 0, // no need to provide valid 'reserved' for shared dev
+ &shared_alloc);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ bfn = path + "/block.wal";
+ if (::stat(bfn.c_str(), &st) == 0) {
+ r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+ create && cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ if (r < 0) {
+ derr << __func__ << " add block device(" << bfn << ") returned: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+ "bluefs wal", create);
+ if (r < 0) {
+ derr << __func__ << " check block device(" << bfn
+ << ") label returned: " << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+
+ bluefs_layout.dedicated_wal = true;
+ } else {
+ r = 0;
+ if (::lstat(bfn.c_str(), &st) != -1) {
+ r = -errno;
+ derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+ << cpp_strerror(r) << dendl;
+ goto free_bluefs;
+ }
+ }
+ return 0;
+
+free_bluefs:
+ ceph_assert(bluefs);
+ delete bluefs;
+ bluefs = NULL;
+ return r;
+}
+
+int BlueStore::_open_bluefs(bool create, bool read_only)
+{
+ int r = _minimal_open_bluefs(create);
+ if (r < 0) {
+ return r;
+ }
+ BlueFSVolumeSelector* vselector = nullptr;
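+ // A volume selector is only needed when BlueFS spans the slow device; it
+ // steers BlueFS files between the fast and slow devices.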
+ if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
+
+ string options = cct->_conf->bluestore_rocksdb_options;
+ string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+ if (!options_annex.empty()) {
+ if (!options.empty() &&
+ *options.rbegin() != ',') {
+ options += ',';
+ }
+ options += options_annex;
+ }
+
+ rocksdb::Options rocks_opts;
+ r = RocksDBStore::ParseOptionsFromStringStatic(
+ cct,
+ options,
+ rocks_opts,
+ nullptr);
+ if (r < 0) {
+ return r;
+ }
+ if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
+ vselector = new FitToFastVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
+ } else {
+ double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+ vselector =
+ new RocksDBBlueFSVolumeSelector(
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+ bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+ 1024 * 1024 * 1024, //FIXME: set expected l0 size here
+ rocks_opts.max_bytes_for_level_base,
+ rocks_opts.max_bytes_for_level_multiplier,
+ reserved_factor,
+ cct->_conf->bluestore_volume_selection_reserved,
+ cct->_conf->bluestore_volume_selection_policy == "use_some_extra");
+ }
+ }
+ if (create) {
+ bluefs->mkfs(fsid, bluefs_layout);
+ }
+ bluefs->set_volume_selector(vselector);
+ r = bluefs->mount();
+ if (r < 0) {
+ derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+ }
+ ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
+ return r;
+}
+
+void BlueStore::_close_bluefs(bool cold_close)
+{
+ bluefs->umount(cold_close);
+ _minimal_close_bluefs();
+}
+
+void BlueStore::_minimal_close_bluefs()
+{
+ delete bluefs;
+ bluefs = NULL;
+}
+
+int BlueStore::_is_bluefs(bool create, bool* ret)
+{
+ if (create) {
+ *ret = cct->_conf->bluestore_bluefs;
+ } else {
+ string s;
+ int r = read_meta("bluefs", &s);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'bluefs' meta" << dendl;
+ return -EIO;
+ }
+ if (s == "1") {
+ *ret = true;
+ } else if (s == "0") {
+ *ret = false;
+ } else {
+ derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+ << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+/*
+* opens both the DB and its dependent super_meta, FreelistManager and allocator
+* in the proper order
+*/
+int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
+{
+ dout(0) << __func__ << " read-only:" << read_only
+ << " repair:" << to_repair << dendl;
+ {
+ string type;
+ int r = read_meta("type", &type);
+ if (r < 0) {
+ derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ }
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+
+ // open in read-only first to read FM list and init allocator
+ // as they might be needed for some BlueFS procedures
+ r = _open_db(false, false, true);
+ if (r < 0)
+ goto out_bdev;
+
+ r = _open_super_meta();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_fm(nullptr, true);
+ if (r < 0)
+ goto out_db;
+
+ r = _init_alloc();
+ if (r < 0)
+ goto out_fm;
+
+ // Re-open in the proper mode(s).
+ //
+ // We can't simply skip the second open in read-only mode: the extents
+ // allocated by bluefs still need to be loaded into the allocator, and
+ // that happens as part of this re-open.
+ _close_db(true);
+
+ r = _open_db(false, to_repair, read_only);
+ if (r < 0) {
+ goto out_alloc;
+ }
+ return 0;
+
+out_alloc:
+ _close_alloc();
+out_fm:
+ _close_fm();
+ out_db:
+ _close_db(read_only);
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+
+void BlueStore::_close_db_and_around(bool read_only)
+{
+ _close_db(read_only);
+ _close_fm();
+ _close_alloc();
+ _close_bdev();
+ _close_fsid();
+ _close_path();
+}
+
+int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
+{
+ _kv_only = true;
+ int r = _open_db_and_around(false, to_repair);
+ if (r == 0) {
+ *pdb = db;
+ } else {
+ *pdb = nullptr;
+ }
+ return r;
+}
+
+int BlueStore::close_db_environment()
+{
+ _close_db_and_around(false);
+ return 0;
+}
+
+/* gets access to bluefs supporting RocksDB */
+BlueFS* BlueStore::get_bluefs() {
+ return bluefs;
+}
+
+int BlueStore::_prepare_db_environment(bool create, bool read_only,
+ std::string* _fn, std::string* _kv_backend)
+{
+ int r;
+ ceph_assert(!db);
+ std::string& fn=*_fn;
+ std::string& kv_backend=*_kv_backend;
+ fn = path + "/db";
+ std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
+
+ if (create) {
+ kv_backend = cct->_conf->bluestore_kvbackend;
+ } else {
+ r = read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
+ return -EIO;
+ }
+ }
+ dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+ bool do_bluefs;
+ r = _is_bluefs(create, &do_bluefs);
+ if (r < 0) {
+ return r;
+ }
+ dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
+
+ map<string,string> kv_options;
+ // force separate wal dir for all new deployments.
+ kv_options["separate_wal_dir"] = 1;
+ rocksdb::Env *env = NULL;
+ if (do_bluefs) {
+ dout(10) << __func__ << " initializing bluefs" << dendl;
+ if (kv_backend != "rocksdb") {
+ derr << " backend must be rocksdb to use bluefs" << dendl;
+ return -EINVAL;
+ }
+
+ r = _open_bluefs(create, read_only);
+ if (r < 0) {
+ return r;
+ }
+
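+ // optionally mirror the BlueRocksEnv against the default POSIX env (a
+ // debug aid) so their contents can be cross-checked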
+ if (cct->_conf->bluestore_bluefs_env_mirror) {
+ rocksdb::Env* a = new BlueRocksEnv(bluefs);
+ rocksdb::Env* b = rocksdb::Env::Default();
+ if (create) {
+ string cmd = "rm -rf " + path + "/db " +
+ path + "/db.slow " +
+ path + "/db.wal";
+ int r = system(cmd.c_str());
+ (void)r;
+ }
+ env = new rocksdb::EnvMirror(b, a, false, true);
+ } else {
+ env = new BlueRocksEnv(bluefs);
+
+ // simplify the dir names, too, as "seen" by rocksdb
+ fn = "db";
+ }
+ BlueFSVolumeSelector::paths paths;
+ bluefs->get_vselector_paths(fn, paths);
+
+ {
+ ostringstream db_paths;
+ bool first = true;
+ for (auto& p : paths) {
+ if (!first) {
+ db_paths << " ";
+ }
+ first = false;
+ db_paths << p.first << "," << p.second;
+
+ }
+ kv_options["db_paths"] = db_paths.str();
+ dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+ }
+
+ if (create) {
+ for (auto& p : paths) {
+ env->CreateDir(p.first);
+ }
+ // Selectors don't provide the wal path so far, so create it explicitly
+ env->CreateDir(fn + ".wal");
+ } else {
+ std::vector<std::string> res;
+ // check for dir presence
+ auto r = env->GetChildren(fn+".wal", &res);
+ if (r.IsNotFound()) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ } else {
+ string walfn = path + "/db.wal";
+
+ if (create) {
+ int r = ::mkdir(fn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // wal_dir, too!
+ r = ::mkdir(walfn.c_str(), 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << walfn
+ << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ } else {
+ struct stat st;
+ r = ::stat(walfn.c_str(), &st);
+ if (r < 0 && errno == ENOENT) {
+ kv_options.erase("separate_wal_dir");
+ }
+ }
+ }
+
+
+ db = KeyValueDB::create(cct,
+ kv_backend,
+ fn,
+ kv_options,
+ static_cast<void*>(env));
+ if (!db) {
+ derr << __func__ << " error creating db" << dendl;
+ if (bluefs) {
+ _close_bluefs(read_only);
+ }
+ // delete env manually here since we can't depend on db to do this
+ // in this case
+ delete env;
+ env = NULL;
+ return -EIO;
+ }
+
+ FreelistManager::setup_merge_operators(db, freelist_type);
+ db->set_merge_operator(PREFIX_STAT, merge_op);
+ db->set_cache_size(cache_kv_ratio * cache_size);
+ return 0;
+}
+
+int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
+{
+ int r;
+ ceph_assert(!(create && read_only));
+ string options;
+ string options_annex;
+ stringstream err;
+ string kv_dir_fn;
+ string kv_backend;
+ std::string sharding_def;
+ r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " failed to prepare db environment: " << err.str() << dendl;
+ return -EIO;
+ }
+ if (kv_backend == "rocksdb") {
+ options = cct->_conf->bluestore_rocksdb_options;
+ options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+ if (!options_annex.empty()) {
+ if (!options.empty() &&
+ *options.rbegin() != ',') {
+ options += ',';
+ }
+ options += options_annex;
+ }
+
+ if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
+ sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
+ }
+ }
+
+ db->init(options);
+ if (to_repair_db)
+ return 0;
+ if (create) {
+ r = db->create_and_open(err, sharding_def);
+ } else {
+ // we pass in cf list here, but it is only used if the db already has
+ // column families created.
+ r = read_only ?
+ db->open_read_only(err, sharding_def) :
+ db->open(err, sharding_def);
+ }
+ if (r) {
+ derr << __func__ << " erroring opening db: " << err.str() << dendl;
+ _close_db(read_only);
+ return -EIO;
+ }
+ dout(1) << __func__ << " opened " << kv_backend
+ << " path " << kv_dir_fn << " options " << options << dendl;
+ return 0;
+}
+
+void BlueStore::_close_db(bool cold_close)
+{
+ ceph_assert(db);
+ delete db;
+ db = NULL;
+ if (bluefs) {
+ _close_bluefs(cold_close);
+ }
+}
+
+void BlueStore::_dump_alloc_on_failure()
+{
+ auto dump_interval =
+ cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
+ if (dump_interval > 0 &&
+ next_dump_on_bluefs_alloc_failure <= ceph_clock_now()) {
+ shared_alloc.a->dump();
+ next_dump_on_bluefs_alloc_failure = ceph_clock_now();
+ next_dump_on_bluefs_alloc_failure += dump_interval;
+ }
+}
+
+int BlueStore::_open_collections()
+{
+ dout(10) << __func__ << dendl;
+ collections_had_errors = false;
+ ceph_assert(coll_map.empty());
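+ // rebuild the in-memory collection map from the PREFIX_COLL namespace,
+ // attaching each collection to its onode and buffer cache shards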
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (cid.parse(it->key())) {
+ auto c = ceph::make_ref<Collection>(
+ this,
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
+ cid);
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(c->cnode, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode cnode, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ return -EIO;
+ }
+ dout(20) << __func__ << " opened " << cid << " " << c
+ << " " << c->cnode << dendl;
+ _osr_attach(c.get());
+ coll_map[cid] = c;
+
+ } else {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ collections_had_errors = true;
+ }
+ }
+ return 0;
+}
+
+void BlueStore::_fsck_collections(int64_t* errors)
+{
+ if (collections_had_errors) {
+ dout(10) << __func__ << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (!cid.parse(it->key())) {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ if (errors) {
+ (*errors)++;
+ }
+ }
+ }
+ }
+}
+
+void BlueStore::_set_per_pool_omap()
+{
+ per_pool_omap = OMAP_BULK;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "per_pool_omap", &bl);
+ if (bl.length()) {
+ auto s = bl.to_str();
+ if (s == stringify(OMAP_PER_POOL)) {
+ per_pool_omap = OMAP_PER_POOL;
+ } else if (s == stringify(OMAP_PER_PG)) {
+ per_pool_omap = OMAP_PER_PG;
+ } else {
+ ceph_assert(s == stringify(OMAP_BULK));
+ }
+ dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
+ } else {
+ dout(10) << __func__ << " per_pool_omap not present" << dendl;
+ }
+ _check_no_per_pg_or_pool_omap_alert();
+}
+
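+// Load statfs state: either the legacy global record or, if absent, the
+// per-pool records stored under PREFIX_STAT.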
+void BlueStore::_open_statfs()
+{
+ osd_pools.clear();
+ vstatfs.reset();
+
+ bufferlist bl;
+ int r = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &bl);
+ if (r >= 0) {
+ per_pool_stat_collection = false;
+ if (size_t(bl.length()) >= sizeof(vstatfs.values)) {
+ auto it = bl.cbegin();
+ vstatfs.decode(it);
+ dout(10) << __func__ << " store_statfs is found" << dendl;
+ } else {
+ dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
+ }
+ _check_legacy_statfs_alert();
+ } else {
+ per_pool_stat_collection = true;
+ dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+
+ uint64_t pool_id;
+ int r = get_key_pool_stat(it->key(), &pool_id);
+ ceph_assert(r == 0);
+
+ bufferlist bl;
+ bl = it->value();
+ auto p = bl.cbegin();
+ auto& st = osd_pools[pool_id];
+ try {
+ st.decode(p);
+ vstatfs += st;
+
+ dout(30) << __func__ << " pool " << pool_id
+ << " statfs " << st << dendl;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode pool stats, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ }
+ }
+ }
+ dout(30) << __func__ << " statfs " << vstatfs << dendl;
+
+}
+
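+// Create the block/block.db/block.wal symlink (or plain file). For SPDK
+// targets a small file carrying the NVMe transport id is written as well;
+// empty regular files are resized (and optionally preallocated) to 'size'.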
+int BlueStore::_setup_block_symlink_or_file(
+ string name,
+ string epath,
+ uint64_t size,
+ bool create)
+{
+ dout(20) << __func__ << " name " << name << " path " << epath
+ << " size " << size << " create=" << (int)create << dendl;
+ int r = 0;
+ int flags = O_RDWR|O_CLOEXEC;
+ if (create)
+ flags |= O_CREAT;
+ if (epath.length()) {
+ r = ::symlinkat(epath.c_str(), path_fd, name.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to create " << name << " symlink to "
+ << epath << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (!epath.compare(0, strlen(SPDK_PREFIX), SPDK_PREFIX)) {
+ int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
+ if (fd < 0) {
+ r = -errno;
+ derr << __func__ << " failed to open " << epath << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ // write the Transport ID of the NVMe device
+ // a transport id looks like: "trtype:PCIe traddr:0000:02:00.0"
+ // where "0000:02:00.0" is the selector of a PCI device, see
+ // the first column of "lspci -mm -n -D"
+ string trid{"trtype:PCIe "};
+ trid += "traddr:";
+ trid += epath.substr(strlen(SPDK_PREFIX));
+ r = ::write(fd, trid.c_str(), trid.size());
+ ceph_assert(r == static_cast<int>(trid.size()));
+ dout(1) << __func__ << " created " << name << " symlink to "
+ << epath << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (size) {
+ int fd = ::openat(path_fd, name.c_str(), flags, 0644);
+ if (fd >= 0) {
+ // block file is present
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 &&
+ S_ISREG(st.st_mode) && // if it is a regular file
+ st.st_size == 0) { // and is 0 bytes
+ r = ::ftruncate(fd, size);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to resize " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+
+ if (cct->_conf->bluestore_block_preallocate_file) {
+ r = ::ceph_posix_fallocate(fd, 0, size);
+ if (r > 0) {
+ derr << __func__ << " failed to prefallocate " << name << " file to "
+ << size << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return -r;
+ }
+ }
+ dout(1) << __func__ << " resized " << name << " file to "
+ << byte_u_t(size) << dendl;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ int r = -errno;
+ if (r != -ENOENT) {
+ derr << __func__ << " failed to open " << name << " file: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
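+// Format a new store. Idempotent once 'mkfs_done' has been recorded: it then
+// only re-runs fsck if configured. Otherwise it sets up fsid, block devices,
+// allocator and KV database, persists the initial superblock keys, tears
+// everything back down and optionally fscks the result.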
+int BlueStore::mkfs()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+ int r;
+ uuid_d old_fsid;
+ uint64_t reserved;
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ {
+ string done;
+ r = read_meta("mkfs_done", &done);
+ if (r == 0) {
+ dout(1) << __func__ << " already created" << dendl;
+ if (cct->_conf->bluestore_fsck_on_mkfs) {
+ r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (r < 0) {
+ derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ if (r > 0) {
+ derr << __func__ << " fsck found " << r << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+ return r; // idempotent
+ }
+ }
+
+ {
+ string type;
+ r = read_meta("type", &type);
+ if (r == 0) {
+ if (type != "bluestore") {
+ derr << __func__ << " expected bluestore, but type is " << type << dendl;
+ return -EIO;
+ }
+ } else {
+ r = write_meta("type", "bluestore");
+ if (r < 0)
+ return r;
+ }
+ }
+
+ freelist_type = "bitmap";
+
+ r = _open_path();
+ if (r < 0)
+ return r;
+
+ r = _open_fsid(true);
+ if (r < 0)
+ goto out_path_fd;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = _read_fsid(&old_fsid);
+ if (r < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __func__ << " generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+ }
+ // we'll write it later.
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __func__ << " on-disk fsid " << old_fsid
+ << " != provided " << fsid << dendl;
+ r = -EINVAL;
+ goto out_close_fsid;
+ }
+ fsid = old_fsid;
+ }
+
+ r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
+ cct->_conf->bluestore_block_size,
+ cct->_conf->bluestore_block_create);
+ if (r < 0)
+ goto out_close_fsid;
+ if (cct->_conf->bluestore_bluefs) {
+ r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
+ cct->_conf->bluestore_block_wal_size,
+ cct->_conf->bluestore_block_wal_create);
+ if (r < 0)
+ goto out_close_fsid;
+ r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
+ cct->_conf->bluestore_block_db_size,
+ cct->_conf->bluestore_block_db_create);
+ if (r < 0)
+ goto out_close_fsid;
+ }
+
+ r = _open_bdev(true);
+ if (r < 0)
+ goto out_close_fsid;
+
+ // choose min_alloc_size
+ if (cct->_conf->bluestore_min_alloc_size) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+ } else {
+ ceph_assert(bdev);
+ if (_use_rotational_settings()) {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+ } else {
+ min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+ }
+ }
+ _validate_bdev();
+
+ // make sure min_alloc_size is power of 2 aligned.
+ if (!isp2(min_alloc_size)) {
+ derr << __func__ << " min_alloc_size 0x"
+ << std::hex << min_alloc_size << std::dec
+ << " is not power of 2 aligned!"
+ << dendl;
+ r = -EINVAL;
+ goto out_close_bdev;
+ }
+
+ r = _create_alloc();
+ if (r < 0) {
+ goto out_close_bdev;
+ }
+
+ reserved = _get_ondisk_reserved();
+ shared_alloc.a->init_add_free(reserved,
+ p2align(bdev->get_size(), min_alloc_size) - reserved);
+
+ r = _open_db(true);
+ if (r < 0)
+ goto out_close_alloc;
+
+ {
+ KeyValueDB::Transaction t = db->get_transaction();
+ r = _open_fm(t, true);
+ if (r < 0)
+ goto out_close_db;
+ {
+ bufferlist bl;
+ encode((uint64_t)0, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ }
+
+ {
+ bufferlist bl;
+ encode((uint64_t)min_alloc_size, bl);
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ }
+ {
+ bufferlist bl;
+ if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
+ bl.append(stringify(OMAP_BULK));
+ } else {
+ bl.append(stringify(OMAP_PER_PG));
+ }
+ t->set(PREFIX_SUPER, "per_pool_omap", bl);
+ }
+ ondisk_format = latest_ondisk_format;
+ _prepare_ondisk_format_super(t);
+ db->submit_transaction_sync(t);
+ }
+
+ r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
+ if (r < 0)
+ goto out_close_fm;
+
+ r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
+ if (r < 0)
+ goto out_close_fm;
+
+ if (fsid != old_fsid) {
+ r = _write_fsid();
+ if (r < 0) {
+ derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+ goto out_close_fm;
+ }
+ }
+
+ out_close_fm:
+ _close_fm();
+ out_close_db:
+ _close_db(false);
+ out_close_alloc:
+ _close_alloc();
+ out_close_bdev:
+ _close_bdev();
+ out_close_fsid:
+ _close_fsid();
+ out_path_fd:
+ _close_path();
+
+ if (r == 0 &&
+ cct->_conf->bluestore_fsck_on_mkfs) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ r = -EIO;
+ }
+ }
+
+ if (r == 0) {
+ // indicate success by writing the 'mkfs_done' file
+ r = write_meta("mkfs_done", "yes");
+ }
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+ return r;
+}
+
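+// Attach a brand new dedicated WAL or DB device (BDEV_NEWWAL/BDEV_NEWDB) to
+// an existing store: create the symlink, register the device with BlueFS,
+// fix up its label and the bluefs layout, then let BlueFS prepare it.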
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ int r;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ r = _open_db_and_around(true);
+
+ if (id == BlueFS::BDEV_NEWWAL) {
+ string p = path + "/block.wal";
+ r = _setup_block_symlink_or_file("block.wal", dev_path,
+ cct->_conf->bluestore_block_wal_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, p,
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+
+ bluefs_layout.dedicated_wal = true;
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ string p = path + "/block.db";
+ r = _setup_block_symlink_or_file("block.db", dev_path,
+ cct->_conf->bluestore_block_db_size,
+ true);
+ ceph_assert(r == 0);
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, p,
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ p,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+ }
+
+ bluefs->umount();
+ bluefs->mount();
+
+ r = bluefs->prepare_new_device(id, bluefs_layout);
+ ceph_assert(r == 0);
+
+ if (r < 0) {
+ derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+ } else {
+ dout(0) << __func__ << " success" << dendl;
+ }
+
+ _close_db_and_around(true);
+ return r;
+}
+
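+// Move BlueFS data off the given source devices onto an already attached
+// target (SLOW or DB), provided the target has enough free space; the now
+// unused block.db/block.wal symlinks are removed on success.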
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+ int id)
+{
+ dout(10) << __func__ << " id:" << id << dendl;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ int r = _open_db_and_around(true);
+
+ uint64_t used_space = 0;
+ for(auto src_id : devs_source) {
+ used_space += bluefs->get_used(src_id);
+ }
+ uint64_t target_free = bluefs->get_free(id);
+ if (target_free < used_space) {
+ derr << __func__
+ << " can't migrate, free space at target: " << target_free
+ << " is less than required space: " << used_space
+ << dendl;
+ r = -ENOSPC;
+ goto shutdown;
+ }
+ if (devs_source.count(BlueFS::BDEV_DB)) {
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ bluefs_layout.dedicated_db = false;
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ bluefs_layout.dedicated_wal = false;
+ }
+ r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ goto shutdown;
+ }
+
+ if (devs_source.count(BlueFS::BDEV_DB)) {
+ r = unlink(string(path + "/block.db").c_str());
+ ceph_assert(r == 0);
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ r = unlink(string(path + "/block.wal").c_str());
+ ceph_assert(r == 0);
+ }
+
+shutdown:
+ _close_db_and_around(true);
+ return r;
+}
+
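+// Same as above, but migrating onto a brand new device: it is added to
+// BlueFS (and labeled where supported) first, and the corresponding
+// block.db/block.wal symlink is re-created to point at the new path.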
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+ int id,
+ const string& dev_path)
+{
+ dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+ int r;
+ ceph_assert(path_fd < 0);
+
+ ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+ if (!cct->_conf->bluestore_bluefs) {
+ derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+ return -EIO;
+ }
+
+ r = _open_db_and_around(true);
+
+ string link_db;
+ string link_wal;
+ if (devs_source.count(BlueFS::BDEV_DB) &&
+ bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+ link_db = path + "/block.db";
+ bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+ bluefs_layout.dedicated_db = false;
+ }
+ if (devs_source.count(BlueFS::BDEV_WAL)) {
+ link_wal = path + "/block.wal";
+ bluefs_layout.dedicated_wal = false;
+ }
+
+ size_t target_size;
+ string target_name;
+ if (id == BlueFS::BDEV_NEWWAL) {
+ target_name = "block.wal";
+ target_size = cct->_conf->bluestore_block_wal_size;
+ bluefs_layout.dedicated_wal = true;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWWAL, dev_path,
+ cct->_conf->bdev_enable_discard,
+ BDEV_LABEL_BLOCK_SIZE);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+ "bluefs wal",
+ true);
+ ceph_assert(r == 0);
+ }
+ } else if (id == BlueFS::BDEV_NEWDB) {
+ target_name = "block.db";
+ target_size = cct->_conf->bluestore_block_db_size;
+ bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+ bluefs_layout.dedicated_db = true;
+
+ r = bluefs->add_block_device(BlueFS::BDEV_NEWDB, dev_path,
+ cct->_conf->bdev_enable_discard,
+ SUPER_RESERVED);
+ ceph_assert(r == 0);
+
+ if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+ r = _check_or_set_bdev_label(
+ dev_path,
+ bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+ "bluefs db",
+ true);
+ ceph_assert(r == 0);
+ }
+ }
+
+ bluefs->umount();
+ bluefs->mount();
+
+ r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
+
+ if (r < 0) {
+ derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+ goto shutdown;
+ }
+
+ if (!link_db.empty()) {
+ r = unlink(link_db.c_str());
+ ceph_assert(r == 0);
+ }
+ if (!link_wal.empty()) {
+ r = unlink(link_wal.c_str());
+ ceph_assert(r == 0);
+ }
+ r = _setup_block_symlink_or_file(
+ target_name,
+ dev_path,
+ target_size,
+ true);
+ ceph_assert(r == 0);
+ dout(0) << __func__ << " success" << dendl;
+
+shutdown:
+ _close_db_and_around(true);
+
+ return r;
+}
+
+string BlueStore::get_device_path(unsigned id)
+{
+ string res;
+ if (id < BlueFS::MAX_BDEV) {
+ switch (id) {
+ case BlueFS::BDEV_WAL:
+ res = path + "/block.wal";
+ break;
+ case BlueFS::BDEV_DB:
+ if (id == bluefs_layout.shared_bdev) {
+ res = path + "/block";
+ } else {
+ res = path + "/block.db";
+ }
+ break;
+ case BlueFS::BDEV_SLOW:
+ res = path + "/block";
+ break;
+ }
+ }
+ return res;
+}
+
+int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
+{
+ bluestore_bdev_label_t label;
+ int r = _read_bdev_label(cct, path, &label);
+ if (r < 0) {
+ derr << "unable to read label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ } else {
+ label.size = size;
+ r = _write_bdev_label(cct, path, label);
+ if (r < 0) {
+ derr << "unable to write label for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ }
+ }
+ return r;
+}
+
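+// Grow WAL/DB and the main device to their current physical sizes: update
+// size labels where supported, and for the main device also rewrite the
+// freelist metadata, followed by a short R/W mount to sync the change.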
+int BlueStore::expand_devices(ostream& out)
+{
+ int r = _open_db_and_around(true);
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ out << "Expanding DB/WAL..." << std::endl;
+ for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
+ if (devid == bluefs_layout.shared_bdev ) {
+ continue;
+ }
+ uint64_t size = bluefs->get_block_device_size(devid);
+ if (size == 0) {
+ // no bdev
+ continue;
+ }
+
+ out << devid
+ <<" : expanding " << " to 0x" << size << std::dec << std::endl;
+ string p = get_device_path(devid);
+ const char* path = p.c_str();
+ if (path == nullptr) {
+ derr << devid
+ <<": can't find device path " << dendl;
+ continue;
+ }
+ if (bluefs->bdev_support_label(devid)) {
+ if (_set_bdev_label_size(p, size) >= 0) {
+ out << devid
+ << " : size label updated to " << size
+ << std::endl;
+ }
+ }
+ }
+ uint64_t size0 = fm->get_size();
+ uint64_t size = bdev->get_size();
+ if (size0 < size) {
+ out << bluefs_layout.shared_bdev
+ << " : expanding " << " from 0x" << std::hex
+ << size0 << " to 0x" << size << std::dec << std::endl;
+ _write_out_fm_meta(size);
+ if (bdev->supported_bdev_label()) {
+ if (_set_bdev_label_size(path, size) >= 0) {
+ out << bluefs_layout.shared_bdev
+ << " : size label updated to " << size
+ << std::endl;
+ }
+ }
+ _close_db_and_around(true);
+
+ // mount in read/write to sync expansion changes
+ r = _mount();
+ ceph_assert(r == 0);
+ umount();
+ } else {
+ _close_db_and_around(true);
+ }
+ return r;
+}
+
+int BlueStore::dump_bluefs_sizes(ostream& out)
+{
+ int r = _open_db_and_around(true);
+ ceph_assert(r == 0);
+ bluefs->dump_block_extents(out);
+ _close_db_and_around(true);
+ return r;
+}
+
+void BlueStore::set_cache_shards(unsigned num)
+{
+ dout(10) << __func__ << " " << num << dendl;
+ size_t oold = onode_cache_shards.size();
+ size_t bold = buffer_cache_shards.size();
+ ceph_assert(num >= oold && num >= bold);
+ onode_cache_shards.resize(num);
+ buffer_cache_shards.resize(num);
+ for (unsigned i = oold; i < num; ++i) {
+ onode_cache_shards[i] =
+ OnodeCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+ for (unsigned i = bold; i < num; ++i) {
+ buffer_cache_shards[i] =
+ BufferCacheShard::create(cct, cct->_conf->bluestore_cache_type,
+ logger);
+ }
+}
+
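+// Regular mount path: optional fsck, open the DB and surrounding state,
+// replay deferred writes, start the kv/cleaner/mempool threads and, if
+// configured, quick-fix legacy statfs/omap layouts on the fly.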
+int BlueStore::_mount()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+
+ _kv_only = false;
+ if (cct->_conf->bluestore_fsck_on_mount) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+
+ if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+ derr << __func__ << " osd_max_object_size "
+ << cct->_conf->osd_max_object_size << " > bluestore max "
+ << OBJECT_MAX_SIZE << dendl;
+ return -EINVAL;
+ }
+
+ int r = _open_db_and_around(false);
+ if (r < 0) {
+ return r;
+ }
+
+ r = _upgrade_super();
+ if (r < 0) {
+ goto out_db;
+ }
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ r = _reload_logger();
+ if (r < 0)
+ goto out_coll;
+
+ _kv_start();
+
+ if (bdev->is_smr()) {
+ _zoned_cleaner_start();
+ }
+
+ r = _deferred_replay();
+ if (r < 0)
+ goto out_stop;
+
+ mempool_thread.init();
+
+ if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
+ cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
+
+ auto was_per_pool_omap = per_pool_omap;
+
+ dout(1) << __func__ << " quick-fix on mount" << dendl;
+ _fsck_on_open(FSCK_SHALLOW, true);
+
+ //reread statfs
+ //FIXME minor: replace with actual open/close?
+ _open_statfs();
+ _check_legacy_statfs_alert();
+
+ //set again as hopefully it has been fixed
+ if (was_per_pool_omap != OMAP_PER_PG) {
+ _set_per_pool_omap();
+ }
+ }
+
+ mounted = true;
+ return 0;
+
+ out_stop:
+ if (bdev->is_smr()) {
+ _zoned_cleaner_stop();
+ }
+ _kv_stop();
+ out_coll:
+ _shutdown_cache();
+ out_db:
+ _close_db_and_around(false);
+ return r;
+}
+
+int BlueStore::umount()
+{
+ ceph_assert(_kv_only || mounted);
+ dout(1) << __func__ << dendl;
+
+ _osr_drain_all();
+
+ mounted = false;
+ if (!_kv_only) {
+ mempool_thread.shutdown();
+ if (bdev->is_smr()) {
+ dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
+ _zoned_cleaner_stop();
+ }
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ _shutdown_cache();
+ dout(20) << __func__ << " closing" << dendl;
+
+ }
+ _close_db_and_around(false);
+
+ if (cct->_conf->bluestore_fsck_on_umount) {
+ int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
+ if (rc < 0)
+ return rc;
+ if (rc > 0) {
+ derr << __func__ << " fsck found " << rc << " errors" << dendl;
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::cold_open()
+{
+ return _open_db_and_around(true);
+}
+
+int BlueStore::cold_close()
+{
+ _close_db_and_around(true);
+ return 0;
+}
+
+// derr wrapper to limit enormous output and avoid log flooding.
+// For now it is used only in the few places where such output is expected.
+#define fsck_derr(err_cnt, threshold) \
+ if (err_cnt <= threshold) { \
+ bool need_skip_print = err_cnt == threshold; \
+ derr
+
+#define fsck_dendl \
+ dendl; \
+ if (need_skip_print) \
+ derr << "more error lines skipped..." << dendl; \
+ }
+
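+// Shallow-fsck helper: only accumulate allocated (and compressed-allocated)
+// space for the given extents; no per-block cross-checking is done here.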
+int _fsck_sum_extents(
+ const PExtentVector& extents,
+ bool compressed,
+ store_statfs_t& expected_statfs)
+{
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ }
+ return 0;
+}
+
+int BlueStore::_fsck_check_extents(
+ std::string_view ctx_descr,
+ const PExtentVector& extents,
+ bool compressed,
+ mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
+ BlueStoreRepairer* repairer,
+ store_statfs_t& expected_statfs,
+ FSCKDepth depth)
+{
+ dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
+ int errors = 0;
+ for (auto e : extents) {
+ if (!e.is_valid())
+ continue;
+ expected_statfs.allocated += e.length;
+ if (compressed) {
+ expected_statfs.data_compressed_allocated += e.length;
+ }
+ if (depth != FSCK_SHALLOW) {
+ bool already = false;
+ apply_for_bitset_range(
+ e.offset, e.length, granularity, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ if (bs.test(pos)) {
+ if (repairer) {
+ repairer->note_misreference(
+ pos * min_alloc_size, min_alloc_size, !already);
+ }
+ if (!already) {
+ derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
+ << " or a subset is already allocated (misreferenced)" << dendl;
+ ++errors;
+ already = true;
+ }
+ }
+ else
+ bs.set(pos);
+ });
+
+ if (e.end() > bdev->get_size()) {
+ derr << "fsck error: " << ctx_descr << ", extent " << e
+ << " past end of block device" << dendl;
+ ++errors;
+ }
+ }
+ }
+ return errors;
+}
+
+void BlueStore::_fsck_check_pool_statfs(
+ BlueStore::per_pool_statfs& expected_pool_statfs,
+ int64_t& errors,
+ int64_t& warnings,
+ BlueStoreRepairer* repairer)
+{
+ auto it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ if (key == BLUESTORE_GLOBAL_STATFS_KEY) {
+ if (repairer) {
+ ++errors;
+ repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
+ derr << "fsck error: " << "legacy statfs record found, removing"
+ << dendl;
+ }
+ continue;
+ }
+ uint64_t pool_id;
+ if (get_key_pool_stat(key, &pool_id) < 0) {
+ derr << "fsck error: bad key " << key
+ << "in statfs namespece" << dendl;
+ if (repairer) {
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ ++errors;
+ continue;
+ }
+
+ volatile_statfs vstatfs;
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ vstatfs.decode(blp);
+ } catch (ceph::buffer::error& e) {
+ derr << "fsck error: failed to decode Pool StatFS record"
+ << pretty_binary_string(key) << dendl;
+ if (repairer) {
+ dout(20) << __func__ << " undecodable Pool StatFS record, key:'"
+ << pretty_binary_string(key)
+ << "', removing" << dendl;
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ ++errors;
+ vstatfs.reset();
+ }
+ auto stat_it = expected_pool_statfs.find(pool_id);
+ if (stat_it == expected_pool_statfs.end()) {
+ if (vstatfs.is_empty()) {
+ // we don't consider that as an error since empty pool statfs
+ // are left in DB for now
+ dout(20) << "fsck inf: found empty stray Pool StatFS record for pool id 0x"
+ << std::hex << pool_id << std::dec << dendl;
+ if (repairer) {
+ // but we need to increment error count in case of repair
+ // to have proper counters at the end
+ // (as repairer increments recovery counter anyway).
+ ++errors;
+ }
+ } else {
+ derr << "fsck error: found stray Pool StatFS record for pool id 0x"
+ << std::hex << pool_id << std::dec << dendl;
+ ++errors;
+ }
+ if (repairer) {
+ repairer->remove_key(db, PREFIX_STAT, key);
+ }
+ continue;
+ }
+ store_statfs_t statfs;
+ vstatfs.publish(&statfs);
+ if (!(stat_it->second == statfs)) {
+ derr << "fsck error: actual " << statfs
+ << " != expected " << stat_it->second
+ << " for pool "
+ << std::hex << pool_id << std::dec << dendl;
+ if (repairer) {
+ repairer->fix_statfs(db, key, stat_it->second);
+ }
+ ++errors;
+ }
+ expected_pool_statfs.erase(stat_it);
+ }
+ } // if (it)
+ for (auto& s : expected_pool_statfs) {
+ if (s.second.is_zero()) {
+ // we might lack empty statfs recs in DB
+ continue;
+ }
+ derr << "fsck error: missing Pool StatFS record for pool "
+ << std::hex << s.first << std::dec << dendl;
+ if (repairer) {
+ string key;
+ get_pool_stat_key(s.first, &key);
+ repairer->fix_statfs(db, key, s.second);
+ }
+ ++errors;
+ }
+ if (!per_pool_stat_collection &&
+ repairer) {
+ // by virtue of running this method, we correct the top-level
+ // error of having global stats
+ repairer->inc_repaired();
+ }
+}
+
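+// Repair shared blobs whose reference counts disagree with the ref tracker:
+// walk all onodes twice (detect broken sbids, then rebuild their ref maps),
+// rewrite the affected shared blob records in batches and drop stray ones.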
+void BlueStore::_fsck_repair_shared_blobs(
+ BlueStoreRepairer& repairer,
+ shared_blob_2hash_tracker_t& sb_ref_counts,
+ sb_info_space_efficient_map_t& sb_info)
+{
+ auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
+ dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
+ << sb_ref_mismatches << dendl;
+ if (!sb_ref_mismatches) // not expected to happen, just in case
+ return;
+
+
+ auto foreach_shared_blob = [&](std::function<
+ void (coll_t,
+ ghobject_t,
+ uint64_t,
+ const bluestore_blob_t&)> cb) {
+ auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ CollectionRef c;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key())
+ << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ }
+ dout(20) << __func__
+ << " inspecting shared blob refs for col:" << c->cid
+ << " obj:" << oid
+ << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+ _dump_onode<30>(cct, *o);
+
+ mempool::bluestore_fsck::set<BlobRef> passed_sbs;
+ for (auto& e : o->extent_map.extent_map) {
+ auto& b = e.blob->get_blob();
+ if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
+ auto sbid = e.blob->shared_blob->get_sbid();
+ cb(c->cid, oid, sbid, b);
+ passed_sbs.emplace(e.blob);
+ }
+ } // for ... extent_map
+ } // for ... it->valid
+ } //if (it(PREFIX_OBJ))
+ }; //foreach_shared_blob fn declaration
+
+ mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
+
+ // first iteration over objects to identify all the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.lower_bound(sbid);
+ if(it != refs_map.end() && it->first == sbid) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid() &&
+ !sb_ref_counts.test_all_zero_range(sbid,
+ p.offset,
+ p.length)) {
+ refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
+ dout(20) << __func__
+ << " broken shared blob found for col:" << cid
+ << " obj:" << oid
+ << " sbid 0x " << std::hex << sbid << std::dec
+ << dendl;
+ break;
+ }
+ }
+ });
+
+ // second iteration over objects to build new ref map for the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.find(sbid);
+ if(it == refs_map.end()) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid()) {
+ it->second.get(p.offset, p.length);
+ break;
+ }
+ }
+ });
+
+ // update shared blob records
+ auto ref_it = refs_map.begin();
+ while (ref_it != refs_map.end()) {
+ size_t cnt = 0;
+ const size_t max_transactions = 4096;
+ KeyValueDB::Transaction txn = db->get_transaction();
+ for (cnt = 0;
+ cnt < max_transactions && ref_it != refs_map.end();
+ ref_it++) {
+ auto sbid = ref_it->first;
+ dout(20) << __func__ << " repaired shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << ref_it->second << dendl;
+ repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
+ cnt++;
+ }
+ if (cnt) {
+ db->submit_transaction_sync(txn);
+ cnt = 0;
+ }
+ }
+ // remove stray shared blob records
+ size_t cnt = 0;
+ const size_t max_transactions = 4096;
+ KeyValueDB::Transaction txn = db->get_transaction();
+ sb_info.foreach_stray([&](const sb_info_t& sbi) {
+ auto sbid = sbi.get_sbid();
+ dout(20) << __func__ << " removing stray shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << dendl;
+ repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+ cnt++;
+ if (cnt >= max_transactions) {
+ db->submit_transaction_sync(txn);
+ txn = db->get_transaction();
+ cnt = 0;
+ }
+ });
+ if (cnt > 0) {
+ db->submit_transaction_sync(txn);
+ }
+
+ // amount of repairs to report to be equal to previously
+ // determined error estimation, not the actual number of updated shared blobs
+ repairer.inc_repaired(sb_ref_mismatches);
+}
+
+BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
+ BlueStore::FSCKDepth depth,
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value,
+ mempool::bluestore_fsck::list<string>* expecting_shards,
+ map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+ const BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto& num_objects = ctx.num_objects;
+ auto& num_extents = ctx.num_extents;
+ auto& num_blobs = ctx.num_blobs;
+ auto& num_sharded_objects = ctx.num_sharded_objects;
+ auto& num_spanning_blobs = ctx.num_spanning_blobs;
+ auto used_blocks = ctx.used_blocks;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto& sb_ref_counts = ctx.sb_ref_counts;
+ auto repairer = ctx.repairer;
+
+ store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
+ &ctx.expected_pool_statfs[pool_id] :
+ &ctx.expected_store_statfs;
+
+ dout(10) << __func__ << " " << oid << dendl;
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, key, value));
+ ++num_objects;
+
+ num_spanning_blobs += o->extent_map.spanning_blob_map.size();
+
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ _dump_onode<30>(cct, *o);
+ // shards
+ if (!o->extent_map.shards.empty()) {
+ ++num_sharded_objects;
+ if (depth != FSCK_SHALLOW) {
+ ceph_assert(expecting_shards);
+ for (auto& s : o->extent_map.shards) {
+ dout(20) << __func__ << " shard " << *s.shard_info << dendl;
+ expecting_shards->push_back(string());
+ get_extent_shard_key(o->key, s.shard_info->offset,
+ &expecting_shards->back());
+ if (s.shard_info->offset >= o->onode.size) {
+ derr << "fsck error: " << oid << " shard 0x" << std::hex
+ << s.shard_info->offset << " past EOF at 0x" << o->onode.size
+ << std::dec << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+
+ // lextents
+ uint64_t pos = 0;
+ mempool::bluestore_fsck::map<BlobRef,
+ bluestore_blob_use_tracker_t> ref_map;
+ for (auto& l : o->extent_map.extent_map) {
+ dout(20) << __func__ << " " << l << dendl;
+ if (l.logical_offset < pos) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset
+ << " overlaps with the previous, which ends at 0x" << pos
+ << std::dec << dendl;
+ ++errors;
+ }
+ if (depth != FSCK_SHALLOW &&
+ o->extent_map.spans_shard(l.logical_offset, l.length)) {
+ derr << "fsck error: " << oid << " lextent at 0x"
+ << std::hex << l.logical_offset << "~" << l.length
+ << " spans a shard boundary"
+ << std::dec << dendl;
+ ++errors;
+ }
+ pos = l.logical_offset + l.length;
+ res_statfs->data_stored += l.length;
+ ceph_assert(l.blob);
+ const bluestore_blob_t& blob = l.blob->get_blob();
+
+ auto& ref = ref_map[l.blob];
+ if (ref.is_empty()) {
+ uint32_t min_release_size = blob.get_release_size(min_alloc_size);
+ uint32_t l = blob.get_logical_length();
+ ref.init(l, min_release_size);
+ }
+ ref.get(
+ l.blob_offset,
+ l.length);
+ ++num_extents;
+ if (depth != FSCK_SHALLOW &&
+ blob.has_unused()) {
+ ceph_assert(referenced);
+ auto p = referenced->find(l.blob);
+ bluestore_blob_t::unused_t* pu;
+ if (p == referenced->end()) {
+ pu = &(*referenced)[l.blob];
+ }
+ else {
+ pu = &p->second;
+ }
+ uint64_t blob_len = blob.get_logical_length();
+ ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
+ ceph_assert(l.blob_offset + l.length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
+ uint64_t start = l.blob_offset / chunk_size;
+ uint64_t end =
+ round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ (*pu) |= (1u << i);
+ }
+ }
+ } //for (auto& l : o->extent_map.extent_map)
+
+ for (auto& i : ref_map) {
+ ++num_blobs;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ bool equal =
+ depth == FSCK_SHALLOW ? true :
+ i.first->get_blob_use_tracker().equal(i.second);
+ if (!equal) {
+ derr << "fsck error: " << oid << " blob " << *i.first
+ << " doesn't match expected ref_map " << i.second << dendl;
+ ++errors;
+ }
+ if (blob.is_compressed()) {
+ res_statfs->data_compressed += blob.get_compressed_payload_length();
+ res_statfs->data_compressed_original +=
+ i.first->get_referenced_bytes();
+ }
+ if (depth != FSCK_SHALLOW && repairer) {
+ for (auto e : blob.get_extents()) {
+ if (!e.is_valid())
+ continue;
+ repairer->set_space_used(e.offset, e.length, c->cid, oid);
+ }
+ }
+ if (blob.is_shared()) {
+ if (i.first->shared_blob->get_sbid() > blobid_max) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
+ << blobid_max << dendl;
+ ++errors;
+ } else if (i.first->shared_blob->get_sbid() == 0) {
+ derr << "fsck error: " << oid << " blob " << blob
+ << " marked as shared but has uninitialized sbid"
+ << dendl;
+ ++errors;
+ }
+ // the below lock is optional and provided in multithreading mode only
+ if (sb_info_lock) {
+ sb_info_lock->lock();
+ }
+ auto sbid = i.first->shared_blob->get_sbid();
+ sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
+ ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
+ sbi.pool_id == oid.hobj.get_logical_pool());
+ sbi.pool_id = oid.hobj.get_logical_pool();
+ bool compressed = blob.is_compressed();
+ for (auto e : blob.get_extents()) {
+ if (e.is_valid()) {
+ if (compressed) {
+ ceph_assert(sbi.allocated_chunks <= 0);
+ sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
+ } else {
+ ceph_assert(sbi.allocated_chunks >= 0);
+ sbi.allocated_chunks += (e.length >> min_alloc_size_order);
+ }
+ sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
+ }
+ }
+ if (sb_info_lock) {
+ sb_info_lock->unlock();
+ }
+ } else if (depth != FSCK_SHALLOW) {
+ ceph_assert(used_blocks);
+ string ctx_descr = " oid " + stringify(oid);
+ errors += _fsck_check_extents(ctx_descr,
+ blob.get_extents(),
+ blob.is_compressed(),
+ *used_blocks,
+ fm->get_alloc_size(),
+ repairer,
+ *res_statfs,
+ depth);
+ } else {
+ errors += _fsck_sum_extents(
+ blob.get_extents(),
+ blob.is_compressed(),
+ *res_statfs);
+ }
+ } // for (auto& i : ref_map)
+
+ {
+ auto &sbm = o->extent_map.spanning_blob_map;
+ size_t broken = 0;
+ BlobRef first_broken;
+ for (auto it = sbm.begin(); it != sbm.end();) {
+ auto it1 = it++;
+ if (ref_map.count(it1->second) == 0) {
+ if (!broken) {
+ first_broken = it1->second;
+ ++errors;
+ }
+ broken++;
+ if (repairer) {
+ sbm.erase(it1);
+ }
+ }
+ }
+ if (broken) {
+ derr << "fsck error: " << oid << " - " << broken
+ << " zombie spanning blob(s) found, the first one: "
+ << *first_broken << dendl;
+ if(repairer) {
+ repairer->fix_spanning_blobs(
+ db,
+ [&](KeyValueDB::Transaction txn) {
+ _record_onode(o, txn);
+ });
+ }
+ }
+ }
+
+ if (o->onode.has_omap()) {
+ _fsck_check_object_omap(depth, o, ctx);
+ }
+
+ return o;
+}
+
+#include "common/WorkQueue.h"
+
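+// Thread pool used by shallow fsck: incoming objects are packed into
+// fixed-size batches which worker threads claim opportunistically; per-batch
+// counters are folded back into the caller's FSCK_ObjectCtx in finalize().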
+class ShallowFSCKThreadPool : public ThreadPool
+{
+public:
+ ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
+ ThreadPool(cct_, nm, tn, n) {
+ }
+ void worker(ThreadPool::WorkThread* wt) override {
+ int next_wq = 0;
+ while (!_stop) {
+ next_wq %= work_queues.size();
+ WorkQueue_ *wq = work_queues[next_wq++];
+
+ void* item = wq->_void_dequeue();
+ if (item) {
+ processing++;
+ TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
+ wq->_void_process(item, tp_handle);
+ processing--;
+ }
+ }
+ }
+ template <size_t BatchLen>
+ struct FSCKWorkQueue : public ThreadPool::WorkQueue_
+ {
+ struct Entry {
+ int64_t pool_id;
+ BlueStore::CollectionRef c;
+ ghobject_t oid;
+ string key;
+ bufferlist value;
+ };
+ struct Batch {
+ std::atomic<size_t> running = { 0 };
+ size_t entry_count = 0;
+ std::array<Entry, BatchLen> entries;
+
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ uint64_t num_spanning_blobs = 0;
+ store_statfs_t expected_store_statfs;
+ BlueStore::per_pool_statfs expected_pool_statfs;
+ };
+
+ size_t batchCount;
+ BlueStore* store = nullptr;
+
+ ceph::mutex* sb_info_lock = nullptr;
+ sb_info_space_efficient_map_t* sb_info = nullptr;
+ shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
+ BlueStoreRepairer* repairer = nullptr;
+
+ Batch* batches = nullptr;
+ size_t last_batch_pos = 0;
+ bool batch_acquired = false;
+
+ FSCKWorkQueue(std::string n,
+ size_t _batchCount,
+ BlueStore* _store,
+ ceph::mutex* _sb_info_lock,
+ sb_info_space_efficient_map_t& _sb_info,
+ shared_blob_2hash_tracker_t& _sb_ref_counts,
+ BlueStoreRepairer* _repairer) :
+ WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
+ batchCount(_batchCount),
+ store(_store),
+ sb_info_lock(_sb_info_lock),
+ sb_info(&_sb_info),
+ sb_ref_counts(&_sb_ref_counts),
+ repairer(_repairer)
+ {
+ batches = new Batch[batchCount];
+ }
+ ~FSCKWorkQueue() {
+ delete[] batches;
+ }
+
+ /// Remove all work items from the queue.
+ void _clear() override {
+ //do nothing
+ }
+ /// Check whether there is anything to do.
+ bool _empty() override {
+ ceph_assert(false);
+ }
+
+ /// Get the next work item to process.
+ void* _void_dequeue() override {
+ size_t pos = rand() % batchCount;
+ size_t pos0 = pos;
+ do {
+ auto& batch = batches[pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count) {
+ return &batch;
+ }
+ }
+ batch.running--;
+ pos++;
+ pos %= batchCount;
+ } while (pos != pos0);
+ return nullptr;
+ }
+ /** @brief Process the work item.
+ * This function will be called several times in parallel
+ * and must therefore be thread-safe. */
+ void _void_process(void* item, TPHandle& handle) override {
+ Batch* batch = (Batch*)item;
+
+ BlueStore::FSCK_ObjectCtx ctx(
+ batch->errors,
+ batch->warnings,
+ batch->num_objects,
+ batch->num_extents,
+ batch->num_blobs,
+ batch->num_sharded_objects,
+ batch->num_spanning_blobs,
+ nullptr, // used_blocks
+ nullptr, //used_omap_head
+ sb_info_lock,
+ *sb_info,
+ *sb_ref_counts,
+ batch->expected_store_statfs,
+ batch->expected_pool_statfs,
+ repairer);
+
+ for (size_t i = 0; i < batch->entry_count; i++) {
+ auto& entry = batch->entries[i];
+
+ store->fsck_check_objects_shallow(
+ BlueStore::FSCK_SHALLOW,
+ entry.pool_id,
+ entry.c,
+ entry.oid,
+ entry.key,
+ entry.value,
+ nullptr, // expecting_shards - this will need protection if passed
+ nullptr, // referenced
+ ctx);
+ }
+ //std::cout << "processed " << batch << std::endl;
+ batch->entry_count = 0;
+ batch->running--;
+ }
+ /** @brief Synchronously finish processing a work item.
+ * This function is called after _void_process with the global thread pool lock held,
+ * so at most one copy will execute simultaneously for a given thread pool.
+ * It can be used for non-thread-safe finalization. */
+ void _void_process_finish(void*) override {
+ ceph_assert(false);
+ }
+
+ bool queue(
+ int64_t pool_id,
+ BlueStore::CollectionRef c,
+ const ghobject_t& oid,
+ const string& key,
+ const bufferlist& value) {
+ bool res = false;
+ size_t pos0 = last_batch_pos;
+ if (!batch_acquired) {
+ do {
+ auto& batch = batches[last_batch_pos];
+ if (batch.running.fetch_add(1) == 0) {
+ if (batch.entry_count < BatchLen) {
+ batch_acquired = true;
+ break;
+ }
+ }
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ } while (last_batch_pos != pos0);
+ }
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ ceph_assert(batch.entry_count < BatchLen);
+
+ auto& entry = batch.entries[batch.entry_count];
+ entry.pool_id = pool_id;
+ entry.c = c;
+ entry.oid = oid;
+ entry.key = key;
+ entry.value = value;
+
+ ++batch.entry_count;
+ if (batch.entry_count == BatchLen) {
+ batch_acquired = false;
+ batch.running.fetch_sub(1);
+ last_batch_pos++;
+ last_batch_pos %= batchCount;
+ }
+ res = true;
+ }
+ return res;
+ }
+
+ void finalize(ThreadPool& tp,
+ BlueStore::FSCK_ObjectCtx& ctx) {
+ if (batch_acquired) {
+ auto& batch = batches[last_batch_pos];
+ ceph_assert(batch.running);
+ batch.running.fetch_sub(1);
+ }
+ tp.stop();
+
+ for (size_t i = 0; i < batchCount; i++) {
+ auto& batch = batches[i];
+
+ //process leftovers if any
+ if (batch.entry_count) {
+ TPHandle tp_handle(store->cct,
+ nullptr,
+ timeout_interval,
+ suicide_interval);
+ ceph_assert(batch.running == 0);
+
+ batch.running++; // just to be on par with the regular call
+ _void_process(&batch, tp_handle);
+ }
+ ceph_assert(batch.entry_count == 0);
+
+ ctx.errors += batch.errors;
+ ctx.warnings += batch.warnings;
+ ctx.num_objects += batch.num_objects;
+ ctx.num_extents += batch.num_extents;
+ ctx.num_blobs += batch.num_blobs;
+ ctx.num_sharded_objects += batch.num_sharded_objects;
+ ctx.num_spanning_blobs += batch.num_spanning_blobs;
+
+ ctx.expected_store_statfs.add(batch.expected_store_statfs);
+
+ for (auto it = batch.expected_pool_statfs.begin();
+ it != batch.expected_pool_statfs.end();
+ it++) {
+ ctx.expected_pool_statfs[it->first].add(it->second);
+ }
+ }
+ }
+ };
+};
+
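+// Validate an object's omap layout against the store-wide per_pool_omap
+// mode and, when repairing, convert legacy omap keys to the per-pg prefix
+// in roughly 16MB transactions before removing the old keys.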
+void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
+ OnodeRef& o,
+ const BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto& warnings = ctx.warnings;
+ auto repairer = ctx.repairer;
+
+ ceph_assert(o->onode.has_omap());
+ if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
+ if (per_pool_omap == OMAP_PER_POOL) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: " << o->oid
+ << " has omap that is not per-pool or pgmeta"
+ << fsck_dendl;
+ ++errors;
+ } else {
+ const char* w;
+ int64_t num;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+ ++errors;
+ num = errors;
+ w = "error";
+ } else {
+ ++warnings;
+ num = warnings;
+ w = "warning";
+ }
+ fsck_derr(num, MAX_FSCK_ERROR_LINES)
+ << "fsck " << w << ": " << o->oid
+ << " has omap that is not per-pool or pgmeta"
+ << fsck_dendl;
+ }
+ } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
+ if (per_pool_omap == OMAP_PER_PG) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: " << o->oid
+ << " has omap that is not per-pg or pgmeta"
+ << fsck_dendl;
+ ++errors;
+ } else {
+ const char* w;
+ int64_t num;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
+ ++errors;
+ num = errors;
+ w = "error";
+ } else {
+ ++warnings;
+ num = warnings;
+ w = "warning";
+ }
+ fsck_derr(num, MAX_FSCK_ERROR_LINES)
+ << "fsck " << w << ": " << o->oid
+ << " has omap that is not per-pg or pgmeta"
+ << fsck_dendl;
+ }
+ }
+ if (repairer &&
+ !o->onode.is_perpg_omap() &&
+ !o->onode.is_pgmeta_omap()) {
+ dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
+ bufferlist header;
+ map<string, bufferlist> kv;
+ {
+ KeyValueDB::Transaction txn = db->get_transaction();
+ uint64_t txn_cost = 0;
+ const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
+ uint8_t new_flags = o->onode.flags |
+ bluestore_onode_t::FLAG_PERPOOL_OMAP |
+ bluestore_onode_t::FLAG_PERPG_OMAP;
+ const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
+
+ KeyValueDB::Iterator it = db->get_iterator(prefix);
+ string head, tail;
+ o->get_omap_header(&head);
+ o->get_omap_tail(&tail);
+ it->lower_bound(head);
+ // head
+ if (it->valid() && it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ header = it->value();
+ if (header.length()) {
+ string new_head;
+ Onode::calc_omap_header(new_flags, o.get(), &new_head);
+ txn->set(new_omap_prefix, new_head, header);
+ txn_cost += new_head.length() + header.length();
+ }
+ it->next();
+ }
+ // tail
+ {
+ string new_tail;
+ Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
+ bufferlist empty;
+ txn->set(new_omap_prefix, new_tail, empty);
+ txn_cost += new_tail.length() + new_tail.length();
+ }
+ // values
+ string final_key;
+ Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
+ size_t base_key_len = final_key.size();
+ while (it->valid() && it->key() < tail) {
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+
+ final_key.resize(base_key_len);
+ final_key += user_key;
+ auto v = it->value();
+ txn->set(new_omap_prefix, final_key, v);
+ txn_cost += final_key.length() + v.length();
+
+ // submit a portion if cost exceeds 16MB
+ if (txn_cost >= 16 * (1 << 20) ) {
+ db->submit_transaction_sync(txn);
+ txn = db->get_transaction();
+ txn_cost = 0;
+ }
+ it->next();
+ }
+ if (txn_cost > 0) {
+ db->submit_transaction_sync(txn);
+ }
+ }
+ // finalize: remove legacy data
+ {
+ KeyValueDB::Transaction txn = db->get_transaction();
+ // remove old keys
+ const string& old_omap_prefix = o->get_omap_prefix();
+ string old_head, old_tail;
+ o->get_omap_header(&old_head);
+ o->get_omap_tail(&old_tail);
+ txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
+ txn->rmkey(old_omap_prefix, old_tail);
+ // set flag
+ o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+ repairer->inc_repaired();
+ repairer->request_compaction();
+ }
+ }
+}
+
+void BlueStore::_fsck_check_objects(FSCKDepth depth,
+ BlueStore::FSCK_ObjectCtx& ctx)
+{
+ auto& errors = ctx.errors;
+ auto sb_info_lock = ctx.sb_info_lock;
+ auto& sb_info = ctx.sb_info;
+ auto& sb_ref_counts = ctx.sb_ref_counts;
+ auto repairer = ctx.repairer;
+
+ uint64_t_btree_t used_nids;
+
+ size_t processed_myself = 0;
+
+ auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ mempool::bluestore_fsck::list<string> expecting_shards;
+ if (it) {
+ const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
+ typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
+ std::unique_ptr<WQ> wq(
+ new WQ(
+ "FSCKWorkQueue",
+ (thread_count ? : 1) * 32,
+ this,
+ sb_info_lock,
+ sb_info,
+ sb_ref_counts,
+ repairer));
+
+ ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
+
+ thread_pool.add_work_queue(wq.get());
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ //not the best place but let's check anyway
+ ceph_assert(sb_info_lock);
+ thread_pool.start();
+ }
+
+ // fill global if not overridden below
+ CollectionRef c;
+ int64_t pool_id = -1;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ if (depth == FSCK_SHALLOW) {
+ continue;
+ }
+ while (!expecting_shards.empty() &&
+ expecting_shards.front() < it->key()) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(expecting_shards.front())
+ << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ }
+ if (!expecting_shards.empty() &&
+ expecting_shards.front() == it->key()) {
+ // all good
+ expecting_shards.pop_front();
+ continue;
+ }
+
+ uint32_t offset;
+ string okey;
+ get_key_extent_shard(it->key(), &okey, &offset);
+ derr << "fsck error: stray shard 0x" << std::hex << offset
+ << std::dec << dendl;
+ if (expecting_shards.empty()) {
+ derr << "fsck error: " << pretty_binary_string(it->key())
+ << " is unexpected" << dendl;
+ ++errors;
+ continue;
+ }
+ while (expecting_shards.front() > it->key()) {
+ derr << "fsck error: saw " << pretty_binary_string(it->key())
+ << dendl;
+ derr << "fsck error: exp "
+ << pretty_binary_string(expecting_shards.front()) << dendl;
+ ++errors;
+ expecting_shards.pop_front();
+ if (expecting_shards.empty()) {
+ break;
+ }
+ }
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ derr << "fsck error: bad object key "
+ << pretty_binary_string(it->key()) << dendl;
+ ++errors;
+ continue;
+ }
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ derr << "fsck error: stray object " << oid
+ << " not owned by any collection" << dendl;
+ ++errors;
+ continue;
+ }
+ pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ dout(20) << __func__ << " collection " << c->cid << " " << c->cnode
+ << dendl;
+ }
+
+ if (depth != FSCK_SHALLOW &&
+ !expecting_shards.empty()) {
+ for (auto& k : expecting_shards) {
+ derr << "fsck error: missing shard key "
+ << pretty_binary_string(k) << dendl;
+ }
+ ++errors;
+ expecting_shards.clear();
+ }
+
+ bool queued = false;
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ queued = wq->queue(
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value());
+ }
+ OnodeRef o;
+ map<BlobRef, bluestore_blob_t::unused_t> referenced;
+
+ if (!queued) {
+ ++processed_myself;
+
+ o = fsck_check_objects_shallow(
+ depth,
+ pool_id,
+ c,
+ oid,
+ it->key(),
+ it->value(),
+ &expecting_shards,
+ &referenced,
+ ctx);
+ }
+
+ if (depth != FSCK_SHALLOW) {
+ ceph_assert(o != nullptr);
+ if (o->onode.nid) {
+ if (o->onode.nid > nid_max) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " > nid_max " << nid_max << dendl;
+ ++errors;
+ }
+ if (used_nids.count(o->onode.nid)) {
+ derr << "fsck error: " << oid << " nid " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ continue; // go for next object
+ }
+ used_nids.insert(o->onode.nid);
+ }
+ for (auto& i : referenced) {
+ dout(20) << __func__ << " referenced 0x" << std::hex << i.second
+ << std::dec << " for " << *i.first << dendl;
+ const bluestore_blob_t& blob = i.first->get_blob();
+ if (i.second & blob.unused) {
+ derr << "fsck error: " << oid << " blob claims unused 0x"
+ << std::hex << blob.unused
+ << " but extents reference 0x" << i.second << std::dec
+ << " on blob " << *i.first << dendl;
+ ++errors;
+ }
+ if (blob.has_csum()) {
+ uint64_t blob_len = blob.get_logical_length();
+ uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
+ unsigned csum_count = blob.get_csum_count();
+ unsigned csum_chunk_size = blob.get_csum_chunk_size();
+ for (unsigned p = 0; p < csum_count; ++p) {
+ unsigned pos = p * csum_chunk_size;
+ unsigned firstbit = pos / unused_chunk_size; // [firstbit,lastbit]
+ unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
+ unsigned mask = 1u << firstbit;
+ for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
+ mask |= 1u << b;
+ }
+ if ((blob.unused & mask) == mask) {
+ // this csum chunk region is marked unused
+ if (blob.get_csum_item(p) != 0) {
+ derr << "fsck error: " << oid
+ << " blob claims csum chunk 0x" << std::hex << pos
+ << "~" << csum_chunk_size
+ << " is unused (mask 0x" << mask << " of unused 0x"
+ << blob.unused << ") but csum is non-zero 0x"
+ << blob.get_csum_item(p) << std::dec << " on blob "
+ << *i.first << dendl;
+ ++errors;
+ }
+ }
+ }
+ }
+ }
+ // omap
+ if (o->onode.has_omap()) {
+ ceph_assert(ctx.used_omap_head);
+ if (ctx.used_omap_head->count(o->onode.nid)) {
+ derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
+ << " already in use" << dendl;
+ ++errors;
+ } else {
+ ctx.used_omap_head->insert(o->onode.nid);
+ }
+ } // if (o->onode.has_omap())
+ if (depth == FSCK_DEEP) {
+ bufferlist bl;
+ uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
+ uint64_t offset = 0;
+ do {
+ uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
+ int r = _do_read(c.get(), o, offset, l, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ if (r < 0) {
+ ++errors;
+ derr << "fsck error: " << oid << std::hex
+ << " error during read: "
+ << " " << offset << "~" << l
+ << " " << cpp_strerror(r) << std::dec
+ << dendl;
+ break;
+ }
+ offset += l;
+ } while (offset < o->onode.size);
+ } // deep
+ } //if (depth != FSCK_SHALLOW)
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+ if (depth == FSCK_SHALLOW && thread_count > 0) {
+ wq->finalize(thread_pool, ctx);
+ if (processed_myself) {
+ // maybe we need more threads?
+ dout(0) << __func__ << " partial offload"
+ << ", done myself " << processed_myself
+ << " of " << ctx.num_objects
+ << "objects, threads " << thread_count
+ << dendl;
+ }
+ }
+ } // if (it)
+}
+/**
+An overview of the currently implemented repair logic,
+performed by fsck in two stages: detection (+ preparation) and commit.
+Detection stage (in processing order):
+ (Issue -> Repair action to schedule)
+ - Detect undecodable keys for Shared Blobs -> Remove
+ - Detect undecodable records for Shared Blobs -> Remove
+ (might trigger missed Shared Blob detection below)
+ - Detect stray records for Shared Blobs -> Remove
+ - Detect misreferenced pextents -> Fix
+ Prepare Bloom-like filter to track cid/oid -> pextent
+ Prepare list of extents that are improperly referenced
+ Enumerate Onode records that might use 'misreferenced' pextents
+ (Bloom-like filter applied to reduce computation)
+ For each questionable Onode enumerate all blobs and identify broken ones
+ (i.e. blobs having 'misreferences')
+ Rewrite each broken blob's data by allocating new extents and
+ copying data there
+ If blob is shared - unshare it and mark corresponding Shared Blob
+ for removal
+ Release previously allocated space
+ Update Extent Map
+ - Detect missed Shared Blobs -> Recreate
+ - Detect undecodable deferred transaction -> Remove
+ - Detect Freelist Manager's 'false free' entries -> Mark as used
+ - Detect Freelist Manager's leaked entries -> Mark as free
+ - Detect statfs inconsistency -> Update
+ Commit stage (separate DB commit for each step):
+ - Apply leaked FM entries fix
+ - Apply 'false free' FM entries fix
+ - Apply 'Remove' actions
+ - Apply fix for misreferenced pextents
+ - Apply Shared Blob recreate
+ (can be merged with the step above if misreferences were dectected)
+ - Apply StatFS update
+*/
+int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
+{
+ dout(1) << __func__
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << dendl;
+
+ // in deep mode we need R/W write access to be able to replay deferred ops
+ bool read_only = !(repair || depth == FSCK_DEEP);
+
+ int r = _open_db_and_around(read_only);
+ if (r < 0)
+ return r;
+
+ if (!read_only) {
+ r = _upgrade_super();
+ if (r < 0) {
+ goto out_db;
+ }
+ }
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ mempool_thread.init();
+
+ // we need finisher and kv_{sync,finalize}_thread *just* for replay
+ // enabled in repair or deep modes only
+ if (!read_only) {
+ _kv_start();
+ r = _deferred_replay();
+ _kv_stop();
+ }
+ if (r < 0)
+ goto out_scan;
+
+ r = _fsck_on_open(depth, repair);
+
+out_scan:
+ mempool_thread.shutdown();
+ _shutdown_cache();
+out_db:
+ _close_db_and_around(false);
+
+ return r;
+}
+
+int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
+{
+ uint64_t sb_hash_size = uint64_t(
+ cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+ cct->_conf.get_val<double>(
+ "bluestore_fsck_shared_blob_tracker_size"));
+
+ dout(1) << __func__
+ << " <<<START>>>"
+ << (repair ? " repair" : " check")
+ << (depth == FSCK_DEEP ? " (deep)" :
+ depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+ << " start sb_tracker_hash_size:" << sb_hash_size
+ << dendl;
+ int64_t errors = 0;
+ int64_t warnings = 0;
+ unsigned repaired = 0;
+
+ uint64_t_btree_t used_omap_head;
+ uint64_t_btree_t used_sbids;
+
+ mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
+ KeyValueDB::Iterator it;
+ store_statfs_t expected_store_statfs, actual_statfs;
+ per_pool_statfs expected_pool_statfs;
+
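+ // sb_info accumulates per-shared-blob state while sb_ref_counts is a
+ // hash-based tracker: on-disk reference counts are subtracted from it
+ // during phase 1, so any non-zero residue left after the object walk
+ // indicates a shared blob reference mismatch.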
+ sb_info_space_efficient_map_t sb_info;
+ shared_blob_2hash_tracker_t sb_ref_counts(
+ sb_hash_size,
+ min_alloc_size);
+ size_t sb_ref_mismatches = 0;
+
+ uint64_t num_objects = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_spanning_blobs = 0;
+ uint64_t num_shared_blobs = 0;
+ uint64_t num_sharded_objects = 0;
+ BlueStoreRepairer repairer;
+
+ auto alloc_size = fm->get_alloc_size();
+
+ utime_t start = ceph_clock_now();
+
+ _fsck_collections(&errors);
+ used_blocks.resize(fm->get_alloc_units());
+
+ if (bluefs) {
+ interval_set<uint64_t> bluefs_extents;
+
+ int r = bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
+ ceph_assert(r == 0);
+ for (auto [start, len] : bluefs_extents) {
+ apply_for_bitset_range(start, len, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset& bs) {
+ ceph_assert(pos < bs.size());
+ bs.set(pos);
+ }
+ );
+ }
+ }
+
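+ // keep a snapshot of the bluefs-owned blocks before store allocations
+ // are marked; it is consulted later when validating the freelist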
+ bluefs_used_blocks = used_blocks;
+
+ apply_for_bitset_range(
+ 0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ bs.set(pos);
+ }
+ );
+
+
+ if (repair) {
+ repairer.init_space_usage_tracker(
+ bdev->get_size(),
+ min_alloc_size);
+ }
+
+ if (bluefs) {
+ int r = bluefs->fsck();
+ if (r < 0) {
+ return r;
+ }
+ if (r > 0)
+ errors += r;
+ }
+
+ if (!per_pool_stat_collection) {
+ const char *w;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
+ w = "error";
+ ++errors;
+ } else {
+ w = "warning";
+ ++warnings;
+ }
+ derr << "fsck " << w << ": store not yet converted to per-pool stats"
+ << dendl;
+ }
+ if (per_pool_omap != OMAP_PER_PG) {
+ const char *w;
+ if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+ w = "error";
+ ++errors;
+ } else {
+ w = "warning";
+ ++warnings;
+ }
+ derr << "fsck " << w << ": store not yet converted to per-pg omap"
+ << dendl;
+ }
+
+ // get expected statfs; reset unaffected fields to be able to compare
+ // structs
+ statfs(&actual_statfs);
+ actual_statfs.total = 0;
+ actual_statfs.internally_reserved = 0;
+ actual_statfs.available = 0;
+ actual_statfs.internal_metadata = 0;
+ actual_statfs.omap_allocated = 0;
+
+ if (g_conf()->bluestore_debug_fsck_abort) {
+ dout(1) << __func__ << " debug abort" << dendl;
+ goto out_scan;
+ }
+
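+ // phase 1: decode each shared blob record and subtract its on-disk
+ // reference counts from sb_ref_counts; undecodable keys/records are
+ // handled in phase 2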
+ dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
+ it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ uint64_t sbid;
+ if (get_key_shared_blob(key, &sbid) < 0) {
+ // Failed to parse the key.
+ // This will be handled in the second stage
+ continue;
+ }
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ // this will be handled in the second stage
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ auto& sbi = sb_info.add_maybe_stray(sbid);
+
+ // primarily to silence the 'unused' warning
+ ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ sb_ref_counts.inc_range(
+ sbid,
+ r.first,
+ r.second.length,
+ -r.second.refs);
+ }
+ }
+ } // if (it) //checking shared_blobs (phase1)
+
+ // walk PREFIX_OBJ
+ {
+ dout(1) << __func__ << " walking object keyspace" << dendl;
+ ceph::mutex sb_info_lock = ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
+ BlueStore::FSCK_ObjectCtx ctx(
+ errors,
+ warnings,
+ num_objects,
+ num_extents,
+ num_blobs,
+ num_sharded_objects,
+ num_spanning_blobs,
+ &used_blocks,
+ &used_omap_head,
+ // no need for the lock below in non-shallow mode as
+ // there is no multithreading in that case
+ depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
+ sb_info,
+ sb_ref_counts,
+ expected_store_statfs,
+ expected_pool_statfs,
+ repair ? &repairer : nullptr);
+
+ _fsck_check_objects(depth, ctx);
+ }
+
+ sb_ref_mismatches = sb_ref_counts.count_non_zero();
+ if (sb_ref_mismatches != 0) {
+ derr << "fsck error: shared blob references aren't matching, at least "
+ << sb_ref_mismatches << " found" << dendl;
+ errors += sb_ref_mismatches;
+ }
+
+ if (depth != FSCK_SHALLOW && repair) {
+ _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
+ }
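+ // phase 2: re-scan shared blob records now that object references are
+ // accounted for; report/remove undecodable or stray entries and check
+ // the extents the remaining ones reference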
+ dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
+ it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ // FIXME minor: perhaps simplify for shallow mode?
+ // fill global if not overridden below
+ auto expected_statfs = &expected_store_statfs;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ string key = it->key();
+ uint64_t sbid;
+ if (get_key_shared_blob(key, &sbid)) {
+ derr << "fsck error: bad key '" << key
+ << "' in shared blob namespace" << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ ++errors;
+ continue;
+ }
+ auto p = sb_info.find(sbid);
+ if (p == sb_info.end()) {
+ if (sb_ref_mismatches > 0) {
+ // highly likely this has already been reported, ignoring...
+ dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
+ << std::hex << sbid << std::dec << dendl;
+ } else {
+ derr<< "fsck error: found stray shared blob data for sbid 0x"
+ << std::hex << sbid << std::dec << dendl;
+ ++errors;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ }
+ } else {
+ ++num_shared_blobs;
+ sb_info_t& sbi = *p;
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ ++errors;
+
+ derr << "fsck error: failed to decode Shared Blob"
+ << pretty_binary_string(key) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable Shared Blob, key:'"
+ << pretty_binary_string(key)
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ PExtentVector extents;
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
+ }
+ if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
+ (per_pool_stat_collection || repair)) {
+ expected_statfs = &expected_pool_statfs[sbi.pool_id];
+ }
+ std::stringstream ss;
+ ss << "sbid 0x" << std::hex << sbid << std::dec;
+ errors += _fsck_check_extents(ss.str(),
+ extents,
+ sbi.allocated_chunks < 0,
+ used_blocks,
+ fm->get_alloc_size(),
+ repair ? &repairer : nullptr,
+ *expected_statfs,
+ depth);
+ }
+ }
+ } // if (it) /* checking shared_blobs (phase 2)*/
+
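+ // repair pass for misreferenced pextents: rewrite each affected blob to
+ // freshly allocated space, unshare it when necessary, and release the
+ // old extents afterwards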
+ if (repair && repairer.preprocess_misreference(db)) {
+
+ dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+ auto& misref_extents = repairer.get_misreferences();
+ interval_set<uint64_t> to_release;
+ it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ // fill global if not overridden below
+ auto expected_statfs = &expected_store_statfs;
+
+ CollectionRef c;
+ spg_t pgid;
+ KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+ bool bypass_rest = false;
+ for (it->lower_bound(string()); it->valid() && !bypass_rest;
+ it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0 || !repairer.is_used(oid)) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ if (per_pool_stat_collection || repair) {
+ auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+ expected_statfs = &expected_pool_statfs[pool_id];
+ }
+ }
+ if (!repairer.is_used(c->cid)) {
+ continue;
+ }
+
+ dout(20) << __func__ << " check misreference for col:" << c->cid
+ << " obj:" << oid << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ mempool::bluestore_fsck::set<BlobRef> blobs;
+
+ for (auto& e : o->extent_map.extent_map) {
+ blobs.insert(e.blob);
+ }
+ bool need_onode_update = false;
+ bool first_dump = true;
+ for(auto b : blobs) {
+ bool broken_blob = false;
+ auto& pextents = b->dirty_blob().dirty_extents();
+ for (auto& e : pextents) {
+ if (!e.is_valid()) {
+ continue;
+ }
+ // for the sake of simplicity and proper shared blob handling
+ // always rewrite the whole blob even when it's partially
+ // misreferenced.
+ if (misref_extents.intersects(e.offset, e.length)) {
+ if (first_dump) {
+ first_dump = false;
+ _dump_onode<10>(cct, *o);
+ }
+ broken_blob = true;
+ break;
+ }
+ }
+ if (!broken_blob)
+ continue;
+ bool compressed = b->get_blob().is_compressed();
+ need_onode_update = true;
+ dout(10) << __func__
+ << " fix misreferences in oid:" << oid
+ << " " << *b << dendl;
+ uint64_t b_off = 0;
+ PExtentVector pext_to_release;
+ pext_to_release.reserve(pextents.size());
+ // rewriting all valid pextents
+ for (auto e = pextents.begin(); e != pextents.end();
+ e++) {
+ auto b_off_cur = b_off;
+ b_off += e->length;
+ if (!e->is_valid()) {
+ continue;
+ }
+ PExtentVector exts;
+ int64_t alloc_len =
+ shared_alloc.a->allocate(e->length, min_alloc_size,
+ 0, 0, &exts);
+ if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
+ derr << __func__
+ << " failed to allocate 0x" << std::hex << e->length
+ << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << shared_alloc.a->get_free()
+ << std::dec << dendl;
+ if (alloc_len > 0) {
+ shared_alloc.a->release(exts);
+ }
+ bypass_rest = true;
+ break;
+ }
+ expected_statfs->allocated += e->length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated += e->length;
+ }
+
+ bufferlist bl;
+ IOContext ioc(cct, NULL, true); // allow EIO
+ r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+ if (r < 0) {
+ derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+ <<"~" << e->length << std::dec << dendl;
+ ceph_abort_msg("read failed, wtf");
+ }
+ pext_to_release.push_back(*e);
+ e = pextents.erase(e);
+ e = pextents.insert(e, exts.begin(), exts.end());
+ b->get_blob().map_bl(
+ b_off_cur,
+ bl,
+ [&](uint64_t offset, bufferlist& t) {
+ int r = bdev->write(offset, t, false);
+ ceph_assert(r == 0);
+ });
+ e += exts.size() - 1;
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
+
+ if (b->get_blob().is_shared()) {
+ b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+ auto sbid = b->shared_blob->get_sbid();
+ auto sb_it = sb_info.find(sbid);
+ ceph_assert(sb_it != sb_info.end());
+ sb_info_t& sbi = *sb_it;
+
+ if (sbi.allocated_chunks < 0) {
+ // NB: it's crucial to use allocated_chunks from sb_info_t here
+ // (a negative value denotes compressed chunks) as that is the value
+ // originally used while accumulating expected_statfs
+ expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+ expected_statfs->data_compressed_allocated -=
+ uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+ } else {
+ expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
+ }
+ sbi.allocated_chunks = 0;
+ repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+
+ // relying on blob's pextents to decide what to release.
+ for (auto& p : pext_to_release) {
+ to_release.union_insert(p.offset, p.length);
+ }
+ } else {
+ for (auto& p : pext_to_release) {
+ expected_statfs->allocated -= p.length;
+ if (compressed) {
+ expected_statfs->data_compressed_allocated -= p.length;
+ }
+ to_release.union_insert(p.offset, p.length);
+ }
+ }
+ if (bypass_rest) {
+ break;
+ }
+ } // for(auto b : blobs)
+ if (need_onode_update) {
+ o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+ _record_onode(o, txn);
+ }
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+
+ for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+ dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+ << "~" << it.get_len() << std::dec << dendl;
+ fm->release(it.get_start(), it.get_len(), txn);
+ }
+ shared_alloc.a->release(to_release);
+ to_release.clear();
+ } // if (it) {
+ } //if (repair && repairer.preprocess_misreference()) {
+ sb_info.clear();
+ sb_ref_counts.reset();
+
+ // check global stats only if fscking (not repairing) w/o per-pool stats
+ if (!per_pool_stat_collection &&
+ !repair &&
+ !(actual_statfs == expected_store_statfs)) {
+ derr << "fsck error: actual " << actual_statfs
+ << " != expected " << expected_store_statfs << dendl;
+ if (repair) {
+ repairer.fix_statfs(db, BLUESTORE_GLOBAL_STATFS_KEY,
+ expected_store_statfs);
+ }
+ ++errors;
+ }
+
+ dout(1) << __func__ << " checking pool_statfs" << dendl;
+ _fsck_check_pool_statfs(expected_pool_statfs,
+ errors, warnings, repair ? &repairer : nullptr);
+
+ if (depth != FSCK_SHALLOW) {
+ dout(1) << __func__ << " checking for stray omap data " << dendl;
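+ // the omap key families below are scanned the same way: any omap_head
+ // not referenced by an onode is reported as stray, with last_omap_head
+ // ensuring a single error per stray head rather than one per key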
+ it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+
+ _key_decode_u64(it->key().c_str(), &omap_head);
+
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray omap data on omap_head "
+ << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t omap_head;
+ _key_decode_u64(it->key().c_str(), &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (pgmeta) omap data on omap_head "
+ << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+ last_omap_head = omap_head;
+ ++errors;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t pool;
+ uint64_t omap_head;
+ string k = it->key();
+ const char *c = k.c_str();
+ c = _key_decode_u64(c, &pool);
+ c = _key_decode_u64(c, &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (per-pool) omap data on omap_head "
+ << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ uint64_t last_omap_head = 0;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ uint64_t pool;
+ uint32_t hash;
+ uint64_t omap_head;
+ string k = it->key();
+ const char* c = k.c_str();
+ c = _key_decode_u64(c, &pool);
+ c = _key_decode_u32(c, &hash);
+ c = _key_decode_u64(c, &omap_head);
+ if (used_omap_head.count(omap_head) == 0 &&
+ omap_head != last_omap_head) {
+ fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+ << "fsck error: found stray (per-pg) omap data on omap_head "
+ << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+ ++errors;
+ last_omap_head = omap_head;
+ }
+ }
+ }
+ dout(1) << __func__ << " checking deferred events" << dendl;
+ it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ bluestore_deferred_transaction_t wt;
+ try {
+ decode(wt, p);
+ } catch (ceph::buffer::error& e) {
+ derr << "fsck error: failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+ }
+ continue;
+ }
+ dout(20) << __func__ << " deferred " << wt.seq
+ << " ops " << wt.ops.size()
+ << " released 0x" << std::hex << wt.released << std::dec << dendl;
+ for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
+ apply_for_bitset_range(
+ e.get_start(), e.get_len(), alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ bs.set(pos);
+ }
+ );
+ }
+ }
+ }
+
+ dout(1) << __func__ << " checking freelist vs allocated" << dendl;
+ {
+ fm->enumerate_reset();
+ uint64_t offset, length;
+ while (fm->enumerate_next(db, &offset, &length)) {
+ bool intersects = false;
+ apply_for_bitset_range(
+ offset, length, alloc_size, used_blocks,
+ [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+ ceph_assert(pos < bs.size());
+ if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << std::dec << dendl;
+ } else {
+ intersects = true;
+ if (repair) {
+ repairer.fix_false_free(db, fm,
+ pos * min_alloc_size,
+ min_alloc_size);
+ }
+ }
+ } else {
+ bs.set(pos);
+ }
+ }
+ );
+ if (intersects) {
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
+ }
+ }
+ fm->enumerate_reset();
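+ // any bit still clear at this point is neither referenced by metadata
+ // nor listed as free by the freelist manager, i.e. it is leaked;
+ // flip the bitset and report each contiguous run as a leaked extent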
+ size_t count = used_blocks.count();
+ if (used_blocks.size() != count) {
+ ceph_assert(used_blocks.size() > count);
+ used_blocks.flip();
+ size_t start = used_blocks.find_first();
+ while (start != decltype(used_blocks)::npos) {
+ size_t cur = start;
+ while (true) {
+ size_t next = used_blocks.find_next(cur);
+ if (next != cur + 1) {
+ ++errors;
+ derr << "fsck error: leaked extent 0x" << std::hex
+ << ((uint64_t)start * fm->get_alloc_size()) << "~"
+ << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+ << dendl;
+ if (repair) {
+ repairer.fix_leaked(db,
+ fm,
+ start * min_alloc_size,
+ (cur + 1 - start) * min_alloc_size);
+ }
+ start = next;
+ break;
+ }
+ cur = next;
+ }
+ }
+ used_blocks.flip();
+ }
+ }
+ }
+ if (repair) {
+ if (per_pool_omap != OMAP_PER_PG) {
+ dout(5) << __func__ << " fixing per_pg_omap" << dendl;
+ repairer.fix_per_pool_omap(db, OMAP_PER_PG);
+ }
+
+ dout(5) << __func__ << " applying repair results" << dendl;
+ repaired = repairer.apply(db);
+ dout(5) << __func__ << " repair applied" << dendl;
+ }
+
+out_scan:
+ dout(2) << __func__ << " " << num_objects << " objects, "
+ << num_sharded_objects << " of them sharded. "
+ << dendl;
+ dout(2) << __func__ << " " << num_extents << " extents to "
+ << num_blobs << " blobs, "
+ << num_spanning_blobs << " spanning, "
+ << num_shared_blobs << " shared."
+ << dendl;
+
+ utime_t duration = ceph_clock_now() - start;
+ dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
+ << warnings << " warnings, "
+ << repaired << " repaired, "
+ << (errors + warnings - (int)repaired) << " remaining in "
+ << duration << " seconds" << dendl;
+
+ // In non-repair mode return only the error count, as it indicates
+ // whether the store status is OK.
+ // In repair mode both errors and warnings are taken into account
+ // since the repaired counter relates to them both.
+ return repair ? errors + warnings - (int)repaired : errors;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+};
+
+void BlueStore::inject_no_shared_blob_key()
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ ceph_assert(blobid_last > 0);
+ // kill the last used sbid; in rare cases this can be wrong due to
+ // blobid preallocation, but leave it as-is for the sake of simplicity
+ uint64_t sbid = blobid_last;
+
+ string key;
+ dout(5) << __func__<< " " << sbid << dendl;
+ get_shared_blob_key(sbid, &key);
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ db->submit_transaction_sync(txn);
+};
+
+void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ dout(5) << __func__ << " " << sbid << dendl;
+
+ string key;
+ get_shared_blob_key(sbid, &key);
+ bluestore_shared_blob_t persistent(sbid);
+ persistent.ref_map.get(0xdead0000, 0x1000);
+ bufferlist bl;
+ encode(persistent, bl);
+ dout(20) << __func__ << " sbid " << sbid
+ << " takes " << bl.length() << " bytes, updating"
+ << dendl;
+
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+};
+
+
+void BlueStore::inject_leaked(uint64_t len)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
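+ // allocate space and mark it used in the freelist manager without
+ // attaching it to any onode, leaving a leaked extent for fsck to detect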
+ PExtentVector exts;
+ int64_t alloc_len = shared_alloc.a->allocate(len, min_alloc_size,
+ min_alloc_size * 256, 0, &exts);
+ ceph_assert(alloc_len >= (int64_t)len);
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{c->lock}; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
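+ // release the first valid pextent of the object's first (and, if
+ // distinct, last) blob in the freelist manager while the onode keeps
+ // referencing it, producing a 'false free' entry for fsck to detect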
+ bool injected = false;
+ txn = db->get_transaction();
+ auto& em = o->extent_map.extent_map;
+ std::vector<const PExtentVector*> v;
+ if (em.size()) {
+ v.push_back(&em.begin()->blob->get_blob().get_extents());
+ }
+ if (em.size() > 1) {
+ auto it = em.end();
+ --it;
+ v.push_back(&(it->blob->get_blob().get_extents()));
+ }
+ for (auto pext : v) {
+ if (pext->size()) {
+ auto p = pext->begin();
+ while (p != pext->end()) {
+ if (p->is_valid()) {
+ dout(20) << __func__ << " release 0x" << std::hex << p->offset
+ << "~" << p->length << std::dec << dendl;
+ fm->release(p->offset, p->length, txn);
+ injected = true;
+ break;
+ }
+ ++p;
+ }
+ }
+ }
+ ceph_assert(injected);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_legacy_omap()
+{
+ dout(1) << __func__ << dendl;
+ per_pool_omap = OMAP_BULK;
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->rmkey(PREFIX_SUPER, "per_pool_omap");
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
+{
+ dout(1) << __func__ << " "
+ << cid << " " << oid
+ <<dendl;
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{ c->lock }; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ }
+ o->onode.clear_flag(
+ bluestore_onode_t::FLAG_PERPG_OMAP |
+ bluestore_onode_t::FLAG_PERPOOL_OMAP |
+ bluestore_onode_t::FLAG_PGMETA_OMAP);
+ txn = db->get_transaction();
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+
+void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
+{
+ BlueStoreRepairer repairer;
+ repairer.fix_statfs(db, key, new_statfs);
+ repairer.apply(db);
+}
+
+void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ volatile_statfs v;
+ v = new_statfs;
+ bufferlist bl;
+ v.encode(bl);
+ t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+ db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset)
+{
+ OnodeRef o1;
+ CollectionRef c1 = _get_collection(cid1);
+ ceph_assert(c1);
+ {
+ std::unique_lock l{c1->lock}; // just to avoid internal asserts
+ o1 = c1->get_onode(oid1, false);
+ ceph_assert(o1);
+ o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ OnodeRef o2;
+ CollectionRef c2 = _get_collection(cid2);
+ ceph_assert(c2);
+ {
+ std::unique_lock l{c2->lock}; // just to avoid internal asserts
+ o2 = c2->get_onode(oid2, false);
+ ceph_assert(o2);
+ o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+ Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+ // require onode/extent layout to be the same (and simple)
+ // to make things easier
+ ceph_assert(o1->onode.extent_map_shards.empty());
+ ceph_assert(o2->onode.extent_map_shards.empty());
+ ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
+ ceph_assert(e1.logical_offset == e2.logical_offset);
+ ceph_assert(e1.length == e2.length);
+ ceph_assert(e1.blob_offset == e2.blob_offset);
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ // along with misreference error this will create space leaks errors
+ e2.blob->dirty_blob() = e1.blob->get_blob();
+ o2->extent_map.dirty_range(offset, e2.length);
+ o2->extent_map.update(txn, false);
+
+ _record_onode(o2, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
+ int16_t blob_id)
+{
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ ceph_assert(c);
+ {
+ std::unique_lock l{ c->lock }; // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ ceph_assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ BlobRef b = c->new_blob();
+ b->id = blob_id;
+ o->extent_map.spanning_blob_map[blob_id] = b;
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ _record_onode(o, txn);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
+{
+ ceph_assert(bluefs);
+
+ BlueFS::FileWriter* p_handle = nullptr;
+ auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
+ ceph_assert(ret == 0);
+
+ std::string s(new_size, '0'); // count first, then fill character
+ bufferlist bl;
+ bl.append(s);
+ p_handle->append(bl);
+
+ bluefs->fsync(p_handle);
+ bluefs->close_writer(p_handle);
+}
+
+void BlueStore::collect_metadata(map<string,string> *pm)
+{
+ dout(10) << __func__ << dendl;
+ bdev->collect_metadata("bluestore_bdev_", pm);
+ if (bluefs) {
+ (*pm)["bluefs"] = "1";
+ // this value is for backward compatibility only
+ (*pm)["bluefs_single_shared_device"] = \
+ stringify((int)bluefs_layout.single_shared_device());
+ (*pm)["bluefs_dedicated_db"] = \
+ stringify((int)bluefs_layout.dedicated_db);
+ (*pm)["bluefs_dedicated_wal"] = \
+ stringify((int)bluefs_layout.dedicated_wal);
+ bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
+ } else {
+ (*pm)["bluefs"] = "0";
+ }
+
+ // report numa mapping for underlying devices
+ int node = -1;
+ set<int> nodes;
+ set<string> failed;
+ int r = get_numa_node(&node, &nodes, &failed);
+ if (r >= 0) {
+ if (!failed.empty()) {
+ (*pm)["objectstore_numa_unknown_devices"] = stringify(failed);
+ }
+ if (!nodes.empty()) {
+ dout(1) << __func__ << " devices span numa nodes " << nodes << dendl;
+ (*pm)["objectstore_numa_nodes"] = stringify(nodes);
+ }
+ if (node >= 0) {
+ (*pm)["objectstore_numa_node"] = stringify(node);
+ }
+ }
+}
+
+int BlueStore::get_numa_node(
+ int *final_node,
+ set<int> *out_nodes,
+ set<string> *out_failed)
+{
+ int node = -1;
+ set<string> devices;
+ get_devices(&devices);
+ set<int> nodes;
+ set<string> failed;
+ for (auto& devname : devices) {
+ int n;
+ BlkDev bdev(devname);
+ int r = bdev.get_numa_node(&n);
+ if (r < 0) {
+ dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
+ << dendl;
+ failed.insert(devname);
+ continue;
+ }
+ dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
+ << dendl;
+ nodes.insert(n);
+ if (node < 0) {
+ node = n;
+ }
+ }
+ if (node >= 0 && nodes.size() == 1 && failed.empty()) {
+ *final_node = node;
+ }
+ if (out_nodes) {
+ *out_nodes = nodes;
+ }
+ if (out_failed) {
+ *out_failed = failed;
+ }
+ return 0;
+}
+
+int BlueStore::get_devices(set<string> *ls)
+{
+ if (bdev) {
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ return 0;
+ }
+
+ // grumble, we haven't started up yet.
+ int r = _open_path();
+ if (r < 0)
+ goto out;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+ r = _open_bdev(false);
+ if (r < 0)
+ goto out_fsid;
+ r = _minimal_open_bluefs(false);
+ if (r < 0)
+ goto out_bdev;
+ bdev->get_devices(ls);
+ if (bluefs) {
+ bluefs->get_devices(ls);
+ }
+ r = 0;
+ _minimal_close_bluefs();
+ out_bdev:
+ _close_bdev();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ out:
+ return r;
+}
+
+void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
+{
+ buf->reset();
+
+ auto prefix = per_pool_omap == OMAP_BULK ?
+ PREFIX_OMAP :
+ per_pool_omap == OMAP_PER_POOL ?
+ PREFIX_PERPOOL_OMAP :
+ PREFIX_PERPG_OMAP;
+ buf->omap_allocated =
+ db->estimate_prefix_size(prefix, string());
+
+ uint64_t bfree = shared_alloc.a->get_free();
+
+ if (bluefs) {
+ buf->internally_reserved = 0;
+ // include dedicated db, too, if that isn't the shared device.
+ if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+ buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+ }
+ // call any non-omap bluefs space "internal metadata"
+ buf->internal_metadata =
+ bluefs->get_used()
+ - buf->omap_allocated;
+ }
+
+ uint64_t thin_total, thin_avail;
+ if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
+ buf->total += thin_total;
+
+ // we are limited by both the size of the virtual device and the
+ // underlying physical device.
+ bfree = std::min(bfree, thin_avail);
+
+ buf->allocated = thin_total - thin_avail;
+ } else {
+ buf->total += bdev->get_size();
+ }
+ buf->available = bfree;
+}
+
+int BlueStore::statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts)
+{
+ if (alerts) {
+ alerts->clear();
+ _log_alerts(*alerts);
+ }
+ _get_statfs_overall(buf);
+ {
+ std::lock_guard l(vstatfs_lock);
+ buf->allocated = vstatfs.allocated();
+ buf->data_stored = vstatfs.stored();
+ buf->data_compressed = vstatfs.compressed();
+ buf->data_compressed_original = vstatfs.compressed_original();
+ buf->data_compressed_allocated = vstatfs.compressed_allocated();
+ }
+
+ dout(20) << __func__ << " " << *buf << dendl;
+ return 0;
+}
+
+int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *out_per_pool_omap)
+{
+ dout(20) << __func__ << " pool " << pool_id<< dendl;
+
+ if (!per_pool_stat_collection) {
+ dout(20) << __func__ << " not supported in legacy mode " << dendl;
+ return -ENOTSUP;
+ }
+ buf->reset();
+
+ {
+ std::lock_guard l(vstatfs_lock);
+ osd_pools[pool_id].publish(buf);
+ }
+
+ string key_prefix;
+ _key_encode_u64(pool_id, &key_prefix);
+ *out_per_pool_omap = per_pool_omap != OMAP_BULK;
+ if (*out_per_pool_omap) {
+ auto prefix = per_pool_omap == OMAP_PER_POOL ?
+ PREFIX_PERPOOL_OMAP :
+ PREFIX_PERPG_OMAP;
+ buf->omap_allocated = db->estimate_prefix_size(prefix, key_prefix);
+ }
+
+ dout(10) << __func__ << *buf << dendl;
+ return 0;
+}
+
+void BlueStore::_check_legacy_statfs_alert()
+{
+ string s;
+ if (!per_pool_stat_collection &&
+ cct->_conf->bluestore_warn_on_legacy_statfs) {
+ s = "legacy statfs reporting detected, "
+ "suggest to run store repair to get consistent statistic reports";
+ }
+ std::lock_guard l(qlock);
+ legacy_statfs_alert = s;
+}
+
+void BlueStore::_check_no_per_pg_or_pool_omap_alert()
+{
+ string per_pg, per_pool;
+ if (per_pool_omap != OMAP_PER_PG) {
+ if (cct->_conf->bluestore_warn_on_no_per_pg_omap) {
+ per_pg = "legacy (not per-pg) omap detected, "
+ "suggest to run store repair to benefit from faster PG removal";
+ }
+ if (per_pool_omap != OMAP_PER_POOL) {
+ if (cct->_conf->bluestore_warn_on_no_per_pool_omap) {
+ per_pool = "legacy (not per-pool) omap detected, "
+ "suggest to run store repair to benefit from per-pool omap usage statistics";
+ }
+ }
+ }
+ std::lock_guard l(qlock);
+ no_per_pg_omap_alert = per_pg;
+ no_per_pool_omap_alert = per_pool;
+}
+
+// ---------------
+// cache
+
+BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
+{
+ std::shared_lock l(coll_lock);
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+void BlueStore::_queue_reap_collection(CollectionRef& c)
+{
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ // _reap_collections and this run in the same thread,
+ // so no lock is needed.
+ removed_collections.push_back(c);
+}
+
+void BlueStore::_reap_collections()
+{
+
+ list<CollectionRef> removed_colls;
+ {
+ // _queue_reap_collection and this run in the same thread,
+ // so no lock is needed.
+ if (!removed_collections.empty())
+ removed_colls.swap(removed_collections);
+ else
+ return;
+ }
+
+ list<CollectionRef>::iterator p = removed_colls.begin();
+ while (p != removed_colls.end()) {
+ CollectionRef c = *p;
+ dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+ if (c->onode_map.map_any([&](Onode* o) {
+ ceph_assert(!o->exists);
+ if (o->flushing_count.load()) {
+ dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
+ << " flush_txns " << o->flushing_count << dendl;
+ return true;
+ }
+ return false;
+ })) {
+ ++p;
+ continue;
+ }
+ c->onode_map.clear();
+ p = removed_colls.erase(p);
+ dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
+ }
+ if (removed_colls.empty()) {
+ dout(10) << __func__ << " all reaped" << dendl;
+ } else {
+ removed_collections.splice(removed_collections.begin(), removed_colls);
+ }
+}
+
+void BlueStore::_update_cache_logger()
+{
+ uint64_t num_onodes = 0;
+ uint64_t num_pinned_onodes = 0;
+ uint64_t num_extents = 0;
+ uint64_t num_blobs = 0;
+ uint64_t num_buffers = 0;
+ uint64_t num_buffer_bytes = 0;
+ for (auto c : onode_cache_shards) {
+ c->add_stats(&num_onodes, &num_pinned_onodes);
+ }
+ for (auto c : buffer_cache_shards) {
+ c->add_stats(&num_extents, &num_blobs,
+ &num_buffers, &num_buffer_bytes);
+ }
+ logger->set(l_bluestore_onodes, num_onodes);
+ logger->set(l_bluestore_pinned_onodes, num_pinned_onodes);
+ logger->set(l_bluestore_extents, num_extents);
+ logger->set(l_bluestore_blobs, num_blobs);
+ logger->set(l_bluestore_buffers, num_buffers);
+ logger->set(l_bluestore_buffer_bytes, num_buffer_bytes);
+}
+
+// ---------------
+// read operations
+
+ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
+{
+ return _get_collection(cid);
+}
+
+ObjectStore::CollectionHandle BlueStore::create_new_collection(
+ const coll_t& cid)
+{
+ std::unique_lock l{coll_lock};
+ auto c = ceph::make_ref<Collection>(
+ this,
+ onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+ buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
+ cid);
+ new_coll_map[cid] = c;
+ _osr_attach(c.get());
+ return c;
+}
+
+void BlueStore::set_collection_commit_queue(
+ const coll_t& cid,
+ ContextQueue *commit_queue)
+{
+ if (commit_queue) {
+ std::shared_lock l(coll_lock);
+ if (coll_map.count(cid)) {
+ coll_map[cid]->commit_queue = commit_queue;
+ } else if (new_coll_map.count(cid)) {
+ new_coll_map[cid]->commit_queue = commit_queue;
+ }
+ }
+}
+
+
+bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return false;
+
+ bool r = true;
+
+ {
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ r = false;
+ }
+
+ return r;
+}
+
+int BlueStore::stat(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+
+ {
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return -ENOENT;
+ st->st_size = o->onode.size;
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ }
+
+ int r = 0;
+ if (_debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ return r;
+}
+int BlueStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ Collection *c = static_cast<Collection *>(ch.get());
+ dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::unique_lock l{c->lock};
+ c->pool_opts = opts;
+ return 0;
+}
+
+int BlueStore::read(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ auto start = mono_clock::now();
+ Collection *c = static_cast<Collection *>(c_.get());
+ const coll_t &cid = c->get_cid();
+ dout(15) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ bl.clear();
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ OnodeRef o = c->get_onode(oid, false);
+ log_latency("get_onode@read",
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start1,
+ cct->_conf->bluestore_log_op_age);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (offset == length && offset == 0)
+ length = o->onode.size;
+
+ r = _do_read(c, o, offset, length, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
+ }
+
+ out:
+ if (r >= 0 && _debug_data_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->bluestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ r = -EIO;
+ }
+ dout(10) << __func__ << " " << cid << " " << oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ log_latency(__func__,
+ l_bluestore_read_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+void BlueStore::_read_cache(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ int read_cache_policy,
+ ready_regions_t& ready_regions,
+ blobs2read_t& blobs2read)
+{
+ // build a blob-wise list of stuff to read (that isn't cached)
+ unsigned left = length;
+ uint64_t pos = offset;
+ auto lp = o->extent_map.seek_lextent(offset);
+ while (left > 0 && lp != o->extent_map.extent_map.end()) {
+ if (pos < lp->logical_offset) {
+ unsigned hole = lp->logical_offset - pos;
+ if (hole >= left) {
+ break;
+ }
+ dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
+ << std::dec << dendl;
+ pos += hole;
+ left -= hole;
+ }
+ BlobRef& bptr = lp->blob;
+ unsigned l_off = pos - lp->logical_offset;
+ unsigned b_off = l_off + lp->blob_offset;
+ unsigned b_len = std::min(left, lp->length - l_off);
+
+ ready_regions_t cache_res;
+ interval_set<uint32_t> cache_interval;
+ bptr->shared_blob->bc.read(
+ bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
+ read_cache_policy);
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need 0x" << b_off << "~" << b_len
+ << " cache has 0x" << cache_interval
+ << std::dec << dendl;
+
+ auto pc = cache_res.begin();
+ uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
+ while (b_len > 0) {
+ unsigned l;
+ if (pc != cache_res.end() &&
+ pc->first == b_off) {
+ l = pc->second.length();
+ ready_regions[pos] = std::move(pc->second);
+ dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ ++pc;
+ } else {
+ l = b_len;
+ if (pc != cache_res.end()) {
+ ceph_assert(pc->first > b_off);
+ l = pc->first - b_off;
+ }
+ dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
+ << b_off << "~" << l << std::dec << dendl;
+ // merge regions
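+ // (round the raw read out to chunk boundaries and coalesce it with
+ //  the previous request when the resulting ranges touch or overlap)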
+ {
+ uint64_t r_off = b_off;
+ uint64_t r_len = l;
+ uint64_t front = r_off % chunk_size;
+ if (front) {
+ r_off -= front;
+ r_len += front;
+ }
+ unsigned tail = r_len % chunk_size;
+ if (tail) {
+ r_len += chunk_size - tail;
+ }
+ bool merged = false;
+ regions2read_t& r2r = blobs2read[bptr];
+ if (r2r.size()) {
+ read_req_t& pre = r2r.back();
+ if (r_off <= (pre.r_off + pre.r_len)) {
+ front += (r_off - pre.r_off);
+ pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+ pre.regs.emplace_back(region_t(pos, b_off, l, front));
+ merged = true;
+ }
+ }
+ if (!merged) {
+ read_req_t req(r_off, r_len);
+ req.regs.emplace_back(region_t(pos, b_off, l, front));
+ r2r.emplace_back(std::move(req));
+ }
+ }
+ }
+ pos += l;
+ b_off += l;
+ left -= l;
+ b_len -= l;
+ }
+ ++lp;
+ }
+}
+
+int BlueStore::_prepare_read_ioc(
+ blobs2read_t& blobs2read,
+ vector<bufferlist>* compressed_blob_bls,
+ IOContext* ioc)
+{
+ for (auto& p : blobs2read) {
+ const BlobRef& bptr = p.first;
+ regions2read_t& r2r = p.second;
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need " << r2r << std::dec << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ // read the whole thing
+ if (compressed_blob_bls->empty()) {
+ // ensure we avoid any reallocation on subsequent blobs
+ compressed_blob_bls->reserve(blobs2read.size());
+ }
+ compressed_blob_bls->push_back(bufferlist());
+ bufferlist& bl = compressed_blob_bls->back();
+ auto r = bptr->get_blob().map(
+ 0, bptr->get_blob().get_ondisk_length(),
+ [&](uint64_t offset, uint64_t length) {
+ int r = bdev->aio_read(offset, length, &bl, ioc);
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ } else {
+ // read the pieces
+ for (auto& req : r2r) {
+ dout(20) << __func__ << " region 0x" << std::hex
+ << req.regs.front().logical_offset
+ << ": 0x" << req.regs.front().blob_xoffset
+ << " reading 0x" << req.r_off
+ << "~" << req.r_len << std::dec
+ << dendl;
+
+ // read it
+ auto r = bptr->get_blob().map(
+ req.r_off, req.r_len,
+ [&](uint64_t offset, uint64_t length) {
+ int r = bdev->aio_read(offset, length, &req.bl, ioc);
+ if (r < 0)
+ return r;
+ return 0;
+ });
+ if (r < 0) {
+ derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+ << dendl;
+ if (r == -EIO) {
+ // propagate EIO to caller
+ return r;
+ }
+ ceph_assert(r == 0);
+ }
+ ceph_assert(req.bl.length() == req.r_len);
+ }
+ }
+ }
+ return 0;
+}
+
+int BlueStore::_generate_read_result_bl(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ ready_regions_t& ready_regions,
+ vector<bufferlist>& compressed_blob_bls,
+ blobs2read_t& blobs2read,
+ bool buffered,
+ bool* csum_error,
+ bufferlist& bl)
+{
+ // enumerate and decompress desired blobs
+ auto p = compressed_blob_bls.begin();
+ blobs2read_t::iterator b2r_it = blobs2read.begin();
+ while (b2r_it != blobs2read.end()) {
+ const BlobRef& bptr = b2r_it->first;
+ regions2read_t& r2r = b2r_it->second;
+ dout(20) << __func__ << " blob " << *bptr << std::hex
+ << " need 0x" << r2r << std::dec << dendl;
+ if (bptr->get_blob().is_compressed()) {
+ ceph_assert(p != compressed_blob_bls.end());
+ bufferlist& compressed_bl = *p++;
+ if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
+ r2r.front().regs.front().logical_offset) < 0) {
+ *csum_error = true;
+ return -EIO;
+ }
+ bufferlist raw_bl;
+ auto r = _decompress(compressed_bl, &raw_bl);
+ if (r < 0)
+ return r;
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
+ raw_bl);
+ }
+ for (auto& req : r2r) {
+ for (auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(
+ raw_bl, r.blob_xoffset, r.length);
+ }
+ }
+ } else {
+ for (auto& req : r2r) {
+ if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+ req.regs.front().logical_offset) < 0) {
+ *csum_error = true;
+ return -EIO;
+ }
+ if (buffered) {
+ bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
+ req.r_off, req.bl);
+ }
+
+ // prune and keep result
+ for (const auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+ }
+ }
+ }
+ ++b2r_it;
+ }
+
+ // generate a resulting buffer
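+ // (stitch the cached and freshly read regions together in offset order,
+ //  zero-filling any holes so the result is exactly `length` bytes long)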
+ auto pr = ready_regions.begin();
+ auto pr_end = ready_regions.end();
+ uint64_t pos = 0;
+ while (pos < length) {
+ if (pr != pr_end && pr->first == pos + offset) {
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": data from 0x" << pr->first << "~" << pr->second.length()
+ << std::dec << dendl;
+ pos += pr->second.length();
+ bl.claim_append(pr->second);
+ ++pr;
+ } else {
+ uint64_t l = length - pos;
+ if (pr != pr_end) {
+ ceph_assert(pr->first > pos + offset);
+ l = pr->first - (pos + offset);
+ }
+ dout(30) << __func__ << " assemble 0x" << std::hex << pos
+ << ": zeros for 0x" << (pos + offset) << "~" << l
+ << std::dec << dendl;
+ bl.append_zero(l);
+ pos += l;
+ }
+ }
+ ceph_assert(bl.length() == length);
+ ceph_assert(pos == length);
+ ceph_assert(pr == pr_end);
+ return 0;
+}
+
+int BlueStore::_do_read(
+ Collection *c,
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags,
+ uint64_t retry_count)
+{
+ FUNCTRACE(cct);
+ int r = 0;
+ int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << " (" << std::dec
+ << o->onode.size << ")" << dendl;
+ bl.clear();
+
+ if (offset >= o->onode.size) {
+ return r;
+ }
+
+ // generally, don't buffer anything, unless the client explicitly requests
+ // it.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_read &&
+ (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered read" << dendl;
+ buffered = true;
+ }
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ auto start = mono_clock::now();
+ o->extent_map.fault_range(db, offset, length);
+ log_latency(__func__,
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ _dump_onode<30>(cct, *o);
+
+ // for deep-scrub, we only read dirty cache and bypass clean cache in
+ // order to read underlying block device in case there are silent disk errors.
+ if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
+ dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
+ read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
+ }
+
+ // build a blob-wise list of stuff to read (that isn't cached)
+ ready_regions_t ready_regions;
+ blobs2read_t blobs2read;
+ _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
+
+
+ // read raw blob data.
+ start = mono_clock::now(); // for simplicity, measure the whole block
+                            // below; the resulting error is negligible
+ vector<bufferlist> compressed_blob_bls;
+ IOContext ioc(cct, NULL, true); // allow EIO
+ r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
+ // we always issue aio for reading, so errors other than EIO are not allowed
+ if (r < 0)
+ return r;
+
+ int64_t num_ios = blobs2read.size();
+ if (ioc.has_pending_aios()) {
+ num_ios = ioc.get_num_ios();
+ bdev->aio_submit(&ioc);
+ dout(20) << __func__ << " waiting for aio" << dendl;
+ ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ ceph_assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
+ }
+ log_latency_fn(__func__,
+ l_bluestore_read_wait_aio_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+ );
+
+ bool csum_error = false;
+ r = _generate_read_result_bl(o, offset, length, ready_regions,
+ compressed_blob_bls, blobs2read,
+ buffered, &csum_error, bl);
+ if (csum_error) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+ }
+ r = bl.length();
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+ << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+ stringstream s;
+ s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
+ _set_spurious_read_errors_alert(s.str());
+ }
+ return r;
+}
+
+int BlueStore::_verify_csum(OnodeRef& o,
+ const bluestore_blob_t* blob, uint64_t blob_xoffset,
+ const bufferlist& bl,
+ uint64_t logical_offset) const
+{
+ int bad;
+ uint64_t bad_csum;
+ auto start = mono_clock::now();
+ int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+ if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+ (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+ derr << __func__ << " injecting bluestore checksum verification error" << dendl;
+ bad = blob_xoffset;
+ r = -1;
+ bad_csum = 0xDEADBEEF;
+ }
+ if (r < 0) {
+ if (r == -1) {
+ PExtentVector pex;
+ blob->map(
+ bad,
+ blob->get_csum_chunk_size(),
+ [&](uint64_t offset, uint64_t length) {
+ pex.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ derr << __func__ << " bad "
+ << Checksummer::get_csum_type_string(blob->csum_type)
+ << "/0x" << std::hex << blob->get_csum_chunk_size()
+ << " checksum at blob offset 0x" << bad
+ << ", got 0x" << bad_csum << ", expected 0x"
+ << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
+ << ", device location " << pex
+ << ", logical extent 0x" << std::hex
+ << (logical_offset + bad - blob_xoffset) << "~"
+ << blob->get_csum_chunk_size() << std::dec
+ << ", object " << o->oid
+ << dendl;
+ } else {
+ derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_csum_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ if (cct->_conf->bluestore_ignore_data_csum) {
+ return 0;
+ }
+ return r;
+}
+
+int BlueStore::_decompress(bufferlist& source, bufferlist* result)
+{
+ int r = 0;
+ auto start = mono_clock::now();
+ auto i = source.cbegin();
+ bluestore_compression_header_t chdr;
+ decode(chdr, i);
+ int alg = int(chdr.type);
+ CompressorRef cp = compressor;
+ if (!cp || (int)cp->get_type() != alg) {
+ cp = Compressor::create(cct, alg);
+ }
+
+ if (!cp.get()) {
+ // if the compressor isn't available we must return an error,
+ // since we cannot produce the decompressed data
+
+ const char* alg_name = Compressor::get_comp_alg_name(alg);
+ derr << __func__ << " can't load decompressor " << alg_name << dendl;
+ _set_compression_alert(false, alg_name);
+ r = -EIO;
+ } else {
+ r = cp->decompress(i, chdr.length, *result, chdr.compressor_message);
+ if (r < 0) {
+ derr << __func__ << " decompression failed with exit code " << r << dendl;
+ r = -EIO;
+ }
+ }
+ log_latency(__func__,
+ l_bluestore_decompress_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+// this stores the fiemap result into an interval_set; the other
+// variations use it internally
+int BlueStore::_fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ interval_set<uint64_t>& destset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ if (!c->exists)
+ return -ENOENT;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ return -ENOENT;
+ }
+ _dump_onode<30>(cct, *o);
+
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size 0x" << o->onode.size << std::dec << dendl;
+
+ boost::intrusive::set<Extent>::iterator ep, eend;
+ if (offset >= o->onode.size)
+ goto out;
+
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+
+ o->extent_map.fault_range(db, offset, length);
+ eend = o->extent_map.extent_map.end();
+ ep = o->extent_map.seek_lextent(offset);
+ while (length > 0) {
+ dout(20) << __func__ << " offset " << offset << dendl;
+ if (ep != eend && ep->logical_offset + ep->length <= offset) {
+ ++ep;
+ continue;
+ }
+
+ uint64_t x_len = length;
+ if (ep != eend && ep->logical_offset <= offset) {
+ uint64_t x_off = offset - ep->logical_offset;
+ x_len = std::min(x_len, ep->length - x_off);
+ dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
+ << x_len << std::dec << " blob " << ep->blob << dendl;
+ destset.insert(offset, x_len);
+ length -= x_len;
+ offset += x_len;
+ if (x_off + x_len == ep->length)
+ ++ep;
+ continue;
+ }
+ if (ep != eend &&
+ ep->logical_offset > offset &&
+ ep->logical_offset - offset < x_len) {
+ x_len = ep->logical_offset - offset;
+ }
+ offset += x_len;
+ length -= x_len;
+ }
+ }
+
+ out:
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " size = 0x(" << destset << ")" << std::dec << dendl;
+ return 0;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ encode(m, bl);
+ }
+ return r;
+}
+
+int BlueStore::fiemap(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ map<uint64_t, uint64_t>& destmap)
+{
+ interval_set<uint64_t> m;
+ int r = _fiemap(c_, oid, offset, length, m);
+ if (r >= 0) {
+ destmap = std::move(m).detach();
+ }
+ return r;
+}
+
+int BlueStore::readv(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ auto start = mono_clock::now();
+ Collection *c = static_cast<Collection *>(c_.get());
+ const coll_t &cid = c->get_cid();
+ dout(15) << __func__ << " " << cid << " " << oid
+ << " fiemap " << m
+ << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ bl.clear();
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ OnodeRef o = c->get_onode(oid, false);
+ log_latency("get_onode@read",
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start1,
+ cct->_conf->bluestore_log_op_age);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (m.empty()) {
+ r = 0;
+ goto out;
+ }
+
+ r = _do_readv(c, o, m, bl, op_flags);
+ if (r == -EIO) {
+ logger->inc(l_bluestore_read_eio);
+ }
+ }
+
+ out:
+ if (r >= 0 && _debug_data_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->bluestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->bluestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ r = -EIO;
+ }
+ dout(10) << __func__ << " " << cid << " " << oid
+ << " fiemap " << m << std::dec
+ << " = " << r << dendl;
+ log_latency(__func__,
+ l_bluestore_read_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ return r;
+}
+
+int BlueStore::_do_readv(
+ Collection *c,
+ OnodeRef o,
+ const interval_set<uint64_t>& m,
+ bufferlist& bl,
+ uint32_t op_flags,
+ uint64_t retry_count)
+{
+ FUNCTRACE(cct);
+ int r = 0;
+ int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+ dout(20) << __func__ << " fiemap " << m << std::hex
+ << " size 0x" << o->onode.size << " (" << std::dec
+ << o->onode.size << ")" << dendl;
+
+ // generally, don't buffer anything, unless the client explicitly requests
+ // it.
+ bool buffered = false;
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered read" << dendl;
+ buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_read &&
+ (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered read" << dendl;
+ buffered = true;
+ }
+ // this method must be idempotent since we may call it several times
+ // before we finally read the expected result.
+ bl.clear();
+
+ // call fiemap first!
+ ceph_assert(m.range_start() <= o->onode.size);
+ ceph_assert(m.range_end() <= o->onode.size);
+ auto start = mono_clock::now();
+ o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
+ log_latency(__func__,
+ l_bluestore_read_onode_meta_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ _dump_onode<30>(cct, *o);
+
+ IOContext ioc(cct, NULL, true); // allow EIO
+ vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
+ raw_results.reserve(m.num_intervals());
+ int i = 0;
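+ // first pass: look up cached data and prepare aio for every interval; all
+ // queued aio is then submitted and awaited as a single batch below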
+ for (auto p = m.begin(); p != m.end(); p++, i++) {
+ raw_results.push_back({});
+ _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
+ std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
+ r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
+ // we always issue aio for reading, so errors other than EIO are not allowed
+ if (r < 0)
+ return r;
+ }
+
+ auto num_ios = m.size();
+ if (ioc.has_pending_aios()) {
+ num_ios = ioc.get_num_ios();
+ bdev->aio_submit(&ioc);
+ dout(20) << __func__ << " waiting for aio" << dendl;
+ ioc.aio_wait();
+ r = ioc.get_return_value();
+ if (r < 0) {
+ ceph_assert(r == -EIO); // no other errors allowed
+ return -EIO;
+ }
+ }
+ log_latency_fn(__func__,
+ l_bluestore_read_wait_aio_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+ );
+
+ ceph_assert(raw_results.size() == (size_t)m.num_intervals());
+ i = 0;
+ for (auto p = m.begin(); p != m.end(); p++, i++) {
+ bool csum_error = false;
+ bufferlist t;
+ r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
+ std::get<0>(raw_results[i]),
+ std::get<1>(raw_results[i]),
+ std::get<2>(raw_results[i]),
+ buffered, &csum_error, t);
+ if (csum_error) {
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
+ }
+ bl.claim_append(t);
+ }
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read fiemap " << m
+ << " failed " << retry_count << " times before succeeding"
+ << dendl;
+ }
+ return bl.length();
+}
+
+int BlueStore::dump_onode(CollectionHandle &c_,
+ const ghobject_t& oid,
+ const string& section_name,
+ Formatter *f)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ // FIXME minor: the next line isn't actually enough to
+ // load shared blobs. Leaving as is for now.
+ //
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+ _dump_onode<0>(cct, *o);
+ f->open_object_section(section_name.c_str());
+ o->dump(f);
+ f->close_section();
+ r = 0;
+ }
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::getattr(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ const char *name,
+ bufferptr& value)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ mempool::bluestore_cache_meta::string k(name);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!o->onode.attrs.count(k)) {
+ r = -ENODATA;
+ goto out;
+ }
+ value = o->onode.attrs[k];
+ r = 0;
+ }
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::getattrs(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ int r;
+ {
+ std::shared_lock l(c->lock);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ for (auto& i : o->onode.attrs) {
+ aset.emplace(i.first.c_str(), i.second);
+ }
+ r = 0;
+ }
+
+ out:
+ if (r == 0 && _debug_mdata_eio(oid)) {
+ r = -EIO;
+ derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+ }
+ dout(10) << __func__ << " " << c->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::list_collections(vector<coll_t>& ls)
+{
+ std::shared_lock l(coll_lock);
+ ls.reserve(coll_map.size());
+ for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p)
+ ls.push_back(p->first);
+ return 0;
+}
+
+bool BlueStore::collection_exists(const coll_t& c)
+{
+ std::shared_lock l(coll_lock);
+ return coll_map.count(c);
+}
+
+int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+ &ls, &next);
+ if (r < 0) {
+ derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ *empty = ls.empty();
+ dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+ return 0;
+}
+
+int BlueStore::collection_bits(CollectionHandle& ch)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l(c->lock);
+ dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
+ return c->cnode.bits;
+}
+
+int BlueStore::collection_list(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ r = _collection_list(c, start, end, max, false, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::collection_list_legacy(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l(c->lock);
+ r = _collection_list(c, start, end, max, true, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int BlueStore::_collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
+ bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+
+ if (!c->exists)
+ return -ENOENT;
+
+ ghobject_t static_next;
+ std::unique_ptr<CollectionListIterator> it;
+ ghobject_t coll_range_temp_start, coll_range_temp_end;
+ ghobject_t coll_range_start, coll_range_end;
+ ghobject_t pend;
+ bool temp;
+
+ if (!pnext)
+ pnext = &static_next;
+
+ auto log_latency = make_scope_guard(
+ [&, start_time = mono_clock::now(), func_name = __func__] {
+ log_latency_fn(
+ func_name,
+ l_bluestore_remove_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_collection_list_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " start " << start << " end " << end
+ << " max " << max;
+ return ostr.str();
+ });
+ });
+
+ if (start.is_max() || start.hobj.is_max()) {
+ *pnext = ghobject_t::get_max();
+ return 0;
+ }
+ get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
+ &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
+ dout(20) << __func__
+ << " range " << coll_range_temp_start
+ << " to " << coll_range_temp_end
+ << " and " << coll_range_start
+ << " to " << coll_range_end
+ << " start " << start << dendl;
+ if (legacy) {
+ it = std::make_unique<SimpleCollectionListIterator>(
+ cct, db->get_iterator(PREFIX_OBJ));
+ } else {
+ it = std::make_unique<SortedCollectionListIterator>(
+ db->get_iterator(PREFIX_OBJ));
+ }
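+ // a collection's objects are stored in two contiguous key ranges: a temp
+ // namespace and the regular one. we start iterating in whichever range
+ // contains 'start' and, once the temp range is exhausted, fall through to
+ // the regular range.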
+ if (start == ghobject_t() ||
+ start.hobj == hobject_t() ||
+ start == c->cid.get_min_hobj()) {
+ it->upper_bound(coll_range_temp_start);
+ temp = true;
+ } else {
+ if (start.hobj.is_temp()) {
+ temp = true;
+ ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
+ } else {
+ temp = false;
+ ceph_assert(start >= coll_range_start && start < coll_range_end);
+ }
+ dout(20) << __func__ << " temp=" << (int)temp << dendl;
+ it->lower_bound(start);
+ }
+ if (end.hobj.is_max()) {
+ pend = temp ? coll_range_temp_end : coll_range_end;
+ } else {
+ if (end.hobj.is_temp()) {
+ if (temp) {
+ pend = end;
+ } else {
+ *pnext = ghobject_t::get_max();
+ return 0;
+ }
+ } else {
+ pend = temp ? coll_range_temp_end : end;
+ }
+ }
+ dout(20) << __func__ << " pend " << pend << dendl;
+ while (true) {
+ if (!it->valid() || it->is_ge(pend)) {
+ if (!it->valid())
+ dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+ else
+ dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
+ if (temp) {
+ if (end.hobj.is_temp()) {
+ if (it->valid() && it->is_lt(coll_range_temp_end)) {
+ *pnext = it->oid();
+ return 0;
+ }
+ break;
+ }
+ dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+ temp = false;
+ it->upper_bound(coll_range_start);
+ if (end.hobj.is_max())
+ pend = coll_range_end;
+ else
+ pend = end;
+ dout(30) << __func__ << " pend " << pend << dendl;
+ continue;
+ }
+ if (it->valid() && it->is_lt(coll_range_end)) {
+ *pnext = it->oid();
+ return 0;
+ }
+ break;
+ }
+ dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
+ if (ls->size() >= (unsigned)max) {
+ dout(20) << __func__ << " reached max " << max << dendl;
+ *pnext = it->oid();
+ return 0;
+ }
+ ls->push_back(it->oid());
+ it->next();
+ }
+ *pnext = ghobject_t::get_max();
+ return 0;
+}
+
+int BlueStore::omap_get(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out ///< [out] Key to value map
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ return _omap_get(c, oid, header, out);
+}
+
+int BlueStore::_omap_get(
+ Collection *c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out ///< [out] Key to value map
+ )
+{
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ r = _onode_omap_get(o, header, out);
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::_onode_omap_get(
+ const OnodeRef &o, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out ///< [out] Key to value map
+)
+{
+ int r = 0;
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ string head, tail;
+ o->get_omap_header(&head);
+ o->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
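+ // scan from the omap header key up to (but not including) the tail
+ // sentinel; the header is captured separately and every other key is
+ // decoded back to its user-visible name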
+ while (it->valid()) {
+ if (it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ *header = it->value();
+ } else if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ (*out)[user_key] = it->value();
+ }
+ it->next();
+ }
+ }
+out:
+ return r;
+}
+
+int BlueStore::omap_get_header(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ string head;
+ o->get_omap_header(&head);
+ if (db->get(o->get_omap_prefix(), head, header) >= 0) {
+ dout(30) << __func__ << " got header" << dendl;
+ } else {
+ dout(30) << __func__ << " no header" << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ auto start1 = mono_clock::now();
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap())
+ goto out;
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ string head, tail;
+ o->get_omap_key(string(), &head);
+ o->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ }
+ string user_key;
+ o->decode_omap_key(it->key(), &user_key);
+ dout(20) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ keys->insert(user_key);
+ it->next();
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_keys_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+int BlueStore::omap_get_values(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ auto start1 = mono_clock::now();
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(base_key_len); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " got " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(make_pair(*p, val));
+ }
+ }
+ }
+ out:
+ c->store->log_latency(
+ __func__,
+ l_bluestore_omap_get_values_lat,
+ mono_clock::now() - start1,
+ c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+#ifdef WITH_SEASTAR
+int BlueStore::omap_get_values(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::optional<string> &start_after, ///< [in] Keys to get
+ map<string, bufferlist> *output ///< [out] Returned keys and values
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
+ if (!iter) {
+ r = -ENOENT;
+ goto out;
+ }
+ iter->upper_bound(*start_after);
+ for (; iter->valid(); iter->next()) {
+ output->insert(make_pair(iter->key(), iter->value()));
+ }
+ }
+
+out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+#endif
+
+int BlueStore::omap_check_keys(
+ CollectionHandle &c_, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ std::shared_lock l(c->lock);
+ int r = 0;
+ string final_key;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ o->flush();
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ final_key.resize(base_key_len); // keep prefix
+ final_key += *p;
+ bufferlist val;
+ if (db->get(prefix, final_key, &val) >= 0) {
+ dout(30) << __func__ << " have " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ out->insert(*p);
+ } else {
+ dout(30) << __func__ << " miss " << pretty_binary_string(final_key)
+ << " -> " << *p << dendl;
+ }
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+ << dendl;
+ return r;
+}
+
+ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
+ CollectionHandle &c_, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ )
+{
+ Collection *c = static_cast<Collection *>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists) {
+ return ObjectMap::ObjectMapIterator();
+ }
+ std::shared_lock l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ o->flush();
+ dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
+ auto bounds = KeyValueDB::IteratorBounds();
+ if (o->onode.has_omap()) {
+ std::string lower_bound, upper_bound;
+ o->get_omap_key(string(), &lower_bound);
+ o->get_omap_tail(&upper_bound);
+ bounds.lower_bound = std::move(lower_bound);
+ bounds.upper_bound = std::move(upper_bound);
+ }
+ KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+// -----------------
+// write helpers
+
+uint64_t BlueStore::_get_ondisk_reserved() const {
+ ceph_assert(min_alloc_size);
+ return round_up_to(
+ std::max<uint64_t>(SUPER_RESERVED, min_alloc_size), min_alloc_size);
+}
+
+void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
+{
+ dout(10) << __func__ << " ondisk_format " << ondisk_format
+ << " min_compat_ondisk_format " << min_compat_ondisk_format
+ << dendl;
+ ceph_assert(ondisk_format == latest_ondisk_format);
+ {
+ bufferlist bl;
+ encode(ondisk_format, bl);
+ t->set(PREFIX_SUPER, "ondisk_format", bl);
+ }
+ {
+ bufferlist bl;
+ encode(min_compat_ondisk_format, bl);
+ t->set(PREFIX_SUPER, "min_compat_ondisk_format", bl);
+ }
+}
+
+int BlueStore::_open_super_meta()
+{
+ // nid
+ {
+ nid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "nid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ nid_max = v;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read nid_max" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " old nid_max " << nid_max << dendl;
+ nid_last = nid_max.load();
+ }
+
+ // blobid
+ {
+ blobid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "blobid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t v;
+ decode(v, p);
+ blobid_max = v;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read blobid_max" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
+ blobid_last = blobid_max.load();
+ }
+
+ // freelist
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "freelist_type", &bl);
+ if (bl.length()) {
+ freelist_type = std::string(bl.c_str(), bl.length());
+ dout(1) << __func__ << " freelist_type " << freelist_type << dendl;
+ } else {
+ ceph_abort_msg("Not Support extent freelist manager");
+ }
+ }
+
+ // ondisk format
+ int32_t compat_ondisk_format = 0;
+ {
+ bufferlist bl;
+ int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
+ if (r < 0) {
+ // base case: kraken bluestore is v1 and readable by v1
+ dout(20) << __func__ << " missing ondisk_format; assuming kraken"
+ << dendl;
+ ondisk_format = 1;
+ compat_ondisk_format = 1;
+ } else {
+ auto p = bl.cbegin();
+ try {
+ decode(ondisk_format, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read ondisk_format" << dendl;
+ return -EIO;
+ }
+ bl.clear();
+ {
+ r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
+ ceph_assert(!r);
+ auto p = bl.cbegin();
+ try {
+ decode(compat_ondisk_format, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read compat_ondisk_format" << dendl;
+ return -EIO;
+ }
+ }
+ }
+ dout(1) << __func__ << " ondisk_format " << ondisk_format
+ << " compat_ondisk_format " << compat_ondisk_format
+ << dendl;
+ }
+
+ if (latest_ondisk_format < compat_ondisk_format) {
+ derr << __func__ << " compat_ondisk_format is "
+ << compat_ondisk_format << " but we only understand version "
+ << latest_ondisk_format << dendl;
+ return -EPERM;
+ }
+
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ min_alloc_size_order = ctz(val);
+ ceph_assert(min_alloc_size == 1u << min_alloc_size_order);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " unable to read min_alloc_size" << dendl;
+ return -EIO;
+ }
+ dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+ << std::dec << dendl;
+ }
+
+ _set_per_pool_omap();
+
+ _open_statfs();
+ _set_alloc_sizes();
+ _set_throttle_params();
+
+ _set_csum();
+ _set_compression();
+ _set_blob_size();
+
+ _validate_bdev();
+ return 0;
+}
+
+int BlueStore::_upgrade_super()
+{
+ dout(1) << __func__ << " from " << ondisk_format << ", latest "
+ << latest_ondisk_format << dendl;
+ if (ondisk_format < latest_ondisk_format) {
+ ceph_assert(ondisk_format > 0);
+ ceph_assert(ondisk_format < latest_ondisk_format);
+
+ KeyValueDB::Transaction t = db->get_transaction();
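+ // upgrades are applied stepwise, one ondisk format version at a time, and
+ // the updated super keys are committed synchronously at the end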
+ if (ondisk_format == 1) {
+ // changes:
+ // - super: added ondisk_format
+ // - super: added min_readable_ondisk_format
+ // - super: added min_compat_ondisk_format
+ // - super: added min_alloc_size
+ // - super: removed min_min_alloc_size
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
+ auto p = bl.cbegin();
+ try {
+ uint64_t val;
+ decode(val, p);
+ min_alloc_size = val;
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to read min_min_alloc_size" << dendl;
+ return -EIO;
+ }
+ t->set(PREFIX_SUPER, "min_alloc_size", bl);
+ t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
+ }
+ ondisk_format = 2;
+ }
+ if (ondisk_format == 2) {
+ // changes:
+ // - onode has FLAG_PERPOOL_OMAP. Note that we do not know that *all*
+ // onodes are using the per-pool prefix until a repair is run; at that
+ // point the per_pool_omap=1 key will be set.
+ // - super: added per_pool_omap key, which indicates that *all* objects
+ // are using the new prefix and key format
+ ondisk_format = 3;
+ }
+ if (ondisk_format == 3) {
+ // changes:
+ // - FreelistManager keeps meta within bdev label
+ int r = _write_out_fm_meta(0);
+ ceph_assert(r == 0);
+ ondisk_format = 4;
+ }
+ // This must be the last operation.
+ _prepare_ondisk_format_super(t);
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
+ }
+ // done
+ dout(1) << __func__ << " done" << dendl;
+ return 0;
+}
+
+void BlueStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+ if (o->onode.nid) {
+ ceph_assert(o->exists);
+ return;
+ }
+ uint64_t nid = ++nid_last;
+ dout(20) << __func__ << " " << nid << dendl;
+ o->onode.nid = nid;
+ txc->last_nid = nid;
+ o->exists = true;
+}
+
+uint64_t BlueStore::_assign_blobid(TransContext *txc)
+{
+ uint64_t bid = ++blobid_last;
+ dout(20) << __func__ << " " << bid << dendl;
+ txc->last_blobid = bid;
+ return bid;
+}
+
+void BlueStore::get_db_statistics(Formatter *f)
+{
+ db->get_statistics(f);
+}
+
+BlueStore::TransContext *BlueStore::_txc_create(
+ Collection *c, OpSequencer *osr,
+ list<Context*> *on_commits,
+ TrackedOpRef osd_op)
+{
+ TransContext *txc = new TransContext(cct, c, osr, on_commits);
+ txc->t = db->get_transaction();
+
+#ifdef WITH_BLKIN
+ if (osd_op && osd_op->pg_trace) {
+ txc->trace.init("TransContext", &trace_endpoint,
+ &osd_op->pg_trace);
+ txc->trace.event("txc create");
+ txc->trace.keyval("txc seq", txc->seq);
+ }
+#endif
+
+ osr->queue_new(txc);
+ dout(20) << __func__ << " osr " << osr << " = " << txc
+ << " seq " << txc->seq << dendl;
+ return txc;
+}
+
+void BlueStore::_txc_calc_cost(TransContext *txc)
+{
+ // one "io" for the kv commit
+ auto ios = 1 + txc->ioc.get_num_ios();
+ auto cost = throttle_cost_per_io.load();
+ txc->cost = ios * cost + txc->bytes;
+ txc->ios = ios;
+ dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
+ << ios << " ios * " << cost << " + " << txc->bytes
+ << " bytes)" << dendl;
+}
+
+void BlueStore::_txc_update_store_statfs(TransContext *txc)
+{
+ if (txc->statfs_delta.is_empty())
+ return;
+
+ logger->inc(l_bluestore_allocated, txc->statfs_delta.allocated());
+ logger->inc(l_bluestore_stored, txc->statfs_delta.stored());
+ logger->inc(l_bluestore_compressed, txc->statfs_delta.compressed());
+ logger->inc(l_bluestore_compressed_allocated, txc->statfs_delta.compressed_allocated());
+ logger->inc(l_bluestore_compressed_original, txc->statfs_delta.compressed_original());
+
+ bufferlist bl;
+ txc->statfs_delta.encode(bl);
+ if (per_pool_stat_collection) {
+ string key;
+ get_pool_stat_key(txc->osd_pool_id, &key);
+ txc->t->merge(PREFIX_STAT, key, bl);
+
+ std::lock_guard l(vstatfs_lock);
+ auto& stats = osd_pools[txc->osd_pool_id];
+ stats += txc->statfs_delta;
+
+ vstatfs += txc->statfs_delta; //non-persistent in this mode
+
+ } else {
+ txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+
+ std::lock_guard l(vstatfs_lock);
+ vstatfs += txc->statfs_delta;
+ }
+ txc->statfs_delta.reset();
+}
+
+void BlueStore::_txc_state_proc(TransContext *txc)
+{
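+ // advance the txc through its state machine; the loop keeps stepping until
+ // the txc either has to wait (for aio, the kv sync thread, or deferred io)
+ // or is finished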
+ while (true) {
+ dout(10) << __func__ << " txc " << txc
+ << " " << txc->get_state_name() << dendl;
+ switch (txc->get_state()) {
+ case TransContext::STATE_PREPARE:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
+ if (txc->ioc.has_pending_aios()) {
+ txc->set_state(TransContext::STATE_AIO_WAIT);
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
+ }
+#endif
+ txc->had_ios = true;
+ _txc_aio_submit(txc);
+ return;
+ }
+ // ** fall-thru **
+
+ case TransContext::STATE_AIO_WAIT:
+ {
+ mono_clock::duration lat = throttle.log_state_latency(
+ *txc, logger, l_bluestore_state_aio_wait_lat);
+ if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
+ dout(0) << __func__ << " slow aio_wait, txc = " << txc
+ << ", latency = " << lat
+ << dendl;
+ }
+ }
+
+ _txc_finish_io(txc); // may trigger blocked txc's too
+ return;
+
+ case TransContext::STATE_IO_DONE:
+ ceph_assert(ceph_mutex_is_locked(txc->osr->qlock)); // see _txc_finish_io
+ if (txc->had_ios) {
+ ++txc->osr->txc_with_unstable_io;
+ }
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
+ txc->set_state(TransContext::STATE_KV_QUEUED);
+ if (cct->_conf->bluestore_sync_submit_transaction) {
+ if (txc->last_nid >= nid_max ||
+ txc->last_blobid >= blobid_max) {
+ dout(20) << __func__
+ << " last_{nid,blobid} exceeds max, submit via kv thread"
+ << dendl;
+ } else if (txc->osr->kv_committing_serially) {
+ dout(20) << __func__ << " prior txc submitted via kv thread, us too"
+ << dendl;
+ // note: this is starvation-prone. once we have a txc in a busy
+ // sequencer that is committing serially it is possible to keep
+ // submitting new transactions fast enough that we get stuck doing
+ // so. the alternative is to block here... fixme?
+ } else if (txc->osr->txc_with_unstable_io) {
+ dout(20) << __func__ << " prior txc(s) with unstable ios "
+ << txc->osr->txc_with_unstable_io.load() << dendl;
+ } else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
+ rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
+ == 0) {
+ dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
+ << dendl;
+ } else {
+ _txc_apply_kv(txc, true);
+ }
+ }
+ {
+ std::lock_guard l(kv_lock);
+ kv_queue.push_back(txc);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
+ kv_queue_unsubmitted.push_back(txc);
+ ++txc->osr->kv_committing_serially;
+ }
+ if (txc->had_ios)
+ kv_ios++;
+ kv_throttle_costs += txc->cost;
+ }
+ return;
+ case TransContext::STATE_KV_SUBMITTED:
+ _txc_committed_kv(txc);
+ // ** fall-thru **
+
+ case TransContext::STATE_KV_DONE:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
+ if (txc->deferred_txn) {
+ txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
+ _deferred_queue(txc);
+ return;
+ }
+ txc->set_state(TransContext::STATE_FINISHING);
+ break;
+
+ case TransContext::STATE_DEFERRED_CLEANUP:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
+ txc->set_state(TransContext::STATE_FINISHING);
+ // ** fall-thru **
+
+ case TransContext::STATE_FINISHING:
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
+ _txc_finish(txc);
+ return;
+
+ default:
+ derr << __func__ << " unexpected txc " << txc
+ << " state " << txc->get_state_name() << dendl;
+ ceph_abort_msg("unexpected txc state");
+ return;
+ }
+ }
+}
+
+void BlueStore::_txc_finish_io(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << dendl;
+
+ /*
+ * we need to preserve the order of kv transactions,
+ * even though aio will complete in any order.
+ */
+
+ OpSequencer *osr = txc->osr.get();
+ std::lock_guard l(osr->qlock);
+ txc->set_state(TransContext::STATE_IO_DONE);
+ txc->ioc.release_running_aios();
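+ // walk backwards through the sequencer queue: if an earlier txc has not yet
+ // finished its io we must not proceed; otherwise advance, in submission
+ // order, this txc and any consecutive ones whose io is already done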
+ OpSequencer::q_list_t::iterator p = osr->q.iterator_to(*txc);
+ while (p != osr->q.begin()) {
+ --p;
+ if (p->get_state() < TransContext::STATE_IO_DONE) {
+ dout(20) << __func__ << " " << txc << " blocked by " << &*p << " "
+ << p->get_state_name() << dendl;
+ return;
+ }
+ if (p->get_state() > TransContext::STATE_IO_DONE) {
+ ++p;
+ break;
+ }
+ }
+ do {
+ _txc_state_proc(&*p++);
+ } while (p != osr->q.end() &&
+ p->get_state() == TransContext::STATE_IO_DONE);
+
+ if (osr->kv_submitted_waiters) {
+ osr->qcond.notify_all();
+ }
+}
+
+void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " onodes " << txc->onodes
+ << " shared_blobs " << txc->shared_blobs
+ << dendl;
+
+ // finalize onodes
+ for (auto o : txc->onodes) {
+ _record_onode(o, t);
+ o->flushing_count++;
+ }
+
+ // objects we modified but didn't affect the onode
+ auto p = txc->modified_objects.begin();
+ while (p != txc->modified_objects.end()) {
+ if (txc->onodes.count(*p) == 0) {
+ (*p)->flushing_count++;
+ ++p;
+ } else {
+ // remove dups with onodes list to avoid problems in _txc_finish
+ p = txc->modified_objects.erase(p);
+ }
+ }
+
+ // finalize shared_blobs
+ for (auto sb : txc->shared_blobs) {
+ string key;
+ auto sbid = sb->get_sbid();
+ get_shared_blob_key(sbid, &key);
+ if (sb->persistent->empty()) {
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is empty" << dendl;
+ t->rmkey(PREFIX_SHARED_BLOB, key);
+ } else {
+ bufferlist bl;
+ encode(*(sb->persistent), bl);
+ dout(20) << __func__ << " shared_blob 0x"
+ << std::hex << sbid << std::dec
+ << " is " << bl.length() << " " << *sb << dendl;
+ t->set(PREFIX_SHARED_BLOB, key, bl);
+ }
+ }
+}
+
+void BlueStore::BSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+ os_apply_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_bluestore_commit_lat));
+}
+
+// For every object we maintain a <zone_num+oid, offset> tuple in the key-value
+// store. When a new object is written to a zone, we insert the corresponding
+// tuple into the database. When an object is truncated, we remove the
+// corresponding tuple. When an object is overwritten, we remove the old tuple
+// and insert a new tuple corresponding to the new location of the object. The
+// cleaner can then identify the live objects within zone <zone_num> by
+// enumerating all the keys that start with the <zone_num> prefix.
+void BlueStore::_zoned_update_cleaning_metadata(TransContext *txc) {
+ for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
+ std::string key;
+ get_object_key(cct, o->oid, &key);
+ for (auto offset : offsets) {
+ if (offset > 0) {
+ bufferlist offset_bl;
+ encode(offset, offset_bl);
+ txc->t->set(_zoned_get_prefix(offset), key, offset_bl);
+ } else {
+ txc->t->rmkey(_zoned_get_prefix(-offset), key);
+ }
+ }
+ }
+}
+
+std::string BlueStore::_zoned_get_prefix(uint64_t offset) {
+ uint64_t zone_num = offset / bdev->get_zone_size();
+ std::string zone_key;
+ _key_encode_u64(zone_num, &zone_key);
+ return PREFIX_ZONED_CL_INFO + zone_key;
+}
+
+// For now, to avoid interface changes, we piggyback the zone size (in MiB) and
+// the first sequential zone number onto min_alloc_size and pass the combined
+// value to Allocator::create and FreelistManager::create.
+uint64_t BlueStore::_zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size) {
+ uint64_t zone_size = bdev->get_zone_size();
+ uint64_t zone_size_mb = zone_size / (1024 * 1024);
+ uint64_t first_seq_zone = bdev->get_conventional_region_size() / zone_size;
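+ // pack the extra parameters into the upper bits: bits 32-47 carry the zone
+ // size in MiB and bits 48+ carry the index of the first sequential zone,
+ // while the low 32 bits remain the real min_alloc_size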
+ min_alloc_size |= (zone_size_mb << 32);
+ min_alloc_size |= (first_seq_zone << 48);
+ return min_alloc_size;
+}
+
+int BlueStore::_zoned_check_config_settings() {
+ if (cct->_conf->bluestore_allocator != "zoned") {
+ dout(1) << __func__ << " The drive is HM-SMR but "
+ << cct->_conf->bluestore_allocator << " allocator is specified. "
+ << "Only zoned allocator can be used with HM-SMR drive." << dendl;
+ return -EINVAL;
+ }
+
+ // At least for now we want to use large min_alloc_size with HM-SMR drives.
+ // Populating used_blocks bitset on a debug build of ceph-osd takes about 5
+ // minutes with a 14 TB HM-SMR drive and 4 KiB min_alloc_size.
+ if (min_alloc_size < 64 * 1024) {
+ dout(1) << __func__ << " The drive is HM-SMR but min_alloc_size is "
+ << min_alloc_size << ". "
+ << "Please set to at least 64 KiB." << dendl;
+ return -EINVAL;
+ }
+
+ // We don't want to defer writes with HM-SMR because doing so violates the
+ // sequential write requirement.
+ if (prefer_deferred_size) {
+ dout(1) << __func__ << " The drive is HM-SMR but prefer_deferred_size is "
+ << prefer_deferred_size << ". "
+ << "Please set to 0." << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
+{
+ dout(20) << __func__ << " txc " << txc << std::hex
+ << " allocated 0x" << txc->allocated
+ << " released 0x" << txc->released
+ << std::dec << dendl;
+
+ // We have to handle the case where we allocate *and* deallocate the
+ // same region in this transaction. The freelist doesn't like that.
+ // (Actually, the only thing that cares is the BitmapFreelistManager
+ // debug check. But that's important.)
+ interval_set<uint64_t> tmp_allocated, tmp_released;
+ interval_set<uint64_t> *pallocated = &txc->allocated;
+ interval_set<uint64_t> *preleased = &txc->released;
+ if (!txc->allocated.empty() && !txc->released.empty()) {
+ interval_set<uint64_t> overlap;
+ overlap.intersection_of(txc->allocated, txc->released);
+ if (!overlap.empty()) {
+ tmp_allocated = txc->allocated;
+ tmp_allocated.subtract(overlap);
+ tmp_released = txc->released;
+ tmp_released.subtract(overlap);
+ dout(20) << __func__ << " overlap 0x" << std::hex << overlap
+ << ", new allocated 0x" << tmp_allocated
+ << " released 0x" << tmp_released << std::dec
+ << dendl;
+ pallocated = &tmp_allocated;
+ preleased = &tmp_released;
+ }
+ }
+
+ // update freelist with non-overlap sets
+ for (interval_set<uint64_t>::iterator p = pallocated->begin();
+ p != pallocated->end();
+ ++p) {
+ fm->allocate(p.get_start(), p.get_len(), t);
+ }
+ for (interval_set<uint64_t>::iterator p = preleased->begin();
+ p != preleased->end();
+ ++p) {
+ dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
+ << "~" << p.get_len() << std::dec << dendl;
+ fm->release(p.get_start(), p.get_len(), t);
+ }
+
+ if (bdev->is_smr()) {
+ _zoned_update_cleaning_metadata(txc);
+ }
+
+ _txc_update_store_statfs(txc);
+}
+
+void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
+{
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
+ {
+#if defined(WITH_LTTNG)
+ auto start = mono_clock::now();
+#endif
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("db async submit");
+ }
+#endif
+
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ txc->set_state(TransContext::STATE_KV_SUBMITTED);
+ if (txc->osr->kv_submitted_waiters) {
+ std::lock_guard l(txc->osr->qlock);
+ txc->osr->qcond.notify_all();
+ }
+
+#if defined(WITH_LTTNG)
+ if (txc->tracing) {
+ tracepoint(
+ bluestore,
+ transaction_kv_submit_latency,
+ txc->osr->get_sequencer_id(),
+ txc->seq,
+ sync_submit_transaction,
+ ceph::to_seconds<double>(mono_clock::now() - start));
+ }
+#endif
+ }
+
+ for (auto ls : { &txc->onodes, &txc->modified_objects }) {
+ for (auto& o : *ls) {
+ dout(20) << __func__ << " onode " << o << " had " << o->flushing_count
+ << dendl;
+ if (--o->flushing_count == 0 && o->waiting_count.load()) {
+ std::lock_guard l(o->flush_lock);
+ o->flush_cond.notify_all();
+ }
+ }
+ }
+}
+
+void BlueStore::_txc_committed_kv(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << dendl;
+ throttle.complete_kv(*txc);
+ {
+ std::lock_guard l(txc->osr->qlock);
+ txc->set_state(TransContext::STATE_KV_DONE);
+ if (txc->ch->commit_queue) {
+ txc->ch->commit_queue->queue(txc->oncommits);
+ } else {
+ finisher.queue(txc->oncommits);
+ }
+ }
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
+ log_latency_fn(
+ __func__,
+ l_bluestore_commit_lat,
+ mono_clock::now() - txc->start,
+ cct->_conf->bluestore_log_op_age,
+ [&](auto lat) {
+ return ", txc = " + stringify(txc);
+ }
+ );
+}
+
+void BlueStore::_txc_finish(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+ ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
+
+ for (auto& sb : txc->shared_blobs_written) {
+ sb->finish_write(txc->seq);
+ }
+ txc->shared_blobs_written.clear();
+
+ while (!txc->removed_collections.empty()) {
+ _queue_reap_collection(txc->removed_collections.front());
+ txc->removed_collections.pop_front();
+ }
+
+ OpSequencerRef osr = txc->osr;
+ bool empty = false;
+ bool submit_deferred = false;
+ OpSequencer::q_list_t releasing_txc;
+ {
+ std::lock_guard l(osr->qlock);
+ txc->set_state(TransContext::STATE_DONE);
+ bool notify = false;
+ while (!osr->q.empty()) {
+ TransContext *txc = &osr->q.front();
+ dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
+ << dendl;
+ if (txc->get_state() != TransContext::STATE_DONE) {
+ if (txc->get_state() == TransContext::STATE_PREPARE &&
+ deferred_aggressive) {
+ // for _osr_drain_preceding()
+ notify = true;
+ }
+ if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
+ osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
+ submit_deferred = true;
+ }
+ break;
+ }
+
+ osr->q.pop_front();
+ releasing_txc.push_back(*txc);
+ }
+
+ if (osr->q.empty()) {
+ dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+ empty = true;
+ }
+
+ // only drain()/drain_preceding() need wakeup,
+ // other cases use kv_submitted_waiters
+ if (notify || empty) {
+ osr->qcond.notify_all();
+ }
+ }
+
+ while (!releasing_txc.empty()) {
+ // release to allocator only after all preceding txc's have also
+ // finished any deferred writes that potentially land in these
+ // blocks
+ auto txc = &releasing_txc.front();
+ _txc_release_alloc(txc);
+ releasing_txc.pop_front();
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
+ throttle.complete(*txc);
+ delete txc;
+ }
+
+ if (submit_deferred) {
+ // we're pinning memory; flush! we could be more fine-grained here but
+ // i'm not sure it's worth the bother.
+ deferred_try_submit();
+ }
+
+ if (empty && osr->zombie) {
+ std::lock_guard l(zombie_osr_lock);
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
+ << dendl;
+ }
+ }
+}
+
+void BlueStore::_txc_release_alloc(TransContext *txc)
+{
+ // it's expected we're called with lazy_release_lock already taken!
+ if (likely(!cct->_conf->bluestore_debug_no_reuse_blocks)) {
+ int r = 0;
+ if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
+ r = bdev->queue_discard(txc->released);
+ if (r == 0) {
+ dout(10) << __func__ << "(queued) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ goto out;
+ }
+ } else if (cct->_conf->bdev_enable_discard) {
+ for (auto p = txc->released.begin(); p != txc->released.end(); ++p) {
+ bdev->discard(p.get_start(), p.get_len());
+ }
+ }
+ dout(10) << __func__ << "(sync) " << txc << " " << std::hex
+ << txc->released << std::dec << dendl;
+ shared_alloc.a->release(txc->released);
+ }
+
+out:
+ txc->allocated.clear();
+ txc->released.clear();
+}
+
+void BlueStore::_osr_attach(Collection *c)
+{
+ // note: caller has RWLock on coll_map
+ auto q = coll_map.find(c->cid);
+ if (q != coll_map.end()) {
+ c->osr = q->second->osr;
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " reusing osr " << c->osr << " from existing coll "
+ << q->second << dendl;
+ } else {
+ std::lock_guard l(zombie_osr_lock);
+ auto p = zombie_osr_set.find(c->cid);
+ if (p == zombie_osr_set.end()) {
+ c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " fresh osr " << c->osr << dendl;
+ } else {
+ c->osr = p->second;
+ zombie_osr_set.erase(p);
+ ldout(cct, 10) << __func__ << " " << c->cid
+ << " resurrecting zombie osr " << c->osr << dendl;
+ c->osr->zombie = false;
+ }
+ }
+}
+
+void BlueStore::_osr_register_zombie(OpSequencer *osr)
+{
+ std::lock_guard l(zombie_osr_lock);
+ dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
+ osr->zombie = true;
+ auto i = zombie_osr_set.emplace(osr->cid, osr);
+ // this is either a new insertion or the same osr is already there
+ ceph_assert(i.second || i.first->second == osr);
+}
+
+void BlueStore::_osr_drain_preceding(TransContext *txc)
+{
+ OpSequencer *osr = txc->osr.get();
+ dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ osr->deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ osr->drain_preceding(txc);
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain(OpSequencer *osr)
+{
+ dout(10) << __func__ << " " << osr << dendl;
+ ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+ {
+ // submit anything pending
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending && !osr->deferred_running) {
+ _deferred_submit_unlock(osr);
+ } else {
+ osr->deferred_lock.unlock();
+ }
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ osr->drain();
+ --deferred_aggressive;
+ dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+void BlueStore::_osr_drain_all()
+{
+ dout(10) << __func__ << dendl;
+
+ set<OpSequencerRef> s;
+ vector<OpSequencerRef> zombies;
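+ // collect every live sequencer (from coll_map) plus any zombie sequencers
+ // so that all in-flight transactions get drained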
+ {
+ std::shared_lock l(coll_lock);
+ for (auto& i : coll_map) {
+ s.insert(i.second->osr);
+ }
+ }
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& i : zombie_osr_set) {
+ s.insert(i.second);
+ zombies.push_back(i.second);
+ }
+ }
+ dout(20) << __func__ << " osr_set " << s << dendl;
+
+ ++deferred_aggressive;
+ {
+ // submit anything pending
+ deferred_try_submit();
+ }
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ kv_cond.notify_one();
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_cond.notify_one();
+ }
+ for (auto osr : s) {
+ dout(20) << __func__ << " drain " << osr << dendl;
+ osr->drain();
+ }
+ --deferred_aggressive;
+
+ {
+ std::lock_guard l(zombie_osr_lock);
+ for (auto& osr : zombies) {
+ if (zombie_osr_set.erase(osr->cid)) {
+ dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+ ceph_assert(osr->q.empty());
+ } else if (osr->zombie) {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " already reaped" << dendl;
+ ceph_assert(osr->q.empty());
+ } else {
+ dout(10) << __func__ << " empty zombie osr " << osr
+ << " resurrected" << dendl;
+ }
+ }
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+}
+
+
+void BlueStore::_kv_start()
+{
+ dout(10) << __func__ << dendl;
+
+ finisher.start();
+ kv_sync_thread.create("bstore_kv_sync");
+ kv_finalize_thread.create("bstore_kv_final");
+}
+
+void BlueStore::_kv_stop()
+{
+ dout(10) << __func__ << dendl;
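+ // handshake with both kv threads: wait until each has signalled that it is
+ // running, then raise its stop flag and wake it so it can exit its loop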
+ {
+ std::unique_lock l{kv_lock};
+ while (!kv_sync_started) {
+ kv_cond.wait(l);
+ }
+ kv_stop = true;
+ kv_cond.notify_all();
+ }
+ {
+ std::unique_lock l{kv_finalize_lock};
+ while (!kv_finalize_started) {
+ kv_finalize_cond.wait(l);
+ }
+ kv_finalize_stop = true;
+ kv_finalize_cond.notify_all();
+ }
+ kv_sync_thread.join();
+ kv_finalize_thread.join();
+ ceph_assert(removed_collections.empty());
+ {
+ std::lock_guard l(kv_lock);
+ kv_stop = false;
+ }
+ {
+ std::lock_guard l(kv_finalize_lock);
+ kv_finalize_stop = false;
+ }
+ dout(10) << __func__ << " stopping finishers" << dendl;
+ finisher.wait_for_empty();
+ finisher.stop();
+ dout(10) << __func__ << " stopped" << dendl;
+}
+
+void BlueStore::_kv_sync_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
+ std::unique_lock l{kv_lock};
+ ceph_assert(!kv_sync_started);
+ kv_sync_started = true;
+ kv_cond.notify_all();
+
+ auto t0 = mono_clock::now();
+ timespan twait = ceph::make_timespan(0);
+ size_t kv_submitted = 0;
+
+ while (true) {
+ auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
+ auto observation_period =
+ ceph::make_timespan(period);
+ auto elapsed = mono_clock::now() - t0;
+ if (period && elapsed >= observation_period) {
+ dout(5) << __func__ << " utilization: idle "
+ << twait << " of " << elapsed
+ << ", submitted: " << kv_submitted
+ <<dendl;
+ t0 = mono_clock::now();
+ twait = ceph::make_timespan(0);
+ kv_submitted = 0;
+ }
+ ceph_assert(kv_committing.empty());
+ if (kv_queue.empty() &&
+ ((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
+ !deferred_aggressive)) {
+ if (kv_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ auto t = mono_clock::now();
+ kv_sync_in_progress = false;
+ kv_cond.wait(l);
+ twait += mono_clock::now() - t;
+
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ deque<TransContext*> kv_submitting;
+ deque<DeferredBatch*> deferred_done, deferred_stable;
+ uint64_t aios = 0, costs = 0;
+
+ dout(20) << __func__ << " committing " << kv_queue.size()
+ << " submitting " << kv_queue_unsubmitted.size()
+ << " deferred done " << deferred_done_queue.size()
+ << " stable " << deferred_stable_queue.size()
+ << dendl;
+ kv_committing.swap(kv_queue);
+ kv_submitting.swap(kv_queue_unsubmitted);
+ deferred_done.swap(deferred_done_queue);
+ deferred_stable.swap(deferred_stable_queue);
+ aios = kv_ios;
+ costs = kv_throttle_costs;
+ kv_ios = 0;
+ kv_throttle_costs = 0;
+ l.unlock();
+
+ dout(30) << __func__ << " committing " << kv_committing << dendl;
+ dout(30) << __func__ << " submitting " << kv_submitting << dendl;
+ dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
+ dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ bool force_flush = false;
+ // if bluefs is sharing the same device as data (only), then we
+ // can rely on the bluefs commit to flush the device and make
+ // deferred aios stable. that means that if we do have completed deferred
+ // txcs AND we are not on a single shared device, we need to force a flush.
+ if (bluefs && bluefs_layout.single_shared_device()) {
+ if (aios) {
+ force_flush = true;
+ } else if (kv_committing.empty() && deferred_stable.empty()) {
+ force_flush = true; // there's nothing else to commit!
+ } else if (deferred_aggressive) {
+ force_flush = true;
+ }
+ } else {
+ if (aios || !deferred_done.empty()) {
+ force_flush = true;
+ } else {
+ dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
+ }
+ }
+
+ if (force_flush) {
+ dout(20) << __func__ << " num_aios=" << aios
+ << " force_flush=" << (int)force_flush
+ << ", flushing, deferred done->stable" << dendl;
+ // flush/barrier on block device
+ bdev->flush();
+
+ // if we flush then deferred done are now deferred stable
+ deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
+ deferred_done.end());
+ deferred_done.clear();
+ }
+ auto after_flush = mono_clock::now();
+
+ // we will use one final transaction to force a sync
+ KeyValueDB::Transaction synct = db->get_transaction();
+
+ // increase {nid,blobid}_max? note that this covers both the
+ // case where we are approaching the max and the case we passed
+ // it. in either case, we increase the max in the earlier txn
+ // we submit.
+ uint64_t new_nid_max = 0, new_blobid_max = 0;
+ if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
+ bufferlist bl;
+ encode(new_nid_max, bl);
+ t->set(PREFIX_SUPER, "nid_max", bl);
+ dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
+ }
+ if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
+ KeyValueDB::Transaction t =
+ kv_submitting.empty() ? synct : kv_submitting.front()->t;
+ new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
+ bufferlist bl;
+ encode(new_blobid_max, bl);
+ t->set(PREFIX_SUPER, "blobid_max", bl);
+ dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
+ }
+
+ for (auto txc : kv_committing) {
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
+ if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
+ ++kv_submitted;
+ _txc_apply_kv(txc, false);
+ --txc->osr->kv_committing_serially;
+ } else {
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+ }
+ if (txc->had_ios) {
+ --txc->osr->txc_with_unstable_io;
+ }
+ }
+
+ // release throttle *before* we commit. this allows new ops
+ // to be prepared and enter pipeline while we are waiting on
+ // the kv commit sync/flush. then hopefully on the next
+ // iteration there will already be ops awake. otherwise, we
+ // end up going to sleep, and then wake up when the very first
+ // transaction is ready for commit.
+ throttle.release_kv_throttle(costs);
+
+ // cleanup sync deferred keys
+ for (auto b : deferred_stable) {
+ for (auto& txc : b->txcs) {
+ bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
+ ceph_assert(wt.released.empty()); // only kraken did this
+ string key;
+ get_deferred_key(wt.seq, &key);
+ synct->rm_single_key(PREFIX_DEFERRED, key);
+ }
+ }
+
+#if defined(WITH_LTTNG)
+ auto sync_start = mono_clock::now();
+#endif
+ // submit synct synchronously (block and wait for it to commit)
+ int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
+ ceph_assert(r == 0);
+
+#ifdef WITH_BLKIN
+ for (auto txc : kv_committing) {
+ if (txc->trace) {
+ txc->trace.event("db sync submit");
+ txc->trace.keyval("kv_committing size", kv_committing.size());
+ }
+ }
+#endif
+
+ int committing_size = kv_committing.size();
+ int deferred_size = deferred_stable.size();
+
+#if defined(WITH_LTTNG)
+ double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
+ for (auto txc: kv_committing) {
+ if (txc->tracing) {
+ tracepoint(
+ bluestore,
+ transaction_kv_sync_latency,
+ txc->osr->get_sequencer_id(),
+ txc->seq,
+ kv_committing.size(),
+ deferred_done.size(),
+ deferred_stable.size(),
+ sync_latency);
+ }
+ }
+#endif
+
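+      // hand the committed txcs and now-stable deferred batches over to the
+      // kv_finalize thread, waking it if it is idle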
+ {
+ std::unique_lock m{kv_finalize_lock};
+ if (kv_committing_to_finalize.empty()) {
+ kv_committing_to_finalize.swap(kv_committing);
+ } else {
+ kv_committing_to_finalize.insert(
+ kv_committing_to_finalize.end(),
+ kv_committing.begin(),
+ kv_committing.end());
+ kv_committing.clear();
+ }
+ if (deferred_stable_to_finalize.empty()) {
+ deferred_stable_to_finalize.swap(deferred_stable);
+ } else {
+ deferred_stable_to_finalize.insert(
+ deferred_stable_to_finalize.end(),
+ deferred_stable.begin(),
+ deferred_stable.end());
+ deferred_stable.clear();
+ }
+ if (!kv_finalize_in_progress) {
+ kv_finalize_in_progress = true;
+ kv_finalize_cond.notify_one();
+ }
+ }
+
+ if (new_nid_max) {
+ nid_max = new_nid_max;
+ dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+ }
+ if (new_blobid_max) {
+ blobid_max = new_blobid_max;
+ dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
+ }
+
+ {
+ auto finish = mono_clock::now();
+ ceph::timespan dur_flush = after_flush - start;
+ ceph::timespan dur_kv = finish - after_flush;
+ ceph::timespan dur = finish - start;
+ dout(20) << __func__ << " committed " << committing_size
+ << " cleaned " << deferred_size
+ << " in " << dur
+ << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
+ << dendl;
+ log_latency("kv_flush",
+ l_bluestore_kv_flush_lat,
+ dur_flush,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_commit",
+ l_bluestore_kv_commit_lat,
+ dur_kv,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("kv_sync",
+ l_bluestore_kv_sync_lat,
+ dur,
+ cct->_conf->bluestore_log_op_age);
+ }
+
+ l.lock();
+ // previously deferred "done" are now "stable" by virtue of this
+ // commit cycle.
+ deferred_stable_queue.swap(deferred_done);
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_sync_started = false;
+}
+
+void BlueStore::_kv_finalize_thread()
+{
+ deque<TransContext*> kv_committed;
+ deque<DeferredBatch*> deferred_stable;
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock l(kv_finalize_lock);
+ ceph_assert(!kv_finalize_started);
+ kv_finalize_started = true;
+ kv_finalize_cond.notify_all();
+ while (true) {
+ ceph_assert(kv_committed.empty());
+ ceph_assert(deferred_stable.empty());
+ if (kv_committing_to_finalize.empty() &&
+ deferred_stable_to_finalize.empty()) {
+ if (kv_finalize_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ kv_finalize_in_progress = false;
+ kv_finalize_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ kv_committed.swap(kv_committing_to_finalize);
+ deferred_stable.swap(deferred_stable_to_finalize);
+ l.unlock();
+ dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
+ dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+ auto start = mono_clock::now();
+
+ while (!kv_committed.empty()) {
+ TransContext *txc = kv_committed.front();
+ ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+ _txc_state_proc(txc);
+ kv_committed.pop_front();
+ }
+
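+      // deferred batches are stable on disk now; run each txc through the
+      // state machine (which may free it) and then drop the batch itself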
+ for (auto b : deferred_stable) {
+ auto p = b->txcs.begin();
+ while (p != b->txcs.end()) {
+ TransContext *txc = &*p;
+ p = b->txcs.erase(p); // unlink here because
+ _txc_state_proc(txc); // this may destroy txc
+ }
+ delete b;
+ }
+ deferred_stable.clear();
+
+ if (!deferred_aggressive) {
+ if (deferred_queue_size >= deferred_batch_ops.load() ||
+ throttle.should_submit_deferred()) {
+ deferred_try_submit();
+ }
+ }
+
+ // this is as good a place as any ...
+ _reap_collections();
+
+ logger->set(l_bluestore_fragmentation,
+ (uint64_t)(shared_alloc.a->get_fragmentation() * 1000));
+
+ log_latency("kv_final",
+ l_bluestore_kv_final_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ kv_finalize_started = false;
+}
+
+void BlueStore::_zoned_cleaner_start() {
+ dout(10) << __func__ << dendl;
+
+ zoned_cleaner_thread.create("bstore_zcleaner");
+}
+
+void BlueStore::_zoned_cleaner_stop() {
+ dout(10) << __func__ << dendl;
+ {
+ std::unique_lock l{zoned_cleaner_lock};
+ while (!zoned_cleaner_started) {
+ zoned_cleaner_cond.wait(l);
+ }
+ zoned_cleaner_stop = true;
+ zoned_cleaner_cond.notify_all();
+ }
+ zoned_cleaner_thread.join();
+ {
+ std::lock_guard l{zoned_cleaner_lock};
+ zoned_cleaner_stop = false;
+ }
+ dout(10) << __func__ << " done" << dendl;
+}
+
+void BlueStore::_zoned_cleaner_thread() {
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock l{zoned_cleaner_lock};
+ ceph_assert(!zoned_cleaner_started);
+ zoned_cleaner_started = true;
+ zoned_cleaner_cond.notify_all();
+ std::deque<uint64_t> zones_to_clean;
+ while (true) {
+ if (zoned_cleaner_queue.empty()) {
+ if (zoned_cleaner_stop) {
+ break;
+ }
+ dout(20) << __func__ << " sleep" << dendl;
+ zoned_cleaner_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ zones_to_clean.swap(zoned_cleaner_queue);
+ l.unlock();
+ while (!zones_to_clean.empty()) {
+ _zoned_clean_zone(zones_to_clean.front());
+ zones_to_clean.pop_front();
+ }
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+ zoned_cleaner_started = false;
+}
+
+void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
+ dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
+}
+
+bluestore_deferred_op_t *BlueStore::_get_deferred_op(
+ TransContext *txc, uint64_t len)
+{
+ if (!txc->deferred_txn) {
+ txc->deferred_txn = new bluestore_deferred_transaction_t;
+ }
+ txc->deferred_txn->ops.push_back(bluestore_deferred_op_t());
+ logger->inc(l_bluestore_write_deferred);
+ logger->inc(l_bluestore_write_deferred_bytes, len);
+ return &txc->deferred_txn->ops.back();
+}
+
+void BlueStore::_deferred_queue(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
+
+ DeferredBatch *tmp;
+ txc->osr->deferred_lock.lock();
+ {
+ if (!txc->osr->deferred_pending) {
+ tmp = new DeferredBatch(cct, txc->osr.get());
+ } else {
+ tmp = txc->osr->deferred_pending;
+ }
+ }
+
+ tmp->txcs.push_back(*txc);
+ bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
+ for (auto opi = wt.ops.begin(); opi != wt.ops.end(); ++opi) {
+ const auto& op = *opi;
+ ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
+ bufferlist::const_iterator p = op.data.begin();
+ for (auto e : op.extents) {
+ tmp->prepare_write(cct, wt.seq, e.offset, e.length, p);
+ }
+ }
+
+ {
+ ++deferred_queue_size;
+ txc->osr->deferred_pending = tmp;
+    // the condition "tmp->txcs.size() == 1" means deferred_pending was
+    // originally empty, so we should add this osr to deferred_queue.
+ if (!txc->osr->deferred_running && (tmp->txcs.size() == 1)) {
+ deferred_lock.lock();
+ deferred_queue.push_back(*txc->osr);
+ deferred_lock.unlock();
+ }
+
+ if (deferred_aggressive &&
+ !txc->osr->deferred_running) {
+ _deferred_submit_unlock(txc->osr.get());
+ } else {
+ txc->osr->deferred_lock.unlock();
+ }
+ }
+}
+
+void BlueStore::deferred_try_submit()
+{
+ dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
+ << deferred_queue_size << " txcs" << dendl;
+ vector<OpSequencerRef> osrs;
+
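+  // snapshot the set of osrs under deferred_lock, then try to submit each
+  // one outside of it so we don't hold deferred_lock across aio submission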
+ {
+ std::lock_guard l(deferred_lock);
+ osrs.reserve(deferred_queue.size());
+ for (auto& osr : deferred_queue) {
+ osrs.push_back(&osr);
+ }
+ }
+
+ for (auto& osr : osrs) {
+ osr->deferred_lock.lock();
+ if (osr->deferred_pending) {
+ if (!osr->deferred_running) {
+ _deferred_submit_unlock(osr.get());
+ } else {
+ osr->deferred_lock.unlock();
+ dout(20) << __func__ << " osr " << osr << " already has running"
+ << dendl;
+ }
+ } else {
+ osr->deferred_lock.unlock();
+ dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
+ }
+ }
+
+ {
+ std::lock_guard l(deferred_lock);
+ deferred_last_submitted = ceph_clock_now();
+ }
+}
+
+void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr
+ << " " << osr->deferred_pending->iomap.size() << " ios pending "
+ << dendl;
+ ceph_assert(osr->deferred_pending);
+ ceph_assert(!osr->deferred_running);
+
+ auto b = osr->deferred_pending;
+ deferred_queue_size -= b->seq_bytes.size();
+ ceph_assert(deferred_queue_size >= 0);
+
+ osr->deferred_running = osr->deferred_pending;
+ osr->deferred_pending = nullptr;
+
+ osr->deferred_lock.unlock();
+
+ for (auto& txc : b->txcs) {
+ throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
+ }
+ uint64_t start = 0, pos = 0;
+ bufferlist bl;
+ auto i = b->iomap.begin();
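+  // walk the offset-sorted iomap and coalesce physically contiguous chunks
+  // into single aio_write calls; whenever a gap (or the end of the map) is
+  // reached, flush the accumulated bufferlist and start a new run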
+ while (true) {
+ if (i == b->iomap.end() || i->first != pos) {
+ if (bl.length()) {
+ dout(20) << __func__ << " write 0x" << std::hex
+ << start << "~" << bl.length()
+ << " crc " << bl.crc32c(-1) << std::dec << dendl;
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ logger->inc(l_bluestore_deferred_write_ops);
+ logger->inc(l_bluestore_deferred_write_bytes, bl.length());
+ int r = bdev->aio_write(start, bl, &b->ioc, false);
+ ceph_assert(r == 0);
+ }
+ }
+ if (i == b->iomap.end()) {
+ break;
+ }
+ start = 0;
+ pos = i->first;
+ bl.clear();
+ }
+ dout(20) << __func__ << " seq " << i->second.seq << " 0x"
+ << std::hex << pos << "~" << i->second.bl.length() << std::dec
+ << dendl;
+ if (!bl.length()) {
+ start = pos;
+ }
+ pos += i->second.bl.length();
+ bl.claim_append(i->second.bl);
+ ++i;
+ }
+
+ bdev->aio_submit(&b->ioc);
+}
+
+struct C_DeferredTrySubmit : public Context {
+ BlueStore *store;
+ C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+ void finish(int r) {
+ store->deferred_try_submit();
+ }
+};
+
+void BlueStore::_deferred_aio_finish(OpSequencer *osr)
+{
+ dout(10) << __func__ << " osr " << osr << dendl;
+ ceph_assert(osr->deferred_running);
+ DeferredBatch *b = osr->deferred_running;
+
+ {
+ osr->deferred_lock.lock();
+ ceph_assert(osr->deferred_running == b);
+ osr->deferred_running = nullptr;
+ if (!osr->deferred_pending) {
+ dout(20) << __func__ << " dequeueing" << dendl;
+ {
+ deferred_lock.lock();
+ auto q = deferred_queue.iterator_to(*osr);
+ deferred_queue.erase(q);
+ deferred_lock.unlock();
+ }
+ osr->deferred_lock.unlock();
+ } else {
+ osr->deferred_lock.unlock();
+ if (deferred_aggressive) {
+ dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
+ finisher.queue(new C_DeferredTrySubmit(this));
+ } else {
+ dout(20) << __func__ << " leaving queued, more pending" << dendl;
+ }
+ }
+ }
+
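+  // all aios in this batch are done; move its txcs to DEFERRED_CLEANUP and
+  // return their cost to the deferred throttle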
+ {
+ uint64_t costs = 0;
+ {
+ for (auto& i : b->txcs) {
+ TransContext *txc = &i;
+ throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_aio_wait_lat);
+ txc->set_state(TransContext::STATE_DEFERRED_CLEANUP);
+ costs += txc->cost;
+ }
+ }
+ throttle.release_deferred_throttle(costs);
+ }
+
+ {
+ std::lock_guard l(kv_lock);
+ deferred_done_queue.emplace_back(b);
+
+ // in the normal case, do not bother waking up the kv thread; it will
+ // catch us on the next commit anyway.
+ if (deferred_aggressive && !kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+}
+
+int BlueStore::_deferred_replay()
+{
+ dout(10) << __func__ << " start" << dendl;
+ int count = 0;
+ int r = 0;
+ interval_set<uint64_t> bluefs_extents;
+ if (bluefs) {
+ bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents);
+ }
+ CollectionRef ch = _get_collection(coll_t::meta());
+ bool fake_ch = false;
+ if (!ch) {
+ // hmm, replaying initial mkfs?
+ ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
+ fake_ch = true;
+ }
+ OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
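+  // walk every key left under PREFIX_DEFERRED and re-queue the writes it
+  // describes; extents that have since been handed to bluefs are dropped
+  // by _eliminate_outdated_deferred() below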
+ for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
+ dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
+ << dendl;
+ bluestore_deferred_transaction_t *deferred_txn =
+ new bluestore_deferred_transaction_t;
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(*deferred_txn, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode deferred txn "
+ << pretty_binary_string(it->key()) << dendl;
+ delete deferred_txn;
+ r = -EIO;
+ goto out;
+ }
+ bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
+ if (has_some) {
+ TransContext *txc = _txc_create(ch.get(), osr, nullptr);
+ txc->deferred_txn = deferred_txn;
+ txc->set_state(TransContext::STATE_KV_DONE);
+ _txc_state_proc(txc);
+ } else {
+ delete deferred_txn;
+ }
+ }
+ out:
+ dout(20) << __func__ << " draining osr" << dendl;
+ _osr_register_zombie(osr);
+ _osr_drain_all();
+ if (fake_ch) {
+ new_coll_map.clear();
+ }
+ dout(10) << __func__ << " completed " << count << " events" << dendl;
+ return r;
+}
+
+bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
+ interval_set<uint64_t>& bluefs_extents)
+{
+ bool has_some = false;
+ dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
+ auto it = deferred_txn->ops.begin();
+ while (it != deferred_txn->ops.end()) {
+ // We process a pair of _data_/_extents_ (here: it->data/it->extents)
+    // by eliminating the _extents_ that belong to bluefs and removing the corresponding parts of _data_
+ // example:
+ // +------------+---------------+---------------+---------------+
+ // | data | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
+ // | extent | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
+ // | in bluefs? | no | yes | no |
+ // +------------+---------------+---------------+---------------+
+ // result:
+ // +------------+---------------+---------------+
+ // | data | aaaaaaaabbbbb | ddddeeeeeefff |
+ // | extent | 40000 - 44000 | 58000 - 60000 |
+ // +------------+---------------+---------------+
+ PExtentVector new_extents;
+ ceph::buffer::list new_data;
+ uint32_t data_offset = 0; // this tracks location of extent 'e' inside it->data
+ dout(30) << __func__ << " input extents: " << it->extents << dendl;
+ for (auto& e: it->extents) {
+ interval_set<uint64_t> region;
+ region.insert(e.offset, e.length);
+
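+      // position 'mi' at the first bluefs extent that could overlap 'e';
+      // step back one entry in case the preceding interval spans e.offset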
+ auto mi = bluefs_extents.lower_bound(e.offset);
+ if (mi != bluefs_extents.begin()) {
+ --mi;
+ if (mi.get_end() <= e.offset) {
+ ++mi;
+ }
+ }
+ while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) {
+        // The interval_set does not like it (asserts) when we erase an interval
+        // that does not exist, hence we implement (region - mi) as ((region + mi) - mi).
+ region.union_insert(mi.get_start(), mi.get_len());
+ region.erase(mi.get_start(), mi.get_len());
+ ++mi;
+ }
+      // 'region' is now a subset of 'e', without the parts used by bluefs;
+      // we trim the corresponding parts from it->data (actually constructing new_data / new_extents)
+ for (auto ki = region.begin(); ki != region.end(); ki++) {
+ ceph::buffer::list chunk;
+        // A chunk from it->data; data_offset is the offset at which 'e' is located;
+        // 'ki.get_start() - e.offset' is the offset of ki inside 'e'.
+ chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len());
+ new_data.claim_append(chunk);
+ new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len()));
+ }
+ data_offset += e.length;
+ }
+ dout(30) << __func__ << " output extents: " << new_extents << dendl;
+ if (it->data.length() != new_data.length()) {
+ dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl;
+ }
+ if (new_extents.size() == 0) {
+ it = deferred_txn->ops.erase(it);
+ } else {
+ has_some = true;
+ std::swap(it->extents, new_extents);
+ std::swap(it->data, new_data);
+ ++it;
+ }
+ }
+ return has_some;
+}
+
+// ---------------------------
+// transactions
+
+int BlueStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ FUNCTRACE(cct);
+ list<Context *> on_applied, on_commit, on_applied_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &on_applied, &on_commit, &on_applied_sync);
+
+ auto start = mono_clock::now();
+
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
+
+ // prepare
+ TransContext *txc = _txc_create(static_cast<Collection*>(ch.get()), osr,
+ &on_commit, op);
+
+ // With HM-SMR drives (and ZNS SSDs) we want the I/O allocation and I/O
+ // submission to happen atomically because if I/O submission happens in a
+ // different order than I/O allocation, we end up issuing non-sequential
+ // writes to the drive. This is a temporary solution until ZONE APPEND
+ // support matures in the kernel. For more information please see:
+ // https://www.usenix.org/conference/vault20/presentation/bjorling
+ if (bdev->is_smr()) {
+ atomic_alloc_and_submit_lock.lock();
+ }
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ txc->bytes += (*p).get_num_bytes();
+ _txc_add_transaction(txc, &(*p));
+ }
+ _txc_calc_cost(txc);
+
+ _txc_write_nodes(txc, txc->t);
+
+ // journal deferred items
+ if (txc->deferred_txn) {
+ txc->deferred_txn->seq = ++deferred_seq;
+ bufferlist bl;
+ encode(*txc->deferred_txn, bl);
+ string key;
+ get_deferred_key(txc->deferred_txn->seq, &key);
+ txc->t->set(PREFIX_DEFERRED, key, bl);
+ }
+
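+  // record this txc's allocation and release of extents (freelist updates)
+  // in the same kv transaction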
+ _txc_finalize_kv(txc, txc->t);
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("txc encode finished");
+ }
+#endif
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ auto tstart = mono_clock::now();
+
+ if (!throttle.try_start_transaction(
+ *db,
+ *txc,
+ tstart)) {
+ // ensure we do not block here because of deferred writes
+ dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
+ << dendl;
+ ++deferred_aggressive;
+ deferred_try_submit();
+ {
+ // wake up any previously finished deferred events
+ std::lock_guard l(kv_lock);
+ if (!kv_sync_in_progress) {
+ kv_sync_in_progress = true;
+ kv_cond.notify_one();
+ }
+ }
+ throttle.finish_start_transaction(*db, *txc, tstart);
+ --deferred_aggressive;
+ }
+ auto tend = mono_clock::now();
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ logger->inc(l_bluestore_txc);
+
+ // execute (start)
+ _txc_state_proc(txc);
+
+ if (bdev->is_smr()) {
+ atomic_alloc_and_submit_lock.unlock();
+ }
+
+ // we're immediately readable (unlike FileStore)
+ for (auto c : on_applied_sync) {
+ c->complete(0);
+ }
+ if (!on_applied.empty()) {
+ if (c->commit_queue) {
+ c->commit_queue->queue(on_applied);
+ } else {
+ finisher.queue(on_applied);
+ }
+ }
+
+#ifdef WITH_BLKIN
+ if (txc->trace) {
+ txc->trace.event("txc applied");
+ }
+#endif
+
+ log_latency("submit_transact",
+ l_bluestore_submit_lat,
+ mono_clock::now() - start,
+ cct->_conf->bluestore_log_op_age);
+ log_latency("throttle_transact",
+ l_bluestore_throttle_lat,
+ tend - tstart,
+ cct->_conf->bluestore_log_op_age);
+ return 0;
+}
+
+void BlueStore::_txc_aio_submit(TransContext *txc)
+{
+ dout(10) << __func__ << " txc " << txc << dendl;
+ bdev->aio_submit(&txc->ioc);
+}
+
+void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+ Transaction::iterator i = t->begin();
+
+ _dump_transaction<30>(cct, t);
+
+ vector<CollectionRef> cvec(i.colls.size());
+ unsigned j = 0;
+ for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+ ++p, ++j) {
+ cvec[j] = _get_collection(*p);
+ }
+
+ vector<OnodeRef> ovec(i.objects.size());
+
+ for (int pos = 0; i.have_op(); ++pos) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ // no coll or obj
+ if (op->op == Transaction::OP_NOP)
+ continue;
+
+
+ // collection operations
+ CollectionRef &c = cvec[op->cid];
+
+ // initialize osd_pool_id and do a smoke test that all collections belong
+ // to the same pool
+ spg_t pgid;
+    if (c && c->cid.is_pg(&pgid)) {
+ ceph_assert(txc->osd_pool_id == META_POOL_ID ||
+ txc->osd_pool_id == pgid.pool());
+ txc->osd_pool_id = pgid.pool();
+ }
+
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _remove_collection(txc, cid, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ ceph_assert(!c);
+ const coll_t &cid = i.get_cid(op->cid);
+ r = _create_collection(txc, cid, op->split_bits, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ uint32_t bits = op->split_bits;
+ r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ uint32_t type = op->hint;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ dout(10) << __func__ << " collection hint objects is a no-op, "
+ << " pg_num " << pg_num << " num_objects " << num_objs
+ << dendl;
+ } else {
+ // Ignore the hint
+ dout(10) << __func__ << " unknown collection hint " << type << dendl;
+ }
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ ceph_abort_msg("not implemented");
+ break;
+ }
+ if (r < 0) {
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+
+    // these operations implicitly create the object
+ bool create = false;
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_CREATE ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+
+ // object operations
+ std::unique_lock l(c->lock);
+ OnodeRef &o = ovec[op->oid];
+ if (!o) {
+ ghobject_t oid = i.get_oid(op->oid);
+ o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
+ }
+ if (!create && (!o || !o->exists)) {
+ dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+ << i.get_oid(op->oid) << dendl;
+ r = -ENOENT;
+ goto endop;
+ }
+
+ switch (op->op) {
+ case Transaction::OP_CREATE:
+ case Transaction::OP_TOUCH:
+ r = _touch(txc, c, o);
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(txc, c, o, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(txc, c, o, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ r = _truncate(txc, c, o, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ r = _remove(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ string name = i.decode_string();
+ bufferptr bp;
+ i.decode_bp(bp);
+ r = _setattr(txc, c, o, name, bp);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(txc, c, o, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ string name = i.decode_string();
+ r = _rmattr(txc, c, o, name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ r = _rmattrs(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ r = _clone(txc, c, o, no);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ no = c->get_onode(noid, true);
+ }
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ case Transaction::OP_TRY_RENAME:
+ {
+ ceph_assert(op->cid == op->dest_cid);
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef& no = ovec[op->dest_oid];
+ if (!no) {
+ no = c->get_onode(noid, false);
+ }
+ r = _rename(txc, c, o, no, noid);
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ r = _omap_clear(txc, c, o);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(txc, c, o, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(txc, c, o, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkey_range(txc, c, o, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(txc, c, o, bl);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = _set_alloc_hint(txc, c, o,
+ op->expected_object_size,
+ op->expected_write_size,
+ op->hint);
+ }
+ break;
+
+ default:
+ derr << __func__ << " bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ endop:
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from bluestore, misconfigured cluster";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ derr << __func__ << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)"
+ << dendl;
+ derr << msg << dendl;
+ _dump_transaction<0>(cct, t);
+ ceph_abort_msg("unexpected error");
+ }
+ }
+ }
+}
+
+
+
+// -----------------
+// write operations
+
+int BlueStore::_touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ _assign_nid(txc, o);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_pad_zeros(
+ bufferlist *bl, uint64_t *offset,
+ uint64_t chunk_size)
+{
+ auto length = bl->length();
+ dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
+ << " chunk_size 0x" << chunk_size << std::dec << dendl;
+ dout(40) << "before:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ // front
+ size_t front_pad = *offset % chunk_size;
+ size_t back_pad = 0;
+ size_t pad_count = 0;
+ if (front_pad) {
+ size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
+ bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
+ z.zero(0, front_pad, false);
+ pad_count += front_pad;
+ bl->begin().copy(front_copy, z.c_str() + front_pad);
+ if (front_copy + front_pad < chunk_size) {
+ back_pad = chunk_size - (length + front_pad);
+ z.zero(front_pad + length, back_pad, false);
+ pad_count += back_pad;
+ }
+ bufferlist old, t;
+ old.swap(*bl);
+ t.substr_of(old, front_copy, length - front_copy);
+ bl->append(z);
+ bl->claim_append(t);
+ *offset -= front_pad;
+ length += pad_count;
+ }
+
+ // back
+ uint64_t end = *offset + length;
+ unsigned back_copy = end % chunk_size;
+ if (back_copy) {
+ ceph_assert(back_pad == 0);
+ back_pad = chunk_size - back_copy;
+ ceph_assert(back_copy <= length);
+ bufferptr tail(chunk_size);
+ bl->begin(length - back_copy).copy(back_copy, tail.c_str());
+ tail.zero(back_copy, back_pad, false);
+ bufferlist old;
+ old.swap(*bl);
+ bl->substr_of(old, 0, length - back_copy);
+ bl->append(tail);
+ length += back_pad;
+ pad_count += back_pad;
+ }
+ dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
+ << back_pad << " on front/back, now 0x" << *offset << "~"
+ << length << std::dec << dendl;
+ dout(40) << "after:\n";
+ bl->hexdump(*_dout);
+ *_dout << dendl;
+ if (pad_count)
+ logger->inc(l_bluestore_write_pad_bytes, pad_count);
+ ceph_assert(bl->length() == length);
+}
+
+void BlueStore::_do_write_small(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ ceph_assert(length < min_alloc_size);
+
+ uint64_t end_offs = offset + length;
+
+ logger->inc(l_bluestore_write_small);
+ logger->inc(l_bluestore_write_small_bytes, length);
+
+ bufferlist bl;
+ blp.copy(length, bl);
+
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+ uint32_t alloc_len = min_alloc_size;
+ auto offset0 = p2align<uint64_t>(offset, alloc_len);
+
+ bool any_change;
+
+  // search for a suitable extent in both the forward and reverse directions
+  // within the [offset - target_max_blob_size, offset + target_max_blob_size]
+  // range, then check whether the blob can be reused via can_reuse_blob(), or
+  // apply a direct/deferred write (the latter only for extents that include
+  // 'offset' or lie above it).
+ o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
+
+ // On zoned devices, the first goal is to support non-overwrite workloads,
+ // such as RGW, with large, aligned objects. Therefore, for user writes
+ // _do_write_small should not trigger. OSDs, however, write and update a tiny
+ // amount of metadata, such as OSD maps, to disk. For those cases, we
+ // temporarily just pad them to min_alloc_size and write them to a new place
+ // on every update.
+ if (bdev->is_smr()) {
+ BlobRef b = c->new_blob();
+ uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, min_alloc_size);
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
+ return;
+ }
+
+ // Look for an existing mutable blob we can use.
+ auto begin = o->extent_map.extent_map.begin();
+ auto end = o->extent_map.extent_map.end();
+ auto ep = o->extent_map.seek_lextent(offset);
+ if (ep != begin) {
+ --ep;
+ if (ep->blob_end() <= offset) {
+ ++ep;
+ }
+ }
+ auto prev_ep = end;
+ if (ep != begin) {
+ prev_ep = ep;
+ --prev_ep;
+ }
+
+ boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
+  // We don't want to inspect more blobs than the number of min alloc units
+  // that fit into 2 max-size blobs
+ size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
+ bool above_blob_threshold = false;
+
+ inspected_blobs.reserve(blob_threshold);
+
+ uint64_t max_off = 0;
+ auto start_ep = ep;
+ auto end_ep = ep; // exclusively
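+  // scan forward from 'ep' and backward from 'prev_ep' until neither side
+  // has another candidate extent inside the search window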
+ do {
+ any_change = false;
+
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ BlobRef b = ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ max_off = ep->logical_end();
+ auto bstart = ep->blob_start();
+
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (bstart >= end_offs) {
+ dout(20) << __func__ << " ignoring distant " << *b << dendl;
+ } else if (!b->get_blob().is_mutable()) {
+ dout(20) << __func__ << " ignoring immutable " << *b << dendl;
+ } else if (ep->logical_offset % min_alloc_size !=
+ ep->blob_offset % min_alloc_size) {
+ dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
+ } else {
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ // can we pad our head/tail out with zeros?
+ uint64_t head_pad, tail_pad;
+ head_pad = p2phase(offset, chunk_size);
+ tail_pad = p2nphase(end_offs, chunk_size);
+ if (head_pad || tail_pad) {
+ o->extent_map.fault_range(db, offset - head_pad,
+ end_offs - offset + head_pad + tail_pad);
+ }
+ if (head_pad &&
+ o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
+ head_pad = 0;
+ }
+ if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
+ tail_pad = 0;
+ }
+
+ uint64_t b_off = offset - head_pad - bstart;
+ uint64_t b_len = length + head_pad + tail_pad;
+
+ // direct write into unused blocks of an existing mutable blob?
+ if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
+ b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b->get_blob().is_unused(b_off, b_len) &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " write to unused 0x" << std::hex
+ << b_off << "~" << b_len
+ << " pad 0x" << head_pad << " + 0x" << tail_pad
+ << std::dec << " of mutable " << *b << dendl;
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ if (b_len < prefer_deferred_size) {
+ dout(20) << __func__ << " deferring small 0x" << std::hex
+ << b_len << std::dec << " unused write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ op->data = bl;
+ } else {
+ b->get_blob().map_bl(
+ b_off, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t,
+ &txc->ioc, wctx->buffered);
+ });
+ }
+ }
+ b->dirty_blob().calc_csum(b_off, bl);
+ dout(20) << __func__ << " lex old " << *ep << dendl;
+ Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,
+ b,
+ &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ // read some data to fill out the chunk?
+ uint64_t head_read = p2phase(b_off, chunk_size);
+ uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
+ if ((head_read || tail_read) &&
+ (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
+ head_read + tail_read < min_alloc_size) {
+ b_off -= head_read;
+ b_len += head_read + tail_read;
+
+ } else {
+ head_read = tail_read = 0;
+ }
+
+ // chunk-aligned deferred overwrite?
+ if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
+ b_off % chunk_size == 0 &&
+ b_len % chunk_size == 0 &&
+ b->get_blob().is_allocated(b_off, b_len)) {
+
+ _apply_padding(head_pad, tail_pad, bl);
+
+ dout(20) << __func__ << " reading head 0x" << std::hex << head_read
+ << " and tail 0x" << tail_read << std::dec << dendl;
+ if (head_read) {
+ bufferlist head_bl;
+ int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
+ head_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)head_read);
+ size_t zlen = head_read - r;
+ if (zlen) {
+ head_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ head_bl.claim_append(bl);
+ bl.swap(head_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ if (tail_read) {
+ bufferlist tail_bl;
+ int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
+ tail_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)tail_read);
+ size_t zlen = tail_read - r;
+ if (zlen) {
+ tail_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ bl.claim_append(tail_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ logger->inc(l_bluestore_write_small_pre_read);
+
+ _buffer_cache_write(txc, b, b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ b->dirty_blob().calc_csum(b_off, bl);
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = b->get_blob().map(
+ b_off, b_len,
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data = std::move(bl);
+ dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~"
+ << b_len << std::dec << " of mutable " << *b
+ << " at " << op->extents << dendl;
+ }
+
+ Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
+ b, &wctx->old_extents);
+ b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ return;
+ }
+ // try to reuse blob if we can
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+        ceph_assert(alloc_len == min_alloc_size); // expecting data to always
+                                                  // fit into the reused blob
+        // Need to check for pending writes desiring to reuse the same pextent.
+        // The rationale is that during GC two chunks from garbage blobs
+        // (compressed?) can share logical space within the same AU. That, in
+        // turn, might be caused by an unaligned len in clone_range2. Hence the
+        // second write would fail in an attempt to reuse the blob at
+        // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+        // we can't reuse pad_head/pad_tail since they might be truncated
+        // due to existing extents
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ }
+ }
+ ++ep;
+ end_ep = ep;
+ any_change = true;
+ } // if (ep != end && ep->logical_offset < offset + max_bsize)
+
+ // check extent for reuse in reverse order
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ BlobRef b = prev_ep->blob;
+ if (!above_blob_threshold) {
+ inspected_blobs.insert(&b->get_blob());
+ above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+ }
+ start_ep = prev_ep;
+ auto bstart = prev_ep->blob_start();
+ dout(20) << __func__ << " considering " << *b
+ << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+ if (b->can_reuse_blob(min_alloc_size,
+ max_bsize,
+ offset0 - bstart,
+ &alloc_len)) {
+      ceph_assert(alloc_len == min_alloc_size); // expecting data to always
+                                                // fit into the reused blob
+      // Need to check for pending writes desiring to reuse the same pextent.
+      // The rationale is that during GC two chunks from garbage blobs
+      // (compressed?) can share logical space within the same AU. That, in
+      // turn, might be caused by an unaligned len in clone_range2. Hence the
+      // second write would fail in an attempt to reuse the blob at
+      // do_alloc_write().
+ if (!wctx->has_conflict(b,
+ offset0,
+ offset0 + alloc_len,
+ min_alloc_size)) {
+
+ uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+ uint64_t b_off = offset - bstart;
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, chunk_size);
+
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off0 << "~" << bl.length() << ")"
+ << " (0x" << b_off << "~" << length << ")"
+ << std::dec << dendl;
+
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+ false, false);
+ logger->inc(l_bluestore_write_small_unused);
+ return;
+ }
+ }
+ if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
+ } while (any_change);
+
+ if (above_blob_threshold) {
+ dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
+ << " " << std::hex << min_off << "~" << max_off << std::dec
+ << dendl;
+ ceph_assert(start_ep != end_ep);
+ for (auto ep = start_ep; ep != end_ep; ++ep) {
+ dout(20) << __func__ << " inserting for GC "
+ << std::hex << ep->logical_offset << "~" << ep->length
+ << std::dec << dendl;
+
+ wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
+ }
+ // insert newly written extent to GC
+ wctx->extents_to_gc.union_insert(offset, length);
+ dout(20) << __func__ << " inserting (last) for GC "
+ << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+ // new blob.
+ BlobRef b = c->new_blob();
+ uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
+ uint64_t b_off0 = b_off;
+ _pad_zeros(&bl, &b_off0, block_size);
+ o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+ wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+              min_alloc_size != block_size, // use the 'unused' bitmap only when
+                                            // the alloc granularity doesn't match
+                                            // the disk block size
+ true);
+
+ return;
+}
+
+bool BlueStore::BigDeferredWriteContext::can_defer(
+ BlueStore::extent_map_t::iterator ep,
+ uint64_t prefer_deferred_size,
+ uint64_t block_size,
+ uint64_t offset,
+ uint64_t l)
+{
+ bool res = false;
+ auto& blob = ep->blob->get_blob();
+ if (offset >= ep->blob_start() &&
+ blob.is_mutable()) {
+ off = offset;
+ b_off = offset - ep->blob_start();
+ uint64_t chunk_size = blob.get_chunk_size(block_size);
+ uint64_t ondisk = blob.get_ondisk_length();
+ used = std::min(l, ondisk - b_off);
+
+ // will read some data to fill out the chunk?
+ head_read = p2phase<uint64_t>(b_off, chunk_size);
+ tail_read = p2nphase<uint64_t>(b_off + used, chunk_size);
+ b_off -= head_read;
+
+ ceph_assert(b_off % chunk_size == 0);
+ ceph_assert(blob_aligned_len() % chunk_size == 0);
+
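+    // defer only if the chunk-aligned span is below the deferred threshold,
+    // fits within the blob's on-disk length, and is already allocated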
+ res = blob_aligned_len() < prefer_deferred_size &&
+ blob_aligned_len() <= ondisk &&
+ blob.is_allocated(b_off, blob_aligned_len());
+ if (res) {
+ blob_ref = ep->blob;
+ blob_start = ep->blob_start();
+ }
+ }
+ return res;
+}
+
+bool BlueStore::BigDeferredWriteContext::apply_defer()
+{
+ int r = blob_ref->get_blob().map(
+ b_off, blob_aligned_len(),
+ [&](const bluestore_pextent_t& pext,
+ uint64_t offset,
+ uint64_t length) {
+      // apply deferred only if the overwrite breaks blob continuity;
+      // if it totally overlaps some pextent, fall back to a regular write
+ if (pext.offset < offset ||
+ pext.end() > offset + length) {
+ res_extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ }
+ return -1;
+ });
+ return r >= 0;
+}
+
+void BlueStore::_do_write_big_apply_deferred(
+ TransContext* txc,
+ CollectionRef& c,
+ OnodeRef o,
+ BlueStore::BigDeferredWriteContext& dctx,
+ bufferlist::iterator& blp,
+ WriteContext* wctx)
+{
+ bufferlist bl;
+ dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
+ << " and tail 0x" << dctx.tail_read << std::dec << dendl;
+ if (dctx.head_read) {
+ int r = _do_read(c.get(), o,
+ dctx.off - dctx.head_read,
+ dctx.head_read,
+ bl,
+ 0);
+ ceph_assert(r >= 0 && r <= (int)dctx.head_read);
+ size_t zlen = dctx.head_read - r;
+ if (zlen) {
+ bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ blp.copy(dctx.used, bl);
+
+ if (dctx.tail_read) {
+ bufferlist tail_bl;
+ int r = _do_read(c.get(), o,
+ dctx.off + dctx.used, dctx.tail_read,
+ tail_bl, 0);
+ ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
+ size_t zlen = dctx.tail_read - r;
+ if (zlen) {
+ tail_bl.append_zero(zlen);
+ logger->inc(l_bluestore_write_pad_bytes, zlen);
+ }
+ bl.claim_append(tail_bl);
+ logger->inc(l_bluestore_write_penalty_read_ops);
+ }
+ auto& b0 = dctx.blob_ref;
+ _buffer_cache_write(txc, b0, dctx.b_off, bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ b0->dirty_blob().calc_csum(dctx.b_off, bl);
+
+ Extent* le = o->extent_map.set_lextent(c, dctx.off,
+ dctx.off - dctx.blob_start, dctx.used, b0, &wctx->old_extents);
+
+  // in fact this is a no-op for big writes, but it is left here to maintain
+  // uniformity and avoid being missed after some future refactor.
+ b0->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
+ bluestore_deferred_op_t* op = _get_deferred_op(txc, bl.length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ op->extents.swap(dctx.res_extents);
+ op->data = std::move(bl);
+ }
+}
+
+void BlueStore::_do_write_big(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist::iterator& blp,
+ WriteContext *wctx)
+{
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << " target_blob_size 0x" << wctx->target_blob_size << std::dec
+ << " compress " << (int)wctx->compress
+ << dendl;
+ logger->inc(l_bluestore_write_big);
+ logger->inc(l_bluestore_write_big_bytes, length);
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();
+ while (length > 0) {
+ bool new_blob = false;
+ BlobRef b;
+ uint32_t b_off = 0;
+ uint32_t l = 0;
+
+ //attempting to reuse existing blob
+ if (!wctx->compress) {
+ // enforce target blob alignment with max_bsize
+ l = max_bsize - p2phase(offset, max_bsize);
+ l = std::min(uint64_t(l), length);
+
+ auto end = o->extent_map.extent_map.end();
+
+ dout(20) << __func__ << " may be defer: 0x" << std::hex
+ << offset << "~" << l
+ << std::dec << dendl;
+
+ if (prefer_deferred_size_snapshot &&
+ l <= prefer_deferred_size_snapshot * 2) {
+        // A single write that spans two adjacent existing blobs can result
+        // in up to two deferred blocks of 'prefer_deferred_size'.
+        // So we try to minimize the number of resulting blobs and preserve
+        // the 2 existing blobs rather than inserting one more in between.
+        // E.g. for a write of 0x10000~20000 over existing blobs
+        // (0x0~20000 and 0x20000~20000) it is better (from a subsequent read
+        // performance point of view) to issue two deferred writes into the
+        // existing blobs than to end up with 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
+
+ // look for an existing mutable blob we can write into
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto ep_next = end;
+ BigDeferredWriteContext head_info, tail_info;
+
+ bool will_defer = ep != end ?
+ head_info.can_defer(ep,
+ prefer_deferred_size_snapshot,
+ block_size,
+ offset,
+ l) :
+ false;
+ auto offset_next = offset + head_info.used;
+ auto remaining = l - head_info.used;
+ if (will_defer && remaining) {
+ will_defer = false;
+ if (remaining <= prefer_deferred_size_snapshot) {
+ ep_next = o->extent_map.seek_lextent(offset_next);
+ // check if we can defer remaining totally
+ will_defer = ep_next == end ?
+ false :
+ tail_info.can_defer(ep_next,
+ prefer_deferred_size_snapshot,
+ block_size,
+ offset_next,
+ remaining);
+ will_defer = will_defer && remaining == tail_info.used;
+ }
+ }
+ if (will_defer) {
+ dout(20) << __func__ << " " << *(head_info.blob_ref)
+ << " deferring big " << std::hex
+ << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
+ << std::dec << " write via deferred"
+ << dendl;
+ if (remaining) {
+ dout(20) << __func__ << " " << *(tail_info.blob_ref)
+ << " deferring big " << std::hex
+ << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
+ << std::dec << " write via deferred"
+ << dendl;
+ }
+
+ will_defer = head_info.apply_defer();
+ if (!will_defer) {
+ dout(20) << __func__
+ << " deferring big fell back, head isn't continuous"
+ << dendl;
+ } else if (remaining) {
+ will_defer = tail_info.apply_defer();
+ if (!will_defer) {
+ dout(20) << __func__
+ << " deferring big fell back, tail isn't continuous"
+ << dendl;
+ }
+ }
+ }
+ if (will_defer) {
+ _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
+ if (remaining) {
+ _do_write_big_apply_deferred(txc, c, o, tail_info,
+ blp, wctx);
+ }
+ dout(20) << __func__ << " defer big: 0x" << std::hex
+ << offset << "~" << l
+ << std::dec << dendl;
+ offset += l;
+ length -= l;
+ logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
+ logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
+ continue;
+ }
+ }
+      dout(20) << __func__ << " looking for blocks to reuse..." << dendl;
+
+ o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+
+ // seek again as punch_hole could invalidate ep
+ auto ep = o->extent_map.seek_lextent(offset);
+ auto begin = o->extent_map.extent_map.begin();
+ auto prev_ep = end;
+ if (ep != begin) {
+ prev_ep = ep;
+ --prev_ep;
+ }
+
+ auto min_off = offset >= max_bsize ? offset - max_bsize : 0;
+      // search for a suitable extent in both the forward and reverse directions
+      // within the [offset - target_max_blob_size, offset + target_max_blob_size]
+      // range, then check whether the blob can be reused via can_reuse_blob().
+ bool any_change;
+ do {
+ any_change = false;
+ if (ep != end && ep->logical_offset < offset + max_bsize) {
+ dout(20) << __func__ << " considering " << *ep
+ << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
+
+ if (offset >= ep->blob_start() &&
+ ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - ep->blob_start(),
+ &l)) {
+ b = ep->blob;
+ b_off = offset - ep->blob_start();
+ prev_ep = end; // to avoid check below
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else {
+ ++ep;
+ any_change = true;
+ }
+ }
+
+ if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+ dout(20) << __func__ << " considering rev " << *prev_ep
+ << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
+ if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+ offset - prev_ep->blob_start(),
+ &l)) {
+ b = prev_ep->blob;
+ b_off = offset - prev_ep->blob_start();
+ dout(20) << __func__ << " reuse blob " << *b << std::hex
+ << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+ } else if (prev_ep != begin) {
+ --prev_ep;
+ any_change = true;
+ } else {
+ prev_ep = end; // to avoid useless first extent re-check
+ }
+ }
+ } while (b == nullptr && any_change);
+ } else {
+      // try to utilize as long a chunk as permitted in the compression case.
+ l = std::min(max_bsize, length);
+ o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+ } // if (!wctx->compress)
+
+ if (b == nullptr) {
+ b = c->new_blob();
+ b_off = 0;
+ new_blob = true;
+ }
+ bufferlist t;
+ blp.copy(l, t);
+ wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
+ dout(20) << __func__ << " schedule write big: 0x"
+ << std::hex << offset << "~" << l << std::dec
+ << (new_blob ? " new " : " reuse ")
+ << *b << dendl;
+ offset += l;
+ length -= l;
+ logger->inc(l_bluestore_write_big_blobs);
+ }
+}
+
+int BlueStore::_do_alloc_write(
+ TransContext *txc,
+ CollectionRef coll,
+ OnodeRef o,
+ WriteContext *wctx)
+{
+ dout(20) << __func__ << " txc " << txc
+ << " " << wctx->writes.size() << " blobs"
+ << dendl;
+ if (wctx->writes.empty()) {
+ return 0;
+ }
+
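+  // per-pool options, when set, override the global compression algorithm,
+  // the required compression ratio, and the checksum type selected below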
+ CompressorRef c;
+ double crr = 0;
+ if (wctx->compress) {
+ c = select_option(
+ "compression_algorithm",
+ compressor,
+ [&]() {
+ string val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
+ CompressorRef cp = compressor;
+ if (!cp || cp->get_type_name() != val) {
+ cp = Compressor::create(cct, val);
+ if (!cp) {
+ if (_set_compression_alert(false, val.c_str())) {
+ derr << __func__ << " unable to initialize " << val.c_str()
+ << " compressor" << dendl;
+ }
+ }
+ }
+ return boost::optional<CompressorRef>(cp);
+ }
+ return boost::optional<CompressorRef>();
+ }
+ );
+
+ crr = select_option(
+ "compression_required_ratio",
+ cct->_conf->bluestore_compression_required_ratio,
+ [&]() {
+ double val;
+ if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+ return boost::optional<double>(val);
+ }
+ return boost::optional<double>();
+ }
+ );
+ }
+
+ // checksum
+ int64_t csum = csum_type.load();
+ csum = select_option(
+ "csum_type",
+ csum,
+ [&]() {
+ int64_t val;
+ if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+ return boost::optional<int64_t>(val);
+ }
+ return boost::optional<int64_t>();
+ }
+ );
+
+ // compress (as needed) and calc needed space
+ uint64_t need = 0;
+ uint64_t data_size = 0;
+  // 'need' is the amount of space that must be provided by the allocator.
+  // 'data_size' is the size of the data that will be transferred to disk.
+  // Note that data_size is always <= need. The difference comes from:
+  // - the write to the blob being unaligned while there is free space
+  // - the data having been compressed
+  //
+  // We make one decision and apply it to all blobs:
+  // either all blobs will be deferred or none will.
+  // We assume that the allocator does its best to provide contiguous space,
+  // and the condition is: (data_size < deferred).
+
+ auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+ for (auto& wi : wctx->writes) {
+ if (c && wi.blob_length > min_alloc_size) {
+ auto start = mono_clock::now();
+
+ // compress
+ ceph_assert(wi.b_off == 0);
+ ceph_assert(wi.blob_length == wi.bl.length());
+
+ // FIXME: memory alignment here is bad
+ bufferlist t;
+ boost::optional<int32_t> compressor_message;
+ int r = c->compress(wi.bl, t, compressor_message);
+ uint64_t want_len_raw = wi.blob_length * crr;
+ uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
+ bool rejected = false;
+ uint64_t compressed_len = t.length();
+ // do an approximate (fast) estimation for resulting blob size
+ // that doesn't take header overhead into account
+ uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
+ if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
+ bluestore_compression_header_t chdr;
+ chdr.type = c->get_type();
+ chdr.length = t.length();
+ chdr.compressor_message = compressor_message;
+ encode(chdr, wi.compressed_bl);
+ wi.compressed_bl.claim_append(t);
+
+ compressed_len = wi.compressed_bl.length();
+ result_len = p2roundup(compressed_len, min_alloc_size);
+ if (result_len <= want_len && result_len < wi.blob_length) {
+ // Cool. We compressed at least as much as we were hoping to.
+ // pad out to min_alloc_size
+ wi.compressed_bl.append_zero(result_len - compressed_len);
+ wi.compressed_len = compressed_len;
+ wi.compressed = true;
+ logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
+ dout(20) << __func__ << std::hex << " compressed 0x" << wi.blob_length
+ << " -> 0x" << compressed_len << " => 0x" << result_len
+ << " with " << c->get_type()
+ << std::dec << dendl;
+ txc->statfs_delta.compressed() += compressed_len;
+ txc->statfs_delta.compressed_original() += wi.blob_length;
+ txc->statfs_delta.compressed_allocated() += result_len;
+ logger->inc(l_bluestore_compress_success_count);
+ need += result_len;
+ data_size += result_len;
+ } else {
+ rejected = true;
+ }
+ } else if (r != 0) {
+ dout(5) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " bytes compressed using " << c->get_type_name()
+ << std::dec
+ << " failed with errcode = " << r
+ << ", leaving uncompressed"
+ << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ } else {
+ rejected = true;
+ }
+
+ if (rejected) {
+ dout(20) << __func__ << std::hex << " 0x" << wi.blob_length
+ << " compressed to 0x" << compressed_len << " -> 0x" << result_len
+ << " with " << c->get_type()
+ << ", which is more than required 0x" << want_len_raw
+ << " -> 0x" << want_len
+ << ", leaving uncompressed"
+ << std::dec << dendl;
+ logger->inc(l_bluestore_compress_rejected_count);
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ }
+ log_latency("compress@_do_alloc_write",
+ l_bluestore_compress_lat,
+ mono_clock::now() - start,
+		  cct->_conf->bluestore_log_op_age);
+ } else {
+ need += wi.blob_length;
+ data_size += wi.bl.length();
+ }
+ }
+ PExtentVector prealloc;
+ prealloc.reserve(2 * wctx->writes.size());
+ int64_t prealloc_left = 0;
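+  // allocate space for all write items in a single call; the returned
+  // extents are carved up per blob in the loop below.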
+ prealloc_left = shared_alloc.a->allocate(
+ need, min_alloc_size, need,
+ 0, &prealloc);
+ if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
+ derr << __func__ << " failed to allocate 0x" << std::hex << need
+ << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
+ << " min_alloc_size 0x" << min_alloc_size
+ << " available 0x " << shared_alloc.a->get_free()
+ << std::dec << dendl;
+ if (prealloc.size()) {
+ shared_alloc.a->release(prealloc);
+ }
+ return -ENOSPC;
+ }
+ _collect_allocation_stats(need, min_alloc_size, prealloc.size());
+
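+  // on zoned (SMR) devices, hand any zones the allocator now wants
+  // cleaned over to the zone cleaner thread.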
+ if (bdev->is_smr()) {
+ std::deque<uint64_t> zones_to_clean;
+ if (shared_alloc.a->zoned_get_zones_to_clean(&zones_to_clean)) {
+ std::lock_guard l{zoned_cleaner_lock};
+ zoned_cleaner_queue.swap(zones_to_clean);
+ zoned_cleaner_cond.notify_one();
+ }
+ }
+
+ dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
+ << " prealloc " << prealloc << dendl;
+ auto prealloc_pos = prealloc.begin();
+ ceph_assert(prealloc_pos != prealloc.end());
+
+ for (auto& wi : wctx->writes) {
+ bluestore_blob_t& dblob = wi.b->dirty_blob();
+ uint64_t b_off = wi.b_off;
+ bufferlist *l = &wi.bl;
+ uint64_t final_length = wi.blob_length;
+ uint64_t csum_length = wi.blob_length;
+ if (wi.compressed) {
+ final_length = wi.compressed_bl.length();
+ csum_length = final_length;
+ unsigned csum_order = ctz(csum_length);
+ l = &wi.compressed_bl;
+ dblob.set_compressed(wi.blob_length, wi.compressed_len);
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__
+ << " initialize csum setting for compressed blob " << *wi.b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length
+ << " blob_length 0x" << wi.blob_length
+ << " compressed_length 0x" << wi.compressed_len << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ } else if (wi.new_blob) {
+ unsigned csum_order;
+ // initialize newly created blob only
+ ceph_assert(dblob.is_mutable());
+ if (l->length() != wi.blob_length) {
+ // hrm, maybe we could do better here, but let's not bother.
+ dout(20) << __func__ << " forcing csum_order to block_size_order "
+ << block_size_order << dendl;
+ csum_order = block_size_order;
+ } else {
+ csum_order = std::min(wctx->csum_order, ctz(l->length()));
+ }
+ // try to align blob with max_blob_size to improve
+ // its reuse ratio, e.g. in case of reverse write
+ uint32_t suggested_boff =
+ (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
+ if ((suggested_boff % (1 << csum_order)) == 0 &&
+ suggested_boff + final_length <= max_bsize &&
+ suggested_boff > b_off) {
+ dout(20) << __func__ << " forcing blob_offset to 0x"
+ << std::hex << suggested_boff << std::dec << dendl;
+ ceph_assert(suggested_boff >= b_off);
+ csum_length += suggested_boff - b_off;
+ b_off = suggested_boff;
+ }
+ if (csum != Checksummer::CSUM_NONE) {
+ dout(20) << __func__
+ << " initialize csum setting for new blob " << *wi.b
+ << " csum_type " << Checksummer::get_csum_type_string(csum)
+ << " csum_order " << csum_order
+ << " csum_length 0x" << std::hex << csum_length << std::dec
+ << dendl;
+ dblob.init_csum(csum, csum_order, csum_length);
+ }
+ }
+
+ PExtentVector extents;
+ int64_t left = final_length;
+ auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
+ while (left > 0) {
+ ceph_assert(prealloc_left > 0);
+ if (prealloc_pos->length <= left) {
+ prealloc_left -= prealloc_pos->length;
+ left -= prealloc_pos->length;
+ txc->statfs_delta.allocated() += prealloc_pos->length;
+ extents.push_back(*prealloc_pos);
+ ++prealloc_pos;
+ } else {
+ extents.emplace_back(prealloc_pos->offset, left);
+ prealloc_pos->offset += left;
+ prealloc_pos->length -= left;
+ prealloc_left -= left;
+ txc->statfs_delta.allocated() += left;
+ left = 0;
+ break;
+ }
+ }
+ for (auto& p : extents) {
+ txc->allocated.insert(p.offset, p.length);
+ }
+ dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
+
+ dout(20) << __func__ << " blob " << *wi.b << dendl;
+ if (dblob.has_csum()) {
+ dblob.calc_csum(b_off, *l);
+ }
+
+ if (wi.mark_unused) {
+ ceph_assert(!dblob.is_compressed());
+ auto b_end = b_off + wi.bl.length();
+ if (b_off) {
+ dblob.add_unused(0, b_off);
+ }
+ uint64_t llen = dblob.get_logical_length();
+ if (b_end < llen) {
+ dblob.add_unused(b_end, llen - b_end);
+ }
+ }
+
+ Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
+ b_off + (wi.b_off0 - wi.b_off),
+ wi.length0,
+ wi.b,
+ nullptr);
+ wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
+ txc->statfs_delta.stored() += le->length;
+ dout(20) << __func__ << " lex " << *le << dendl;
+ _buffer_cache_write(txc, wi.b, b_off, wi.bl,
+ wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+ // queue io
+ if (!g_conf()->bluestore_debug_omit_block_device_write) {
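+      // one deferred-vs-direct decision was made above for all blobs in this
+      // write context: if the total data_size is below prefer_deferred_size,
+      // queue the payload as a deferred write, otherwise submit it via aio.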
+ if (data_size < prefer_deferred_size_snapshot) {
+ dout(20) << __func__ << " deferring 0x" << std::hex
+ << l->length() << std::dec << " write via deferred" << dendl;
+ bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
+ op->op = bluestore_deferred_op_t::OP_WRITE;
+ int r = wi.b->get_blob().map(
+ b_off, l->length(),
+ [&](uint64_t offset, uint64_t length) {
+ op->extents.emplace_back(bluestore_pextent_t(offset, length));
+ return 0;
+ });
+ ceph_assert(r == 0);
+ op->data = *l;
+ } else {
+ wi.b->get_blob().map_bl(
+ b_off, *l,
+ [&](uint64_t offset, bufferlist& t) {
+ bdev->aio_write(offset, t, &txc->ioc, false);
+ });
+ logger->inc(l_bluestore_write_new);
+ }
+ }
+ }
+ ceph_assert(prealloc_pos == prealloc.end());
+ ceph_assert(prealloc_left == 0);
+ return 0;
+}
+
+void BlueStore::_wctx_finish(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ WriteContext *wctx,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+ auto oep = wctx->old_extents.begin();
+ while (oep != wctx->old_extents.end()) {
+ auto &lo = *oep;
+ oep = wctx->old_extents.erase(oep);
+ dout(20) << __func__ << " lex_old " << lo.e << dendl;
+ BlobRef b = lo.e.blob;
+ const bluestore_blob_t& blob = b->get_blob();
+ if (blob.is_compressed()) {
+ if (lo.blob_empty) {
+ txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
+ }
+ txc->statfs_delta.compressed_original() -= lo.e.length;
+ }
+ auto& r = lo.r;
+ txc->statfs_delta.stored() -= lo.e.length;
+ if (!r.empty()) {
+ dout(20) << __func__ << " blob " << *b << " release " << r << dendl;
+ if (blob.is_shared()) {
+ PExtentVector final;
+ c->load_shared_blob(b->shared_blob);
+ bool unshare = false;
+ bool* unshare_ptr =
+ !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;
+ for (auto e : r) {
+ b->shared_blob->put_ref(
+ e.offset, e.length, &final,
+ unshare_ptr);
+ }
+ if (unshare) {
+ ceph_assert(maybe_unshared_blobs);
+ maybe_unshared_blobs->insert(b->shared_blob.get());
+ }
+ dout(20) << __func__ << " shared_blob release " << final
+ << " from " << *b->shared_blob << dendl;
+ txc->write_shared_blob(b->shared_blob);
+ r.clear();
+ r.swap(final);
+ }
+ }
+ // we can't invalidate our logical extents as we drop them because
+ // other lextents (either in our onode or others) may still
+ // reference them. but we can throw out anything that is no
+ // longer allocated. Note that this will leave behind edge bits
+ // that are no longer referenced but not deallocated (until they
+ // age out of the cache naturally).
+ b->discard_unallocated(c.get());
+ for (auto e : r) {
+ dout(20) << __func__ << " release " << e << dendl;
+ txc->released.insert(e.offset, e.length);
+ txc->statfs_delta.allocated() -= e.length;
+ if (blob.is_compressed()) {
+ txc->statfs_delta.compressed_allocated() -= e.length;
+ }
+ }
+
+ if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
+ dout(20) << __func__ << " spanning_blob_map removing empty " << *b
+ << dendl;
+ o->extent_map.spanning_blob_map.erase(b->id);
+ }
+ delete &lo;
+ }
+}
+
+void BlueStore::_do_write_data(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ WriteContext *wctx)
+{
+ uint64_t end = offset + length;
+ bufferlist::iterator p = bl.begin();
+
+ if (offset / min_alloc_size == (end - 1) / min_alloc_size &&
+ (length != min_alloc_size)) {
+ // we fall within the same block
+ _do_write_small(txc, c, o, offset, length, p, wctx);
+ } else {
+ uint64_t head_offset, head_length;
+ uint64_t middle_offset, middle_length;
+ uint64_t tail_offset, tail_length;
+
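+    // split the write into an unaligned head, a min_alloc_size-aligned
+    // middle, and an unaligned tail; head and tail take the small-write
+    // path, the middle takes the big-write path.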
+ head_offset = offset;
+ head_length = p2nphase(offset, min_alloc_size);
+
+ tail_offset = p2align(end, min_alloc_size);
+ tail_length = p2phase(end, min_alloc_size);
+
+ middle_offset = head_offset + head_length;
+ middle_length = length - head_length - tail_length;
+
+ if (head_length) {
+ _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
+ }
+
+ _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
+
+ if (tail_length) {
+ _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
+ }
+ }
+}
+
+void BlueStore::_choose_write_options(
+ CollectionRef& c,
+ OnodeRef o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx)
+{
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+ dout(20) << __func__ << " will do buffered write" << dendl;
+ wctx->buffered = true;
+ } else if (cct->_conf->bluestore_default_buffered_write &&
+ (fadvise_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+ dout(20) << __func__ << " defaulting to buffered write" << dendl;
+ wctx->buffered = true;
+ }
+
+ // apply basic csum block size
+ wctx->csum_order = block_size_order;
+
+ // compression parameters
+ unsigned alloc_hints = o->onode.alloc_hint_flags;
+ auto cm = select_option(
+ "compression_mode",
+ comp_mode.load(),
+ [&]() {
+ string val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &val)) {
+ return boost::optional<Compressor::CompressionMode>(
+ Compressor::get_comp_mode_type(val));
+ }
+ return boost::optional<Compressor::CompressionMode>();
+ }
+ );
+
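+  // COMP_FORCE compresses unconditionally; COMP_AGGRESSIVE compresses
+  // unless the object is hinted INCOMPRESSIBLE; COMP_PASSIVE compresses
+  // only when it is hinted COMPRESSIBLE.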
+ wctx->compress = (cm != Compressor::COMP_NONE) &&
+ ((cm == Compressor::COMP_FORCE) ||
+ (cm == Compressor::COMP_AGGRESSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) ||
+ (cm == Compressor::COMP_PASSIVE &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)));
+
+ if ((alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) == 0 &&
+ (alloc_hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) &&
+ (alloc_hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) == 0) {
+
+ dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
+
+ if (o->onode.expected_write_size) {
+ wctx->csum_order = std::max(min_alloc_size_order,
+ (uint8_t)ctz(o->onode.expected_write_size));
+ } else {
+ wctx->csum_order = min_alloc_size_order;
+ }
+
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_max_blob_size",
+ comp_max_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &val)) {
+ return boost::optional<uint64_t>((uint64_t)val);
+ }
+ return boost::optional<uint64_t>();
+ }
+ );
+ }
+ } else {
+ if (wctx->compress) {
+ wctx->target_blob_size = select_option(
+ "compression_min_blob_size",
+ comp_min_blob_size.load(),
+ [&]() {
+ int64_t val;
+ if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &val)) {
+ return boost::optional<uint64_t>((uint64_t)val);
+ }
+ return boost::optional<uint64_t>();
+ }
+ );
+ }
+ }
+
+ uint64_t max_bsize = max_blob_size.load();
+ if (wctx->target_blob_size == 0 || wctx->target_blob_size > max_bsize) {
+ wctx->target_blob_size = max_bsize;
+ }
+
+ // set the min blob size floor at 2x the min_alloc_size, or else we
+ // won't be able to allocate a smaller extent for the compressed
+ // data.
+ if (wctx->compress &&
+ wctx->target_blob_size < min_alloc_size * 2) {
+ wctx->target_blob_size = min_alloc_size * 2;
+ }
+
+ dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
+ << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+ << " compress=" << (int)wctx->compress
+ << " buffered=" << (int)wctx->buffered
+ << std::dec << dendl;
+}
+
+int BlueStore::_do_gc(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end)
+{
+
+ bool dirty_range_updated = false;
+ WriteContext wctx_gc;
+ wctx_gc.fork(wctx); // make a clone for garbage collection
+
+ auto & extents_to_collect = wctx.extents_to_gc;
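+  // re-read each extent selected for collection and push it back through
+  // the regular write path (_do_write_data + _do_alloc_write) so it is
+  // rewritten into new blobs.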
+ for (auto it = extents_to_collect.begin();
+ it != extents_to_collect.end();
+ ++it) {
+ bufferlist bl;
+ auto offset = (*it).first;
+ auto length = (*it).second;
+ dout(20) << __func__ << " processing " << std::hex
+ << offset << "~" << length << std::dec
+ << dendl;
+ int r = _do_read(c.get(), o, offset, length, bl, 0);
+ ceph_assert(r == (int)length);
+
+ _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
+ logger->inc(l_bluestore_gc_merged, length);
+
+ if (*dirty_start > offset) {
+ *dirty_start = offset;
+ dirty_range_updated = true;
+ }
+
+ if (*dirty_end < offset + length) {
+ *dirty_end = offset + length;
+ dirty_range_updated = true;
+ }
+ }
+ if (dirty_range_updated) {
+ o->extent_map.fault_range(db, *dirty_start, *dirty_end);
+ }
+
+ dout(30) << __func__ << " alloc write" << dendl;
+ int r = _do_alloc_write(txc, c, o, &wctx_gc);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ _wctx_finish(txc, c, o, &wctx_gc);
+ return 0;
+}
+
+int BlueStore::_do_write(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length
+ << " - have 0x" << o->onode.size
+ << " (" << std::dec << o->onode.size << ")"
+ << " bytes" << std::hex
+ << " fadvise_flags 0x" << fadvise_flags
+ << " alloc_hint 0x" << o->onode.alloc_hint_flags
+ << " expected_object_size " << o->onode.expected_object_size
+ << " expected_write_size " << o->onode.expected_write_size
+ << std::dec
+ << dendl;
+ _dump_onode<30>(cct, *o);
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t end = offset + length;
+
+ GarbageCollector gc(c->store->cct);
+ int64_t benefit = 0;
+ auto dirty_start = offset;
+ auto dirty_end = end;
+
+ WriteContext wctx;
+ _choose_write_options(c, o, fadvise_flags, &wctx);
+ o->extent_map.fault_range(db, offset, length);
+ _do_write_data(txc, c, o, offset, length, bl, &wctx);
+ r = _do_alloc_write(txc, c, o, &wctx);
+ if (r < 0) {
+ derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+
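+  // estimate the benefit of collecting overlapping compressed extents
+  // unless the extents already queued for GC fully cover this write range.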
+ if (wctx.extents_to_gc.empty() ||
+ wctx.extents_to_gc.range_start() > offset ||
+ wctx.extents_to_gc.range_end() < offset + length) {
+ benefit = gc.estimate(offset,
+ length,
+ o->extent_map,
+ wctx.old_extents,
+ min_alloc_size);
+ }
+
+ if (bdev->is_smr()) {
+ if (wctx.old_extents.empty()) {
+ txc->zoned_note_new_object(o);
+ } else {
+ int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
+ txc->zoned_note_updated_object(o, old_ondisk_offset);
+ }
+ }
+
+ // NB: _wctx_finish() will empty old_extents
+ // so we must do gc estimation before that
+ _wctx_finish(txc, c, o, &wctx);
+ if (end > o->onode.size) {
+ dout(20) << __func__ << " extending size to 0x" << std::hex << end
+ << std::dec << dendl;
+ o->onode.size = end;
+ }
+
+ if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
+ wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
+ dout(20) << __func__
+ << " perform garbage collection for compressed extents, "
+ << "expected benefit = " << benefit << " AUs" << dendl;
+ }
+ if (!wctx.extents_to_gc.empty()) {
+ dout(20) << __func__ << " perform garbage collection" << dendl;
+
+ r = _do_gc(txc, c, o,
+ wctx,
+ &dirty_start, &dirty_end);
+ if (r < 0) {
+ derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
+ << dendl;
+ goto out;
+ }
+    dout(20) << __func__ << " gc range is " << std::hex << dirty_start
+ << "~" << dirty_end - dirty_start << std::dec << dendl;
+ }
+ o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
+ o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);
+
+ r = 0;
+
+ out:
+ return r;
+}
+
+int BlueStore::_write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+ txc->write_onode(o);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+ if (offset + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _assign_nid(txc, o);
+ r = _do_zero(txc, c, o, offset, length);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << dendl;
+ int r = 0;
+
+ _dump_onode<30>(cct, *o);
+
+ WriteContext wctx;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+ _wctx_finish(txc, c, o, &wctx);
+
+ if (length > 0 && offset + length > o->onode.size) {
+ o->onode.size = offset + length;
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ }
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << "~" << length << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_truncate(
+ TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset,
+ set<SharedBlob*> *maybe_unshared_blobs)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec << dendl;
+
+ _dump_onode<30>(cct, *o);
+
+ if (offset == o->onode.size)
+ return;
+
+ WriteContext wctx;
+ if (offset < o->onode.size) {
+ uint64_t length = o->onode.size - offset;
+ o->extent_map.fault_range(db, offset, length);
+ o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
+ o->extent_map.dirty_range(offset, length);
+ _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs);
+
+ // if we have shards past EOF, ask for a reshard
+ if (!o->onode.extent_map_shards.empty() &&
+ o->onode.extent_map_shards.back().offset >= offset) {
+ dout(10) << __func__ << " request reshard past EOF" << dendl;
+ if (offset) {
+ o->extent_map.request_reshard(offset - 1, offset + length);
+ } else {
+ o->extent_map.request_reshard(0, length);
+ }
+ }
+ }
+
+ o->onode.size = offset;
+
+ if (bdev->is_smr()) {
+ // On zoned devices, we currently support only removing an object or
+ // truncating it to zero size, both of which fall through this code path.
+ ceph_assert(offset == 0 && !wctx.old_extents.empty());
+ int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
+ txc->zoned_note_truncated_object(o, ondisk_offset);
+ }
+
+ txc->write_onode(o);
+}
+
+int BlueStore::_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << dendl;
+ int r = 0;
+ if (offset >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ } else {
+ _do_truncate(txc, c, o, offset);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " 0x" << std::hex << offset << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_remove(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o)
+{
+ set<SharedBlob*> maybe_unshared_blobs;
+ bool is_gen = !o->oid.is_no_gen();
+ _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc, o);
+ }
+ o->exists = false;
+ string key;
+ for (auto &s : o->extent_map.shards) {
+ dout(20) << __func__ << " removing shard 0x" << std::hex
+ << s.shard_info->offset << std::dec << dendl;
+ generate_extent_shard_key_and_apply(o->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ }
+ txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
+ txc->note_removed_object(o);
+ o->extent_map.clear();
+ o->onode = bluestore_onode_t();
+ _debug_obj_on_delete(o->oid);
+
+ if (!is_gen || maybe_unshared_blobs.empty()) {
+ return 0;
+ }
+
+ // see if we can unshare blobs still referenced by the head
+ dout(10) << __func__ << " gen and maybe_unshared_blobs "
+ << maybe_unshared_blobs << dendl;
+ ghobject_t nogen = o->oid;
+ nogen.generation = ghobject_t::NO_GEN;
+ OnodeRef h = c->get_onode(nogen, false);
+
+ if (!h || !h->exists) {
+ return 0;
+ }
+
+ dout(20) << __func__ << " checking for unshareable blobs on " << h
+ << " " << h->oid << dendl;
+ map<SharedBlob*,bluestore_extent_ref_map_t> expect;
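+  // build the ref map each shared blob would have if only the head
+  // referenced it; if the persistent ref map matches, the blob can be
+  // unshared.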
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ sb->loaded &&
+ maybe_unshared_blobs.count(sb)) {
+ if (b.is_compressed()) {
+ expect[sb].get(0, b.get_ondisk_length());
+ } else {
+ b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) {
+ expect[sb].get(off, len);
+ return 0;
+ });
+ }
+ }
+ }
+
+ vector<SharedBlob*> unshared_blobs;
+ unshared_blobs.reserve(maybe_unshared_blobs.size());
+ for (auto& p : expect) {
+ dout(20) << " ? " << *p.first << " vs " << p.second << dendl;
+ if (p.first->persistent->ref_map == p.second) {
+ SharedBlob *sb = p.first;
+ dout(20) << __func__ << " unsharing " << *sb << dendl;
+ unshared_blobs.push_back(sb);
+ txc->unshare_blob(sb);
+ uint64_t sbid = c->make_blob_unshared(sb);
+ string key;
+ get_shared_blob_key(sbid, &key);
+ txc->t->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ }
+
+ if (unshared_blobs.empty()) {
+ return 0;
+ }
+
+ for (auto& e : h->extent_map.extent_map) {
+ const bluestore_blob_t& b = e.blob->get_blob();
+ SharedBlob *sb = e.blob->shared_blob.get();
+ if (b.is_shared() &&
+ std::find(unshared_blobs.begin(), unshared_blobs.end(),
+ sb) != unshared_blobs.end()) {
+ dout(20) << __func__ << " unsharing " << e << dendl;
+ bluestore_blob_t& blob = e.blob->dirty_blob();
+ blob.clear_flag(bluestore_blob_t::FLAG_SHARED);
+ h->extent_map.dirty_range(e.logical_offset, 1);
+ }
+ }
+ txc->write_onode(h);
+
+ return 0;
+}
+
+int BlueStore::_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " onode " << o.get()
+ << " txc "<< txc << dendl;
+
+ auto start_time = mono_clock::now();
+ int r = _do_remove(txc, c, o);
+ log_latency_fn(
+ __func__,
+ l_bluestore_remove_lat,
+ mono_clock::now() - start_time,
+ cct->_conf->bluestore_log_op_age,
+ [&](const ceph::timespan& lat) {
+ ostringstream ostr;
+ ostr << ", lat = " << timespan_str(lat)
+ << " cid =" << c->cid
+ << " oid =" << o->oid;
+ return ostr.str();
+ }
+ );
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << dendl;
+ int r = 0;
+ if (val.is_partial()) {
+ auto& b = o->onode.attrs[name.c_str()] = bufferptr(val.c_str(),
+ val.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[name.c_str()] = val;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << dendl;
+ int r = 0;
+ for (map<string,bufferptr>::const_iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ if (p->second.is_partial()) {
+ auto& b = o->onode.attrs[p->first.c_str()] =
+ bufferptr(p->second.c_str(), p->second.length());
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ } else {
+ auto& b = o->onode.attrs[p->first.c_str()] = p->second;
+ b.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+ }
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << " = " << r << dendl;
+ return r;
+}
+
+
+int BlueStore::_rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << dendl;
+ int r = 0;
+ auto it = o->onode.attrs.find(name.c_str());
+ if (it == o->onode.attrs.end())
+ goto out;
+
+ o->onode.attrs.erase(it);
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+
+ if (o->onode.attrs.empty())
+ goto out;
+
+ o->onode.attrs.clear();
+ txc->write_onode(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
+{
+ const string& omap_prefix = o->get_omap_prefix();
+ string prefix, tail;
+ o->get_omap_header(&prefix);
+ o->get_omap_tail(&tail);
+ txc->t->rm_range_keys(omap_prefix, prefix, tail);
+ txc->t->rmkey(omap_prefix, tail);
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(prefix) << " end: "
+ << pretty_binary_string(tail) << dendl;
+}
+
+int BlueStore::_omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ if (o->onode.has_omap()) {
+ o->flush();
+ _do_omap_clear(txc, o);
+ o->onode.clear_omap_flag();
+ txc->write_onode(o);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist &bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ auto p = bl.cbegin();
+ __u32 num;
+ if (!o->onode.has_omap()) {
+ if (o->oid.is_pgmeta()) {
+ o->onode.set_omap_flags_pgmeta();
+ } else {
+ o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ txc->write_onode(o);
+
+ const string& prefix = o->get_omap_prefix();
+ string key_tail;
+ bufferlist tail;
+ o->get_omap_tail(&key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix = o->get_omap_prefix();
+ string final_key;
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ decode(num, p);
+ while (num--) {
+ string key;
+ bufferlist value;
+ decode(key, p);
+ decode(value, p);
+ final_key.resize(base_key_len); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->set(prefix, final_key, value);
+ }
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ string key;
+ if (!o->onode.has_omap()) {
+ if (o->oid.is_pgmeta()) {
+ o->onode.set_omap_flags_pgmeta();
+ } else {
+ o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ txc->write_onode(o);
+
+ const string& prefix = o->get_omap_prefix();
+ string key_tail;
+ bufferlist tail;
+ o->get_omap_tail(&key_tail);
+ txc->t->set(prefix, key_tail, tail);
+ } else {
+ txc->note_modified_object(o);
+ }
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_header(&key);
+ txc->t->set(prefix, key, bl);
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ auto p = bl.cbegin();
+ __u32 num;
+ string final_key;
+
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->get_omap_key(string(), &final_key);
+ size_t base_key_len = final_key.size();
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ final_key.resize(base_key_len); // keep prefix
+ final_key += key;
+ dout(20) << __func__ << " rm " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->rmkey(prefix, final_key);
+ }
+ }
+ txc->note_modified_object(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ string key_first, key_last;
+ int r = 0;
+ if (!o->onode.has_omap()) {
+ goto out;
+ }
+ {
+ const string& prefix = o->get_omap_prefix();
+ o->flush();
+ o->get_omap_key(first, &key_first);
+ o->get_omap_key(last, &key_last);
+ txc->t->rm_range_keys(prefix, key_first, key_last);
+ dout(20) << __func__ << " remove range start: "
+ << pretty_binary_string(key_first) << " end: "
+ << pretty_binary_string(key_last) << dendl;
+ }
+ txc->note_modified_object(o);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_set_alloc_hint(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << dendl;
+ int r = 0;
+ o->onode.expected_object_size = expected_object_size;
+ o->onode.expected_write_size = expected_write_size;
+ o->onode.alloc_hint_flags = flags;
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << dendl;
+ int r = 0;
+ if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+ derr << __func__ << " mismatched hash on " << oldo->oid
+ << " and " << newo->oid << dendl;
+ return -EINVAL;
+ }
+
+ _assign_nid(txc, newo);
+
+ // clone data
+ oldo->flush();
+ _do_truncate(txc, c, newo, 0);
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+ }
+
+ // clone attrs
+ newo->onode.attrs = oldo->onode.attrs;
+
+ // clone omap
+ if (newo->onode.has_omap()) {
+ dout(20) << __func__ << " clearing old omap data" << dendl;
+ newo->flush();
+ _do_omap_clear(txc, newo);
+ newo->onode.clear_omap_flag();
+ }
+ if (oldo->onode.has_omap()) {
+ dout(20) << __func__ << " copying omap data" << dendl;
+ if (newo->oid.is_pgmeta()) {
+ newo->onode.set_omap_flags_pgmeta();
+ } else {
+ newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+ }
+ const string& prefix = newo->get_omap_prefix();
+ string head, tail;
+ oldo->get_omap_header(&head);
+ oldo->get_omap_tail(&tail);
+ KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ dout(30) << __func__ << " got header/data "
+ << pretty_binary_string(it->key()) << dendl;
+ string key;
+ newo->rewrite_omap_key(it->key(), &key);
+ txc->t->set(prefix, key, it->value());
+ }
+ it->next();
+ }
+ string new_tail;
+ bufferlist new_tail_value;
+ newo->get_omap_tail(&new_tail);
+ txc->t->set(prefix, new_tail, new_tail_value);
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_do_clone_range(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff,
+ uint64_t length,
+ uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid
+ << " 0x" << std::hex << srcoff << "~" << length << " -> "
+ << " 0x" << dstoff << "~" << length << std::dec << dendl;
+ oldo->extent_map.fault_range(db, srcoff, length);
+ newo->extent_map.fault_range(db, dstoff, length);
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+
+ oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
+ _dump_onode<30>(cct, *oldo);
+ _dump_onode<30>(cct, *newo);
+ return 0;
+}
+
+int BlueStore::_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec << dendl;
+ int r = 0;
+
+ if (srcoff + length >= OBJECT_MAX_SIZE ||
+ dstoff + length >= OBJECT_MAX_SIZE) {
+ r = -E2BIG;
+ goto out;
+ }
+ if (srcoff + length > oldo->onode.size) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ _assign_nid(txc, newo);
+
+ if (length > 0) {
+ if (cct->_conf->bluestore_clone_cow) {
+ _do_zero(txc, c, newo, dstoff, length);
+ _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
+ } else {
+ bufferlist bl;
+ r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
+ if (r < 0)
+ goto out;
+ r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
+ if (r < 0)
+ goto out;
+ }
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+ << " to offset 0x" << dstoff << std::dec
+ << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << new_oid << dendl;
+ int r;
+ ghobject_t old_oid = oldo->oid;
+ mempool::bluestore_cache_meta::string new_okey;
+
+ if (newo) {
+ if (newo->exists) {
+ r = -EEXIST;
+ goto out;
+ }
+ ceph_assert(txc->onodes.count(newo) == 0);
+ }
+
+ txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
+
+ // rewrite shards
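+  // extent map shard keys embed the onode key, so drop the old shard keys
+  // here and mark each shard dirty so it is rewritten under the new key
+  // when the onode is persisted.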
+ {
+ oldo->extent_map.fault_range(db, 0, oldo->onode.size);
+ get_object_key(cct, new_oid, &new_okey);
+ string key;
+ for (auto &s : oldo->extent_map.shards) {
+ generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
+ [&](const string& final_key) {
+ txc->t->rmkey(PREFIX_OBJ, final_key);
+ }
+ );
+ s.dirty = true;
+ }
+ }
+
+ newo = oldo;
+ txc->write_onode(newo);
+
+  // this adjusts oldo->{oid,key} and resets oldo to a fresh, empty
+  // Onode in the old slot
+ c->onode_map.rename(oldo, old_oid, new_oid, new_okey);
+ r = 0;
+
+ // hold a ref to new Onode in old name position, to ensure we don't drop
+ // it from the cache before this txc commits (or else someone may come along
+ // and read newo's metadata via the old name).
+ txc->note_modified_object(oldo);
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+ << new_oid << " = " << r << dendl;
+ return r;
+}
+
+// collections
+
+int BlueStore::_create_collection(
+ TransContext *txc,
+ const coll_t &cid,
+ unsigned bits,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+ int r;
+ bufferlist bl;
+
+ {
+ std::unique_lock l(coll_lock);
+ if (*c) {
+ r = -EEXIST;
+ goto out;
+ }
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ *c = p->second;
+ (*c)->cnode.bits = bits;
+ coll_map[cid] = *c;
+ new_coll_map.erase(p);
+ }
+ encode((*c)->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(cid), bl);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << dendl;
+ int r;
+
+ (*c)->flush_all_but_last();
+ {
+ std::unique_lock l(coll_lock);
+ if (!*c) {
+ r = -ENOENT;
+ goto out;
+ }
+ size_t nonexistent_count = 0;
+ ceph_assert((*c)->exists);
+ if ((*c)->onode_map.map_any([&](Onode* o) {
+ if (o->exists) {
+ dout(1) << __func__ << " " << o->oid << " " << o
+ << " exists in onode_map" << dendl;
+ return true;
+ }
+ ++nonexistent_count;
+ return false;
+ })) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+ vector<ghobject_t> ls;
+ ghobject_t next;
+    // Enumerate up to nonexistent_count + 1 onodes in the db, then check
+    // that all of them are marked as non-existent.
+    // Bypass the check (and treat the collection as non-empty) if
+    // next != ghobject_t::get_max().
+ r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+ nonexistent_count + 1, false, &ls, &next);
+ if (r >= 0) {
+      // If true, the collection has more objects than nonexistent_count,
+      // so bypass the check.
+ bool exists = (!next.is_max());
+ for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+ dout(10) << __func__ << " oid " << *it << dendl;
+ auto onode = (*c)->onode_map.lookup(*it);
+ exists = !onode || onode->exists;
+ if (exists) {
+ dout(1) << __func__ << " " << *it
+ << " exists in db, "
+ << (!onode ? "not present in ram" : "present in ram")
+ << dendl;
+ }
+ }
+ if (!exists) {
+ _do_remove_collection(txc, c);
+ r = 0;
+ } else {
+ dout(10) << __func__ << " " << cid
+ << " is non-empty" << dendl;
+ r = -ENOTEMPTY;
+ }
+ }
+ }
+out:
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::_do_remove_collection(TransContext *txc,
+ CollectionRef *c)
+{
+ coll_map.erase((*c)->cid);
+ txc->removed_collections.push_back(*c);
+ (*c)->exists = false;
+ _osr_register_zombie((*c)->osr.get());
+ txc->t->rmkey(PREFIX_COLL, stringify((*c)->cid));
+ c->reset();
+}
+
+int BlueStore::_split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem)
+{
+ dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ std::unique_lock l(c->lock);
+ std::unique_lock l2(d->lock);
+ int r;
+
+  // flush all previous deferred writes on this sequencer. this is a bit
+  // heavyweight, but we need to make sure all deferred writes complete
+  // before we split, as the new collection's sequencer may need to order
+  // ops after those writes, and we don't bother with the complexity of
+  // moving those TransContexts over to the new osr.
+ _osr_drain_preceding(txc);
+
+ // move any cached items (onodes and referenced shared blobs) that will
+ // belong to the child collection post-split. leave everything else behind.
+ // this may include things that don't strictly belong to the now-smaller
+ // parent split, but the OSD will always send us a split for every new
+ // child.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = c->cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // the destination should initially be empty.
+ ceph_assert(d->onode_map.empty());
+ ceph_assert(d->shared_blob_set.empty());
+ ceph_assert(d->cnode.bits == bits);
+
+ c->split_cache(d.get());
+
+ // adjust bits. note that this will be redundant for all but the first
+ // split call for this parent (first child).
+ c->cnode.bits = bits;
+ ceph_assert(d->cnode.bits == bits);
+ r = 0;
+
+ bufferlist bl;
+ encode(c->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+ dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int BlueStore::_merge_collection(
+ TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits)
+{
+ dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
+ << " bits " << bits << dendl;
+ std::unique_lock l((*c)->lock);
+ std::unique_lock l2(d->lock);
+ int r;
+
+ coll_t cid = (*c)->cid;
+
+  // flush all previous deferred writes on the source collection to ensure
+  // that all deferred writes complete before we merge, as the target
+  // collection's sequencer may need to order new ops after those writes.
+
+ _osr_drain((*c)->osr.get());
+
+  // move cached items (onodes and referenced shared blobs) from the
+  // source collection into the target collection, which owns them once
+  // the merge completes.
+
+ spg_t pgid, dest_pgid;
+ bool is_pg = cid.is_pg(&pgid);
+ ceph_assert(is_pg);
+ is_pg = d->cid.is_pg(&dest_pgid);
+ ceph_assert(is_pg);
+
+ // adjust bits. note that this will be redundant for all but the first
+ // merge call for the parent/target.
+ d->cnode.bits = bits;
+
+  // split_cache() behavior depends on the target's (d) bits, so do this
+  // after the bits have been updated.
+ (*c)->split_cache(d.get());
+
+ // remove source collection
+ {
+ std::unique_lock l3(coll_lock);
+ _do_remove_collection(txc, c);
+ }
+
+ r = 0;
+
+ bufferlist bl;
+ encode(d->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
+
+ dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+void BlueStore::log_latency(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ const char* info) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << info
+ << dendl;
+ }
+}
+
+void BlueStore::log_latency_fn(
+ const char* name,
+ int idx,
+ const ceph::timespan& l,
+ double lat_threshold,
+ std::function<string (const ceph::timespan& lat)> fn) const
+{
+ logger->tinc(idx, l);
+ if (lat_threshold > 0.0 &&
+ l >= make_timespan(lat_threshold)) {
+ dout(0) << __func__ << " slow operation observed for " << name
+ << ", latency = " << l
+ << fn(l)
+ << dendl;
+ }
+}
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
+ pending_kv_ios += txc.ios;
+ if (txc.deferred_txn) {
+ pending_deferred_ios += txc.ios;
+ }
+
+ uint64_t started = 0;
+ uint64_t completed = 0;
+ if (should_trace(&started, &completed)) {
+ txc.tracing = true;
+ uint64_t rocksdb_base_level,
+ rocksdb_estimate_pending_compaction_bytes,
+ rocksdb_cur_size_all_mem_tables,
+ rocksdb_compaction_pending,
+ rocksdb_mem_table_flush_pending,
+ rocksdb_num_running_compactions,
+ rocksdb_num_running_flushes,
+ rocksdb_actual_delayed_write_rate;
+ db.get_property(
+ "rocksdb.base-level",
+ &rocksdb_base_level);
+ db.get_property(
+ "rocksdb.estimate-pending-compaction-bytes",
+ &rocksdb_estimate_pending_compaction_bytes);
+ db.get_property(
+ "rocksdb.cur-size-all-mem-tables",
+ &rocksdb_cur_size_all_mem_tables);
+ db.get_property(
+ "rocksdb.compaction-pending",
+ &rocksdb_compaction_pending);
+ db.get_property(
+ "rocksdb.mem-table-flush-pending",
+ &rocksdb_mem_table_flush_pending);
+ db.get_property(
+ "rocksdb.num-running-compactions",
+ &rocksdb_num_running_compactions);
+ db.get_property(
+ "rocksdb.num-running-flushes",
+ &rocksdb_num_running_flushes);
+ db.get_property(
+ "rocksdb.actual-delayed-write-rate",
+ &rocksdb_actual_delayed_write_rate);
+
+
+ tracepoint(
+ bluestore,
+ transaction_initial_state,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ throttle_bytes.get_current(),
+ throttle_deferred_bytes.get_current(),
+ pending_kv_ios,
+ pending_deferred_ios,
+ started,
+ completed,
+ ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
+
+ tracepoint(
+ bluestore,
+ transaction_initial_state_rocksdb,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ rocksdb_base_level,
+ rocksdb_estimate_pending_compaction_bytes,
+ rocksdb_cur_size_all_mem_tables,
+ rocksdb_compaction_pending,
+ rocksdb_mem_table_flush_pending,
+ rocksdb_num_running_compactions,
+ rocksdb_num_running_flushes,
+ rocksdb_actual_delayed_write_rate);
+ }
+}
+#endif
+
+mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
+ TransContext &txc, PerfCounters *logger, int state)
+{
+ mono_clock::time_point now = mono_clock::now();
+ mono_clock::duration lat = now - txc.last_stamp;
+ logger->tinc(state, lat);
+#if defined(WITH_LTTNG)
+ if (txc.tracing &&
+ state >= l_bluestore_state_prepare_lat &&
+ state <= l_bluestore_state_done_lat) {
+ OID_ELAPSED("", lat.to_nsec() / 1000.0, txc.get_state_latency_name(state));
+ tracepoint(
+ bluestore,
+ transaction_state_duration,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ state,
+ ceph::to_seconds<double>(lat));
+ }
+#endif
+ txc.last_stamp = now;
+ return lat;
+}
+
+bool BlueStore::BlueStoreThrottle::try_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
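+  // always charge the main byte throttle; for deferred transactions also
+  // try the deferred throttle without blocking, and return false if that
+  // fails so the caller can fall back to finish_start_transaction(),
+  // which waits for the deferred throttle.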
+ throttle_bytes.get(txc.cost);
+
+ if (!txc.deferred_txn || throttle_deferred_bytes.get_or_fail(txc.cost)) {
+ emit_initial_tracepoint(db, txc, start_throttle_acquire);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void BlueStore::BlueStoreThrottle::finish_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ mono_clock::time_point start_throttle_acquire)
+{
+ ceph_assert(txc.deferred_txn);
+ throttle_deferred_bytes.get(txc.cost);
+ emit_initial_tracepoint(db, txc, start_throttle_acquire);
+}
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
+{
+ pending_kv_ios -= 1;
+ ios_completed_since_last_traced++;
+ if (txc.tracing) {
+ tracepoint(
+ bluestore,
+ transaction_commit_latency,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ ceph::to_seconds<double>(mono_clock::now() - txc.start));
+ }
+}
+#endif
+
+#if defined(WITH_LTTNG)
+void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
+{
+ if (txc.deferred_txn) {
+ pending_deferred_ios -= 1;
+ }
+ if (txc.tracing) {
+ mono_clock::time_point now = mono_clock::now();
+ mono_clock::duration lat = now - txc.start;
+ tracepoint(
+ bluestore,
+ transaction_total_duration,
+ txc.osr->get_sequencer_id(),
+ txc.seq,
+ ceph::to_seconds<double>(lat));
+ }
+}
+#endif
+
+// DB key value Histogram
+#define KEY_SLAB 32
+#define VALUE_SLAB 64
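+// keys are bucketed into KEY_SLAB-byte slabs and values into
+// VALUE_SLAB-byte slabs for the histogram below.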
+
+const string prefix_onode = "o";
+const string prefix_onode_shard = "x";
+const string prefix_other = "Z";
+
+int BlueStore::DBHistogram::get_key_slab(size_t sz)
+{
+ return (sz/KEY_SLAB);
+}
+
+string BlueStore::DBHistogram::get_key_slab_to_range(int slab)
+{
+ int lower_bound = slab * KEY_SLAB;
+ int upper_bound = (slab + 1) * KEY_SLAB;
+ string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
+ return ret;
+}
+
+int BlueStore::DBHistogram::get_value_slab(size_t sz)
+{
+ return (sz/VALUE_SLAB);
+}
+
+string BlueStore::DBHistogram::get_value_slab_to_range(int slab)
+{
+ int lower_bound = slab * VALUE_SLAB;
+ int upper_bound = (slab + 1) * VALUE_SLAB;
+ string ret = "[" + stringify(lower_bound) + "," + stringify(upper_bound) + ")";
+ return ret;
+}
+
+void BlueStore::DBHistogram::update_hist_entry(map<string, map<int, struct key_dist> > &key_hist,
+ const string &prefix, size_t key_size, size_t value_size)
+{
+ uint32_t key_slab = get_key_slab(key_size);
+ uint32_t value_slab = get_value_slab(value_size);
+ key_hist[prefix][key_slab].count++;
+ key_hist[prefix][key_slab].max_len =
+ std::max<size_t>(key_size, key_hist[prefix][key_slab].max_len);
+ key_hist[prefix][key_slab].val_map[value_slab].count++;
+ key_hist[prefix][key_slab].val_map[value_slab].max_len =
+ std::max<size_t>(value_size,
+ key_hist[prefix][key_slab].val_map[value_slab].max_len);
+}
+
+void BlueStore::DBHistogram::dump(Formatter *f)
+{
+ f->open_object_section("rocksdb_value_distribution");
+ for (auto i : value_hist) {
+ f->dump_unsigned(get_value_slab_to_range(i.first).data(), i.second);
+ }
+ f->close_section();
+
+ f->open_object_section("rocksdb_key_value_histogram");
+ for (auto i : key_hist) {
+ f->dump_string("prefix", i.first);
+ f->open_object_section("key_hist");
+ for ( auto k : i.second) {
+ f->dump_unsigned(get_key_slab_to_range(k.first).data(), k.second.count);
+ f->dump_unsigned("max_len", k.second.max_len);
+ f->open_object_section("value_hist");
+ for ( auto j : k.second.val_map) {
+ f->dump_unsigned(get_value_slab_to_range(j.first).data(), j.second.count);
+ f->dump_unsigned("max_len", j.second.max_len);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Iterates through the db and collects the stats
+void BlueStore::generate_db_histogram(Formatter *f)
+{
+ //globals
+ uint64_t num_onodes = 0;
+ uint64_t num_shards = 0;
+ uint64_t num_super = 0;
+ uint64_t num_coll = 0;
+ uint64_t num_omap = 0;
+ uint64_t num_pgmeta_omap = 0;
+ uint64_t num_deferred = 0;
+ uint64_t num_alloc = 0;
+ uint64_t num_stat = 0;
+ uint64_t num_others = 0;
+ uint64_t num_shared_shards = 0;
+ size_t max_key_size =0, max_value_size = 0;
+ uint64_t total_key_size = 0, total_value_size = 0;
+ size_t key_size = 0, value_size = 0;
+ DBHistogram hist;
+
+ auto start = coarse_mono_clock::now();
+
+ KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+ iter->seek_to_first();
+ while (iter->valid()) {
+ dout(30) << __func__ << " Key: " << iter->key() << dendl;
+ key_size = iter->key_size();
+ value_size = iter->value_size();
+ hist.value_hist[hist.get_value_slab(value_size)]++;
+ max_key_size = std::max(max_key_size, key_size);
+ max_value_size = std::max(max_value_size, value_size);
+ total_key_size += key_size;
+ total_value_size += value_size;
+
+ pair<string,string> key(iter->raw_key());
+
+ if (key.first == PREFIX_SUPER) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
+ num_super++;
+ } else if (key.first == PREFIX_STAT) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
+ num_stat++;
+ } else if (key.first == PREFIX_COLL) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
+ num_coll++;
+ } else if (key.first == PREFIX_OBJ) {
+ if (key.second.back() == ONODE_KEY_SUFFIX) {
+ hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
+ num_onodes++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
+ num_shards++;
+ }
+ } else if (key.first == PREFIX_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PERPOOL_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PERPG_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
+ num_omap++;
+ } else if (key.first == PREFIX_PGMETA_OMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
+ num_pgmeta_omap++;
+ } else if (key.first == PREFIX_DEFERRED) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
+ num_deferred++;
+ } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
+ num_alloc++;
+ } else if (key.first == PREFIX_SHARED_BLOB) {
+ hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
+ num_shared_shards++;
+ } else {
+ hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
+ num_others++;
+ }
+ iter->next();
+ }
+
+ ceph::timespan duration = coarse_mono_clock::now() - start;
+ f->open_object_section("rocksdb_key_value_stats");
+ f->dump_unsigned("num_onodes", num_onodes);
+ f->dump_unsigned("num_shards", num_shards);
+ f->dump_unsigned("num_super", num_super);
+ f->dump_unsigned("num_coll", num_coll);
+ f->dump_unsigned("num_omap", num_omap);
+ f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
+ f->dump_unsigned("num_deferred", num_deferred);
+ f->dump_unsigned("num_alloc", num_alloc);
+ f->dump_unsigned("num_stat", num_stat);
+ f->dump_unsigned("num_shared_shards", num_shared_shards);
+ f->dump_unsigned("num_others", num_others);
+ f->dump_unsigned("max_key_size", max_key_size);
+ f->dump_unsigned("max_value_size", max_value_size);
+ f->dump_unsigned("total_key_size", total_key_size);
+ f->dump_unsigned("total_value_size", total_value_size);
+ f->close_section();
+
+ hist.dump(f);
+
+ dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
+
+}
+
+void BlueStore::_shutdown_cache()
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : buffer_cache_shards) {
+ i->flush();
+ ceph_assert(i->empty());
+ }
+ for (auto& p : coll_map) {
+ p.second->onode_map.clear();
+ if (!p.second->shared_blob_set.empty()) {
+ derr << __func__ << " stray shared blobs on " << p.first << dendl;
+ p.second->shared_blob_set.dump<0>(cct);
+ }
+ ceph_assert(p.second->onode_map.empty());
+ ceph_assert(p.second->shared_blob_set.empty());
+ }
+ coll_map.clear();
+ for (auto i : onode_cache_shards) {
+ ceph_assert(i->empty());
+ }
+}
+
+// For an external caller.
+// We use a best-effort policy here: we don't care if some pinned
+// onodes/data remain in the cache after this command completes.
+int BlueStore::flush_cache(ostream *os)
+{
+ dout(10) << __func__ << dendl;
+ for (auto i : onode_cache_shards) {
+ i->flush();
+ }
+ for (auto i : buffer_cache_shards) {
+ i->flush();
+ }
+
+ return 0;
+}
+
+void BlueStore::_apply_padding(uint64_t head_pad,
+ uint64_t tail_pad,
+ bufferlist& padded)
+{
+ if (head_pad) {
+ padded.prepend_zero(head_pad);
+ }
+ if (tail_pad) {
+ padded.append_zero(tail_pad);
+ }
+ if (head_pad || tail_pad) {
+ dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad
+ << " tail 0x" << tail_pad << std::dec << dendl;
+ logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
+ }
+}
+
+void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
+{
+ // finalize extent_map shards
+ o->extent_map.update(txn, false);
+ if (o->extent_map.needs_reshard()) {
+ o->extent_map.reshard(db, txn);
+ o->extent_map.update(txn, true);
+ if (o->extent_map.needs_reshard()) {
+ dout(20) << __func__ << " warning: still wants reshard, check options?"
+ << dendl;
+ o->extent_map.clear_needs_reshard();
+ }
+ logger->inc(l_bluestore_onode_reshard);
+ }
+
+ // bound encode
+ size_t bound = 0;
+ denc(o->onode, bound);
+ o->extent_map.bound_encode_spanning_blobs(bound);
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, bound);
+ }
+
+ // encode
+ bufferlist bl;
+ unsigned onode_part, blob_part, extent_part;
+ {
+ auto p = bl.get_contiguous_appender(bound, true);
+ denc(o->onode, p);
+ onode_part = p.get_logical_offset();
+ o->extent_map.encode_spanning_blobs(p);
+ blob_part = p.get_logical_offset() - onode_part;
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, p);
+ }
+ extent_part = p.get_logical_offset() - onode_part - blob_part;
+ }
+
+ dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
+ << " (" << onode_part << " bytes onode + "
+ << blob_part << " bytes spanning blobs + "
+ << extent_part << " bytes inline extents)"
+ << dendl;
+
+
+ txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
+void BlueStore::_log_alerts(osd_alert_list_t& alerts)
+{
+ std::lock_guard l(qlock);
+
+ if (!spurious_read_errors_alert.empty() &&
+ cct->_conf->bluestore_warn_on_spurious_read_errors) {
+ alerts.emplace(
+ "BLUESTORE_SPURIOUS_READ_ERRORS",
+ spurious_read_errors_alert);
+ }
+ if (!disk_size_mismatch_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_DISK_SIZE_MISMATCH",
+ disk_size_mismatch_alert);
+ }
+ if (!legacy_statfs_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_LEGACY_STATFS",
+ legacy_statfs_alert);
+ }
+ if (!spillover_alert.empty() &&
+ cct->_conf->bluestore_warn_on_bluefs_spillover) {
+ alerts.emplace(
+ "BLUEFS_SPILLOVER",
+ spillover_alert);
+ }
+ if (!no_per_pg_omap_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_NO_PER_PG_OMAP",
+ no_per_pg_omap_alert);
+ }
+ if (!no_per_pool_omap_alert.empty()) {
+ alerts.emplace(
+ "BLUESTORE_NO_PER_POOL_OMAP",
+ no_per_pool_omap_alert);
+ }
+ string s0(failed_cmode);
+
+ if (!failed_compressors.empty()) {
+ if (!s0.empty()) {
+ s0 += ", ";
+ }
+ s0 += "unable to load:";
+ bool first = true;
+ for (auto& s : failed_compressors) {
+ if (first) {
+ first = false;
+ } else {
+ s0 += ", ";
+ }
+ s0 += s;
+ }
+ alerts.emplace(
+ "BLUESTORE_NO_COMPRESSION",
+ s0);
+ }
+}
+
+void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
+ size_t extents)
+{
+ alloc_stats_count++;
+ alloc_stats_fragments += extents;
+ alloc_stats_size += need;
+}
+
+void BlueStore::_record_allocation_stats()
+{
+ // we don't care about data consistency here;
+ // fields can be partially modified while the tuple is being built
+ auto t0 = std::make_tuple(
+ alloc_stats_count.exchange(0),
+ alloc_stats_fragments.exchange(0),
+ alloc_stats_size.exchange(0));
+
+ dout(0) << " allocation stats probe "
+ << probe_count << ":"
+ << " cnt: " << std::get<0>(t0)
+ << " frags: " << std::get<1>(t0)
+ << " size: " << std::get<2>(t0)
+ << dendl;
+
+
+ //
+ // Keep the history for probes from the power-of-two sequence:
+ // -1, -2, -4, -8, -16
+ //
+ size_t base = 1;
+ for (auto& t : alloc_stats_history) {
+ dout(0) << " probe -"
+ << base + (probe_count % base) << ": "
+ << std::get<0>(t)
+ << ", " << std::get<1>(t)
+ << ", " << std::get<2>(t)
+ << dendl;
+ base <<= 1;
+ }
+ dout(0) << "------------" << dendl;
+
+ ++probe_count;
+
+ for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
+ if ((probe_count % (1 << i)) == 0) {
+ alloc_stats_history[i] = alloc_stats_history[i - 1];
+ }
+ }
+ alloc_stats_history[0].swap(t0);
+}
+
+// ===========================================
+// BlueStoreRepairer
+
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+ const interval_set<uint64_t>& extents)
+{
+ ceph_assert(granularity); // initialized
+ // can't call for the second time
+ ceph_assert(!was_filtered_out);
+ ceph_assert(collections_bfs.size() == objects_bfs.size());
+
+ uint64_t prev_pos = 0;
+ uint64_t npos = collections_bfs.size();
+
+ bloom_vector collections_reduced;
+ bloom_vector objects_reduced;
+
+ for (auto e : extents) {
+ if (e.second == 0) {
+ continue;
+ }
+ uint64_t pos = max(e.first / granularity, prev_pos);
+ uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+ while (pos != npos && pos < end_pos) {
+ ceph_assert( collections_bfs[pos].element_count() ==
+ objects_bfs[pos].element_count());
+ if (collections_bfs[pos].element_count()) {
+ collections_reduced.push_back(std::move(collections_bfs[pos]));
+ objects_reduced.push_back(std::move(objects_bfs[pos]));
+ }
+ ++pos;
+ }
+ prev_pos = end_pos;
+ }
+ collections_reduced.swap(collections_bfs);
+ objects_reduced.swap(objects_bfs);
+ was_filtered_out = true;
+ return collections_bfs.size();
+}
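+// Illustrative example (an assumption, not part of this change): with
+// granularity = 0x100000 and a single misreferenced extent covering
+// [0x0, 0x200000), only bloom-filter slots 0 and 1 survive (if non-empty):
+//
+//   interval_set<uint64_t> extents;
+//   extents.insert(0x0, 0x200000);
+//   size_t remaining = tracker.filter_out(extents);  // slots beyond 1 are dropped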
+
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+ const string& prefix,
+ const string& key)
+{
+ std::lock_guard l(lock);
+ if (!remove_key_txn) {
+ remove_key_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ remove_key_txn->rmkey(prefix, key);
+
+ return true;
+}
+
+void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
+{
+ std::lock_guard l(lock); // possibly redundant
+ ceph_assert(fix_per_pool_omap_txn == nullptr);
+ fix_per_pool_omap_txn = db->get_transaction();
+ ++to_repair_cnt;
+ bufferlist bl;
+ bl.append(stringify(val));
+ fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", bl);
+}
+
+bool BlueStoreRepairer::fix_shared_blob(
+ KeyValueDB::Transaction txn,
+ uint64_t sbid,
+ bluestore_extent_ref_map_t* ref_map,
+ size_t repaired)
+{
+ string key;
+ get_shared_blob_key(sbid, &key);
+ if (ref_map) {
+ bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
+ bufferlist bl;
+ encode(persistent, bl);
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ } else {
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ to_repair_cnt += repaired;
+ return true;
+}
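+// Illustrative sketch (an assumption, not part of this change): unlike the
+// other fix_* helpers, fix_shared_blob() writes into a caller-provided
+// transaction, which the caller then submits:
+//
+//   KeyValueDB::Transaction t = db->get_transaction();
+//   repairer.fix_shared_blob(t, sbid, nullptr);  // nullptr drops the stale record
+//   db->submit_transaction_sync(t);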
+
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+ const string& key,
+ const store_statfs_t& new_statfs)
+{
+ std::lock_guard l(lock);
+ if (!fix_statfs_txn) {
+ fix_statfs_txn = db->get_transaction();
+ }
+ BlueStore::volatile_statfs vstatfs;
+ vstatfs = new_statfs;
+ bufferlist bl;
+ vstatfs.encode(bl);
+ ++to_repair_cnt;
+ fix_statfs_txn->set(PREFIX_STAT, key, bl);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ std::lock_guard l(lock);
+ if (!fix_fm_leaked_txn) {
+ fix_fm_leaked_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->release(offset, len, fix_fm_leaked_txn);
+ return true;
+}
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ std::lock_guard l(lock);
+ if (!fix_fm_false_free_txn) {
+ fix_fm_false_free_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->allocate(offset, len, fix_fm_false_free_txn);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_spanning_blobs(
+ KeyValueDB* db,
+ std::function<void(KeyValueDB::Transaction)> f)
+{
+ std::lock_guard l(lock);
+ if (!fix_onode_txn) {
+ fix_onode_txn = db->get_transaction();
+ }
+ f(fix_onode_txn);
+ ++to_repair_cnt;
+ return true;
+}
+
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
+ // NB: not for use in multithreaded mode!
+ if (misreferenced_extents.size()) {
+ size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+ ceph_assert(n > 0);
+ if (!fix_misreferences_txn) {
+ fix_misreferences_txn = db->get_transaction();
+ }
+ return true;
+ }
+ return false;
+}
+
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+ // NB: not for use in multithreaded mode!
+ if (fix_per_pool_omap_txn) {
+ db->submit_transaction_sync(fix_per_pool_omap_txn);
+ fix_per_pool_omap_txn = nullptr;
+ }
+ if (fix_fm_leaked_txn) {
+ db->submit_transaction_sync(fix_fm_leaked_txn);
+ fix_fm_leaked_txn = nullptr;
+ }
+ if (fix_fm_false_free_txn) {
+ db->submit_transaction_sync(fix_fm_false_free_txn);
+ fix_fm_false_free_txn = nullptr;
+ }
+ if (remove_key_txn) {
+ db->submit_transaction_sync(remove_key_txn);
+ remove_key_txn = nullptr;
+ }
+ if (fix_misreferences_txn) {
+ db->submit_transaction_sync(fix_misreferences_txn);
+ fix_misreferences_txn = nullptr;
+ }
+ if (fix_onode_txn) {
+ db->submit_transaction_sync(fix_onode_txn);
+ fix_onode_txn = nullptr;
+ }
+ if (fix_shared_blob_txn) {
+ db->submit_transaction_sync(fix_shared_blob_txn);
+ fix_shared_blob_txn = nullptr;
+ }
+
+ if (fix_statfs_txn) {
+ db->submit_transaction_sync(fix_statfs_txn);
+ fix_statfs_txn = nullptr;
+ }
+ if (need_compact) {
+ db->compact();
+ need_compact = false;
+ }
+ unsigned repaired = to_repair_cnt;
+ to_repair_cnt = 0;
+ return repaired;
+}
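+// Illustrative usage sketch (an assumption, not part of this change):
+// fsck/repair code queues individual fixes and then applies them in one go;
+// `prefix`, `stale_key`, `statfs_key` and `expected` below are hypothetical:
+//
+//   BlueStoreRepairer repairer;
+//   repairer.remove_key(db, prefix, stale_key);
+//   repairer.fix_statfs(db, statfs_key, expected);
+//   unsigned fixed = repairer.apply(db);  // submits queued txns, returns repair count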
+
+// =======================================================
+// RocksDBBlueFSVolumeSelector
+
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+ ceph_assert(h != nullptr);
+ uint64_t hint = reinterpret_cast<uint64_t>(h);
+ uint8_t res;
+ switch (hint) {
+ case LEVEL_SLOW:
+ res = BlueFS::BDEV_SLOW;
+ if (db_avail4slow > 0) {
+ // considering statically available db space vs.
+ // - observed maximums on DB dev for DB/WAL/UNSORTED data
+ // - observed maximum spillovers
+ uint64_t max_db_use = 0; // max db usage we potentially observed
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+ // this could go to db hence using it in the estimation
+ max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+ auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+ uint64_t avail = min(
+ db_avail4slow,
+ max_db_use < db_total ? db_total - max_db_use : 0);
+
+ // considering current DB dev usage for SLOW data
+ if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+ res = BlueFS::BDEV_DB;
+ }
+ }
+ break;
+ case LEVEL_LOG:
+ case LEVEL_WAL:
+ res = BlueFS::BDEV_WAL;
+ break;
+ case LEVEL_DB:
+ default:
+ res = BlueFS::BDEV_DB;
+ break;
+ }
+ return res;
+}
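+// Worked example (an assumption, not part of this change): with a 10 GiB DB
+// device, observed DB-device maximums (LOG+WAL+DB plus DB-level spillover)
+// totalling 6 GiB and db_avail4slow = 3 GiB, avail = min(3 GiB, 10 - 6 GiB)
+// = 3 GiB; a LEVEL_SLOW file is placed on BDEV_DB while the current SLOW
+// usage on that device stays below 3 GiB, otherwise it stays on BDEV_SLOW.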
+
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+ auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
+ res.emplace_back(base, db_size);
+ auto slow_size = l_totals[LEVEL_SLOW - LEVEL_FIRST];
+ if (slow_size == 0) {
+ slow_size = db_size;
+ }
+ res.emplace_back(base + ".slow", slow_size);
+}
+
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+ uint8_t res = LEVEL_DB;
+ if (dirname.length() > 5) {
+ // the "db.slow" and "db.wal" directory names are hard-coded at
+ // match up with bluestore. the slow device is always the second
+ // one (when a dedicated block.db device is present and used at
+ // bdev 0). the wal device is always last.
+ if (boost::algorithm::ends_with(dirname, ".slow")) {
+ res = LEVEL_SLOW;
+ }
+ else if (boost::algorithm::ends_with(dirname, ".wal")) {
+ res = LEVEL_WAL;
+ }
+ }
+ return reinterpret_cast<void*>(res);
+}
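+// Illustrative mapping derived from the code above: "db" -> LEVEL_DB,
+// "db.slow" -> LEVEL_SLOW, "db.wal" -> LEVEL_WAL; any other directory name
+// defaults to LEVEL_DB.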
+
+void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
+ auto max_x = per_level_per_dev_usage.get_max_x();
+ auto max_y = per_level_per_dev_usage.get_max_y();
+ sout << "RocksDBBlueFSVolumeSelector: wal_total:" << l_totals[LEVEL_WAL - LEVEL_FIRST]
+ << ", db_total:" << l_totals[LEVEL_DB - LEVEL_FIRST]
+ << ", slow_total:" << l_totals[LEVEL_SLOW - LEVEL_FIRST]
+ << ", db_avail:" << db_avail4slow << std::endl
+ << "Usage matrix:" << std::endl;
+ constexpr std::array<const char*, 8> names{ {
+ "DEV/LEV",
+ "WAL",
+ "DB",
+ "SLOW",
+ "*",
+ "*",
+ "REAL",
+ "FILES",
+ } };
+ const size_t width = 12;
+ for (size_t i = 0; i < names.size(); ++i) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << names[i];
+ }
+ sout << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTALS"; break;
+ }
+ for (size_t d = 0; d < max_x; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(per_level_files[l]) << std::endl;
+ }
+ ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+ ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+ sout << "MAXIMUMS:" << std::endl;
+ for (size_t l = 0; l < max_y; l++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ switch (l + LEVEL_FIRST) {
+ case LEVEL_LOG:
+ sout << "LOG"; break;
+ case LEVEL_WAL:
+ sout << "WAL"; break;
+ case LEVEL_DB:
+ sout << "DB"; break;
+ case LEVEL_SLOW:
+ sout << "SLOW"; break;
+ case LEVEL_MAX:
+ sout << "TOTALS"; break;
+ }
+ for (size_t d = 0; d < max_x - 1; d++) {
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
+ }
+ sout.setf(std::ios::left, std::ios::adjustfield);
+ sout.width(width);
+ sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
+ if (l < max_y - 1) {
+ sout << std::endl;
+ }
+ }
+}
+
+// =======================================================
+// =======================================================
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
new file mode 100644
index 000000000..58a718da8
--- /dev/null
+++ b/src/os/bluestore/BlueStore.h
@@ -0,0 +1,3932 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_H
+#define CEPH_OSD_BLUESTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <chrono>
+#include <ratio>
+#include <mutex>
+#include <condition_variable>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/unordered_set.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/circular_buffer.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "include/ceph_assert.h"
+#include "include/interval_set.h"
+#include "include/unordered_map.h"
+#include "include/mempool.h"
+#include "include/hash.h"
+#include "common/bloom_filter.hpp"
+#include "common/Finisher.h"
+#include "common/ceph_mutex.h"
+#include "common/Throttle.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "compressor/Compressor.h"
+#include "os/ObjectStore.h"
+
+#include "bluestore_types.h"
+#include "BlueFS.h"
+#include "common/EventTrace.h"
+
+#ifdef WITH_BLKIN
+#include "common/zipkin_trace.h"
+#endif
+
+class Allocator;
+class FreelistManager;
+class BlueStoreRepairer;
+
+//#define DEBUG_CACHE
+//#define DEBUG_DEFERRED
+
+
+
+// constants for Buffer::optimize()
+#define MAX_BUFFER_SLOP_RATIO_DEN 8 // so actually 1/N
+
+
+enum {
+ l_bluestore_first = 732430,
+ l_bluestore_kv_flush_lat,
+ l_bluestore_kv_commit_lat,
+ l_bluestore_kv_sync_lat,
+ l_bluestore_kv_final_lat,
+ l_bluestore_state_prepare_lat,
+ l_bluestore_state_aio_wait_lat,
+ l_bluestore_state_io_done_lat,
+ l_bluestore_state_kv_queued_lat,
+ l_bluestore_state_kv_committing_lat,
+ l_bluestore_state_kv_done_lat,
+ l_bluestore_state_deferred_queued_lat,
+ l_bluestore_state_deferred_aio_wait_lat,
+ l_bluestore_state_deferred_cleanup_lat,
+ l_bluestore_state_finishing_lat,
+ l_bluestore_state_done_lat,
+ l_bluestore_throttle_lat,
+ l_bluestore_submit_lat,
+ l_bluestore_commit_lat,
+ l_bluestore_read_lat,
+ l_bluestore_read_onode_meta_lat,
+ l_bluestore_read_wait_aio_lat,
+ l_bluestore_compress_lat,
+ l_bluestore_decompress_lat,
+ l_bluestore_csum_lat,
+ l_bluestore_compress_success_count,
+ l_bluestore_compress_rejected_count,
+ l_bluestore_write_pad_bytes,
+ l_bluestore_deferred_write_ops,
+ l_bluestore_deferred_write_bytes,
+ l_bluestore_write_penalty_read_ops,
+ l_bluestore_allocated,
+ l_bluestore_stored,
+ l_bluestore_compressed,
+ l_bluestore_compressed_allocated,
+ l_bluestore_compressed_original,
+ l_bluestore_onodes,
+ l_bluestore_pinned_onodes,
+ l_bluestore_onode_hits,
+ l_bluestore_onode_misses,
+ l_bluestore_onode_shard_hits,
+ l_bluestore_onode_shard_misses,
+ l_bluestore_extents,
+ l_bluestore_blobs,
+ l_bluestore_buffers,
+ l_bluestore_buffer_bytes,
+ l_bluestore_buffer_hit_bytes,
+ l_bluestore_buffer_miss_bytes,
+ l_bluestore_write_big,
+ l_bluestore_write_big_bytes,
+ l_bluestore_write_big_blobs,
+ l_bluestore_write_big_deferred,
+ l_bluestore_write_small,
+ l_bluestore_write_small_bytes,
+ l_bluestore_write_small_unused,
+ l_bluestore_write_deferred,
+ l_bluestore_write_deferred_bytes,
+ l_bluestore_write_small_pre_read,
+ l_bluestore_write_new,
+ l_bluestore_txc,
+ l_bluestore_onode_reshard,
+ l_bluestore_blob_split,
+ l_bluestore_extent_compress,
+ l_bluestore_gc_merged,
+ l_bluestore_read_eio,
+ l_bluestore_reads_with_retries,
+ l_bluestore_fragmentation,
+ l_bluestore_omap_seek_to_first_lat,
+ l_bluestore_omap_upper_bound_lat,
+ l_bluestore_omap_lower_bound_lat,
+ l_bluestore_omap_next_lat,
+ l_bluestore_omap_get_keys_lat,
+ l_bluestore_omap_get_values_lat,
+ l_bluestore_clist_lat,
+ l_bluestore_remove_lat,
+ l_bluestore_last
+};
+
+#define META_POOL_ID ((uint64_t)-1ull)
+
+class BlueStore : public ObjectStore,
+ public md_config_obs_t {
+ // -----------------------------------------------------
+ // types
+public:
+ // config observer
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ //handler for discard event
+ void handle_discard(interval_set<uint64_t>& to_release);
+
+ void _set_csum();
+ void _set_compression();
+ void _set_throttle_params();
+ int _set_cache_sizes();
+ void _set_max_defer_interval() {
+ max_defer_interval =
+ cct->_conf.get_val<double>("bluestore_max_defer_interval");
+ }
+
+ struct TransContext;
+
+ typedef std::map<uint64_t, ceph::buffer::list> ready_regions_t;
+
+
+ struct BufferSpace;
+ struct Collection;
+ typedef boost::intrusive_ptr<Collection> CollectionRef;
+
+ struct AioContext {
+ virtual void aio_finish(BlueStore *store) = 0;
+ virtual ~AioContext() {}
+ };
+
+ /// cached buffer
+ struct Buffer {
+ MEMPOOL_CLASS_HELPERS();
+
+ enum {
+ STATE_EMPTY, ///< empty buffer -- used for cache history
+ STATE_CLEAN, ///< clean data that is up to date
+ STATE_WRITING, ///< data that is being written (io not yet complete)
+ };
+ static const char *get_state_name(int s) {
+ switch (s) {
+ case STATE_EMPTY: return "empty";
+ case STATE_CLEAN: return "clean";
+ case STATE_WRITING: return "writing";
+ default: return "???";
+ }
+ }
+ enum {
+ FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
+ // NOTE: fix operator<< when you define a second flag
+ };
+ static const char *get_flag_name(int s) {
+ switch (s) {
+ case FLAG_NOCACHE: return "nocache";
+ default: return "???";
+ }
+ }
+
+ BufferSpace *space;
+ uint16_t state; ///< STATE_*
+ uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
+ uint32_t flags; ///< FLAG_*
+ uint64_t seq;
+ uint32_t offset, length;
+ ceph::buffer::list data;
+
+ boost::intrusive::list_member_hook<> lru_item;
+ boost::intrusive::list_member_hook<> state_item;
+
+ Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
+ unsigned f = 0)
+ : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
+ Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, ceph::buffer::list& b,
+ unsigned f = 0)
+ : space(space), state(s), flags(f), seq(q), offset(o),
+ length(b.length()), data(b) {}
+
+ bool is_empty() const {
+ return state == STATE_EMPTY;
+ }
+ bool is_clean() const {
+ return state == STATE_CLEAN;
+ }
+ bool is_writing() const {
+ return state == STATE_WRITING;
+ }
+
+ uint32_t end() const {
+ return offset + length;
+ }
+
+ void truncate(uint32_t newlen) {
+ ceph_assert(newlen < length);
+ if (data.length()) {
+ ceph::buffer::list t;
+ t.substr_of(data, 0, newlen);
+ data = std::move(t);
+ }
+ length = newlen;
+ }
+ void maybe_rebuild() {
+ if (data.length() &&
+ (data.get_num_buffers() > 1 ||
+ data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN)) {
+ data.rebuild();
+ }
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_string("state", get_state_name(state));
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("data_length", data.length());
+ }
+ };
+
+ struct BufferCacheShard;
+
+ /// map logical extent range (object) onto buffers
+ struct BufferSpace {
+ enum {
+ BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
+ };
+
+ typedef boost::intrusive::list<
+ Buffer,
+ boost::intrusive::member_hook<
+ Buffer,
+ boost::intrusive::list_member_hook<>,
+ &Buffer::state_item> > state_list_t;
+
+ mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
+ buffer_map;
+
+ // we use a bare intrusive list here instead of std::map because
+ // it uses less memory and we expect this to be very small (very
+ // few IOs in flight to the same Blob at the same time).
+ state_list_t writing; ///< writing buffers, sorted by seq, ascending
+
+ ~BufferSpace() {
+ ceph_assert(buffer_map.empty());
+ ceph_assert(writing.empty());
+ }
+
+ void _add_buffer(BufferCacheShard* cache, Buffer* b, int level, Buffer* near) {
+ cache->_audit("_add_buffer start");
+ buffer_map[b->offset].reset(b);
+ if (b->is_writing()) {
+ // we might get already cached data for which resetting the mempool is inappropriate,
+ // hence calling try_assign_to_mempool
+ b->data.try_assign_to_mempool(mempool::mempool_bluestore_writing);
+ if (writing.empty() || writing.rbegin()->seq <= b->seq) {
+ writing.push_back(*b);
+ } else {
+ auto it = writing.begin();
+ while (it->seq < b->seq) {
+ ++it;
+ }
+
+ ceph_assert(it->seq >= b->seq);
+ // note that this will insert b before it
+ // hence the order is maintained
+ writing.insert(it, *b);
+ }
+ } else {
+ b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+ cache->_add(b, level, near);
+ }
+ cache->_audit("_add_buffer end");
+ }
+ void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
+ _rm_buffer(cache, buffer_map.find(b->offset));
+ }
+ void _rm_buffer(BufferCacheShard* cache,
+ std::map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
+ ceph_assert(p != buffer_map.end());
+ cache->_audit("_rm_buffer start");
+ if (p->second->is_writing()) {
+ writing.erase(writing.iterator_to(*p->second));
+ } else {
+ cache->_rm(p->second.get());
+ }
+ buffer_map.erase(p);
+ cache->_audit("_rm_buffer end");
+ }
+
+ std::map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
+ uint32_t offset) {
+ auto i = buffer_map.lower_bound(offset);
+ if (i != buffer_map.begin()) {
+ --i;
+ if (i->first + i->second->length <= offset)
+ ++i;
+ }
+ return i;
+ }
+
+ // must be called under protection of the Cache lock
+ void _clear(BufferCacheShard* cache);
+
+ // return value is the highest cache_private of a trimmed buffer, or 0.
+ int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
+ std::lock_guard l(cache->lock);
+ int ret = _discard(cache, offset, length);
+ cache->_trim();
+ return ret;
+ }
+ int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
+
+ void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list& bl,
+ unsigned flags) {
+ std::lock_guard l(cache->lock);
+ Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
+ flags);
+ b->cache_private = _discard(cache, offset, bl.length());
+ _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
+ cache->_trim();
+ }
+ void _finish_write(BufferCacheShard* cache, uint64_t seq);
+ void did_read(BufferCacheShard* cache, uint32_t offset, ceph::buffer::list& bl) {
+ std::lock_guard l(cache->lock);
+ Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
+ b->cache_private = _discard(cache, offset, bl.length());
+ _add_buffer(cache, b, 1, nullptr);
+ cache->_trim();
+ }
+
+ void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
+ BlueStore::ready_regions_t& res,
+ interval_set<uint32_t>& res_intervals,
+ int flags = 0);
+
+ void truncate(BufferCacheShard* cache, uint32_t offset) {
+ discard(cache, offset, (uint32_t)-1 - offset);
+ }
+
+ void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);
+
+ void dump(BufferCacheShard* cache, ceph::Formatter *f) const {
+ std::lock_guard l(cache->lock);
+ f->open_array_section("buffers");
+ for (auto& i : buffer_map) {
+ f->open_object_section("buffer");
+ ceph_assert(i.first == i.second->offset);
+ i.second->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ };
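+ // Illustrative write lifecycle (an assumption, not part of this change);
+ // `bspace`, `cache` and `txc_seq` are hypothetical:
+ //
+ //   ceph::buffer::list bl; bl.append("data");
+ //   bspace.write(cache, txc_seq, 0, bl, 0);   // buffer enters STATE_WRITING
+ //   // ...once the corresponding txc commits, _finish_write(cache, txc_seq)
+ //   // moves it to STATE_CLEAN (or trims it if FLAG_NOCACHE was set).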
+
+ struct SharedBlobSet;
+
+ /// in-memory shared blob state (incl cached buffers)
+ struct SharedBlob {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref = {0}; ///< reference count
+ bool loaded = false;
+
+ CollectionRef coll;
+ union {
+ uint64_t sbid_unloaded; ///< sbid if persistent isn't loaded
+ bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
+ };
+ BufferSpace bc; ///< buffer cache
+
+ SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
+ if (get_cache()) {
+ get_cache()->add_blob();
+ }
+ }
+ SharedBlob(uint64_t i, Collection *_coll);
+ ~SharedBlob();
+
+ uint64_t get_sbid() const {
+ return loaded ? persistent->sbid : sbid_unloaded;
+ }
+
+ friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
+ friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
+
+ void dump(ceph::Formatter* f) const;
+ friend std::ostream& operator<<(std::ostream& out, const SharedBlob& sb);
+
+ void get() {
+ ++nref;
+ }
+ void put();
+
+ /// get logical references
+ void get_ref(uint64_t offset, uint32_t length);
+
+ /// put logical references, and get back any released extents
+ void put_ref(uint64_t offset, uint32_t length,
+ PExtentVector *r, bool *unshare);
+
+ void finish_write(uint64_t seq);
+
+ friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
+ return l.get_sbid() == r.get_sbid();
+ }
+ inline BufferCacheShard* get_cache() {
+ return coll ? coll->cache : nullptr;
+ }
+ inline SharedBlobSet* get_parent() {
+ return coll ? &(coll->shared_blob_set) : nullptr;
+ }
+ inline bool is_loaded() const {
+ return loaded;
+ }
+
+ };
+ typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
+
+ /// a lookup table of SharedBlobs
+ struct SharedBlobSet {
+ /// protect lookup, insertion, removal
+ ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
+
+ // we use a bare pointer because we don't want to affect the ref
+ // count
+ mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;
+
+ SharedBlobRef lookup(uint64_t sbid) {
+ std::lock_guard l(lock);
+ auto p = sb_map.find(sbid);
+ if (p == sb_map.end() ||
+ p->second->nref == 0) {
+ return nullptr;
+ }
+ return p->second;
+ }
+
+ void add(Collection* coll, SharedBlob *sb) {
+ std::lock_guard l(lock);
+ sb_map[sb->get_sbid()] = sb;
+ sb->coll = coll;
+ }
+
+ bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
+ std::lock_guard l(lock);
+ ceph_assert(sb->get_parent() == this);
+ if (verify_nref_is_zero && sb->nref != 0) {
+ return false;
+ }
+ // only remove if it still points to us
+ auto p = sb_map.find(sb->get_sbid());
+ if (p != sb_map.end() &&
+ p->second == sb) {
+ sb_map.erase(p);
+ }
+ return true;
+ }
+
+ bool empty() {
+ std::lock_guard l(lock);
+ return sb_map.empty();
+ }
+
+ template <int LogLevelV>
+ void dump(CephContext *cct);
+ };
+
+//#define CACHE_BLOB_BL // not sure if this is a win yet or not... :/
+
+ /// in-memory blob metadata and associated cached buffers (if any)
+ struct Blob {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref = {0}; ///< reference count
+ int16_t id = -1; ///< id, for spanning blobs only, >= 0
+ int16_t last_encoded_id = -1; ///< (ephemeral) used during encoding only
+ SharedBlobRef shared_blob; ///< shared blob state (if any)
+
+ private:
+ mutable bluestore_blob_t blob; ///< decoded blob metadata
+#ifdef CACHE_BLOB_BL
+ mutable ceph::buffer::list blob_bl; ///< cached encoded blob, blob is dirty if empty
+#endif
+ /// refs from this shard. ephemeral if id<0, persisted if spanning.
+ bluestore_blob_use_tracker_t used_in_blob;
+
+ public:
+
+ friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
+ friend void intrusive_ptr_release(Blob *b) { b->put(); }
+
+ void dump(ceph::Formatter* f) const;
+ friend std::ostream& operator<<(std::ostream& out, const Blob &b);
+
+ const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
+ return used_in_blob;
+ }
+ bool is_referenced() const {
+ return used_in_blob.is_not_empty();
+ }
+ uint32_t get_referenced_bytes() const {
+ return used_in_blob.get_referenced_bytes();
+ }
+
+ bool is_spanning() const {
+ return id >= 0;
+ }
+
+ bool can_split() const {
+ std::lock_guard l(shared_blob->get_cache()->lock);
+ // splitting a BufferSpace writing list is too hard; don't try.
+ return shared_blob->bc.writing.empty() &&
+ used_in_blob.can_split() &&
+ get_blob().can_split();
+ }
+
+ bool can_split_at(uint32_t blob_offset) const {
+ return used_in_blob.can_split_at(blob_offset) &&
+ get_blob().can_split_at(blob_offset);
+ }
+
+ bool can_reuse_blob(uint32_t min_alloc_size,
+ uint32_t target_blob_size,
+ uint32_t b_offset,
+ uint32_t *length0);
+
+ void dup(Blob& o) {
+ o.shared_blob = shared_blob;
+ o.blob = blob;
+#ifdef CACHE_BLOB_BL
+ o.blob_bl = blob_bl;
+#endif
+ }
+
+ inline const bluestore_blob_t& get_blob() const {
+ return blob;
+ }
+ inline bluestore_blob_t& dirty_blob() {
+#ifdef CACHE_BLOB_BL
+ blob_bl.clear();
+#endif
+ return blob;
+ }
+
+ /// discard buffers for unallocated regions
+ void discard_unallocated(Collection *coll);
+
+ /// get logical references
+ void get_ref(Collection *coll, uint32_t offset, uint32_t length);
+ /// put logical references, and get back any released extents
+ bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
+ PExtentVector *r);
+
+ /// split the blob
+ void split(Collection *coll, uint32_t blob_offset, Blob *o);
+
+ void get() {
+ ++nref;
+ }
+ void put() {
+ if (--nref == 0)
+ delete this;
+ }
+
+
+#ifdef CACHE_BLOB_BL
+ void _encode() const {
+ if (blob_bl.length() == 0 ) {
+ encode(blob, blob_bl);
+ } else {
+ ceph_assert(blob_bl.length());
+ }
+ }
+ void bound_encode(
+ size_t& p,
+ bool include_ref_map) const {
+ _encode();
+ p += blob_bl.length();
+ if (include_ref_map) {
+ used_in_blob.bound_encode(p);
+ }
+ }
+ void encode(
+ ceph::buffer::list::contiguous_appender& p,
+ bool include_ref_map) const {
+ _encode();
+ p.append(blob_bl);
+ if (include_ref_map) {
+ used_in_blob.encode(p);
+ }
+ }
+ void decode(
+ Collection */*coll*/,
+ ceph::buffer::ptr::const_iterator& p,
+ bool include_ref_map) {
+ const char *start = p.get_pos();
+ denc(blob, p);
+ const char *end = p.get_pos();
+ blob_bl.clear();
+ blob_bl.append(start, end - start);
+ if (include_ref_map) {
+ used_in_blob.decode(p);
+ }
+ }
+#else
+ void bound_encode(
+ size_t& p,
+ uint64_t struct_v,
+ uint64_t sbid,
+ bool include_ref_map) const {
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(sbid, p);
+ }
+ if (include_ref_map) {
+ used_in_blob.bound_encode(p);
+ }
+ }
+ void encode(
+ ceph::buffer::list::contiguous_appender& p,
+ uint64_t struct_v,
+ uint64_t sbid,
+ bool include_ref_map) const {
+ denc(blob, p, struct_v);
+ if (blob.is_shared()) {
+ denc(sbid, p);
+ }
+ if (include_ref_map) {
+ used_in_blob.encode(p);
+ }
+ }
+ void decode(
+ Collection *coll,
+ ceph::buffer::ptr::const_iterator& p,
+ uint64_t struct_v,
+ uint64_t* sbid,
+ bool include_ref_map);
+#endif
+ };
+ typedef boost::intrusive_ptr<Blob> BlobRef;
+ typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;
+
+ /// a logical extent, pointing to (some portion of) a blob
+ typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
+ struct Extent : public ExtentBase {
+ MEMPOOL_CLASS_HELPERS();
+
+ uint32_t logical_offset = 0; ///< logical offset
+ uint32_t blob_offset = 0; ///< blob offset
+ uint32_t length = 0; ///< length
+ BlobRef blob; ///< the blob with our data
+
+ /// ctor for lookup only
+ explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
+ /// ctor for delayed initialization (see decode_some())
+ explicit Extent() : ExtentBase() {
+ }
+ /// ctor for general usage
+ Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+ : ExtentBase(),
+ logical_offset(lo), blob_offset(o), length(l) {
+ assign_blob(b);
+ }
+ ~Extent() {
+ if (blob) {
+ blob->shared_blob->get_cache()->rm_extent();
+ }
+ }
+
+ void dump(ceph::Formatter* f) const;
+
+ void assign_blob(const BlobRef& b) {
+ ceph_assert(!blob);
+ blob = b;
+ blob->shared_blob->get_cache()->add_extent();
+ }
+
+ // comparators for intrusive_set
+ friend bool operator<(const Extent &a, const Extent &b) {
+ return a.logical_offset < b.logical_offset;
+ }
+ friend bool operator>(const Extent &a, const Extent &b) {
+ return a.logical_offset > b.logical_offset;
+ }
+ friend bool operator==(const Extent &a, const Extent &b) {
+ return a.logical_offset == b.logical_offset;
+ }
+
+ uint32_t blob_start() const {
+ return logical_offset - blob_offset;
+ }
+
+ uint32_t blob_end() const {
+ return blob_start() + blob->get_blob().get_logical_length();
+ }
+
+ uint32_t logical_end() const {
+ return logical_offset + length;
+ }
+
+ // return true if any piece of the blob is out of
+ // the given range [o, o + l].
+ bool blob_escapes_range(uint32_t o, uint32_t l) const {
+ return blob_start() < o || blob_end() > o + l;
+ }
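+ // Illustrative example: an Extent with logical_offset=0x3000, blob_offset=0x1000,
+ // length=0x1000 over a blob of logical length 0x4000 has blob_start()=0x2000 and
+ // blob_end()=0x6000, so blob_escapes_range(0x2000, 0x4000) == false while
+ // blob_escapes_range(0x3000, 0x2000) == true.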
+ };
+ typedef boost::intrusive::set<Extent> extent_map_t;
+
+
+ friend std::ostream& operator<<(std::ostream& out, const Extent& e);
+
+ struct OldExtent {
+ boost::intrusive::list_member_hook<> old_extent_item;
+ Extent e;
+ PExtentVector r;
+ bool blob_empty; // flag to track the last removed extent that makes blob
+ // empty - required to update compression stat properly
+ OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+ : e(lo, o, l, b), blob_empty(false) {
+ }
+ static OldExtent* create(CollectionRef c,
+ uint32_t lo,
+ uint32_t o,
+ uint32_t l,
+ BlobRef& b);
+ };
+ typedef boost::intrusive::list<
+ OldExtent,
+ boost::intrusive::member_hook<
+ OldExtent,
+ boost::intrusive::list_member_hook<>,
+ &OldExtent::old_extent_item> > old_extent_map_t;
+
+ struct Onode;
+
+ /// a sharded extent map, mapping offsets to lextents to blobs
+ struct ExtentMap {
+ Onode *onode;
+ extent_map_t extent_map; ///< map of Extents to Blobs
+ blob_map_t spanning_blob_map; ///< blobs that span shards
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+ struct Shard {
+ bluestore_onode_t::shard_info *shard_info = nullptr;
+ unsigned extents = 0; ///< count extents in this shard
+ bool loaded = false; ///< true if shard is loaded
+ bool dirty = false; ///< true if shard is dirty and needs reencoding
+ };
+ mempool::bluestore_cache_meta::vector<Shard> shards; ///< shards
+
+ ceph::buffer::list inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
+
+ uint32_t needs_reshard_begin = 0;
+ uint32_t needs_reshard_end = 0;
+
+ void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
+ uint64_t&, uint64_t&, uint64_t&);
+
+ bool needs_reshard() const {
+ return needs_reshard_end > needs_reshard_begin;
+ }
+ void clear_needs_reshard() {
+ needs_reshard_begin = needs_reshard_end = 0;
+ }
+ void request_reshard(uint32_t begin, uint32_t end) {
+ if (begin < needs_reshard_begin) {
+ needs_reshard_begin = begin;
+ }
+ if (end > needs_reshard_end) {
+ needs_reshard_end = end;
+ }
+ }
+
+ struct DeleteDisposer {
+ void operator()(Extent *e) { delete e; }
+ };
+
+ ExtentMap(Onode *o);
+ ~ExtentMap() {
+ extent_map.clear_and_dispose(DeleteDisposer());
+ }
+
+ void clear() {
+ extent_map.clear_and_dispose(DeleteDisposer());
+ shards.clear();
+ inline_bl.clear();
+ clear_needs_reshard();
+ }
+
+ void dump(ceph::Formatter* f) const;
+
+ bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl,
+ unsigned *pn);
+ unsigned decode_some(ceph::buffer::list& bl);
+
+ void bound_encode_spanning_blobs(size_t& p);
+ void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
+ void decode_spanning_blobs(ceph::buffer::ptr::const_iterator& p);
+
+ BlobRef get_spanning_blob(int id) {
+ auto p = spanning_blob_map.find(id);
+ ceph_assert(p != spanning_blob_map.end());
+ return p->second;
+ }
+
+ void update(KeyValueDB::Transaction t, bool force);
+ decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
+ void reshard(
+ KeyValueDB *db,
+ KeyValueDB::Transaction t);
+
+ /// initialize Shards from the onode
+ void init_shards(bool loaded, bool dirty);
+
+ /// return index of shard containing offset
+ /// or -1 if not found
+ int seek_shard(uint32_t offset) {
+ size_t end = shards.size();
+ size_t mid, left = 0;
+ size_t right = end; // one past the right end
+
+ while (left < right) {
+ mid = left + (right - left) / 2;
+ if (offset >= shards[mid].shard_info->offset) {
+ size_t next = mid + 1;
+ if (next >= end || offset < shards[next].shard_info->offset)
+ return mid;
+ //continue to search forwards
+ left = next;
+ } else {
+ //continue to search backwards
+ right = mid;
+ }
+ }
+
+ return -1; // not found
+ }
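+ // Illustrative example: with shard offsets {0x0, 0x10000, 0x20000},
+ // seek_shard(0x15000) returns 1 and seek_shard(0x20000) returns 2.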
+
+ /// check if a range spans a shard
+ bool spans_shard(uint32_t offset, uint32_t length) {
+ if (shards.empty()) {
+ return false;
+ }
+ int s = seek_shard(offset);
+ ceph_assert(s >= 0);
+ if (s == (int)shards.size() - 1) {
+ return false; // last shard
+ }
+ if (offset + length <= shards[s+1].shard_info->offset) {
+ return false;
+ }
+ return true;
+ }
+
+ /// ensure that a range of the map is loaded
+ void fault_range(KeyValueDB *db,
+ uint32_t offset, uint32_t length);
+
+ /// ensure a range of the map is marked dirty
+ void dirty_range(uint32_t offset, uint32_t length);
+
+ /// for seek_lextent test
+ extent_map_t::iterator find(uint64_t offset);
+
+ /// seek to the first lextent including or after offset
+ extent_map_t::iterator seek_lextent(uint64_t offset);
+ extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
+
+ /// add a new Extent
+ void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
+ extent_map.insert(*new Extent(lo, o, l, b));
+ }
+
+ /// remove (and delete) an Extent
+ void rm(extent_map_t::iterator p) {
+ extent_map.erase_and_dispose(p, DeleteDisposer());
+ }
+
+ bool has_any_lextents(uint64_t offset, uint64_t length);
+
+ /// consolidate adjacent lextents in extent_map
+ int compress_extent_map(uint64_t offset, uint64_t length);
+
+ /// punch a logical hole. add lextents to deref to target list.
+ void punch_hole(CollectionRef &c,
+ uint64_t offset, uint64_t length,
+ old_extent_map_t *old_extents);
+
+ /// put new lextent into lextent_map overwriting existing ones if
+ /// any and update references accordingly
+ Extent *set_lextent(CollectionRef &c,
+ uint64_t logical_offset,
+ uint64_t offset, uint64_t length,
+ BlobRef b,
+ old_extent_map_t *old_extents);
+
+ /// split a blob (and referring extents)
+ BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
+ };
+
+ /// Compressed Blob Garbage collector
+ /*
+ The primary idea of the collector is to estimate the difference between the
+ allocation units (AUs) currently occupied by compressed blobs and the new AUs
+ required to store the same data uncompressed.
+ Estimation is performed for protrusive extents within a logical range
+ determined by concatenating the old_extents collection with the specific
+ (current) write request.
+ The root cause for using old_extents is the need to handle blob ref counts
+ properly. Old extents still hold blob refs, hence we need to traverse
+ the collection to determine whether a blob is to be released.
+ Protrusive extents are extents that fit into the blob set in action
+ (ones that are below the logical range from above) but are not removed totally
+ due to the current write.
+ E.g. for
+ extent1 <loffs = 100, boffs = 100, len = 100> ->
+ blob1<compressed, len_on_disk=4096, logical_len=8192>
+ extent2 <loffs = 200, boffs = 200, len = 100> ->
+ blob2<raw, len_on_disk=4096, llen=4096>
+ extent3 <loffs = 300, boffs = 300, len = 100> ->
+ blob1<compressed, len_on_disk=4096, llen=8192>
+ extent4 <loffs = 4096, boffs = 0, len = 100> ->
+ blob3<raw, len_on_disk=4096, llen=4096>
+ write(300~100)
+ protrusive extents are within the following ranges <0~300, 400~8192-400>
+ In this case existing AUs that might be removed due to GC (i.e. blob1)
+ use 2x4K bytes.
+ And new AUs expected after GC = 0 since extent1 is to be merged into blob2.
+ Hence we should collect.
+ */
+ class GarbageCollector
+ {
+ public:
+ /// return amount of allocation units that might be saved due to GC
+ int64_t estimate(
+ uint64_t offset,
+ uint64_t length,
+ const ExtentMap& extent_map,
+ const old_extent_map_t& old_extents,
+ uint64_t min_alloc_size);
+
+ /// return a collection of extents to perform GC on
+ const interval_set<uint64_t>& get_extents_to_collect() const {
+ return extents_to_collect;
+ }
+ GarbageCollector(CephContext* _cct) : cct(_cct) {}
+
+ private:
+ struct BlobInfo {
+ uint64_t referenced_bytes = 0; ///< amount of bytes referenced in blob
+ int64_t expected_allocations = 0; ///< new alloc units required
+ ///< in case of gc fulfilled
+ bool collect_candidate = false; ///< indicate if blob has any extents
+ ///< eligible for GC.
+ extent_map_t::const_iterator first_lextent; ///< points to the first
+ ///< lextent referring to
+ ///< the blob if any.
+ ///< collect_candidate flag
+ ///< determines the validity
+ extent_map_t::const_iterator last_lextent; ///< points to the last
+ ///< lextent referring to
+ ///< the blob if any.
+
+ BlobInfo(uint64_t ref_bytes) :
+ referenced_bytes(ref_bytes) {
+ }
+ };
+ CephContext* cct;
+ std::map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
+ ///< copies that are affected by the
+ ///< specific write
+
+ ///< protrusive extents that should be collected if GC takes place
+ interval_set<uint64_t> extents_to_collect;
+
+ boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
+ ///< unit when traversing
+ ///< protrusive extents.
+ ///< Other extents mapped to
+ ///< this AU to be ignored
+ ///< (except the case where
+ ///< uncompressed extent follows
+ ///< compressed one - see below).
+ BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
+ ///< caused expected_allocations
+ ///< counter increment at this blob.
+ ///< if uncompressed extent follows
+ ///< a decrement for the
+ ///< expected_allocations counter
+ ///< is needed
+ int64_t expected_allocations = 0; ///< new alloc units required in case
+ ///< of gc fulfilled
+ int64_t expected_for_release = 0; ///< alloc units currently used by
+ ///< compressed blobs that might
+ ///< be gone after GC
+
+ protected:
+ void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
+ uint64_t start_offset,
+ uint64_t end_offset,
+ uint64_t start_touch_offset,
+ uint64_t end_touch_offset,
+ uint64_t min_alloc_size);
+ };
+
+ struct OnodeSpace;
+ /// an in-memory object
+ struct Onode {
+ MEMPOOL_CLASS_HELPERS();
+
+ std::atomic_int nref; ///< reference count
+ std::atomic_int put_nref = {0};
+ Collection *c;
+ ghobject_t oid;
+
+ /// key under PREFIX_OBJ where we are stored
+ mempool::bluestore_cache_meta::string key;
+
+ boost::intrusive::list_member_hook<> lru_item;
+
+ bluestore_onode_t onode; ///< metadata stored as value in kv store
+ bool exists; ///< true if object logically exists
+ bool cached; ///< Onode is logically in the cache
+ /// (it can be pinned and hence physically out
+ /// of it at the moment though)
+ std::atomic_bool pinned; ///< Onode is pinned
+ /// (or should be pinned when cached)
+ ExtentMap extent_map;
+
+ // track txc's that have not been committed to kv store (and whose
+ // effects cannot be read via the kvdb read methods)
+ std::atomic<int> flushing_count = {0};
+ std::atomic<int> waiting_count = {0};
+ /// protect flush_txns
+ ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
+ ceph::condition_variable flush_cond; ///< wait here for uncommitted txns
+
+ Onode(Collection *c, const ghobject_t& o,
+ const mempool::bluestore_cache_meta::string& k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this) {
+ }
+ Onode(Collection* c, const ghobject_t& o,
+ const std::string& k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this) {
+ }
+ Onode(Collection* c, const ghobject_t& o,
+ const char* k)
+ : nref(0),
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this) {
+ }
+
+ static Onode* decode(
+ CollectionRef c,
+ const ghobject_t& oid,
+ const std::string& key,
+ const ceph::buffer::list& v);
+
+ void dump(ceph::Formatter* f) const;
+
+ void flush();
+ void get();
+ void put();
+
+ inline bool put_cache() {
+ ceph_assert(!cached);
+ cached = true;
+ return !pinned;
+ }
+ inline bool pop_cache() {
+ ceph_assert(cached);
+ cached = false;
+ return !pinned;
+ }
+
+ static const std::string& calc_omap_prefix(uint8_t flags);
+ static void calc_omap_header(uint8_t flags, const Onode* o,
+ std::string* out);
+ static void calc_omap_key(uint8_t flags, const Onode* o,
+ const std::string& key, std::string* out);
+ static void calc_omap_tail(uint8_t flags, const Onode* o,
+ std::string* out);
+
+ const std::string& get_omap_prefix() {
+ return calc_omap_prefix(onode.flags);
+ }
+ void get_omap_header(std::string* out) {
+ calc_omap_header(onode.flags, this, out);
+ }
+ void get_omap_key(const std::string& key, std::string* out) {
+ calc_omap_key(onode.flags, this, key, out);
+ }
+ void get_omap_tail(std::string* out) {
+ calc_omap_tail(onode.flags, this, out);
+ }
+
+ void rewrite_omap_key(const std::string& old, std::string *out);
+ void decode_omap_key(const std::string& key, std::string *user_key);
+
+ // Return the offset of an object on disk. This function is intended *only*
+ // for use with zoned storage devices because in these devices, the objects
+ // are laid out contiguously on disk, which is not the case in general.
+ // Also, it should always be called after calling extent_map.fault_range(),
+ // so that the extent map is loaded.
+ int64_t zoned_get_ondisk_starting_offset() const {
+ return extent_map.extent_map.begin()->blob->
+ get_blob().calc_offset(0, nullptr);
+ }
+ };
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+ /// A generic Cache Shard
+ struct CacheShard {
+ CephContext *cct;
+ PerfCounters *logger;
+
+ /// protect lru and other structures
+ ceph::recursive_mutex lock = {
+ ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };
+
+ std::atomic<uint64_t> max = {0};
+ std::atomic<uint64_t> num = {0};
+
+ CacheShard(CephContext* cct) : cct(cct), logger(nullptr) {}
+ virtual ~CacheShard() {}
+
+ void set_max(uint64_t max_) {
+ max = max_;
+ }
+
+ uint64_t _get_num() {
+ return num;
+ }
+
+ virtual void _trim_to(uint64_t new_size) = 0;
+ void _trim() {
+ if (cct->_conf->objectstore_blackhole) {
+ // do not trim if we are throwing away IOs a layer down
+ return;
+ }
+ _trim_to(max);
+ }
+
+ void trim() {
+ std::lock_guard l(lock);
+ _trim();
+ }
+ void flush() {
+ std::lock_guard l(lock);
+ // we should not be shutting down after the blackhole is enabled
+ assert(!cct->_conf->objectstore_blackhole);
+ _trim_to(0);
+ }
+
+#ifdef DEBUG_CACHE
+ virtual void _audit(const char *s) = 0;
+#else
+ void _audit(const char *s) { /* no-op */ }
+#endif
+ };
+
+ /// A Generic onode Cache Shard
+ struct OnodeCacheShard : public CacheShard {
+ std::atomic<uint64_t> num_pinned = {0};
+
+ std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;
+
+ virtual void _pin(Onode* o) = 0;
+ virtual void _unpin(Onode* o) = 0;
+
+ public:
+ OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
+ static OnodeCacheShard *create(CephContext* cct, std::string type,
+ PerfCounters *logger);
+ virtual void _add(Onode* o, int level) = 0;
+ virtual void _rm(Onode* o) = 0;
+ virtual void _unpin_and_rm(Onode* o) = 0;
+
+ virtual void move_pinned(OnodeCacheShard *to, Onode *o) = 0;
+ virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
+ bool empty() {
+ return _get_num() == 0;
+ }
+ };
+
+ /// A Generic buffer Cache Shard
+ struct BufferCacheShard : public CacheShard {
+ std::atomic<uint64_t> num_extents = {0};
+ std::atomic<uint64_t> num_blobs = {0};
+ uint64_t buffer_bytes = 0;
+
+ public:
+ BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
+ static BufferCacheShard *create(CephContext* cct, std::string type,
+ PerfCounters *logger);
+ virtual void _add(Buffer *b, int level, Buffer *near) = 0;
+ virtual void _rm(Buffer *b) = 0;
+ virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
+ virtual void _touch(Buffer *b) = 0;
+ virtual void _adjust_size(Buffer *b, int64_t delta) = 0;
+
+ uint64_t _get_bytes() {
+ return buffer_bytes;
+ }
+
+ void add_extent() {
+ ++num_extents;
+ }
+ void rm_extent() {
+ --num_extents;
+ }
+
+ void add_blob() {
+ ++num_blobs;
+ }
+ void rm_blob() {
+ --num_blobs;
+ }
+
+ virtual void add_stats(uint64_t *extents,
+ uint64_t *blobs,
+ uint64_t *buffers,
+ uint64_t *bytes) = 0;
+
+ bool empty() {
+ std::lock_guard l(lock);
+ return _get_bytes() == 0;
+ }
+ };
+
+ struct OnodeSpace {
+ OnodeCacheShard *cache;
+
+ private:
+ /// forward lookups
+ mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;
+
+ friend struct Collection; // for split_cache()
+ friend struct Onode; // for put()
+ friend struct LruOnodeCacheShard;
+ void _remove(const ghobject_t& oid);
+ public:
+ OnodeSpace(OnodeCacheShard *c) : cache(c) {}
+ ~OnodeSpace() {
+ clear();
+ }
+
+ OnodeRef add(const ghobject_t& oid, OnodeRef& o);
+ OnodeRef lookup(const ghobject_t& o);
+ void rename(OnodeRef& o, const ghobject_t& old_oid,
+ const ghobject_t& new_oid,
+ const mempool::bluestore_cache_meta::string& new_okey);
+ void clear();
+ bool empty();
+
+ template <int LogLevelV>
+ void dump(CephContext *cct);
+
+ /// return true if f true for any item
+ bool map_any(std::function<bool(Onode*)> f);
+ };
+
+ class OpSequencer;
+ using OpSequencerRef = ceph::ref_t<OpSequencer>;
+
+ struct Collection : public CollectionImpl {
+ BlueStore *store;
+ OpSequencerRef osr;
+ BufferCacheShard *cache; ///< our cache shard
+ bluestore_cnode_t cnode;
+ ceph::shared_mutex lock =
+ ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);
+
+ bool exists;
+
+ SharedBlobSet shared_blob_set; ///< open SharedBlobs
+
+ // cache onodes on a per-collection basis to avoid lock
+ // contention.
+ OnodeSpace onode_map;
+
+ //pool options
+ pool_opts_t pool_opts;
+ ContextQueue *commit_queue;
+
+ OnodeCacheShard* get_onode_cache() const {
+ return onode_map.cache;
+ }
+ OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);
+
+ // the terminology is confusing here, sorry!
+ //
+ // blob_t shared_blob_t
+ // !shared unused -> open
+ // shared !loaded -> open + shared
+ // shared loaded -> open + shared + loaded
+ //
+ // i.e.,
+ // open = SharedBlob is instantiated
+ // shared = blob_t shared flag is set; SharedBlob is hashed.
+ // loaded = SharedBlob::shared_blob_t is loaded from kv store
+ void open_shared_blob(uint64_t sbid, BlobRef b);
+ void load_shared_blob(SharedBlobRef sb);
+ void make_blob_shared(uint64_t sbid, BlobRef b);
+ uint64_t make_blob_unshared(SharedBlob *sb);
+
+ BlobRef new_blob() {
+ BlobRef b = new Blob();
+ b->shared_blob = new SharedBlob(this);
+ return b;
+ }
+
+ bool contains(const ghobject_t& oid) {
+ if (cid.is_meta())
+ return oid.hobj.pool == -1;
+ spg_t spgid;
+ if (cid.is_pg(&spgid))
+ return
+ spgid.pgid.contains(cnode.bits, oid) &&
+ oid.shard_id == spgid.shard;
+ return false;
+ }
+
+ int64_t pool() const {
+ return cid.pool();
+ }
+
+ void split_cache(Collection *dest);
+
+ bool flush_commit(Context *c) override;
+ void flush() override;
+ void flush_all_but_last();
+
+ Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
+ };
+
+ class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ OnodeRef o;
+ KeyValueDB::Iterator it;
+ std::string head, tail;
+
+ std::string _stringify() const;
+
+ public:
+ OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+ int seek_to_first() override;
+ int upper_bound(const std::string &after) override;
+ int lower_bound(const std::string &to) override;
+ bool valid() override;
+ int next() override;
+ std::string key() override;
+ ceph::buffer::list value() override;
+ std::string tail_key() override {
+ return tail;
+ }
+
+ int status() override {
+ return 0;
+ }
+ };
+
+ struct volatile_statfs{
+ enum {
+ STATFS_ALLOCATED = 0,
+ STATFS_STORED,
+ STATFS_COMPRESSED_ORIGINAL,
+ STATFS_COMPRESSED,
+ STATFS_COMPRESSED_ALLOCATED,
+ STATFS_LAST
+ };
+ int64_t values[STATFS_LAST];
+ volatile_statfs() {
+ memset(this, 0, sizeof(volatile_statfs));
+ }
+ void reset() {
+ *this = volatile_statfs();
+ }
+ void publish(store_statfs_t* buf) const {
+ buf->allocated = allocated();
+ buf->data_stored = stored();
+ buf->data_compressed = compressed();
+ buf->data_compressed_original = compressed_original();
+ buf->data_compressed_allocated = compressed_allocated();
+ }
+
+ volatile_statfs& operator+=(const volatile_statfs& other) {
+ for (size_t i = 0; i < STATFS_LAST; ++i) {
+ values[i] += other.values[i];
+ }
+ return *this;
+ }
+ int64_t& allocated() {
+ return values[STATFS_ALLOCATED];
+ }
+ int64_t& stored() {
+ return values[STATFS_STORED];
+ }
+ int64_t& compressed_original() {
+ return values[STATFS_COMPRESSED_ORIGINAL];
+ }
+ int64_t& compressed() {
+ return values[STATFS_COMPRESSED];
+ }
+ int64_t& compressed_allocated() {
+ return values[STATFS_COMPRESSED_ALLOCATED];
+ }
+ int64_t allocated() const {
+ return values[STATFS_ALLOCATED];
+ }
+ int64_t stored() const {
+ return values[STATFS_STORED];
+ }
+ int64_t compressed_original() const {
+ return values[STATFS_COMPRESSED_ORIGINAL];
+ }
+ int64_t compressed() const {
+ return values[STATFS_COMPRESSED];
+ }
+ int64_t compressed_allocated() const {
+ return values[STATFS_COMPRESSED_ALLOCATED];
+ }
+ volatile_statfs& operator=(const store_statfs_t& st) {
+ values[STATFS_ALLOCATED] = st.allocated;
+ values[STATFS_STORED] = st.data_stored;
+ values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
+ values[STATFS_COMPRESSED] = st.data_compressed;
+ values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
+ return *this;
+ }
+ bool is_empty() {
+ return values[STATFS_ALLOCATED] == 0 &&
+ values[STATFS_STORED] == 0 &&
+ values[STATFS_COMPRESSED] == 0 &&
+ values[STATFS_COMPRESSED_ORIGINAL] == 0 &&
+ values[STATFS_COMPRESSED_ALLOCATED] == 0;
+ }
+ void decode(ceph::buffer::list::const_iterator& it) {
+ using ceph::decode;
+ for (size_t i = 0; i < STATFS_LAST; i++) {
+ decode(values[i], it);
+ }
+ }
+
+ void encode(ceph::buffer::list& bl) {
+ using ceph::encode;
+ for (size_t i = 0; i < STATFS_LAST; i++) {
+ encode(values[i], bl);
+ }
+ }
+ };
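+ // Illustrative sketch (an assumption, not part of this change): deltas are
+ // accumulated field by field and then published into a store_statfs_t:
+ //
+ //   volatile_statfs vs;
+ //   vs.allocated() += 0x10000;
+ //   vs.stored()    += 0x8000;
+ //   store_statfs_t st;
+ //   vs.publish(&st);  // st.allocated == 0x10000, st.data_stored == 0x8000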
+
+ struct TransContext final : public AioContext {
+ MEMPOOL_CLASS_HELPERS();
+
+ typedef enum {
+ STATE_PREPARE,
+ STATE_AIO_WAIT,
+ STATE_IO_DONE,
+ STATE_KV_QUEUED, // queued for kv_sync_thread submission
+ STATE_KV_SUBMITTED, // submitted to kv; not yet synced
+ STATE_KV_DONE,
+ STATE_DEFERRED_QUEUED, // in deferred_queue (pending or running)
+ STATE_DEFERRED_CLEANUP, // remove deferred kv record
+ STATE_DEFERRED_DONE,
+ STATE_FINISHING,
+ STATE_DONE,
+ } state_t;
+
+ const char *get_state_name() {
+ switch (state) {
+ case STATE_PREPARE: return "prepare";
+ case STATE_AIO_WAIT: return "aio_wait";
+ case STATE_IO_DONE: return "io_done";
+ case STATE_KV_QUEUED: return "kv_queued";
+ case STATE_KV_SUBMITTED: return "kv_submitted";
+ case STATE_KV_DONE: return "kv_done";
+ case STATE_DEFERRED_QUEUED: return "deferred_queued";
+ case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
+ case STATE_DEFERRED_DONE: return "deferred_done";
+ case STATE_FINISHING: return "finishing";
+ case STATE_DONE: return "done";
+ }
+ return "???";
+ }
+
+#if defined(WITH_LTTNG)
+ const char *get_state_latency_name(int state) {
+ switch (state) {
+ case l_bluestore_state_prepare_lat: return "prepare";
+ case l_bluestore_state_aio_wait_lat: return "aio_wait";
+ case l_bluestore_state_io_done_lat: return "io_done";
+ case l_bluestore_state_kv_queued_lat: return "kv_queued";
+ case l_bluestore_state_kv_committing_lat: return "kv_committing";
+ case l_bluestore_state_kv_done_lat: return "kv_done";
+ case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
+ case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
+ case l_bluestore_state_finishing_lat: return "finishing";
+ case l_bluestore_state_done_lat: return "done";
+ }
+ return "???";
+ }
+#endif
+
+ inline void set_state(state_t s) {
+ state = s;
+#ifdef WITH_BLKIN
+ if (trace) {
+ trace.event(get_state_name());
+ }
+#endif
+ }
+ inline state_t get_state() {
+ return state;
+ }
+
+ CollectionRef ch;
+ OpSequencerRef osr; // this should be ch->osr
+ boost::intrusive::list_member_hook<> sequencer_item;
+
+ uint64_t bytes = 0, ios = 0, cost = 0;
+
+ std::set<OnodeRef> onodes; ///< these need to be updated/written
+ std::set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
+
+ // A map from onode to a vector of object offsets. For new objects created
+ // in the transaction we append the new offset to the vector, for
+ // overwritten objects we append the negative of the previous ondisk offset
+ // followed by the new offset, and for truncated objects we append the
+ // negative of the previous ondisk offset. We need to maintain a vector of
+ // offsets because *within the same transaction* an object may be truncated
+ // and then written again, or an object may be overwritten multiple times to
+ // different zones. See update_cleaning_metadata function for how this map
+ // is used.
+ std::map<OnodeRef, std::vector<int64_t>> zoned_onode_to_offset_map;
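+ // Example (informal): an object previously on disk at offset A that is
+ // overwritten at offset B in this transaction records {-A, B}; truncating
+ // it later in the same transaction appends -B, giving {-A, B, -B}.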
+
+ std::set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
+ std::set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
+
+ KeyValueDB::Transaction t; ///< then we will commit this
+ std::list<Context*> oncommits; ///< more commit completions
+ std::list<CollectionRef> removed_collections; ///< colls we removed
+
+ boost::intrusive::list_member_hook<> deferred_queue_item;
+ bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
+
+ interval_set<uint64_t> allocated, released;
+ volatile_statfs statfs_delta; ///< overall store statistics delta
+ uint64_t osd_pool_id = META_POOL_ID; ///< osd pool id we're operating on
+
+ IOContext ioc;
+ bool had_ios = false; ///< true if we submitted IOs before our kv txn
+
+ uint64_t seq = 0;
+ ceph::mono_clock::time_point start;
+ ceph::mono_clock::time_point last_stamp;
+
+ uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated
+ uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated
+
+#if defined(WITH_LTTNG)
+ bool tracing = false;
+#endif
+
+#ifdef WITH_BLKIN
+ ZTracer::Trace trace;
+#endif
+
+ explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
+ std::list<Context*> *on_commits)
+ : ch(c),
+ osr(o),
+ ioc(cct, this),
+ start(ceph::mono_clock::now()) {
+ last_stamp = start;
+ if (on_commits) {
+ oncommits.swap(*on_commits);
+ }
+ }
+ ~TransContext() {
+#ifdef WITH_BLKIN
+ if (trace) {
+ trace.event("txc destruct");
+ }
+#endif
+ delete deferred_txn;
+ }
+
+ void write_onode(OnodeRef &o) {
+ onodes.insert(o);
+ }
+ void write_shared_blob(SharedBlobRef &sb) {
+ shared_blobs.insert(sb);
+ }
+ void unshare_blob(SharedBlob *sb) {
+ shared_blobs.erase(sb);
+ }
+
+ /// note that we logically modified the object (the onode itself is unmodified)
+ void note_modified_object(OnodeRef &o) {
+ // onode itself isn't written, though
+ modified_objects.insert(o);
+ }
+ void note_removed_object(OnodeRef& o) {
+ modified_objects.insert(o);
+ onodes.erase(o);
+ }
+
+ void zoned_note_new_object(OnodeRef &o) {
+ auto [_, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {o->zoned_get_ondisk_starting_offset()}));
+ ceph_assert(ok);
+ }
+
+ void zoned_note_updated_object(OnodeRef &o, int64_t prev_offset) {
+ int64_t new_offset = o->zoned_get_ondisk_starting_offset();
+ auto [it, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {-prev_offset, new_offset}));
+ if (!ok) {
+ it->second.push_back(-prev_offset);
+ it->second.push_back(new_offset);
+ }
+ }
+
+ void zoned_note_truncated_object(OnodeRef &o, int64_t offset) {
+ auto [it, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {-offset}));
+ if (!ok) {
+ it->second.push_back(-offset);
+ }
+ }
+
+ void aio_finish(BlueStore *store) override {
+ store->txc_aio_finish(this);
+ }
+ private:
+ state_t state = STATE_PREPARE;
+ };
+
+ class BlueStoreThrottle {
+#if defined(WITH_LTTNG)
+ const std::chrono::time_point<ceph::mono_clock> time_base = ceph::mono_clock::now();
+
+ // Time of last chosen io (microseconds)
+ std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
+ std::atomic<uint64_t> ios_started_since_last_traced = {0};
+ std::atomic<uint64_t> ios_completed_since_last_traced = {0};
+
+ std::atomic_uint pending_kv_ios = {0};
+ std::atomic_uint pending_deferred_ios = {0};
+
+ // Min period between trace points (microseconds)
+ std::atomic<uint64_t> trace_period_mcs = {0};
+
+ bool should_trace(
+ uint64_t *started,
+ uint64_t *completed) {
+ uint64_t min_period_mcs = trace_period_mcs.load(
+ std::memory_order_relaxed);
+
+ if (min_period_mcs == 0) {
+ *started = 1;
+ *completed = ios_completed_since_last_traced.exchange(0);
+ return true;
+ } else {
+ ios_started_since_last_traced++;
+ auto now_mcs = ceph::to_microseconds<uint64_t>(
+ ceph::mono_clock::now() - time_base);
+ uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
+ uint64_t period_mcs = now_mcs - previous_mcs;
+ if (period_mcs > min_period_mcs) {
+ if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
+ previous_mcs, now_mcs)) {
+ // This would be racy at a sufficiently extreme trace rate, but isn't
+ // worth the overhead of doing it more carefully.
+ *started = ios_started_since_last_traced.exchange(0);
+ *completed = ios_completed_since_last_traced.exchange(0);
+ return true;
+ }
+ }
+ return false;
+ }
+ }
+#endif
+
+#if defined(WITH_LTTNG)
+ void emit_initial_tracepoint(
+ KeyValueDB &db,
+ TransContext &txc,
+ ceph::mono_clock::time_point);
+#else
+ void emit_initial_tracepoint(
+ KeyValueDB &db,
+ TransContext &txc,
+ ceph::mono_clock::time_point) {}
+#endif
+
+ Throttle throttle_bytes; ///< submit to commit
+ Throttle throttle_deferred_bytes; ///< submit to deferred complete
+
+ public:
+ BlueStoreThrottle(CephContext *cct) :
+ throttle_bytes(cct, "bluestore_throttle_bytes", 0),
+ throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
+ {
+ reset_throttle(cct->_conf);
+ }
+
+#if defined(WITH_LTTNG)
+ void complete_kv(TransContext &txc);
+ void complete(TransContext &txc);
+#else
+ void complete_kv(TransContext &txc) {}
+ void complete(TransContext &txc) {}
+#endif
+
+ ceph::mono_clock::duration log_state_latency(
+ TransContext &txc, PerfCounters *logger, int state);
+ bool try_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ ceph::mono_clock::time_point);
+ void finish_start_transaction(
+ KeyValueDB &db,
+ TransContext &txc,
+ ceph::mono_clock::time_point);
+ void release_kv_throttle(uint64_t cost) {
+ throttle_bytes.put(cost);
+ }
+ void release_deferred_throttle(uint64_t cost) {
+ throttle_deferred_bytes.put(cost);
+ }
+ bool should_submit_deferred() {
+ return throttle_deferred_bytes.past_midpoint();
+ }
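+ // Note: reset_throttle() below sizes throttle_deferred_bytes as
+ // bluestore_throttle_bytes + bluestore_throttle_deferred_bytes, so
+ // past_midpoint() is a rough signal that enough deferred work has
+ // accumulated to be worth submitting.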
+ void reset_throttle(const ConfigProxy &conf) {
+ throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
+ throttle_deferred_bytes.reset_max(
+ conf->bluestore_throttle_bytes +
+ conf->bluestore_throttle_deferred_bytes);
+#if defined(WITH_LTTNG)
+ double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
+ trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
+#endif
+ }
+ } throttle;
+
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::deferred_queue_item> > deferred_queue_t;
+
+ struct DeferredBatch final : public AioContext {
+ OpSequencer *osr;
+ struct deferred_io {
+ ceph::buffer::list bl; ///< data
+ uint64_t seq; ///< deferred transaction seq
+ };
+ std::map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
+ deferred_queue_t txcs; ///< txcs in this batch
+ IOContext ioc; ///< our aios
+ /// bytes of pending io for each deferred seq (may be 0)
+ std::map<uint64_t,int> seq_bytes;
+
+ void _discard(CephContext *cct, uint64_t offset, uint64_t length);
+ void _audit(CephContext *cct);
+
+ DeferredBatch(CephContext *cct, OpSequencer *osr)
+ : osr(osr), ioc(cct, this) {}
+
+ /// prepare a write
+ void prepare_write(CephContext *cct,
+ uint64_t seq, uint64_t offset, uint64_t length,
+ ceph::buffer::list::const_iterator& p);
+
+ void aio_finish(BlueStore *store) override {
+ store->_deferred_aio_finish(osr);
+ }
+ };
+
+ class OpSequencer : public RefCountedObject {
+ public:
+ ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");
+ ceph::condition_variable qcond;
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::sequencer_item> > q_list_t;
+ q_list_t q; ///< transactions
+
+ boost::intrusive::list_member_hook<> deferred_osr_queue_item;
+
+ DeferredBatch *deferred_running = nullptr;
+ DeferredBatch *deferred_pending = nullptr;
+
+ ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::OpSequencer::deferred_lock");
+
+ BlueStore *store;
+ coll_t cid;
+
+ uint64_t last_seq = 0;
+
+ std::atomic_int txc_with_unstable_io = {0}; ///< num txcs with unstable io
+
+ std::atomic_int kv_committing_serially = {0};
+
+ std::atomic_int kv_submitted_waiters = {0};
+
+ std::atomic_bool zombie = {false}; ///< in zombie_osr_set (collection going away)
+
+ const uint32_t sequencer_id;
+
+ uint32_t get_sequencer_id() const {
+ return sequencer_id;
+ }
+
+ void queue_new(TransContext *txc) {
+ std::lock_guard l(qlock);
+ txc->seq = ++last_seq;
+ q.push_back(*txc);
+ }
+
+ void drain() {
+ std::unique_lock l(qlock);
+ while (!q.empty())
+ qcond.wait(l);
+ }
+
+ void drain_preceding(TransContext *txc) {
+ std::unique_lock l(qlock);
+ while (&q.front() != txc)
+ qcond.wait(l);
+ }
+
+ bool _is_all_kv_submitted() {
+ // caller must hold qlock; q must not be empty
+ ceph_assert(!q.empty());
+ TransContext *txc = &q.back();
+ if (txc->get_state() >= TransContext::STATE_KV_SUBMITTED) {
+ return true;
+ }
+ return false;
+ }
+
+ void flush() {
+ std::unique_lock l(qlock);
+ while (true) {
+ // set flag before the check because the condition
+ // may become true outside qlock, and we need to make
+ // sure those threads see waiters and signal qcond.
+ ++kv_submitted_waiters;
+ if (q.empty() || _is_all_kv_submitted()) {
+ --kv_submitted_waiters;
+ return;
+ }
+ qcond.wait(l);
+ --kv_submitted_waiters;
+ }
+ }
+
+ void flush_all_but_last() {
+ std::unique_lock l(qlock);
+ ceph_assert(q.size() >= 1);
+ while (true) {
+ // set flag before the check because the condition
+ // may become true outside qlock, and we need to make
+ // sure those threads see waiters and signal qcond.
+ ++kv_submitted_waiters;
+ if (q.size() <= 1) {
+ --kv_submitted_waiters;
+ return;
+ } else {
+ auto it = q.rbegin();
+ it++;
+ if (it->get_state() >= TransContext::STATE_KV_SUBMITTED) {
+ --kv_submitted_waiters;
+ return;
+ }
+ }
+ qcond.wait(l);
+ --kv_submitted_waiters;
+ }
+ }
+
+ bool flush_commit(Context *c) {
+ std::lock_guard l(qlock);
+ if (q.empty()) {
+ return true;
+ }
+ TransContext *txc = &q.back();
+ if (txc->get_state() >= TransContext::STATE_KV_DONE) {
+ return true;
+ }
+ txc->oncommits.push_back(c);
+ return false;
+ }
+ private:
+ FRIEND_MAKE_REF(OpSequencer);
+ OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
+ : RefCountedObject(store->cct),
+ store(store), cid(c), sequencer_id(sequencer_id) {
+ }
+ ~OpSequencer() {
+ ceph_assert(q.empty());
+ }
+ };
+
+ typedef boost::intrusive::list<
+ OpSequencer,
+ boost::intrusive::member_hook<
+ OpSequencer,
+ boost::intrusive::list_member_hook<>,
+ &OpSequencer::deferred_osr_queue_item> > deferred_osr_queue_t;
+
+ struct KVSyncThread : public Thread {
+ BlueStore *store;
+ explicit KVSyncThread(BlueStore *s) : store(s) {}
+ void *entry() override {
+ store->_kv_sync_thread();
+ return NULL;
+ }
+ };
+ struct KVFinalizeThread : public Thread {
+ BlueStore *store;
+ explicit KVFinalizeThread(BlueStore *s) : store(s) {}
+ void *entry() override {
+ store->_kv_finalize_thread();
+ return NULL;
+ }
+ };
+ struct ZonedCleanerThread : public Thread {
+ BlueStore *store;
+ explicit ZonedCleanerThread(BlueStore *s) : store(s) {}
+ void *entry() override {
+ store->_zoned_cleaner_thread();
+ return nullptr;
+ }
+ };
+
+ struct DBHistogram {
+ struct value_dist {
+ uint64_t count;
+ uint32_t max_len;
+ };
+
+ struct key_dist {
+ uint64_t count;
+ uint32_t max_len;
+ std::map<int, struct value_dist> val_map; ///< slab id -> count and max length of values
+ };
+
+ std::map<std::string, std::map<int, struct key_dist> > key_hist;
+ std::map<int, uint64_t> value_hist;
+ int get_key_slab(size_t sz);
+ std::string get_key_slab_to_range(int slab);
+ int get_value_slab(size_t sz);
+ std::string get_value_slab_to_range(int slab);
+ void update_hist_entry(std::map<std::string, std::map<int, struct key_dist> > &key_hist,
+ const std::string &prefix, size_t key_size, size_t value_size);
+ void dump(ceph::Formatter *f);
+ };
+
+ struct BigDeferredWriteContext {
+ uint64_t off = 0; // original logical offset
+ uint32_t b_off = 0; // blob relative offset
+ uint32_t used = 0;
+ uint64_t head_read = 0;
+ uint64_t tail_read = 0;
+ BlobRef blob_ref;
+ uint64_t blob_start = 0;
+ PExtentVector res_extents;
+
+ inline uint64_t blob_aligned_len() const {
+ return used + head_read + tail_read;
+ }
+
+ bool can_defer(BlueStore::extent_map_t::iterator ep,
+ uint64_t prefer_deferred_size,
+ uint64_t block_size,
+ uint64_t offset,
+ uint64_t l);
+ bool apply_defer();
+ };
+
+ // --------------------------------------------------------
+ // members
+private:
+ BlueFS *bluefs = nullptr;
+ bluefs_layout_t bluefs_layout;
+ utime_t next_dump_on_bluefs_alloc_failure;
+
+ KeyValueDB *db = nullptr;
+ BlockDevice *bdev = nullptr;
+ std::string freelist_type;
+ FreelistManager *fm = nullptr;
+
+ bluefs_shared_alloc_context_t shared_alloc;
+
+ uuid_d fsid;
+ int path_fd = -1; ///< open handle to $path
+ int fsid_fd = -1; ///< open handle (locked) to $path/fsid
+ bool mounted = false;
+
+ ceph::shared_mutex coll_lock = ceph::make_shared_mutex("BlueStore::coll_lock"); ///< rwlock to protect coll_map
+ mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
+ bool collections_had_errors = false;
+ std::map<coll_t,CollectionRef> new_coll_map;
+
+ std::vector<OnodeCacheShard*> onode_cache_shards;
+ std::vector<BufferCacheShard*> buffer_cache_shards;
+
+ /// protect zombie_osr_set
+ ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
+ uint32_t next_sequencer_id = 0;
+ std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
+
+ std::atomic<uint64_t> nid_last = {0};
+ std::atomic<uint64_t> nid_max = {0};
+ std::atomic<uint64_t> blobid_last = {0};
+ std::atomic<uint64_t> blobid_max = {0};
+
+ ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");
+ ceph::mutex atomic_alloc_and_submit_lock =
+ ceph::make_mutex("BlueStore::atomic_alloc_and_submit_lock");
+ std::atomic<uint64_t> deferred_seq = {0};
+ deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
+ std::atomic_int deferred_queue_size = {0}; ///< num txc's queued across all osrs
+ std::atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
+ Finisher finisher;
+ utime_t deferred_last_submitted = utime_t();
+
+ KVSyncThread kv_sync_thread;
+ ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
+ ceph::condition_variable kv_cond;
+ bool _kv_only = false;
+ bool kv_sync_started = false;
+ bool kv_stop = false;
+ bool kv_finalize_started = false;
+ bool kv_finalize_stop = false;
+ std::deque<TransContext*> kv_queue; ///< ready, already submitted
+ std::deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
+ std::deque<TransContext*> kv_committing; ///< currently syncing
+ std::deque<DeferredBatch*> deferred_done_queue; ///< deferred ios done
+ bool kv_sync_in_progress = false;
+
+ KVFinalizeThread kv_finalize_thread;
+ ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
+ ceph::condition_variable kv_finalize_cond;
+ std::deque<TransContext*> kv_committing_to_finalize; ///< pending finalization
+ std::deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
+ bool kv_finalize_in_progress = false;
+
+ ZonedCleanerThread zoned_cleaner_thread;
+ ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock");
+ ceph::condition_variable zoned_cleaner_cond;
+ bool zoned_cleaner_started = false;
+ bool zoned_cleaner_stop = false;
+ std::deque<uint64_t> zoned_cleaner_queue;
+
+ PerfCounters *logger = nullptr;
+
+ std::list<CollectionRef> removed_collections;
+
+ ceph::shared_mutex debug_read_error_lock =
+ ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
+ std::set<ghobject_t> debug_data_error_objects;
+ std::set<ghobject_t> debug_mdata_error_objects;
+
+ std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};
+
+ uint64_t block_size = 0; ///< block size of block device (power of 2)
+ uint64_t block_mask = 0; ///< mask to get just the block offset
+ size_t block_size_order = 0; ///< bits to shift to get block size
+
+ uint64_t min_alloc_size; ///< minimum allocation unit (power of 2)
+ ///< bits for min_alloc_size
+ uint8_t min_alloc_size_order = 0;
+ static_assert(std::numeric_limits<uint8_t>::max() >
+ std::numeric_limits<decltype(min_alloc_size)>::digits,
+ "not enough bits for min_alloc_size");
+
+ enum {
+ // Please preserve the order since these values are persisted in the DB
+ OMAP_BULK = 0,
+ OMAP_PER_POOL = 1,
+ OMAP_PER_PG = 2,
+ } per_pool_omap = OMAP_BULK;
+
+ ///< maximum allocation unit (power of 2)
+ std::atomic<uint64_t> max_alloc_size = {0};
+
+ ///< number threshold for forced deferred writes
+ std::atomic<int> deferred_batch_ops = {0};
+
+ ///< size threshold for forced deferred writes
+ std::atomic<uint64_t> prefer_deferred_size = {0};
+
+ ///< approx cost per io, in bytes
+ std::atomic<uint64_t> throttle_cost_per_io = {0};
+
+ std::atomic<Compressor::CompressionMode> comp_mode =
+ {Compressor::COMP_NONE}; ///< compression mode
+ CompressorRef compressor;
+ std::atomic<uint64_t> comp_min_blob_size = {0};
+ std::atomic<uint64_t> comp_max_blob_size = {0};
+
+ std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
+
+ uint64_t kv_ios = 0;
+ uint64_t kv_throttle_costs = 0;
+
+ // cache trim control
+ uint64_t cache_size = 0; ///< total cache size
+ double cache_meta_ratio = 0; ///< cache ratio dedicated to metadata
+ double cache_kv_ratio = 0; ///< cache ratio dedicated to kv (e.g., rocksdb)
+ double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
+ double cache_data_ratio = 0; ///< cache ratio dedicated to object data
+ bool cache_autotune = false; ///< cache autotune setting
+ double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
+ uint64_t osd_memory_target = 0; ///< OSD memory target when autotuning cache
+ uint64_t osd_memory_base = 0; ///< OSD base memory when autotuning cache
+ double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
+ uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
+ double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
+ double max_defer_interval = 0; ///< Time to wait between last deferred submit
+ std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
+
+ typedef std::map<uint64_t, volatile_statfs> osd_pools_map;
+
+ ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
+ volatile_statfs vstatfs;
+ osd_pools_map osd_pools; // protected by vstatfs_lock as well
+
+ bool per_pool_stat_collection = true;
+
+ struct MempoolThread : public Thread {
+ public:
+ BlueStore *store;
+
+ ceph::condition_variable cond;
+ ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
+ bool stop = false;
+ std::shared_ptr<PriorityCache::PriCache> binned_kv_cache = nullptr;
+ std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache = nullptr;
+ std::shared_ptr<PriorityCache::Manager> pcm = nullptr;
+
+ struct MempoolCache : public PriorityCache::PriCache {
+ BlueStore *store;
+ int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
+ int64_t committed_bytes = 0;
+ double cache_ratio = 0;
+
+ MempoolCache(BlueStore *s) : store(s) {};
+
+ virtual uint64_t _get_used_bytes() const = 0;
+
+ virtual int64_t request_cache_bytes(
+ PriorityCache::Priority pri, uint64_t total_cache) const {
+ int64_t assigned = get_cache_bytes(pri);
+
+ switch (pri) {
+ // All cache items are currently shoved into the PRI1 priority
+ case PriorityCache::Priority::PRI1:
+ {
+ int64_t request = _get_used_bytes();
+ return (request > assigned) ? request - assigned : 0;
+ }
+ default:
+ break;
+ }
+ return -EOPNOTSUPP;
+ }
+
+ virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+ return cache_bytes[pri];
+ }
+ virtual int64_t get_cache_bytes() const {
+ int64_t total = 0;
+
+ for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
+ PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
+ total += get_cache_bytes(pri);
+ }
+ return total;
+ }
+ virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] = bytes;
+ }
+ virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+ cache_bytes[pri] += bytes;
+ }
+ virtual int64_t commit_cache_size(uint64_t total_cache) {
+ committed_bytes = PriorityCache::get_chunk(
+ get_cache_bytes(), total_cache);
+ return committed_bytes;
+ }
+ virtual int64_t get_committed_size() const {
+ return committed_bytes;
+ }
+ virtual double get_cache_ratio() const {
+ return cache_ratio;
+ }
+ virtual void set_cache_ratio(double ratio) {
+ cache_ratio = ratio;
+ }
+ virtual std::string get_cache_name() const = 0;
+ };
+
+ struct MetaCache : public MempoolCache {
+ MetaCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ return mempool::bluestore_Buffer::allocated_bytes() +
+ mempool::bluestore_Blob::allocated_bytes() +
+ mempool::bluestore_Extent::allocated_bytes() +
+ mempool::bluestore_cache_meta::allocated_bytes() +
+ mempool::bluestore_cache_other::allocated_bytes() +
+ mempool::bluestore_cache_onode::allocated_bytes() +
+ mempool::bluestore_SharedBlob::allocated_bytes() +
+ mempool::bluestore_inline_bl::allocated_bytes();
+ }
+
+ virtual std::string get_cache_name() const {
+ return "BlueStore Meta Cache";
+ }
+
+ uint64_t _get_num_onodes() const {
+ uint64_t onode_num =
+ mempool::bluestore_cache_onode::allocated_items();
+ return (2 > onode_num) ? 2 : onode_num;
+ }
+
+ double get_bytes_per_onode() const {
+ return (double)_get_used_bytes() / (double)_get_num_onodes();
+ }
+ };
+ std::shared_ptr<MetaCache> meta_cache;
+
+ struct DataCache : public MempoolCache {
+ DataCache(BlueStore *s) : MempoolCache(s) {};
+
+ virtual uint64_t _get_used_bytes() const {
+ uint64_t bytes = 0;
+ for (auto i : store->buffer_cache_shards) {
+ bytes += i->_get_bytes();
+ }
+ return bytes;
+ }
+ virtual std::string get_cache_name() const {
+ return "BlueStore Data Cache";
+ }
+ };
+ std::shared_ptr<DataCache> data_cache;
+
+ public:
+ explicit MempoolThread(BlueStore *s)
+ : store(s),
+ meta_cache(new MetaCache(s)),
+ data_cache(new DataCache(s)) {}
+
+ void *entry() override;
+ void init() {
+ ceph_assert(stop == false);
+ create("bstore_mempool");
+ }
+ void shutdown() {
+ lock.lock();
+ stop = true;
+ cond.notify_all();
+ lock.unlock();
+ join();
+ }
+
+ private:
+ void _adjust_cache_settings();
+ void _update_cache_settings();
+ void _resize_shards(bool interval_stats);
+ } mempool_thread;
+
+#ifdef WITH_BLKIN
+ ZTracer::Endpoint trace_endpoint {"0.0.0.0", 0, "BlueStore"};
+#endif
+
+ // --------------------------------------------------------
+ // private methods
+
+ void _init_logger();
+ void _shutdown_logger();
+ int _reload_logger();
+
+ int _open_path();
+ void _close_path();
+ int _open_fsid(bool create);
+ int _lock_fsid();
+ int _read_fsid(uuid_d *f);
+ int _write_fsid();
+ void _close_fsid();
+ void _set_alloc_sizes();
+ void _set_blob_size();
+ void _set_finisher_num();
+ void _set_per_pool_omap();
+ void _update_osd_memory_options();
+
+ int _open_bdev(bool create);
+ // Verifies that available disk space is sufficient for the reserved area
+ // plus the minimum BlueFS size, and adjusts the latter if needed.
+ // Depends on min_alloc_size, so it must be called after that is
+ // initialized (and outside of _open_bdev).
+ void _validate_bdev();
+ void _close_bdev();
+
+ int _minimal_open_bluefs(bool create);
+ void _minimal_close_bluefs();
+ int _open_bluefs(bool create, bool read_only);
+ void _close_bluefs(bool cold_close);
+
+ int _is_bluefs(bool create, bool* ret);
+ /*
+ * opens the DB and its dependent super_meta, FreelistManager and allocator
+ * in the proper order
+ */
+ int _open_db_and_around(bool read_only, bool to_repair = false);
+ void _close_db_and_around(bool read_only);
+
+ int _prepare_db_environment(bool create, bool read_only,
+ std::string* kv_dir, std::string* kv_backend);
+
+ /*
+ * @warning to_repair_db means that we open this db to repair it and will
+ * not hold RocksDB's file lock.
+ */
+ int _open_db(bool create,
+ bool to_repair_db=false,
+ bool read_only = false);
+ void _close_db(bool read_only);
+ int _open_fm(KeyValueDB::Transaction t, bool read_only);
+ void _close_fm();
+ int _write_out_fm_meta(uint64_t target_size);
+ int _create_alloc();
+ int _init_alloc();
+ void _close_alloc();
+ int _open_collections();
+ void _fsck_collections(int64_t* errors);
+ void _close_collections();
+
+ int _setup_block_symlink_or_file(std::string name, std::string path, uint64_t size,
+ bool create);
+
+ // Functions related to zoned storage.
+ uint64_t _zoned_piggyback_device_parameters_onto(uint64_t min_alloc_size);
+ int _zoned_check_config_settings();
+ void _zoned_update_cleaning_metadata(TransContext *txc);
+ std::string _zoned_get_prefix(uint64_t offset);
+
+public:
+ utime_t get_deferred_last_submitted() {
+ std::lock_guard l(deferred_lock);
+ return deferred_last_submitted;
+ }
+
+ static int _write_bdev_label(CephContext* cct,
+ std::string path, bluestore_bdev_label_t label);
+ static int _read_bdev_label(CephContext* cct, std::string path,
+ bluestore_bdev_label_t *label);
+private:
+ int _check_or_set_bdev_label(std::string path, uint64_t size, std::string desc,
+ bool create);
+ int _set_bdev_label_size(const std::string& path, uint64_t size);
+
+ int _open_super_meta();
+
+ void _open_statfs();
+ void _get_statfs_overall(struct store_statfs_t *buf);
+
+ void _dump_alloc_on_failure();
+
+ CollectionRef _get_collection(const coll_t& cid);
+ void _queue_reap_collection(CollectionRef& c);
+ void _reap_collections();
+ void _update_cache_logger();
+
+ void _assign_nid(TransContext *txc, OnodeRef o);
+ uint64_t _assign_blobid(TransContext *txc);
+
+ template <int LogLevelV>
+ friend void _dump_onode(CephContext *cct, const Onode& o);
+ template <int LogLevelV>
+ friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
+ template <int LogLevelV>
+ friend void _dump_transaction(CephContext *cct, Transaction *t);
+
+ TransContext *_txc_create(Collection *c, OpSequencer *osr,
+ std::list<Context*> *on_commits,
+ TrackedOpRef osd_op=TrackedOpRef());
+ void _txc_update_store_statfs(TransContext *txc);
+ void _txc_add_transaction(TransContext *txc, Transaction *t);
+ void _txc_calc_cost(TransContext *txc);
+ void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
+ void _txc_state_proc(TransContext *txc);
+ void _txc_aio_submit(TransContext *txc);
+public:
+ void txc_aio_finish(void *p) {
+ _txc_state_proc(static_cast<TransContext*>(p));
+ }
+private:
+ void _txc_finish_io(TransContext *txc);
+ void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
+ void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
+ void _txc_committed_kv(TransContext *txc);
+ void _txc_finish(TransContext *txc);
+ void _txc_release_alloc(TransContext *txc);
+
+ void _osr_attach(Collection *c);
+ void _osr_register_zombie(OpSequencer *osr);
+ void _osr_drain(OpSequencer *osr);
+ void _osr_drain_preceding(TransContext *txc);
+ void _osr_drain_all();
+
+ void _kv_start();
+ void _kv_stop();
+ void _kv_sync_thread();
+ void _kv_finalize_thread();
+
+ void _zoned_cleaner_start();
+ void _zoned_cleaner_stop();
+ void _zoned_cleaner_thread();
+ void _zoned_clean_zone(uint64_t zone_num);
+
+ bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len);
+ void _deferred_queue(TransContext *txc);
+public:
+ void deferred_try_submit();
+private:
+ void _deferred_submit_unlock(OpSequencer *osr);
+ void _deferred_aio_finish(OpSequencer *osr);
+ int _deferred_replay();
+ bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
+ interval_set<uint64_t>& bluefs_extents);
+
+public:
+ using mempool_dynamic_bitset =
+ boost::dynamic_bitset<uint64_t,
+ mempool::bluestore_fsck::pool_allocator<uint64_t>>;
+ using per_pool_statfs =
+ mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;
+
+ enum FSCKDepth {
+ FSCK_REGULAR,
+ FSCK_DEEP,
+ FSCK_SHALLOW
+ };
+ enum {
+ MAX_FSCK_ERROR_LINES = 100,
+ };
+
+private:
+ int _fsck_check_extents(
+ std::string_view ctx_descr,
+ const PExtentVector& extents,
+ bool compressed,
+ mempool_dynamic_bitset &used_blocks,
+ uint64_t granularity,
+ BlueStoreRepairer* repairer,
+ store_statfs_t& expected_statfs,
+ FSCKDepth depth);
+
+ void _fsck_check_pool_statfs(
+ per_pool_statfs& expected_pool_statfs,
+ int64_t& errors,
+ int64_t &warnings,
+ BlueStoreRepairer* repairer);
+ void _fsck_repair_shared_blobs(
+ BlueStoreRepairer& repairer,
+ shared_blob_2hash_tracker_t& sb_ref_counts,
+ sb_info_space_efficient_map_t& sb_info);
+
+ int _fsck(FSCKDepth depth, bool repair);
+ int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
+
+ void _buffer_cache_write(
+ TransContext *txc,
+ BlobRef b,
+ uint64_t offset,
+ ceph::buffer::list& bl,
+ unsigned flags) {
+ b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl,
+ flags);
+ txc->shared_blobs_written.insert(b->shared_blob);
+ }
+
+ int _collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end,
+ int max, bool legacy, std::vector<ghobject_t> *ls, ghobject_t *next);
+
+ template <typename T, typename F>
+ T select_option(const std::string& opt_name, T val1, F f) {
+ // NB: opt_name is reserved for future use
+ boost::optional<T> val2 = f();
+ if (val2) {
+ return *val2;
+ }
+ return val1;
+ }
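+
+ // Usage sketch (informal, names hypothetical): pass the store-wide default
+ // as val1 and a lambda returning an optional override, e.g.
+ //   auto csum = select_option("csum_type", (int)csum_type,
+ //     [&]() -> boost::optional<int> { return per_pool_csum_override; });
+ // If the lambda yields a value it wins, otherwise val1 is returned.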
+
+ void _apply_padding(uint64_t head_pad,
+ uint64_t tail_pad,
+ ceph::buffer::list& padded);
+
+ void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
+
+ // -- ondisk version ---
+public:
+ const int32_t latest_ondisk_format = 4; ///< our version
+ const int32_t min_readable_ondisk_format = 1; ///< what we can read
+ const int32_t min_compat_ondisk_format = 3; ///< who can read us
+
+private:
+ int32_t ondisk_format = 0; ///< value detected on mount
+
+ int _upgrade_super(); ///< upgrade (called during open_super)
+ uint64_t _get_ondisk_reserved() const;
+ void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
+
+ // --- public interface ---
+public:
+ BlueStore(CephContext *cct, const std::string& path);
+ BlueStore(CephContext *cct, const std::string& path, uint64_t min_alloc_size); // Ctor for UT only
+ ~BlueStore() override;
+
+ std::string get_type() override {
+ return "bluestore";
+ }
+
+ bool needs_journal() override { return false; };
+ bool wants_journal() override { return false; };
+ bool allows_journal() override { return false; };
+
+ uint64_t get_min_alloc_size() const override {
+ return min_alloc_size;
+ }
+
+ int get_devices(std::set<std::string> *ls) override;
+
+ bool is_rotational() override;
+ bool is_journal_rotational() override;
+
+ std::string get_default_device_class() override {
+ std::string device_class;
+ std::map<std::string, std::string> metadata;
+ collect_metadata(&metadata);
+ auto it = metadata.find("bluestore_bdev_type");
+ if (it != metadata.end()) {
+ device_class = it->second;
+ }
+ return device_class;
+ }
+
+ int get_numa_node(
+ int *numa_node,
+ std::set<int> *nodes,
+ std::set<std::string> *failed) override;
+
+ static int get_block_device_fsid(CephContext* cct, const std::string& path,
+ uuid_d *fsid);
+
+ bool test_mount_in_use() override;
+
+private:
+ int _mount();
+public:
+ int mount() override {
+ return _mount();
+ }
+ int umount() override;
+
+ int open_db_environment(KeyValueDB **pdb, bool to_repair);
+ int close_db_environment();
+ BlueFS* get_bluefs();
+
+ int write_meta(const std::string& key, const std::string& value) override;
+ int read_meta(const std::string& key, std::string *value) override;
+
+ // open in read-only and limited mode
+ int cold_open();
+ int cold_close();
+
+ int fsck(bool deep) override {
+ return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, false);
+ }
+ int repair(bool deep) override {
+ return _fsck(deep ? FSCK_DEEP : FSCK_REGULAR, true);
+ }
+ int quick_fix() override {
+ return _fsck(FSCK_SHALLOW, true);
+ }
+
+ void set_cache_shards(unsigned num) override;
+ void dump_cache_stats(ceph::Formatter *f) override {
+ int onode_count = 0, buffers_bytes = 0;
+ for (auto i: onode_cache_shards) {
+ onode_count += i->_get_num();
+ }
+ for (auto i: buffer_cache_shards) {
+ buffers_bytes += i->_get_bytes();
+ }
+ f->dump_int("bluestore_onode", onode_count);
+ f->dump_int("bluestore_buffers", buffers_bytes);
+ }
+ void dump_cache_stats(std::ostream& ss) override {
+ int onode_count = 0, buffers_bytes = 0;
+ for (auto i: onode_cache_shards) {
+ onode_count += i->_get_num();
+ }
+ for (auto i: buffer_cache_shards) {
+ buffers_bytes += i->_get_bytes();
+ }
+ ss << "bluestore_onode: " << onode_count;
+ ss << "bluestore_buffers: " << buffers_bytes;
+ }
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+
+ void get_db_statistics(ceph::Formatter *f) override;
+ void generate_db_histogram(ceph::Formatter *f) override;
+ void _shutdown_cache();
+ int flush_cache(std::ostream *os = NULL) override;
+ void dump_perf_counters(ceph::Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+
+ int add_new_bluefs_device(int id, const std::string& path);
+ int migrate_to_existing_bluefs_device(const std::set<int>& devs_source,
+ int id);
+ int migrate_to_new_bluefs_device(const std::set<int>& devs_source,
+ int id,
+ const std::string& path);
+ int expand_devices(std::ostream& out);
+ std::string get_device_path(unsigned id);
+
+ int dump_bluefs_sizes(std::ostream& out);
+
+public:
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap) override;
+
+ void collect_metadata(std::map<std::string,std::string> *pm) override;
+
+ bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ int stat(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override;
+ int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) override;
+
+private:
+
+ // --------------------------------------------------------
+ // intermediate data structures used while reading
+ struct region_t {
+ uint64_t logical_offset;
+ uint64_t blob_xoffset; // region offset within the blob
+ uint64_t length;
+
+ // used later in read process
+ uint64_t front = 0;
+
+ region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
+ : logical_offset(offset),
+ blob_xoffset(b_offs),
+ length(len),
+ front(front){}
+ region_t(const region_t& from)
+ : logical_offset(from.logical_offset),
+ blob_xoffset(from.blob_xoffset),
+ length(from.length),
+ front(from.front){}
+
+ friend std::ostream& operator<<(std::ostream& out, const region_t& r) {
+ return out << "0x" << std::hex << r.logical_offset << ":"
+ << r.blob_xoffset << "~" << r.length << std::dec;
+ }
+ };
+
+ // merged blob read request
+ struct read_req_t {
+ uint64_t r_off = 0;
+ uint64_t r_len = 0;
+ ceph::buffer::list bl;
+ std::list<region_t> regs; // original read regions
+
+ read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+ friend std::ostream& operator<<(std::ostream& out, const read_req_t& r) {
+ out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+ for (const auto& reg : r.regs)
+ out << reg;
+ return out << "]}" << std::dec;
+ }
+ };
+
+ typedef std::list<read_req_t> regions2read_t;
+ typedef std::map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
+
+ void _read_cache(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ int read_cache_policy,
+ ready_regions_t& ready_regions,
+ blobs2read_t& blobs2read);
+
+
+ int _prepare_read_ioc(
+ blobs2read_t& blobs2read,
+ std::vector<ceph::buffer::list>* compressed_blob_bls,
+ IOContext* ioc);
+
+ int _generate_read_result_bl(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ ready_regions_t& ready_regions,
+ std::vector<ceph::buffer::list>& compressed_blob_bls,
+ blobs2read_t& blobs2read,
+ bool buffered,
+ bool* csum_error,
+ ceph::buffer::list& bl);
+
+ int _do_read(
+ Collection *c,
+ OnodeRef o,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0,
+ uint64_t retry_count = 0);
+
+ int _do_readv(
+ Collection *c,
+ OnodeRef o,
+ const interval_set<uint64_t>& m,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0,
+ uint64_t retry_count = 0);
+
+ int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
+ uint64_t offset, size_t len, interval_set<uint64_t>& destset);
+public:
+ int fiemap(CollectionHandle &c, const ghobject_t& oid,
+ uint64_t offset, size_t len, ceph::buffer::list& bl) override;
+ int fiemap(CollectionHandle &c, const ghobject_t& oid,
+ uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+
+ int readv(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ interval_set<uint64_t>& m,
+ ceph::buffer::list& bl,
+ uint32_t op_flags) override;
+
+ int dump_onode(CollectionHandle &c, const ghobject_t& oid,
+ const std::string& section_name, ceph::Formatter *f) override;
+
+ int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+ ceph::buffer::ptr& value) override;
+
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::ptr>& aset) override;
+
+ int list_collections(std::vector<coll_t>& ls) override;
+
+ CollectionHandle open_collection(const coll_t &c) override;
+ CollectionHandle create_new_collection(const coll_t& cid) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override;
+
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+
+ int collection_list(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ int collection_list_legacy(CollectionHandle &c,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next) override;
+
+ int omap_get(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ ) override;
+ int _omap_get(
+ Collection *c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ );
+ int _onode_omap_get(
+ const OnodeRef &o, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ );
+
+
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) override;
+
+#ifdef WITH_SEASTAR
+ int omap_get_values(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::optional<std::string> &start_after, ///< [in] list keys after this one
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) override;
+#endif
+
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle &c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override {
+ return fsid;
+ }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+ return num_objects * 300; // assuming per-object overhead is 300 bytes
+ }
+
+ struct BSPerfTracker {
+ PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+ PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+ objectstore_perf_stat_t get_cur_stats() const {
+ objectstore_perf_stat_t ret;
+ ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+ ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+ return ret;
+ }
+
+ void update_from_perfcounters(PerfCounters &logger);
+ } perf_tracker;
+
+ objectstore_perf_stat_t get_cur_stats() override {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+ const PerfCounters* get_bluefs_perf_counters() const {
+ return bluefs->get_perf_counters();
+ }
+ KeyValueDB* get_kv() {
+ return db;
+ }
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ std::vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+
+ // error injection
+ void inject_data_error(const ghobject_t& o) override {
+ std::unique_lock l(debug_read_error_lock);
+ debug_data_error_objects.insert(o);
+ }
+ void inject_mdata_error(const ghobject_t& o) override {
+ std::unique_lock l(debug_read_error_lock);
+ debug_mdata_error_objects.insert(o);
+ }
+
+ /// methods to inject various errors fsck can repair
+ void inject_broken_shared_blob_key(const std::string& key,
+ const ceph::buffer::list& bl);
+ void inject_no_shared_blob_key();
+ void inject_stray_shared_blob_key(uint64_t sbid);
+
+ void inject_leaked(uint64_t len);
+ void inject_false_free(coll_t cid, ghobject_t oid);
+ void inject_statfs(const std::string& key, const store_statfs_t& new_statfs);
+ void inject_global_statfs(const store_statfs_t& new_statfs);
+ void inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset);
+ void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
+ // resets global per_pool_omap in DB
+ void inject_legacy_omap();
+ // resets per_pool_omap | pgmeta_omap for onode
+ void inject_legacy_omap(coll_t cid, ghobject_t oid);
+
+ void inject_bluefs_file(std::string_view dir,
+ std::string_view name,
+ size_t new_size);
+
+ void compact() override {
+ ceph_assert(db);
+ db->compact();
+ }
+ bool has_builtin_csum() const override {
+ return true;
+ }
+
+ inline void log_latency(const char* name,
+ int idx,
+ const ceph::timespan& lat,
+ double lat_threshold,
+ const char* info = "") const;
+
+ inline void log_latency_fn(const char* name,
+ int idx,
+ const ceph::timespan& lat,
+ double lat_threshold,
+ std::function<std::string (const ceph::timespan& lat)> fn) const;
+
+private:
+ bool _debug_data_eio(const ghobject_t& o) {
+ if (!cct->_conf->bluestore_debug_inject_read_err) {
+ return false;
+ }
+ std::shared_lock l(debug_read_error_lock);
+ return debug_data_error_objects.count(o);
+ }
+ bool _debug_mdata_eio(const ghobject_t& o) {
+ if (!cct->_conf->bluestore_debug_inject_read_err) {
+ return false;
+ }
+ std::shared_lock l(debug_read_error_lock);
+ return debug_mdata_error_objects.count(o);
+ }
+ void _debug_obj_on_delete(const ghobject_t& o) {
+ if (cct->_conf->bluestore_debug_inject_read_err) {
+ std::unique_lock l(debug_read_error_lock);
+ debug_data_error_objects.erase(o);
+ debug_mdata_error_objects.erase(o);
+ }
+ }
+private:
+ ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
+ std::string failed_cmode;
+ std::set<std::string> failed_compressors;
+ std::string spillover_alert;
+ std::string legacy_statfs_alert;
+ std::string no_per_pool_omap_alert;
+ std::string no_per_pg_omap_alert;
+ std::string disk_size_mismatch_alert;
+ std::string spurious_read_errors_alert;
+
+ void _log_alerts(osd_alert_list_t& alerts);
+ bool _set_compression_alert(bool cmode, const char* s) {
+ std::lock_guard l(qlock);
+ if (cmode) {
+ bool ret = failed_cmode.empty();
+ failed_cmode = s;
+ return ret;
+ }
+ return failed_compressors.emplace(s).second;
+ }
+ void _clear_compression_alert() {
+ std::lock_guard l(qlock);
+ failed_compressors.clear();
+ failed_cmode.clear();
+ }
+
+ void _set_spillover_alert(const std::string& s) {
+ std::lock_guard l(qlock);
+ spillover_alert = s;
+ }
+ void _clear_spillover_alert() {
+ std::lock_guard l(qlock);
+ spillover_alert.clear();
+ }
+
+ void _check_legacy_statfs_alert();
+ void _check_no_per_pg_or_pool_omap_alert();
+ void _set_disk_size_mismatch_alert(const std::string& s) {
+ std::lock_guard l(qlock);
+ disk_size_mismatch_alert = s;
+ }
+ void _set_spurious_read_errors_alert(const std::string& s) {
+ std::lock_guard l(qlock);
+ spurious_read_errors_alert = s;
+ }
+
+private:
+
+ // --------------------------------------------------------
+ // read processing internal methods
+ int _verify_csum(
+ OnodeRef& o,
+ const bluestore_blob_t* blob,
+ uint64_t blob_xoffset,
+ const ceph::buffer::list& bl,
+ uint64_t logical_offset) const;
+ int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);
+
+
+ // --------------------------------------------------------
+ // write ops
+
+ struct WriteContext {
+ bool buffered = false; ///< buffered write
+ bool compress = false; ///< compressed write
+ uint64_t target_blob_size = 0; ///< target (max) blob size
+ unsigned csum_order = 0; ///< target checksum chunk order
+
+ old_extent_map_t old_extents; ///< must deref these blobs
+ interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
+
+ struct write_item {
+ uint64_t logical_offset; ///< write logical offset
+ BlobRef b;
+ uint64_t blob_length;
+ uint64_t b_off;
+ ceph::buffer::list bl;
+ uint64_t b_off0; ///< original offset in a blob prior to padding
+ uint64_t length0; ///< original data length prior to padding
+
+ bool mark_unused;
+ bool new_blob; ///< whether new blob was created
+
+ bool compressed = false;
+ ceph::buffer::list compressed_bl;
+ size_t compressed_len = 0;
+
+ write_item(
+ uint64_t logical_offs,
+ BlobRef b,
+ uint64_t blob_len,
+ uint64_t o,
+ ceph::buffer::list& bl,
+ uint64_t o0,
+ uint64_t l0,
+ bool _mark_unused,
+ bool _new_blob)
+ :
+ logical_offset(logical_offs),
+ b(b),
+ blob_length(blob_len),
+ b_off(o),
+ bl(bl),
+ b_off0(o0),
+ length0(l0),
+ mark_unused(_mark_unused),
+ new_blob(_new_blob) {}
+ };
+ std::vector<write_item> writes; ///< blobs we're writing
+
+ /// partial clone of the context
+ void fork(const WriteContext& other) {
+ buffered = other.buffered;
+ compress = other.compress;
+ target_blob_size = other.target_blob_size;
+ csum_order = other.csum_order;
+ }
+ void write(
+ uint64_t loffs,
+ BlobRef b,
+ uint64_t blob_len,
+ uint64_t o,
+ ceph::buffer::list& bl,
+ uint64_t o0,
+ uint64_t len0,
+ bool _mark_unused,
+ bool _new_blob) {
+ writes.emplace_back(loffs,
+ b,
+ blob_len,
+ o,
+ bl,
+ o0,
+ len0,
+ _mark_unused,
+ _new_blob);
+ }
+ /// Checks for writes to the same pextent within a blob
+ bool has_conflict(
+ BlobRef b,
+ uint64_t loffs,
+ uint64_t loffs_end,
+ uint64_t min_alloc_size);
+ };
+
+ void _do_write_small(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ ceph::buffer::list::iterator& blp,
+ WriteContext *wctx);
+ void _do_write_big_apply_deferred(
+ TransContext* txc,
+ CollectionRef& c,
+ OnodeRef o,
+ BigDeferredWriteContext& dctx,
+ bufferlist::iterator& blp,
+ WriteContext* wctx);
+ void _do_write_big(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ ceph::buffer::list::iterator& blp,
+ WriteContext *wctx);
+ int _do_alloc_write(
+ TransContext *txc,
+ CollectionRef c,
+ OnodeRef o,
+ WriteContext *wctx);
+ void _wctx_finish(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ WriteContext *wctx,
+ std::set<SharedBlob*> *maybe_unshared_blobs=0);
+
+ int _write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len,
+ ceph::buffer::list& bl,
+ uint32_t fadvise_flags);
+ void _pad_zeros(ceph::buffer::list *bl, uint64_t *offset,
+ uint64_t chunk_size);
+
+ void _choose_write_options(CollectionRef& c,
+ OnodeRef o,
+ uint32_t fadvise_flags,
+ WriteContext *wctx);
+
+ int _do_gc(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ const WriteContext& wctx,
+ uint64_t *dirty_start,
+ uint64_t *dirty_end);
+
+ int _do_write(TransContext *txc,
+ CollectionRef &c,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ ceph::buffer::list& bl,
+ uint32_t fadvise_flags);
+ void _do_write_data(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ uint64_t length,
+ ceph::buffer::list& bl,
+ WriteContext *wctx);
+
+ int _touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ int _zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ void _do_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o,
+ uint64_t offset,
+ std::set<SharedBlob*> *maybe_unshared_blobs=0);
+ int _truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset);
+ int _remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef o);
+ int _setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& name,
+ ceph::buffer::ptr& val);
+ int _setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::map<std::string,ceph::buffer::ptr>& aset);
+ int _rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& name);
+ int _rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ void _do_omap_clear(TransContext *txc, OnodeRef &o);
+ int _omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ ceph::buffer::list& bl);
+ int _omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ ceph::buffer::list& header);
+ int _omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ ceph::buffer::list& bl);
+ int _omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& first, const std::string& last);
+ int _set_alloc_hint(
+ TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+ int _do_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo);
+ int _clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid);
+ int _create_collection(TransContext *txc, const coll_t &cid,
+ unsigned bits, CollectionRef *c);
+ int _remove_collection(TransContext *txc, const coll_t &cid,
+ CollectionRef *c);
+ void _do_remove_collection(TransContext *txc, CollectionRef *c);
+ int _split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem);
+ int _merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits);
+
+ void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
+ size_t extents);
+ void _record_allocation_stats();
+private:
+ uint64_t probe_count = 0;
+ std::atomic<uint64_t> alloc_stats_count = {0};
+ std::atomic<uint64_t> alloc_stats_fragments = { 0 };
+ std::atomic<uint64_t> alloc_stats_size = { 0 };
+ //
+ std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
+ { std::make_tuple(0ul, 0ul, 0ul) };
+
+ inline bool _use_rotational_settings();
+
+public:
+ typedef btree::btree_set<
+ uint64_t, std::less<uint64_t>,
+ mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
+
+ struct FSCK_ObjectCtx {
+ int64_t& errors;
+ int64_t& warnings;
+ uint64_t& num_objects;
+ uint64_t& num_extents;
+ uint64_t& num_blobs;
+ uint64_t& num_sharded_objects;
+ uint64_t& num_spanning_blobs;
+
+ mempool_dynamic_bitset* used_blocks;
+ uint64_t_btree_t* used_omap_head;
+
+ ceph::mutex* sb_info_lock;
+ sb_info_space_efficient_map_t& sb_info;
+ // approximate number of references per <shared blob, chunk>
+ shared_blob_2hash_tracker_t& sb_ref_counts;
+
+ store_statfs_t& expected_store_statfs;
+ per_pool_statfs& expected_pool_statfs;
+ BlueStoreRepairer* repairer;
+
+ FSCK_ObjectCtx(int64_t& e,
+ int64_t& w,
+ uint64_t& _num_objects,
+ uint64_t& _num_extents,
+ uint64_t& _num_blobs,
+ uint64_t& _num_sharded_objects,
+ uint64_t& _num_spanning_blobs,
+ mempool_dynamic_bitset* _ub,
+ uint64_t_btree_t* _used_omap_head,
+
+ ceph::mutex* _sb_info_lock,
+ sb_info_space_efficient_map_t& _sb_info,
+ shared_blob_2hash_tracker_t& _sb_ref_counts,
+ store_statfs_t& _store_statfs,
+ per_pool_statfs& _pool_statfs,
+ BlueStoreRepairer* _repairer) :
+ errors(e),
+ warnings(w),
+ num_objects(_num_objects),
+ num_extents(_num_extents),
+ num_blobs(_num_blobs),
+ num_sharded_objects(_num_sharded_objects),
+ num_spanning_blobs(_num_spanning_blobs),
+ used_blocks(_ub),
+ used_omap_head(_used_omap_head),
+ sb_info_lock(_sb_info_lock),
+ sb_info(_sb_info),
+ sb_ref_counts(_sb_ref_counts),
+ expected_store_statfs(_store_statfs),
+ expected_pool_statfs(_pool_statfs),
+ repairer(_repairer) {
+ }
+ };
+
+ OnodeRef fsck_check_objects_shallow(
+ FSCKDepth depth,
+ int64_t pool_id,
+ CollectionRef c,
+ const ghobject_t& oid,
+ const std::string& key,
+ const ceph::buffer::list& value,
+ mempool::bluestore_fsck::list<std::string>* expecting_shards,
+ std::map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+ const BlueStore::FSCK_ObjectCtx& ctx);
+
+private:
+ void _fsck_check_object_omap(FSCKDepth depth,
+ OnodeRef& o,
+ const BlueStore::FSCK_ObjectCtx& ctx);
+
+ void _fsck_check_objects(FSCKDepth depth,
+ FSCK_ObjectCtx& ctx);
+};
+
+inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) {
+ return out
+ << " allocated:"
+ << s.values[BlueStore::volatile_statfs::STATFS_ALLOCATED]
+ << " stored:"
+ << s.values[BlueStore::volatile_statfs::STATFS_STORED]
+ << " compressed:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED]
+ << " compressed_orig:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ORIGINAL]
+ << " compressed_alloc:"
+ << s.values[BlueStore::volatile_statfs::STATFS_COMPRESSED_ALLOCATED];
+}
+
+static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(BlueStore::Onode *o) {
+ o->put();
+}
+
+static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
+ o->put();
+}
+
+class BlueStoreRepairer
+{
+ ceph::mutex lock = ceph::make_mutex("BlueStore::BlueStoreRepairer::lock");
+
+public:
+ // to simplify future potential migration to mempools
+ using fsck_interval = interval_set<uint64_t>;
+
+ // Structure to track which pextents are used by a specific cid/oid.
+ // As with a Bloom filter, only positive and false-positive matches
+ // are possible.
+ // Maintains two lists of bloom filters, one for cids and one for oids,
+ // where each list entry is a BF covering a specific disk pextent range.
+ // The extent length covered by a single filter is computed on init.
+ // Allows filtering out 'uninteresting' pextents to speed up subsequent
+ // 'is_used' access.
+ struct StoreSpaceTracker {
+ const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+ const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+ const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
+ static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+ typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+ bloom_vector collections_bfs;
+ bloom_vector objects_bfs;
+
+ bool was_filtered_out = false;
+ uint64_t granularity = 0; // extent length for a single filter
+
+ StoreSpaceTracker() {
+ }
+ StoreSpaceTracker(const StoreSpaceTracker& from) :
+ collections_bfs(from.collections_bfs),
+ objects_bfs(from.objects_bfs),
+ granularity(from.granularity) {
+ }
+
+ void init(uint64_t total,
+ uint64_t min_alloc_size,
+ uint64_t mem_cap = DEF_MEM_CAP) {
+ ceph_assert(!granularity); // not initialized yet
+ ceph_assert(min_alloc_size && isp2(min_alloc_size));
+ ceph_assert(mem_cap);
+
+ total = round_up_to(total, min_alloc_size);
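+ // Two bloom filters (collections + objects) of BLOOM_FILTER_TABLE_SIZE
+ // bytes each are kept per granularity unit, hence the factor of 2 below;
+ // the unit is sized so that total filter memory stays roughly within mem_cap.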
+ granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+ if (!granularity) {
+ granularity = min_alloc_size;
+ } else {
+ granularity = round_up_to(granularity, min_alloc_size);
+ }
+
+ uint64_t entries = round_up_to(total, granularity) / granularity;
+ collections_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ objects_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ }
+ inline uint32_t get_hash(const coll_t& cid) const {
+ return cid.hash_to_shard(1);
+ }
+ inline void set_used(uint64_t offset, uint64_t len,
+ const coll_t& cid, const ghobject_t& oid) {
+ ceph_assert(granularity); // initialized
+
+ // can't call this func after filter_out has been applied
+ ceph_assert(!was_filtered_out);
+ if (!len) {
+ return;
+ }
+ auto pos = offset / granularity;
+ auto end_pos = (offset + len - 1) / granularity;
+ while (pos <= end_pos) {
+ collections_bfs[pos].insert(get_hash(cid));
+ objects_bfs[pos].insert(oid.hobj.get_hash());
+ ++pos;
+ }
+ }
+ // filter out entries unrelated to the specified (broken) extents.
+ // only 'is_used' calls are permitted after that
+ size_t filter_out(const fsck_interval& extents);
+
+ // determines if the collection is present after filtering-out
+ inline bool is_used(const coll_t& cid) const {
+ ceph_assert(was_filtered_out);
+ for(auto& bf : collections_bfs) {
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if the object is present after filtering-out
+ inline bool is_used(const ghobject_t& oid) const {
+ ceph_assert(was_filtered_out);
+ for(auto& bf : objects_bfs) {
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if the collection is present before filtering-out
+ inline bool is_used(const coll_t& cid, uint64_t offs) const {
+ ceph_assert(granularity); // initialized
+ ceph_assert(!was_filtered_out);
+ auto &bf = collections_bfs[offs / granularity];
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ return false;
+ }
+ // determines if the object is present before filtering-out
+ inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
+ ceph_assert(granularity); // initialized
+ ceph_assert(!was_filtered_out);
+ auto &bf = objects_bfs[offs / granularity];
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ return false;
+ }
+ };
+public:
+ void fix_per_pool_omap(KeyValueDB *db, int);
+ bool remove_key(KeyValueDB *db, const std::string& prefix, const std::string& key);
+ bool fix_shared_blob(KeyValueDB::Transaction txn,
+ uint64_t sbid,
+ bluestore_extent_ref_map_t* ref_map,
+ size_t repaired = 1);
+ bool fix_statfs(KeyValueDB *db, const std::string& key,
+ const store_statfs_t& new_statfs);
+
+ bool fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+ bool fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+ bool fix_spanning_blobs(
+ KeyValueDB* db,
+ std::function<void(KeyValueDB::Transaction)> f);
+
+ bool preprocess_misreference(KeyValueDB *db);
+
+ unsigned apply(KeyValueDB* db);
+
+ void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
+ std::lock_guard l(lock);
+ misreferenced_extents.union_insert(offs, len);
+ if (inc_error) {
+ ++to_repair_cnt;
+ }
+ }
+ //////////////////////
+ // In fact, the two methods below are the only thread-safe ones in this class!!
+ void inc_repaired(size_t n = 1) {
+ to_repair_cnt += n;
+ }
+ void request_compaction() {
+ need_compact = true;
+ }
+ //////////////////////
+
+ void init_space_usage_tracker(
+ uint64_t total_space, uint64_t lres_tracking_unit_size)
+ {
+ //NB: not for use in multithreading mode!!!
+ space_usage_tracker.init(total_space, lres_tracking_unit_size);
+ }
+ void set_space_used(uint64_t offset, uint64_t len,
+ const coll_t& cid, const ghobject_t& oid) {
+ std::lock_guard l(lock);
+ space_usage_tracker.set_used(offset, len, cid, oid);
+ }
+ inline bool is_used(const coll_t& cid) const {
+ //NB: not for use in multithreading mode!!!
+ return space_usage_tracker.is_used(cid);
+ }
+ inline bool is_used(const ghobject_t& oid) const {
+ //NB: not for use in multithreading mode!!!
+ return space_usage_tracker.is_used(oid);
+ }
+
+ const fsck_interval& get_misreferences() const {
+ //NB: not for use in multithreading mode!!!
+ return misreferenced_extents;
+ }
+ KeyValueDB::Transaction get_fix_misreferences_txn() {
+ //NB: not for use in multithreading mode!!!
+ return fix_misreferences_txn;
+ }
+
+private:
+ std::atomic<unsigned> to_repair_cnt = { 0 };
+ std::atomic<bool> need_compact = { false };
+ KeyValueDB::Transaction fix_per_pool_omap_txn;
+ KeyValueDB::Transaction fix_fm_leaked_txn;
+ KeyValueDB::Transaction fix_fm_false_free_txn;
+ KeyValueDB::Transaction remove_key_txn;
+ KeyValueDB::Transaction fix_statfs_txn;
+ KeyValueDB::Transaction fix_shared_blob_txn;
+
+ KeyValueDB::Transaction fix_misreferences_txn;
+ KeyValueDB::Transaction fix_onode_txn;
+
+ StoreSpaceTracker space_usage_tracker;
+
+ // non-shared extents with multiple references
+ fsck_interval misreferenced_extents;
+
+};
+
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
+ template <class T, size_t MaxX, size_t MaxY>
+ class matrix_2d {
+ T values[MaxX][MaxY];
+ public:
+ matrix_2d() {
+ clear();
+ }
+ T& at(size_t x, size_t y) {
+ ceph_assert(x < MaxX);
+ ceph_assert(y < MaxY);
+
+ return values[x][y];
+ }
+ size_t get_max_x() const {
+ return MaxX;
+ }
+ size_t get_max_y() const {
+ return MaxY;
+ }
+ void clear() {
+ memset(values, 0, sizeof(values));
+ }
+ };
+
+ enum {
+ // use 0/nullptr as unset indication
+ LEVEL_FIRST = 1,
+ LEVEL_LOG = LEVEL_FIRST, // BlueFS log
+ LEVEL_WAL,
+ LEVEL_DB,
+ LEVEL_SLOW,
+ LEVEL_MAX
+ };
+ // add +1 row for corresponding per-device totals
+ // add +1 column for per-level actual (taken from file size) total
+ typedef matrix_2d<uint64_t, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+ per_level_per_dev_usage_t per_level_per_dev_usage;
+ // file count per level, add +1 to keep total file count
+ uint64_t per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
+
+ // Note: maximum per-device totals below might be smaller than the corresponding
+ // perf counters by up to a single alloc unit (1M) due to the superblock extent.
+ // The latter is not accounted for here.
+ per_level_per_dev_usage_t per_level_per_dev_max;
+
+ uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
+ uint64_t db_avail4slow = 0;
+ enum {
+ OLD_POLICY,
+ USE_SOME_EXTRA
+ };
+
+public:
+ RocksDBBlueFSVolumeSelector(
+ uint64_t _wal_total,
+ uint64_t _db_total,
+ uint64_t _slow_total,
+ uint64_t _level0_size,
+ uint64_t _level_base,
+ uint64_t _level_multiplier,
+ double reserved_factor,
+ uint64_t reserved,
+ bool new_pol)
+ {
+ l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
+ l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+ l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+ l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+ if (!new_pol) {
+ return;
+ }
+
+ // Calculate how much extra space is available on the DB volume.
+ // Depending on whether an explicit reserved size is specified, it is either
+ // * DB volume size - reserved
+ // or
+ // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
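+ // Illustration with hypothetical sizes (not taken from any config):
+ // _db_total = 300, _level0_size = 1, _level_base = 1, _level_multiplier = 10,
+ // reserved_factor = 2. The thresholds computed below are 1+1+10*2 = 22,
+ // then 2+10+100*2 = 212, then 12+100+1000*2 = 2112; the loop stops at the
+ // first threshold that reaches or exceeds _db_total, so
+ // db_avail4slow = 300 - 212 = 88.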
+ if (!reserved) {
+ uint64_t prev_levels = _level0_size;
+ uint64_t cur_level = _level_base;
+ uint64_t cur_threshold = 0;
+ do {
+ uint64_t next_level = cur_level * _level_multiplier;
+ uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+ if (_db_total <= next_threshold) {
+ db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+ break;
+ } else {
+ prev_levels += cur_level;
+ cur_level = next_level;
+ cur_threshold = next_threshold;
+ }
+ } while (true);
+ } else {
+ db_avail4slow = _db_total - reserved;
+ }
+ }
+
+ void* get_hint_for_log() const override {
+ return reinterpret_cast<void*>(LEVEL_LOG);
+ }
+ void* get_hint_by_dir(std::string_view dirname) const override;
+
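+ // The void* hints produced by get_hint_for_log()/get_hint_by_dir() are just
+ // LEVEL_* values stuffed into a pointer: nullptr means "no level assigned"
+ // and is skipped, anything else is converted back to a usage-matrix index
+ // by subtracting LEVEL_FIRST.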
+ void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ auto& max = per_level_per_dev_max.at(p.bdev, pos);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ {
+ //update per-device totals
+ auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ cur += p.length;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ }
+ {
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fnode.size;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ ++per_level_files[pos];
+ ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
+ void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ for (auto& p : fnode.extents) {
+ auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+ ceph_assert(cur >= p.length);
+ cur -= p.length;
+
+ //update per-device totals
+ auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+ ceph_assert(cur2 >= p.length);
+ cur2 -= p.length;
+ }
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fnode.size);
+ cur -= fnode.size;
+ ceph_assert(per_level_files[pos] > 0);
+ --per_level_files[pos];
+ ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+ --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+ }
+ void add_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
+ cur += fsize;
+ if (cur > max) {
+ max = cur;
+ }
+ }
+ void sub_usage(void* hint, uint64_t fsize) override {
+ if (hint == nullptr)
+ return;
+ size_t pos = (size_t)hint - LEVEL_FIRST;
+ //update per-level actual totals
+ auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+ ceph_assert(cur >= fsize);
+ per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos) -= fsize;
+ }
+
+ uint8_t select_prefer_bdev(void* h) override;
+ void get_paths(
+ const std::string& base,
+ BlueFSVolumeSelector::paths& res) const override;
+
+ void dump(std::ostream& sout) override;
+};
+
+#endif
diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc
new file mode 100644
index 000000000..5907df443
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.cc
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FreelistManager.h"
+#include "BitmapFreelistManager.h"
+#ifdef HAVE_LIBZBD
+#include "ZonedFreelistManager.h"
+#endif
+
+FreelistManager *FreelistManager::create(
+ CephContext* cct,
+ std::string type,
+ std::string prefix)
+{
+ // a bit of a hack... we hard-code the prefixes here. we need to
+ // put the freelistmanagers in different prefixes because the merge
+ // op is per prefix, has to be done pre-db-open, and we don't know the
+ // freelist type until after we open the db.
+ ceph_assert(prefix == "B");
+ if (type == "bitmap")
+ return new BitmapFreelistManager(cct, "B", "b");
+
+#ifdef HAVE_LIBZBD
+ // With zoned drives there is only one FreelistManager implementation that we
+ // can use, and we also know if a drive is zoned right after opening it
+ // (BlueStore::_open_bdev). Hence, we set freelist_type to "zoned" whenever
+ // we open the device and it turns out to be zoned. We ignore |prefix|
+ // passed to create and use the prefixes defined for zoned devices at the top
+ // of BlueStore.cc.
+ if (type == "zoned")
+ return new ZonedFreelistManager(cct, "Z", "z");
+#endif
+
+ return NULL;
+}
+
+void FreelistManager::setup_merge_operators(KeyValueDB *db,
+ const std::string& type)
+{
+#ifdef HAVE_LIBZBD
+ if (type == "zoned")
+ ZonedFreelistManager::setup_merge_operator(db, "z");
+ else
+#endif
+ BitmapFreelistManager::setup_merge_operator(db, "b");
+}
diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h
new file mode 100644
index 000000000..1aaff89ea
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+
+#include <string>
+#include <vector>
+#include <mutex>
+#include <ostream>
+#include "kv/KeyValueDB.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+
+class FreelistManager {
+public:
+ CephContext* cct;
+ FreelistManager(CephContext* cct) : cct(cct) {}
+ virtual ~FreelistManager() {}
+
+ static FreelistManager *create(
+ CephContext* cct,
+ std::string type,
+ std::string prefix);
+
+ static void setup_merge_operators(KeyValueDB *db, const std::string &type);
+
+ virtual int create(uint64_t size, uint64_t granularity,
+ KeyValueDB::Transaction txn) = 0;
+
+ virtual int init(KeyValueDB *kvdb, bool db_in_read_only,
+ std::function<int(const std::string&, std::string*)> cfg_reader) = 0;
+ virtual void sync(KeyValueDB* kvdb) = 0;
+ virtual void shutdown() = 0;
+
+ virtual void dump(KeyValueDB *kvdb) = 0;
+
+ virtual void enumerate_reset() = 0;
+ virtual bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) = 0;
+
+ virtual void allocate(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) = 0;
+ virtual void release(
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn) = 0;
+
+ virtual uint64_t get_size() const = 0;
+ virtual uint64_t get_alloc_units() const = 0;
+ virtual uint64_t get_alloc_size() const = 0;
+
+ virtual void get_meta(uint64_t target_size,
+ std::vector<std::pair<std::string, std::string>>*) const = 0;
+
+ virtual std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const {
+ return {};
+ }
+};
+
+
+#endif
diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc
new file mode 100644
index 000000000..b78a99ffe
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "HybridAllocator.h"
+
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "HybridAllocator "
+
+
+int64_t HybridAllocator::allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector* extents)
+{
+ ldout(cct, 10) << __func__ << std::hex
+ << " want 0x" << want
+ << " unit 0x" << unit
+ << " max_alloc_size 0x" << max_alloc_size
+ << " hint 0x" << hint
+ << std::dec << dendl;
+ ceph_assert(isp2(unit));
+ ceph_assert(want % unit == 0);
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want;
+ }
+ if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+ max_alloc_size >= cap) {
+ max_alloc_size = p2align(uint64_t(cap), (uint64_t)get_block_size());
+ }
+
+ std::lock_guard l(lock);
+
+ int64_t res;
+ PExtentVector local_extents;
+
+ // preserve original 'extents' vector state
+ auto orig_size = extents->size();
+ auto orig_pos = extents->end();
+ if (orig_size) {
+ --orig_pos;
+ }
+
+ // try the bitmap first to avoid needlessly splitting contiguous extents
+ // when the desired amount is less than the shortest range in the AVL tree
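+ // In either ordering, a hard failure (res < 0) from the first allocator
+ // releases whatever it had just handed out -- while preserving any extents
+ // the caller already had in 'extents' -- and the other allocator is then
+ // asked for the remainder.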
+ if (bmap_alloc && bmap_alloc->get_free() &&
+ want < _lowest_size_available()) {
+ res = bmap_alloc->allocate(want, unit, max_alloc_size, hint, extents);
+ if (res < 0) {
+ // got a failure, release already allocated and
+ // start over allocation from avl
+ if (orig_size) {
+ local_extents.insert(
+ local_extents.end(), ++orig_pos, extents->end());
+ extents->resize(orig_size);
+ } else {
+ extents->swap(local_extents);
+ }
+ bmap_alloc->release(local_extents);
+ res = 0;
+ }
+ if ((uint64_t)res < want) {
+ auto res2 = _allocate(want - res, unit, max_alloc_size, hint, extents);
+ if (res2 < 0) {
+ res = res2; // caller to do the release
+ } else {
+ res += res2;
+ }
+ }
+ } else {
+ res = _allocate(want, unit, max_alloc_size, hint, extents);
+ if (res < 0) {
+ // got a failure, release already allocated and
+ // start over allocation from bitmap
+ if (orig_size) {
+ local_extents.insert(
+ local_extents.end(), ++orig_pos, extents->end());
+ extents->resize(orig_size);
+ } else {
+ extents->swap(local_extents);
+ }
+ _release(local_extents);
+ res = 0;
+ }
+ if ((uint64_t)res < want) {
+ auto res2 = bmap_alloc ?
+ bmap_alloc->allocate(want - res, unit, max_alloc_size, hint, extents) :
+ 0;
+ if (res2 < 0) {
+ res = res2; // caller to do the release
+ } else {
+ res += res2;
+ }
+ }
+ }
+ return res ? res : -ENOSPC;
+}
+
+void HybridAllocator::release(const interval_set<uint64_t>& release_set) {
+ std::lock_guard l(lock);
+ // this will attempt to put free ranges into AvlAllocator first and
+ // fallback to bitmap one via _try_insert_range call
+ _release(release_set);
+}
+
+uint64_t HybridAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return (bmap_alloc ? bmap_alloc->get_free() : 0) + _get_free();
+}
+
+double HybridAllocator::get_fragmentation()
+{
+ std::lock_guard l(lock);
+ auto f = AvlAllocator::_get_fragmentation();
+ auto bmap_free = bmap_alloc ? bmap_alloc->get_free() : 0;
+ if (bmap_free) {
+ auto _free = _get_free() + bmap_free;
+ auto bf = bmap_alloc->get_fragmentation();
+
+ f = f * _get_free() / _free + bf * bmap_free / _free;
+ }
+ return f;
+}
+
+void HybridAllocator::dump()
+{
+ std::lock_guard l(lock);
+ AvlAllocator::_dump();
+ if (bmap_alloc) {
+ bmap_alloc->dump();
+ }
+ ldout(cct, 0) << __func__
+ << " avl_free: " << _get_free()
+ << " bmap_free: " << (bmap_alloc ? bmap_alloc->get_free() : 0)
+ << dendl;
+}
+
+void HybridAllocator::foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ std::lock_guard l(lock);
+ AvlAllocator::_foreach(notify);
+ if (bmap_alloc) {
+ bmap_alloc->foreach(notify);
+ }
+}
+
+void HybridAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << std::hex
+ << " offset 0x" << offset
+ << " length 0x" << length
+ << std::dec << dendl;
+ _try_remove_from_tree(offset, length,
+ [&](uint64_t o, uint64_t l, bool found) {
+ if (!found) {
+ if (bmap_alloc) {
+ bmap_alloc->init_rm_free(o, l);
+ } else {
+ lderr(cct) << "init_rm_free lambda" << std::hex
+ << "Uexpected extent: "
+ << " 0x" << o << "~" << l
+ << std::dec << dendl;
+ ceph_assert(false);
+ }
+ }
+ });
+}
+
+void HybridAllocator::shutdown()
+{
+ std::lock_guard l(lock);
+ _shutdown();
+ if (bmap_alloc) {
+ bmap_alloc->shutdown();
+ delete bmap_alloc;
+ bmap_alloc = nullptr;
+ }
+}
+
+void HybridAllocator::_spillover_range(uint64_t start, uint64_t end)
+{
+ auto size = end - start;
+ dout(20) << __func__
+ << std::hex << " "
+ << start << "~" << size
+ << std::dec
+ << dendl;
+ ceph_assert(size);
+ if (!bmap_alloc) {
+ dout(1) << __func__
+ << std::hex
+ << " constructing fallback allocator"
+ << dendl;
+ bmap_alloc = new BitmapAllocator(cct,
+ get_capacity(),
+ get_block_size(),
+ get_name() + ".fallback");
+ }
+ bmap_alloc->init_add_free(start, size);
+}
+
+void HybridAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
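+ // Before handing the range back to the AVL tree, claim any free space the
+ // bitmap allocator holds immediately to the left/right of it so adjacent
+ // free ranges end up coalesced in a single allocator.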
+ if (bmap_alloc) {
+ uint64_t head = bmap_alloc->claim_free_to_left(start);
+ uint64_t tail = bmap_alloc->claim_free_to_right(start + size);
+ ceph_assert(head <= start);
+ start -= head;
+ size += head + tail;
+ }
+ AvlAllocator::_add_to_tree(start, size);
+}
diff --git a/src/os/bluestore/HybridAllocator.h b/src/os/bluestore/HybridAllocator.h
new file mode 100644
index 000000000..d60fc1a31
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+
+#include "AvlAllocator.h"
+#include "BitmapAllocator.h"
+
+class HybridAllocator : public AvlAllocator {
+ BitmapAllocator* bmap_alloc = nullptr;
+public:
+ HybridAllocator(CephContext* cct, int64_t device_size, int64_t _block_size,
+ uint64_t max_mem,
+ const std::string& name) :
+ AvlAllocator(cct, device_size, _block_size, max_mem, name) {
+ }
+ const char* get_type() const override
+ {
+ return "hybrid";
+ }
+ int64_t allocate(
+ uint64_t want,
+ uint64_t unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents) override;
+ void release(const interval_set<uint64_t>& release_set) override;
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+ void shutdown() override;
+
+protected:
+ // intended primarily for UT
+ BitmapAllocator* get_bmap() {
+ return bmap_alloc;
+ }
+ const BitmapAllocator* get_bmap() const {
+ return bmap_alloc;
+ }
+private:
+
+ void _spillover_range(uint64_t start, uint64_t end) override;
+
+ // called when extent to be released/marked free
+ void _add_to_tree(uint64_t start, uint64_t size) override;
+};
diff --git a/src/os/bluestore/StupidAllocator.cc b/src/os/bluestore/StupidAllocator.cc
new file mode 100644
index 000000000..805a51fb0
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.cc
@@ -0,0 +1,371 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StupidAllocator.h"
+#include "bluestore_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "stupidalloc 0x" << this << " "
+
+StupidAllocator::StupidAllocator(CephContext* cct,
+ const std::string& name,
+ int64_t _size,
+ int64_t _block_size)
+ : Allocator(name, _size, _block_size), cct(cct), num_free(0),
+ free(10)
+{
+ ceph_assert(cct != nullptr);
+ bdev_block_size = cct->_conf->bdev_block_size;
+}
+
+StupidAllocator::~StupidAllocator()
+{
+}
+
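+// Map a free extent to a bin: the bin index is the bit width (roughly log2)
+// of the extent length expressed in bdev blocks, clamped to the last bin, so
+// larger free extents land in higher-numbered bins.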
+unsigned StupidAllocator::_choose_bin(uint64_t orig_len)
+{
+ ceph_assert(bdev_block_size > 0);
+ uint64_t len = orig_len / bdev_block_size;
+ int bin = std::min((int)cbits(len), (int)free.size() - 1);
+ ldout(cct, 30) << __func__ << " len 0x" << std::hex << orig_len
+ << std::dec << " -> " << bin << dendl;
+ return bin;
+}
+
+void StupidAllocator::_insert_free(uint64_t off, uint64_t len)
+{
+ unsigned bin = _choose_bin(len);
+ ldout(cct, 30) << __func__ << " 0x" << std::hex << off << "~" << len
+ << std::dec << " in bin " << bin << dendl;
+ while (true) {
+ free[bin].insert(off, len, &off, &len);
+ unsigned newbin = _choose_bin(len);
+ if (newbin == bin)
+ break;
+ ldout(cct, 30) << __func__ << " promoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ bin = newbin;
+ }
+}
+
+/// return the effective length of the extent if we align to alloc_unit
+uint64_t StupidAllocator::_aligned_len(
+ StupidAllocator::interval_set_t::iterator p,
+ uint64_t alloc_unit)
+{
+ uint64_t skew = p.get_start() % alloc_unit;
+ if (skew)
+ skew = alloc_unit - skew;
+ if (skew > p.get_len())
+ return 0;
+ else
+ return p.get_len() - skew;
+}
+
+int64_t StupidAllocator::allocate_int(
+ uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+ uint64_t *offset, uint32_t *length)
+{
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " want_size 0x" << std::hex << want_size
+ << " alloc_unit 0x" << alloc_unit
+ << " hint 0x" << hint << std::dec
+ << dendl;
+ uint64_t want = std::max(alloc_unit, want_size);
+ int bin = _choose_bin(want);
+ int orig_bin = bin;
+
+ auto p = free[0].begin();
+
+ if (!hint)
+ hint = last_alloc;
+
+ // search up (from hint)
+ if (hint) {
+ for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+ p = free[bin].lower_bound(hint);
+ while (p != free[bin].end()) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+ }
+
+ // search up (from origin, skipping extents already searched via the hint)
+ for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+ p = free[bin].begin();
+ auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+ while (p != end) {
+ if (_aligned_len(p, alloc_unit) >= want_size) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+
+ // search down (from hint)
+ if (hint) {
+ for (bin = orig_bin; bin >= 0; --bin) {
+ p = free[bin].lower_bound(hint);
+ while (p != free[bin].end()) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+ }
+
+ // search down (from origin, skipping extents already searched via the hint)
+ for (bin = orig_bin; bin >= 0; --bin) {
+ p = free[bin].begin();
+ auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+ while (p != end) {
+ if (_aligned_len(p, alloc_unit) >= alloc_unit) {
+ goto found;
+ }
+ ++p;
+ }
+ }
+
+ return -ENOSPC;
+
+ found:
+ uint64_t skew = p.get_start() % alloc_unit;
+ if (skew)
+ skew = alloc_unit - skew;
+ *offset = p.get_start() + skew;
+ *length = std::min(std::max(alloc_unit, want_size), p2align((p.get_len() - skew), alloc_unit));
+ if (cct->_conf->bluestore_debug_small_allocations) {
+ uint64_t max =
+ alloc_unit * (rand() % cct->_conf->bluestore_debug_small_allocations);
+ if (max && *length > max) {
+ ldout(cct, 10) << __func__ << " shortening allocation of 0x" << std::hex
+ << *length << " -> 0x"
+ << max << " due to debug_small_allocations" << std::dec
+ << dendl;
+ *length = max;
+ }
+ }
+ ldout(cct, 30) << __func__ << " got 0x" << std::hex << *offset << "~" << *length
+ << " from bin " << std::dec << bin << dendl;
+
+ free[bin].erase(*offset, *length);
+ uint64_t off, len;
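+ // the erase above may have shrunk the extents adjacent to the allocated
+ // range; if their remaining length now maps to a smaller bin, move them
+ // there so later searches find them in the right place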
+ if (*offset && free[bin].contains(*offset - skew - 1, &off, &len)) {
+ int newbin = _choose_bin(len);
+ if (newbin != bin) {
+ ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ _insert_free(off, len);
+ }
+ }
+ if (free[bin].contains(*offset + *length, &off, &len)) {
+ int newbin = _choose_bin(len);
+ if (newbin != bin) {
+ ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ free[bin].erase(off, len);
+ _insert_free(off, len);
+ }
+ }
+
+ num_free -= *length;
+ ceph_assert(num_free >= 0);
+ last_alloc = *offset + *length;
+ return 0;
+}
+
+int64_t StupidAllocator::allocate(
+ uint64_t want_size,
+ uint64_t alloc_unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents)
+{
+ uint64_t allocated_size = 0;
+ uint64_t offset = 0;
+ uint32_t length = 0;
+ int res = 0;
+
+ if (max_alloc_size == 0) {
+ max_alloc_size = want_size;
+ }
+ // cap with 32-bit val
+ max_alloc_size = std::min(max_alloc_size, 0x10000000 - alloc_unit);
+
+ while (allocated_size < want_size) {
+ res = allocate_int(std::min(max_alloc_size, (want_size - allocated_size)),
+ alloc_unit, hint, &offset, &length);
+ if (res != 0) {
+ /*
+ * Allocation failed.
+ */
+ break;
+ }
+ bool can_append = true;
+ if (!extents->empty()) {
+ bluestore_pextent_t &last_extent = extents->back();
+ if (last_extent.end() == offset) {
+ uint64_t l64 = last_extent.length;
+ l64 += length;
+ if (l64 < 0x100000000 && l64 <= max_alloc_size) {
+ can_append = false;
+ last_extent.length += length;
+ }
+ }
+ }
+ if (can_append) {
+ extents->emplace_back(bluestore_pextent_t(offset, length));
+ }
+
+ allocated_size += length;
+ hint = offset + length;
+ }
+
+ if (allocated_size == 0) {
+ return -ENOSPC;
+ }
+ return allocated_size;
+}
+
+void StupidAllocator::release(
+ const interval_set<uint64_t>& release_set)
+{
+ std::lock_guard l(lock);
+ for (interval_set<uint64_t>::const_iterator p = release_set.begin();
+ p != release_set.end();
+ ++p) {
+ const auto offset = p.get_start();
+ const auto length = p.get_len();
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _insert_free(offset, length);
+ num_free += length;
+ }
+}
+
+uint64_t StupidAllocator::get_free()
+{
+ std::lock_guard l(lock);
+ return num_free;
+}
+
+double StupidAllocator::get_fragmentation()
+{
+ ceph_assert(get_block_size());
+ double res;
+ uint64_t max_intervals = 0;
+ uint64_t intervals = 0;
+ {
+ std::lock_guard l(lock);
+ max_intervals = p2roundup<uint64_t>(num_free,
+ get_block_size()) / get_block_size();
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ intervals += free[bin].num_intervals();
+ }
+ }
+ ldout(cct, 30) << __func__ << " " << intervals << "/" << max_intervals
+ << dendl;
+ ceph_assert(intervals <= max_intervals);
+ if (!intervals || max_intervals <= 1) {
+ return 0.0;
+ }
+ intervals--;
+ max_intervals--;
+ res = (double)intervals / max_intervals;
+ return res;
+}
+
+void StupidAllocator::dump()
+{
+ std::lock_guard l(lock);
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ ldout(cct, 0) << __func__ << " free bin " << bin << ": "
+ << free[bin].num_intervals() << " extents" << dendl;
+ for (auto p = free[bin].begin();
+ p != free[bin].end();
+ ++p) {
+ ldout(cct, 0) << __func__ << " 0x" << std::hex << p.get_start() << "~"
+ << p.get_len() << std::dec << dendl;
+ }
+ }
+}
+
+void StupidAllocator::foreach(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ std::lock_guard l(lock);
+ for (unsigned bin = 0; bin < free.size(); ++bin) {
+ for (auto p = free[bin].begin(); p != free[bin].end(); ++p) {
+ notify(p.get_start(), p.get_len());
+ }
+ }
+}
+
+void StupidAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ _insert_free(offset, length);
+ num_free += length;
+}
+
+void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+ if (!length)
+ return;
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ interval_set_t rm;
+ rm.insert(offset, length);
+ for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
+ interval_set_t overlap;
+ overlap.intersection_of(rm, free[i]);
+ if (!overlap.empty()) {
+ ldout(cct, 20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
+ << std::dec << dendl;
+ auto it = overlap.begin();
+ auto it_end = overlap.end();
+ while (it != it_end) {
+ auto o = it.get_start();
+ auto l = it.get_len();
+
+ free[i].erase(o, l,
+ [&](uint64_t off, uint64_t len) {
+ unsigned newbin = _choose_bin(len);
+ if (newbin != i) {
+ ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
+ << std::dec << " to bin " << newbin << dendl;
+ _insert_free(off, len);
+ return true;
+ }
+ return false;
+ });
+ ++it;
+ }
+
+ rm.subtract(overlap);
+ }
+ }
+ ceph_assert(rm.empty());
+ num_free -= length;
+ ceph_assert(num_free >= 0);
+}
+
+
+void StupidAllocator::shutdown()
+{
+ ldout(cct, 1) << __func__ << dendl;
+}
+
diff --git a/src/os/bluestore/StupidAllocator.h b/src/os/bluestore/StupidAllocator.h
new file mode 100644
index 000000000..e1fc101e0
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+#define CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+class StupidAllocator : public Allocator {
+ CephContext* cct;
+ ceph::mutex lock = ceph::make_mutex("StupidAllocator::lock");
+
+ int64_t num_free; ///< total bytes in freelist
+ uint64_t bdev_block_size;
+
+ template <typename K, typename V> using allocator_t =
+ mempool::bluestore_alloc::pool_allocator<std::pair<const K, V>>;
+ template <typename K, typename V> using btree_map_t =
+ btree::btree_map<K, V, std::less<K>, allocator_t<K, V>>;
+ using interval_set_t = interval_set<uint64_t, btree_map_t>;
+ std::vector<interval_set_t> free; ///< leading-edge copy
+
+ uint64_t last_alloc = 0;
+
+ unsigned _choose_bin(uint64_t len);
+ void _insert_free(uint64_t offset, uint64_t len);
+
+ uint64_t _aligned_len(
+ interval_set_t::iterator p,
+ uint64_t alloc_unit);
+
+public:
+ StupidAllocator(CephContext* cct,
+ const std::string& name,
+ int64_t size,
+ int64_t block_size);
+ ~StupidAllocator() override;
+ const char* get_type() const override
+ {
+ return "stupid";
+ }
+
+ int64_t allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents) override;
+
+ int64_t allocate_int(
+ uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+ uint64_t *offset, uint32_t *length);
+
+ void release(
+ const interval_set<uint64_t>& release_set) override;
+
+ uint64_t get_free() override;
+ double get_fragmentation() override;
+
+ void dump() override;
+ void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc
new file mode 100644
index 000000000..71f0e6e89
--- /dev/null
+++ b/src/os/bluestore/ZonedAllocator.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A simple allocator that just hands out space from the next empty zone. This
+// is temporary, just to get the simplest append-only write workload to work.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#include "ZonedAllocator.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "ZonedAllocator " << this << " "
+
+ZonedAllocator::ZonedAllocator(CephContext* cct,
+ int64_t size,
+ int64_t block_size,
+ const std::string& name)
+ : Allocator(name, size, block_size),
+ cct(cct),
+ num_free(0),
+ size(size),
+ // To avoid interface changes, we piggyback the zone size and the first
+ // sequential zone number onto the upper 32 bits of the 64-bit |block_size|.
+ // The lower 32 bits of |block_size| hold the actual block size.
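+ // Purely illustrative example of the packing (values are hypothetical):
+ // (524ULL << 48) | (256ULL << 32) | 4096 decodes below to a 4096-byte
+ // block size, a 256 MiB zone size and first sequential zone 524.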
+ block_size((block_size & 0x00000000ffffffff)),
+ zone_size(((block_size & 0x0000ffff00000000) >> 32) * 1024 * 1024),
+ starting_zone_num((block_size & 0xffff000000000000) >> 48),
+ num_zones(size / zone_size) {
+ ldout(cct, 10) << __func__ << " size 0x" << std::hex << size
+ << " zone size 0x" << zone_size << std::dec
+ << " number of zones " << num_zones
+ << " first sequential zone " << starting_zone_num
+ << dendl;
+ ceph_assert(size % zone_size == 0);
+}
+
+ZonedAllocator::~ZonedAllocator() {}
+
+int64_t ZonedAllocator::allocate(
+ uint64_t want_size,
+ uint64_t alloc_unit,
+ uint64_t max_alloc_size,
+ int64_t hint,
+ PExtentVector *extents) {
+ std::lock_guard l(lock);
+
+ ceph_assert(want_size % 4096 == 0);
+
+ ldout(cct, 10) << __func__ << " trying to allocate "
+ << std::hex << want_size << dendl;
+
+ uint64_t zone_num = starting_zone_num;
+ for ( ; zone_num < num_zones; ++zone_num) {
+ if (fits(want_size, zone_num)) {
+ break;
+ }
+ ldout(cct, 10) << __func__ << " skipping zone " << zone_num
+ << " because there is not enough space: "
+ << " want_size = " << want_size
+ << " available = " << get_remaining_space(zone_num)
+ << dendl;
+ }
+
+ if (zone_num == num_zones) {
+ ldout(cct, 10) << __func__ << " failed to allocate" << dendl;
+ return -ENOSPC;
+ }
+
+ uint64_t offset = get_offset(zone_num);
+
+ ldout(cct, 10) << __func__ << " advancing zone " << std::hex
+ << zone_num << " write pointer from " << offset
+ << " to " << offset + want_size << dendl;
+
+ advance_write_pointer(zone_num, want_size);
+ if (get_remaining_space(zone_num) == 0) {
+ starting_zone_num = zone_num + 1;
+ }
+
+ ldout(cct, 10) << __func__ << std::hex << " zone " << zone_num
+ << " offset is now " << get_write_pointer(zone_num) << dendl;
+
+ ldout(cct, 10) << __func__ << " allocated " << std::hex << want_size
+ << " bytes at offset " << offset
+ << " located at zone " << zone_num
+ << " and zone offset " << offset % zone_size << dendl;
+
+ extents->emplace_back(bluestore_pextent_t(offset, want_size));
+ return want_size;
+}
+
+void ZonedAllocator::release(const interval_set<uint64_t>& release_set) {
+ std::lock_guard l(lock);
+}
+
+uint64_t ZonedAllocator::get_free() {
+ return num_free;
+}
+
+void ZonedAllocator::dump() {
+ std::lock_guard l(lock);
+}
+
+void ZonedAllocator::foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ std::lock_guard l(lock);
+}
+
+// This just increments |num_free|. The actual free space is added by
+// zoned_set_zone_states, as it updates the write pointer for each zone.
+void ZonedAllocator::init_add_free(uint64_t offset, uint64_t length) {
+ ldout(cct, 40) << __func__ << " " << std::hex
+ << offset << "~" << length << dendl;
+
+ num_free += length;
+}
+
+void ZonedAllocator::init_rm_free(uint64_t offset, uint64_t length) {
+ std::lock_guard l(lock);
+ ldout(cct, 40) << __func__ << " 0x" << std::hex
+ << offset << "~" << length << dendl;
+
+ num_free -= length;
+ ceph_assert(num_free >= 0);
+
+ uint64_t zone_num = offset / zone_size;
+ uint64_t write_pointer = offset % zone_size;
+ uint64_t remaining_space = get_remaining_space(zone_num);
+
+ ceph_assert(get_write_pointer(zone_num) == write_pointer);
+ ceph_assert(remaining_space <= length);
+ advance_write_pointer(zone_num, remaining_space);
+
+ ldout(cct, 40) << __func__ << " set zone 0x" << std::hex
+ << zone_num << " write pointer to 0x" << zone_size << dendl;
+
+ length -= remaining_space;
+ ceph_assert(length % zone_size == 0);
+
+ for ( ; length; length -= zone_size) {
+ advance_write_pointer(++zone_num, zone_size);
+ ldout(cct, 40) << __func__ << " set zone 0x" << std::hex
+ << zone_num << " write pointer to 0x" << zone_size << dendl;
+ }
+}
+
+bool ZonedAllocator::zoned_get_zones_to_clean(std::deque<uint64_t> *zones_to_clean) {
+ // TODO: make 0.25 tunable
+ if (static_cast<double>(num_free) / size > 0.25) {
+ return false;
+ }
+ {
+ std::lock_guard l(lock);
+ // TODO: populate |zones_to_clean| with the numbers of zones that should be
+ // cleaned.
+ }
+ return true;
+}
+
+void ZonedAllocator::zoned_set_zone_states(std::vector<zone_state_t> &&_zone_states) {
+ std::lock_guard l(lock);
+ ldout(cct, 10) << __func__ << dendl;
+ zone_states = std::move(_zone_states);
+}
+
+void ZonedAllocator::shutdown() {
+ ldout(cct, 1) << __func__ << dendl;
+}
diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h
new file mode 100644
index 000000000..47cb46be7
--- /dev/null
+++ b/src/os/bluestore/ZonedAllocator.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A simple allocator that just hands out space from the next empty zone. This
+// is temporary, just to get the simplest append-only write workload to work.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#ifndef CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
+#define CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "common/ceph_mutex.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
+#include "include/mempool.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+
+class ZonedAllocator : public Allocator {
+ CephContext* cct;
+
+ // Currently only one thread at a time calls into ZonedAllocator due to
+ // atomic_alloc_and_submit_lock in BlueStore.cc, but we do locking anyway
+ // because eventually ZONE_APPEND support will land and
+ // atomic_alloc_and_submit_lock will be removed.
+ ceph::mutex lock = ceph::make_mutex("ZonedAllocator::lock");
+
+ std::atomic<int64_t> num_free; ///< total bytes in freelist
+ uint64_t size;
+ uint64_t block_size;
+ uint64_t zone_size;
+ uint64_t starting_zone_num;
+ uint64_t num_zones;
+ std::vector<zone_state_t> zone_states;
+
+ inline uint64_t get_offset(uint64_t zone_num) const {
+ return zone_num * zone_size + get_write_pointer(zone_num);
+ }
+
+ inline uint64_t get_write_pointer(uint64_t zone_num) const {
+ return zone_states[zone_num].get_write_pointer();
+ }
+
+ inline uint64_t get_remaining_space(uint64_t zone_num) const {
+ return zone_size - get_write_pointer(zone_num);
+ }
+
+ inline void advance_write_pointer(uint64_t zone_num, uint64_t want_size) {
+ zone_states[zone_num].increment_write_pointer(want_size);
+ }
+
+ inline bool fits(uint64_t want_size, uint64_t zone_num) const {
+ return want_size <= get_remaining_space(zone_num);
+ }
+
+public:
+ ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size,
+ const std::string& name);
+ ~ZonedAllocator() override;
+
+ const char *get_type() const override {
+ return "zoned";
+ }
+
+ int64_t allocate(
+ uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+ int64_t hint, PExtentVector *extents) override;
+
+ void release(const interval_set<uint64_t>& release_set) override;
+
+ uint64_t get_free() override;
+
+ void dump() override;
+ void foreach(
+ std::function<void(uint64_t offset, uint64_t length)> notify) override;
+
+ void zoned_set_zone_states(std::vector<zone_state_t> &&_zone_states) override;
+ bool zoned_get_zones_to_clean(std::deque<uint64_t> *zones_to_clean) override;
+
+ void init_add_free(uint64_t offset, uint64_t length) override;
+ void init_rm_free(uint64_t offset, uint64_t length) override;
+
+ void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc
new file mode 100644
index 000000000..b135ee524
--- /dev/null
+++ b/src/os/bluestore/ZonedFreelistManager.cc
@@ -0,0 +1,315 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A freelist manager for zoned devices. This iteration just keeps the write
+// pointer per zone. Following iterations will add enough information to enable
+// cleaning of zones.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#include "ZonedFreelistManager.h"
+#include "bluestore_common.h"
+#include "include/stringify.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+#include "zoned_types.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zoned freelist "
+
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+
+void ZonedFreelistManager::write_zone_state_to_db(
+ uint64_t zone_num,
+ const zone_state_t &zone_state,
+ KeyValueDB::Transaction txn) {
+ string key;
+ _key_encode_u64(zone_num, &key);
+ bufferlist bl;
+ zone_state.encode(bl);
+ txn->merge(info_prefix, key, bl);
+}
+
+void ZonedFreelistManager::load_zone_state_from_db(
+ uint64_t zone_num,
+ zone_state_t &zone_state,
+ KeyValueDB::Iterator& it) const {
+ string k = it->key();
+ uint64_t zone_num_from_db;
+ _key_decode_u64(k.c_str(), &zone_num_from_db);
+ ceph_assert(zone_num_from_db == zone_num);
+
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ zone_state.decode(p);
+}
+
+void ZonedFreelistManager::init_zone_states(KeyValueDB::Transaction txn) {
+ dout(10) << __func__ << dendl;
+ for (uint64_t zone_num = 0; zone_num < num_zones; ++zone_num) {
+ zone_state_t zone_state;
+ write_zone_state_to_db(zone_num, zone_state, txn);
+ }
+}
+
+void ZonedFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix) {
+ std::shared_ptr<Int64ArrayMergeOperator> merge_op(
+ new Int64ArrayMergeOperator);
+ db->set_merge_operator(prefix, merge_op);
+}
+
+ZonedFreelistManager::ZonedFreelistManager(
+ CephContext* cct,
+ string meta_prefix,
+ string info_prefix)
+ : FreelistManager(cct),
+ meta_prefix(meta_prefix),
+ info_prefix(info_prefix),
+ enumerate_zone_num(~0UL) {}
+
+int ZonedFreelistManager::create(
+ uint64_t new_size,
+ uint64_t granularity,
+ KeyValueDB::Transaction txn) {
+ // To avoid interface changes, we piggyback the zone size and the first
+ // sequential zone number onto the upper 32 bits of the 64-bit |granularity|.
+ // The lower 32 bits of |granularity| hold the actual allocation granularity,
+ // which is bytes_per_block.
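+ // For example (hypothetical values): granularity = (524ULL << 48) |
+ // (256ULL << 32) | 4096 is decoded by the masks below to
+ // bytes_per_block = 4096, zone_size = 256 MiB and starting_zone_num = 524.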
+ size = new_size;
+ bytes_per_block = granularity & 0x00000000ffffffff;
+ zone_size = ((granularity & 0x0000ffff00000000) >> 32) * 1024 * 1024;
+ num_zones = size / zone_size;
+ starting_zone_num = (granularity & 0xffff000000000000) >> 48;
+ enumerate_zone_num = ~0UL;
+
+ ceph_assert(size % zone_size == 0);
+
+ dout(1) << __func__ << std::hex
+ << " size 0x" << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " zone size 0x " << zone_size
+ << " num_zones 0x" << num_zones
+ << " starting_zone 0x" << starting_zone_num << dendl;
+ {
+ bufferlist bl;
+ encode(size, bl);
+ txn->set(meta_prefix, "size", bl);
+ }
+ {
+ bufferlist bl;
+ encode(bytes_per_block, bl);
+ txn->set(meta_prefix, "bytes_per_block", bl);
+ }
+ {
+ bufferlist bl;
+ encode(zone_size, bl);
+ txn->set(meta_prefix, "zone_size", bl);
+ }
+ {
+ bufferlist bl;
+ encode(num_zones, bl);
+ txn->set(meta_prefix, "num_zones", bl);
+ }
+ {
+ bufferlist bl;
+ encode(starting_zone_num, bl);
+ txn->set(meta_prefix, "starting_zone_num", bl);
+ }
+
+ init_zone_states(txn);
+
+ return 0;
+}
+
+int ZonedFreelistManager::init(
+ KeyValueDB *kvdb,
+ bool db_in_read_only,
+ cfg_reader_t cfg_reader) {
+ dout(1) << __func__ << dendl;
+ int r = _read_cfg(cfg_reader);
+ if (r != 0) {
+ return r;
+ }
+
+ ceph_assert(num_zones == size / zone_size);
+
+ dout(10) << __func__ << std::hex
+ << " size 0x" << size
+ << " bytes_per_block 0x" << bytes_per_block
+ << " zone size 0x" << zone_size
+ << " num_zones 0x" << num_zones
+ << " starting_zone 0x" << starting_zone_num
+ << std::dec << dendl;
+ return 0;
+}
+
+void ZonedFreelistManager::sync(KeyValueDB* kvdb) {}
+
+void ZonedFreelistManager::shutdown() {
+ dout(1) << __func__ << dendl;
+}
+
+void ZonedFreelistManager::enumerate_reset() {
+ std::lock_guard l(lock);
+
+ dout(1) << __func__ << dendl;
+
+ enumerate_p.reset();
+ enumerate_zone_num = ~0UL;
+}
+
+// Currently, this just iterates over the list of zones and sets |offset| and
+// |length| to the write pointer and the number of remaining free bytes in a
+// given zone. Hence, it can set |length| to 0 if a zone is full, and it can
+// also return two contiguous empty zones in two calls. This does not violate
+// current semantics of the call and appears to work fine with the clients of
+// this call.
+bool ZonedFreelistManager::enumerate_next(
+ KeyValueDB *kvdb,
+ uint64_t *offset,
+ uint64_t *length) {
+ std::lock_guard l(lock);
+
+ // starting case
+ if (enumerate_zone_num == ~0UL) {
+ dout(30) << __func__ << " start" << dendl;
+ enumerate_p = kvdb->get_iterator(info_prefix);
+ enumerate_p->lower_bound(string());
+ ceph_assert(enumerate_p->valid());
+ enumerate_zone_num = 0;
+ } else {
+ enumerate_p->next();
+ if (!enumerate_p->valid()) {
+ dout(30) << __func__ << " end" << dendl;
+ return false;
+ }
+ ++enumerate_zone_num;
+ }
+
+ zone_state_t zone_state;
+ load_zone_state_from_db(enumerate_zone_num, zone_state, enumerate_p);
+
+ *offset = enumerate_zone_num * zone_size + zone_state.get_write_pointer();
+ *length = zone_size - zone_state.get_write_pointer();
+
+ dout(30) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+ << std::dec << dendl;
+
+ return true;
+}
+
+void ZonedFreelistManager::dump(KeyValueDB *kvdb) {
+ enumerate_reset();
+ uint64_t offset, length;
+ while (enumerate_next(kvdb, &offset, &length)) {
+ dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+ << std::dec << dendl;
+ }
+}
+
+// Advances the write pointer and writes the updated write pointer to the database.
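+// Note that the zone_state written here carries only the delta (the length
+// being allocated); write_zone_state_to_db issues a merge, and the
+// Int64ArrayMergeOperator registered in setup_merge_operator adds that delta
+// to the state already stored for the zone.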
+void ZonedFreelistManager::allocate(
+ uint64_t offset,
+ uint64_t length,
+ KeyValueDB::Transaction txn) {
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl;
+ uint64_t zone_num = offset / zone_size;
+ zone_state_t zone_state;
+ zone_state.increment_write_pointer(length);
+ write_zone_state_to_db(zone_num, zone_state, txn);
+}
+
+// Increments the number of dead bytes in a zone and writes the updated value
+// to the database. The dead bytes in the zone are not usable. The cleaner will
+// later copy live objects from the zone to another zone and make the zone
+// writable again. The number of dead bytes in a zone is used by the cleaner to
+// select which zones to clean -- the ones with the most dead bytes are good
+// candidates since they require less I/O.
+void ZonedFreelistManager::release(
+ uint64_t offset,
+ uint64_t length,
+ KeyValueDB::Transaction txn) {
+ dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl;
+ uint64_t zone_num = offset / zone_size;
+ zone_state_t zone_state;
+ zone_state.increment_num_dead_bytes(length);
+ write_zone_state_to_db(zone_num, zone_state, txn);
+}
+
+void ZonedFreelistManager::get_meta(
+ uint64_t target_size,
+ std::vector<std::pair<string, string>>* res) const {
+ // We do not support expanding devices for now.
+ ceph_assert(target_size == 0);
+ res->emplace_back("zfm_size", stringify(size));
+ res->emplace_back("zfm_bytes_per_block", stringify(bytes_per_block));
+ res->emplace_back("zfm_zone_size", stringify(zone_size));
+ res->emplace_back("zfm_num_zones", stringify(num_zones));
+ res->emplace_back("zfm_starting_zone_num", stringify(starting_zone_num));
+}
+
+std::vector<zone_state_t> ZonedFreelistManager::get_zone_states(
+ KeyValueDB *kvdb) const {
+ std::vector<zone_state_t> zone_states;
+ auto p = kvdb->get_iterator(info_prefix);
+ uint64_t zone_num = 0;
+ for (p->lower_bound(string()); p->valid(); p->next(), ++zone_num) {
+ zone_state_t zone_state;
+ load_zone_state_from_db(zone_num, zone_state, p);
+ zone_states.emplace_back(zone_state);
+ }
+ return zone_states;
+}
+
+// TODO: The following function is copied almost verbatim from
+// BitmapFreelistManager. Eliminate duplication.
+int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) {
+ dout(1) << __func__ << dendl;
+
+ string err;
+
+ const size_t key_count = 5;
+ string keys[key_count] = {
+ "zfm_size",
+ "zfm_bytes_per_block",
+ "zfm_zone_size",
+ "zfm_num_zones",
+ "zfm_starting_zone_num"
+ };
+ uint64_t* vals[key_count] = {
+ &size,
+ &bytes_per_block,
+ &zone_size,
+ &num_zones,
+ &starting_zone_num};
+
+ for (size_t i = 0; i < key_count; i++) {
+ string val;
+ int r = cfg_reader(keys[i], &val);
+ if (r == 0) {
+ *(vals[i]) = strict_iecstrtoll(val.c_str(), &err);
+ if (!err.empty()) {
+ derr << __func__ << " Failed to parse - "
+ << keys[i] << ":" << val
+ << ", error: " << err << dendl;
+ return -EINVAL;
+ }
+ } else {
+ // this is expected for legacy deployed OSDs
+ dout(0) << __func__ << " " << keys[i] << " not found in bdev meta" << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h
new file mode 100644
index 000000000..ec08f3113
--- /dev/null
+++ b/src/os/bluestore/ZonedFreelistManager.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A freelist manager for zoned devices. This iteration just keeps the write
+// pointer per zone. Following iterations will add enough information to enable
+// cleaning of zones.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#ifndef CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
+
+#include "FreelistManager.h"
+
+#include <string>
+#include <mutex>
+
+#include "common/ceph_mutex.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+using cfg_reader_t = std::function<int(const std::string&, std::string*)>;
+
+class ZonedFreelistManager : public FreelistManager {
+ std::string meta_prefix; ///< device size, zone size, etc.
+ std::string info_prefix; ///< per zone write pointer, dead bytes
+ mutable ceph::mutex lock = ceph::make_mutex("ZonedFreelistManager::lock");
+
+ uint64_t size; ///< size of sequential region (bytes)
+ uint64_t bytes_per_block; ///< bytes per allocation unit (bytes)
+ uint64_t zone_size; ///< size of a single zone (bytes)
+ uint64_t num_zones; ///< number of sequential zones
+ uint64_t starting_zone_num; ///< the first sequential zone number
+
+ KeyValueDB::Iterator enumerate_p;
+ uint64_t enumerate_zone_num;
+
+ void write_zone_state_to_db(uint64_t zone_num,
+ const zone_state_t &zone_state,
+ KeyValueDB::Transaction txn);
+ void load_zone_state_from_db(uint64_t zone_num,
+ zone_state_t &zone_state,
+ KeyValueDB::Iterator &it) const;
+
+ void init_zone_states(KeyValueDB::Transaction txn);
+
+ void increment_write_pointer(
+ uint64_t zone, uint64_t length, KeyValueDB::Transaction txn);
+ void increment_num_dead_bytes(
+ uint64_t zone, uint64_t num_bytes, KeyValueDB::Transaction txn);
+
+ int _read_cfg(cfg_reader_t cfg_reader);
+
+public:
+ ZonedFreelistManager(CephContext* cct,
+ std::string meta_prefix,
+ std::string info_prefix);
+
+ static void setup_merge_operator(KeyValueDB *db, std::string prefix);
+
+ int create(uint64_t size,
+ uint64_t granularity,
+ KeyValueDB::Transaction txn) override;
+
+ int init(KeyValueDB *kvdb,
+ bool db_in_read_only,
+ cfg_reader_t cfg_reader) override;
+
+ void shutdown() override;
+ void sync(KeyValueDB* kvdb) override;
+ void dump(KeyValueDB *kvdb) override;
+
+ void enumerate_reset() override;
+ bool enumerate_next(KeyValueDB *kvdb,
+ uint64_t *offset,
+ uint64_t *length) override;
+
+ void allocate(uint64_t offset,
+ uint64_t length,
+ KeyValueDB::Transaction txn) override;
+
+ void release(uint64_t offset,
+ uint64_t length,
+ KeyValueDB::Transaction txn) override;
+
+ inline uint64_t get_size() const override {
+ return size;
+ }
+
+ inline uint64_t get_alloc_units() const override {
+ return size / bytes_per_block;
+ }
+
+ inline uint64_t get_alloc_size() const override {
+ return bytes_per_block;
+ }
+
+ void get_meta(uint64_t target_size,
+ std::vector<std::pair<string, string>>*) const override;
+
+ std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const override;
+};
+
+#endif
diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
new file mode 100644
index 000000000..3a812cf5f
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.cc
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include "bluefs_types.h"
+#include "common/Formatter.h"
+#include "include/uuid.h"
+#include "include/stringify.h"
+
+using std::list;
+using std::ostream;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// bluefs_extent_t
+void bluefs_extent_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+ f->dump_unsigned("bdev", bdev);
+}
+
+void bluefs_extent_t::generate_test_instances(list<bluefs_extent_t*>& ls)
+{
+ ls.push_back(new bluefs_extent_t);
+ ls.push_back(new bluefs_extent_t);
+ ls.back()->offset = 1;
+ ls.back()->length = 2;
+ ls.back()->bdev = 1;
+}
+
+ostream& operator<<(ostream& out, const bluefs_extent_t& e)
+{
+ return out << (int)e.bdev << ":0x" << std::hex << e.offset << "~" << e.length
+ << std::dec;
+}
+
+// bluefs_layout_t
+
+void bluefs_layout_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(shared_bdev, bl);
+ encode(dedicated_db, bl);
+ encode(dedicated_wal, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluefs_layout_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(shared_bdev, p);
+ decode(dedicated_db, p);
+ decode(dedicated_wal, p);
+ DECODE_FINISH(p);
+}
+
+void bluefs_layout_t::dump(Formatter *f) const
+{
+ f->dump_stream("shared_bdev") << shared_bdev;
+ f->dump_stream("dedicated_db") << dedicated_db;
+ f->dump_stream("dedicated_wal") << dedicated_wal;
+}
+
+// bluefs_super_t
+
+void bluefs_super_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(uuid, bl);
+ encode(osd_uuid, bl);
+ encode(version, bl);
+ encode(block_size, bl);
+ encode(log_fnode, bl);
+ encode(memorized_layout, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluefs_super_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(2, p);
+ decode(uuid, p);
+ decode(osd_uuid, p);
+ decode(version, p);
+ decode(block_size, p);
+ decode(log_fnode, p);
+ if (struct_v >= 2) {
+ decode(memorized_layout, p);
+ }
+ DECODE_FINISH(p);
+}
+
+void bluefs_super_t::dump(Formatter *f) const
+{
+ f->dump_stream("uuid") << uuid;
+ f->dump_stream("osd_uuid") << osd_uuid;
+ f->dump_unsigned("version", version);
+ f->dump_unsigned("block_size", block_size);
+ f->dump_object("log_fnode", log_fnode);
+}
+
+void bluefs_super_t::generate_test_instances(list<bluefs_super_t*>& ls)
+{
+ ls.push_back(new bluefs_super_t);
+ ls.push_back(new bluefs_super_t);
+ ls.back()->version = 1;
+ ls.back()->block_size = 4096;
+}
+
+ostream& operator<<(ostream& out, const bluefs_super_t& s)
+{
+ return out << "super(uuid " << s.uuid
+ << " osd " << s.osd_uuid
+ << " v " << s.version
+ << " block_size 0x" << std::hex << s.block_size
+ << " log_fnode 0x" << s.log_fnode
+ << std::dec << ")";
+}
+
+// bluefs_fnode_t
+
+mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek(
+ uint64_t offset, uint64_t *x_off)
+{
+ auto p = extents.begin();
+
+ if (extents_index.size() > 4) {
+ auto it = std::upper_bound(extents_index.begin(), extents_index.end(),
+ offset);
+ assert(it != extents_index.begin());
+ --it;
+ assert(offset >= *it);
+ p += it - extents_index.begin();
+ offset -= *it;
+ }
+
+ while (p != extents.end()) {
+ if ((int64_t) offset >= p->length) {
+ offset -= p->length;
+ ++p;
+ } else {
+ break;
+ }
+ }
+ *x_off = offset;
+ return p;
+}
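+// Worked example (illustrative only): with two extents of lengths 0x1000 and
+// 0x2000, extents_index holds [0x0, 0x1000]; seek(0x1800, &x_off) stops at the
+// second extent with x_off = 0x800, i.e. the residual offset inside that extent.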
+
+bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) {
+ ceph_assert(delta);
+ delta->ino = ino;
+ delta->size = size;
+ delta->mtime = mtime;
+ delta->offset = allocated_commited;
+ delta->extents.clear();
+ if (allocated_commited < allocated) {
+ uint64_t x_off = 0;
+ auto p = seek(allocated_commited, &x_off);
+ ceph_assert(p != extents.end());
+ if (x_off > 0) {
+ ceph_assert(x_off < p->length);
+ delta->extents.emplace_back(p->bdev, p->offset + x_off, p->length - x_off);
+ ++p;
+ }
+ while (p != extents.end()) {
+ delta->extents.push_back(*p);
+ ++p;
+ }
+ reset_delta();
+ }
+ return delta;
+}
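+// Illustrative note: make_delta emits only the extents appended since the last
+// commit. E.g. with allocated_commited = 0x1000 and two extents of lengths
+// 0x1000 and 0x2000, the delta carries just the second extent, while
+// delta->offset records 0x1000 for the consistency check described in
+// bluefs_fnode_delta_t.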
+
+void bluefs_fnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("ino", ino);
+ f->dump_unsigned("size", size);
+ f->dump_stream("mtime") << mtime;
+ f->open_array_section("extents");
+ for (auto& p : extents)
+ f->dump_object("extent", p);
+ f->close_section();
+}
+
+void bluefs_fnode_t::generate_test_instances(list<bluefs_fnode_t*>& ls)
+{
+ ls.push_back(new bluefs_fnode_t);
+ ls.push_back(new bluefs_fnode_t);
+ ls.back()->ino = 123;
+ ls.back()->size = 1048576;
+ ls.back()->mtime = utime_t(123,45);
+ ls.back()->extents.push_back(bluefs_extent_t(0, 1048576, 4096));
+ ls.back()->__unused__ = 1;
+}
+
+ostream& operator<<(ostream& out, const bluefs_fnode_t& file)
+{
+ return out << "file(ino " << file.ino
+ << " size 0x" << std::hex << file.size << std::dec
+ << " mtime " << file.mtime
+ << " allocated " << std::hex << file.allocated << std::dec
+ << " alloc_commit " << std::hex << file.allocated_commited << std::dec
+ << " extents " << file.extents
+ << ")";
+}
+
+// bluefs_fnode_delta_t
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta)
+{
+ return out << "delta(ino " << delta.ino
+ << " size 0x" << std::hex << delta.size << std::dec
+ << " mtime " << delta.mtime
+ << " offset " << std::hex << delta.offset << std::dec
+ << " extents " << delta.extents
+ << ")";
+}
+
+// bluefs_transaction_t
+
+void bluefs_transaction_t::encode(bufferlist& bl) const
+{
+ uint32_t crc = op_bl.crc32c(-1);
+ ENCODE_START(1, 1, bl);
+ encode(uuid, bl);
+ encode(seq, bl);
+ // not using the bufferlist encode method, as it merely copies the bufferptrs
+ // and not their contents, which would leave us with a fragmented target bl
+ __u32 len = op_bl.length();
+ encode(len, bl);
+ for (auto& it : op_bl.buffers()) {
+ bl.append(it.c_str(), it.length());
+ }
+ encode(crc, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluefs_transaction_t::decode(bufferlist::const_iterator& p)
+{
+ uint32_t crc;
+ DECODE_START(1, p);
+ decode(uuid, p);
+ decode(seq, p);
+ decode(op_bl, p);
+ decode(crc, p);
+ DECODE_FINISH(p);
+ uint32_t actual = op_bl.crc32c(-1);
+ if (actual != crc)
+ throw ceph::buffer::malformed_input("bad crc " + stringify(actual)
+ + " expected " + stringify(crc));
+}
+
+void bluefs_transaction_t::dump(Formatter *f) const
+{
+ f->dump_stream("uuid") << uuid;
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("op_bl_length", op_bl.length());
+ f->dump_unsigned("crc", op_bl.crc32c(-1));
+}
+
+void bluefs_transaction_t::generate_test_instances(
+ list<bluefs_transaction_t*>& ls)
+{
+ ls.push_back(new bluefs_transaction_t);
+ ls.push_back(new bluefs_transaction_t);
+ ls.back()->op_init();
+ ls.back()->op_dir_create("dir");
+ ls.back()->op_dir_create("dir2");
+ bluefs_fnode_t fnode;
+ fnode.ino = 2;
+ ls.back()->op_file_update(fnode);
+ ls.back()->op_dir_link("dir", "file1", 2);
+ ls.back()->op_dir_unlink("dir", "file1");
+ ls.back()->op_file_remove(2);
+ ls.back()->op_dir_remove("dir2");
+}
+
+ostream& operator<<(ostream& out, const bluefs_transaction_t& t)
+{
+ return out << "txn(seq " << t.seq
+ << " len 0x" << std::hex << t.op_bl.length()
+ << " crc 0x" << t.op_bl.crc32c(-1)
+ << std::dec << ")";
+}
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
new file mode 100644
index 000000000..b53000188
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+#define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+
+#include <optional>
+
+#include "bluestore_types.h"
+#include "include/utime.h"
+#include "include/encoding.h"
+#include "include/denc.h"
+
+class bluefs_extent_t {
+public:
+ uint64_t offset = 0;
+ uint32_t length = 0;
+ uint8_t bdev;
+
+ bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0)
+ : offset(o), length(l), bdev(b) {}
+
+ uint64_t end() const { return offset + length; }
+ DENC(bluefs_extent_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ denc(v.bdev, p);
+ DENC_FINISH(p);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluefs_extent_t*>&);
+};
+WRITE_CLASS_DENC(bluefs_extent_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e);
+
+struct bluefs_fnode_delta_t {
+ uint64_t ino;
+ uint64_t size;
+ utime_t mtime;
+ uint64_t offset; // Contains offset in file of extents.
+ // Equal to 'allocated' when created.
+ // Used for consistency checking.
+ mempool::bluefs::vector<bluefs_extent_t> extents;
+
+ DENC(bluefs_fnode_delta_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.ino, p);
+ denc_varint(v.size, p);
+ denc(v.mtime, p);
+ denc(v.offset, p);
+ denc(v.extents, p);
+ DENC_FINISH(p);
+ }
+};
+WRITE_CLASS_DENC(bluefs_fnode_delta_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta);
+
+struct bluefs_fnode_t {
+ uint64_t ino;
+ uint64_t size;
+ utime_t mtime;
+ uint8_t __unused__; // was prefer_bdev
+ mempool::bluefs::vector<bluefs_extent_t> extents;
+
+ // precalculated logical offsets for the extents vector entries;
+ // allows fast lookup of the extent index for a given offset via upper_bound()
+ mempool::bluefs::vector<uint64_t> extents_index;
+
+ uint64_t allocated;
+ uint64_t allocated_commited;
+
+ bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0), allocated_commited(0) {}
+
+ uint64_t get_allocated() const {
+ return allocated;
+ }
+
+ void recalc_allocated() {
+ allocated = 0;
+ extents_index.reserve(extents.size());
+ for (auto& p : extents) {
+ extents_index.emplace_back(allocated);
+ allocated += p.length;
+ }
+ allocated_commited = allocated;
+ }
+
+ DENC_HELPERS
+ void bound_encode(size_t& p) const {
+ _denc_friend(*this, p);
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ DENC_DUMP_PRE(bluefs_fnode_t);
+ _denc_friend(*this, p);
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ _denc_friend(*this, p);
+ recalc_allocated();
+ }
+ template<typename T, typename P>
+ friend std::enable_if_t<std::is_same_v<bluefs_fnode_t, std::remove_const_t<T>>>
+ _denc_friend(T& v, P& p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.ino, p);
+ denc_varint(v.size, p);
+ denc(v.mtime, p);
+ denc(v.__unused__, p);
+ denc(v.extents, p);
+ DENC_FINISH(p);
+ }
+
+ void reset_delta() {
+ allocated_commited = allocated;
+ }
+ void claim_extents(mempool::bluefs::vector<bluefs_extent_t>& extents) {
+ for (const auto& p : extents) {
+ append_extent(p);
+ }
+ extents.clear();
+ }
+ void append_extent(const bluefs_extent_t& ext) {
+ if (!extents.empty() &&
+ extents.back().end() == ext.offset &&
+ extents.back().bdev == ext.bdev &&
+ (uint64_t)extents.back().length + (uint64_t)ext.length < 0xffffffff) {
+ extents.back().length += ext.length;
+ } else {
+ extents_index.emplace_back(allocated);
+ extents.push_back(ext);
+ }
+ allocated += ext.length;
+ }
+
+ void pop_front_extent() {
+ auto it = extents.begin();
+ allocated -= it->length;
+ extents_index.erase(extents_index.begin());
+ for (auto& i: extents_index) {
+ i -= it->length;
+ }
+ extents.erase(it);
+ }
+
+ void swap_extents(bluefs_fnode_t& other) {
+ other.extents.swap(extents);
+ other.extents_index.swap(extents_index);
+ std::swap(allocated, other.allocated);
+ std::swap(allocated_commited, other.allocated_commited);
+ }
+ void clear_extents() {
+ extents_index.clear();
+ extents.clear();
+ allocated = 0;
+ allocated_commited = 0;
+ }
+
+ mempool::bluefs::vector<bluefs_extent_t>::iterator seek(
+ uint64_t off, uint64_t *x_off);
+ bluefs_fnode_delta_t* make_delta(bluefs_fnode_delta_t* delta);
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluefs_fnode_t*>& ls);
+
+};
+WRITE_CLASS_DENC(bluefs_fnode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_t& file);
+
+struct bluefs_layout_t {
+ unsigned shared_bdev = 0; ///< which bluefs bdev we are sharing
+ bool dedicated_db = false; ///< whether block.db is present
+ bool dedicated_wal = false; ///< whether block.wal is present
+
+ bool single_shared_device() const {
+ return !dedicated_db && !dedicated_wal;
+ }
+
+ bool operator==(const bluefs_layout_t& other) const {
+ return shared_bdev == other.shared_bdev &&
+ dedicated_db == other.dedicated_db &&
+ dedicated_wal == other.dedicated_wal;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(bluefs_layout_t)
+
+struct bluefs_super_t {
+ uuid_d uuid; ///< unique to this bluefs instance
+ uuid_d osd_uuid; ///< matches the osd that owns us
+ uint64_t version;
+ uint32_t block_size;
+
+ bluefs_fnode_t log_fnode;
+
+ std::optional<bluefs_layout_t> memorized_layout;
+
+ bluefs_super_t()
+ : version(0),
+ block_size(4096) { }
+
+ uint64_t block_mask() const {
+ return ~((uint64_t)block_size - 1);
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluefs_super_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_super_t)
+
+std::ostream& operator<<(std::ostream&, const bluefs_super_t& s);
+
+
+struct bluefs_transaction_t {
+ typedef enum {
+ OP_NONE = 0,
+ OP_INIT, ///< initial (empty) file system marker
+ OP_ALLOC_ADD, ///< OBSOLETE: add extent to available block storage (extent)
+ OP_ALLOC_RM, ///< OBSOLETE: remove extent from available block storage (extent)
+ OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino)
+ OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename)
+ OP_DIR_CREATE, ///< create a dir (dirname)
+ OP_DIR_REMOVE, ///< remove a dir (dirname)
+ OP_FILE_UPDATE, ///< set/update file metadata (file)
+ OP_FILE_REMOVE, ///< remove file (ino)
+ OP_JUMP, ///< jump the seq # and offset
+ OP_JUMP_SEQ, ///< jump the seq #
+ OP_FILE_UPDATE_INC, ///< incremental update file metadata (file)
+ } op_t;
+
+ uuid_d uuid; ///< fs uuid
+ uint64_t seq; ///< sequence number
+ ceph::buffer::list op_bl; ///< encoded transaction ops
+
+ bluefs_transaction_t() : seq(0) {}
+
+ void clear() {
+ *this = bluefs_transaction_t();
+ }
+ bool empty() const {
+ return op_bl.length() == 0;
+ }
+
+ void op_init() {
+ using ceph::encode;
+ encode((__u8)OP_INIT, op_bl);
+ }
+ void op_dir_create(std::string_view dir) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_CREATE, op_bl);
+ encode(dir, op_bl);
+ }
+ void op_dir_remove(std::string_view dir) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_REMOVE, op_bl);
+ encode(dir, op_bl);
+ }
+ void op_dir_link(std::string_view dir, std::string_view file, uint64_t ino) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_LINK, op_bl);
+ encode(dir, op_bl);
+ encode(file, op_bl);
+ encode(ino, op_bl);
+ }
+ void op_dir_unlink(std::string_view dir, std::string_view file) {
+ using ceph::encode;
+ encode((__u8)OP_DIR_UNLINK, op_bl);
+ encode(dir, op_bl);
+ encode(file, op_bl);
+ }
+ void op_file_update(bluefs_fnode_t& file) {
+ using ceph::encode;
+ encode((__u8)OP_FILE_UPDATE, op_bl);
+ encode(file, op_bl);
+ file.reset_delta();
+ }
+ /* streams update to bufferlist and clears update state */
+ void op_file_update_inc(bluefs_fnode_t& file) {
+ using ceph::encode;
+ bluefs_fnode_delta_t delta;
+ file.make_delta(&delta); //also resets delta to zero
+ encode((__u8)OP_FILE_UPDATE_INC, op_bl);
+ encode(delta, op_bl);
+ }
+ void op_file_remove(uint64_t ino) {
+ using ceph::encode;
+ encode((__u8)OP_FILE_REMOVE, op_bl);
+ encode(ino, op_bl);
+ }
+ void op_jump(uint64_t next_seq, uint64_t offset) {
+ using ceph::encode;
+ encode((__u8)OP_JUMP, op_bl);
+ encode(next_seq, op_bl);
+ encode(offset, op_bl);
+ }
+ void op_jump_seq(uint64_t next_seq) {
+ using ceph::encode;
+ encode((__u8)OP_JUMP_SEQ, op_bl);
+ encode(next_seq, op_bl);
+ }
+ void claim_ops(bluefs_transaction_t& from) {
+ op_bl.claim_append(from.op_bl);
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluefs_transaction_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_transaction_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_transaction_t& t);
+#endif
diff --git a/src/os/bluestore/bluestore_common.h b/src/os/bluestore/bluestore_common.h
new file mode 100755
index 000000000..f61a5dcfd
--- /dev/null
+++ b/src/os/bluestore/bluestore_common.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_COMMON_H
+#define CEPH_OSD_BLUESTORE_COMMON_H
+
+#include "include/intarith.h"
+#include "include/ceph_assert.h"
+#include "kv/KeyValueDB.h"
+
+template <class Bitset, class Func>
+void apply_for_bitset_range(uint64_t off,
+ uint64_t len,
+ uint64_t granularity,
+ Bitset &bitset,
+ Func f) {
+ auto end = round_up_to(off + len, granularity) / granularity;
+ ceph_assert(end <= bitset.size());
+ uint64_t pos = off / granularity;
+ while (pos < end) {
+ f(pos, bitset);
+ pos++;
+ }
+}
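+// Usage sketch (illustrative, hypothetical values): with granularity 4096,
+//
+//   apply_for_bitset_range(8192, 8192, 4096, bits,
+//     [](uint64_t pos, auto& bs) { bs.set(pos); });
+//
+// visits positions 2 and 3, i.e. one callback per allocation unit touched.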
+
+// merge operators
+
+struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
+ void merge_nonexistent(
+ const char *rdata, size_t rlen, std::string *new_value) override {
+ *new_value = std::string(rdata, rlen);
+ }
+ void merge(
+ const char *ldata, size_t llen,
+ const char *rdata, size_t rlen,
+ std::string *new_value) override {
+ ceph_assert(llen == rlen);
+ ceph_assert((rlen % 8) == 0);
+ new_value->resize(rlen);
+ const ceph_le64* lv = (const ceph_le64*)ldata;
+ const ceph_le64* rv = (const ceph_le64*)rdata;
+ ceph_le64* nv = &(ceph_le64&)new_value->at(0);
+ for (size_t i = 0; i < rlen >> 3; ++i) {
+ nv[i] = lv[i] + rv[i];
+ }
+ }
+ // We use each operator name and each prefix to construct the
+ // overall RocksDB operator name for the consistency check at open time.
+ const char *name() const override {
+ return "int64_array";
+ }
+};
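+// Illustrative example (hypothetical values): merging two 16-byte operands that
+// each encode two ceph_le64 counters, say [1, 2] and [10, 20], yields [11, 22];
+// callers can thus accumulate per-key int64 counters with RocksDB merge
+// operations instead of read-modify-write cycles.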
+
+#endif
diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc
new file mode 100644
index 000000000..1d0bdb391
--- /dev/null
+++ b/src/os/bluestore/bluestore_tool.cc
@@ -0,0 +1,1064 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <stdio.h>
+#include <string.h>
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#endif
+#include <iostream>
+#include <fstream>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#include "os/bluestore/BlueFS.h"
+#include "os/bluestore/BlueStore.h"
+#include "common/admin_socket.h"
+#include "kv/RocksDBStore.h"
+
+namespace po = boost::program_options;
+
+void usage(po::options_description &desc)
+{
+ cout << desc << std::endl;
+}
+
+void validate_path(CephContext *cct, const string& path, bool bluefs)
+{
+ BlueStore bluestore(cct, path);
+ string type;
+ int r = bluestore.read_meta("type", &type);
+ if (r < 0) {
+ cerr << "failed to load os-type: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (type != "bluestore") {
+ cerr << "expected bluestore, but type is " << type << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!bluefs) {
+ return;
+ }
+
+ string kv_backend;
+ r = bluestore.read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ cerr << "failed to load kv_backend: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (kv_backend != "rocksdb") {
+ cerr << "expect kv_backend to be rocksdb, but is " << kv_backend
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string bluefs_enabled;
+ r = bluestore.read_meta("bluefs", &bluefs_enabled);
+ if (r < 0) {
+ cerr << "failed to load do_bluefs: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (bluefs_enabled != "1") {
+ cerr << "bluefs not enabled for rocksdb" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+}
+
+const char* find_device_path(
+ int id,
+ CephContext *cct,
+ const vector<string>& devs)
+{
+ for (auto& i : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct, i, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if ((id == BlueFS::BDEV_SLOW && label.description == "main") ||
+ (id == BlueFS::BDEV_DB && label.description == "bluefs db") ||
+ (id == BlueFS::BDEV_WAL && label.description == "bluefs wal")) {
+ return i.c_str();
+ }
+ }
+ return nullptr;
+}
+
+void parse_devices(
+ CephContext *cct,
+ const vector<string>& devs,
+ map<string, int>* got,
+ bool* has_db,
+ bool* has_wal)
+{
+ string main;
+ bool was_db = false;
+ if (has_wal) {
+ *has_wal = false;
+ }
+ if (has_db) {
+ *has_db = false;
+ }
+ for (auto& d : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct, d, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << d << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int id = -1;
+ if (label.description == "main")
+ main = d;
+ else if (label.description == "bluefs db") {
+ id = BlueFS::BDEV_DB;
+ was_db = true;
+ if (has_db) {
+ *has_db = true;
+ }
+ }
+ else if (label.description == "bluefs wal") {
+ id = BlueFS::BDEV_WAL;
+ if (has_wal) {
+ *has_wal = true;
+ }
+ }
+ if (id >= 0) {
+ got->emplace(d, id);
+ }
+ }
+ if (main.length()) {
+ int id = was_db ? BlueFS::BDEV_SLOW : BlueFS::BDEV_DB;
+ got->emplace(main, id);
+ }
+}
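+// Note (summarizing the logic above): the "main" device is reported as
+// BDEV_SLOW only when a dedicated "bluefs db" device is present; otherwise it
+// doubles as the DB device and is reported as BDEV_DB.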
+
+void add_devices(
+ BlueFS *fs,
+ CephContext *cct,
+ const vector<string>& devs)
+{
+ map<string, int> got;
+ parse_devices(cct, devs, &got, nullptr, nullptr);
+ for(auto e : got) {
+ char target_path[PATH_MAX] = "";
+ if(!e.first.empty()) {
+ if (realpath(e.first.c_str(), target_path) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << e.first
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ }
+ }
+
+ cout << " slot " << e.second << " " << e.first;
+ if (target_path[0]) {
+ cout << " -> " << target_path;
+ }
+ cout << std::endl;
+
+ // We provide no shared allocator, which prevents BlueFS from operating in
+ // R/W mode. Read-only mode isn't strictly enforced, though.
+ int r = fs->add_block_device(e.second, e.first, false, 0); // 'reserved' is fake
+ if (r < 0) {
+ cerr << "unable to open " << e.first << ": " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+BlueFS *open_bluefs_readonly(
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ validate_path(cct, path, true);
+ BlueFS *fs = new BlueFS(cct);
+
+ add_devices(fs, cct, devs);
+
+ int r = fs->mount();
+ if (r < 0) {
+ cerr << "unable to mount bluefs: " << cpp_strerror(r)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ return fs;
+}
+
+void log_dump(
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ validate_path(cct, path, true);
+ BlueFS *fs = new BlueFS(cct);
+
+ add_devices(fs, cct, devs);
+ int r = fs->log_dump();
+ if (r < 0) {
+ cerr << "log_dump failed" << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ delete fs;
+}
+
+void inferring_bluefs_devices(vector<string>& devs, std::string& path)
+{
+ cout << "inferring bluefs devices from bluestore path" << std::endl;
+ for (auto fn : {"block", "block.wal", "block.db"}) {
+ string p = path + "/" + fn;
+ struct stat st;
+ if (::stat(p.c_str(), &st) == 0) {
+ devs.push_back(p);
+ }
+ }
+}
+
+static void bluefs_import(
+ const string& input_file,
+ const string& dest_file,
+ CephContext *cct,
+ const string& path,
+ const vector<string>& devs)
+{
+ int r;
+ std::ifstream f(input_file.c_str(), std::ifstream::binary);
+ if (!f) {
+ r = -errno;
+ cerr << "open " << input_file.c_str() << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ BlueStore bluestore(cct, path);
+ KeyValueDB *db_ptr;
+ r = bluestore.open_db_environment(&db_ptr, false);
+ if (r < 0) {
+ cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ BlueFS* bs = bluestore.get_bluefs();
+
+ BlueFS::FileWriter *h;
+ fs::path file_path(dest_file);
+ const string dir = file_path.parent_path();
+ const string file_name = file_path.filename();
+ bs->open_for_write(dir, file_name, &h, false);
+ uint64_t max_block = 4096;
+ char buf[max_block];
+ uint64_t left = fs::file_size(input_file.c_str());
+ uint64_t size = 0;
+ while (left) {
+ size = std::min(max_block, left);
+ f.read(buf, size);
+ h->append(buf, size);
+ left -= size;
+ }
+ f.close();
+ bs->fsync(h);
+ bs->close_writer(h);
+ bluestore.close_db_environment();
+ return;
+}
+
+int main(int argc, char **argv)
+{
+ string out_dir;
+ vector<string> devs;
+ vector<string> devs_source;
+ string dev_target;
+ string path;
+ string action;
+ string log_file;
+ string input_file;
+ string dest_file;
+ string key, value;
+ vector<string> allocs_name;
+ string empty_sharding(1, '\0');
+ string new_sharding = empty_sharding;
+ string resharding_ctrl;
+ int log_level = 30;
+ bool fsck_deep = false;
+ po::options_description po_options("Options");
+ po_options.add_options()
+ ("help,h", "produce help message")
+ ("path", po::value<string>(&path), "bluestore path")
+ ("out-dir", po::value<string>(&out_dir), "output directory")
+ ("input-file", po::value<string>(&input_file), "import file")
+ ("dest-file", po::value<string>(&dest_file), "destination file")
+ ("log-file,l", po::value<string>(&log_file), "log file")
+ ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
+ ("dev", po::value<vector<string>>(&devs), "device(s)")
+ ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)")
+ ("dev-target", po::value<string>(&dev_target), "target/resulting device")
+ ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
+ ("key,k", po::value<string>(&key), "label metadata key name")
+ ("value,v", po::value<string>(&value), "label metadata value")
+ ("allocator", po::value<vector<string>>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'/'bluefs-slow'")
+ ("sharding", po::value<string>(&new_sharding), "new sharding to apply")
+ ("resharding-ctrl", po::value<string>(&resharding_ctrl), "gives control over resharding procedure details")
+ ;
+ po::options_description po_positional("Positional options");
+ po_positional.add_options()
+ ("command", po::value<string>(&action),
+ "fsck, "
+ "repair, "
+ "quick-fix, "
+ "bluefs-export, "
+ "bluefs-import, "
+ "bluefs-bdev-sizes, "
+ "bluefs-bdev-expand, "
+ "bluefs-bdev-new-db, "
+ "bluefs-bdev-new-wal, "
+ "bluefs-bdev-migrate, "
+ "show-label, "
+ "set-label-key, "
+ "rm-label-key, "
+ "prime-osd-dir, "
+ "bluefs-log-dump, "
+ "free-dump, "
+ "free-score, "
+ "bluefs-stats, "
+ "reshard, "
+ "show-sharding")
+ ;
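+ // Example invocations (illustrative; see --help for the full list):
+ //   ceph-bluestore-tool fsck --path /var/lib/ceph/osd/ceph-0
+ //   ceph-bluestore-tool show-label --dev /dev/sdb1
+ //   ceph-bluestore-tool bluefs-bdev-new-db --path <osd path> --dev-target <dev>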
+ po::options_description po_all("All options");
+ po_all.add(po_options).add(po_positional);
+ po::positional_options_description pd;
+ pd.add("command", 1);
+
+ vector<string> ceph_option_strings;
+ po::variables_map vm;
+ try {
+ po::parsed_options parsed =
+ po::command_line_parser(argc, argv).options(po_all).allow_unregistered().positional(pd).run();
+ po::store( parsed, vm);
+ po::notify(vm);
+ ceph_option_strings = po::collect_unrecognized(parsed.options,
+ po::include_positional);
+ } catch(po::error &e) {
+ std::cerr << e.what() << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ // normalize path (remove ending '/' if any)
+ if (path.size() > 1 && *(path.end() - 1) == '/') {
+ path.resize(path.size() - 1);
+ }
+ if (vm.count("help")) {
+ usage(po_all);
+ exit(EXIT_SUCCESS);
+ }
+ if (action.empty()) {
+ cerr << "must specify an action; --help for help" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if (action == "fsck" || action == "repair" || action == "quick-fix") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "prime-osd-dir") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (path.empty()) {
+ cerr << "must specify osd dir to prime" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "set-label-key" ||
+ action == "rm-label-key") {
+ if (devs.size() != 1) {
+ cerr << "must specify the main bluestore device" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (key.size() == 0) {
+ cerr << "must specify a key name with -k" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (action == "set-label-key" && value.size() == 0) {
+ cerr << "must specify a value with -v" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "show-label") {
+ if (devs.empty() && path.empty()) {
+ cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (devs.empty())
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-export" ||
+ action == "bluefs-import" ||
+ action == "bluefs-log-dump") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if ((action == "bluefs-export") && out_dir.empty()) {
+ cerr << "must specify out-dir to export bluefs" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (action == "bluefs-import" && input_file.empty()) {
+ cerr << "must specify input_file to import bluefs" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (action == "bluefs-import" && dest_file.empty()) {
+ cerr << "must specify dest_file to import bluefs" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (dev_target.empty()) {
+ cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+ }
+ inferring_bluefs_devices(devs, path);
+ }
+ if (action == "bluefs-bdev-migrate") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ inferring_bluefs_devices(devs, path);
+ if (devs_source.size() == 0) {
+ cerr << "must specify source devices with --devs-source" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (dev_target.empty()) {
+ cerr << "must specify target device with --dev-target" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (action == "free-score" || action == "free-dump") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ for (auto name : allocs_name) {
+ if (!name.empty() &&
+ name != "block" &&
+ name != "bluefs-db" &&
+ name != "bluefs-wal" &&
+ name != "bluefs-slow") {
+ cerr << "unknown allocator '" << name << "'" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ if (allocs_name.empty())
+ allocs_name = vector<string>{"block", "bluefs-db", "bluefs-wal", "bluefs-slow"};
+ }
+ if (action == "reshard") {
+ if (path.empty()) {
+ cerr << "must specify bluestore path" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (new_sharding == empty_sharding) {
+ cerr << "must provide reshard specification" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ vector<const char*> args;
+ if (log_file.size()) {
+ args.push_back("--log-file");
+ args.push_back(log_file.c_str());
+ static char ll[10];
+ snprintf(ll, sizeof(ll), "%d", log_level);
+ args.push_back("--debug-bluestore");
+ args.push_back(ll);
+ args.push_back("--debug-bluefs");
+ args.push_back(ll);
+ args.push_back("--debug-rocksdb");
+ args.push_back(ll);
+ }
+ args.push_back("--no-log-to-stderr");
+ args.push_back("--err-to-stderr");
+
+ for (auto& i : ceph_option_strings) {
+ args.push_back(i.c_str());
+ }
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+
+ common_init_finish(cct.get());
+
+ if (action == "fsck" ||
+ action == "repair" ||
+ action == "quick-fix") {
+ validate_path(cct.get(), path, false);
+ BlueStore bluestore(cct.get(), path);
+ int r;
+ if (action == "fsck") {
+ r = bluestore.fsck(fsck_deep);
+ } else if (action == "repair") {
+ r = bluestore.repair(fsck_deep);
+ } else {
+ r = bluestore.quick_fix();
+ }
+ if (r < 0) {
+ cerr << action << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (r > 0) {
+ cerr << action << " status: remaining " << r << " error(s) and warning(s)" << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ cout << action << " success" << std::endl;
+ }
+ }
+ else if (action == "prime-osd-dir") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "failed to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // kludge some things into the map that we want to populate into
+ // target dir
+ label.meta["path_block"] = devs.front();
+ label.meta["type"] = "bluestore";
+ label.meta["fsid"] = stringify(label.osd_uuid);
+
+ for (auto kk : {
+ "whoami",
+ "osd_key",
+ "ceph_fsid",
+ "fsid",
+ "type",
+ "ready" }) {
+ string k = kk;
+ auto i = label.meta.find(k);
+ if (i == label.meta.end()) {
+ continue;
+ }
+ string p = path + "/" + k;
+ string v = i->second;
+ if (k == "osd_key") {
+ p = path + "/keyring";
+ v = "[osd.";
+ v += label.meta["whoami"];
+ v += "]\nkey = " + i->second;
+ }
+ v += "\n";
+ int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY|O_CLOEXEC, 0600);
+ if (fd < 0) {
+ cerr << "error writing " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int r = safe_write(fd, v.c_str(), v.size());
+ if (r < 0) {
+ cerr << "error writing to " << p << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ ::close(fd);
+ }
+ }
+ else if (action == "show-label") {
+ JSONFormatter jf(true);
+ jf.open_object_section("devices");
+ for (auto& i : devs) {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << i << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ jf.open_object_section(i.c_str());
+ label.dump(&jf);
+ jf.close_section();
+ }
+ jf.close_section();
+ jf.flush(cout);
+ }
+ else if (action == "set-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (key == "size") {
+ label.size = strtoull(value.c_str(), nullptr, 10);
+ } else if (key =="osd_uuid") {
+ label.osd_uuid.parse(value.c_str());
+ } else if (key =="btime") {
+ uint64_t epoch;
+ uint64_t nsec;
+ int r = utime_t::parse_date(value.c_str(), &epoch, &nsec);
+ if (r == 0) {
+ label.btime = utime_t(epoch, nsec);
+ }
+ } else if (key =="description") {
+ label.description = value;
+ } else {
+ label.meta[key] = value;
+ }
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "rm-label-key") {
+ bluestore_bdev_label_t label;
+ int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+ if (r < 0) {
+ cerr << "unable to read label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (!label.meta.count(key)) {
+ cerr << "key '" << key << "' not present" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ label.meta.erase(key);
+ r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+ if (r < 0) {
+ cerr << "unable to write label for " << devs.front() << ": "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "bluefs-bdev-sizes") {
+ BlueStore bluestore(cct.get(), path);
+ bluestore.dump_bluefs_sizes(cout);
+ }
+ else if (action == "bluefs-bdev-expand") {
+ BlueStore bluestore(cct.get(), path);
+ auto r = bluestore.expand_devices(cout);
+ if (r <0) {
+ cerr << "failed to expand bluestore devices: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ else if (action == "bluefs-import") {
+ bluefs_import(input_file, dest_file, cct.get(), path, devs);
+ }
+ else if (action == "bluefs-export") {
+ BlueFS *fs = open_bluefs_readonly(cct.get(), path, devs);
+
+ vector<string> dirs;
+ int r = fs->readdir("", &dirs);
+ if (r < 0) {
+ cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if (::access(out_dir.c_str(), F_OK)) {
+ r = ::mkdir(out_dir.c_str(), 0755);
+ if (r < 0) {
+ r = -errno;
+ cerr << "mkdir " << out_dir << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ for (auto& dir : dirs) {
+ if (dir[0] == '.')
+ continue;
+ cout << dir << "/" << std::endl;
+ vector<string> ls;
+ r = fs->readdir(dir, &ls);
+ if (r < 0) {
+ cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string full = out_dir + "/" + dir;
+ if (::access(full.c_str(), F_OK)) {
+ r = ::mkdir(full.c_str(), 0755);
+ if (r < 0) {
+ r = -errno;
+ cerr << "mkdir " << full << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ for (auto& file : ls) {
+ if (file[0] == '.')
+ continue;
+ cout << dir << "/" << file << std::endl;
+ uint64_t size;
+ utime_t mtime;
+ r = fs->stat(dir, file, &size, &mtime);
+ if (r < 0) {
+ cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string path = out_dir + "/" + dir + "/" + file;
+ int fd = ::open(path.c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ r = -errno;
+ cerr << "open " << path << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (size > 0) {
+ BlueFS::FileReader *h;
+ r = fs->open_for_read(dir, file, &h, false);
+ if (r < 0) {
+ cerr << "open_for_read " << dir << "/" << file << " failed: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int pos = 0;
+ int left = size;
+ while (left) {
+ bufferlist bl;
+ r = fs->read(h, pos, left, &bl, NULL);
+ if (r <= 0) {
+ cerr << "read " << dir << "/" << file << " from " << pos
+ << " failed: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ int rc = bl.write_fd(fd);
+ if (rc < 0) {
+ cerr << "write to " << path << " failed: "
+ << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ pos += r;
+ left -= r;
+ }
+ delete h;
+ }
+ ::close(fd);
+ }
+ }
+ fs->umount();
+ delete fs;
+ } else if (action == "bluefs-log-dump") {
+ log_dump(cct.get(), path, devs);
+ } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+ map<string, int> cur_devs_map;
+ bool need_db = action == "bluefs-bdev-new-db";
+
+ bool has_wal = false;
+ bool has_db = false;
+ char target_path[PATH_MAX] = "";
+
+ parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+ const char* rlpath = nullptr;
+ if (has_db && has_wal) {
+ cerr << "can't allocate new device, both WAL and DB exist"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (need_db && has_db) {
+ cerr << "can't allocate new DB device, already exists"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if (!need_db && has_wal) {
+ cerr << "can't allocate new WAL device, already exists"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ } else if(!dev_target.empty() &&
+ (rlpath = realpath(dev_target.c_str(), target_path)) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << dev_target
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // Attach either DB or WAL volume, create if needed
+ struct stat st;
+ int r = -1;
+ if (rlpath != nullptr) {
+ r = ::stat(rlpath, &st);
+ }
+ // check if we need additional size specification
+ if (r == -1 || (r == 0 && S_ISREG(st.st_mode) && st.st_size == 0)) {
+ r = 0;
+ if (need_db && cct->_conf->bluestore_block_db_size == 0) {
+ cerr << "Might need DB size specification, "
+ "please set Ceph bluestore-block-db-size config parameter "
+ << std::endl;
+ r = EXIT_FAILURE;
+ } else if (!need_db && cct->_conf->bluestore_block_wal_size == 0) {
+ cerr << "Might need WAL size specification, "
+ "please set Ceph bluestore-block-wal-size config parameter "
+ << std::endl;
+ r = EXIT_FAILURE;
+ }
+ }
+ if (r == 0) {
+ BlueStore bluestore(cct.get(), path);
+ r = bluestore.add_new_bluefs_device(
+ need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+ target_path);
+ if (r == 0) {
+ cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+ << std::endl;
+ } else {
+ cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ }
+ return r;
+ } else if (action == "bluefs-bdev-migrate") {
+ map<string, int> cur_devs_map;
+ set<int> src_dev_ids;
+ map<string, int> src_devs;
+
+ parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+ for (auto& s : devs_source) {
+ auto i = cur_devs_map.find(s);
+ if (i != cur_devs_map.end()) {
+ if (s == dev_target) {
+ cerr << "Device " << dev_target
+ << " is present in both source and target lists, omitted."
+ << std::endl;
+ } else {
+ src_devs.emplace(*i);
+ src_dev_ids.emplace(i->second);
+ }
+ } else {
+ cerr << "can't migrate " << s << ", not a valid bluefs volume "
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ auto i = cur_devs_map.find(dev_target);
+
+ if (i != cur_devs_map.end()) {
+ // Migrate to an existing BlueFS volume
+
+ auto dev_target_id = i->second;
+ if (dev_target_id == BlueFS::BDEV_WAL) {
+ // currently we're unable to migrate to a WAL device since there is no space
+ // reserved for the superblock
+ cerr << "Migrate to WAL device isn't supported." << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ BlueStore bluestore(cct.get(), path);
+ int r = bluestore.migrate_to_existing_bluefs_device(
+ src_dev_ids,
+ dev_target_id);
+ if (r == 0) {
+ for(auto src : src_devs) {
+ if (src.second != BlueFS::BDEV_SLOW) {
+ cout << " device removed:" << src.second << " " << src.first
+ << std::endl;
+ }
+ }
+ } else {
+ bool need_db = dev_target_id == BlueFS::BDEV_DB;
+ cerr << "failed to migrate to existing BlueFS device: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+ << " " << dev_target
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ } else {
+ // Migrate to a new BlueFS volume
+ // via creating either DB or WAL volume
+ char target_path[PATH_MAX] = "";
+ int dev_target_id;
+ if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+ // if we have a DB device in the source list, we create a DB device
+ // (and maybe remove the WAL).
+ dev_target_id = BlueFS::BDEV_NEWDB;
+ } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+ dev_target_id = BlueFS::BDEV_NEWWAL;
+ } else {
+ cerr << "Unable to migrate Slow volume to new location, "
+ "please allocate new DB or WAL with "
+ "--bluefs-bdev-new-db(wal) command"
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if(!dev_target.empty() &&
+ realpath(dev_target.c_str(), target_path) == nullptr) {
+ cerr << "failed to retrieve absolute path for " << dev_target
+ << ": " << cpp_strerror(errno)
+ << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ BlueStore bluestore(cct.get(), path);
+
+ bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+ int r = bluestore.migrate_to_new_bluefs_device(
+ src_dev_ids,
+ dev_target_id,
+ target_path);
+ if (r == 0) {
+ for(auto src : src_devs) {
+ if (src.second != BlueFS::BDEV_SLOW) {
+ cout << " device removed:" << src.second << " " << src.first
+ << std::endl;
+ }
+ }
+ cout << " device added: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+ << " " << target_path
+ << std::endl;
+ } else {
+ cerr << "failed to migrate to new BlueFS device: "
+ << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+ << " " << target_path
+ << cpp_strerror(r)
+ << std::endl;
+ }
+ return r;
+ }
+ } else if (action == "free-dump" || action == "free-score") {
+ AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+ std::string action_name = action == "free-dump" ? "dump" : "score";
+ validate_path(cct.get(), path, false);
+ BlueStore bluestore(cct.get(), path);
+ int r = bluestore.cold_open();
+ if (r < 0) {
+ cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ for (auto alloc_name : allocs_name) {
+ ceph::bufferlist in, out;
+ ostringstream err;
+ int r = admin_socket->execute_command(
+ {"{\"prefix\": \"bluestore allocator " + action_name + " " + alloc_name + "\"}"},
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying '" << alloc_name << "'" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << alloc_name << ":" << std::endl;
+ cout << std::string(out.c_str(),out.length()) << std::endl;
+ }
+
+ bluestore.cold_close();
+ } else if (action == "bluefs-stats") {
+ AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+ ceph_assert(admin_socket);
+ validate_path(cct.get(), path, false);
+ BlueStore bluestore(cct.get(), path);
+ int r = bluestore.cold_open();
+ if (r < 0) {
+ cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ ceph::bufferlist in, out;
+ ostringstream err;
+ r = admin_socket->execute_command(
+ { "{\"prefix\": \"bluefs stats\"}" },
+ in, err, &out);
+ if (r != 0) {
+ cerr << "failure querying bluefs stats: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << std::string(out.c_str(), out.length()) << std::endl;
+ bluestore.cold_close();
+ } else if (action == "reshard") {
+ auto get_ctrl = [&](size_t& val) {
+ if (!resharding_ctrl.empty()) {
+ size_t pos;
+ std::string token;
+ pos = resharding_ctrl.find('/');
+ token = resharding_ctrl.substr(0, pos);
+ if (pos != std::string::npos)
+ resharding_ctrl.erase(0, pos + 1);
+ else
+ resharding_ctrl.erase();
+ char* endptr;
+ val = strtoll(token.c_str(), &endptr, 0);
+ if (*endptr != '\0') {
+ cerr << "invalid --resharding-ctrl. '" << token << "' is not a number" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ };
+ BlueStore bluestore(cct.get(), path);
+ KeyValueDB *db_ptr;
+ RocksDBStore::resharding_ctrl ctrl;
+ if (!resharding_ctrl.empty()) {
+ get_ctrl(ctrl.bytes_per_iterator);
+ get_ctrl(ctrl.keys_per_iterator);
+ get_ctrl(ctrl.bytes_per_batch);
+ get_ctrl(ctrl.keys_per_batch);
+ if (!resharding_ctrl.empty()) {
+ cerr << "extra chars in --resharding-ctrl" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ int r = bluestore.open_db_environment(&db_ptr, true);
+ if (r < 0) {
+ cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ ceph_assert(db_ptr);
+ RocksDBStore* rocks_db = dynamic_cast<RocksDBStore*>(db_ptr);
+ ceph_assert(rocks_db);
+ r = rocks_db->reshard(new_sharding, &ctrl);
+ if (r < 0) {
+ cerr << "error resharding: " << cpp_strerror(r) << std::endl;
+ } else {
+ cout << "reshard success" << std::endl;
+ }
+ bluestore.close_db_environment();
+ } else if (action == "show-sharding") {
+ BlueStore bluestore(cct.get(), path);
+ KeyValueDB *db_ptr;
+ int r = bluestore.open_db_environment(&db_ptr, false);
+ if (r < 0) {
+ cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ ceph_assert(db_ptr);
+ RocksDBStore* rocks_db = dynamic_cast<RocksDBStore*>(db_ptr);
+ ceph_assert(rocks_db);
+ std::string sharding;
+ bool res = rocks_db->get_sharding(sharding);
+ bluestore.close_db_environment();
+ if (!res) {
+ cerr << "failed to retrieve sharding def" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ cout << sharding << std::endl;
+ } else {
+ cerr << "unrecognized action " << action << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc
new file mode 100644
index 000000000..b62f6e2a3
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.cc
@@ -0,0 +1,1279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "bluestore_types.h"
+#include "common/Formatter.h"
+#include "common/Checksummer.h"
+#include "include/stringify.h"
+
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+
+// bluestore_bdev_label_t
+
+void bluestore_bdev_label_t::encode(bufferlist& bl) const
+{
+ // be slightly friendly to someone who looks at the device
+ bl.append("bluestore block device\n");
+ bl.append(stringify(osd_uuid));
+ bl.append("\n");
+ ENCODE_START(2, 1, bl);
+ encode(osd_uuid, bl);
+ encode(size, bl);
+ encode(btime, bl);
+ encode(description, bl);
+ encode(meta, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bluestore_bdev_label_t::decode(bufferlist::const_iterator& p)
+{
+ p += 60u; // see above
+ DECODE_START(2, p);
+ decode(osd_uuid, p);
+ decode(size, p);
+ decode(btime, p);
+ decode(description, p);
+ if (struct_v >= 2) {
+ decode(meta, p);
+ }
+ DECODE_FINISH(p);
+}
+
+void bluestore_bdev_label_t::dump(Formatter *f) const
+{
+ f->dump_stream("osd_uuid") << osd_uuid;
+ f->dump_unsigned("size", size);
+ f->dump_stream("btime") << btime;
+ f->dump_string("description", description);
+ for (auto& i : meta) {
+ f->dump_string(i.first.c_str(), i.second);
+ }
+}
+
+void bluestore_bdev_label_t::generate_test_instances(
+ list<bluestore_bdev_label_t*>& o)
+{
+ o.push_back(new bluestore_bdev_label_t);
+ o.push_back(new bluestore_bdev_label_t);
+ o.back()->size = 123;
+ o.back()->btime = utime_t(4, 5);
+ o.back()->description = "fakey";
+ o.back()->meta["foo"] = "bar";
+}
+
+ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l)
+{
+ return out << "bdev(osd_uuid " << l.osd_uuid
+ << ", size 0x" << std::hex << l.size << std::dec
+ << ", btime " << l.btime
+ << ", desc " << l.description
+ << ", " << l.meta.size() << " meta"
+ << ")";
+}
+
+// cnode_t
+
+void bluestore_cnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("bits", bits);
+}
+
+void bluestore_cnode_t::generate_test_instances(list<bluestore_cnode_t*>& o)
+{
+ o.push_back(new bluestore_cnode_t());
+ o.push_back(new bluestore_cnode_t(0));
+ o.push_back(new bluestore_cnode_t(123));
+}
+
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l)
+{
+ return out << "cnode(bits " << l.bits << ")";
+}
+
+// bluestore_extent_ref_map_t
+
+void bluestore_extent_ref_map_t::_check() const
+{
+ uint64_t pos = 0;
+ unsigned refs = 0;
+ for (const auto &p : ref_map) {
+ if (p.first < pos)
+ ceph_abort_msg("overlap");
+ if (p.first == pos && p.second.refs == refs)
+ ceph_abort_msg("unmerged");
+ pos = p.first + p.second.length;
+ refs = p.second.refs;
+ }
+}
+
+void bluestore_extent_ref_map_t::_maybe_merge_left(
+ map<uint64_t,record_t>::iterator& p)
+{
+ if (p == ref_map.begin())
+ return;
+ auto q = p;
+ --q;
+ if (q->second.refs == p->second.refs &&
+ q->first + q->second.length == p->first) {
+ q->second.length += p->second.length;
+ ref_map.erase(p);
+ p = q;
+ }
+}
+
+void bluestore_extent_ref_map_t::get(uint64_t offset, uint32_t length)
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p != ref_map.begin()) {
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ++p;
+ }
+ }
+ while (length > 0) {
+ if (p == ref_map.end()) {
+ // nothing after offset; add the whole thing.
+ p = ref_map.insert(
+ map<uint64_t,record_t>::value_type(offset, record_t(length, 1))).first;
+ break;
+ }
+ if (p->first > offset) {
+ // gap
+ uint64_t newlen = std::min<uint64_t>(p->first - offset, length);
+ p = ref_map.insert(
+ map<uint64_t,record_t>::value_type(offset,
+ record_t(newlen, 1))).first;
+ offset += newlen;
+ length -= newlen;
+ _maybe_merge_left(p);
+ ++p;
+ continue;
+ }
+ if (p->first < offset) {
+ // split off the portion before offset
+ ceph_assert(p->first + p->second.length > offset);
+ uint64_t left = p->first + p->second.length - offset;
+ p->second.length = offset - p->first;
+ p = ref_map.insert(map<uint64_t,record_t>::value_type(
+ offset, record_t(left, p->second.refs))).first;
+ // continue below
+ }
+ ceph_assert(p->first == offset);
+ if (length < p->second.length) {
+ ref_map.insert(make_pair(offset + length,
+ record_t(p->second.length - length,
+ p->second.refs)));
+ p->second.length = length;
+ ++p->second.refs;
+ break;
+ }
+ ++p->second.refs;
+ offset += p->second.length;
+ length -= p->second.length;
+ _maybe_merge_left(p);
+ ++p;
+ }
+ if (p != ref_map.end())
+ _maybe_merge_left(p);
+ //_check();
+}
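+// Worked micro-example (illustrative): starting from an empty map,
+// get(0x0, 0x1000) yields {0x0~0x1000=1}; a subsequent get(0x800, 0x1000)
+// splits and extends it to {0x0~0x800=1, 0x800~0x800=2, 0x1000~0x800=1}.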
+
+void bluestore_extent_ref_map_t::put(
+ uint64_t offset, uint32_t length,
+ PExtentVector *release,
+ bool *maybe_unshared)
+{
+ //NB: existing entries in 'release' container must be preserved!
+ bool unshared = true;
+ auto p = ref_map.lower_bound(offset);
+ if (p == ref_map.end() || p->first > offset) {
+ if (p == ref_map.begin()) {
+ ceph_abort_msg("put on missing extent (nothing before)");
+ }
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ceph_abort_msg("put on missing extent (gap)");
+ }
+ }
+ if (p->first < offset) {
+ uint64_t left = p->first + p->second.length - offset;
+ p->second.length = offset - p->first;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ p = ref_map.insert(map<uint64_t,record_t>::value_type(
+ offset, record_t(left, p->second.refs))).first;
+ }
+ while (length > 0) {
+ ceph_assert(p->first == offset);
+ if (length < p->second.length) {
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ ref_map.insert(make_pair(offset + length,
+ record_t(p->second.length - length,
+ p->second.refs)));
+ if (p->second.refs > 1) {
+ p->second.length = length;
+ --p->second.refs;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ _maybe_merge_left(p);
+ } else {
+ if (release)
+ release->push_back(bluestore_pextent_t(p->first, length));
+ ref_map.erase(p);
+ }
+ goto out;
+ }
+ offset += p->second.length;
+ length -= p->second.length;
+ if (p->second.refs > 1) {
+ --p->second.refs;
+ if (p->second.refs != 1) {
+ unshared = false;
+ }
+ _maybe_merge_left(p);
+ ++p;
+ } else {
+ if (release)
+ release->push_back(bluestore_pextent_t(p->first, p->second.length));
+ ref_map.erase(p++);
+ }
+ }
+ if (p != ref_map.end())
+ _maybe_merge_left(p);
+ //_check();
+out:
+ if (maybe_unshared) {
+ if (unshared) {
+ // we haven't seen a ref != 1 yet; check the whole map.
+ for (auto& p : ref_map) {
+ if (p.second.refs != 1) {
+ unshared = false;
+ break;
+ }
+ }
+ }
+ *maybe_unshared = unshared;
+ }
+}
+
+bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p == ref_map.end() || p->first > offset) {
+ if (p == ref_map.begin()) {
+ return false; // nothing before
+ }
+ --p;
+ if (p->first + p->second.length <= offset) {
+ return false; // gap
+ }
+ }
+ while (length > 0) {
+ if (p == ref_map.end())
+ return false;
+ if (p->first > offset)
+ return false;
+ if (p->first + p->second.length >= offset + length)
+ return true;
+ uint64_t overlap = p->first + p->second.length - offset;
+ offset += overlap;
+ length -= overlap;
+ ++p;
+ }
+ return true;
+}
+
+bool bluestore_extent_ref_map_t::intersects(
+ uint64_t offset,
+ uint32_t length) const
+{
+ auto p = ref_map.lower_bound(offset);
+ if (p != ref_map.begin()) {
+ --p;
+ if (p->first + p->second.length <= offset) {
+ ++p;
+ }
+ }
+ if (p == ref_map.end())
+ return false;
+ if (p->first >= offset + length)
+ return false;
+ return true; // intersects p!
+}
+
+void bluestore_extent_ref_map_t::dump(Formatter *f) const
+{
+ f->open_array_section("ref_map");
+ for (auto& p : ref_map) {
+ f->open_object_section("ref");
+ f->dump_unsigned("offset", p.first);
+ f->dump_unsigned("length", p.second.length);
+ f->dump_unsigned("refs", p.second.refs);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void bluestore_extent_ref_map_t::generate_test_instances(
+ list<bluestore_extent_ref_map_t*>& o)
+{
+ o.push_back(new bluestore_extent_ref_map_t);
+ o.push_back(new bluestore_extent_ref_map_t);
+ o.back()->get(10, 10);
+ o.back()->get(18, 22);
+ o.back()->get(20, 20);
+ o.back()->get(10, 25);
+ o.back()->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& m)
+{
+ out << "ref_map(";
+ for (auto p = m.ref_map.begin(); p != m.ref_map.end(); ++p) {
+ if (p != m.ref_map.begin())
+ out << ",";
+ out << std::hex << "0x" << p->first << "~" << p->second.length << std::dec
+ << "=" << p->second.refs;
+ }
+ out << ")";
+ return out;
+}
+
+// bluestore_blob_use_tracker_t
+bluestore_blob_use_tracker_t::bluestore_blob_use_tracker_t(
+ const bluestore_blob_use_tracker_t& tracker)
+ : au_size{tracker.au_size},
+ num_au(0),
+ alloc_au(0),
+ bytes_per_au{nullptr}
+{
+ if (tracker.num_au > 0) {
+ allocate(tracker.num_au);
+ std::copy(tracker.bytes_per_au, tracker.bytes_per_au + num_au, bytes_per_au);
+ } else {
+ total_bytes = tracker.total_bytes;
+ }
+}
+
+bluestore_blob_use_tracker_t&
+bluestore_blob_use_tracker_t::operator=(const bluestore_blob_use_tracker_t& rhs)
+{
+ if (this == &rhs) {
+ return *this;
+ }
+ clear();
+ au_size = rhs.au_size;
+ if (rhs.num_au > 0) {
+ allocate( rhs.num_au);
+ std::copy(rhs.bytes_per_au, rhs.bytes_per_au + num_au, bytes_per_au);
+ } else {
+ total_bytes = rhs.total_bytes;
+ }
+ return *this;
+}
+
+void bluestore_blob_use_tracker_t::allocate(uint32_t au_count)
+{
+ ceph_assert(au_count != 0);
+ ceph_assert(num_au == 0);
+ ceph_assert(alloc_au == 0);
+ num_au = alloc_au = au_count;
+ bytes_per_au = new uint32_t[alloc_au];
+ mempool::get_pool(
+ mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
+ adjust_count(alloc_au, sizeof(uint32_t) * alloc_au);
+
+ for (uint32_t i = 0; i < num_au; ++i) {
+ bytes_per_au[i] = 0;
+ }
+}
+
+void bluestore_blob_use_tracker_t::release(uint32_t au_count, uint32_t* ptr) {
+ if (au_count) {
+ delete[] ptr;
+ mempool::get_pool(
+ mempool::pool_index_t(mempool::mempool_bluestore_cache_other)).
+ adjust_count(-(int32_t)au_count, -(int32_t)(sizeof(uint32_t) * au_count));
+ }
+}
+
+void bluestore_blob_use_tracker_t::init(
+ uint32_t full_length, uint32_t _au_size) {
+ ceph_assert(!au_size || is_empty());
+ ceph_assert(_au_size > 0);
+ ceph_assert(full_length > 0);
+ clear();
+ uint32_t _num_au = round_up_to(full_length, _au_size) / _au_size;
+ au_size = _au_size;
+ if ( _num_au > 1 ) {
+ allocate(_num_au);
+ }
+}
+
+void bluestore_blob_use_tracker_t::get(
+ uint32_t offset, uint32_t length)
+{
+ ceph_assert(au_size);
+ if (!num_au) {
+ total_bytes += length;
+ } else {
+ auto end = offset + length;
+
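+ // spread the referenced bytes over every allocation unit the range touches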
+ while (offset < end) {
+ auto phase = offset % au_size;
+ bytes_per_au[offset / au_size] +=
+ std::min(au_size - phase, end - offset);
+ offset += (phase ? au_size - phase : au_size);
+ }
+ }
+}
+
+bool bluestore_blob_use_tracker_t::put(
+ uint32_t offset, uint32_t length,
+ PExtentVector *release_units)
+{
+ ceph_assert(au_size);
+ if (release_units) {
+ release_units->clear();
+ }
+ bool maybe_empty = true;
+ if (!num_au) {
+ ceph_assert(total_bytes >= length);
+ total_bytes -= length;
+ } else {
+ auto end = offset + length;
+ uint64_t next_offs = 0;
+ while (offset < end) {
+ auto phase = offset % au_size;
+ size_t pos = offset / au_size;
+ auto diff = std::min(au_size - phase, end - offset);
+ ceph_assert(diff <= bytes_per_au[pos]);
+ bytes_per_au[pos] -= diff;
+ offset += (phase ? au_size - phase : au_size);
+ if (bytes_per_au[pos] == 0) {
+ if (release_units) {
+ if (release_units->empty() || next_offs != pos * au_size) {
+ release_units->emplace_back(pos * au_size, au_size);
+ next_offs = pos * au_size;
+ } else {
+ release_units->back().length += au_size;
+ }
+ next_offs += au_size;
+ }
+ } else {
+ maybe_empty = false; // micro optimization: we already know the tracker
+ // isn't empty, even within the affected extent
+ }
+ }
+ }
+ bool empty = maybe_empty ? !is_not_empty() : false;
+ if (empty && release_units) {
+ release_units->clear();
+ }
+ return empty;
+}
+
+bool bluestore_blob_use_tracker_t::can_split() const
+{
+ return num_au > 0;
+}
+
+bool bluestore_blob_use_tracker_t::can_split_at(uint32_t blob_offset) const
+{
+ ceph_assert(au_size);
+ return (blob_offset % au_size) == 0 &&
+ blob_offset < num_au * au_size;
+}
+
+void bluestore_blob_use_tracker_t::split(
+ uint32_t blob_offset,
+ bluestore_blob_use_tracker_t* r)
+{
+ ceph_assert(au_size);
+ ceph_assert(can_split());
+ ceph_assert(can_split_at(blob_offset));
+ ceph_assert(r->is_empty());
+
+ uint32_t new_num_au = blob_offset / au_size;
+ r->init( (num_au - new_num_au) * au_size, au_size);
+
+ for (auto i = new_num_au; i < num_au; i++) {
+ r->get((i - new_num_au) * au_size, bytes_per_au[i]);
+ bytes_per_au[i] = 0;
+ }
+ if (new_num_au == 0) {
+ clear();
+ } else if (new_num_au == 1) {
+ uint32_t tmp = bytes_per_au[0];
+ uint32_t _au_size = au_size;
+ clear();
+ au_size = _au_size;
+ total_bytes = tmp;
+ } else {
+ num_au = new_num_au;
+ }
+}
+
+bool bluestore_blob_use_tracker_t::equal(
+ const bluestore_blob_use_tracker_t& other) const
+{
+ if (!num_au && !other.num_au) {
+ return total_bytes == other.total_bytes && au_size == other.au_size;
+ } else if (num_au && other.num_au) {
+ if (num_au != other.num_au || au_size != other.au_size) {
+ return false;
+ }
+ for (size_t i = 0; i < num_au; i++) {
+ if (bytes_per_au[i] != other.bytes_per_au[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ uint32_t n = num_au ? num_au : other.num_au;
+ uint32_t referenced =
+ num_au ? other.get_referenced_bytes() : get_referenced_bytes();
+ auto bytes_per_au_tmp = num_au ? bytes_per_au : other.bytes_per_au;
+ uint32_t my_referenced = 0;
+ for (size_t i = 0; i < n; i++) {
+ my_referenced += bytes_per_au_tmp[i];
+ if (my_referenced > referenced) {
+ return false;
+ }
+ }
+ return my_referenced == referenced;
+}
+
+void bluestore_blob_use_tracker_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("num_au", num_au);
+ f->dump_unsigned("au_size", au_size);
+ if (!num_au) {
+ f->dump_unsigned("total_bytes", total_bytes);
+ } else {
+ f->open_array_section("bytes_per_au");
+ for (size_t i = 0; i < num_au; ++i) {
+ f->dump_unsigned("", bytes_per_au[i]);
+ }
+ f->close_section();
+ }
+}
+
+void bluestore_blob_use_tracker_t::generate_test_instances(
+ list<bluestore_blob_use_tracker_t*>& o)
+{
+ o.push_back(new bluestore_blob_use_tracker_t());
+ o.back()->init(16, 16);
+ o.back()->get(10, 10);
+ o.back()->get(10, 5);
+ o.push_back(new bluestore_blob_use_tracker_t());
+ o.back()->init(60, 16);
+ o.back()->get(18, 22);
+ o.back()->get(20, 20);
+ o.back()->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& m)
+{
+ out << "use_tracker(" << std::hex;
+ if (!m.num_au) {
+ out << "0x" << m.au_size
+ << " "
+ << "0x" << m.total_bytes;
+ } else {
+ out << "0x" << m.num_au
+ << "*0x" << m.au_size
+ << " 0x[";
+ for (size_t i = 0; i < m.num_au; ++i) {
+ if (i != 0)
+ out << ",";
+ out << m.bytes_per_au[i];
+ }
+ out << "]";
+ }
+ out << std::dec << ")";
+ return out;
+}
+
+// bluestore_pextent_t
+
+void bluestore_pextent_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("length", length);
+}
+
+ostream& operator<<(ostream& out, const bluestore_pextent_t& o) {
+ if (o.is_valid())
+ return out << "0x" << std::hex << o.offset << "~" << o.length << std::dec;
+ else
+ return out << "!~" << std::hex << o.length << std::dec;
+}
+
+void bluestore_pextent_t::generate_test_instances(list<bluestore_pextent_t*>& ls)
+{
+ ls.push_back(new bluestore_pextent_t);
+ ls.push_back(new bluestore_pextent_t(1, 2));
+}
+
+// bluestore_blob_t
+
+string bluestore_blob_t::get_flags_string(unsigned flags)
+{
+ string s;
+ if (flags & FLAG_COMPRESSED) {
+ if (s.length())
+ s += '+';
+ s += "compressed";
+ }
+ if (flags & FLAG_CSUM) {
+ if (s.length())
+ s += '+';
+ s += "csum";
+ }
+ if (flags & FLAG_HAS_UNUSED) {
+ if (s.length())
+ s += '+';
+ s += "has_unused";
+ }
+ if (flags & FLAG_SHARED) {
+ if (s.length())
+ s += '+';
+ s += "shared";
+ }
+
+ return s;
+}
+
+size_t bluestore_blob_t::get_csum_value_size() const
+{
+ return Checksummer::get_csum_value_size(csum_type);
+}
+
+void bluestore_blob_t::dump(Formatter *f) const
+{
+ f->open_array_section("extents");
+ for (auto& p : extents) {
+ f->dump_object("extent", p);
+ }
+ f->close_section();
+ f->dump_unsigned("logical_length", logical_length);
+ f->dump_unsigned("compressed_length", compressed_length);
+ f->dump_unsigned("flags", flags);
+ f->dump_unsigned("csum_type", csum_type);
+ f->dump_unsigned("csum_chunk_order", csum_chunk_order);
+ f->open_array_section("csum_data");
+ size_t n = get_csum_count();
+ for (unsigned i = 0; i < n; ++i)
+ f->dump_unsigned("csum", get_csum_item(i));
+ f->close_section();
+ f->dump_unsigned("unused", unused);
+}
+
+void bluestore_blob_t::generate_test_instances(list<bluestore_blob_t*>& ls)
+{
+ ls.push_back(new bluestore_blob_t);
+ ls.push_back(new bluestore_blob_t(0));
+ ls.push_back(new bluestore_blob_t);
+ ls.back()->allocated_test(bluestore_pextent_t(111, 222));
+ ls.push_back(new bluestore_blob_t);
+ ls.back()->init_csum(Checksummer::CSUM_XXHASH32, 16, 65536);
+ ls.back()->csum_data = ceph::buffer::claim_malloc(4, strdup("abcd"));
+ ls.back()->add_unused(0, 3);
+ ls.back()->add_unused(8, 8);
+ ls.back()->allocated_test(bluestore_pextent_t(0x40100000, 0x10000));
+ ls.back()->allocated_test(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x1000));
+ ls.back()->allocated_test(bluestore_pextent_t(0x40120000, 0x10000));
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_t& o)
+{
+ out << "blob(" << o.get_extents();
+ if (o.is_compressed()) {
+ out << " clen 0x" << std::hex
+ << o.get_logical_length()
+ << " -> 0x"
+ << o.get_compressed_payload_length()
+ << std::dec;
+ }
+ if (o.flags) {
+ out << " " << o.get_flags_string();
+ }
+ if (o.has_csum()) {
+ out << " " << Checksummer::get_csum_type_string(o.csum_type)
+ << "/0x" << std::hex << (1ull << o.csum_chunk_order) << std::dec;
+ }
+ if (o.has_unused())
+ out << " unused=0x" << std::hex << o.unused << std::dec;
+ out << ")";
+ return out;
+}
+
+void bluestore_blob_t::calc_csum(uint64_t b_off, const bufferlist& bl)
+{
+ switch (csum_type) {
+ case Checksummer::CSUM_XXHASH32:
+ Checksummer::calculate<Checksummer::xxhash32>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ Checksummer::calculate<Checksummer::xxhash64>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_CRC32C:
+ Checksummer::calculate<Checksummer::crc32c>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_CRC32C_16:
+ Checksummer::calculate<Checksummer::crc32c_16>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ case Checksummer::CSUM_CRC32C_8:
+ Checksummer::calculate<Checksummer::crc32c_8>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, &csum_data);
+ break;
+ }
+}
+
+int bluestore_blob_t::verify_csum(uint64_t b_off, const bufferlist& bl,
+ int* b_bad_off, uint64_t *bad_csum) const
+{
+ int r = 0;
+
+ *b_bad_off = -1;
+ switch (csum_type) {
+ case Checksummer::CSUM_NONE:
+ break;
+ case Checksummer::CSUM_XXHASH32:
+ *b_bad_off = Checksummer::verify<Checksummer::xxhash32>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ *b_bad_off = Checksummer::verify<Checksummer::xxhash64>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C_16:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c_16>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ case Checksummer::CSUM_CRC32C_8:
+ *b_bad_off = Checksummer::verify<Checksummer::crc32c_8>(
+ get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+ break;
+ default:
+ r = -EOPNOTSUPP;
+ break;
+ }
+
+ if (r < 0)
+ return r;
+ else if (*b_bad_off >= 0)
+ return -1; // bad checksum
+ else
+ return 0;
+}
+
+void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs)
+{
+ if (extents.size() == 0) {
+ // if the blob is compressed the logical length must already be set;
+ // otherwise it must still be unset.
+ ceph_assert((is_compressed() && logical_length != 0) ||
+ (!is_compressed() && logical_length == 0));
+
+ extents.reserve(allocs.size() + (b_off ? 1 : 0));
+ if (b_off) {
+ extents.emplace_back(
+ bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, b_off));
+
+ }
+ uint32_t new_len = b_off;
+ for (auto& a : allocs) {
+ extents.emplace_back(a.offset, a.length);
+ new_len += a.length;
+ }
+ if (!is_compressed()) {
+ logical_length = new_len;
+ }
+ } else {
+ ceph_assert(!is_compressed()); // partial allocations are forbidden when
+ // compressed
+ ceph_assert(b_off < logical_length);
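+ // locate the run of (necessarily unallocated) pextents covering
+ // [b_off, b_off + length) and replace it with the new allocations,
+ // preserving any invalid head and tail fragments of that run.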
+ uint32_t cur_offs = 0;
+ auto start_it = extents.begin();
+ size_t pos = 0;
+ while (true) {
+ ceph_assert(start_it != extents.end());
+ if (cur_offs + start_it->length > b_off) {
+ break;
+ }
+ cur_offs += start_it->length;
+ ++start_it;
+ ++pos;
+ }
+ uint32_t head = b_off - cur_offs;
+ uint32_t end_off = b_off + length;
+ auto end_it = start_it;
+
+ while (true) {
+ ceph_assert(end_it != extents.end());
+ ceph_assert(!end_it->is_valid());
+ if (cur_offs + end_it->length >= end_off) {
+ break;
+ }
+ cur_offs += end_it->length;
+ ++end_it;
+ }
+ ceph_assert(cur_offs + end_it->length >= end_off);
+ uint32_t tail = cur_offs + end_it->length - end_off;
+
+ start_it = extents.erase(start_it, end_it + 1);
+ size_t count = allocs.size();
+ count += head ? 1 : 0;
+ count += tail ? 1 : 0;
+ extents.insert(start_it,
+ count,
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET, 0));
+
+ // Workaround: some gcc/libstdc++ builds still do not return a proper
+ // iterator from this vector::insert overload despite the C++11
+ // requirement, so recompute the insertion position explicitly.
+ start_it = extents.begin() + pos;
+
+ if (head) {
+ start_it->length = head;
+ ++start_it;
+ }
+ for(auto& e : allocs) {
+ *start_it = e;
+ ++start_it;
+ }
+ if (tail) {
+ start_it->length = tail;
+ }
+ }
+}
+
+// helper for release_extents(): rebuilds a pextent vector while coalescing
+// consecutive runs of invalid (unallocated) space
+struct vecbuilder {
+ PExtentVector v;
+ uint64_t invalid = 0;
+
+ void add_invalid(uint64_t length) {
+ invalid += length;
+ }
+ void flush() {
+ if (invalid) {
+ v.emplace_back(bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET,
+ invalid));
+
+ invalid = 0;
+ }
+ }
+ void add(uint64_t offset, uint64_t length) {
+ if (offset == bluestore_pextent_t::INVALID_OFFSET) {
+ add_invalid(length);
+ }
+ else {
+ flush();
+ v.emplace_back(offset, length);
+ }
+ }
+};
+
+void bluestore_blob_t::allocated_test(const bluestore_pextent_t& alloc)
+{
+ extents.emplace_back(alloc);
+ if (!is_compressed()) {
+ logical_length += alloc.length;
+ }
+}
+
+bool bluestore_blob_t::release_extents(bool all,
+ const PExtentVector& logical,
+ PExtentVector* r)
+{
+ // common case: all of it?
+ if (all) {
+ uint64_t pos = 0;
+ for (auto& e : extents) {
+ if (e.is_valid()) {
+ r->push_back(e);
+ }
+ pos += e.length;
+ }
+ ceph_assert(is_compressed() || get_logical_length() == pos);
+ extents.resize(1);
+ extents[0].offset = bluestore_pextent_t::INVALID_OFFSET;
+ extents[0].length = pos;
+ return true;
+ }
+ // remove from pextents according to logical release list
+ vecbuilder vb;
+ auto loffs_it = logical.begin();
+ auto lend = logical.end();
+ uint32_t pext_loffs_start = 0; //starting loffset of the current pextent
+ uint32_t pext_loffs = 0; //current loffset
+ auto pext_it = extents.begin();
+ auto pext_end = extents.end();
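+ // walk the physical extents and the (offset-ordered) logical release list
+ // in parallel: segments outside any release interval are kept (copied into
+ // vb), released segments are appended to 'r' (merging adjacent ranges)
+ // and re-added to vb as invalid space.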
+ while (pext_it != pext_end) {
+ if (loffs_it == lend ||
+ pext_loffs_start + pext_it->length <= loffs_it->offset) {
+ int delta0 = pext_loffs - pext_loffs_start;
+ ceph_assert(delta0 >= 0);
+ if ((uint32_t)delta0 < pext_it->length) {
+ vb.add(pext_it->offset + delta0, pext_it->length - delta0);
+ }
+ pext_loffs_start += pext_it->length;
+ pext_loffs = pext_loffs_start;
+ ++pext_it;
+ }
+ else {
+ //assert(pext_loffs == pext_loffs_start);
+ int delta0 = pext_loffs - pext_loffs_start;
+ ceph_assert(delta0 >= 0);
+
+ int delta = loffs_it->offset - pext_loffs;
+ ceph_assert(delta >= 0);
+ if (delta > 0) {
+ vb.add(pext_it->offset + delta0, delta);
+ pext_loffs += delta;
+ }
+
+ PExtentVector::iterator last_r = r->end();
+ if (r->begin() != last_r) {
+ --last_r;
+ }
+ uint32_t to_release = loffs_it->length;
+ do {
+ uint32_t to_release_part =
+ std::min(pext_it->length - delta0 - delta, to_release);
+ auto o = pext_it->offset + delta0 + delta;
+ if (last_r != r->end() && last_r->offset + last_r->length == o) {
+ last_r->length += to_release_part;
+ }
+ else {
+ last_r = r->emplace(r->end(), o, to_release_part);
+ }
+ to_release -= to_release_part;
+ pext_loffs += to_release_part;
+ if (pext_loffs == pext_loffs_start + pext_it->length) {
+ pext_loffs_start += pext_it->length;
+ pext_loffs = pext_loffs_start;
+ pext_it++;
+ delta0 = delta = 0;
+ }
+ } while (to_release > 0 && pext_it != pext_end);
+ vb.add_invalid(loffs_it->length - to_release);
+ ++loffs_it;
+ }
+ }
+ vb.flush();
+ extents.swap(vb.v);
+ return false;
+}
+
+void bluestore_blob_t::split(uint32_t blob_offset, bluestore_blob_t& rb)
+{
+ size_t left = blob_offset;
+ uint32_t llen_lb = 0;
+ uint32_t llen_rb = 0;
+ unsigned i = 0;
+ for (auto p = extents.begin(); p != extents.end(); ++p, ++i) {
+ if (p->length <= left) {
+ left -= p->length;
+ llen_lb += p->length;
+ continue;
+ }
+ if (left) {
+ if (p->is_valid()) {
+ rb.extents.emplace_back(bluestore_pextent_t(p->offset + left,
+ p->length - left));
+ }
+ else {
+ rb.extents.emplace_back(bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ p->length - left));
+ }
+ llen_rb += p->length - left;
+ llen_lb += left;
+ p->length = left;
+ ++i;
+ ++p;
+ }
+ while (p != extents.end()) {
+ llen_rb += p->length;
+ rb.extents.push_back(*p++);
+ }
+ extents.resize(i);
+ logical_length = llen_lb;
+ rb.logical_length = llen_rb;
+ break;
+ }
+ rb.flags = flags;
+
+ if (has_csum()) {
+ rb.csum_type = csum_type;
+ rb.csum_chunk_order = csum_chunk_order;
+ size_t csum_order = get_csum_chunk_size();
+ ceph_assert(blob_offset % csum_order == 0);
+ size_t pos = (blob_offset / csum_order) * get_csum_value_size();
+ // deep copy csum data
+ bufferptr old;
+ old.swap(csum_data);
+ rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos);
+ csum_data = bufferptr(old.c_str(), pos);
+ }
+}
+
+// bluestore_shared_blob_t
+MEMPOOL_DEFINE_OBJECT_FACTORY(bluestore_shared_blob_t, bluestore_shared_blob_t,
+ bluestore_cache_other);
+
+void bluestore_shared_blob_t::dump(Formatter *f) const
+{
+ f->dump_int("sbid", sbid);
+ f->dump_object("ref_map", ref_map);
+}
+
+void bluestore_shared_blob_t::generate_test_instances(
+ list<bluestore_shared_blob_t*>& ls)
+{
+ ls.push_back(new bluestore_shared_blob_t(1));
+}
+
+ostream& operator<<(ostream& out, const bluestore_shared_blob_t& sb)
+{
+ out << "(sbid 0x" << std::hex << sb.sbid << std::dec;
+ out << " " << sb.ref_map << ")";
+ return out;
+}
+
+// bluestore_onode_t
+
+void bluestore_onode_t::shard_info::dump(Formatter *f) const
+{
+ f->dump_unsigned("offset", offset);
+ f->dump_unsigned("bytes", bytes);
+}
+
+ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si)
+{
+ return out << std::hex << "0x" << si.offset << "(0x" << si.bytes << " bytes"
+ << std::dec << ")";
+}
+
+void bluestore_onode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("nid", nid);
+ f->dump_unsigned("size", size);
+ f->open_object_section("attrs");
+ for (auto p = attrs.begin(); p != attrs.end(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first.c_str()); // it's not quite std::string
+ f->dump_unsigned("len", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_string("flags", get_flags_string());
+ f->open_array_section("extent_map_shards");
+ for (auto si : extent_map_shards) {
+ f->dump_object("shard", si);
+ }
+ f->close_section();
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+void bluestore_onode_t::generate_test_instances(list<bluestore_onode_t*>& o)
+{
+ o.push_back(new bluestore_onode_t());
+ // FIXME
+}
+
+// bluestore_deferred_op_t
+
+void bluestore_deferred_op_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("op", (int)op);
+ f->dump_unsigned("data_len", data.length());
+ f->open_array_section("extents");
+ for (auto& e : extents) {
+ f->dump_object("extent", e);
+ }
+ f->close_section();
+}
+
+void bluestore_deferred_op_t::generate_test_instances(list<bluestore_deferred_op_t*>& o)
+{
+ o.push_back(new bluestore_deferred_op_t);
+ o.push_back(new bluestore_deferred_op_t);
+ o.back()->op = OP_WRITE;
+ o.back()->extents.push_back(bluestore_pextent_t(1, 2));
+ o.back()->extents.push_back(bluestore_pextent_t(100, 5));
+ o.back()->data.append("my data");
+}
+
+void bluestore_deferred_transaction_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("seq", seq);
+ f->open_array_section("ops");
+ for (list<bluestore_deferred_op_t>::const_iterator p = ops.begin(); p != ops.end(); ++p) {
+ f->dump_object("op", *p);
+ }
+ f->close_section();
+
+ f->open_array_section("released extents");
+ for (interval_set<uint64_t>::const_iterator p = released.begin(); p != released.end(); ++p) {
+ f->open_object_section("extent");
+ f->dump_unsigned("offset", p.get_start());
+ f->dump_unsigned("length", p.get_len());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void bluestore_deferred_transaction_t::generate_test_instances(list<bluestore_deferred_transaction_t*>& o)
+{
+ o.push_back(new bluestore_deferred_transaction_t());
+ o.push_back(new bluestore_deferred_transaction_t());
+ o.back()->seq = 123;
+ o.back()->ops.push_back(bluestore_deferred_op_t());
+ o.back()->ops.push_back(bluestore_deferred_op_t());
+ o.back()->ops.back().op = bluestore_deferred_op_t::OP_WRITE;
+ o.back()->ops.back().extents.push_back(bluestore_pextent_t(1,7));
+ o.back()->ops.back().data.append("foodata");
+}
+
+void bluestore_compression_header_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type", type);
+ f->dump_unsigned("length", length);
+ if (compressor_message) {
+ f->dump_int("compressor_message", *compressor_message);
+ }
+}
+
+void bluestore_compression_header_t::generate_test_instances(
+ list<bluestore_compression_header_t*>& o)
+{
+ o.push_back(new bluestore_compression_header_t);
+ o.push_back(new bluestore_compression_header_t(1));
+ o.back()->length = 1234;
+}
+
+// build the hash input from (sbid, offset): the allocation-unit-aligned
+// offset plus a third word mixing both values as extra salt
+shared_blob_2hash_tracker_t::hash_input_t
+ shared_blob_2hash_tracker_t::build_hash_input(
+ uint64_t sbid,
+ uint64_t offset) const
+{
+ hash_input_t res = {
+ sbid,
+ offset >> au_void_bits,
+ ((sbid & 0xffffffff) << 32) + ~(uint32_t((offset >> au_void_bits) & 0xffffffff))
+ };
+ return res;
+}
+
+void shared_blob_2hash_tracker_t::inc(
+ uint64_t sbid,
+ uint64_t offset,
+ int n)
+{
+ auto hash_input = build_hash_input(sbid, offset);
+ ref_counter_2hash_tracker_t::inc(
+ (char*)hash_input.data(),
+ get_hash_input_size(),
+ n);
+}
+
+void shared_blob_2hash_tracker_t::inc_range(
+ uint64_t sbid,
+ uint64_t offset,
+ uint32_t len,
+ int n)
+{
+ uint32_t alloc_unit = 1 << au_void_bits;
+ int64_t l = len;
+ while (l > 0) {
+ // offset alignment does not matter here: inc() trims it via
+ // build_hash_input()
+ inc(sbid, offset, n);
+ offset += alloc_unit;
+ l -= alloc_unit;
+ }
+}
+
+bool shared_blob_2hash_tracker_t::test_hash_conflict(
+ uint64_t sbid1,
+ uint64_t offset1,
+ uint64_t sbid2,
+ uint64_t offset2) const
+{
+ auto hash_input1 = build_hash_input(sbid1, offset1);
+ auto hash_input2 = build_hash_input(sbid2, offset2);
+ return ref_counter_2hash_tracker_t::test_hash_conflict(
+ (char*)hash_input1.data(),
+ (char*)hash_input2.data(),
+ get_hash_input_size());
+}
+
+bool shared_blob_2hash_tracker_t::test_all_zero(
+ uint64_t sbid,
+ uint64_t offset) const
+{
+ auto hash_input = build_hash_input(sbid, offset);
+ return
+ ref_counter_2hash_tracker_t::test_all_zero(
+ (char*)hash_input.data(),
+ get_hash_input_size());
+}
+
+bool shared_blob_2hash_tracker_t::test_all_zero_range(
+ uint64_t sbid,
+ uint64_t offset,
+ uint32_t len) const
+{
+ uint32_t alloc_unit = 1 << au_void_bits;
+ int64_t l = len;
+ while (l > 0) {
+ // offset alignment does not matter here: test_all_zero() trims it via
+ // build_hash_input()
+ if (!test_all_zero(sbid, offset)) {
+ return false;
+ }
+ offset += alloc_unit;
+ l -= alloc_unit;
+ }
+ return true;
+}
diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h
new file mode 100644
index 000000000..b21531bfe
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.h
@@ -0,0 +1,1368 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+
+#include <ostream>
+#include <type_traits>
+#include <vector>
+#include <array>
+#include "include/mempool.h"
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+#include "include/mempool.h"
+#include "include/ceph_hash.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+/// label for block device
+struct bluestore_bdev_label_t {
+ uuid_d osd_uuid; ///< osd uuid
+ uint64_t size = 0; ///< device size
+ utime_t btime; ///< birth time
+ std::string description; ///< device description
+
+ std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
+};
+WRITE_CLASS_ENCODER(bluestore_bdev_label_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
+
+/// collection metadata
+struct bluestore_cnode_t {
+ uint32_t bits; ///< how many bits of coll pgid are significant
+
+ explicit bluestore_cnode_t(int b=0) : bits(b) {}
+
+ DENC(bluestore_cnode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.bits, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_cnode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
+
+template <typename OFFS_TYPE, typename LEN_TYPE>
+struct bluestore_interval_t
+{
+ static const uint64_t INVALID_OFFSET = ~0ull;
+
+ OFFS_TYPE offset = 0;
+ LEN_TYPE length = 0;
+
+ bluestore_interval_t(){}
+ bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+
+ bool is_valid() const {
+ return offset != INVALID_OFFSET;
+ }
+ uint64_t end() const {
+ return offset != INVALID_OFFSET ? offset + length : INVALID_OFFSET;
+ }
+
+ bool operator==(const bluestore_interval_t& other) const {
+ return offset == other.offset && length == other.length;
+ }
+
+};
+
+/// pextent: physical extent
+struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t>
+{
+ bluestore_pextent_t() {}
+ bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
+ bluestore_pextent_t(const bluestore_interval_t &ext) :
+ bluestore_interval_t(ext.offset, ext.length) {}
+
+ DENC(bluestore_pextent_t, v, p) {
+ denc_lba(v.offset, p);
+ denc_varint_lowz(v.length, p);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
+};
+WRITE_CLASS_DENC(bluestore_pextent_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);
+
+typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
+
+template<>
+struct denc_traits<PExtentVector> {
+ static constexpr bool supported = true;
+ static constexpr bool bounded = false;
+ static constexpr bool featured = false;
+ static constexpr bool need_contiguous = true;
+ static void bound_encode(const PExtentVector& v, size_t& p) {
+ p += sizeof(uint32_t);
+ const auto size = v.size();
+ if (size) {
+ size_t per = 0;
+ denc(v.front(), per);
+ p += per * size;
+ }
+ }
+ static void encode(const PExtentVector& v,
+ ceph::buffer::list::contiguous_appender& p) {
+ denc_varint(v.size(), p);
+ for (auto& i : v) {
+ denc(i, p);
+ }
+ }
+ static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
+ unsigned num;
+ denc_varint(num, p);
+ v.clear();
+ v.resize(num);
+ for (unsigned i=0; i<num; ++i) {
+ denc(v[i], p);
+ }
+ }
+};
+
+/// extent_ref_map: a map of reference-counted extents
+struct bluestore_extent_ref_map_t {
+ struct record_t {
+ uint32_t length;
+ uint32_t refs;
+ record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
+ DENC(bluestore_extent_ref_map_t::record_t, v, p) {
+ denc_varint_lowz(v.length, p);
+ denc_varint(v.refs, p);
+ }
+ };
+
+ typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
+ map_t ref_map;
+
+ void _check() const;
+ void _maybe_merge_left(map_t::iterator& p);
+
+ void clear() {
+ ref_map.clear();
+ }
+ bool empty() const {
+ return ref_map.empty();
+ }
+
+ void get(uint64_t offset, uint32_t len);
+ void put(uint64_t offset, uint32_t len, PExtentVector *release,
+ bool *maybe_unshared);
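+ // Illustrative example (not from the original sources): after get(0, 100)
+ // and get(50, 100) the map holds three records, 0~50=1, 50~50=2 and
+ // 100~50=1; a subsequent put(50, 100, &rel, nullptr) merges everything back
+ // into a single 0~100=1 record and reports the extent 100~50 in 'rel'.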
+
+ bool contains(uint64_t offset, uint32_t len) const;
+ bool intersects(uint64_t offset, uint32_t len) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint((uint32_t)0, p);
+ if (!ref_map.empty()) {
+ size_t elem_size = 0;
+ denc_varint_lowz((uint64_t)0, elem_size);
+ ref_map.begin()->second.bound_encode(elem_size);
+ p += elem_size * ref_map.size();
+ }
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ const uint32_t n = ref_map.size();
+ denc_varint(n, p);
+ if (n) {
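+ // the first offset is encoded absolutely; every following offset is
+ // stored as a delta from the previous one (decode() mirrors this)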
+ auto i = ref_map.begin();
+ denc_varint_lowz(i->first, p);
+ i->second.encode(p);
+ int64_t pos = i->first;
+ while (++i != ref_map.end()) {
+ denc_varint_lowz((int64_t)i->first - pos, p);
+ i->second.encode(p);
+ pos = i->first;
+ }
+ }
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ uint32_t n;
+ denc_varint(n, p);
+ if (n) {
+ int64_t pos;
+ denc_varint_lowz(pos, p);
+ ref_map[pos].decode(p);
+ while (--n) {
+ int64_t delta;
+ denc_varint_lowz(delta, p);
+ pos += delta;
+ ref_map[pos].decode(p);
+ }
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
+
+
+std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
+static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
+ const bluestore_extent_ref_map_t::record_t& r) {
+ return l.length == r.length && l.refs == r.refs;
+}
+static inline bool operator==(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return l.ref_map == r.ref_map;
+}
+static inline bool operator!=(const bluestore_extent_ref_map_t& l,
+ const bluestore_extent_ref_map_t& r) {
+ return !(l == r);
+}
+
+/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
+struct bluestore_blob_use_tracker_t {
+ // N.B.: there is no need to shrink au_size/num_au to the smallest possible
+ // types (e.g. a single byte for au_size) because:
+ // 1) the struct isn't packed, so it gets padded anyway (and even if it were
+ //    packed, see 2);
+ // 2) the memory allocator has its own granularity, most likely >= 8 bytes.
+ //
+ uint32_t au_size; // Allocation (=tracking) unit size,
+ // == 0 if uninitialized
+ uint32_t num_au; // Amount of allocation units tracked
+ // == 0 if single unit or the whole blob is tracked
+ uint32_t alloc_au; // Amount of allocation units allocated
+
+ union {
+ uint32_t* bytes_per_au;
+ uint32_t total_bytes;
+ };
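+ // when num_au == 0 the whole blob is tracked with the single 'total_bytes'
+ // counter; otherwise bytes_per_au[i] holds the referenced byte count for
+ // the i-th allocation unit.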
+
+ bluestore_blob_use_tracker_t()
+ : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) {
+ }
+ bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
+ bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
+ ~bluestore_blob_use_tracker_t() {
+ clear();
+ }
+
+ void clear() {
+ release(alloc_au, bytes_per_au);
+ num_au = 0;
+ alloc_au = 0;
+ bytes_per_au = 0;
+ au_size = 0;
+ }
+
+ uint32_t get_referenced_bytes() const {
+ uint32_t total = 0;
+ if (!num_au) {
+ total = total_bytes;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ total += bytes_per_au[i];
+ }
+ }
+ return total;
+ }
+ bool is_not_empty() const {
+ if (!num_au) {
+ return total_bytes != 0;
+ } else {
+ for (size_t i = 0; i < num_au; ++i) {
+ if (bytes_per_au[i]) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+ bool is_empty() const {
+ return !is_not_empty();
+ }
+ void prune_tail(uint32_t new_len) {
+ if (num_au) {
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au <= num_au);
+ if (_num_au) {
+ num_au = _num_au; // bytes_per_au array is left unmodified
+ } else {
+ clear();
+ }
+ }
+ }
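+ /// extend the tracked range to new_len bytes; existing counters are
+ /// preserved and newly added allocation units start at zero.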
+ void add_tail(uint32_t new_len, uint32_t _au_size) {
+ auto full_size = au_size * (num_au ? num_au : 1);
+ ceph_assert(new_len >= full_size);
+ if (new_len == full_size) {
+ return;
+ }
+ if (!num_au) {
+ uint32_t old_total = total_bytes;
+ total_bytes = 0;
+ init(new_len, _au_size);
+ ceph_assert(num_au);
+ bytes_per_au[0] = old_total;
+ } else {
+ ceph_assert(_au_size == au_size);
+ new_len = round_up_to(new_len, au_size);
+ uint32_t _num_au = new_len / au_size;
+ ceph_assert(_num_au >= num_au);
+ if (_num_au > num_au) {
+ auto old_bytes = bytes_per_au;
+ auto old_num_au = num_au;
+ auto old_alloc_au = alloc_au;
+ alloc_au = num_au = 0; // to bypass an assertion in allocate()
+ bytes_per_au = nullptr;
+ allocate(_num_au);
+ for (size_t i = 0; i < old_num_au; i++) {
+ bytes_per_au[i] = old_bytes[i];
+ }
+ for (size_t i = old_num_au; i < num_au; i++) {
+ bytes_per_au[i] = 0;
+ }
+ release(old_alloc_au, old_bytes);
+ }
+ }
+ }
+
+ void init(
+ uint32_t full_length,
+ uint32_t _au_size);
+
+ void get(
+ uint32_t offset,
+ uint32_t len);
+
+ /// put: returns true if the blob has no references left after the call;
+ /// in that case release_units is left empty for the sake of performance.
+ /// Returns false if some references to the blob remain; release_units then
+ /// contains the pextents (identified by their offsets relative to the blob
+ /// start) that are no longer used and can be safely deallocated.
+ bool put(
+ uint32_t offset,
+ uint32_t len,
+ PExtentVector *release);
+
+ bool can_split() const;
+ bool can_split_at(uint32_t blob_offset) const;
+ void split(
+ uint32_t blob_offset,
+ bluestore_blob_use_tracker_t* r);
+
+ bool equal(
+ const bluestore_blob_use_tracker_t& other) const;
+
+ void bound_encode(size_t& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ size_t elem_size = 0;
+ denc_varint((uint32_t)0, elem_size);
+ p += elem_size * num_au;
+ }
+ }
+ }
+ void encode(ceph::buffer::list::contiguous_appender& p) const {
+ denc_varint(au_size, p);
+ if (au_size) {
+ denc_varint(num_au, p);
+ if (!num_au) {
+ denc_varint(total_bytes, p);
+ } else {
+ size_t elem_size = 0;
+ denc_varint((uint32_t)0, elem_size);
+ for (size_t i = 0; i < num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+ void decode(ceph::buffer::ptr::const_iterator& p) {
+ clear();
+ denc_varint(au_size, p);
+ if (au_size) {
+ uint32_t _num_au;
+ denc_varint(_num_au, p);
+ if (!_num_au) {
+ num_au = 0;
+ denc_varint(total_bytes, p);
+ } else {
+ allocate(_num_au);
+ for (size_t i = 0; i < _num_au; ++i) {
+ denc_varint(bytes_per_au[i], p);
+ }
+ }
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
+private:
+ void allocate(uint32_t _num_au);
+ void release(uint32_t _num_au, uint32_t* ptr);
+};
+WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
+
+/// blob: a piece of data on disk
+struct bluestore_blob_t {
+private:
+ PExtentVector extents; ///< raw data position on device
+ uint32_t logical_length = 0; ///< original length of data stored in the blob
+ uint32_t compressed_length = 0; ///< compressed length if any
+
+public:
+ enum {
+ LEGACY_FLAG_MUTABLE = 1, ///< [legacy] blob can be overwritten or split
+ FLAG_COMPRESSED = 2, ///< blob is compressed
+ FLAG_CSUM = 4, ///< blob has checksums
+ FLAG_HAS_UNUSED = 8, ///< blob has an 'unused' bitmap
+ FLAG_SHARED = 16, ///< blob is shared; see external SharedBlob
+ };
+ static std::string get_flags_string(unsigned flags);
+
+ uint32_t flags = 0; ///< FLAG_*
+
+ typedef uint16_t unused_t;
+ unused_t unused = 0; ///< portion that has never been written to (bitmap)
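+ // each of the 16 bits covers get_logical_length()/16 bytes of the blob
+ // (e.g. 0x1000 bytes for a 0x10000 blob); a set bit means that slice has
+ // never been written.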
+
+ uint8_t csum_type = Checksummer::CSUM_NONE; ///< CSUM_*
+ uint8_t csum_chunk_order = 0; ///< csum block size is 1<<block_order bytes
+
+ ceph::buffer::ptr csum_data; ///< opaque array of csum values
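+ // csum_data stores one checksum of get_csum_value_size() bytes for every
+ // get_csum_chunk_size() bytes of blob data, packed back to back.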
+
+ bluestore_blob_t(uint32_t f = 0) : flags(f) {}
+
+ const PExtentVector& get_extents() const {
+ return extents;
+ }
+ PExtentVector& dirty_extents() {
+ return extents;
+ }
+
+ DENC_HELPERS;
+ void bound_encode(size_t& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ p += csum_data.length();
+ p += sizeof(unused_t);
+ }
+
+ void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ denc_varint(csum_data.length(), p);
+ memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
+ csum_data.length());
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
+ ceph_assert(struct_v == 1 || struct_v == 2);
+ denc(extents, p);
+ denc_varint(flags, p);
+ if (is_compressed()) {
+ denc_varint_lowz(logical_length, p);
+ denc_varint_lowz(compressed_length, p);
+ } else {
+ logical_length = get_ondisk_length();
+ }
+ if (has_csum()) {
+ denc(csum_type, p);
+ denc(csum_chunk_order, p);
+ int len;
+ denc_varint(len, p);
+ csum_data = p.get_ptr(len);
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
+ if (has_unused()) {
+ denc(unused, p);
+ }
+ }
+
+ bool can_split() const {
+ return
+ !has_flag(FLAG_SHARED) &&
+ !has_flag(FLAG_COMPRESSED) &&
+ !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex
+ }
+ bool can_split_at(uint32_t blob_offset) const {
+ return !has_csum() || blob_offset % get_csum_chunk_size() == 0;
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_blob_t*>& ls);
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+ std::string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+
+ void set_compressed(uint64_t clen_orig, uint64_t clen) {
+ set_flag(FLAG_COMPRESSED);
+ logical_length = clen_orig;
+ compressed_length = clen;
+ }
+ bool is_mutable() const {
+ return !is_compressed() && !is_shared();
+ }
+ bool is_compressed() const {
+ return has_flag(FLAG_COMPRESSED);
+ }
+ bool has_csum() const {
+ return has_flag(FLAG_CSUM);
+ }
+ bool has_unused() const {
+ return has_flag(FLAG_HAS_UNUSED);
+ }
+ bool is_shared() const {
+ return has_flag(FLAG_SHARED);
+ }
+
+ /// return chunk (i.e. min readable block) size for the blob
+ uint64_t get_chunk_size(uint64_t dev_block_size) const {
+ return has_csum() ?
+ std::max<uint64_t>(dev_block_size, get_csum_chunk_size()) : dev_block_size;
+ }
+ uint32_t get_csum_chunk_size() const {
+ return 1 << csum_chunk_order;
+ }
+ uint32_t get_compressed_payload_length() const {
+ return is_compressed() ? compressed_length : 0;
+ }
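+ /// translate a blob-relative offset x_off into a device offset; if plen is
+ /// non-null it receives the number of contiguous bytes available in the
+ /// containing pextent starting at that offset.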
+ uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ if (plen)
+ *plen = p->length - x_off;
+ return p->offset + x_off;
+ }
+
+ // validate whether the status of the pextents within the given range
+ // meets the requirement (allocated or unallocated).
+ bool _validate_range(uint64_t b_off, uint64_t b_len,
+ bool require_allocated) const {
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (b_off >= p->length) {
+ b_off -= p->length;
+ if (++p == extents.end())
+ return false;
+ }
+ b_len += b_off;
+ while (b_len) {
+ if (require_allocated != p->is_valid()) {
+ return false;
+ }
+ if (p->length >= b_len) {
+ return true;
+ }
+ b_len -= p->length;
+ if (++p == extents.end())
+ return false;
+ }
+ ceph_abort_msg("we should not get here");
+ return false;
+ }
+
+ /// return true if the entire range is allocated
+ /// (mapped to extents on disk)
+ bool is_allocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, true);
+ }
+
+ /// return true if the entire range is unallocated
+ /// (not mapped to extents on disk)
+ bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
+ return _validate_range(b_off, b_len, false);
+ }
+
+ /// return true if the logical range has never been used
+ bool is_unused(uint64_t offset, uint64_t length) const {
+ if (!has_unused()) {
+ return false;
+ }
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ auto i = start;
+ while (i < end && (unused & (1u << i))) {
+ i++;
+ }
+ return i >= end;
+ }
+
+ /// mark a range that has never been used
+ void add_unused(uint64_t offset, uint64_t length) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = round_up_to(offset, chunk_size) / chunk_size;
+ uint64_t end = (offset + length) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused |= (1u << i);
+ }
+ if (start != end) {
+ set_flag(FLAG_HAS_UNUSED);
+ }
+ }
+
+ /// indicate that a range has (now) been used.
+ void mark_used(uint64_t offset, uint64_t length) {
+ if (has_unused()) {
+ ceph_assert(!is_compressed());
+ uint64_t blob_len = get_logical_length();
+ ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+ ceph_assert(offset + length <= blob_len);
+ uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+ uint64_t start = offset / chunk_size;
+ uint64_t end = round_up_to(offset + length, chunk_size) / chunk_size;
+ for (auto i = start; i < end; ++i) {
+ unused &= ~(1u << i);
+ }
+ if (unused == 0) {
+ clear_flag(FLAG_HAS_UNUSED);
+ }
+ }
+ }
+
+ // map_f_invoke overloads: forward to the provided callback while hiding
+ // the parameters it does not accept
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ uint64_t,
+ uint64_t>>::type* = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const{
+ return f(o, l);
+ }
+
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ uint64_t,
+ uint64_t,
+ uint64_t>>::type * = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const {
+ return f(lo, o, l);
+ }
+
+ template<class F, typename std::enable_if<std::is_invocable_r_v<
+ int,
+ F,
+ const bluestore_pextent_t&,
+ uint64_t,
+ uint64_t>>::type * = nullptr>
+ int map_f_invoke(uint64_t lo,
+ const bluestore_pextent_t& p,
+ uint64_t o,
+ uint64_t l, F&& f) const {
+ return f(p, o, l);
+ }
+
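+ /// Walk the physical segments backing the logical range
+ /// [x_off, x_off+x_len), calling f once per segment (see map_f_invoke for
+ /// the accepted callback signatures); stops and returns early if f returns
+ /// a negative value.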
+ template<class F>
+ int map(uint64_t x_off, uint64_t x_len, F&& f) const {
+ auto x_off0 = x_off;
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ while (x_len > 0 && p != extents.end()) {
+ uint64_t l = std::min(p->length - x_off, x_len);
+ int r = map_f_invoke(x_off0, *p, p->offset + x_off, l, f);
+ if (r < 0)
+ return r;
+ x_off = 0;
+ x_len -= l;
+ x_off0 += l;
+ ++p;
+ }
+ return 0;
+ }
+
+ template<class F>
+ void map_bl(uint64_t x_off,
+ ceph::buffer::list& bl,
+ F&& f) const {
+ static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);
+
+ auto p = extents.begin();
+ ceph_assert(p != extents.end());
+ while (x_off >= p->length) {
+ x_off -= p->length;
+ ++p;
+ ceph_assert(p != extents.end());
+ }
+ ceph::buffer::list::iterator it = bl.begin();
+ uint64_t x_len = bl.length();
+ while (x_len > 0) {
+ ceph_assert(p != extents.end());
+ uint64_t l = std::min(p->length - x_off, x_len);
+ ceph::buffer::list t;
+ it.copy(l, t);
+ f(p->offset + x_off, t);
+ x_off = 0;
+ x_len -= l;
+ ++p;
+ }
+ }
+
+ uint32_t get_ondisk_length() const {
+ uint32_t len = 0;
+ for (auto &p : extents) {
+ len += p.length;
+ }
+ return len;
+ }
+
+ uint32_t get_logical_length() const {
+ return logical_length;
+ }
+ size_t get_csum_value_size() const;
+
+ size_t get_csum_count() const {
+ size_t vs = get_csum_value_size();
+ if (!vs)
+ return 0;
+ return csum_data.length() / vs;
+ }
+ uint64_t get_csum_item(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ const char *p = csum_data.c_str();
+ switch (cs) {
+ case 0:
+ ceph_abort_msg("no csum data, bad index");
+ case 1:
+ return reinterpret_cast<const uint8_t*>(p)[i];
+ case 2:
+ return reinterpret_cast<const ceph_le16*>(p)[i];
+ case 4:
+ return reinterpret_cast<const ceph_le32*>(p)[i];
+ case 8:
+ return reinterpret_cast<const ceph_le64*>(p)[i];
+ default:
+ ceph_abort_msg("unrecognized csum word size");
+ }
+ }
+ const char *get_csum_item_ptr(unsigned i) const {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+ char *get_csum_item_ptr(unsigned i) {
+ size_t cs = get_csum_value_size();
+ return csum_data.c_str() + (cs * i);
+ }
+
+ void init_csum(unsigned type, unsigned order, unsigned len) {
+ flags |= FLAG_CSUM;
+ csum_type = type;
+ csum_chunk_order = order;
+ csum_data = ceph::buffer::create(get_csum_value_size() * len / get_csum_chunk_size());
+ csum_data.zero();
+ csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+ }
+
+ /// calculate csum for the buffer at the given b_off
+ void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);
+
+ /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
+ /// return -1 and valid(nonnegative) b_bad_off for checksum error;
+ /// return 0 if all is well.
+ int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
+ uint64_t *bad_csum) const;
+
+ bool can_prune_tail() const {
+ return
+ extents.size() > 1 && // if it's all invalid it's not pruning.
+ !extents.back().is_valid() &&
+ !has_unused();
+ }
+ void prune_tail() {
+ const auto &p = extents.back();
+ logical_length -= p.length;
+ extents.pop_back();
+ if (has_csum()) {
+ ceph::buffer::ptr t;
+ t.swap(csum_data);
+ csum_data = ceph::buffer::ptr(t.c_str(),
+ get_logical_length() / get_csum_chunk_size() *
+ get_csum_value_size());
+ }
+ }
+ void add_tail(uint32_t new_len) {
+ ceph_assert(is_mutable());
+ ceph_assert(!has_unused());
+ ceph_assert(new_len > logical_length);
+ extents.emplace_back(
+ bluestore_pextent_t(
+ bluestore_pextent_t::INVALID_OFFSET,
+ new_len - logical_length));
+ logical_length = new_len;
+ if (has_csum()) {
+ ceph::buffer::ptr t;
+ t.swap(csum_data);
+ csum_data = ceph::buffer::create(
+ get_csum_value_size() * logical_length / get_csum_chunk_size());
+ csum_data.copy_in(0, t.length(), t.c_str());
+ csum_data.zero(t.length(), csum_data.length() - t.length());
+ }
+ }
+ uint32_t get_release_size(uint32_t min_alloc_size) const {
+ if (is_compressed()) {
+ return get_logical_length();
+ }
+ uint32_t res = get_csum_chunk_size();
+ if (!has_csum() || res < min_alloc_size) {
+ res = min_alloc_size;
+ }
+ return res;
+ }
+
+ void split(uint32_t blob_offset, bluestore_blob_t& rb);
+ void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
+ void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
+
+ /// updates the blob's pextent container and returns the unused pextents
+ /// eligible for release.
+ /// all - indicates that the whole blob is to be released.
+ /// logical - the set of logical extents within the blob to be released.
+ /// Returns true if the blob has no more valid pextents.
+ bool release_extents(
+ bool all,
+ const PExtentVector& logical,
+ PExtentVector* r);
+};
+WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
+
+
+/// shared blob state
+struct bluestore_shared_blob_t {
+ MEMPOOL_CLASS_HELPERS();
+ uint64_t sbid; ///< shared blob id
+ bluestore_extent_ref_map_t ref_map; ///< shared blob extents
+
+ bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
+ bluestore_shared_blob_t(uint64_t _sbid,
+ bluestore_extent_ref_map_t&& _ref_map )
+ : sbid(_sbid), ref_map(std::move(_ref_map)) {}
+
+ DENC(bluestore_shared_blob_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.ref_map, p);
+ DENC_FINISH(p);
+ }
+
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);
+
+ bool empty() const {
+ return ref_map.empty();
+ }
+};
+WRITE_CLASS_DENC(bluestore_shared_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
+
+/// onode: per-object metadata
+struct bluestore_onode_t {
+ uint64_t nid = 0; ///< numeric id (locally unique)
+ uint64_t size = 0; ///< object size
+ // mempool to be assigned to buffer::ptr manually
+ std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;
+
+ struct shard_info {
+ uint32_t offset = 0; ///< logical offset for start of shard
+ uint32_t bytes = 0; ///< encoded bytes
+ DENC(shard_info, v, p) {
+ denc_varint(v.offset, p);
+ denc_varint(v.bytes, p);
+ }
+ void dump(ceph::Formatter *f) const;
+ };
+ std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)
+
+ uint32_t expected_object_size = 0;
+ uint32_t expected_write_size = 0;
+ uint32_t alloc_hint_flags = 0;
+
+ uint8_t flags = 0;
+
+ enum {
+ FLAG_OMAP = 1, ///< object may have omap data
+ FLAG_PGMETA_OMAP = 2, ///< omap data is in meta omap prefix
+ FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys
+ FLAG_PERPG_OMAP = 8, ///< omap data is in per-pg prefix; per-pg keys
+ };
+
+ std::string get_flags_string() const {
+ std::string s;
+ if (flags & FLAG_OMAP) {
+ s = "omap";
+ }
+ if (flags & FLAG_PGMETA_OMAP) {
+ s += "+pgmeta_omap";
+ }
+ if (flags & FLAG_PERPOOL_OMAP) {
+ s += "+per_pool_omap";
+ }
+ if (flags & FLAG_PERPG_OMAP) {
+ s += "+per_pg_omap";
+ }
+ return s;
+ }
+
+ bool has_flag(unsigned f) const {
+ return flags & f;
+ }
+
+ void set_flag(unsigned f) {
+ flags |= f;
+ }
+
+ void clear_flag(unsigned f) {
+ flags &= ~f;
+ }
+
+ bool has_omap() const {
+ return has_flag(FLAG_OMAP);
+ }
+
+ static bool is_pgmeta_omap(uint8_t flags) {
+ return flags & FLAG_PGMETA_OMAP;
+ }
+ static bool is_perpool_omap(uint8_t flags) {
+ return flags & FLAG_PERPOOL_OMAP;
+ }
+ static bool is_perpg_omap(uint8_t flags) {
+ return flags & FLAG_PERPG_OMAP;
+ }
+ bool is_pgmeta_omap() const {
+ return has_flag(FLAG_PGMETA_OMAP);
+ }
+ bool is_perpool_omap() const {
+ return has_flag(FLAG_PERPOOL_OMAP);
+ }
+ bool is_perpg_omap() const {
+ return has_flag(FLAG_PERPG_OMAP);
+ }
+
+ void set_omap_flags(bool legacy) {
+ set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
+ }
+ void set_omap_flags_pgmeta() {
+ set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
+ }
+
+ void clear_omap_flag() {
+ clear_flag(FLAG_OMAP);
+ }
+
+ DENC(bluestore_onode_t, v, p) {
+ DENC_START(1, 1, p);
+ denc_varint(v.nid, p);
+ denc_varint(v.size, p);
+ denc(v.attrs, p);
+ denc(v.flags, p);
+ denc(v.extent_map_shards, p);
+ denc_varint(v.expected_object_size, p);
+ denc_varint(v.expected_write_size, p);
+ denc_varint(v.alloc_hint_flags, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_onode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
+WRITE_CLASS_DENC(bluestore_onode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
+
+/// writeahead-logged op
+struct bluestore_deferred_op_t {
+ typedef enum {
+ OP_WRITE = 1,
+ } type_t;
+ __u8 op = 0;
+
+ PExtentVector extents;
+ ceph::buffer::list data;
+
+ DENC(bluestore_deferred_op_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.op, p);
+ denc(v.extents, p);
+ denc(v.data, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_op_t)
+
+
+/// writeahead-logged transaction
+struct bluestore_deferred_transaction_t {
+ uint64_t seq = 0;
+ std::list<bluestore_deferred_op_t> ops;
+ interval_set<uint64_t> released; ///< allocations to release after tx
+
+ bluestore_deferred_transaction_t() : seq(0) {}
+
+ DENC(bluestore_deferred_transaction_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.seq, p);
+ denc(v.ops, p);
+ denc(v.released, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
+
+struct bluestore_compression_header_t {
+ uint8_t type = Compressor::COMP_ALG_NONE;
+ uint32_t length = 0;
+ boost::optional<int32_t> compressor_message;
+
+ bluestore_compression_header_t() {}
+ bluestore_compression_header_t(uint8_t _type)
+ : type(_type) {}
+
+ DENC(bluestore_compression_header_t, v, p) {
+ DENC_START(2, 1, p);
+ denc(v.type, p);
+ denc(v.length, p);
+ if (struct_v >= 2) {
+ denc(v.compressor_message, p);
+ }
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_compression_header_t)
+
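+// Probabilistic reference-count checker intended for fsck-style
+// verification: each inc() updates two signed counters chosen by two
+// independent hashes of the key. Balanced sequences of increments and
+// decrements drive every touched counter back to zero, so a non-zero bucket
+// (see count_non_zero()) signals an unbalanced reference, modulo hash
+// collisions.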
+template <template <typename> typename V, class COUNTER_TYPE = int32_t>
+class ref_counter_2hash_tracker_t {
+ size_t num_non_zero = 0;
+ size_t num_buckets = 0;
+ V<COUNTER_TYPE> buckets1;
+ V<COUNTER_TYPE> buckets2;
+
+public:
+ ref_counter_2hash_tracker_t(uint64_t mem_cap) {
+ num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
+ ceph_assert(num_buckets);
+ buckets1.resize(num_buckets);
+ buckets2.resize(num_buckets);
+ reset();
+ }
+
+ size_t get_num_buckets() const {
+ return num_buckets;
+ }
+
+ void inc(const char* hash_val, size_t hash_val_len, int n) {
+ auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
+ num_buckets;
+ if (buckets1[h] == 0 && n) {
+ ++num_non_zero;
+ } else if (buckets1[h] == -n) {
+ --num_non_zero;
+ }
+ buckets1[h] += n;
+ h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
+ if (buckets2[h] == 0 && n) {
+ ++num_non_zero;
+ } else if (buckets2[h] == -n) {
+ --num_non_zero;
+ }
+ buckets2[h] += n;
+ }
+
+ bool test_hash_conflict(
+ const char* hash_val1,
+ const char* hash_val2,
+ size_t hash_val_len) const {
+
+ auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
+ auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
+ auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
+ auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
+ return ((h1 % num_buckets) == (h2 % num_buckets)) &&
+ ((h3 % num_buckets) == (h4 % num_buckets));
+ }
+
+ bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
+ auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
+ if (buckets1[h % num_buckets] != 0) {
+ return false;
+ }
+ h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
+ return buckets2[h % num_buckets] == 0;
+ }
+
+ // returns number of mismatching buckets
+ size_t count_non_zero() const {
+ return num_non_zero;
+ }
+ void reset() {
+ for (size_t i = 0; i < num_buckets; i++) {
+ buckets1[i] = 0;
+ buckets2[i] = 0;
+ }
+ num_non_zero = 0;
+ }
+};
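The class above keeps reference counts in two independently hashed bucket arrays so fsck can verify, in bounded memory, that every recorded reference is eventually released; only a simultaneous collision in both hashes can hide a mismatch. A compact illustration of that idea, using std::hash stand-ins for the ceph_str_hash_* functions (an assumption; the real class hashes raw byte buffers):

    #include <cstdint>
    #include <functional>
    #include <string>
    #include <vector>

    // Two independently hashed counter arrays: if every +n is matched by a -n,
    // all buckets return to zero; a leftover non-zero bucket flags a mismatch,
    // and only a simultaneous collision in both hashes can mask one.
    struct TwoHashCounter {
      std::vector<int32_t> b1, b2;
      size_t nonzero = 0;
      explicit TwoHashCounter(size_t n) : b1(n), b2(n) {}
      void bump(std::vector<int32_t>& b, size_t h, int n) {
        if (b[h] == 0 && n) ++nonzero;   // bucket leaves zero
        else if (b[h] == -n) --nonzero;  // bucket returns to zero
        b[h] += n;
      }
      void inc(const std::string& key, int n) {
        bump(b1, std::hash<std::string>{}(key) % b1.size(), n);
        bump(b2, std::hash<std::string>{}("salt:" + key) % b2.size(), n);  // second, salted hash
      }
      size_t count_non_zero() const { return nonzero; }
    };

    int main() {
      TwoHashCounter t(64);
      t.inc("sbid=1,off=0", 2);    // two references recorded
      t.inc("sbid=1,off=0", -2);   // both released
      return t.count_non_zero() == 0 ? 0 : 1;
    }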
+
+class shared_blob_2hash_tracker_t
+ : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
+
+ static const size_t hash_input_len = 3;
+
+ typedef std::array<uint64_t, hash_input_len> hash_input_t;
+
+ static size_t get_hash_input_size() {
+ return hash_input_len * sizeof(hash_input_t::value_type);
+ }
+
+ inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
+ size_t au_void_bits = 0;
+
+
+public:
+ shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
+ : ref_counter_2hash_tracker_t(mem_cap) {
+ ceph_assert(alloc_unit);
+ ceph_assert(isp2(alloc_unit));
+ au_void_bits = ctz(alloc_unit);
+ }
+ void inc(uint64_t sbid, uint64_t offset, int n);
+ void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);
+
+ bool test_hash_conflict(
+ uint64_t sbid,
+ uint64_t offset,
+ uint64_t sbid2,
+ uint64_t offset2) const;
+ bool test_all_zero(
+ uint64_t sbid,
+ uint64_t offset) const;
+ bool test_all_zero_range(
+ uint64_t sbid,
+ uint64_t offset,
+ uint32_t len) const;
+};
+
+class sb_info_t {
+ // a negative value indicates a (potentially) stray blob,
+ // i.e. a blob that has no real references from onodes
+ int64_t sbid = 0;
+
+public:
+ enum {
+ INVALID_POOL_ID = INT64_MIN
+ };
+
+ int64_t pool_id = INVALID_POOL_ID;
+ // a negative value indicates compressed_allocated as well
+ int32_t allocated_chunks = 0;
+
+ sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
+ {
+ }
+ bool operator< (const sb_info_t& other) const {
+ return std::abs(sbid) < std::abs(other.sbid);
+ }
+ bool operator< (const uint64_t& other_sbid) const {
+ return uint64_t(std::abs(sbid)) < other_sbid;
+ }
+ bool is_stray() const {
+ return sbid < 0;
+ }
+ uint64_t get_sbid() const {
+ return uint64_t(std::abs(sbid));
+ }
+ void adopt() {
+ sbid = std::abs(sbid);
+ }
+} __attribute__((packed));
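sb_info_t above folds the "stray" marker into the sign of sbid instead of spending an extra field on it (the struct is packed to keep fsck memory use down). A tiny illustrative sketch of that trick:

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    int main() {
      int64_t sbid = -42;                      // negative: blob seen, but no onode reference yet
      assert(sbid < 0);                        // is_stray()
      uint64_t id = uint64_t(std::abs(sbid));  // get_sbid() strips the marker
      sbid = std::abs(sbid);                   // adopt(): a real reference was found
      assert(id == 42 && sbid == 42);
      return 0;
    }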
+
+// Space-efficient container to keep a set of sb_info structures,
+// given that the majority of entries are appended in proper id-sorted
+// order. Hence most of them can live in a regular vector, and a binary
+// search is applied whenever a specific entry has to be found.
+// For the rare occasions when an out-of-order append takes place, an
+// auxiliary, separately sorted vector is used.
+struct sb_info_space_efficient_map_t {
+ // large array sorted by the user
+ mempool::bluestore_fsck::vector<sb_info_t> items;
+ // small additional set of items whose sorted order we maintain ourselves;
+ // it never holds an entry with id > items.back().get_sbid()
+ mempool::bluestore_fsck::vector<sb_info_t> aux_items;
+
+ sb_info_t& add_maybe_stray(uint64_t sbid) {
+ return _add(-int64_t(sbid));
+ }
+ sb_info_t& add_or_adopt(uint64_t sbid) {
+ auto& r = _add(sbid);
+ r.adopt();
+ return r;
+ }
+ auto find(uint64_t id) {
+ if (items.size() != 0) {
+ auto it = std::lower_bound(
+ items.begin(),
+ items.end() - 1,
+ id,
+ [](const sb_info_t& a, const uint64_t& b) {
+ return a < b;
+ });
+ if (it->get_sbid() == id) {
+ return it;
+ }
+ if (aux_items.size() != 0) {
+ auto it = std::lower_bound(
+ aux_items.begin(),
+ aux_items.end(),
+ id,
+ [](const sb_info_t& a, const uint64_t& b) {
+ return a < b;
+ });
+ if (it->get_sbid() == id) {
+ return it;
+ }
+ }
+ }
+ return items.end();
+ }
+ // enumerates strays, order isn't guaranteed.
+ void foreach_stray(std::function<void(const sb_info_t&)> cb) {
+ for (auto& sbi : items) {
+ if (sbi.is_stray()) {
+ cb(sbi);
+ }
+ }
+ for (auto& sbi : aux_items) {
+ if (sbi.is_stray()) {
+ cb(sbi);
+ }
+ }
+ }
+ auto end() {
+ return items.end();
+ }
+
+ void shrink() {
+ items.shrink_to_fit();
+ aux_items.shrink_to_fit();
+ }
+ void clear() {
+ items.clear();
+ aux_items.clear();
+ shrink();
+ }
+private:
+ sb_info_t& _add(int64_t id) {
+ uint64_t n_id = uint64_t(std::abs(id));
+ if (items.size() == 0 || n_id > items.back().get_sbid()) {
+ return items.emplace_back(id);
+ }
+ auto it = find(n_id);
+ if (it != items.end()) {
+ return *it;
+ }
+ if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
+ return aux_items.emplace_back(id);
+ }
+ // do sorted insertion, may be expensive!
+ it = std::upper_bound(
+ aux_items.begin(),
+ aux_items.end(),
+ n_id,
+ [](const uint64_t& a, const sb_info_t& b) {
+ return a < b.get_sbid();
+ });
+ return *aux_items.emplace(it, id);
+ }
+};
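sb_info_space_efficient_map_t above banks on appends arriving almost always in ascending id order, so lookups stay a binary search over one flat vector, with a small sorted side vector absorbing the rare out-of-order ids. A minimal model of that access pattern, using plain std::vector and bare ids in place of the mempool-backed sb_info_t entries:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<uint64_t> items;   // bulk, appended in ascending id order
      std::vector<uint64_t> aux;     // rare out-of-order ids, kept sorted
      auto add = [&](uint64_t id) {
        if (items.empty() || id > items.back()) { items.push_back(id); return; }
        aux.insert(std::upper_bound(aux.begin(), aux.end(), id), id);  // expensive, but rare
      };
      auto find = [&](uint64_t id) {
        return std::binary_search(items.begin(), items.end(), id) ||
               std::binary_search(aux.begin(), aux.end(), id);
      };
      for (uint64_t id : {1, 2, 5, 9, 4}) add(id);   // 4 arrives out of order
      assert(find(4) && find(9) && !find(7));
      return 0;
    }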
+
+#endif
diff --git a/src/os/bluestore/fastbmap_allocator_impl.cc b/src/os/bluestore/fastbmap_allocator_impl.cc
new file mode 100644
index 000000000..595b12485
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.cc
@@ -0,0 +1,717 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#include "fastbmap_allocator_impl.h"
+
+uint64_t AllocatorLevel::l0_dives = 0;
+uint64_t AllocatorLevel::l0_iterations = 0;
+uint64_t AllocatorLevel::l0_inner_iterations = 0;
+uint64_t AllocatorLevel::alloc_fragments = 0;
+uint64_t AllocatorLevel::alloc_fragments_fast = 0;
+uint64_t AllocatorLevel::l2_allocs = 0;
+
+inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
+{
+ interval_t res;
+ if (len >= min_length) {
+ res.offset = p2roundup(offset, min_length);
+ auto delta_off = res.offset - offset;
+ if (len > delta_off) {
+ res.length = len - delta_off;
+ res.length = p2align<uint64_t>(res.length, min_length);
+ if (res.length) {
+ return res;
+ }
+ }
+ }
+ return interval_t();
+}
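_align2units trims a candidate extent so that both its start and its length land on min_length boundaries, returning an empty interval when nothing usable remains. A quick standalone check of the same arithmetic, with p2roundup/p2align spelled out as plain bit masks (assuming, as the allocator does, that min_length is a power of two):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t offset = 5, len = 20, min_length = 8;                   // all in allocation units
      uint64_t start = (offset + min_length - 1) & ~(min_length - 1);  // p2roundup(offset, min_length)
      uint64_t usable = (len - (start - offset)) & ~(min_length - 1);  // p2align of the remainder
      assert(start == 8 && usable == 16);                              // [5, 25) shrinks to [8, 24)
      return 0;
    }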
+
+interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
+ uint64_t pos1, uint64_t min_length, interval_t* tail) const
+{
+ interval_t res;
+ if (pos0 >= pos1) {
+ return res;
+ }
+ auto pos = pos0;
+
+ interval_t res_candidate;
+ if (tail->length != 0) {
+ ceph_assert((tail->offset % l0_granularity) == 0);
+ ceph_assert((tail->length % l0_granularity) == 0);
+ res_candidate.offset = tail->offset / l0_granularity;
+ res_candidate.length = tail->length / l0_granularity;
+ }
+ *tail = interval_t();
+
+ auto d = bits_per_slot;
+ slot_t bits = l0[pos / d];
+ bits >>= pos % d;
+ bool end_loop = false;
+ auto min_granules = min_length / l0_granularity;
+
+ do {
+ if ((pos % d) == 0) {
+ bits = l0[pos / d];
+ if (pos1 - pos >= d) {
+ switch(bits) {
+ case all_slot_set:
+ // slot is totally free
+ if (!res_candidate.length) {
+ res_candidate.offset = pos;
+ }
+ res_candidate.length += d;
+ pos += d;
+ end_loop = pos >= pos1;
+ if (end_loop) {
+ *tail = res_candidate;
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if(res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ }
+ continue;
+ case all_slot_clear:
+ // slot is totally allocated
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ res_candidate = interval_t();
+ pos += d;
+ end_loop = pos >= pos1;
+ continue;
+ }
+ }
+ } //if ((pos % d) == 0)
+
+ end_loop = ++pos >= pos1;
+ if (bits & 1) {
+ // item is free
+ if (!res_candidate.length) {
+ res_candidate.offset = pos - 1;
+ }
+ ++res_candidate.length;
+ if (end_loop) {
+ *tail = res_candidate;
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ }
+ } else {
+ res_candidate = _align2units(res_candidate.offset,
+ res_candidate.length, min_granules);
+ if (res.length < res_candidate.length) {
+ res = res_candidate;
+ }
+ res_candidate = interval_t();
+ }
+ bits >>= 1;
+ } while (!end_loop);
+ res.offset *= l0_granularity;
+ res.length *= l0_granularity;
+ tail->offset *= l0_granularity;
+ tail->length *= l0_granularity;
+ return res;
+}
+
+void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
+ uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
+ search_ctx_t* ctx)
+{
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert((pos_start % d) == 0);
+ ceph_assert((pos_end % d) == 0);
+
+ uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+
+ uint64_t l1_pos = pos_start;
+ const interval_t empty_tail;
+ interval_t prev_tail;
+
+ uint64_t next_free_l1_pos = 0;
+ for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
+ slot_t slot_val = l1[pos];
+ // FIXME minor: code below can be optimized to check slot_val against
+ // all_slot_set(_clear) value
+
+ for (auto c = 0; c < d; c++) {
+ switch (slot_val & L1_ENTRY_MASK) {
+ case L1_ENTRY_FREE:
+ prev_tail = empty_tail;
+ if (!ctx->free_count) {
+ ctx->free_l1_pos = l1_pos;
+ } else if (l1_pos != next_free_l1_pos){
+ auto o = ctx->free_l1_pos * l1_granularity;
+ auto l = ctx->free_count * l1_granularity;
+ // check if already found extent fits min_length after alignment
+ if (_align2units(o, l, min_length).length >= min_length) {
+ break;
+ }
+ // if not - proceed with the next one
+ ctx->free_l1_pos = l1_pos;
+ ctx->free_count = 0;
+ }
+ next_free_l1_pos = l1_pos + 1;
+ ++ctx->free_count;
+ if (mode == STOP_ON_EMPTY) {
+ return;
+ }
+ break;
+ case L1_ENTRY_FULL:
+ prev_tail = empty_tail;
+ break;
+ case L1_ENTRY_PARTIAL:
+ interval_t longest;
+ ++ctx->partial_count;
+
+ longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);
+
+ if (longest.length >= length) {
+ if ((ctx->affordable_len == 0) ||
+ ((ctx->affordable_len != 0) &&
+ (longest.length < ctx->affordable_len))) {
+ ctx->affordable_len = longest.length;
+ ctx->affordable_offs = longest.offset;
+ }
+ }
+ if (longest.length >= min_length &&
+ (ctx->min_affordable_len == 0 ||
+ (longest.length < ctx->min_affordable_len))) {
+
+ ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
+ ctx->min_affordable_offs = longest.offset;
+ }
+ if (mode == STOP_ON_PARTIAL) {
+ return;
+ }
+ break;
+ }
+ slot_val >>= L1_ENTRY_WIDTH;
+ ++l1_pos;
+ }
+ }
+ ctx->fully_processed = true;
+}
+
+void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
+{
+ if (l0_pos == l0_pos_end) {
+ return;
+ }
+ auto d0 = bits_per_slotset;
+ uint64_t l1_w = L1_ENTRIES_PER_SLOT;
+ // this should be aligned with slotset boundaries
+ ceph_assert(0 == (l0_pos % d0));
+ ceph_assert(0 == (l0_pos_end % d0));
+
+ int64_t idx = l0_pos / bits_per_slot;
+ int64_t idx_end = l0_pos_end / bits_per_slot;
+ slot_t mask_to_apply = L1_ENTRY_NOT_USED;
+
+ auto l1_pos = l0_pos / d0;
+
+ while (idx < idx_end) {
+ if (l0[idx] == all_slot_clear) {
+ // if not all prev slots are allocated then no need to check the
+ // current slot set, it's partial
+ ++idx;
+ if (mask_to_apply == L1_ENTRY_NOT_USED) {
+ mask_to_apply = L1_ENTRY_FULL;
+ } else if (mask_to_apply != L1_ENTRY_FULL) {
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ }
+ } else if (l0[idx] == all_slot_set) {
+ // if not all prev slots are free then no need to check the
+ // current slot set, it's partial
+ ++idx;
+ if (mask_to_apply == L1_ENTRY_NOT_USED) {
+ mask_to_apply = L1_ENTRY_FREE;
+ } else if (mask_to_apply != L1_ENTRY_FREE) {
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ }
+ } else {
+ // no need to check the current slot set, it's partial
+ mask_to_apply = L1_ENTRY_PARTIAL;
+ ++idx;
+ idx = p2roundup(idx, int64_t(slots_per_slotset));
+ }
+ if ((idx % slots_per_slotset) == 0) {
+ ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
+ uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
+ slot_t& slot_val = l1[l1_pos / l1_w];
+ auto mask = slot_t(L1_ENTRY_MASK) << shift;
+
+ slot_t old_mask = (slot_val & mask) >> shift;
+ switch(old_mask) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count--;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count--;
+ break;
+ }
+ slot_val &= ~mask;
+ slot_val |= slot_t(mask_to_apply) << shift;
+ switch(mask_to_apply) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count++;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count++;
+ break;
+ }
+ mask_to_apply = L1_ENTRY_NOT_USED;
+ ++l1_pos;
+ }
+ }
+}
+
+void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
+ int64_t l0_pos_end)
+{
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+ slot_t* val_s = l0.data() + (pos / d0);
+ int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0));
+ while (pos < pos_e) {
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ pos++;
+ }
+ pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+ while (pos < pos_e) {
+ *(++val_s) = all_slot_clear;
+ pos += d0;
+ }
+ bits = 1;
+ ++val_s;
+ while (pos < l0_pos_end) {
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ pos++;
+ }
+}
+
+interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t pos_start, uint64_t pos_end)
+{
+ interval_t res = { 0, 0 };
+ uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+
+ if (unlikely(length <= l0_granularity)) {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
+ STOP_ON_PARTIAL, &ctx);
+
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ // allocate as specified
+ ceph_assert(ctx.affordable_len >= length);
+ auto pos = ctx.affordable_offs / l0_granularity;
+ _mark_alloc_l1_l0(pos, pos + 1);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+
+ // allocate from free slot sets
+ if (ctx.free_count) {
+ auto l = std::min(length, ctx.free_count * l1_granularity);
+ ceph_assert((l % l0_granularity) == 0);
+ auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;
+
+ _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+ res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+ return res;
+ }
+ } else if (unlikely(length == l1_granularity)) {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);
+
+ // allocate using contiguous extent found at l1 if any
+ if (ctx.free_count) {
+
+ auto l = std::min(length, ctx.free_count * l1_granularity);
+ ceph_assert((l % l0_granularity) == 0);
+ auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity;
+
+ _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+ res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+
+ return res;
+ }
+
+ // we can terminate earlier on free entry only
+ ceph_assert(ctx.fully_processed);
+
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ ceph_assert(ctx.affordable_len >= length);
+ ceph_assert((length % l0_granularity) == 0);
+ auto pos_start = ctx.affordable_offs / l0_granularity;
+ auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+ if (ctx.min_affordable_len) {
+ auto pos_start = ctx.min_affordable_offs / l0_granularity;
+ auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+ }
+ } else {
+ search_ctx_t ctx;
+ _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
+ ceph_assert(ctx.fully_processed);
+ // check partially free slot sets first (including neighboring),
+ // full length match required.
+ if (ctx.affordable_len) {
+ ceph_assert(ctx.affordable_len >= length);
+ ceph_assert((length % l0_granularity) == 0);
+ auto pos_start = ctx.affordable_offs / l0_granularity;
+ auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ res = interval_t(ctx.affordable_offs, length);
+ return res;
+ }
+ // allocate using contiguous extent found at l1 if affordable
+ // align allocated extent with min_length
+ if (ctx.free_count) {
+ auto o = ctx.free_l1_pos * l1_granularity;
+ auto l = ctx.free_count * l1_granularity;
+ interval_t aligned_extent = _align2units(o, l, min_length);
+ if (aligned_extent.length > 0) {
+ aligned_extent.length = std::min(length,
+ uint64_t(aligned_extent.length));
+ ceph_assert((aligned_extent.offset % l0_granularity) == 0);
+ ceph_assert((aligned_extent.length % l0_granularity) == 0);
+
+ auto pos_start = aligned_extent.offset / l0_granularity;
+ auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;
+
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return aligned_extent;
+ }
+ }
+ if (ctx.min_affordable_len) {
+ auto pos_start = ctx.min_affordable_offs / l0_granularity;
+ auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+ _mark_alloc_l1_l0(pos_start, pos_end);
+ return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+ }
+ }
+ return res;
+}
+
+bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t l1_pos_start, uint64_t l1_pos_end,
+ uint64_t* allocated,
+ interval_vector_t* res)
+{
+ uint64_t d0 = L0_ENTRIES_PER_SLOT;
+ uint64_t d1 = L1_ENTRIES_PER_SLOT;
+
+ ceph_assert(0 == (l1_pos_start % (slots_per_slotset * d1)));
+ ceph_assert(0 == (l1_pos_end % (slots_per_slotset * d1)));
+ if (min_length != l0_granularity) {
+ // probably not the most efficient way, but
+ // we don't care much about that at the moment
+ bool has_space = true;
+ while (length > *allocated && has_space) {
+ interval_t i =
+ _allocate_l1_contiguous(length - *allocated, min_length, max_length,
+ l1_pos_start, l1_pos_end);
+ if (i.length == 0) {
+ has_space = false;
+ } else {
+ _fragment_and_emplace(max_length, i.offset, i.length, res);
+ *allocated += i.length;
+ }
+ }
+ } else {
+ uint64_t l0_w = slots_per_slotset * d0;
+
+ for (auto idx = l1_pos_start / d1;
+ idx < l1_pos_end / d1 && length > *allocated;
+ ++idx) {
+ slot_t& slot_val = l1[idx];
+ if (slot_val == all_slot_clear) {
+ continue;
+ } else if (slot_val == all_slot_set) {
+ uint64_t to_alloc = std::min(length - *allocated,
+ l1_granularity * d1);
+ *allocated += to_alloc;
+ ++alloc_fragments_fast;
+ _fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
+ res);
+ _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
+ idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
+ continue;
+ }
+ auto free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ do {
+ ceph_assert(length > *allocated);
+
+ bool empty;
+ empty = _allocate_l0(length, max_length,
+ (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
+ (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
+ allocated,
+ res);
+
+ auto mask = slot_t(L1_ENTRY_MASK) << free_pos;
+
+ slot_t old_mask = (slot_val & mask) >> free_pos;
+ switch(old_mask) {
+ case L1_ENTRY_FREE:
+ unalloc_l1_count--;
+ break;
+ case L1_ENTRY_PARTIAL:
+ partial_l1_count--;
+ break;
+ }
+ slot_val &= ~mask;
+ if (empty) {
+ // the next line is a no-op with the current L1_ENTRY_FULL value, but it is
+ // left as-is for the sake of uniformity and to avoid potential errors
+ // in the future
+ slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
+ } else {
+ slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
+ partial_l1_count++;
+ }
+ if (length <= *allocated || slot_val == all_slot_clear) {
+ break;
+ }
+ free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH);
+ } while (free_pos < bits_per_slot);
+ }
+ }
+ return _is_empty_l1(l1_pos_start, l1_pos_end);
+}
+
+void AllocatorLevel01Loose::collect_stats(
+ std::map<size_t, size_t>& bins_overall)
+{
+ size_t free_seq_cnt = 0;
+ for (auto slot : l0) {
+ if (slot == all_slot_set) {
+ free_seq_cnt += L0_ENTRIES_PER_SLOT;
+ } else if(slot != all_slot_clear) {
+ size_t pos = 0;
+ do {
+ auto pos1 = find_next_set_bit(slot, pos);
+ if (pos1 == pos) {
+ free_seq_cnt++;
+ pos = pos1 + 1;
+ } else {
+ if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ free_seq_cnt = 0;
+ }
+ if (pos1 < bits_per_slot) {
+ free_seq_cnt = 1;
+ }
+ pos = pos1 + 1;
+ }
+ } while (pos < bits_per_slot);
+ } else if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ free_seq_cnt = 0;
+ }
+ }
+ if (free_seq_cnt) {
+ bins_overall[cbits(free_seq_cnt) - 1]++;
+ }
+}
+
+inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos)
+ {
+ #ifdef __GNUC__
+ size_t pos = __builtin_ffsll(slot_val >> start_pos);
+ if (pos == 0)
+ return sizeof(slot_t)*8 - start_pos;
+ return pos - 1;
+ #else
+ size_t pos = start_pos;
+ slot_t mask = slot_t(1) << pos;
+ while (pos < bits_per_slot && (slot_val & mask) == 0) {
+ mask <<= 1;
+ pos++;
+ }
+ return pos - start_pos;
+ #endif
+ }
+
+ inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos)
+ {
+ return count_0s(~slot_val, start_pos);
+ }
+void AllocatorLevel01Loose::foreach_internal(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+ size_t len = 0;
+ size_t off = 0;
+ for (size_t i = 0; i < l1.size(); i++)
+ {
+ for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH)
+ {
+ size_t w = (l1[i] >> j) & L1_ENTRY_MASK;
+ switch (w) {
+ case L1_ENTRY_FULL:
+ if (len > 0) {
+ notify(off, len);
+ len = 0;
+ }
+ break;
+ case L1_ENTRY_FREE:
+ if (len == 0)
+ off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot;
+ len += bits_per_slotset;
+ break;
+ case L1_ENTRY_PARTIAL:
+ size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset;
+ for (size_t t = 0; t < slots_per_slotset; t++) {
+ size_t p = 0;
+ slot_t allocation_pattern = l0[pos + t];
+ while (p < bits_per_slot) {
+ if (len == 0) {
+ //continue to skip allocated space, meaning bits set to 0
+ ssize_t alloc_count = count_0s(allocation_pattern, p);
+ p += alloc_count;
+ // we have now switched to expecting free space
+ if (p < bits_per_slot) {
+ // the bits at @p are now 1s
+ ssize_t free_count = count_1s(allocation_pattern, p);
+ assert(free_count > 0);
+ len = free_count;
+ off = (pos + t) * bits_per_slot + p;
+ p += free_count;
+ }
+ } else {
+ //continue free region
+ ssize_t free_count = count_1s(allocation_pattern, p);
+ if (free_count == 0) {
+ notify(off, len);
+ len = 0;
+ } else {
+ p += free_count;
+ len += free_count;
+ }
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ if (len > 0)
+ notify(off, len);
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_left_l0(int64_t l0_pos_start)
+{
+ int64_t d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start - 1;
+ slot_t bits = (slot_t)1 << (pos % d0);
+ int64_t idx = pos / d0;
+ slot_t* val_s = l0.data() + idx;
+
+ int64_t pos_e = p2align<int64_t>(pos, d0);
+
+ while (pos >= pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos + 1;
+ (*val_s) &= ~bits;
+ bits >>= 1;
+ --pos;
+ }
+ --idx;
+ val_s = l0.data() + idx;
+ while (idx >= 0 && (*val_s) == all_slot_set) {
+ *val_s = all_slot_clear;
+ --idx;
+ pos -= d0;
+ val_s = l0.data() + idx;
+ }
+
+ if (idx >= 0 &&
+ (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
+ int64_t pos_e = p2align<int64_t>(pos, d0);
+ slot_t bits = (slot_t)1 << (pos % d0);
+ while (pos >= pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos + 1;
+ (*val_s) &= ~bits;
+ bits >>= 1;
+ --pos;
+ }
+ }
+ return pos + 1;
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_right_l0(int64_t l0_pos_start)
+{
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ int64_t pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (pos % d0);
+ size_t idx = pos / d0;
+ if (idx >= l0.size()) {
+ return pos;
+ }
+ slot_t* val_s = l0.data() + idx;
+
+ int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+
+ while (pos < pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos;
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ ++pos;
+ }
+ ++idx;
+ val_s = l0.data() + idx;
+ while (idx < l0.size() && (*val_s) == all_slot_set) {
+ *val_s = all_slot_clear;
+ ++idx;
+ pos += d0;
+ val_s = l0.data() + idx;
+ }
+
+ if (idx < l0.size() &&
+ (*val_s) != all_slot_set && (*val_s) != all_slot_clear) {
+ int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+ slot_t bits = (slot_t)1 << (pos % d0);
+ while (pos < pos_e) {
+ if (0 == ((*val_s) & bits))
+ return pos;
+ (*val_s) &= ~bits;
+ bits <<= 1;
+ ++pos;
+ }
+ }
+ return pos;
+}
diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h
new file mode 100644
index 000000000..e44fc76d7
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.h
@@ -0,0 +1,845 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
+#define __FAST_BITMAP_ALLOCATOR_IMPL_H
+#include "include/intarith.h"
+
+#include <vector>
+#include <algorithm>
+#include <mutex>
+
+typedef uint64_t slot_t;
+
+#ifdef NON_CEPH_BUILD
+#include <assert.h>
+struct interval_t
+{
+ uint64_t offset = 0;
+ uint64_t length = 0;
+
+ interval_t() {}
+ interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+ interval_t(const interval_t &ext) :
+ offset(ext.offset), length(ext.length) {}
+};
+typedef std::vector<interval_t> interval_vector_t;
+typedef std::vector<slot_t> slot_vector_t;
+#else
+#include "include/ceph_assert.h"
+#include "common/likely.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+typedef bluestore_interval_t<uint64_t, uint64_t> interval_t;
+typedef PExtentVector interval_vector_t;
+
+typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t;
+
+#endif
+
+// fitting into cache line on x86_64
+static const size_t slots_per_slotset = 8; // 8 slots per set
+static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset;
+static const size_t bits_per_slot = sizeof(slot_t) * 8;
+static const size_t bits_per_slotset = slotset_bytes * 8;
+static const slot_t all_slot_set = 0xffffffffffffffff;
+static const slot_t all_slot_clear = 0;
+
+inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
+{
+#ifdef __GNUC__
+ if (start_pos == 0) {
+ start_pos = __builtin_ffsll(slot_val);
+ return start_pos ? start_pos - 1 : bits_per_slot;
+ }
+#endif
+ slot_t mask = slot_t(1) << start_pos;
+ while (start_pos < bits_per_slot && !(slot_val & mask)) {
+ mask <<= 1;
+ ++start_pos;
+ }
+ return start_pos;
+}
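find_next_set_bit above returns bits_per_slot when no set bit exists at or after start_pos, which callers use as a "nothing free here" sentinel. A hypothetical sanity check, assuming fastbmap_allocator_impl.h is available on the include path:

    #include "fastbmap_allocator_impl.h"
    #include <cassert>

    int main() {
      slot_t s = 0b10010000;                             // bits 4 and 7 set
      assert(find_next_set_bit(s, 0) == 4);
      assert(find_next_set_bit(s, 5) == 7);
      assert(find_next_set_bit(s, 8) == bits_per_slot);  // sentinel: nothing set past bit 7
      return 0;
    }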
+
+
+class AllocatorLevel
+{
+protected:
+
+ virtual uint64_t _children_per_slot() const = 0;
+ virtual uint64_t _level_granularity() const = 0;
+
+public:
+ static uint64_t l0_dives;
+ static uint64_t l0_iterations;
+ static uint64_t l0_inner_iterations;
+ static uint64_t alloc_fragments;
+ static uint64_t alloc_fragments_fast;
+ static uint64_t l2_allocs;
+
+ virtual ~AllocatorLevel()
+ {}
+
+ virtual void collect_stats(
+ std::map<size_t, size_t>& bins_overall) = 0;
+
+};
+
+class AllocatorLevel01 : public AllocatorLevel
+{
+protected:
+ slot_vector_t l0; // set bit means free entry
+ slot_vector_t l1;
+ uint64_t l0_granularity = 0; // space per entry
+ uint64_t l1_granularity = 0; // space per entry
+
+ size_t partial_l1_count = 0;
+ size_t unalloc_l1_count = 0;
+
+ double get_fragmentation() const {
+ double res = 0.0;
+ auto total = unalloc_l1_count + partial_l1_count;
+ if (total) {
+ res = double(partial_l1_count) / double(total);
+ }
+ return res;
+ }
+
+ uint64_t _level_granularity() const override
+ {
+ return l1_granularity;
+ }
+
+ inline bool _is_slot_fully_allocated(uint64_t idx) const {
+ return l1[idx] == all_slot_clear;
+ }
+public:
+ inline uint64_t get_min_alloc_size() const
+ {
+ return l0_granularity;
+ }
+
+};
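get_fragmentation above reports the share of partially used L1 entries among all L1 entries that are not fully allocated, i.e. how much of the still-usable space is already chopped up. The arithmetic in isolation:

    #include <cassert>

    int main() {
      // same formula as get_fragmentation(): partial / (unallocated + partial)
      double partial_l1_count = 3, unalloc_l1_count = 9;
      double res = partial_l1_count / (unalloc_l1_count + partial_l1_count);
      assert(res == 0.25);  // a quarter of the non-full L1 entries are only partially free
      return 0;
    }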
+
+template <class T>
+class AllocatorLevel02;
+
+class AllocatorLevel01Loose : public AllocatorLevel01
+{
+ enum {
+ L1_ENTRY_WIDTH = 2,
+ L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
+ L1_ENTRY_FULL = 0x00,
+ L1_ENTRY_PARTIAL = 0x01,
+ L1_ENTRY_NOT_USED = 0x02,
+ L1_ENTRY_FREE = 0x03,
+ L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32
+ L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
+ };
+ uint64_t _children_per_slot() const override
+ {
+ return L1_ENTRIES_PER_SLOT;
+ }
+
+ interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
+ uint64_t min_length, interval_t* tail) const;
+
+ inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
+ uint64_t len,
+ interval_vector_t* res)
+ {
+ auto it = res->rbegin();
+ if (max_length) {
+ if (it != res->rend() && it->offset + it->length == offset) {
+ auto l = max_length - it->length;
+ if (l >= len) {
+ it->length += len;
+ return;
+ } else {
+ offset += l;
+ len -= l;
+ it->length += l;
+ }
+ }
+
+ while (len > max_length) {
+ res->emplace_back(offset, max_length);
+ offset += max_length;
+ len -= max_length;
+ }
+ res->emplace_back(offset, len);
+ return;
+ }
+
+ if (it != res->rend() && it->offset + it->length == offset) {
+ it->length += len;
+ } else {
+ res->emplace_back(offset, len);
+ }
+ }
+
+ bool _allocate_l0(uint64_t length,
+ uint64_t max_length,
+ uint64_t l0_pos0, uint64_t l0_pos1,
+ uint64_t* allocated,
+ interval_vector_t* res)
+ {
+ uint64_t d0 = L0_ENTRIES_PER_SLOT;
+
+ ++l0_dives;
+
+ ceph_assert(l0_pos0 < l0_pos1);
+ ceph_assert(length > *allocated);
+ ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
+ ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
+ ceph_assert(((length - *allocated) % l0_granularity) == 0);
+
+ uint64_t need_entries = (length - *allocated) / l0_granularity;
+
+ for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
+ ++idx) {
+ ++l0_iterations;
+ slot_t& slot_val = l0[idx];
+ auto base = idx * d0;
+ if (slot_val == all_slot_clear) {
+ continue;
+ } else if (slot_val == all_slot_set) {
+ uint64_t to_alloc = std::min(need_entries, d0);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+
+ _fragment_and_emplace(max_length, base * l0_granularity,
+ to_alloc * l0_granularity, res);
+
+ if (to_alloc == d0) {
+ slot_val = all_slot_clear;
+ } else {
+ _mark_alloc_l0(base, base + to_alloc);
+ }
+ continue;
+ }
+
+ auto free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ auto next_pos = free_pos + 1;
+ while (next_pos < bits_per_slot &&
+ (next_pos - free_pos) < need_entries) {
+ ++l0_inner_iterations;
+
+ if (0 == (slot_val & (slot_t(1) << next_pos))) {
+ auto to_alloc = (next_pos - free_pos);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+ _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+ to_alloc * l0_granularity, res);
+ _mark_alloc_l0(base + free_pos, base + next_pos);
+ free_pos = find_next_set_bit(slot_val, next_pos + 1);
+ next_pos = free_pos + 1;
+ } else {
+ ++next_pos;
+ }
+ }
+ if (need_entries && free_pos < bits_per_slot) {
+ auto to_alloc = std::min(need_entries, d0 - free_pos);
+ *allocated += to_alloc * l0_granularity;
+ ++alloc_fragments;
+ need_entries -= to_alloc;
+ _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+ to_alloc * l0_granularity, res);
+ _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
+ }
+ }
+ return _is_empty_l0(l0_pos0, l0_pos1);
+ }
+
+protected:
+
+ friend class AllocatorLevel02<AllocatorLevel01Loose>;
+
+ void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+ {
+ l0_granularity = _alloc_unit;
+ // 512 bits at L0 mapped to L1 entry
+ l1_granularity = l0_granularity * bits_per_slotset;
+
+ // capacity to have slot alignment at l1
+ auto aligned_capacity =
+ p2roundup((int64_t)capacity,
+ int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
+ size_t slot_count =
+ aligned_capacity / l1_granularity / _children_per_slot();
+ // we use set bit(s) as a marker for (partially) free entry
+ l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+ // l0 slot count
+ size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
+ // we use set bit(s) as a marker for (partially) free entry
+ l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);
+
+ partial_l1_count = unalloc_l1_count = 0;
+ if (mark_as_free) {
+ unalloc_l1_count = slot_count * _children_per_slot();
+ auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity;
+ _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity);
+ }
+ }
+
+ struct search_ctx_t
+ {
+ size_t partial_count = 0;
+ size_t free_count = 0;
+ uint64_t free_l1_pos = 0;
+
+ uint64_t min_affordable_len = 0;
+ uint64_t min_affordable_offs = 0;
+ uint64_t affordable_len = 0;
+ uint64_t affordable_offs = 0;
+
+ bool fully_processed = false;
+
+ void reset()
+ {
+ *this = search_ctx_t();
+ }
+ };
+ enum {
+ NO_STOP,
+ STOP_ON_EMPTY,
+ STOP_ON_PARTIAL,
+ };
+ void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
+ uint64_t length, uint64_t min_length, int mode,
+ search_ctx_t* ctx);
+
+ void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
+ void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);
+ uint64_t _claim_free_to_left_l0(int64_t l0_pos_start);
+ uint64_t _claim_free_to_right_l0(int64_t l0_pos_start);
+
+
+ void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ _mark_alloc_l0(l0_pos_start, l0_pos_end);
+ l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
+ l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
+ _mark_l1_on_l0(l0_pos_start, l0_pos_end);
+ }
+
+ void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ auto d0 = L0_ENTRIES_PER_SLOT;
+
+ auto pos = l0_pos_start;
+ slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+ slot_t* val_s = &l0[pos / d0];
+ int64_t pos_e = std::min(l0_pos_end,
+ p2roundup<int64_t>(l0_pos_start + 1, d0));
+ while (pos < pos_e) {
+ *val_s |= bits;
+ bits <<= 1;
+ pos++;
+ }
+ pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+ while (pos < pos_e) {
+ *(++val_s) = all_slot_set;
+ pos += d0;
+ }
+ bits = 1;
+ ++val_s;
+ while (pos < l0_pos_end) {
+ *val_s |= bits;
+ bits <<= 1;
+ pos++;
+ }
+ }
+
+ void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+ {
+ _mark_free_l0(l0_pos_start, l0_pos_end);
+ l0_pos_start = p2align(l0_pos_start, int64_t(bits_per_slotset));
+ l0_pos_end = p2roundup(l0_pos_end, int64_t(bits_per_slotset));
+ _mark_l1_on_l0(l0_pos_start, l0_pos_end);
+ }
+
+ bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
+ {
+ bool no_free = true;
+ uint64_t d = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+ ceph_assert(0 == (l0_pos % d));
+ ceph_assert(0 == (l0_pos_end % d));
+
+ auto idx = l0_pos / L0_ENTRIES_PER_SLOT;
+ auto idx_end = l0_pos_end / L0_ENTRIES_PER_SLOT;
+ while (idx < idx_end && no_free) {
+ no_free = l0[idx] == all_slot_clear;
+ ++idx;
+ }
+ return no_free;
+ }
+ bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
+ {
+ bool no_free = true;
+ uint64_t d = slots_per_slotset * _children_per_slot();
+ ceph_assert(0 == (l1_pos % d));
+ ceph_assert(0 == (l1_pos_end % d));
+
+ auto idx = l1_pos / L1_ENTRIES_PER_SLOT;
+ auto idx_end = l1_pos_end / L1_ENTRIES_PER_SLOT;
+ while (idx < idx_end && no_free) {
+ no_free = _is_slot_fully_allocated(idx);
+ ++idx;
+ }
+ return no_free;
+ }
+
+ interval_t _allocate_l1_contiguous(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t pos_start, uint64_t pos_end);
+
+ bool _allocate_l1(uint64_t length,
+ uint64_t min_length, uint64_t max_length,
+ uint64_t l1_pos_start, uint64_t l1_pos_end,
+ uint64_t* allocated,
+ interval_vector_t* res);
+
+ uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
+ {
+ uint64_t l0_pos_start = offset / l0_granularity;
+ uint64_t l0_pos_end = p2roundup(offset + length, l0_granularity) / l0_granularity;
+ _mark_alloc_l1_l0(l0_pos_start, l0_pos_end);
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+
+ uint64_t _free_l1(uint64_t offs, uint64_t len)
+ {
+ uint64_t l0_pos_start = offs / l0_granularity;
+ uint64_t l0_pos_end = p2roundup(offs + len, l0_granularity) / l0_granularity;
+ _mark_free_l1_l0(l0_pos_start, l0_pos_end);
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+
+ uint64_t claim_free_to_left_l1(uint64_t offs)
+ {
+ uint64_t l0_pos_end = offs / l0_granularity;
+ uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end);
+ if (l0_pos_start < l0_pos_end) {
+ _mark_l1_on_l0(
+ p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+ p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+ return 0;
+ }
+
+ uint64_t claim_free_to_right_l1(uint64_t offs)
+ {
+ uint64_t l0_pos_start = offs / l0_granularity;
+ uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start);
+
+ if (l0_pos_start < l0_pos_end) {
+ _mark_l1_on_l0(
+ p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+ p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+ return l0_granularity * (l0_pos_end - l0_pos_start);
+ }
+ return 0;
+ }
+
+
+public:
+ uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ if (pos1 == 0) {
+ pos1 = l1.size() * L1_ENTRIES_PER_SLOT;
+ }
+ auto avail = debug_get_free(pos0, pos1);
+ return (pos1 - pos0) * l1_granularity - avail;
+ }
+
+ uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
+ {
+ ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
+ ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));
+
+ auto idx0 = l1_pos0 * slots_per_slotset;
+ auto idx1 = l1_pos1 * slots_per_slotset;
+
+ if (idx1 == 0) {
+ idx1 = l0.size();
+ }
+
+ uint64_t res = 0;
+ for (uint64_t i = idx0; i < idx1; ++i) {
+ auto v = l0[i];
+ if (v == all_slot_set) {
+ res += L0_ENTRIES_PER_SLOT;
+ } else if (v != all_slot_clear) {
+ size_t cnt = 0;
+#ifdef __GNUC__
+ cnt = __builtin_popcountll(v);
+#else
+ // Kernighan's Alg to count set bits
+ while (v) {
+ v &= (v - 1);
+ cnt++;
+ }
+#endif
+ res += cnt;
+ }
+ }
+ return res * l0_granularity;
+ }
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override;
+
+ static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
+ static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
+ void foreach_internal(std::function<void(uint64_t offset, uint64_t length)> notify);
+};
+
+
+class AllocatorLevel01Compact : public AllocatorLevel01
+{
+ uint64_t _children_per_slot() const override
+ {
+ return 8;
+ }
+public:
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override
+ {
+ // not implemented
+ }
+};
+
+template <class L1>
+class AllocatorLevel02 : public AllocatorLevel
+{
+public:
+ uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ std::lock_guard l(lock);
+ return l1.debug_get_free(pos0 * l1._children_per_slot() * bits_per_slot,
+ pos1 * l1._children_per_slot() * bits_per_slot);
+ }
+ uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+ {
+ std::lock_guard l(lock);
+ return l1.debug_get_allocated(pos0 * l1._children_per_slot() * bits_per_slot,
+ pos1 * l1._children_per_slot() * bits_per_slot);
+ }
+
+ uint64_t get_available()
+ {
+ std::lock_guard l(lock);
+ return available;
+ }
+ inline uint64_t get_min_alloc_size() const
+ {
+ return l1.get_min_alloc_size();
+ }
+ void collect_stats(
+ std::map<size_t, size_t>& bins_overall) override {
+
+ std::lock_guard l(lock);
+ l1.collect_stats(bins_overall);
+ }
+ uint64_t claim_free_to_left(uint64_t offset) {
+ std::lock_guard l(lock);
+ auto allocated = l1.claim_free_to_left_l1(offset);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+
+ uint64_t l2_pos = (offset - allocated) / l2_granularity;
+ uint64_t l2_pos_end =
+ p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ return allocated;
+ }
+
+ uint64_t claim_free_to_right(uint64_t offset) {
+ std::lock_guard l(lock);
+ auto allocated = l1.claim_free_to_right_l1(offset);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+
+ uint64_t l2_pos = (offset) / l2_granularity;
+ int64_t end = offset + allocated;
+ uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ return allocated;
+ }
+
+ void foreach_internal(
+ std::function<void(uint64_t offset, uint64_t length)> notify)
+ {
+ size_t alloc_size = get_min_alloc_size();
+ auto multiply_by_alloc_size = [alloc_size, notify](size_t off, size_t len) {
+ notify(off * alloc_size, len * alloc_size);
+ };
+ std::lock_guard l(lock);
+ l1.foreach_internal(multiply_by_alloc_size);
+ }
+ double get_fragmentation_internal() {
+ std::lock_guard l(lock);
+ return l1.get_fragmentation();
+ }
+
+protected:
+ ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock");
+ L1 l1;
+ slot_vector_t l2;
+ uint64_t l2_granularity = 0; // space per entry
+ uint64_t available = 0;
+ uint64_t last_pos = 0;
+
+ enum {
+ L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
+ };
+
+ uint64_t _children_per_slot() const override
+ {
+ return L1_ENTRIES_PER_SLOT;
+ }
+ uint64_t _level_granularity() const override
+ {
+ return l2_granularity;
+ }
+
+ void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+ {
+ ceph_assert(isp2(_alloc_unit));
+ l1._init(capacity, _alloc_unit, mark_as_free);
+
+ l2_granularity =
+ l1._level_granularity() * l1._children_per_slot() * slots_per_slotset;
+
+ // capacity to have slot alignment at l2
+ auto aligned_capacity =
+ p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
+ size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
+ // we use set bit(s) as a marker for (partially) free entry
+ l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+ if (mark_as_free) {
+ // capacity to have slotset alignment at l1
+ auto l2_pos_no_use =
+ p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
+ _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity);
+ available = p2align(capacity, _alloc_unit);
+ } else {
+ available = 0;
+ }
+ }
+
+ void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ while (l2_pos < l2_pos_end) {
+ l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
+ ++l2_pos;
+ }
+ }
+
+ void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ while (l2_pos < l2_pos_end) {
+ l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
+ ++l2_pos;
+ }
+ }
+
+ void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
+ {
+ auto d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(0 <= l2_pos_end);
+ ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+ auto idx = l2_pos * slots_per_slotset;
+ auto idx_end = l2_pos_end * slots_per_slotset;
+ bool all_allocated = true;
+ while (idx < idx_end) {
+ if (!l1._is_slot_fully_allocated(idx)) {
+ all_allocated = false;
+ idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
+ }
+ else {
+ ++idx;
+ }
+ if ((idx % slots_per_slotset) == 0) {
+ if (all_allocated) {
+ l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
+ }
+ else {
+ l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
+ }
+ all_allocated = true;
+ ++l2_pos;
+ }
+ }
+ }
+
+ void _allocate_l2(uint64_t length,
+ uint64_t min_length,
+ uint64_t max_length,
+ uint64_t hint,
+
+ uint64_t* allocated,
+ interval_vector_t* res)
+ {
+ uint64_t prev_allocated = *allocated;
+ uint64_t d = L1_ENTRIES_PER_SLOT;
+ ceph_assert(min_length <= l2_granularity);
+ ceph_assert(max_length == 0 || max_length >= min_length);
+ ceph_assert(max_length == 0 || (max_length % min_length) == 0);
+ ceph_assert(length >= min_length);
+ ceph_assert((length % min_length) == 0);
+
+ uint64_t cap = 1ull << 31;
+ if (max_length == 0 || max_length >= cap) {
+ max_length = cap;
+ }
+
+ uint64_t l1_w = slots_per_slotset * l1._children_per_slot();
+
+ std::lock_guard l(lock);
+
+ if (available < min_length) {
+ return;
+ }
+ if (hint != 0) {
+ last_pos = (hint / (d * l2_granularity)) < l2.size() ? p2align(hint / l2_granularity, d) : 0;
+ }
+ auto l2_pos = last_pos;
+ auto last_pos0 = last_pos;
+ auto pos = last_pos / d;
+ auto pos_end = l2.size();
+ // outer loop below is intended to optimize the performance by
+ // avoiding 'modulo' operations inside the internal loop.
+ // Looks like they have negative impact on the performance
+ for (auto i = 0; i < 2; ++i) {
+ for(; length > *allocated && pos < pos_end; ++pos) {
+ slot_t& slot_val = l2[pos];
+ size_t free_pos = 0;
+ bool all_set = false;
+ if (slot_val == all_slot_clear) {
+ l2_pos += d;
+ last_pos = l2_pos;
+ continue;
+ } else if (slot_val == all_slot_set) {
+ free_pos = 0;
+ all_set = true;
+ } else {
+ free_pos = find_next_set_bit(slot_val, 0);
+ ceph_assert(free_pos < bits_per_slot);
+ }
+ do {
+ ceph_assert(length > *allocated);
+ bool empty = l1._allocate_l1(length,
+ min_length,
+ max_length,
+ (l2_pos + free_pos) * l1_w,
+ (l2_pos + free_pos + 1) * l1_w,
+ allocated,
+ res);
+ if (empty) {
+ slot_val &= ~(slot_t(1) << free_pos);
+ }
+ if (length <= *allocated || slot_val == all_slot_clear) {
+ break;
+ }
+ ++free_pos;
+ if (!all_set) {
+ free_pos = find_next_set_bit(slot_val, free_pos);
+ }
+ } while (free_pos < bits_per_slot);
+ last_pos = l2_pos;
+ l2_pos += d;
+ }
+ l2_pos = 0;
+ pos = 0;
+ pos_end = last_pos0 / d;
+ }
+
+ ++l2_allocs;
+ auto allocated_here = *allocated - prev_allocated;
+ ceph_assert(available >= allocated_here);
+ available -= allocated_here;
+ }
+
+#ifndef NON_CEPH_BUILD
+ // to provide compatibility with BlueStore's allocator interface
+ void _free_l2(const interval_set<uint64_t> & rr)
+ {
+ uint64_t released = 0;
+ std::lock_guard l(lock);
+ for (auto r : rr) {
+ released += l1._free_l1(r.first, r.second);
+ uint64_t l2_pos = r.first / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(r.first + r.second), int64_t(l2_granularity)) / l2_granularity;
+
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ available += released;
+ }
+#endif
+
+ template <typename T>
+ void _free_l2(const T& rr)
+ {
+ uint64_t released = 0;
+ std::lock_guard l(lock);
+ for (auto r : rr) {
+ released += l1._free_l1(r.offset, r.length);
+ uint64_t l2_pos = r.offset / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(r.offset + r.length), int64_t(l2_granularity)) / l2_granularity;
+
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ available += released;
+ }
+
+ void _mark_allocated(uint64_t o, uint64_t len)
+ {
+ uint64_t l2_pos = o / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+ std::lock_guard l(lock);
+ auto allocated = l1._mark_alloc_l1(o, len);
+ ceph_assert(available >= allocated);
+ available -= allocated;
+ _mark_l2_on_l1(l2_pos, l2_pos_end);
+ }
+
+ void _mark_free(uint64_t o, uint64_t len)
+ {
+ uint64_t l2_pos = o / l2_granularity;
+ uint64_t l2_pos_end = p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+ std::lock_guard l(lock);
+ available += l1._free_l1(o, len);
+ _mark_l2_free(l2_pos, l2_pos_end);
+ }
+ void _shutdown()
+ {
+ last_pos = 0;
+ }
+};
+
+#endif
diff --git a/src/os/bluestore/zoned_types.cc b/src/os/bluestore/zoned_types.cc
new file mode 100644
index 000000000..f33bd89e3
--- /dev/null
+++ b/src/os/bluestore/zoned_types.cc
@@ -0,0 +1,24 @@
+#include "zoned_types.h"
+
+using ceph::decode;
+using ceph::encode;
+
+std::ostream& operator<<(std::ostream& out,
+ const zone_state_t& zone_state) {
+ return out << " zone: 0x" << std::hex
+ << " dead bytes: 0x" << zone_state.get_num_dead_bytes()
+ << " write pointer: 0x" << zone_state.get_write_pointer()
+ << " " << std::dec;
+}
+
+void zone_state_t::encode(ceph::buffer::list &bl) const {
+ uint64_t v = static_cast<uint64_t>(num_dead_bytes) << 32 | write_pointer;
+ ::encode(v, bl);
+}
+
+void zone_state_t::decode(ceph::buffer::list::const_iterator &p) {
+ uint64_t v;
+ ::decode(v, p);
+ num_dead_bytes = v >> 32;
+ write_pointer = v; // discard left-most 32 bits
+}
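zone_state_t packs num_dead_bytes into the upper 32 bits and write_pointer into the lower 32 bits of a single value, which is what lets the store lean on RocksDB's Int64ArrayMergeOperator. A standalone round-trip check of that packing, independent of the Ceph buffer encode/decode helpers:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t num_dead_bytes = 0x1234, write_pointer = 0xABCD;
      uint64_t v = (uint64_t(num_dead_bytes) << 32) | write_pointer;  // encode()
      uint32_t dead = uint32_t(v >> 32);                              // decode: high half
      uint32_t wp = uint32_t(v);                                      // decode: low half (high bits discarded)
      assert(dead == num_dead_bytes && wp == write_pointer);
      return 0;
    }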
diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h
new file mode 100644
index 000000000..6ff5d5f31
--- /dev/null
+++ b/src/os/bluestore/zoned_types.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_ZONED_TYPES_H
+#define CEPH_OS_BLUESTORE_ZONED_TYPES_H
+
+#include "include/types.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+
+// Tracks two pieces of information about the state of a zone: (1) the number
+// of dead bytes in the zone and (2) the write pointer. We assume for now that
+// 32 bits is enough for the zone capacity and represent these as uint32_t, and
+// we store them as a single 64-bit value in RocksDB so that we can use the
+// existing Int64ArrayMergeOperator for merge and avoid the cost of point
+// queries.
+//
+// We use the same struct for an on-disk and in-memory representation of the
+// state.
+struct zone_state_t {
+ uint32_t num_dead_bytes = 0;
+ uint32_t write_pointer = 0;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &p);
+
+ uint64_t get_num_dead_bytes() const {
+ return num_dead_bytes;
+ }
+
+ uint64_t get_write_pointer() const {
+ return write_pointer;
+ }
+
+ void increment_num_dead_bytes(uint64_t num_bytes) {
+ num_dead_bytes += num_bytes;
+ }
+
+ void increment_write_pointer(uint64_t num_bytes) {
+ write_pointer += num_bytes;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const zone_state_t& zone_state);
+
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.cc b/src/os/filestore/BtrfsFileStoreBackend.cc
new file mode 100644
index 000000000..27161c22a
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.cc
@@ -0,0 +1,590 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#ifndef __CYGWIN__
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "BtrfsFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+
+#if defined(__linux__)
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
+
+using std::cerr;
+using std::list;
+using std::string;
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
+
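The alignment macros above use plain modulo arithmetic, so unlike the power-of-two helpers elsewhere in this tree they work for any non-zero `by`. A quick standalone check (macros repeated here only to keep the sketch self-contained):

    #include <cassert>

    #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
    #define ALIGNED(x, by) (!((x) % (by)))
    #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))

    int main() {
      assert(ALIGN_DOWN(4099, 4096) == 4096);
      assert(ALIGN_UP(4099, 4096) == 8192);
      assert(ALIGN_UP(4096, 4096) == 4096);   // already aligned: unchanged
      assert(ALIGNED(8192, 4096) && !ALIGNED(4099, 4096));
      return 0;
    }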
+BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), has_clone_range(false),
+ has_snap_create(false), has_snap_destroy(false),
+ has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
+ m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
+ m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { }
+
+int BtrfsFileStoreBackend::detect_features()
+{
+ int r;
+
+ r = GenericFileStoreBackend::detect_features();
+ if (r < 0)
+ return r;
+
+ // clone_range?
+ if (m_filestore_btrfs_clone_range) {
+ int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY|O_CLOEXEC, 0600);
+ if (fd >= 0) {
+ if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ btrfs_ioctl_clone_range_args clone_args;
+ memset(&clone_args, 0, sizeof(clone_args));
+ clone_args.src_fd = -1;
+ r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
+ if (r < 0 && errno == EBADF) {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
+ has_clone_range = true;
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(fd));
+ } else {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
+ << cpp_strerror(r) << dendl;
+ }
+ } else {
+ dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
+ }
+
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+
+ // create test source volume
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
+ if (r != 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+ int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY|O_CLOEXEC);
+ if (srcfd < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ // snap_create and snap_destroy?
+ vol_args.fd = srcfd;
+ strcpy(vol_args.name, "sync_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ int err = errno;
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
+ has_snap_create = true;
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r == 0) {
+ dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
+ has_snap_destroy = true;
+ } else {
+ err = -errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+
+ if (err == -EPERM && getuid() != 0) {
+ dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
+ cerr << TEXT_YELLOW
+ << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
+ << TEXT_NORMAL << std::endl;
+ } else if (err == -EOPNOTSUPP) {
+ derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
+ }
+ }
+ } else {
+ dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap) {
+ if (has_snap_destroy)
+ stable_commits = true;
+ else
+ dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
+ }
+
+ // start_sync?
+ __u64 transid = 0;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
+ }
+ if (r == 0 && transid > 0) {
+ dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
+
+ // do we have wait_sync too?
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (r == 0 || errno == ERANGE) {
+ dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
+ has_wait_sync = true;
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+
+ if (has_wait_sync) {
+ // async snap creation?
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = srcfd;
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+ strcpy(async_args.name, "async_snap_test");
+
+ // remove old one, first
+ struct stat st;
+ strcpy(vol_args.name, async_args.name);
+ if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
+ dout(0) << "detect_feature: removing old async_snap_test" << dendl;
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r == 0 || errno == EEXIST) {
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
+ has_snap_create_v2 = true;
+
+ // clean up
+ strcpy(vol_args.name, "async_snap_test");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r != 0) {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
+ }
+ } else {
+ int err = errno;
+ dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
+ }
+ }
+
+ // clean up test subvol
+ if (srcfd >= 0)
+ TEMP_FAILURE_RETRY(::close(srcfd));
+
+ strcpy(vol_args.name, "test_subvol");
+ r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
+ }
+
+ if (m_filestore_btrfs_snap && !has_snap_create_v2) {
+ dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
+ << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
+ << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
+ << " performance.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+bool BtrfsFileStoreBackend::can_checkpoint()
+{
+ return stable_commits;
+}
+
+int BtrfsFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -EINVAL;
+ }
+
+ struct stat basest;
+ struct statfs currentfs;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::statfs(get_current_path().c_str(), &currentfs);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
+ dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
+ stable_commits = true;
+ }
+ return 0;
+ }
+
+ struct btrfs_ioctl_vol_args volargs;
+ memset(&volargs, 0, sizeof(volargs));
+
+ volargs.fd = 0;
+ strcpy(volargs.name, "current");
+ if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
+ if (::chmod(get_current_path().c_str(), 0755) < 0) {
+ ret = -errno;
+ dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ stable_commits = true;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ int ret, err = 0;
+
+ struct stat basest;
+ ret = ::fstat(get_basedir_fd(), &basest);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // get snap list
+ DIR *dir = ::opendir(get_basedir_path().c_str());
+ if (!dir) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ list<string> snaps;
+ char path[PATH_MAX];
+ struct dirent *de;
+ while (true) {
+ errno = 0;
+ de = ::readdir(dir);
+ if (de == nullptr) {
+ if (errno != 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: readdir '" << get_basedir_path() << "' failed: "
+ << cpp_strerror(err) << dendl;
+ }
+ break;
+ }
+ snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
+
+ struct stat st;
+ ret = ::stat(path, &st);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: stat '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (!S_ISDIR(st.st_mode))
+ continue;
+
+ struct statfs fs;
+ ret = ::statfs(path, &fs);
+ if (ret < 0) {
+ err = -errno;
+ dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
+ << cpp_strerror(err) << dendl;
+ break;
+ }
+
+ if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
+ snaps.push_back(string(de->d_name));
+ }
+
+ if (::closedir(dir) < 0) {
+ ret = -errno;
+ dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
+ if (!err)
+ err = ret;
+ }
+
+ if (err)
+ return err;
+
+ ls.swap(snaps);
+ return 0;
+}
+
+int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (has_snap_create_v2 && transid) {
+ struct btrfs_ioctl_vol_args_v2 async_args;
+ memset(&async_args, 0, sizeof(async_args));
+ async_args.fd = get_current_fd();
+ async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
+
+ size_t name_size = sizeof(async_args.name);
+ strncpy(async_args.name, name.c_str(), name_size);
+ async_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
+ *transid = async_args.transid;
+ } else {
+ struct btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = get_current_fd();
+
+ size_t name_size = sizeof(vol_args.name);
+ strncpy(vol_args.name, name.c_str(), name_size);
+ vol_args.name[name_size-1] = '\0';
+
+ int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (r < 0) {
+ r = -errno;
+ dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (transid)
+ *transid = 0;
+ }
+ return 0;
+}
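+
+// Typical flow (illustrative): create_checkpoint("snap_123", &transid) starts
+// an asynchronous snapshot via SNAP_CREATE_V2 and returns its btrfs
+// transaction id; the caller later invokes sync_checkpoint(transid), which
+// blocks in BTRFS_IOC_WAIT_SYNC until that transaction has committed.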
+
+int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
+{
+ // wait for commit
+ dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
+ return 0;
+}
+
+int BtrfsFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: to '" << name << "'" << dendl;
+ char s[PATH_MAX];
+ btrfs_ioctl_vol_args vol_args;
+
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strcpy(vol_args.name, "current");
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret && errno != ENOENT) {
+ dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(ret) << dendl;
+ snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
+ if (::rename(get_current_path().c_str(), s)) {
+ ret = -errno;
+ dout(0) << "rollback_to: error renaming old current subvol: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
+
+ // roll back
+ vol_args.fd = ::open(s, O_RDONLY|O_CLOEXEC);
+ if (vol_args.fd < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
+ }
+ TEMP_FAILURE_RETRY(::close(vol_args.fd));
+ return ret;
+}
+
+int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ btrfs_ioctl_vol_args vol_args;
+ memset(&vol_args, 0, sizeof(vol_args));
+ vol_args.fd = 0;
+ strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name) - 1);
+ vol_args.name[sizeof(vol_args.name) - 1] = '\0';
+
+ int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
+ if (ret) {
+ ret = -errno;
+ dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+int BtrfsFileStoreBackend::syncfs()
+{
+ dout(15) << "syncfs" << dendl;
+ // do a full btrfs commit
+ int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
+
+int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
+ size_t blk_size = get_blksize();
+ if (!has_clone_range ||
+ srcoff % blk_size != dstoff % blk_size) {
+ dout(20) << "clone_range: using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
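+ // Worked example (illustrative, blk_size = 4096): for srcoff = 100 and
+ // len = 10000, srcoffclone is 4096 and lenclone is 10000 - 3996 = 6004,
+ // which gets rounded down to 4096 unless the range extends to end-of-file.
+ // The aligned middle [4096, 8192) is cloned below, while the unaligned head
+ // [100, 4096) and tail [8192, 10100) fall back to _copy_range().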
+ int err = 0;
+ int r = 0;
+
+ uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
+ uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
+ if (srcoffclone >= srcoff + len) {
+ dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ uint64_t lenclone = len - (srcoffclone - srcoff);
+ if (!ALIGNED(lenclone, blk_size)) {
+ struct stat from_stat, to_stat;
+ err = ::fstat(from, &from_stat);
+ if (err) return -errno;
+ err = ::fstat(to, &to_stat);
+ if (err) return -errno;
+
+ if (srcoff + len != (uint64_t)from_stat.st_size ||
+ dstoff + len < (uint64_t)to_stat.st_size) {
+ // Not to the end of the file, need to align length as well
+ lenclone = ALIGN_DOWN(lenclone, blk_size);
+ }
+ }
+ if (lenclone == 0) {
+ // too short
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+
+ dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
+ << " to " << dstoffclone << " = " << r << dendl;
+ btrfs_ioctl_clone_range_args a;
+ a.src_fd = from;
+ a.src_offset = srcoffclone;
+ a.src_length = lenclone;
+ a.dest_offset = dstoffclone;
+ err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
+ if (err >= 0) {
+ r += err;
+ } else if (errno == EINVAL) {
+ // Still failed, might be compressed
+ dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
+ return _copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return -errno;
+ }
+
+ // Take care of any bytes trimmed from the front
+ if (srcoffclone != srcoff) {
+ err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+
+ // Copy end
+ if (srcoffclone + lenclone != srcoff + len) {
+ err = _copy_range(from, to,
+ srcoffclone + lenclone,
+ (srcoff + len) - (srcoffclone + lenclone),
+ dstoffclone + lenclone);
+ if (err >= 0) {
+ r += err;
+ } else {
+ return err;
+ }
+ }
+ dout(20) << "clone_range: finished " << srcoff << "~" << len
+ << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+#endif
diff --git a/src/os/filestore/BtrfsFileStoreBackend.h b/src/os/filestore/BtrfsFileStoreBackend.h
new file mode 100644
index 000000000..eafa88f41
--- /dev/null
+++ b/src/os/filestore/BtrfsFileStoreBackend.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BTRFSFILESTOREBACKEND_H
+#define CEPH_BTRFSFILESTOREBACKEND_H
+
+#if defined(__linux__)
+#include "GenericFileStoreBackend.h"
+
+class BtrfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool has_clone_range; ///< clone range ioctl is supported
+ bool has_snap_create; ///< snap create ioctl is supported
+ bool has_snap_destroy; ///< snap destroy ioctl is supported
+ bool has_snap_create_v2; ///< snap create v2 ioctl (async!) is supported
+ bool has_wait_sync; ///< wait sync ioctl is supported
+ bool stable_commits;
+ bool m_filestore_btrfs_clone_range;
+ bool m_filestore_btrfs_snap;
+public:
+ explicit BtrfsFileStoreBackend(FileStore *fs);
+ ~BtrfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "btrfs";
+ }
+ int detect_features() override;
+ bool can_checkpoint() override;
+ int create_current() override;
+ int list_checkpoints(std::list<std::string>& ls) override;
+ int create_checkpoint(const std::string& name, uint64_t *cid) override;
+ int sync_checkpoint(uint64_t cid) override;
+ int rollback_to(const std::string& name) override;
+ int destroy_checkpoint(const std::string& name) override;
+ int syncfs() override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override;
+};
+#endif
+#endif
diff --git a/src/os/filestore/CollectionIndex.h b/src/os/filestore/CollectionIndex.h
new file mode 100644
index 000000000..ff3d706cf
--- /dev/null
+++ b/src/os/filestore/CollectionIndex.h
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef OS_COLLECTIONINDEX_H
+#define OS_COLLECTIONINDEX_H
+
+#include <string>
+#include <vector>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/RWLock.h"
+
+/**
+ CollectionIndex provides an interface for manipulating indexed collections
+ */
+class CollectionIndex {
+public:
+ CephContext* cct;
+protected:
+ /**
+ * Object encapsulating a returned path.
+ *
+ * A path to an object (existent or non-existent) becomes invalid
+ * when a different object is created in the index. Path stores
+ * a pointer to its parent CollectionIndex (see get_index()) along
+ * with the parent's coll_t.
+ * @see IndexManager
+ * @see self_ref
+ * @see set_ref
+ */
+ class Path {
+ public:
+ /// Returned path
+ std::string full_path;
+ /// Ref to parent Index
+ CollectionIndex* parent_ref;
+ /// coll_t for parent Index
+ coll_t parent_coll;
+
+ /// Normal Constructor
+ Path(
+ std::string path, ///< [in] Path to return.
+ CollectionIndex* ref)
+ : full_path(path), parent_ref(ref), parent_coll(parent_ref->coll()) {}
+
+ /// Debugging Constructor
+ Path(
+ std::string path, ///< [in] Path to return.
+ const coll_t& coll) ///< [in] collection
+ : full_path(path), parent_coll(coll) {}
+
+ /// Getter for the stored path.
+ const char *path() const { return full_path.c_str(); }
+
+ /// Getter for collection
+ const coll_t& coll() const { return parent_coll; }
+
+ /// Getter for parent
+ CollectionIndex* get_index() const {
+ return parent_ref;
+ }
+ };
+ public:
+
+ ceph::shared_mutex access_lock =
+ ceph::make_shared_mutex("CollectionIndex::access_lock", true, false);
+ /// Type of returned paths
+ typedef std::shared_ptr<Path> IndexedPath;
+
+ static IndexedPath get_testing_path(std::string path, coll_t collection) {
+ return std::make_shared<Path>(path, collection);
+ }
+
+ static const uint32_t FLAT_INDEX_TAG = 0;
+ static const uint32_t HASH_INDEX_TAG = 1;
+ static const uint32_t HASH_INDEX_TAG_2 = 2;
+ static const uint32_t HOBJECT_WITH_POOL = 3;
+ /**
+ * For tracking Filestore collection versions.
+ *
+ * @return Collection version represented by the Index implementation
+ */
+ virtual uint32_t collection_version() = 0;
+
+ /**
+ * Returns the collection managed by this CollectionIndex
+ */
+ virtual coll_t coll() const = 0;
+
+
+ /**
+ * Initializes the index.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int init() = 0;
+
+ /**
+ * Cleanup before replaying journal
+ *
+ * Index implementations may need to perform compound operations
+ * which may leave the collection unstable if interrupted. cleanup
+ * is called on mount to allow the CollectionIndex implementation
+ * to stabilize.
+ *
+ * @see HashIndex
+ * @return Error Code, 0 for success
+ */
+ virtual int cleanup() = 0;
+
+ /**
+ * Call when a file is created using a path returned from lookup.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int created(
+ const ghobject_t &oid, ///< [in] Created object.
+ const char *path ///< [in] Path to created object.
+ ) = 0;
+
+ /**
+ * Removes oid from the collection
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int unlink(
+ const ghobject_t &oid ///< [in] Object to remove
+ ) = 0;
+
+ /**
+ * Gets the IndexedPath for oid.
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int lookup(
+ const ghobject_t &oid, ///< [in] Object to lookup
+ IndexedPath *path, ///< [out] Path to object
+ int *hardlink ///< [out] number of hard links of this object. *hardlink=0 means the object does not exist.
+ ) = 0;
+
+ /**
+ * Moves objects matching @e match in the lsb @e bits
+ *
+ * dest and this must be the same subclass
+ *
+ * @return Error Code, 0 for success
+ */
+ virtual int split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ CollectionIndex* dest //< [in] destination index
+ ) { ceph_abort(); return 0; }
+
+ virtual int merge(
+ uint32_t bits, //< [in] common (target) bits
+ CollectionIndex* dest //< [in] destination index
+ ) { ceph_abort(); return 0; }
+
+
+ /// List contents of collection by hash
+ virtual int collection_list_partial(
+ const ghobject_t &start, ///< [in] object at which to start
+ const ghobject_t &end, ///< [in] list only objects < end
+ int max_count, ///< [in] return at most max_count objects
+ std::vector<ghobject_t> *ls, ///< [out] Listed objects
+ ghobject_t *next ///< [out] Next object to list
+ ) = 0;
+
+ /// Call prior to removing directory
+ virtual int prep_delete() { return 0; }
+
+ CollectionIndex(CephContext* cct, const coll_t& collection)
+ : cct(cct) {}
+
+ /*
+ * Pre-hash the collection; this collection should map to a PG folder.
+ *
+ * @param pg_num - pg number of the pool this collection belongs to.
+ * @param expected_num_objs - expected number of objects in this collection.
+ * @return 0 on success, an error code otherwise.
+ */
+ virtual int pre_hash_collection(
+ uint32_t pg_num, ///< [in] pg number of the pool this collection belongs to
+ uint64_t expected_num_objs ///< [in] expected number of objects this collection has
+ ) { ceph_abort(); return 0; }
+
+ virtual int apply_layout_settings(int target_level) { ceph_abort(); return 0; }
+
+ /// Read index-wide settings (should be called after construction)
+ virtual int read_settings() { return 0; }
+
+ /// Virtual destructor
+ virtual ~CollectionIndex() {}
+};
+
+#endif
diff --git a/src/os/filestore/DBObjectMap.cc b/src/os/filestore/DBObjectMap.cc
new file mode 100644
index 000000000..7da9a67be
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.cc
@@ -0,0 +1,1424 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+
+const string DBObjectMap::USER_PREFIX = "_USER_";
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";
+const string DBObjectMap::SYS_PREFIX = "_SYS_";
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";
+const string DBObjectMap::HEADER_KEY = "HEADER";
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";
+
+// Legacy
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+static void append_escaped(const string &in, string *out)
+{
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if (*i == '%') {
+ out->push_back('%');
+ out->push_back('p');
+ } else if (*i == '.') {
+ out->push_back('%');
+ out->push_back('e');
+ } else if (*i == '_') {
+ out->push_back('%');
+ out->push_back('u');
+ } else {
+ out->push_back(*i);
+ }
+ }
+}
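+// For example, append_escaped("rbd_data.1_abc", &out) appends
+// "rbd%udata%e1%uabc": '%' -> "%p", '.' -> "%e" and '_' -> "%u", so '.' can
+// safely be used as a field separator in ghobject_key() below.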
+
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
+{
+ int errors = 0, comp_errors = 0;
+ bool repaired = false;
+ map<uint64_t, uint64_t> parent_to_num_children;
+ map<uint64_t, uint64_t> parent_to_actual_num_children;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ _Header header;
+ bufferlist bl = iter->value();
+ while (true) {
+ auto bliter = bl.cbegin();
+ header.decode(bliter);
+ if (header.seq != 0)
+ parent_to_actual_num_children[header.seq] = header.num_children;
+
+ if (state.v == 2 || force) {
+ // Check complete table
+ bool complete_error = false;
+ boost::optional<string> prev;
+ KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ if (prev && prev >= complete_iter->key()) {
+ out << "Bad complete for " << header.oid << std::endl;
+ complete_error = true;
+ break;
+ }
+ prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);
+ }
+ if (complete_error) {
+ out << "Complete mapping for " << header.seq << " :" << std::endl;
+ for (complete_iter->seek_to_first(); complete_iter->valid();
+ complete_iter->next()) {
+ out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+ }
+ if (repair) {
+ repaired = true;
+ KeyValueDB::Transaction t = db->get_transaction();
+ t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+ db->submit_transaction(t);
+ out << "Cleared complete mapping to repair" << std::endl;
+ } else {
+ errors++; // Only count when not repaired
+ comp_errors++; // Track errors here for version update
+ }
+ }
+ }
+
+ if (header.parent == 0)
+ break;
+
+ if (!parent_to_num_children.count(header.parent))
+ parent_to_num_children[header.parent] = 0;
+ parent_to_num_children[header.parent]++;
+ if (parent_to_actual_num_children.count(header.parent))
+ break;
+
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ out << "Missing: seq " << header.parent << std::endl;
+ errors++;
+ break;
+ } else {
+ bl = got.begin()->second;
+ }
+ }
+ }
+
+ for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+ i != parent_to_num_children.end();
+ parent_to_num_children.erase(i++)) {
+ if (!parent_to_actual_num_children.count(i->first))
+ continue;
+ if (parent_to_actual_num_children[i->first] != i->second) {
+ out << "Invalid: seq " << i->first << " recorded children: "
+ << parent_to_actual_num_children[i->first] << " found: "
+ << i->second << std::endl;
+ errors++;
+ }
+ parent_to_actual_num_children.erase(i->first);
+ }
+
+ // Only advance the version from 2 to 3 here
+ // Mark as legacy because there are still older structures
+ // we don't update. The value of legacy is only used
+ // for internal assertions.
+ if (comp_errors == 0 && state.v == 2 && repair) {
+ state.v = 3;
+ state.legacy = true;
+ set_state();
+ }
+
+ if (errors == 0 && repaired)
+ return -1;
+ return errors;
+}
+
+string DBObjectMap::ghobject_key(const ghobject_t &oid)
+{
+ string out;
+ append_escaped(oid.hobj.oid.name, &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.get_key(), &out);
+ out.push_back('.');
+ append_escaped(oid.hobj.nspace, &out);
+ out.push_back('.');
+
+ char snap_with_hash[1000];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, ".none");
+ else
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.hobj.pool);
+ t += snprintf(t, end - t, ".%.*X", (int)(sizeof(uint32_t)*2), oid.hobj.get_hash());
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ t += snprintf(t, end - t, ".%llx", (long long unsigned)oid.generation);
+ t += snprintf(t, end - t, ".%x", (int)oid.shard_id);
+ }
+ out += string(snap_with_hash);
+ return out;
+}
+
+// ok: pglog%u3%efs1...0.none.0017B237
+// bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// returns 0 for false, 1 for true, negative for error
+int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct,
+ const string &in)
+{
+ int dots = 5; // skip 5 .'s
+ const char *s = in.c_str();
+ do {
+ while (*s && *s != '.')
+ ++s;
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ ++s;
+ } while (*s && --dots);
+ if (!*s) {
+ derr << "unexpected null at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ // we are now either at a hash value (32 bits, 8 chars) or a generation
+ // value (64 bits) '.' and shard id. count the dots!
+ int len = 0;
+ while (*s && *s != '.') {
+ ++s;
+ ++len;
+ }
+ if (*s == '\0') {
+ if (len != 8) {
+ derr << "hash value is not 8 chars" << dendl;
+ return -EINVAL; // the hash value is always 8 chars.
+ }
+ return 0;
+ }
+ if (*s != '.') { // the shard follows.
+ derr << "missing final . and shard id at " << (int)(s-in.c_str()) << dendl;
+ return -EINVAL;
+ }
+ return 1;
+}
+
+
+string DBObjectMap::map_header_key(const ghobject_t &oid)
+{
+ return ghobject_key(oid);
+}
+
+string DBObjectMap::header_key(uint64_t seq)
+{
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq);
+ return string(buf);
+}
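+// For example, header_key(123) returns "0000000000000123": sequence numbers
+// are zero-padded to 16 decimal digits so that lexicographic ordering of the
+// keys matches their numeric ordering.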
+
+string DBObjectMap::complete_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX;
+}
+
+string DBObjectMap::user_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + USER_PREFIX;
+}
+
+string DBObjectMap::sys_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + SYS_PREFIX;
+}
+
+string DBObjectMap::xattr_prefix(Header header)
+{
+ return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX;
+}
+
+string DBObjectMap::sys_parent_prefix(_Header header)
+{
+ return USER_PREFIX + header_key(header.parent) + SYS_PREFIX;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::init()
+{
+ invalid = false;
+ if (ready) {
+ return 0;
+ }
+ ceph_assert(!parent_iter);
+ if (header->parent) {
+ Header parent = map->lookup_parent(header);
+ if (!parent) {
+ ceph_abort();
+ return -EINVAL;
+ }
+ parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent);
+ }
+ key_iter = map->db->get_iterator(map->user_prefix(header));
+ ceph_assert(key_iter);
+ complete_iter = map->db->get_iterator(map->complete_prefix(header));
+ ceph_assert(complete_iter);
+ cur_iter = key_iter;
+ ceph_assert(cur_iter);
+ ready = true;
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(
+ const ghobject_t &oid)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return ObjectMapIterator(new EmptyIteratorImpl());
+ DBObjectMapIterator iter = _get_iterator(header);
+ iter->hlock.swap(hl);
+ return iter;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_first();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (parent_iter->valid())
+ r = parent_iter->next();
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->seek_to_last();
+ if (r < 0)
+ return r;
+ if (key_iter->valid())
+ r = key_iter->next();
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->lower_bound(to);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+ int r = lower_bound(to);
+ if (r < 0)
+ return r;
+ if (valid() && !on_parent())
+ return next_parent();
+ else
+ return r;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+ init();
+ r = 0;
+ if (parent_iter) {
+ r = parent_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ }
+ r = key_iter->upper_bound(after);
+ if (r < 0)
+ return r;
+ return adjust();
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid()
+{
+ bool valid = !invalid && ready;
+ ceph_assert(!valid || cur_iter->valid());
+ return valid;
+}
+
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
+{
+ if (parent_iter && parent_iter->valid() &&
+ (!key_iter->valid() || key_iter->key() > parent_iter->key()))
+ return true;
+ return false;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next()
+{
+ ceph_assert(cur_iter->valid());
+ ceph_assert(valid());
+ cur_iter->next();
+ return adjust();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent()
+{
+ r = next();
+ if (r < 0)
+ return r;
+ while (parent_iter && parent_iter->valid() && !on_parent()) {
+ ceph_assert(valid());
+ r = lower_bound(parent_iter->key());
+ if (r < 0)
+ return r;
+ }
+
+ if (!parent_iter || !parent_iter->valid()) {
+ invalid = true;
+ }
+ return 0;
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+ string *begin,
+ string *end)
+{
+ /* This is clumsy because one cannot call prev() on end(), nor can one
+ * test for == begin().
+ */
+ complete_iter->upper_bound(to_test);
+ if (complete_iter->valid()) {
+ complete_iter->prev();
+ if (!complete_iter->valid()) {
+ complete_iter->upper_bound(to_test);
+ return false;
+ }
+ } else {
+ complete_iter->seek_to_last();
+ if (!complete_iter->valid())
+ return false;
+ }
+
+ ceph_assert(complete_iter->key() <= to_test);
+ ceph_assert(complete_iter->value().length() >= 1);
+ string _end(complete_iter->value().c_str(),
+ complete_iter->value().length() - 1);
+ if (_end.empty() || _end > to_test) {
+ if (begin)
+ *begin = complete_iter->key();
+ if (end)
+ *end = _end;
+ return true;
+ } else {
+ complete_iter->next();
+ ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+ return false;
+ }
+}
+
+/**
+ * Moves parent_iter to the next position both out of the complete_region and
+ * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and
+ * less than key_iter and key_iter otherwise.
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust()
+{
+ string begin, end;
+ while (parent_iter && parent_iter->valid()) {
+ if (in_complete_region(parent_iter->key(), &begin, &end)) {
+ if (end.size() == 0) {
+ parent_iter->seek_to_last();
+ if (parent_iter->valid())
+ parent_iter->next();
+ } else
+ parent_iter->lower_bound(end);
+ } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) {
+ parent_iter->next();
+ } else {
+ break;
+ }
+ }
+ if (valid_parent()) {
+ cur_iter = parent_iter;
+ } else if (key_iter->valid()) {
+ cur_iter = key_iter;
+ } else {
+ invalid = true;
+ }
+ ceph_assert(invalid || cur_iter->valid());
+ return 0;
+}
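+// For example, with child keys {"b"}, parent keys {"b", "d", "g"} and a
+// complete entry "c"->"f": the duplicate parent "b" is skipped in favour of
+// the child's value, "d" is hidden because it lies in the complete range
+// [c,f), and "g" is merged into the iteration after "b".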
+
+
+string DBObjectMap::DBObjectMapIteratorImpl::key()
+{
+ return cur_iter->key();
+}
+
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value()
+{
+ return cur_iter->value();
+}
+
+int DBObjectMap::DBObjectMapIteratorImpl::status()
+{
+ return r;
+}
+
+int DBObjectMap::set_keys(const ghobject_t &oid,
+ const map<string, bufferlist> &set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ t->set(user_prefix(header), set);
+
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::set_header(const ghobject_t &oid,
+ const bufferlist &bl,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ _set_header(header, bl, t);
+ return db->submit_transaction(t);
+}
+
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+ KeyValueDB::Transaction t)
+{
+ map<string, bufferlist> to_set;
+ to_set[USER_HEADER_KEY] = bl;
+ t->set(sys_prefix(header), to_set);
+}
+
+int DBObjectMap::get_header(const ghobject_t &oid,
+ bufferlist *bl)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header) {
+ return 0;
+ }
+ return _get_header(header, bl);
+}
+
+int DBObjectMap::_get_header(Header header,
+ bufferlist *bl)
+{
+ map<string, bufferlist> out;
+ while (true) {
+ out.clear();
+ set<string> to_get;
+ to_get.insert(USER_HEADER_KEY);
+ int r = db->get(sys_prefix(header), to_get, &out);
+ if (r == 0 && !out.empty())
+ break;
+ if (r < 0)
+ return r;
+ Header current(header);
+ if (!current->parent)
+ break;
+ header = lookup_parent(current);
+ }
+
+ if (!out.empty())
+ bl->swap(out.begin()->second);
+ return 0;
+}
+
+int DBObjectMap::clear(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ remove_map_header(hl, oid, header, t);
+ ceph_assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::_clear(Header header,
+ KeyValueDB::Transaction t)
+{
+ while (1) {
+ if (header->num_children) {
+ set_header(header, t);
+ break;
+ }
+ clear_header(header, t);
+ if (!header->parent)
+ break;
+ Header parent = lookup_parent(header);
+ if (!parent) {
+ return -EINVAL;
+ }
+ ceph_assert(parent->num_children > 0);
+ parent->num_children--;
+ header.swap(parent);
+ }
+ return 0;
+}
+
+int DBObjectMap::copy_up_header(Header header,
+ KeyValueDB::Transaction t)
+{
+ bufferlist bl;
+ int r = _get_header(header, &bl);
+ if (r < 0)
+ return r;
+
+ _set_header(header, bl, t);
+ return 0;
+}
+
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+ const set<string> &to_clear,
+ const SequencerPosition *spos)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(user_prefix(header), to_clear);
+ if (!header->parent) {
+ return db->submit_transaction(t);
+ }
+
+ ceph_assert(state.legacy);
+
+ {
+ // We only get here for legacy (v2) stores
+ // Copy up all keys from parent excluding to_clear
+ // and remove parent
+ // This eliminates a v2 format use of complete for this oid only
+ map<string, bufferlist> to_write;
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+ if (iter->status())
+ return iter->status();
+ if (!to_clear.count(iter->key()))
+ to_write[iter->key()] = iter->value();
+ }
+ t->set(user_prefix(header), to_write);
+ } // destruct iter which has parent in_use
+
+ copy_up_header(header, t);
+ Header parent = lookup_parent(header);
+ if (!parent)
+ return -EINVAL;
+ parent->num_children--;
+ _clear(parent, t);
+ header->parent = 0;
+ set_map_header(hl, oid, *header, t);
+ t->rmkeys_by_prefix(complete_prefix(header));
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+
+ // save old attrs
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ map<string, bufferlist> attrs;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ attrs.insert(make_pair(iter->key(), iter->value()));
+ if (iter->status())
+ return iter->status();
+
+ // remove current header
+ remove_map_header(hl, oid, header, t);
+ ceph_assert(header->num_children > 0);
+ header->num_children--;
+ int r = _clear(header, t);
+ if (r < 0)
+ return r;
+
+ // create new header
+ Header newheader = generate_new_header(oid, Header());
+ set_map_header(hl, oid, *newheader, t);
+ if (!attrs.empty())
+ t->set(xattr_prefix(newheader), attrs);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::get(const ghobject_t &oid,
+ bufferlist *_header,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ _get_header(header, _header);
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ out->insert(make_pair(iter->key(), iter->value()));
+ }
+ return 0;
+}
+
+int DBObjectMap::get_keys(const ghobject_t &oid,
+ set<string> *keys)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ ObjectMapIterator iter = _get_iterator(header);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ if (iter->status())
+ return iter->status();
+ keys->insert(iter->key());
+ }
+ return 0;
+}
+
+int DBObjectMap::scan(Header header,
+ const set<string> &in_keys,
+ set<string> *out_keys,
+ map<string, bufferlist> *out_values)
+{
+ ObjectMapIterator db_iter = _get_iterator(header);
+ for (set<string>::const_iterator key_iter = in_keys.begin();
+ key_iter != in_keys.end();
+ ++key_iter) {
+ db_iter->lower_bound(*key_iter);
+ if (db_iter->status())
+ return db_iter->status();
+ if (db_iter->valid() && db_iter->key() == *key_iter) {
+ if (out_keys)
+ out_keys->insert(*key_iter);
+ if (out_values)
+ out_values->insert(make_pair(db_iter->key(), db_iter->value()));
+ }
+ }
+ return 0;
+}
+
+int DBObjectMap::get_values(const ghobject_t &oid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, 0, out);
+}
+
+int DBObjectMap::check_keys(const ghobject_t &oid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return scan(header, keys, out, 0);
+}
+
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+ const set<string> &to_get,
+ map<string, bufferlist> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ return db->get(xattr_prefix(header), to_get, out);
+}
+
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid,
+ set<string> *out)
+{
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ KeyValueDB::Iterator iter = db->get_iterator(xattr_prefix(header));
+ if (!iter)
+ return -EINVAL;
+ for (iter->seek_to_first(); !iter->status() && iter->valid(); iter->next())
+ out->insert(iter->key());
+ return iter->status();
+}
+
+int DBObjectMap::set_xattrs(const ghobject_t &oid,
+ const map<string, bufferlist> &to_set,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_create_map_header(hl, oid, t);
+ if (!header)
+ return -EINVAL;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->set(xattr_prefix(header), to_set);
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::remove_xattrs(const ghobject_t &oid,
+ const set<string> &to_remove,
+ const SequencerPosition *spos)
+{
+ KeyValueDB::Transaction t = db->get_transaction();
+ MapHeaderLock hl(this, oid);
+ Header header = lookup_map_header(hl, oid);
+ if (!header)
+ return -ENOENT;
+ if (check_spos(oid, header, spos))
+ return 0;
+ t->rmkeys(xattr_prefix(header), to_remove);
+ return db->submit_transaction(t);
+}
+
+// ONLY USED FOR TESTING
+// Set version to 2 to avoid asserts
+int DBObjectMap::legacy_clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ state.legacy = true;
+
+ if (oid == target)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(oid, target));
+ MapHeaderLock _l2(this, std::max(oid, target));
+ MapHeaderLock *lsource, *ltarget;
+ if (oid > target) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, target, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header parent = lookup_map_header(*lsource, oid);
+ if (!parent)
+ return db->submit_transaction(t);
+
+ Header source = generate_new_header(oid, parent);
+ Header destination = generate_new_header(target, parent);
+ if (spos)
+ destination->spos = *spos;
+
+ parent->num_children = 2;
+ set_header(parent, t);
+ set_map_header(*lsource, oid, *source, t);
+ set_map_header(*ltarget, target, *destination, t);
+
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(source), to_set);
+ t->set(xattr_prefix(destination), to_set);
+ t->rmkeys_by_prefix(xattr_prefix(parent));
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::clone(const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos)
+{
+ if (oid == target)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(oid, target));
+ MapHeaderLock _l2(this, std::max(oid, target));
+ MapHeaderLock *lsource, *ltarget;
+ if (oid > target) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, target);
+ if (destination) {
+ if (check_spos(target, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, target, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header source = lookup_map_header(*lsource, oid);
+ if (!source)
+ return db->submit_transaction(t);
+
+ Header destination = generate_new_header(target, Header());
+ if (spos)
+ destination->spos = *spos;
+
+ set_map_header(*ltarget, target, *destination, t);
+
+ bufferlist bl;
+ int r = _get_header(source, &bl);
+ if (r < 0)
+ return r;
+ _set_header(destination, bl, t);
+
+ map<string, bufferlist> to_set;
+ KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source));
+ for (xattr_iter->seek_to_first();
+ xattr_iter->valid();
+ xattr_iter->next())
+ to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+ t->set(xattr_prefix(destination), to_set);
+
+ map<string, bufferlist> to_write;
+ ObjectMapIterator iter = _get_iterator(source);
+ for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+ if (iter->status())
+ return iter->status();
+ to_write[iter->key()] = iter->value();
+ }
+ t->set(user_prefix(destination), to_write);
+
+ return db->submit_transaction(t);
+}
+
+int DBObjectMap::upgrade_to_v2()
+{
+ dout(1) << __func__ << " start" << dendl;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ iter->seek_to_first();
+ while (iter->valid()) {
+ unsigned count = 0;
+ KeyValueDB::Transaction t = db->get_transaction();
+ set<string> remove;
+ map<string, bufferlist> add;
+ for (;
+ iter->valid() && count < 300;
+ iter->next()) {
+ dout(20) << __func__ << " key is " << iter->key() << dendl;
+ int r = is_buggy_ghobject_key_v1(cct, iter->key());
+ if (r < 0) {
+ derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+ return r;
+ }
+ if (!r) {
+ dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+ continue;
+ }
+
+ // decode header to get oid
+ _Header hdr;
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ hdr.decode(bliter);
+
+ string newkey(ghobject_key(hdr.oid));
+ dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+ add[newkey] = iter->value();
+ remove.insert(iter->key());
+ ++count;
+ }
+
+ if (!remove.empty()) {
+ dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+ t->rmkeys(HOBJECT_TO_SEQ, remove);
+ t->set(HOBJECT_TO_SEQ, add);
+ int r = db->submit_transaction(t);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ state.v = 2;
+
+ set_state();
+ return 0;
+}
+
+void DBObjectMap::set_state()
+{
+ std::lock_guard l{header_lock};
+ KeyValueDB::Transaction t = db->get_transaction();
+ write_state(t);
+ int ret = db->submit_transaction_sync(t);
+ ceph_assert(ret == 0);
+ dout(1) << __func__ << " done" << dendl;
+ return;
+}
+
+int DBObjectMap::get_state()
+{
+ map<string, bufferlist> result;
+ set<string> to_get;
+ to_get.insert(GLOBAL_STATE_KEY);
+ int r = db->get(SYS_PREFIX, to_get, &result);
+ if (r < 0)
+ return r;
+ if (!result.empty()) {
+ auto bliter = result.begin()->second.cbegin();
+ state.decode(bliter);
+ } else {
+ // New store
+ state.v = State::CUR_VERSION;
+ state.seq = 1;
+ state.legacy = false;
+ }
+ return 0;
+}
+
+int DBObjectMap::init(bool do_upgrade)
+{
+ int ret = get_state();
+ if (ret < 0)
+ return ret;
+ if (state.v < 1) {
+ dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+ << dendl;
+ return -ENOTSUP;
+ }
+ if (state.v < 2) { // Needs upgrade
+ if (!do_upgrade) {
+ dout(1) << "DOBjbectMap requires an upgrade,"
+ << " set filestore_update_to"
+ << dendl;
+ return -ENOTSUP;
+ } else {
+ int r = upgrade_to_v2();
+ if (r < 0)
+ return r;
+ }
+ }
+ ostringstream ss;
+ int errors = check(ss, true);
+ if (errors) {
+ derr << ss.str() << dendl;
+ if (errors > 0)
+ return -EINVAL;
+ }
+ dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+ return 0;
+}
+
+int DBObjectMap::sync(const ghobject_t *oid,
+ const SequencerPosition *spos) {
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (oid) {
+ ceph_assert(spos);
+ MapHeaderLock hl(this, *oid);
+ Header header = lookup_map_header(hl, *oid);
+ if (header) {
+ dout(10) << "oid: " << *oid << " setting spos to "
+ << *spos << dendl;
+ header->spos = *spos;
+ set_map_header(hl, *oid, *header, t);
+ }
+ /* It may appear that this and the identical portion of the else
+ * block can be combined below, but in this block, the transaction
+ * must be submitted under *both* the MapHeaderLock and the full
+ * header_lock.
+ *
+ * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+ */
+ std::lock_guard l{header_lock};
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ } else {
+ std::lock_guard l{header_lock};
+ write_state(t);
+ return db->submit_transaction_sync(t);
+ }
+}
+
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+ ceph_assert(ceph_mutex_is_locked_by_me(header_lock));
+ dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+ KeyValueDB::Transaction t = _t ? _t : db->get_transaction();
+ bufferlist bl;
+ state.encode(bl);
+ map<string, bufferlist> to_write;
+ to_write[GLOBAL_STATE_KEY] = bl;
+ t->set(SYS_PREFIX, to_write);
+ return _t ? 0 : db->submit_transaction(t);
+}
+
+
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid)
+{
+ ceph_assert(l.get_locked() == oid);
+
+ _Header *header = new _Header();
+ {
+ std::lock_guard l{cache_lock};
+ if (caches.lookup(oid, header)) {
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return Header(header, RemoveOnDelete(this));
+ }
+ }
+
+ bufferlist out;
+ int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+ if (r < 0 || out.length()==0) {
+ delete header;
+ return Header();
+ }
+
+ Header ret(header, RemoveOnDelete(this));
+ auto iter = out.cbegin();
+ ret->decode(iter);
+ {
+ std::lock_guard l{cache_lock};
+ caches.add(oid, *ret);
+ }
+
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+ return ret;
+}
+
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+ Header parent)
+{
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ header->seq = state.seq++;
+ if (parent) {
+ header->parent = parent->seq;
+ header->spos = parent->spos;
+ }
+ header->num_children = 1;
+ header->oid = oid;
+ ceph_assert(!in_use.count(header->seq));
+ in_use.insert(header->seq);
+
+ write_state();
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+ std::unique_lock l{header_lock};
+ header_cond.wait(l, [&input, this] { return !in_use.count(input->parent); });
+ map<string, bufferlist> out;
+ set<string> keys;
+ keys.insert(HEADER_KEY);
+
+ dout(20) << "lookup_parent: parent " << input->parent
+ << " for seq " << input->seq << dendl;
+ int r = db->get(sys_parent_prefix(input), keys, &out);
+ if (r < 0) {
+ ceph_abort();
+ return Header();
+ }
+ if (out.empty()) {
+ ceph_abort();
+ return Header();
+ }
+
+ Header header = Header(new _Header(), RemoveOnDelete(this));
+ auto iter = out.begin()->second.cbegin();
+ header->decode(iter);
+ ceph_assert(header->seq == input->parent);
+ dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+ << header->parent << dendl;
+ in_use.insert(header->seq);
+ return header;
+}
+
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+ const MapHeaderLock &hl,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t)
+{
+ std::lock_guard l{header_lock};
+ Header header = _lookup_map_header(hl, oid);
+ if (!header) {
+ header = _generate_new_header(oid, Header());
+ set_map_header(hl, oid, *header, t);
+ }
+ return header;
+}
+
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+ t->rmkeys_by_prefix(user_prefix(header));
+ t->rmkeys_by_prefix(sys_prefix(header));
+ if (state.legacy)
+ t->rmkeys_by_prefix(complete_prefix(header)); // Needed when header.parent != 0
+ t->rmkeys_by_prefix(xattr_prefix(header));
+ set<string> keys;
+ keys.insert(header_key(header->seq));
+ t->rmkeys(USER_PREFIX, keys);
+}
+
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t)
+{
+ dout(20) << "set_header: setting seq " << header->seq << dendl;
+ map<string, bufferlist> to_write;
+ header->encode(to_write[HEADER_KEY]);
+ t->set(sys_prefix(header), to_write);
+}
+
+void DBObjectMap::remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t)
+{
+ ceph_assert(l.get_locked() == oid);
+ dout(20) << "remove_map_header: removing " << header->seq
+ << " oid " << oid << dendl;
+ set<string> to_remove;
+ to_remove.insert(map_header_key(oid));
+ t->rmkeys(HOBJECT_TO_SEQ, to_remove);
+ {
+ std::lock_guard l{cache_lock};
+ caches.clear(oid);
+ }
+}
+
+void DBObjectMap::set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t)
+{
+ ceph_assert(l.get_locked() == oid);
+ dout(20) << "set_map_header: setting " << header.seq
+ << " oid " << oid << " parent seq "
+ << header.parent << dendl;
+ map<string, bufferlist> to_set;
+ header.encode(to_set[map_header_key(oid)]);
+ t->set(HOBJECT_TO_SEQ, to_set);
+ {
+ std::lock_guard l{cache_lock};
+ caches.add(oid, header);
+ }
+}
+
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos)
+{
+ if (!spos || *spos > header->spos) {
+ stringstream out;
+ if (spos)
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << *spos << dendl;
+ else
+ dout(10) << "oid: " << oid << " not skipping op, *spos "
+ << "empty" << dendl;
+ dout(10) << " > header.spos " << header->spos << dendl;
+ return false;
+ } else {
+ dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+ << " <= header.spos " << header->spos << dendl;
+ return true;
+ }
+}
+
+int DBObjectMap::list_objects(vector<ghobject_t> *out)
+{
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ _Header header;
+ header.decode(bliter);
+ out->push_back(header.oid);
+ }
+ return 0;
+}
+
+int DBObjectMap::list_object_headers(vector<_Header> *out)
+{
+ int error = 0;
+ KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+ for (iter->seek_to_first(); iter->valid(); iter->next()) {
+ bufferlist bl = iter->value();
+ auto bliter = bl.cbegin();
+ _Header header;
+ header.decode(bliter);
+ out->push_back(header);
+ while (header.parent) {
+ set<string> to_get;
+ map<string, bufferlist> got;
+ to_get.insert(HEADER_KEY);
+ db->get(sys_parent_prefix(header), to_get, &got);
+ if (got.empty()) {
+ dout(0) << "Missing: seq " << header.parent << dendl;
+ error = -ENOENT;
+ break;
+ } else {
+ bl = got.begin()->second;
+ auto bliter = bl.cbegin();
+ header.decode(bliter);
+ out->push_back(header);
+ }
+ }
+ }
+ return error;
+}
+
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h)
+{
+ out << "seq=" << h.seq << " parent=" << h.parent
+ << " num_children=" << h.num_children
+ << " ghobject=" << h.oid;
+ return out;
+}
+
+int DBObjectMap::rename(const ghobject_t &from,
+ const ghobject_t &to,
+ const SequencerPosition *spos)
+{
+ if (from == to)
+ return 0;
+
+ MapHeaderLock _l1(this, std::min(from, to));
+ MapHeaderLock _l2(this, std::max(from, to));
+ MapHeaderLock *lsource, *ltarget;
+ if (from > to) {
+ lsource = &_l2;
+ ltarget= &_l1;
+ } else {
+ lsource = &_l1;
+ ltarget= &_l2;
+ }
+
+ KeyValueDB::Transaction t = db->get_transaction();
+ {
+ Header destination = lookup_map_header(*ltarget, to);
+ if (destination) {
+ if (check_spos(to, destination, spos))
+ return 0;
+ destination->num_children--;
+ remove_map_header(*ltarget, to, destination, t);
+ _clear(destination, t);
+ }
+ }
+
+ Header hdr = lookup_map_header(*lsource, from);
+ if (!hdr)
+ return db->submit_transaction(t);
+
+ remove_map_header(*lsource, from, hdr, t);
+ hdr->oid = to;
+ set_map_header(*ltarget, to, *hdr, t);
+
+ return db->submit_transaction(t);
+}
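+
+// Minimal usage sketch (illustrative only; error handling omitted, and the
+// names cct, kv_db and oid stand in for a CephContext*, a KeyValueDB* already
+// opened elsewhere by FileStore, and a ghobject_t):
+//
+//   DBObjectMap omap(cct, kv_db);
+//   omap.init(/*do_upgrade=*/false);
+//   std::map<std::string, ceph::buffer::list> kv;
+//   kv["foo"].append("bar");
+//   omap.set_keys(oid, kv);                 // persist omap entries for oid
+//   std::map<std::string, ceph::buffer::list> out;
+//   omap.get_values(oid, {"foo"}, &out);    // read them back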
diff --git a/src/os/filestore/DBObjectMap.h b/src/os/filestore/DBObjectMap.h
new file mode 100644
index 000000000..444f21eb8
--- /dev/null
+++ b/src/os/filestore/DBObjectMap.h
@@ -0,0 +1,584 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/ceph_mutex.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ * corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ * @see State
+ * @see write_state
+ * @see init
+ * @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ * : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ * : USER_HEADER_KEY - omap header for header->seq
+ * : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, max std::string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys). When looking up a key not contained in
+ * the complete std::set, we have to check the parent if we don't find it in the
+ * key std::set. During rm_keys, we copy keys from the parent and update the
+ * complete std::set to reflect the change @see rm_keys.
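+ *
+ * Example (illustrative): if a child's complete mapping contains "a"->"g"
+ * and its own key mapping contains "bar", then a lookup of "bar" is served
+ * from the child; a lookup of "cat" falls inside [a,g), so the parent is
+ * not consulted and the key is reported as absent; a lookup of "zebra"
+ * lies outside every complete region and is missing from the child, so
+ * the parent is checked.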
+ */
+class DBObjectMap : public ObjectMap {
+public:
+
+ KeyValueDB *get_db() override { return db.get(); }
+
+ /**
+ * Serializes access to next_seq as well as the in_use std::set
+ */
+ ceph::mutex header_lock = ceph::make_mutex("DBObjectMap");
+ ceph::condition_variable header_cond;
+ ceph::condition_variable map_header_cond;
+
+ /**
+ * Set of headers currently in use
+ */
+ std::set<uint64_t> in_use;
+ std::set<ghobject_t> map_header_in_use;
+
+ /**
+ * Takes the map_header_in_use entry in constructor, releases in
+ * destructor
+ */
+ class MapHeaderLock {
+ DBObjectMap *db;
+ boost::optional<ghobject_t> locked;
+
+ MapHeaderLock(const MapHeaderLock &);
+ MapHeaderLock &operator=(const MapHeaderLock &);
+ public:
+ explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
+ MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+ std::unique_lock l{db->header_lock};
+ db->map_header_cond.wait(l, [db, this] {
+ return !db->map_header_in_use.count(*locked);
+ });
+ db->map_header_in_use.insert(*locked);
+ }
+
+ const ghobject_t &get_locked() const {
+ ceph_assert(locked);
+ return *locked;
+ }
+
+ void swap(MapHeaderLock &o) {
+ ceph_assert(db == o.db);
+
+ // centos6's boost optional doesn't seem to have swap :(
+ boost::optional<ghobject_t> _locked = o.locked;
+ o.locked = locked;
+ locked = _locked;
+ }
+
+ ~MapHeaderLock() {
+ if (locked) {
+ std::lock_guard l{db->header_lock};
+ ceph_assert(db->map_header_in_use.count(*locked));
+ db->map_header_cond.notify_all();
+ db->map_header_in_use.erase(*locked);
+ }
+ }
+ };
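+
+ // Typical use (sketch, mirroring rename() in DBObjectMap.cc): hold the
+ // per-object lock for the whole operation on that object's leaf header:
+ //
+ //   MapHeaderLock hl(this, oid);          // blocks while oid is in use
+ //   Header header = lookup_map_header(hl, oid);
+ //   ...                                   // read/modify the leaf header
+ //                                         // released by ~MapHeaderLock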
+
+ DBObjectMap(CephContext* cct, KeyValueDB *db)
+ : ObjectMap(cct, db),
+ caches(cct->_conf->filestore_omap_header_cache_size)
+ {}
+
+ int set_keys(
+ const ghobject_t &oid,
+ const std::map<std::string, ceph::buffer::list> &set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int set_header(
+ const ghobject_t &oid,
+ const ceph::buffer::list &bl,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get_header(
+ const ghobject_t &oid,
+ ceph::buffer::list *bl
+ ) override;
+
+ int clear(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clear_keys_header(
+ const ghobject_t &oid,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int rm_keys(
+ const ghobject_t &oid,
+ const std::set<std::string> &to_clear,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int get(
+ const ghobject_t &oid,
+ ceph::buffer::list *header,
+ std::map<std::string, ceph::buffer::list> *out
+ ) override;
+
+ int get_keys(
+ const ghobject_t &oid,
+ std::set<std::string> *keys
+ ) override;
+
+ int get_values(
+ const ghobject_t &oid,
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::buffer::list> *out
+ ) override;
+
+ int check_keys(
+ const ghobject_t &oid,
+ const std::set<std::string> &keys,
+ std::set<std::string> *out
+ ) override;
+
+ int get_xattrs(
+ const ghobject_t &oid,
+ const std::set<std::string> &to_get,
+ std::map<std::string, ceph::buffer::list> *out
+ ) override;
+
+ int get_all_xattrs(
+ const ghobject_t &oid,
+ std::set<std::string> *out
+ ) override;
+
+ int set_xattrs(
+ const ghobject_t &oid,
+ const std::map<std::string, ceph::buffer::list> &to_set,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int remove_xattrs(
+ const ghobject_t &oid,
+ const std::set<std::string> &to_remove,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int rename(
+ const ghobject_t &from,
+ const ghobject_t &to,
+ const SequencerPosition *spos=0
+ ) override;
+
+ int legacy_clone(
+ const ghobject_t &oid,
+ const ghobject_t &target,
+ const SequencerPosition *spos=0
+ ) override;
+
+ /// Read initial state from backing store
+ int get_state();
+ /// Write current state settings to DB
+ void set_state();
+ /// Read initial state and upgrade or initialize state
+ int init(bool upgrade = false);
+
+ /// Upgrade store to current version
+ int upgrade_to_v2();
+
+ /// Consistency check, debug, there must be no parallel writes
+ int check(std::ostream &out, bool repair = false, bool force = false) override;
+
+ /// Ensure that all previous operations are durable
+ int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
+
+ void compact() override {
+ ceph_assert(db);
+ db->compact();
+ }
+
+ /// Util, get all objects, there must be no other concurrent access
+ int list_objects(std::vector<ghobject_t> *objs ///< [out] objects
+ );
+
+ struct _Header;
+ // Util, get all object headers, there must be no other concurrent access
+ int list_object_headers(std::vector<_Header> *out ///< [out] headers
+ );
+
+ ObjectMapIterator get_iterator(const ghobject_t &oid) override;
+
+ static const std::string USER_PREFIX;
+ static const std::string XATTR_PREFIX;
+ static const std::string SYS_PREFIX;
+ static const std::string COMPLETE_PREFIX;
+ static const std::string HEADER_KEY;
+ static const std::string USER_HEADER_KEY;
+ static const std::string GLOBAL_STATE_KEY;
+ static const std::string HOBJECT_TO_SEQ;
+
+ /// Legacy
+ static const std::string LEAF_PREFIX;
+ static const std::string REVERSE_LEAF_PREFIX;
+
+ /// persistent state for store @see generate_header
+ struct State {
+ static const __u8 CUR_VERSION = 3;
+ __u8 v;
+ uint64_t seq;
+ // legacy is false when complete regions were never used
+ bool legacy;
+ State() : v(0), seq(1), legacy(false) {}
+ explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
+
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(v, bl);
+ encode(seq, bl);
+ encode(legacy, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(3, bl);
+ if (struct_v >= 2)
+ decode(v, bl);
+ else
+ v = 0;
+ decode(seq, bl);
+ if (struct_v >= 3)
+ decode(legacy, bl);
+ else
+ legacy = false;
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("v", v);
+ f->dump_unsigned("seq", seq);
+ f->dump_bool("legacy", legacy);
+ }
+
+ static void generate_test_instances(std::list<State*> &o) {
+ o.push_back(new State(0));
+ o.push_back(new State(20));
+ }
+ } state;
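+
+ // Round-trip sketch for the versioned encoding above (illustrative):
+ //
+ //   State s(42);
+ //   ceph::buffer::list bl;
+ //   s.encode(bl);            // struct_v=3 envelope + v, seq, legacy
+ //   auto it = bl.cbegin();
+ //   State t;
+ //   t.decode(it);            // t.seq == 42; decoding older on-disk
+ //                            // versions falls back to v=0 / legacy=false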
+
+ struct _Header {
+ uint64_t seq;
+ uint64_t parent;
+ uint64_t num_children;
+
+ ghobject_t oid;
+
+ SequencerPosition spos;
+
+ void encode(ceph::buffer::list &bl) const {
+ coll_t unused;
+ ENCODE_START(2, 1, bl);
+ encode(seq, bl);
+ encode(parent, bl);
+ encode(num_children, bl);
+ encode(unused, bl);
+ encode(oid, bl);
+ encode(spos, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ coll_t unused;
+ DECODE_START(2, bl);
+ decode(seq, bl);
+ decode(parent, bl);
+ decode(num_children, bl);
+ decode(unused, bl);
+ decode(oid, bl);
+ if (struct_v >= 2)
+ decode(spos, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("parent", parent);
+ f->dump_unsigned("num_children", num_children);
+ f->dump_stream("oid") << oid;
+ }
+
+ static void generate_test_instances(std::list<_Header*> &o) {
+ o.push_back(new _Header);
+ o.push_back(new _Header);
+ o.back()->parent = 20;
+ o.back()->seq = 30;
+ }
+
+ size_t length() {
+ return sizeof(_Header);
+ }
+
+ _Header() : seq(0), parent(0), num_children(1) {}
+ };
+
+ /// String munging (public for testing)
+ static std::string ghobject_key(const ghobject_t &oid);
+ static std::string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+ static int is_buggy_ghobject_key_v1(CephContext* cct,
+ const std::string &in);
+private:
+ /// Implicit lock on Header->seq
+ typedef std::shared_ptr<_Header> Header;
+ ceph::mutex cache_lock = ceph::make_mutex("DBObjectMap::CacheLock");
+ SimpleLRU<ghobject_t, _Header> caches;
+
+ std::string map_header_key(const ghobject_t &oid);
+ std::string header_key(uint64_t seq);
+ std::string complete_prefix(Header header);
+ std::string user_prefix(Header header);
+ std::string sys_prefix(Header header);
+ std::string xattr_prefix(Header header);
+ std::string sys_parent_prefix(_Header header);
+ std::string sys_parent_prefix(Header header) {
+ return sys_parent_prefix(*header);
+ }
+
+ class EmptyIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ int seek_to_first() override { return 0; }
+ int seek_to_last() { return 0; }
+ int upper_bound(const std::string &after) override { return 0; }
+ int lower_bound(const std::string &to) override { return 0; }
+ bool valid() override { return false; }
+ int next() override { ceph_abort(); return 0; }
+ std::string key() override { ceph_abort(); return ""; }
+ ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); }
+ int status() override { return 0; }
+ };
+
+
+ /// Iterator
+ class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+ public:
+ DBObjectMap *map;
+
+ /// NOTE: implicit lock hlock->get_locked() when returned out of the class
+ MapHeaderLock hlock;
+ /// NOTE: implicit lock on header->seq AND for all ancestors
+ Header header;
+
+ /// parent_iter == NULL iff no parent
+ std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+ KeyValueDB::Iterator key_iter;
+ KeyValueDB::Iterator complete_iter;
+
+ /// cur_iter points to currently valid iterator
+ std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+ int r;
+
+ /// init() called, key_iter, complete_iter, parent_iter filled in
+ bool ready;
+ /// past end
+ bool invalid;
+
+ DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
+ map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}
+ int seek_to_first() override;
+ int seek_to_last();
+ int upper_bound(const std::string &after) override;
+ int lower_bound(const std::string &to) override;
+ bool valid() override;
+ int next() override;
+ std::string key() override;
+ ceph::buffer::list value() override;
+ int status() override;
+
+ bool on_parent() {
+ return cur_iter == parent_iter;
+ }
+
+ /// skips to next valid parent entry
+ int next_parent();
+
+ /// first parent() >= to
+ int lower_bound_parent(const std::string &to);
+
+ /**
+ * Tests whether to_test is in complete region
+ *
+ * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
+ */
+ int in_complete_region(const std::string &to_test, ///< [in] key to test
+ std::string *begin, ///< [out] beginning of region
+ std::string *end ///< [out] end of region
+ ); ///< @returns true if to_test is in the complete region, else false
+
+ private:
+ int init();
+ bool valid_parent();
+ int adjust();
+ };
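+
+ // Iteration sketch: cur_iter tracks whichever of key_iter (this object's
+ // own keys) and parent_iter is currently positioned on the smaller key,
+ // and next_parent()/in_complete_region() are used to skip parent keys
+ // that a complete region already covers, so callers see one merged,
+ // ordered key space spanning the object and its ancestors.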
+
+ typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+ DBObjectMapIterator _get_iterator(Header header) {
+ return std::make_shared<DBObjectMapIteratorImpl>(this, header);
+ }
+
+ /// sys
+
+ /// Removes node corresponding to header
+ void clear_header(Header header, KeyValueDB::Transaction t);
+
+ /// Set node containing input to new contents
+ void set_header(Header input, KeyValueDB::Transaction t);
+
+ /// Remove leaf node corresponding to oid in c
+ void remove_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ Header header,
+ KeyValueDB::Transaction t);
+
+ /// Set leaf node for c and oid to the value of header
+ void set_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid, _Header header,
+ KeyValueDB::Transaction t);
+
+ /// Return true if the op at spos has already been applied to header and should be skipped
+ bool check_spos(const ghobject_t &oid,
+ Header header,
+ const SequencerPosition *spos);
+
+ /// Lookup or create header for c oid
+ Header lookup_create_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Generate new header for c oid with new seq number
+ *
+ * Has the side effect of synchronously saving the new DBObjectMap state
+ */
+ Header _generate_new_header(const ghobject_t &oid, Header parent);
+ Header generate_new_header(const ghobject_t &oid, Header parent) {
+ std::lock_guard l{header_lock};
+ return _generate_new_header(oid, parent);
+ }
+
+ /// Lookup leaf header for c oid
+ Header _lookup_map_header(
+ const MapHeaderLock &l,
+ const ghobject_t &oid);
+ Header lookup_map_header(
+ const MapHeaderLock &l2,
+ const ghobject_t &oid) {
+ std::lock_guard l{header_lock};
+ return _lookup_map_header(l2, oid);
+ }
+
+ /// Lookup header node for input
+ Header lookup_parent(Header input);
+
+
+ /// Helpers
+ int _get_header(Header header, ceph::buffer::list *bl);
+
+ /// Scan keys in header into out_keys and out_values (if nonnull)
+ int scan(Header header,
+ const std::set<std::string> &in_keys,
+ std::set<std::string> *out_keys,
+ std::map<std::string, ceph::buffer::list> *out_values);
+
+ /// Remove header and all related prefixes
+ int _clear(Header header,
+ KeyValueDB::Transaction t);
+
+ /* Scan complete region bumping *begin to the beginning of any
+ * containing region and adding all complete region keys between
+ * the updated begin and end to the complete_keys_to_remove std::set */
+ int merge_new_complete(DBObjectMapIterator &iter,
+ std::string *begin,
+ const std::string &end,
+ std::set<std::string> *complete_keys_to_remove);
+
+ /// Writes out State (mainly next_seq)
+ int write_state(KeyValueDB::Transaction _t =
+ KeyValueDB::Transaction());
+
+ /// Copies header entry from parent @see rm_keys
+ int copy_up_header(Header header,
+ KeyValueDB::Transaction t);
+
+ /// Sets header @see set_header
+ void _set_header(Header header, const ceph::buffer::list &bl,
+ KeyValueDB::Transaction t);
+
+ /**
+ * Removes header seq lock and possibly object lock
+ * once Header is out of scope
+ * @see lookup_parent
+ * @see generate_new_header
+ */
+ class RemoveOnDelete {
+ public:
+ DBObjectMap *db;
+ explicit RemoveOnDelete(DBObjectMap *db) :
+ db(db) {}
+ void operator() (_Header *header) {
+ std::lock_guard l{db->header_lock};
+ ceph_assert(db->in_use.count(header->seq));
+ db->in_use.erase(header->seq);
+ db->header_cond.notify_all();
+ delete header;
+ }
+ };
+ friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+std::ostream& operator<<(std::ostream& out, const DBObjectMap::_Header& h);
+
+#endif
diff --git a/src/os/filestore/FDCache.h b/src/os/filestore/FDCache.h
new file mode 100644
index 000000000..a7d90c0e6
--- /dev/null
+++ b/src/os/filestore/FDCache.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_FDCACHE_H
+#define CEPH_FDCACHE_H
+
+#include <memory>
+#include <errno.h>
+#include <cstdio>
+#include "common/config_obs.h"
+#include "common/hobject.h"
+#include "common/shared_cache.hpp"
+#include "include/compat.h"
+#include "include/intarith.h"
+
+/**
+ * FD Cache
+ */
+class FDCache : public md_config_obs_t {
+public:
+ /**
+ * FD
+ *
+ * Wrapper for an fd. Destructor closes the fd.
+ */
+ class FD {
+ public:
+ const int fd;
+ explicit FD(int _fd) : fd(_fd) {
+ ceph_assert(_fd >= 0);
+ }
+ int operator*() const {
+ return fd;
+ }
+ ~FD() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ };
+
+private:
+ CephContext *cct;
+ const int registry_shards;
+ SharedLRU<ghobject_t, FD> *registry;
+
+public:
+ explicit FDCache(CephContext *cct) : cct(cct),
+ registry_shards(std::max<int64_t>(cct->_conf->filestore_fd_cache_shards, 1)) {
+ ceph_assert(cct);
+ cct->_conf.add_observer(this);
+ registry = new SharedLRU<ghobject_t, FD>[registry_shards];
+ for (int i = 0; i < registry_shards; ++i) {
+ registry[i].set_cct(cct);
+ registry[i].set_size(
+ std::max<int64_t>((cct->_conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+ ~FDCache() override {
+ cct->_conf.remove_observer(this);
+ delete[] registry;
+ }
+ typedef std::shared_ptr<FD> FDRef;
+
+ FDRef lookup(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].lookup(hoid);
+ }
+
+ FDRef add(const ghobject_t &hoid, int fd, bool *existed) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ return registry[registry_id].add(hoid, new FD(fd), existed);
+ }
+
+ /// clear cached fd for hoid, subsequent lookups will get an empty FD
+ void clear(const ghobject_t &hoid) {
+ int registry_id = hoid.hobj.get_hash() % registry_shards;
+ registry[registry_id].purge(hoid);
+ }
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const override {
+ static const char* KEYS[] = {
+ "filestore_fd_cache_size",
+ NULL
+ };
+ return KEYS;
+ }
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override {
+ if (changed.count("filestore_fd_cache_size")) {
+ for (int i = 0; i < registry_shards; ++i)
+ registry[i].set_size(
+ std::max<int64_t>((conf->filestore_fd_cache_size / registry_shards), 1));
+ }
+ }
+
+};
+typedef FDCache::FDRef FDRef;
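+
+// Usage sketch (fd is assumed to be an already-open descriptor for hoid;
+// the cache takes ownership and closes it when the last FDRef goes away):
+//
+//   FDCache fdcache(cct);
+//   bool existed;
+//   FDRef ref = fdcache.add(hoid, fd, &existed);  // wrap fd and cache it
+//   FDRef hit = fdcache.lookup(hoid);             // shared ref, or null
+//   fdcache.clear(hoid);                          // drop from cache; the fd
+//                                                 // closes once refs drop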
+
+#endif
diff --git a/src/os/filestore/FileJournal.cc b/src/os/filestore/FileJournal.cc
new file mode 100644
index 000000000..610b2b32d
--- /dev/null
+++ b/src/os/filestore/FileJournal.cc
@@ -0,0 +1,2234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "FileJournal.h"
+#include "include/color.h"
+#include "common/perf_counters.h"
+#include "FileStore.h"
+
+#include "include/compat.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+
+#include "common/blkdev.h"
+#if defined(__linux__)
+#include "common/linux_version.h"
+#endif
+
+#if defined(__FreeBSD__)
+#define O_DSYNC O_SYNC
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+using std::list;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+
+const static int64_t ONE_MEG(1 << 20);
+const static int CEPH_DIRECTIO_ALIGNMENT(4096);
+
+
+int FileJournal::_open(bool forwrite, bool create)
+{
+ int flags, ret;
+
+ if (forwrite) {
+ flags = O_RDWR;
+ if (directio)
+ flags |= O_DIRECT | O_DSYNC;
+ } else {
+ flags = O_RDONLY;
+ }
+ if (create)
+ flags |= O_CREAT;
+
+ if (fd >= 0) {
+ if (TEMP_FAILURE_RETRY(::close(fd))) {
+ int err = errno;
+ derr << "FileJournal::_open: error closing old fd: "
+ << cpp_strerror(err) << dendl;
+ }
+ }
+ fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags|O_CLOEXEC, 0644));
+ if (fd < 0) {
+ int err = errno;
+ dout(2) << "FileJournal::_open unable to open journal "
+ << fn << ": " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ struct stat st;
+ ret = ::fstat(fd, &st);
+ if (ret) {
+ ret = errno;
+ derr << "FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
+ ret = -ret;
+ goto out_fd;
+ }
+
+ if (S_ISBLK(st.st_mode)) {
+ ret = _open_block_device();
+ } else if (S_ISREG(st.st_mode)) {
+ if (aio && !force_aio) {
+ derr << "FileJournal::_open: disabling aio for non-block journal. Use "
+ << "journal_force_aio to force use of aio anyway" << dendl;
+ aio = false;
+ }
+ ret = _open_file(st.st_size, st.st_blksize, create);
+ } else {
+ derr << "FileJournal::_open: wrong journal file type: " << st.st_mode
+ << dendl;
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ goto out_fd;
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ aio_ctx = 0;
+ ret = io_setup(128, &aio_ctx);
+ if (ret < 0) {
+ switch (ret) {
+ // Contrary to naive expectations -EAGAIN means ...
+ case -EAGAIN:
+ derr << "FileJournal::_open: user's limit of aio events exceeded. "
+ << "Try increasing /proc/sys/fs/aio-max-nr" << dendl;
+ break;
+ default:
+ derr << "FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
+ break;
+ }
+ goto out_fd;
+ }
+ }
+#endif
+
+ /* We really want max_size to be a multiple of block_size. */
+ max_size -= max_size % block_size;
+
+ dout(1) << "_open " << fn << " fd " << fd
+ << ": " << max_size
+ << " bytes, block size " << block_size
+ << " bytes, directio = " << directio
+ << ", aio = " << aio
+ << dendl;
+ return 0;
+
+ out_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ fd = -1;
+ return ret;
+}
+
+int FileJournal::_open_block_device()
+{
+ int64_t bdev_sz = 0;
+ BlkDev blkdev(fd);
+ int ret = blkdev.get_size(&bdev_sz);
+ if (ret) {
+ dout(0) << __func__ << ": failed to read block device size." << dendl;
+ return -EIO;
+ }
+
+ /* Check for bdev_sz too small */
+ if (bdev_sz < ONE_MEG) {
+ dout(0) << __func__ << ": your block device must be at least "
+ << ONE_MEG << " bytes to be used for a Ceph journal." << dendl;
+ return -EINVAL;
+ }
+
+ dout(10) << __func__ << ": ignoring osd journal size. "
+ << "We'll use the entire block device (size: " << bdev_sz << ")"
+ << dendl;
+ max_size = bdev_sz;
+
+ block_size = cct->_conf->journal_block_size;
+
+ if (cct->_conf->journal_discard) {
+ discard = blkdev.support_discard();
+ dout(10) << fn << " support discard: " << (int)discard << dendl;
+ }
+
+ return 0;
+}
+
+int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
+ bool create)
+{
+ int ret;
+ int64_t conf_journal_sz(cct->_conf->osd_journal_size);
+ conf_journal_sz <<= 20;
+
+ if ((cct->_conf->osd_journal_size == 0) && (oldsize < ONE_MEG)) {
+ derr << "I'm sorry, I don't know how large of a journal to create. "
+ << "Please specify a block device to use as the journal OR "
+ << "set osd_journal_size in your ceph.conf" << dendl;
+ return -EINVAL;
+ }
+
+ if (create && (oldsize < conf_journal_sz)) {
+ uint64_t newsize(conf_journal_sz);
+ dout(10) << __func__ << " _open extending to " << newsize << " bytes" << dendl;
+ ret = ::ftruncate(fd, newsize);
+ if (ret < 0) {
+ int err = errno;
+ derr << "FileJournal::_open_file : unable to extend journal to "
+ << newsize << " bytes: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ret = ceph_posix_fallocate(fd, 0, newsize);
+ if (ret) {
+ derr << "FileJournal::_open_file : unable to preallocate journal to "
+ << newsize << " bytes: " << cpp_strerror(ret) << dendl;
+ return -ret;
+ }
+ max_size = newsize;
+ }
+ else {
+ max_size = oldsize;
+ }
+ block_size = cct->_conf->journal_block_size;
+
+ if (create && cct->_conf->journal_zero_on_create) {
+ derr << "FileJournal::_open_file : zeroing journal" << dendl;
+ uint64_t write_size = 1 << 20;
+ char *buf;
+ ret = ::posix_memalign((void **)&buf, block_size, write_size);
+ if (ret != 0) {
+ return -ret;
+ }
+ memset(static_cast<void*>(buf), 0, write_size);
+ uint64_t i = 0;
+ for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), write_size, i);
+ if (ret < 0) {
+ aligned_free(buf);
+ return -errno;
+ }
+ }
+ if (i < (uint64_t)max_size) {
+ ret = ::pwrite(fd, static_cast<void*>(buf), max_size - i, i);
+ if (ret < 0) {
+ aligned_free(buf);
+ return -errno;
+ }
+ }
+ aligned_free(buf);
+ }
+
+
+ dout(10) << "_open journal is not a block device, NOT checking disk "
+ << "write cache on '" << fn << "'" << dendl;
+
+ return 0;
+}
+
+// This can not be used on an active journal
+int FileJournal::check()
+{
+ int ret;
+
+ ceph_assert(fd == -1);
+ ret = _open(false, false);
+ if (ret)
+ return ret;
+
+ ret = read_header(&header);
+ if (ret < 0)
+ goto done;
+
+ if (header.fsid != fsid) {
+ derr << "check: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ ret = -EINVAL;
+ goto done;
+ }
+
+ dout(1) << "check: header looks ok" << dendl;
+ ret = 0;
+
+ done:
+ close();
+ return ret;
+}
+
+
+int FileJournal::create()
+{
+ void *buf = 0;
+ int64_t needed_space;
+ int ret;
+ ceph::buffer::ptr bp;
+ dout(2) << "create " << fn << " fsid " << fsid << dendl;
+
+ ret = _open(true, true);
+ if (ret)
+ goto done;
+
+ // write empty header
+ header = header_t();
+ header.flags = header_t::FLAG_CRC; // enable crcs on any new journal.
+ header.fsid = fsid;
+ header.max_size = max_size;
+ header.block_size = block_size;
+ if (cct->_conf->journal_block_align || directio)
+ header.alignment = block_size;
+ else
+ header.alignment = 16; // at least stay word aligned on 64bit machines...
+
+ header.start = get_top();
+ header.start_seq = 0;
+
+ print_header(header);
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ bp = prepare_header();
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, bp.c_str(), bp.length(), 0)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create : create write header error "
+ << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+
+ // zero first little bit, too.
+ ret = posix_memalign(&buf, block_size, block_size);
+ if (ret) {
+ ret = -ret;
+ derr << "FileJournal::create: failed to allocate " << block_size
+ << " bytes of memory: " << cpp_strerror(ret) << dendl;
+ goto close_fd;
+ }
+ memset(buf, 0, block_size);
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, buf, block_size, get_top())) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error zeroing first " << block_size
+ << " bytes " << cpp_strerror(ret) << dendl;
+ goto free_buf;
+ }
+
+ needed_space = cct->_conf->osd_max_write_size << 20;
+ needed_space += (2 * sizeof(entry_header_t)) + get_top();
+ if (header.max_size - header.start < needed_space) {
+ derr << "FileJournal::create: OSD journal is not large enough to hold "
+ << "osd_max_write_size bytes!" << dendl;
+ ret = -ENOSPC;
+ goto free_buf;
+ }
+
+ dout(2) << "create done" << dendl;
+ ret = 0;
+
+free_buf:
+ free(buf);
+ buf = 0;
+close_fd:
+ if (TEMP_FAILURE_RETRY(::close(fd)) < 0) {
+ ret = -errno;
+ derr << "FileJournal::create: error closing fd: " << cpp_strerror(ret)
+ << dendl;
+ }
+done:
+ fd = -1;
+ return ret;
+}
+
+// This can not be used on an active journal
+int FileJournal::peek_fsid(uuid_d& fsid)
+{
+ ceph_assert(fd == -1);
+ int r = _open(false, false);
+ if (r)
+ return r;
+ r = read_header(&header);
+ if (r < 0)
+ goto out;
+ fsid = header.fsid;
+out:
+ close();
+ return r;
+}
+
+int FileJournal::open(uint64_t fs_op_seq)
+{
+ dout(2) << "open " << fn << " fsid " << fsid << " fs_op_seq " << fs_op_seq << dendl;
+
+ uint64_t next_seq = fs_op_seq + 1;
+ uint64_t seq = -1;
+
+ int err = _open(false);
+ if (err)
+ return err;
+
+ // assume writeable, unless...
+ read_pos = 0;
+ write_pos = get_top();
+
+ // read header?
+ err = read_header(&header);
+ if (err < 0)
+ goto out;
+
+ // static zeroed buffer for alignment padding
+ delete [] zero_buf;
+ zero_buf = new char[header.alignment];
+ memset(zero_buf, 0, header.alignment);
+
+ dout(10) << "open header.fsid = " << header.fsid
+ //<< " vs expected fsid = " << fsid
+ << dendl;
+ if (header.fsid != fsid) {
+ derr << "FileJournal::open: ondisk fsid " << header.fsid << " doesn't match expected " << fsid
+ << ", invalid (someone else's?) journal" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size > max_size) {
+ dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.block_size != block_size) {
+ dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.max_size % header.block_size) {
+ dout(2) << "open journal max size " << header.max_size
+ << " not a multiple of block size " << header.block_size << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if (header.alignment != block_size && directio) {
+ dout(0) << "open journal alignment " << header.alignment << " does not match block size "
+ << block_size << " (required for direct_io journal mode)" << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+ if ((header.alignment % CEPH_DIRECTIO_ALIGNMENT) && directio) {
+ dout(0) << "open journal alignment " << header.alignment
+ << " is not multiple of minimum directio alignment "
+ << CEPH_DIRECTIO_ALIGNMENT << " (required for direct_io journal mode)"
+ << dendl;
+ err = -EINVAL;
+ goto out;
+ }
+
+ // looks like a valid header.
+ write_pos = 0; // not writeable yet
+
+ journaled_seq = header.committed_up_to;
+
+ // find next entry
+ read_pos = header.start;
+ seq = header.start_seq;
+
+ while (1) {
+ bufferlist bl;
+ off64_t old_pos = read_pos;
+ if (!read_entry(bl, seq)) {
+ dout(10) << "open reached end of journal." << dendl;
+ break;
+ }
+ if (seq > next_seq) {
+ dout(10) << "open entry " << seq << " len " << bl.length() << " > next_seq " << next_seq
+ << ", ignoring journal contents"
+ << dendl;
+ read_pos = -1;
+ last_committed_seq = 0;
+ return 0;
+ }
+ if (seq == next_seq) {
+ dout(10) << "open reached seq " << seq << dendl;
+ read_pos = old_pos;
+ break;
+ }
+ seq++; // next event should follow.
+ }
+
+ return 0;
+out:
+ close();
+ return err;
+}
+
+void FileJournal::_close(int fd) const
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileJournal::close()
+{
+ dout(1) << "close " << fn << dendl;
+
+ // stop writer thread
+ stop_writer();
+
+ // close
+ ceph_assert(writeq_empty());
+ ceph_assert(!must_write_header);
+ ceph_assert(fd >= 0);
+ _close(fd);
+ fd = -1;
+}
+
+
+int FileJournal::dump(ostream& out)
+{
+ return _dump(out, false);
+}
+
+int FileJournal::simple_dump(ostream& out)
+{
+ return _dump(out, true);
+}
+
+int FileJournal::_dump(ostream& out, bool simple)
+{
+ JSONFormatter f(true);
+ int ret = _fdump(f, simple);
+ f.flush(out);
+ return ret;
+}
+
+int FileJournal::_fdump(Formatter &f, bool simple)
+{
+ dout(10) << "_fdump" << dendl;
+
+ ceph_assert(fd == -1);
+ int err = _open(false, false);
+ if (err)
+ return err;
+
+ err = read_header(&header);
+ if (err < 0) {
+ close();
+ return err;
+ }
+
+ off64_t next_pos = header.start;
+
+ f.open_object_section("journal");
+
+ f.open_object_section("header");
+ f.dump_unsigned("flags", header.flags);
+ ostringstream os;
+ os << header.fsid;
+ f.dump_string("fsid", os.str());
+ f.dump_unsigned("block_size", header.block_size);
+ f.dump_unsigned("alignment", header.alignment);
+ f.dump_int("max_size", header.max_size);
+ f.dump_int("start", header.start);
+ f.dump_unsigned("committed_up_to", header.committed_up_to);
+ f.dump_unsigned("start_seq", header.start_seq);
+ f.close_section();
+
+ f.open_array_section("entries");
+ uint64_t seq = header.start_seq;
+ while (1) {
+ bufferlist bl;
+ off64_t pos = next_pos;
+
+ if (!pos) {
+ dout(2) << "_dump -- not readable" << dendl;
+ err = -EINVAL;
+ break;
+ }
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result != SUCCESS) {
+ if (seq < header.committed_up_to) {
+ dout(2) << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ err = -EINVAL;
+ }
+ dout(25) << ss.str() << dendl;
+ dout(25) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ break;
+ }
+
+ f.open_object_section("entry");
+ f.dump_unsigned("offset", pos);
+ f.dump_unsigned("seq", seq);
+ if (simple) {
+ f.dump_unsigned("bl.length", bl.length());
+ } else {
+ f.open_array_section("transactions");
+ auto p = bl.cbegin();
+ int trans_num = 0;
+ while (!p.end()) {
+ ObjectStore::Transaction t(p);
+ f.open_object_section("transaction");
+ f.dump_unsigned("trans_num", trans_num);
+ t.dump(&f);
+ f.close_section();
+ trans_num++;
+ }
+ f.close_section();
+ }
+ f.close_section();
+ }
+
+ f.close_section();
+ f.close_section();
+ dout(10) << "dump finish" << dendl;
+
+ close();
+ return err;
+}
+
+
+void FileJournal::start_writer()
+{
+ write_stop = false;
+ aio_stop = false;
+ write_thread.create("journal_write");
+#ifdef HAVE_LIBAIO
+ if (aio)
+ write_finish_thread.create("journal_wrt_fin");
+#endif
+}
+
+void FileJournal::stop_writer()
+{
+ // Do nothing if writer already stopped or never started
+ if (!write_stop)
+ {
+ {
+ std::lock_guard l{write_lock};
+ std::lock_guard p{writeq_lock};
+ write_stop = true;
+ writeq_cond.notify_all();
+ // Doesn't hurt to signal commit_cond in case thread is waiting there
+ // and caller didn't use committed_thru() first.
+ commit_cond.notify_all();
+ }
+ write_thread.join();
+
+ // write journal header now so that we have less to replay on remount
+ write_header_sync();
+ }
+
+#ifdef HAVE_LIBAIO
+ // stop aio completion thread *after* writer thread has stopped
+ // and has submitted all of its io
+ if (aio && !aio_stop) {
+ aio_lock.lock();
+ aio_stop = true;
+ aio_cond.notify_all();
+ write_finish_cond.notify_all();
+ aio_lock.unlock();
+ write_finish_thread.join();
+ }
+#endif
+}
+
+
+
+void FileJournal::print_header(const header_t &header) const
+{
+ dout(10) << "header: block_size " << header.block_size
+ << " alignment " << header.alignment
+ << " max_size " << header.max_size
+ << dendl;
+ dout(10) << "header: start " << header.start << dendl;
+ dout(10) << " write_pos " << write_pos << dendl;
+}
+
+int FileJournal::read_header(header_t *hdr) const
+{
+ dout(10) << "read_header" << dendl;
+ bufferlist bl;
+
+ ceph::buffer::ptr bp = ceph::buffer::create_small_page_aligned(block_size);
+ char* bpdata = bp.c_str();
+ int r = ::pread(fd, bpdata, bp.length(), 0);
+
+ if (r < 0) {
+ int err = errno;
+ dout(0) << "read_header got " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ if (bp.length() != (size_t)r) {
+ // r will be always less or equal than bp.length
+ bpdata += r;
+ memset(bpdata, 0, bp.length() - r);
+ }
+
+ bl.push_back(std::move(bp));
+
+ try {
+ auto p = bl.cbegin();
+ decode(*hdr, p);
+ }
+ catch (ceph::buffer::error& e) {
+ derr << "read_header error decoding journal header" << dendl;
+ return -EINVAL;
+ }
+
+
+ /*
+ * Unfortunately we weren't initializing the flags field for new
+ * journals! Aie. This is safe(ish) now that we have only one
+ * flag. Probably around when we add the next flag we need to
+ * remove this or else this (eventually old) code will clobber newer
+ * code's flags.
+ */
+ if (hdr->flags > 3) {
+ derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
+ hdr->flags = 0;
+ }
+
+ print_header(*hdr);
+
+ return 0;
+}
+
+bufferptr FileJournal::prepare_header()
+{
+ bufferlist bl;
+ {
+ std::lock_guard l{finisher_lock};
+ header.committed_up_to = journaled_seq;
+ }
+ encode(header, bl);
+ bufferptr bp = ceph::buffer::create_small_page_aligned(get_top());
+ // don't use bp.zero() here, because it also invalidates
+ // crc cache (which is not yet populated anyway)
+ char* data = bp.c_str();
+ memcpy(data, bl.c_str(), bl.length());
+ data += bl.length();
+ memset(data, 0, bp.length()-bl.length());
+ return bp;
+}
+
+void FileJournal::write_header_sync()
+{
+ std::lock_guard locker{write_lock};
+ must_write_header = true;
+ bufferlist bl;
+ do_write(bl);
+ dout(20) << __func__ << " finish" << dendl;
+}
+
+int FileJournal::check_for_full(uint64_t seq, off64_t pos, off64_t size)
+{
+ // already full?
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
+ // take 1 byte off so that we only get pos == header.start on EMPTY, never on FULL.
+ off64_t room;
+ if (pos >= header.start)
+ room = (header.max_size - pos) + (header.start - get_top()) - 1;
+ else
+ room = header.start - pos - 1;
+ dout(10) << "room " << room << " max_size " << max_size << " pos " << pos << " header.start " << header.start
+ << " top " << get_top() << dendl;
+
+ if (do_sync_cond) {
+ if (room >= (header.max_size >> 1) &&
+ room - size < (header.max_size >> 1)) {
+ dout(10) << " passing half full mark, triggering commit" << dendl;
+#ifdef CEPH_DEBUG_MUTEX
+ do_sync_cond->notify_all(true); // initiate a real commit so we can trim
+#else
+ do_sync_cond->notify_all();
+#endif
+ }
+ }
+
+ if (room >= size) {
+ dout(10) << "check_for_full at " << pos << " : " << size << " < " << room << dendl;
+ if (pos + size > header.max_size)
+ must_write_header = true;
+ return 0;
+ }
+
+ // full
+ dout(1) << "check_for_full at " << pos << " : JOURNAL FULL "
+ << pos << " >= " << room
+ << " (max_size " << header.max_size << " start " << header.start << ")"
+ << dendl;
+
+ off64_t max = header.max_size - get_top();
+ if (size > max)
+ dout(0) << "JOURNAL TOO SMALL: continuing, but slow: item " << size << " > journal " << max << " (usable)" << dendl;
+
+ return -ENOSPC;
+}
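+
+// Worked example of the room computation above (illustrative numbers): with
+// header.max_size = 1048576, get_top() = 4096 and header.start = 4096, an
+// empty journal (pos == header.start) yields
+//   room = (1048576 - 4096) + (4096 - 4096) - 1 = 1044479,
+// i.e. one byte less than the physically usable region; reserving that byte
+// is what keeps pos == header.start meaning EMPTY and never FULL.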
+
+int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ // gather queued writes
+ off64_t queue_pos = write_pos;
+
+ int eleft = cct->_conf->journal_max_write_entries;
+ unsigned bmax = cct->_conf->journal_max_write_bytes;
+
+ if (full_state != FULL_NOTFULL)
+ return -ENOSPC;
+
+ while (!writeq_empty()) {
+ list<write_item> items;
+ batch_pop_write(items);
+ list<write_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ uint64_t bytes = it->bl.length();
+ int r = prepare_single_write(*it, bl, queue_pos, orig_ops, orig_bytes);
+ if (r == 0) { // prepare ok, delete it
+ items.erase(it++);
+#ifdef HAVE_LIBAIO
+ {
+ std::lock_guard locker{aio_lock};
+ ceph_assert(aio_write_queue_ops > 0);
+ aio_write_queue_ops--;
+ ceph_assert(aio_write_queue_bytes >= bytes);
+ aio_write_queue_bytes -= bytes;
+ }
+#else
+ (void)bytes;
+#endif
+ }
+ if (r == -ENOSPC) {
+ // the journal may be full; put the remaining items back on the writeq
+ batch_unpop_write(items);
+ if (orig_ops)
+ goto out; // commit what we have
+
+ if (logger)
+ logger->inc(l_filestore_journal_full);
+
+ if (wait_on_full) {
+ dout(20) << "prepare_multi_write full on first entry, need to wait" << dendl;
+ } else {
+ dout(20) << "prepare_multi_write full on first entry, restarting journal" << dendl;
+
+ // throw out what we have so far
+ full_state = FULL_FULL;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ }
+
+ return -ENOSPC; // hrm, full on first op
+ }
+ if (eleft) {
+ if (--eleft == 0) {
+ dout(20) << "prepare_multi_write hit max events per write "
+ << cct->_conf->journal_max_write_entries << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ if (bmax) {
+ if (bl.length() >= bmax) {
+ dout(20) << "prepare_multi_write hit max write size "
+ << cct->_conf->journal_max_write_bytes << dendl;
+ batch_unpop_write(items);
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ dout(20) << "prepare_multi_write queue_pos now " << queue_pos << dendl;
+ ceph_assert((write_pos + bl.length() == queue_pos) ||
+ (write_pos + bl.length() - header.max_size + get_top() == queue_pos));
+ return 0;
+}
+
+/*
+void FileJournal::queue_write_fin(uint64_t seq, Context *fin)
+{
+ writing_seq.push_back(seq);
+ if (!waiting_for_notfull.empty()) {
+ // make sure previously unjournaled stuff waiting for UNFULL triggers
+ // _before_ newly journaled stuff does
+ dout(10) << "queue_write_fin will defer seq " << seq << " callback " << fin
+ << " until after UNFULL" << dendl;
+ C_Gather *g = new C_Gather(writeq.front().fin);
+ writing_fin.push_back(g->new_sub());
+ waiting_for_notfull.push_back(g->new_sub());
+ } else {
+ writing_fin.push_back(writeq.front().fin);
+ dout(20) << "queue_write_fin seq " << seq << " callback " << fin << dendl;
+ }
+}
+*/
+
+void FileJournal::queue_completions_thru(uint64_t seq)
+{
+ ceph_assert(ceph_mutex_is_locked(finisher_lock));
+ utime_t now = ceph_clock_now();
+ list<completion_item> items;
+ batch_pop_completions(items);
+ list<completion_item>::iterator it = items.begin();
+ while (it != items.end()) {
+ completion_item& next = *it;
+ if (next.seq > seq)
+ break;
+ utime_t lat = now;
+ lat -= next.start;
+ dout(10) << "queue_completions_thru seq " << seq
+ << " queueing seq " << next.seq
+ << " " << next.finish
+ << " lat " << lat << dendl;
+ if (logger) {
+ logger->tinc(l_filestore_journal_latency, lat);
+ }
+ if (next.finish)
+ finisher->queue(next.finish);
+ if (next.tracked_op) {
+ next.tracked_op->mark_event("journaled_completion_queued");
+ next.tracked_op->journal_trace.event("queued completion");
+ next.tracked_op->journal_trace.keyval("completed through", seq);
+ }
+ items.erase(it++);
+ }
+ batch_unpop_completions(items);
+ finisher_cond.notify_all();
+}
+
+
+int FileJournal::prepare_single_write(write_item &next_write, bufferlist& bl, off64_t& queue_pos, uint64_t& orig_ops, uint64_t& orig_bytes)
+{
+ uint64_t seq = next_write.seq;
+ bufferlist &ebl = next_write.bl;
+ off64_t size = ebl.length();
+
+ int r = check_for_full(seq, queue_pos, size);
+ if (r < 0)
+ return r; // ENOSPC or EAGAIN
+
+ uint32_t orig_len = next_write.orig_len;
+ orig_bytes += orig_len;
+ orig_ops++;
+
+ // add to write buffer
+ dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
+ << " len " << orig_len << " -> " << size << dendl;
+
+ unsigned seq_offset = offsetof(entry_header_t, seq);
+ unsigned magic1_offset = offsetof(entry_header_t, magic1);
+ unsigned magic2_offset = offsetof(entry_header_t, magic2);
+
+ bufferptr headerptr = ebl.buffers().front();
+ uint64_t _seq = seq;
+ uint64_t _queue_pos = queue_pos;
+ uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+ headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
+ headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bufferptr footerptr = ebl.buffers().back();
+ unsigned post_offset = footerptr.length() - sizeof(entry_header_t);
+ footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
+ footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+ footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+ bl.claim_append(ebl);
+ if (next_write.tracked_op) {
+ next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
+ next_write.tracked_op->journal_trace.event("prepare_single_write");
+ }
+
+ journalq.push_back(pair<uint64_t,off64_t>(seq, queue_pos));
+ writing_seq = seq;
+
+ queue_pos += size;
+ if (queue_pos >= header.max_size)
+ queue_pos = queue_pos + get_top() - header.max_size;
+
+ return 0;
+}
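+
+// Example of the wrap-around above (illustrative numbers): with
+// header.max_size = 1048576 and get_top() = 4096, an entry of size 300
+// queued at queue_pos = 1048500 advances queue_pos to 1048800, which is
+// past max_size, so it becomes 1048800 + 4096 - 1048576 = 4320: the last
+// 224 bytes of the entry wrap to just after the journal header and the
+// next entry will be queued at offset 4320.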
+
+void FileJournal::check_align(off64_t pos, bufferlist& bl)
+{
+ // make sure list segments are page aligned
+ if (directio && !bl.is_aligned_size_and_memory(block_size, CEPH_DIRECTIO_ALIGNMENT)) {
+ ceph_assert((bl.length() & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+ ceph_assert((pos & (CEPH_DIRECTIO_ALIGNMENT - 1)) == 0);
+ ceph_abort_msg("bl was not aligned");
+ }
+}
+
+int FileJournal::write_bl(off64_t& pos, bufferlist& bl)
+{
+ int ret;
+
+ off64_t spos = ::lseek64(fd, pos, SEEK_SET);
+ if (spos < 0) {
+ ret = -errno;
+ derr << "FileJournal::write_bl : lseek64 failed " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ret = bl.write_fd(fd);
+ if (ret) {
+ derr << "FileJournal::write_bl : write_fd failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ pos += bl.length();
+ if (pos == header.max_size)
+ pos = get_top();
+ return 0;
+}
+
+void FileJournal::do_write(bufferlist& bl)
+{
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ ceph::buffer::ptr hbp;
+ if (cct->_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ utime_t from = ceph_clock_now();
+
+ // entry
+ off64_t pos = write_pos;
+
+ // Adjust write_pos
+ write_pos += bl.length();
+ if (write_pos >= header.max_size)
+ write_pos = write_pos - header.max_size + get_top();
+
+ write_lock.unlock();
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_write wrapping, first bit at " << pos << " len " << first.length()
+ << " second bit len " << second.length() << " (orig len " << bl.length() << ")" << dendl;
+
+ // Save pos to write the first piece second
+ off64_t first_pos = pos;
+ off64_t orig_pos;
+ pos = get_top();
+ // header too?
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ }
+ // Write the second portion first possible with the header, so
+ // do_read_entry() won't even get a valid entry_header_t if there
+ // is a crash between the two writes.
+ orig_pos = pos;
+ if (write_bl(pos, second)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(pos, second);
+ ceph_abort();
+ }
+ orig_pos = first_pos;
+ if (write_bl(first_pos, first)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << orig_pos
+ << ") failed" << dendl;
+ check_align(first_pos, first);
+ ceph_abort();
+ }
+ ceph_assert(first_pos == get_top());
+ } else {
+ // header too?
+ if (hbp.length()) {
+ if (TEMP_FAILURE_RETRY(::pwrite(fd, hbp.c_str(), hbp.length(), 0)) < 0) {
+ int err = errno;
+ derr << "FileJournal::do_write: pwrite(fd=" << fd
+ << ", hbp.length=" << hbp.length() << ") failed :"
+ << cpp_strerror(err) << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_bl(pos, bl)) {
+ derr << "FileJournal::do_write: write_bl(pos=" << pos
+ << ") failed" << dendl;
+ check_align(pos, bl);
+ ceph_abort();
+ }
+ }
+
+ if (!directio) {
+ dout(20) << "do_write fsync" << dendl;
+
+ /*
+ * We'd really love to have a fsync_range or fdatasync_range and do a:
+ *
+ * if (split) {
+ * ::fsync_range(fd, header.max_size - split, split);
+ * ::fsync_range(fd, get_top(), bl.length() - split);
+ * else
+ * ::fsync_range(fd, write_pos, bl.length())
+ *
+ * NetBSD and AIX apparently have it, and adding it to Linux wouldn't be
+ * too hard given all the underlying infrastructure already exist.
+ *
+ * NOTE: using sync_file_range here would not be safe as it does not
+ * flush disk caches or commits any sort of metadata.
+ */
+ int ret = 0;
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = ::fsync(fd);
+#else
+ ret = ::fdatasync(fd);
+#endif
+ if (ret < 0) {
+ derr << __func__ << " fsync/fdatasync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise)
+ posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+
+ utime_t lat = ceph_clock_now() - from;
+ dout(20) << "do_write latency " << lat << dendl;
+
+ write_lock.lock();
+
+ ceph_assert(write_pos == pos);
+ ceph_assert(write_pos % header.alignment == 0);
+
+ {
+ std::lock_guard locker{finisher_lock};
+ journaled_seq = writing_seq;
+
+ // kick finisher?
+ // only if we haven't filled up recently!
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "do_write NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "do_write NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "do_write queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+}
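+
+// Example of the wrapping write above (illustrative numbers): with
+// header.max_size = 1048576, get_top() = 4096, pos = 1048476 and a 300-byte
+// buffer, split = 100: the trailing 200 bytes are written first at the
+// front of the journal (at get_top(), or at offset 0 together with a fresh
+// header), and only then are the leading 100 bytes written at pos; a crash
+// between the two writes therefore leaves no entry_header_t at pos that
+// do_read_entry() could mistake for a complete entry.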
+
+void FileJournal::flush()
+{
+ dout(10) << "waiting for completions to empty" << dendl;
+ {
+ std::unique_lock l{finisher_lock};
+ finisher_cond.wait(l, [this] { return completions_empty(); });
+ }
+ dout(10) << "flush waiting for finisher" << dendl;
+ finisher->wait_for_empty();
+ dout(10) << "flush done" << dendl;
+}
+
+
+void FileJournal::write_thread_entry()
+{
+ dout(10) << "write_thread_entry start" << dendl;
+ while (1) {
+ {
+ std::unique_lock locker{writeq_lock};
+ if (writeq.empty() && !must_write_header) {
+ if (write_stop)
+ break;
+ dout(20) << "write_thread_entry going to sleep" << dendl;
+ writeq_cond.wait(locker);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio) {
+ std::unique_lock locker{aio_lock};
+ // should we back off to limit aios in flight? try to do this
+ // adaptively so that we submit larger aios once we have lots of
+ // them in flight.
+ //
+ // NOTE: our condition here is based on aio_num (protected by
+ // aio_lock) and throttle_bytes (part of the write queue). when
+ // we sleep, we *only* wait for aio_num to change, and do not
+ // wake when more data is queued. this is not strictly correct,
+ // but should be fine given that we will have plenty of aios in
+ // flight if we hit this limit to ensure we keep the device
+ // saturated.
+ while (aio_num > 0) {
+ int exp = std::min<int>(aio_num * 2, 24);
+ long unsigned min_new = 1ull << exp;
+ uint64_t cur = aio_write_queue_bytes;
+ dout(20) << "write_thread_entry aio throttle: aio num " << aio_num << " bytes " << aio_bytes
+ << " ... exp " << exp << " min_new " << min_new
+ << " ... pending " << cur << dendl;
+ if (cur >= min_new)
+ break;
+ dout(20) << "write_thread_entry deferring until more aios complete: "
+ << aio_num << " aios with " << aio_bytes << " bytes needs " << min_new
+ << " bytes to start a new aio (currently " << cur << " pending)" << dendl;
+ aio_cond.wait(locker);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ }
+ }
+#endif
+
+ std::unique_lock locker{write_lock};
+ uint64_t orig_ops = 0;
+ uint64_t orig_bytes = 0;
+
+ bufferlist bl;
+ int r = prepare_multi_write(bl, orig_ops, orig_bytes);
+ // Don't care about journal full if stopping, so drop queue and
+ // possibly let header get written and loop above to notice stop
+ if (r == -ENOSPC) {
+ if (write_stop) {
+ dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
+ while (!writeq_empty()) {
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+ print_header(header);
+ r = 0;
+ } else {
+ dout(20) << "write_thread_entry full, going to sleep (waiting for commit)" << dendl;
+ commit_cond.wait(locker);
+ dout(20) << "write_thread_entry woke up" << dendl;
+ continue;
+ }
+ }
+ ceph_assert(r == 0);
+
+ if (logger) {
+ logger->inc(l_filestore_journal_wr);
+ logger->inc(l_filestore_journal_wr_bytes, bl.length());
+ }
+
+#ifdef HAVE_LIBAIO
+ if (aio)
+ do_aio_write(bl);
+ else
+ do_write(bl);
+#else
+ do_write(bl);
+#endif
+ complete_write(orig_ops, orig_bytes);
+ }
+
+ dout(10) << "write_thread_entry finish" << dendl;
+}
+
+#ifdef HAVE_LIBAIO
+void FileJournal::do_aio_write(bufferlist& bl)
+{
+
+ if (cct->_conf->journal_write_header_frequency &&
+ (((++journaled_since_start) %
+ cct->_conf->journal_write_header_frequency) == 0)) {
+ must_write_header = true;
+ }
+
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ ceph::buffer::ptr hbp;
+ if (must_write_header) {
+ must_write_header = false;
+ hbp = prepare_header();
+ }
+
+ // entry
+ off64_t pos = write_pos;
+
+ dout(15) << "do_aio_write writing " << pos << "~" << bl.length()
+ << (hbp.length() ? " + header":"")
+ << dendl;
+
+ // split?
+ off64_t split = 0;
+ if (pos + bl.length() > header.max_size) {
+ bufferlist first, second;
+ split = header.max_size - pos;
+ first.substr_of(bl, 0, split);
+ second.substr_of(bl, split, bl.length() - split);
+ ceph_assert(first.length() + second.length() == bl.length());
+ dout(10) << "do_aio_write wrapping, first bit at " << pos << "~" << first.length() << dendl;
+
+ if (write_aio_bl(pos, first, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ ceph_assert(pos == header.max_size);
+ if (hbp.length()) {
+ // be sneaky: include the header in the second fragment
+ bufferlist tmp;
+ tmp.push_back(hbp);
+ tmp.claim_append(second);
+ second.swap(tmp);
+ pos = 0; // we included the header
+ } else
+ pos = get_top(); // no header, start after that
+ if (write_aio_bl(pos, second, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ } else {
+ // header too?
+ if (hbp.length()) {
+ bufferlist hbl;
+ hbl.push_back(hbp);
+ loff_t pos = 0;
+ if (write_aio_bl(pos, hbl, 0)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(header) failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ if (write_aio_bl(pos, bl, writing_seq)) {
+ derr << "FileJournal::do_aio_write: write_aio_bl(pos=" << pos
+ << ") failed" << dendl;
+ ceph_abort();
+ }
+ }
+
+ write_pos = pos;
+ if (write_pos == header.max_size)
+ write_pos = get_top();
+ ceph_assert(write_pos % header.alignment == 0);
+}
+
+/**
+ * write a buffer using aio
+ *
+ * @param seq seq to trigger when this aio completes. if 0, do not update any state
+ * on completion.
+ */
+int FileJournal::write_aio_bl(off64_t& pos, bufferlist& bl, uint64_t seq)
+{
+ dout(20) << "write_aio_bl " << pos << "~" << bl.length() << " seq " << seq << dendl;
+
+ while (bl.length() > 0) {
+ int max = std::min<int>(bl.get_num_buffers(), IOV_MAX-1);
+ iovec *iov = new iovec[max];
+ int n = 0;
+ unsigned len = 0;
+ for (auto p = std::cbegin(bl.buffers()); n < max; ++p, ++n) {
+ ceph_assert(p != std::cend(bl.buffers()));
+ iov[n].iov_base = const_cast<void*>(static_cast<const void*>(p->c_str()));
+ iov[n].iov_len = p->length();
+ len += p->length();
+ }
+
+ bufferlist tbl;
+ bl.splice(0, len, &tbl); // move bytes from bl -> tbl
+
+ // lock only aio_queue, current aio, aio_num, aio_bytes, which may be
+ // modified in check_aio_completion
+ aio_lock.lock();
+ aio_queue.push_back(aio_info(tbl, pos, bl.length() > 0 ? 0 : seq));
+ aio_info& aio = aio_queue.back();
+ aio.iov = iov;
+
+ io_prep_pwritev(&aio.iocb, fd, aio.iov, n, pos);
+
+ dout(20) << "write_aio_bl .. " << aio.off << "~" << aio.len
+ << " in " << n << dendl;
+
+ aio_num++;
+ aio_bytes += aio.len;
+
+ // need to save current aio len to update write_pos later because current
+ // aio could be erased from aio_queue once it is done
+ uint64_t cur_len = aio.len;
+ // unlock aio_lock because following io_submit might take time to return
+ aio_lock.unlock();
+
+ iocb *piocb = &aio.iocb;
+
+ // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
+ int attempts = 16;
+ int delay = 125;
+ do {
+ int r = io_submit(aio_ctx, 1, &piocb);
+ dout(20) << "write_aio_bl io_submit return value: " << r << dendl;
+ if (r < 0) {
+ derr << "io_submit to " << aio.off << "~" << cur_len
+ << " got " << cpp_strerror(r) << dendl;
+ if (r == -EAGAIN && attempts-- > 0) {
+ usleep(delay);
+ delay *= 2;
+ continue;
+ }
+ check_align(pos, tbl);
+ ceph_abort_msg("io_submit got unexpected error");
+ } else {
+ break;
+ }
+ } while (true);
+ pos += cur_len;
+ }
+ aio_lock.lock();
+ write_finish_cond.notify_all();
+ aio_lock.unlock();
+ return 0;
+}
+#endif
+
+void FileJournal::write_finish_thread_entry()
+{
+#ifdef HAVE_LIBAIO
+ dout(10) << __func__ << " enter" << dendl;
+ while (true) {
+ {
+ std::unique_lock locker{aio_lock};
+ if (aio_queue.empty()) {
+ if (aio_stop)
+ break;
+ dout(20) << __func__ << " sleeping" << dendl;
+ write_finish_cond.wait(locker);
+ continue;
+ }
+ }
+
+ dout(20) << __func__ << " waiting for aio(s)" << dendl;
+ io_event event[16];
+ int r = io_getevents(aio_ctx, 1, 16, event, NULL);
+ if (r < 0) {
+ if (r == -EINTR) {
+ dout(0) << "io_getevents got " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ derr << "io_getevents got " << cpp_strerror(r) << dendl;
+ if (r == -EIO) {
+ note_io_error_event(devname.c_str(), fn.c_str(), -EIO, 0, 0, 0);
+ }
+ ceph_abort_msg("got unexpected error from io_getevents");
+ }
+
+ {
+ std::lock_guard locker{aio_lock};
+ for (int i=0; i<r; i++) {
+ aio_info *ai = (aio_info *)event[i].obj;
+ if (event[i].res != ai->len) {
+ derr << "aio to " << ai->off << "~" << ai->len
+ << " returned: " << (int)event[i].res << dendl;
+ ceph_abort_msg("unexpected aio error");
+ }
+ dout(10) << __func__ << " aio " << ai->off
+ << "~" << ai->len << " done" << dendl;
+ ai->done = true;
+ }
+ check_aio_completion();
+ }
+ }
+ dout(10) << __func__ << " exit" << dendl;
+#endif
+}
+
+#ifdef HAVE_LIBAIO
+/**
+ * check aio_queue for completed aios, and update state appropriately.
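+ *
+ * aio_queue is scanned from the front and the scan stops at the first aio
+ * that is not yet done, so journaled_seq only advances in submission order
+ * even when later aios complete first.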
+ */
+void FileJournal::check_aio_completion()
+{
+ ceph_assert(ceph_mutex_is_locked(aio_lock));
+ dout(20) << "check_aio_completion" << dendl;
+
+ bool completed_something = false, signal = false;
+ uint64_t new_journaled_seq = 0;
+
+ list<aio_info>::iterator p = aio_queue.begin();
+ while (p != aio_queue.end() && p->done) {
+ dout(20) << "check_aio_completion completed seq " << p->seq << " "
+ << p->off << "~" << p->len << dendl;
+ if (p->seq) {
+ new_journaled_seq = p->seq;
+ completed_something = true;
+ }
+ aio_num--;
+ aio_bytes -= p->len;
+ aio_queue.erase(p++);
+ signal = true;
+ }
+
+ if (completed_something) {
+ // kick finisher?
+ // only if we haven't filled up recently!
+ std::lock_guard locker{finisher_lock};
+ journaled_seq = new_journaled_seq;
+ if (full_state != FULL_NOTFULL) {
+ dout(10) << "check_aio_completion NOT queueing finisher seq " << journaled_seq
+ << ", full_commit_seq|full_restart_seq" << dendl;
+ } else {
+ if (plug_journal_completions) {
+ dout(20) << "check_aio_completion NOT queueing finishers through seq " << journaled_seq
+ << " due to completion plug" << dendl;
+ } else {
+ dout(20) << "check_aio_completion queueing finishers through seq " << journaled_seq << dendl;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+ }
+ if (signal) {
+ // maybe write queue was waiting for aio count to drop?
+ aio_cond.notify_all();
+ }
+}
+#endif
+
+int FileJournal::prepare_entry(vector<ObjectStore::Transaction>& tls, bufferlist* tbl) {
+ dout(10) << "prepare_entry " << tls << dendl;
+ int data_len = cct->_conf->journal_align_min_size - 1;
+ int data_align = -1; // -1 indicates that we don't care about the alignment
+ bufferlist bl;
+ for (vector<ObjectStore::Transaction>::iterator p = tls.begin();
+ p != tls.end(); ++p) {
+ if ((int)(*p).get_data_length() > data_len) {
+ data_len = (*p).get_data_length();
+ data_align = ((*p).get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+ }
+ encode(*p, bl);
+ }
+ if (tbl->length()) {
+ bl.claim_append(*tbl);
+ }
+ // build the entry header and pad the payload
+ entry_header_t h;
+ unsigned head_size = sizeof(entry_header_t);
+ off64_t base_size = 2*head_size + bl.length();
+ memset(&h, 0, sizeof(h));
+ if (data_align >= 0)
+ h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+ off64_t size = round_up_to(base_size + h.pre_pad, header.alignment);
+ unsigned post_pad = size - base_size - h.pre_pad;
+ h.len = bl.length();
+ h.post_pad = post_pad;
+ h.crc32c = bl.crc32c(0);
+ dout(10) << " len " << bl.length() << " -> " << size
+ << " (head " << head_size << " pre_pad " << h.pre_pad
+ << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+ << " (bl alignment " << data_align << ")"
+ << dendl;
+ bufferlist ebl;
+ // header
+ ebl.append((const char*)&h, sizeof(h));
+ if (h.pre_pad) {
+ ebl.push_back(ceph::buffer::create_static(h.pre_pad, zero_buf));
+ }
+ // payload
+ ebl.claim_append(bl);
+ if (h.post_pad) {
+ ebl.push_back(ceph::buffer::create_static(h.post_pad, zero_buf));
+ }
+ // footer
+ ebl.append((const char*)&h, sizeof(h));
+ if (directio)
+ ebl.rebuild_aligned(CEPH_DIRECTIO_ALIGNMENT);
+ *tbl = std::move(ebl);
+ return h.len;
+}
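+
+// Illustrative example (not part of the original source): assuming
+// header.alignment == 4096, sizeof(entry_header_t) == 40 and a 100-byte
+// payload with no alignment requirement, base_size = 2*40 + 100 = 180,
+// pre_pad = 0 and post_pad = 4096 - 180 = 3916, so the encoded entry
+//   [header 40][payload 100][post_pad 3916][footer 40]
+// occupies exactly one 4 KiB alignment unit on disk.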
+
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
+ Context *oncommit, TrackedOpRef osd_op)
+{
+ // dump on queue
+ dout(5) << "submit_entry seq " << seq
+ << " len " << e.length()
+ << " (" << oncommit << ")" << dendl;
+ ceph_assert(e.length() > 0);
+ ceph_assert(e.length() < header.max_size);
+
+ if (logger) {
+ logger->inc(l_filestore_journal_queue_bytes, orig_len);
+ logger->inc(l_filestore_journal_queue_ops, 1);
+ }
+
+ throttle.register_throttle_seq(seq, e.length());
+ if (logger) {
+ logger->inc(l_filestore_journal_ops, 1);
+ logger->inc(l_filestore_journal_bytes, e.length());
+ }
+
+ if (osd_op) {
+ osd_op->mark_event("commit_queued_for_journal_write");
+ if (osd_op->store_trace) {
+ osd_op->journal_trace.init("journal", &trace_endpoint, &osd_op->store_trace);
+ osd_op->journal_trace.event("submit_entry");
+ osd_op->journal_trace.keyval("seq", seq);
+ }
+ }
+ {
+ std::lock_guard l1{writeq_lock};
+#ifdef HAVE_LIBAIO
+ std::lock_guard l2{aio_lock};
+#endif
+ std::lock_guard l3{completions_lock};
+
+#ifdef HAVE_LIBAIO
+ aio_write_queue_ops++;
+ aio_write_queue_bytes += e.length();
+ aio_cond.notify_all();
+#endif
+
+ completions.push_back(
+ completion_item(
+ seq, oncommit, ceph_clock_now(), osd_op));
+ if (writeq.empty())
+ writeq_cond.notify_all();
+ writeq.push_back(write_item(seq, e, orig_len, osd_op));
+ if (osd_op)
+ osd_op->journal_trace.keyval("queue depth", writeq.size());
+ }
+}
+
+bool FileJournal::writeq_empty()
+{
+ std::lock_guard locker{writeq_lock};
+ return writeq.empty();
+}
+
+FileJournal::write_item &FileJournal::peek_write()
+{
+ ceph_assert(ceph_mutex_is_locked(write_lock));
+ std::lock_guard locker{writeq_lock};
+ return writeq.front();
+}
+
+void FileJournal::pop_write()
+{
+ ceph_assert(ceph_mutex_is_locked(write_lock));
+ std::lock_guard locker{writeq_lock};
+ if (logger) {
+ logger->dec(l_filestore_journal_queue_bytes, writeq.front().orig_len);
+ logger->dec(l_filestore_journal_queue_ops, 1);
+ }
+ writeq.pop_front();
+}
+
+void FileJournal::batch_pop_write(list<write_item> &items)
+{
+ ceph_assert(ceph_mutex_is_locked(write_lock));
+ {
+ std::lock_guard locker{writeq_lock};
+ writeq.swap(items);
+ }
+ for (auto &&i : items) {
+ if (logger) {
+ logger->dec(l_filestore_journal_queue_bytes, i.orig_len);
+ logger->dec(l_filestore_journal_queue_ops, 1);
+ }
+ }
+}
+
+void FileJournal::batch_unpop_write(list<write_item> &items)
+{
+ ceph_assert(ceph_mutex_is_locked(write_lock));
+ for (auto &&i : items) {
+ if (logger) {
+ logger->inc(l_filestore_journal_queue_bytes, i.orig_len);
+ logger->inc(l_filestore_journal_queue_ops, 1);
+ }
+ }
+ std::lock_guard locker{writeq_lock};
+ writeq.splice(writeq.begin(), items);
+}
+
+void FileJournal::commit_start(uint64_t seq)
+{
+ dout(10) << "commit_start" << dendl;
+
+ // was full?
+ switch (full_state) {
+ case FULL_NOTFULL:
+ break; // all good
+
+ case FULL_FULL:
+ if (seq >= journaled_seq) {
+ dout(1) << " FULL_FULL -> FULL_WAIT. commit_start on seq "
+ << seq << " >= journaled_seq " << journaled_seq
+ << ", moving to FULL_WAIT."
+ << dendl;
+ full_state = FULL_WAIT;
+ } else {
+ dout(1) << "FULL_FULL commit_start on seq "
+ << seq << " < journaled_seq " << journaled_seq
+ << ", remaining in FULL_FULL"
+ << dendl;
+ }
+ break;
+
+ case FULL_WAIT:
+ dout(1) << " FULL_WAIT -> FULL_NOTFULL. journal now active, setting completion plug." << dendl;
+ full_state = FULL_NOTFULL;
+ plug_journal_completions = true;
+ break;
+ }
+}
+
+/*
+ * send discard command to the journal block device
+ */
+void FileJournal::do_discard(int64_t offset, int64_t end)
+{
+ dout(10) << __func__ << " trim(" << offset << ", " << end << ")" << dendl;
+
+ offset = round_up_to(offset, block_size);
+ if (offset >= end)
+ return;
+ end = round_up_to(end - block_size, block_size);
+ ceph_assert(end >= offset);
+ if (offset < end) {
+ BlkDev blkdev(fd);
+ if (blkdev.discard(offset, end - offset) < 0) {
+ dout(1) << __func__ << " ioctl(BLKDISCARD) error: " << cpp_strerror(errno) << dendl;
+ }
+ }
+}
+
+void FileJournal::committed_thru(uint64_t seq)
+{
+ std::lock_guard locker{write_lock};
+
+ auto released = throttle.flush(seq);
+ if (logger) {
+ logger->dec(l_filestore_journal_ops, released.first);
+ logger->dec(l_filestore_journal_bytes, released.second);
+ }
+
+ if (seq < last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " < last_committed_seq " << last_committed_seq << dendl;
+ ceph_assert(seq >= last_committed_seq);
+ return;
+ }
+ if (seq == last_committed_seq) {
+ dout(5) << "committed_thru " << seq << " == last_committed_seq " << last_committed_seq << dendl;
+ return;
+ }
+
+ dout(5) << "committed_thru " << seq << " (last_committed_seq " << last_committed_seq << ")" << dendl;
+ last_committed_seq = seq;
+
+ // completions!
+ {
+ std::lock_guard locker{finisher_lock};
+ queue_completions_thru(seq);
+ if (plug_journal_completions && seq >= header.start_seq) {
+ dout(10) << " removing completion plug, queuing completions thru journaled_seq " << journaled_seq << dendl;
+ plug_journal_completions = false;
+ queue_completions_thru(journaled_seq);
+ }
+ }
+
+ // adjust start pointer
+ while (!journalq.empty() && journalq.front().first <= seq) {
+ journalq.pop_front();
+ }
+
+ int64_t old_start = header.start;
+ if (!journalq.empty()) {
+ header.start = journalq.front().second;
+ header.start_seq = journalq.front().first;
+ } else {
+ header.start = write_pos;
+ header.start_seq = seq + 1;
+ }
+
+ if (discard) {
+ dout(10) << __func__ << " will trim (" << old_start << ", " << header.start << ")" << dendl;
+ if (old_start < header.start)
+ do_discard(old_start, header.start - 1);
+ else {
+ do_discard(old_start, header.max_size - 1);
+ do_discard(get_top(), header.start - 1);
+ }
+ }
+
+ must_write_header = true;
+ print_header(header);
+
+ // committed but unjournaled items
+ while (!writeq_empty() && peek_write().seq <= seq) {
+ dout(15) << " dropping committed but unwritten seq " << peek_write().seq
+ << " len " << peek_write().bl.length()
+ << dendl;
+ complete_write(1, peek_write().orig_len);
+ pop_write();
+ }
+
+ commit_cond.notify_all();
+
+ dout(10) << "committed_thru done" << dendl;
+}
+
+
+void FileJournal::complete_write(uint64_t ops, uint64_t bytes)
+{
+ dout(5) << __func__ << " finished " << ops << " ops and "
+ << bytes << " bytes" << dendl;
+}
+
+int FileJournal::make_writeable()
+{
+ dout(10) << __func__ << dendl;
+ int r = set_throttle_params();
+ if (r < 0)
+ return r;
+
+ r = _open(true);
+ if (r < 0)
+ return r;
+
+ if (read_pos > 0)
+ write_pos = read_pos;
+ else
+ write_pos = get_top();
+ read_pos = 0;
+
+ must_write_header = true;
+
+ start_writer();
+ return 0;
+}
+
+int FileJournal::set_throttle_params()
+{
+ stringstream ss;
+ bool valid = throttle.set_params(
+ cct->_conf->journal_throttle_low_threshhold,
+ cct->_conf->journal_throttle_high_threshhold,
+ cct->_conf->filestore_expected_throughput_bytes,
+ cct->_conf->journal_throttle_high_multiple,
+ cct->_conf->journal_throttle_max_multiple,
+ header.max_size - get_top(),
+ &ss);
+
+ if (!valid) {
+ derr << "tried to set invalid params: "
+ << ss.str()
+ << dendl;
+ }
+ return valid ? 0 : -EINVAL;
+}
+
+const char** FileJournal::get_tracked_conf_keys() const
+{
+ static const char *KEYS[] = {
+ "journal_throttle_low_threshhold",
+ "journal_throttle_high_threshhold",
+ "journal_throttle_high_multiple",
+ "journal_throttle_max_multiple",
+ "filestore_expected_throughput_bytes",
+ NULL};
+ return KEYS;
+}
+
+void FileJournal::wrap_read_bl(
+ off64_t pos,
+ int64_t olen,
+ bufferlist* bl,
+ off64_t *out_pos
+ ) const
+{
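+ // The journal payload area is a ring covering [get_top(), header.max_size);
+ // positions at or beyond header.max_size wrap back to get_top().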
+ while (olen > 0) {
+ while (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+
+ int64_t len;
+ if (pos + olen > header.max_size)
+ len = header.max_size - pos; // partial
+ else
+ len = olen; // rest
+
+ int64_t actual = ::lseek64(fd, pos, SEEK_SET);
+ ceph_assert(actual == pos);
+
+ bufferptr bp = ceph::buffer::create(len);
+ int r = safe_read_exact(fd, bp.c_str(), len);
+ if (r) {
+ derr << "FileJournal::wrap_read_bl: safe_read_exact " << pos << "~" << len << " returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ bl->push_back(std::move(bp));
+ pos += len;
+ olen -= len;
+ }
+ if (pos >= header.max_size)
+ pos = pos + get_top() - header.max_size;
+ if (out_pos)
+ *out_pos = pos;
+}
+
+bool FileJournal::read_entry(
+ bufferlist &bl,
+ uint64_t &next_seq,
+ bool *corrupt)
+{
+ if (corrupt)
+ *corrupt = false;
+ uint64_t seq = next_seq;
+
+ if (!read_pos) {
+ dout(2) << "read_entry -- not readable" << dendl;
+ return false;
+ }
+
+ off64_t pos = read_pos;
+ off64_t next_pos = pos;
+ stringstream ss;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ &ss);
+ if (result == SUCCESS) {
+ journalq.push_back( pair<uint64_t,off64_t>(seq, pos));
+ uint64_t amount_to_take =
+ next_pos > pos ?
+ next_pos - pos :
+ (header.max_size - pos) + (next_pos - get_top());
+ throttle.take(amount_to_take);
+ throttle.register_throttle_seq(next_seq, amount_to_take);
+ if (logger) {
+ logger->inc(l_filestore_journal_ops, 1);
+ logger->inc(l_filestore_journal_bytes, amount_to_take);
+ }
+ if (next_seq > seq) {
+ return false;
+ } else {
+ read_pos = next_pos;
+ next_seq = seq;
+ if (seq > journaled_seq)
+ journaled_seq = seq;
+ return true;
+ }
+ } else {
+ derr << "do_read_entry(" << pos << "): " << ss.str() << dendl;
+ }
+
+ if (seq && seq < header.committed_up_to) {
+ derr << "Unable to read past sequence " << seq
+ << " but header indicates the journal has committed up through "
+ << header.committed_up_to << ", journal is corrupt" << dendl;
+ if (cct->_conf->journal_ignore_corruption) {
+ if (corrupt)
+ *corrupt = true;
+ return false;
+ } else {
+ ceph_abort();
+ }
+ }
+
+ dout(2) << "No further valid entries found, journal is most likely valid"
+ << dendl;
+ return false;
+}
+
+FileJournal::read_entry_result FileJournal::do_read_entry(
+ off64_t init_pos,
+ off64_t *next_pos,
+ bufferlist *bl,
+ uint64_t *seq,
+ ostream *ss,
+ entry_header_t *_h) const
+{
+ off64_t cur_pos = init_pos;
+ bufferlist _bl;
+ if (!bl)
+ bl = &_bl;
+
+ // header
+ entry_header_t *h;
+ bufferlist hbl;
+ off64_t _next_pos;
+ wrap_read_bl(cur_pos, sizeof(*h), &hbl, &_next_pos);
+ h = reinterpret_cast<entry_header_t *>(hbl.c_str());
+
+ if (!h->check_magic(cur_pos, header.get_fsid64())) {
+ dout(25) << "read_entry " << init_pos
+ << " : bad header magic, end of journal" << dendl;
+ if (ss)
+ *ss << "bad header magic";
+ if (next_pos)
+ *next_pos = init_pos + (4<<10); // check 4k ahead
+ return MAYBE_CORRUPT;
+ }
+ cur_pos = _next_pos;
+
+ // pad + body + pad
+ if (h->pre_pad)
+ cur_pos += h->pre_pad;
+
+ bl->clear();
+ wrap_read_bl(cur_pos, h->len, bl, &cur_pos);
+
+ if (h->post_pad)
+ cur_pos += h->post_pad;
+
+ // footer
+ entry_header_t *f;
+ bufferlist fbl;
+ wrap_read_bl(cur_pos, sizeof(*f), &fbl, &cur_pos);
+ f = reinterpret_cast<entry_header_t *>(fbl.c_str());
+ if (memcmp(f, h, sizeof(*f))) {
+ if (ss)
+ *ss << "bad footer magic, partial entry";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+
+ if ((header.flags & header_t::FLAG_CRC) || // if explicitly enabled (new journal)
+ h->crc32c != 0) { // newer entry in old journal
+ uint32_t actual_crc = bl->crc32c(0);
+ if (actual_crc != h->crc32c) {
+ if (ss)
+ *ss << "header crc (" << h->crc32c
+ << ") doesn't match body crc (" << actual_crc << ")";
+ if (next_pos)
+ *next_pos = cur_pos;
+ return MAYBE_CORRUPT;
+ }
+ }
+
+ // yay!
+ dout(2) << "read_entry " << init_pos << " : seq " << h->seq
+ << " " << h->len << " bytes"
+ << dendl;
+
+ // ok!
+ if (seq)
+ *seq = h->seq;
+
+
+ if (next_pos)
+ *next_pos = cur_pos;
+
+ if (_h)
+ *_h = *h;
+
+ ceph_assert(cur_pos % header.alignment == 0);
+ return SUCCESS;
+}
+
+void FileJournal::reserve_throttle_and_backoff(uint64_t count)
+{
+ throttle.get(count);
+}
+
+void FileJournal::get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h)
+{
+ off64_t pos = header.start;
+ off64_t next_pos = pos;
+ bufferlist bl;
+ uint64_t seq = 0;
+ dout(2) << __func__ << dendl;
+ while (1) {
+ bl.clear();
+ pos = next_pos;
+ read_entry_result result = do_read_entry(
+ pos,
+ &next_pos,
+ &bl,
+ &seq,
+ 0,
+ h);
+ if (result == FAILURE || result == MAYBE_CORRUPT)
+ ceph_abort();
+ if (seq == wanted_seq) {
+ if (_pos)
+ *_pos = pos;
+ return;
+ }
+ }
+ ceph_abort(); // not reachable
+}
+
+void FileJournal::corrupt(
+ int wfd,
+ off64_t corrupt_at)
+{
+ dout(2) << __func__ << dendl;
+ if (corrupt_at >= header.max_size)
+ corrupt_at = corrupt_at + get_top() - header.max_size;
+
+ int64_t actual = ::lseek64(fd, corrupt_at, SEEK_SET);
+ ceph_assert(actual == corrupt_at);
+
+ char buf[10];
+ int r = safe_read_exact(fd, buf, 1);
+ ceph_assert(r == 0);
+
+ actual = ::lseek64(wfd, corrupt_at, SEEK_SET);
+ ceph_assert(actual == corrupt_at);
+
+ buf[0]++;
+ r = safe_write(wfd, buf, 1);
+ ceph_assert(r == 0);
+}
+
+void FileJournal::corrupt_payload(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad;
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_footer_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos + sizeof(entry_header_t) + h.pre_pad +
+ h.len + h.post_pad +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
+
+
+void FileJournal::corrupt_header_magic(
+ int wfd,
+ uint64_t seq)
+{
+ dout(2) << __func__ << dendl;
+ off64_t pos = 0;
+ entry_header_t h;
+ get_header(seq, &pos, &h);
+ off64_t corrupt_at =
+ pos +
+ (reinterpret_cast<char*>(&h.magic2) - reinterpret_cast<char*>(&h));
+ corrupt(wfd, corrupt_at);
+}
+
+off64_t FileJournal::get_journal_size_estimate()
+{
+ off64_t size, start = header.start;
+ if (write_pos < start) {
+ size = (max_size - start) + write_pos;
+ } else {
+ size = write_pos - start;
+ }
+ dout(20) << __func__ << " journal size=" << size << dendl;
+ return size;
+}
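+
+// Illustrative example (not part of the original source): with max_size =
+// 1048576, header.start = 900000 and write_pos = 100000 the write position
+// has wrapped, so the estimate is (1048576 - 900000) + 100000 = 248576 bytes.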
+
+void FileJournal::get_devices(set<string> *ls)
+{
+ string dev_node;
+ BlkDev blkdev(fd);
+ if (int rc = blkdev.wholedisk(&dev_node); rc) {
+ return;
+ }
+ get_raw_devices(dev_node, ls);
+}
+
+void FileJournal::collect_metadata(map<string,string> *pm)
+{
+ BlkDev blkdev(fd);
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+ if (blkdev.partition(partition_path, PATH_MAX)) {
+ (*pm)["backend_filestore_journal_partition_path"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_journal_partition_path"] = string(partition_path);
+ }
+ if (blkdev.wholedisk(dev_node, PATH_MAX)) {
+ (*pm)["backend_filestore_journal_dev_node"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_journal_dev_node"] = string(dev_node);
+ devname = dev_node;
+ }
+}
diff --git a/src/os/filestore/FileJournal.h b/src/os/filestore/FileJournal.h
new file mode 100644
index 000000000..53b18c125
--- /dev/null
+++ b/src/os/filestore/FileJournal.h
@@ -0,0 +1,546 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <stdlib.h>
+using std::deque;
+
+#include "Journal.h"
+#include "common/config_fwd.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+#include "JournalThrottle.h"
+#include "common/zipkin_trace.h"
+
+#ifdef HAVE_LIBAIO
+# include <libaio.h>
+#endif
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Implements journaling on top of block device or file.
+ *
+ * Lock ordering is write_lock > aio_lock > (completions_lock | finisher_lock)
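+ * (i.e. a thread holding aio_lock may take completions_lock or finisher_lock,
+ * but must never wait for write_lock)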
+ */
+class FileJournal :
+ public Journal,
+ public md_config_obs_t {
+public:
+ /// Protected by finisher_lock
+ struct completion_item {
+ uint64_t seq;
+ Context *finish;
+ utime_t start;
+ TrackedOpRef tracked_op;
+ completion_item(uint64_t o, Context *c, utime_t s, TrackedOpRef opref)
+ : seq(o), finish(c), start(s), tracked_op(opref) {}
+ completion_item() : seq(0), finish(0), start(0) {}
+ };
+ struct write_item {
+ uint64_t seq;
+ ceph::buffer::list bl;
+ uint32_t orig_len;
+ TrackedOpRef tracked_op;
+ ZTracer::Trace trace;
+ write_item(uint64_t s, ceph::buffer::list& b, int ol, TrackedOpRef opref) :
+ seq(s), orig_len(ol), tracked_op(opref) {
+ bl = std::move(b);
+ }
+ write_item() : seq(0), orig_len(0) {}
+ };
+
+ ceph::mutex finisher_lock = ceph::make_mutex("FileJournal::finisher_lock");
+ ceph::condition_variable finisher_cond;
+ uint64_t journaled_seq;
+ bool plug_journal_completions;
+
+ ceph::mutex writeq_lock = ceph::make_mutex("FileJournal::writeq_lock");
+ ceph::condition_variable writeq_cond;
+ std::list<write_item> writeq;
+ bool writeq_empty();
+ write_item &peek_write();
+ void pop_write();
+ void batch_pop_write(std::list<write_item> &items);
+ void batch_unpop_write(std::list<write_item> &items);
+
+ ceph::mutex completions_lock =
+ ceph::make_mutex("FileJournal::completions_lock");
+ std::list<completion_item> completions;
+ bool completions_empty() {
+ std::lock_guard l{completions_lock};
+ return completions.empty();
+ }
+ void batch_pop_completions(std::list<completion_item> &items) {
+ std::lock_guard l{completions_lock};
+ completions.swap(items);
+ }
+ void batch_unpop_completions(std::list<completion_item> &items) {
+ std::lock_guard l{completions_lock};
+ completions.splice(completions.begin(), items);
+ }
+ completion_item completion_peek_front() {
+ std::lock_guard l{completions_lock};
+ ceph_assert(!completions.empty());
+ return completions.front();
+ }
+ void completion_pop_front() {
+ std::lock_guard l{completions_lock};
+ ceph_assert(!completions.empty());
+ completions.pop_front();
+ }
+
+ int prepare_entry(std::vector<ObjectStore::Transaction>& tls, ceph::buffer::list* tbl) override;
+
+ void submit_entry(uint64_t seq, ceph::buffer::list& bl, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) override;
+ /// End protected by finisher_lock
+
+ /*
+ * journal header
+ */
+ struct header_t {
+ enum {
+ FLAG_CRC = (1<<0),
+ // NOTE: remove kludgey weirdness in read_header() next time a flag is added.
+ };
+
+ uint64_t flags;
+ uuid_d fsid;
+ __u32 block_size;
+ __u32 alignment;
+ int64_t max_size; // max size of journal ring buffer
+ int64_t start; // offset of first entry
+ uint64_t committed_up_to; // committed up to
+
+ /**
+ * start_seq
+ *
+ * entry at header.start has sequence >= start_seq
+ *
+ * Generally, the entry at header.start will have sequence
+ * start_seq if it exists. The only exception is immediately
+ * after journal creation since the first sequence number is
+ * not known.
+ *
+ * If the first read on open fails, we can assume corruption
+ * if start_seq > committed_up_to because the entry would have
+ * a sequence >= start_seq and therefore > committed_up_to.
+ */
+ uint64_t start_seq;
+
+ header_t() :
+ flags(0), block_size(0), alignment(0), max_size(0), start(0),
+ committed_up_to(0), start_seq(0) {}
+
+ void clear() {
+ start = block_size;
+ }
+
+ uint64_t get_fsid64() const {
+ return *(uint64_t*)fsid.bytes();
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ __u32 v = 4;
+ encode(v, bl);
+ ceph::buffer::list em;
+ {
+ encode(flags, em);
+ encode(fsid, em);
+ encode(block_size, em);
+ encode(alignment, em);
+ encode(max_size, em);
+ encode(start, em);
+ encode(committed_up_to, em);
+ encode(start_seq, em);
+ }
+ encode(em, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ __u32 v;
+ decode(v, bl);
+ if (v < 2) { // normally 0, but conceivably 1
+ // decode old header_t struct (pre v0.40).
+ bl += 4u; // skip __u32 flags (it was unused by any old code)
+ flags = 0;
+ uint64_t tfsid;
+ decode(tfsid, bl);
+ *(uint64_t*)&fsid.bytes()[0] = tfsid;
+ *(uint64_t*)&fsid.bytes()[8] = tfsid;
+ decode(block_size, bl);
+ decode(alignment, bl);
+ decode(max_size, bl);
+ decode(start, bl);
+ committed_up_to = 0;
+ start_seq = 0;
+ return;
+ }
+ ceph::buffer::list em;
+ decode(em, bl);
+ auto t = em.cbegin();
+ decode(flags, t);
+ decode(fsid, t);
+ decode(block_size, t);
+ decode(alignment, t);
+ decode(max_size, t);
+ decode(start, t);
+
+ if (v > 2)
+ decode(committed_up_to, t);
+ else
+ committed_up_to = 0;
+
+ if (v > 3)
+ decode(start_seq, t);
+ else
+ start_seq = 0;
+ }
+ } header;
+
+ struct entry_header_t {
+ uint64_t seq; // fs op seq #
+ uint32_t crc32c; // payload only. not header, pre_pad, post_pad, or footer.
+ uint32_t len;
+ uint32_t pre_pad, post_pad;
+ uint64_t magic1;
+ uint64_t magic2;
+
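+ // magic1 is the entry's own byte offset in the journal; magic2 ties the
+ // entry to this journal (fsid ^ seq ^ len); see make_magic()/check_magic()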
+ static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+ return (fsid ^ seq ^ len);
+ }
+ bool check_magic(off64_t pos, uint64_t fsid) {
+ return
+ magic1 == (uint64_t)pos &&
+ magic2 == (fsid ^ seq ^ len);
+ }
+ } __attribute__((__packed__, aligned(4)));
+
+ bool journalq_empty() { return journalq.empty(); }
+
+private:
+ std::string fn;
+
+ char *zero_buf;
+ off64_t max_size;
+ size_t block_size;
+ bool directio, aio, force_aio;
+ bool must_write_header;
+ off64_t write_pos; // byte where the next entry to be written will go
+ off64_t read_pos; // byte where the next journal entry will be read during replay
+ bool discard; // for a block-device journal: whether discard (trim) is supported
+
+#ifdef HAVE_LIBAIO
+ /// state associated with an in-flight aio request
+ /// Protected by aio_lock
+ struct aio_info {
+ struct iocb iocb {};
+ ceph::buffer::list bl;
+ struct iovec *iov;
+ bool done;
+ uint64_t off, len; ///< these are for debug only
+ uint64_t seq; ///< seq number to complete on aio completion, if non-zero
+
+ aio_info(ceph::buffer::list& b, uint64_t o, uint64_t s)
+ : iov(NULL), done(false), off(o), len(b.length()), seq(s) {
+ bl = std::move(b);
+ }
+ ~aio_info() {
+ delete[] iov;
+ }
+ };
+ ceph::mutex aio_lock = ceph::make_mutex("FileJournal::aio_lock");
+ ceph::condition_variable aio_cond;
+ ceph::condition_variable write_finish_cond;
+ io_context_t aio_ctx = 0;
+ std::list<aio_info> aio_queue;
+ int aio_num = 0, aio_bytes = 0;
+ uint64_t aio_write_queue_ops = 0;
+ uint64_t aio_write_queue_bytes = 0;
+ /// End protected by aio_lock
+#endif
+
+ uint64_t last_committed_seq;
+ uint64_t journaled_since_start;
+
+ std::string devname;
+
+ /*
+ * full states cycle at the beginning of each commit epoch, when commit_start()
+ * is called.
+ * FULL - we just filled up during this epoch.
+ * WAIT - we filled up last epoch; now we have to wait until everything during
+ * that epoch commits to the fs before we can start writing over it.
+ * NOTFULL - all good, journal away.
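+ *
+ * FULL is entered (elsewhere in this file) when a write finds the ring full;
+ * commit_start() then steps FULL -> WAIT -> NOTFULL on successive commits,
+ * and completions stay plugged until committed_thru() sees the new start_seq.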
+ */
+ enum {
+ FULL_NOTFULL = 0,
+ FULL_FULL = 1,
+ FULL_WAIT = 2,
+ } full_state;
+
+ int fd;
+
+ // in journal
+ std::deque<std::pair<uint64_t, off64_t> > journalq; // track seq offsets, so we can trim later.
+ uint64_t writing_seq;
+
+
+ // throttle
+ int set_throttle_params();
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set <std::string> &changed) override {
+ for (const char **i = get_tracked_conf_keys();
+ *i;
+ ++i) {
+ if (changed.count(std::string(*i))) {
+ set_throttle_params();
+ return;
+ }
+ }
+ }
+
+ void complete_write(uint64_t ops, uint64_t bytes);
+ JournalThrottle throttle;
+
+ // write thread
+ ceph::mutex write_lock = ceph::make_mutex("FileJournal::write_lock");
+ bool write_stop;
+ bool aio_stop;
+
+ ceph::condition_variable commit_cond;
+
+ int _open(bool wr, bool create=false);
+ int _open_block_device();
+ void _close(int fd) const;
+ int _open_file(int64_t oldsize, blksize_t blksize, bool create);
+ int _dump(std::ostream& out, bool simple);
+ void print_header(const header_t &hdr) const;
+ int read_header(header_t *hdr) const;
+ ceph::bufferptr prepare_header();
+ void start_writer();
+ void stop_writer();
+ void write_thread_entry();
+
+ void queue_completions_thru(uint64_t seq);
+
+ int check_for_full(uint64_t seq, off64_t pos, off64_t size);
+ int prepare_multi_write(ceph::buffer::list& bl, uint64_t& orig_ops, uint64_t& orig_bytes);
+ int prepare_single_write(write_item &next_write, ceph::buffer::list& bl, off64_t& queue_pos,
+ uint64_t& orig_ops, uint64_t& orig_bytes);
+ void do_write(ceph::buffer::list& bl);
+
+ void write_finish_thread_entry();
+ void check_aio_completion();
+ void do_aio_write(ceph::buffer::list& bl);
+ int write_aio_bl(off64_t& pos, ceph::buffer::list& bl, uint64_t seq);
+
+
+ void check_align(off64_t pos, ceph::buffer::list& bl);
+ int write_bl(off64_t& pos, ceph::buffer::list& bl);
+
+ /// read len bytes from the journal starting at in_pos, wrapping around the ring as needed
+ void wrap_read_bl(
+ off64_t in_pos, ///< [in] start position
+ int64_t len, ///< [in] length to read
+ ceph::buffer::list* bl, ///< [out] result
+ off64_t *out_pos ///< [out] next position to read, will be wrapped
+ ) const;
+
+ void do_discard(int64_t offset, int64_t end);
+
+ class Writer : public Thread {
+ FileJournal *journal;
+ public:
+ explicit Writer(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_thread_entry();
+ return 0;
+ }
+ } write_thread;
+
+ class WriteFinisher : public Thread {
+ FileJournal *journal;
+ public:
+ explicit WriteFinisher(FileJournal *fj) : journal(fj) {}
+ void *entry() override {
+ journal->write_finish_thread_entry();
+ return 0;
+ }
+ } write_finish_thread;
+
+ off64_t get_top() const {
+ return round_up_to(sizeof(header), block_size);
+ }
+
+ ZTracer::Endpoint trace_endpoint;
+
+ public:
+ FileJournal(CephContext* cct, uuid_d fsid, Finisher *fin, ceph::condition_variable *sync_cond,
+ const char *f, bool dio=false, bool ai=true, bool faio=false) :
+ Journal(cct, fsid, fin, sync_cond),
+ journaled_seq(0),
+ plug_journal_completions(false),
+ fn(f),
+ zero_buf(NULL),
+ max_size(0), block_size(0),
+ directio(dio), aio(ai), force_aio(faio),
+ must_write_header(false),
+ write_pos(0), read_pos(0),
+ discard(false),
+ last_committed_seq(0),
+ journaled_since_start(0),
+ full_state(FULL_NOTFULL),
+ fd(-1),
+ writing_seq(0),
+ throttle(cct, cct->_conf->filestore_caller_concurrency),
+ write_stop(true),
+ aio_stop(true),
+ write_thread(this),
+ write_finish_thread(this),
+ trace_endpoint("0.0.0.0", 0, "FileJournal") {
+
+ if (aio && !directio) {
+ lderr(cct) << "FileJournal::_open_any: aio not supported without directio; disabling aio" << dendl;
+ aio = false;
+ }
+#ifndef HAVE_LIBAIO
+ if (aio && ::getenv("CEPH_DEV") == NULL) {
+ lderr(cct) << "FileJournal::_open_any: libaio not compiled in; disabling aio" << dendl;
+ aio = false;
+ }
+#endif
+
+ cct->_conf.add_observer(this);
+ }
+ ~FileJournal() override {
+ ceph_assert(fd == -1);
+ delete[] zero_buf;
+ cct->_conf.remove_observer(this);
+ }
+
+ int check() override;
+ int create() override;
+ int open(uint64_t fs_op_seq) override;
+ void close() override;
+ int peek_fsid(uuid_d& fsid);
+
+ int dump(std::ostream& out) override;
+ int simple_dump(std::ostream& out);
+ int _fdump(ceph::Formatter &f, bool simple);
+
+ void flush() override;
+
+ void get_devices(std::set<std::string> *ls) override;
+ void collect_metadata(std::map<std::string,std::string> *pm) override;
+
+ void reserve_throttle_and_backoff(uint64_t count) override;
+
+ bool is_writeable() override {
+ return read_pos == 0;
+ }
+ int make_writeable() override;
+
+ // writes
+ void commit_start(uint64_t seq) override;
+ void committed_thru(uint64_t seq) override;
+ bool should_commit_now() override {
+ return full_state != FULL_NOTFULL && !write_stop;
+ }
+
+ void write_header_sync();
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ off64_t get_journal_size_estimate() override;
+
+ // reads
+
+ /// Result code for read_entry
+ enum read_entry_result {
+ SUCCESS,
+ FAILURE,
+ MAYBE_CORRUPT
+ };
+
+ /**
+ * read_entry
+ *
+ * Reads next entry starting at pos. If the entry appears
+ * clean, *bl will contain the payload, *seq will contain
+ * the sequence number, and *out_pos will reflect the next
+ * read position. If the entry is invalid *ss will contain
+ * debug text, while *seq, *out_pos, and *bl will be unchanged.
+ *
+ * If the entry suggests a corrupt log, *ss will contain debug
+ * text, *out_pos will contain the next index to check. If
+ * we find an entry in this way that returns SUCCESS, the journal
+ * is most likely corrupt.
+ */
+ read_entry_result do_read_entry(
+ off64_t pos, ///< [in] position to read
+ off64_t *next_pos, ///< [out] next position to read
+ ceph::buffer::list* bl, ///< [out] payload for successful read
+ uint64_t *seq, ///< [out] seq of successful read
+ std::ostream *ss, ///< [out] error output
+ entry_header_t *h = 0 ///< [out] header
+ ) const; ///< @return result code
+
+ bool read_entry(
+ ceph::buffer::list &bl,
+ uint64_t &last_seq,
+ bool *corrupt
+ );
+
+ bool read_entry(
+ ceph::buffer::list &bl,
+ uint64_t &last_seq) override {
+ return read_entry(bl, last_seq, 0);
+ }
+
+ // Debug/Testing
+ void get_header(
+ uint64_t wanted_seq,
+ off64_t *_pos,
+ entry_header_t *h);
+ void corrupt(
+ int wfd,
+ off64_t corrupt_at);
+ void corrupt_payload(
+ int wfd,
+ uint64_t seq);
+ void corrupt_footer_magic(
+ int wfd,
+ uint64_t seq);
+ void corrupt_header_magic(
+ int wfd,
+ uint64_t seq);
+};
+
+WRITE_CLASS_ENCODER(FileJournal::header_t)
+
+#endif
diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc
new file mode 100644
index 000000000..5b6c7e39b
--- /dev/null
+++ b/src/os/filestore/FileStore.cc
@@ -0,0 +1,6449 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "boost/tuple/tuple.hpp"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#include <linux/falloc.h>
+#endif
+
+#include <iostream>
+#include <map>
+
+#include "include/linux_fiemap.h"
+
+#include "chain_xattr.h"
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+
+#include <fstream>
+#include <sstream>
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+
+#include "include/ceph_assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// XATTR_SPILL_OUT_NAME is an xattr used to indicate whether an object's
+// xattrs spill over into DBObjectMap: if XATTR_SPILL_OUT_NAME exists in the
+// file's xattrs with the "no spill out" value, there are no xattrs in DBObjectMap
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
+#define __FUNC__ __func__ << "(" << __LINE__ << ")"
+
+using std::cerr;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::crypto::SHA1;
+using ceph::BackTrace;
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+
+//Initial features in new superblock.
+static CompatSet get_fs_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+//Features are added here that this FileStore supports.
+static CompatSet get_fs_supported_compat_set() {
+ CompatSet compat = get_fs_initial_compat_set();
+ //Any features here can be set in code, but not in initial superblock
+ compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
+int FileStore::validate_hobject_key(const hobject_t &obj) const
+{
+ unsigned len = LFNIndex::get_max_escaped_name_len(obj);
+ return len > m_filestore_max_xattr_value_size ? -ENAMETOOLONG : 0;
+}
+
+int FileStore::get_block_device_fsid(CephContext* cct, const string& path,
+ uuid_d *fsid)
+{
+ // make sure we don't try to use aio or direct_io (and get annoying
+ // error messages from failing to do so); performance implications
+ // should be irrelevant for this use
+ FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false);
+ return j.peek_fsid(*fsid);
+}
+
+void FileStore::FSPerfTracker::update_from_perfcounters(
+ PerfCounters &logger)
+{
+ os_commit_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_filestore_journal_latency));
+ os_apply_latency_ns.consume_next(
+ logger.get_tavg_ns(
+ l_filestore_apply_latency));
+}
+
+
+ostream& operator<<(ostream& out, const FileStore::OpSequencer& s)
+{
+ return out << "osr(" << s.cid << ")";
+}
+
+int FileStore::get_cdir(const coll_t& cid, char *s, int len)
+{
+ const string &cid_str(cid.to_str());
+ return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str());
+}
+
+void FileStore::handle_eio()
+{
+ // don't try to map this back to an offset; too hard since there is
+ // a file system in between. we also don't really know whether this
+ // was a read or a write, since we have so many layers beneath us.
+ // don't even try.
+ note_io_error_event(devname.c_str(), basedir.c_str(), -EIO, 0, 0, 0);
+ ceph_abort_msg("unexpected eio error");
+}
+
+int FileStore::get_index(const coll_t& cid, Index *index)
+{
+ int r = index_manager.get_index(cid, basedir, index);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::init_index(const coll_t& cid)
+{
+ char path[PATH_MAX];
+ get_cdir(cid, path, sizeof(path));
+ int r = index_manager.init_index(cid, path, target_version);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path)
+{
+ IndexedPath path2;
+ if (!path)
+ path = &path2;
+ int r, exist;
+ ceph_assert(index.index);
+ r = (index.index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+ return 0;
+}
+
+int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length)
+{
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0)
+ return r;
+ r = ::ftruncate(**fd, length);
+ if (r < 0)
+ r = -errno;
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_truncate(**fd, length);
+ ceph_assert(rc >= 0);
+ }
+ lfn_close(fd);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf)
+{
+ IndexedPath path;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+
+ r = lfn_find(oid, index, &path);
+ if (r < 0)
+ return r;
+ r = ::stat(path->path(), buf);
+ if (r < 0)
+ r = -errno;
+ return r;
+}
+
+int FileStore::lfn_open(const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index)
+{
+ ceph_assert(outfd);
+ int r = 0;
+ bool need_lock = true;
+ int flags = O_RDWR;
+
+ if (create)
+ flags |= O_CREAT;
+ if (cct->_conf->filestore_odsync_write) {
+ flags |= O_DSYNC;
+ }
+
+ Index index2;
+ if (!index) {
+ index = &index2;
+ }
+ if (!((*index).index)) {
+ r = get_index(cid, index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ return r;
+ }
+ } else {
+ need_lock = false;
+ }
+
+ int fd, exist;
+ ceph_assert((*index).index);
+ if (need_lock) {
+ ((*index).index)->access_lock.lock();
+ }
+ if (!replaying) {
+ *outfd = fdcache.lookup(oid);
+ if (*outfd) {
+ if (need_lock) {
+ ((*index).index)->access_lock.unlock();
+ }
+ return 0;
+ }
+ }
+
+
+ IndexedPath path2;
+ IndexedPath *path = &path2;
+
+ r = (*index)->lookup(oid, path, &exist);
+ if (r < 0) {
+ derr << "could not find " << oid << " in index: "
+ << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+
+ r = ::open((*path)->path(), flags|O_CLOEXEC, 0644);
+ if (r < 0) {
+ r = -errno;
+ dout(10) << "error opening file " << (*path)->path() << " with flags="
+ << flags << ": " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ fd = r;
+ if (create && (!exist)) {
+ r = (*index)->created(oid, (*path)->path());
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error creating " << oid << " (" << (*path)->path()
+ << ") in index: " << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ r = chain_fsetxattr<true, true>(
+ fd, XATTR_SPILL_OUT_NAME,
+ XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
+ if (r < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
+ << "):" << cpp_strerror(-r) << dendl;
+ goto fail;
+ }
+ }
+
+ if (!replaying) {
+ bool existed;
+ *outfd = fdcache.add(oid, fd, &existed);
+ if (existed) {
+ TEMP_FAILURE_RETRY(::close(fd));
+ }
+ } else {
+ *outfd = std::make_shared<FDCache::FD>(fd);
+ }
+
+ if (need_lock) {
+ ((*index).index)->access_lock.unlock();
+ }
+
+ return 0;
+
+ fail:
+
+ if (need_lock) {
+ ((*index).index)->access_lock.unlock();
+ }
+
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+void FileStore::lfn_close(FDRef fd)
+{
+}
+
+int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid)
+{
+ Index index_new, index_old;
+ IndexedPath path_new, path_old;
+ int exist;
+ int r;
+ bool index_same = false;
+ if (c < newcid) {
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ } else if (c == newcid) {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ index_new = index_old;
+ index_same = true;
+ } else {
+ r = get_index(c, &index_old);
+ if (r < 0)
+ return r;
+ r = get_index(newcid, &index_new);
+ if (r < 0)
+ return r;
+ }
+
+ ceph_assert(index_old.index);
+ ceph_assert(index_new.index);
+
+ if (!index_same) {
+
+ std::shared_lock l1{(index_old.index)->access_lock};
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ std::unique_lock l2{(index_new.index)->access_lock};
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ } else {
+ std::unique_lock l1{(index_old.index)->access_lock};
+
+ r = index_old->lookup(o, &path_old, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (!exist)
+ return -ENOENT;
+
+ r = index_new->lookup(newoid, &path_new, &exist);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (exist)
+ return -EEXIST;
+
+ dout(25) << __FUNC__ << ": path_old: " << path_old << dendl;
+ dout(25) << __FUNC__ << ": path_new: " << path_new << dendl;
+ r = ::link(path_old->path(), path_new->path());
+ if (r < 0)
+ return -errno;
+
+ // make sure old fd for unlinked/overwritten file is gone
+ fdcache.clear(newoid);
+
+ r = index_new->created(newoid, path_new->path());
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ }
+ return 0;
+}
+
+int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o,
+ const SequencerPosition &spos,
+ bool force_clear_omap)
+{
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ceph_assert(index.index);
+ std::unique_lock l{(index.index)->access_lock};
+
+ {
+ IndexedPath path;
+ int hardlink;
+ r = index->lookup(o, &path, &hardlink);
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+
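+ // a hard link count of 0 or 1 means this is the last remaining link to the
+ // object's inode, so its omap data must be cleared along with it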
+ if (!force_clear_omap) {
+ if (hardlink == 0 || hardlink == 1) {
+ force_clear_omap = true;
+ }
+ }
+ if (force_clear_omap) {
+ dout(20) << __FUNC__ << ": clearing omap on " << o
+ << " in cid " << cid << dendl;
+ r = object_map->clear(o, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ if (cct->_conf->filestore_debug_inject_read_err) {
+ debug_obj_on_delete(o);
+ }
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ fdcache.clear(o);
+ } else {
+ /* Ensure that replay of this op doesn't result in the object_map
+ * going away.
+ */
+ if (!backend->can_checkpoint())
+ object_map->sync(&o, &spos);
+ }
+ if (hardlink == 0) {
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear_object(o); // should be only non-cache ref
+ }
+ return 0;
+ }
+ }
+ r = index->unlink(o);
+ if (r < 0) {
+ dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+FileStore::FileStore(CephContext* cct, const std::string &base,
+ const std::string &jdev, osflagbits_t flags,
+ const char *name, bool do_update) :
+ JournalingObjectStore(cct, base),
+ internal_name(name),
+ basedir(base), journalpath(jdev),
+ generic_flags(flags),
+ blk_size(0),
+ fsid_fd(-1), op_fd(-1),
+ basedir_fd(-1), current_fd(-1),
+ backend(nullptr),
+ index_manager(cct, do_update),
+ force_sync(false),
+ timer(cct, sync_entry_timeo_lock),
+ stop(false), sync_thread(this),
+ fdcache(cct),
+ wbthrottle(cct),
+ next_osr_id(0),
+ m_disable_wbthrottle(cct->_conf->filestore_odsync_write ||
+ !cct->_conf->filestore_wbthrottle_enable),
+ throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency),
+ throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency),
+ m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads),
+ m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads),
+ op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"),
+ op_wq(this,
+ ceph::make_timespan(cct->_conf->filestore_op_thread_timeout),
+ ceph::make_timespan(cct->_conf->filestore_op_thread_suicide_timeout),
+ &op_tp),
+ logger(nullptr),
+ trace_endpoint("0.0.0.0", 0, "FileStore"),
+ m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout),
+ m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ),
+ m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing),
+ m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead),
+ m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold),
+ m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval),
+ m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval),
+ m_filestore_fail_eio(cct->_conf->filestore_fail_eio),
+ m_filestore_fadvise(cct->_conf->filestore_fadvise),
+ do_update(do_update),
+ m_journal_dio(cct->_conf->journal_dio),
+ m_journal_aio(cct->_conf->journal_aio),
+ m_journal_force_aio(cct->_conf->journal_force_aio),
+ m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap),
+ m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap),
+ m_filestore_do_dump(false),
+ m_filestore_dump_fmt(true),
+ m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc),
+ m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size),
+ m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size),
+ m_fs_type(0),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0),
+ m_filestore_max_xattr_value_size(0)
+{
+ m_filestore_kill_at = cct->_conf->filestore_kill_at;
+ for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-ondisk-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore");
+ ondisk_finishers.push_back(f);
+ }
+ for (int i = 0; i < m_apply_finisher_num; ++i) {
+ ostringstream oss;
+ oss << "filestore-apply-" << i;
+ Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore");
+ apply_finishers.push_back(f);
+ }
+
+ ostringstream oss;
+ oss << basedir << "/current";
+ current_fn = oss.str();
+
+ ostringstream sss;
+ sss << basedir << "/current/commit_op_seq";
+ current_op_seq_fn = sss.str();
+
+ ostringstream omss;
+ if (cct->_conf->filestore_omap_backend_path != "") {
+ omap_dir = cct->_conf->filestore_omap_backend_path;
+ } else {
+ omss << basedir << "/current/omap";
+ omap_dir = omss.str();
+ }
+
+ // initialize logger
+ PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last);
+
+ plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue");
+ plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied");
+ plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue");
+ plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied");
+ plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency",
+ NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs");
+ plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written");
+ plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue");
+ plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store");
+ plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue");
+ plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue");
+ plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store");
+ plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency");
+ plb.add_u64(l_filestore_committing, "committing", "Is currently committing");
+
+ plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles");
+ plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits");
+ plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit");
+ plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full");
+ plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg",
+ "Store operation queue latency", NULL, PerfCountersBuilder::PRIO_USEFUL);
+ plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs");
+
+ logger = plb.create_perf_counters();
+
+ cct->get_perfcounters_collection()->add(logger);
+ cct->_conf.add_observer(this);
+
+ superblock.compat_features = get_fs_initial_compat_set();
+}
+
+FileStore::~FileStore()
+{
+ for (auto it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ delete *it;
+ *it = nullptr;
+ }
+ for (auto it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ delete *it;
+ *it = nullptr;
+ }
+ cct->_conf.remove_observer(this);
+ cct->get_perfcounters_collection()->remove(logger);
+
+ if (journal)
+ journal->logger = nullptr;
+ delete logger;
+ logger = nullptr;
+
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+}
+
+static void get_attrname(const char *name, char *buf, int len)
+{
+ snprintf(buf, len, "user.ceph.%s", name);
+}
+
+bool parse_attrname(char **name)
+{
+ if (strncmp(*name, "user.ceph.", 10) == 0) {
+ *name += 10;
+ return true;
+ }
+ return false;
+}
+
+void FileStore::collect_metadata(map<string,string> *pm)
+{
+ char partition_path[PATH_MAX];
+ char dev_node[PATH_MAX];
+
+ (*pm)["filestore_backend"] = backend->get_name();
+ ostringstream ss;
+ ss << "0x" << std::hex << m_fs_type << std::dec;
+ (*pm)["filestore_f_type"] = ss.str();
+
+ if (cct->_conf->filestore_collect_device_partition_information) {
+ int rc = 0;
+ BlkDev blkdev(fsid_fd);
+ if (rc = blkdev.partition(partition_path, PATH_MAX); rc) {
+ (*pm)["backend_filestore_partition_path"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_partition_path"] = string(partition_path);
+ }
+ if (rc = blkdev.wholedisk(dev_node, PATH_MAX); rc) {
+ (*pm)["backend_filestore_dev_node"] = "unknown";
+ } else {
+ (*pm)["backend_filestore_dev_node"] = string(dev_node);
+ devname = dev_node;
+ }
+ if (rc == 0 && vdo_fd >= 0) {
+ (*pm)["vdo"] = "true";
+ (*pm)["vdo_physical_size"] =
+ stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
+ }
+ if (journal) {
+ journal->collect_metadata(pm);
+ }
+ }
+}
+
+int FileStore::get_devices(set<string> *ls)
+{
+ string dev_node;
+ BlkDev blkdev(fsid_fd);
+ if (int rc = blkdev.wholedisk(&dev_node); rc) {
+ return rc;
+ }
+ get_raw_devices(dev_node, ls);
+ if (journal) {
+ journal->get_devices(ls);
+ }
+ return 0;
+}
+
+int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts)
+{
+ struct statfs buf;
+ buf0->reset();
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ if (::statfs(basedir.c_str(), &buf) < 0) {
+ int r = -errno;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ ceph_assert(r != -ENOENT);
+ return r;
+ }
+
+ uint64_t bfree = buf.f_bavail * buf.f_bsize;
+
+ // assume all of leveldb/rocksdb is omap.
+ {
+ map<string,uint64_t> kv_usage;
+ buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
+ }
+
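+ // if the data dir sits on a VDO (thin-provisioned) device, report the thin
+ // pool's size and clamp free space to what the pool can still provide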
+ uint64_t thin_total, thin_avail;
+ if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
+ buf0->total = thin_total;
+ bfree = std::min(bfree, thin_avail);
+ buf0->allocated = thin_total - thin_avail;
+ buf0->data_stored = bfree;
+ } else {
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ buf0->allocated = bfree;
+ buf0->data_stored = bfree;
+ }
+ buf0->available = bfree;
+
+ // FIXME: we don't know how to populate buf->internal_metadata; XFS doesn't
+ // tell us what its internal overhead is.
+
+ // Adjust for writes pending in the journal
+ if (journal) {
+ uint64_t estimate = journal->get_journal_size_estimate();
+ buf0->internally_reserved = estimate;
+ if (buf0->available > estimate)
+ buf0->available -= estimate;
+ else
+ buf0->available = 0;
+ }
+
+ return 0;
+}
+
+int FileStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap)
+{
+ return -ENOTSUP;
+}
+
+void FileStore::new_journal()
+{
+ if (journalpath.length()) {
+ dout(10) << "open_journal at " << journalpath << dendl;
+ journal = new FileJournal(cct, fsid, &finisher, &sync_cond,
+ journalpath.c_str(),
+ m_journal_dio, m_journal_aio,
+ m_journal_force_aio);
+ if (journal)
+ journal->logger = logger;
+ }
+ return;
+}
+
+int FileStore::dump_journal(ostream& out)
+{
+ int r;
+
+ if (!journalpath.length())
+ return -EINVAL;
+
+ FileJournal *journal = new FileJournal(cct, fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio);
+ r = journal->dump(out);
+ delete journal;
+ journal = nullptr;
+ return r;
+}
+
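+// choose a backend implementation based on the filesystem magic number
+// reported by statfs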
+FileStoreBackend *FileStoreBackend::create(unsigned long f_type, FileStore *fs)
+{
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ return new BtrfsFileStoreBackend(fs);
+# ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XfsFileStoreBackend(fs);
+# endif
+#endif
+#ifdef HAVE_LIBZFS
+ case ZFS_SUPER_MAGIC:
+ return new ZFSFileStoreBackend(fs);
+#endif
+ default:
+ return new GenericFileStoreBackend(fs);
+ }
+}
+
+void FileStore::create_backend(unsigned long f_type)
+{
+ m_fs_type = f_type;
+
+ ceph_assert(!backend);
+ backend = FileStoreBackend::create(f_type, this);
+
+ dout(0) << "backend " << backend->get_name()
+ << " (magic 0x" << std::hex << f_type << std::dec << ")"
+ << dendl;
+
+ switch (f_type) {
+#if defined(__linux__)
+ case BTRFS_SUPER_MAGIC:
+ if (!m_disable_wbthrottle){
+ wbthrottle.set_fs(WBThrottle::BTRFS);
+ }
+ break;
+
+ case XFS_SUPER_MAGIC:
+ // wbthrottle is constructed with fs(WBThrottle::XFS)
+ break;
+#endif
+ }
+
+ set_xattr_limits_via_conf();
+}
+
+int FileStore::mkfs()
+{
+ int ret = 0;
+ char fsid_fn[PATH_MAX];
+ char fsid_str[40];
+ uuid_d old_fsid;
+ uuid_d old_omap_fsid;
+
+ dout(1) << "mkfs in " << basedir << dendl;
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // open+lock fsid
+ snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl;
+ }
+
+ fsid.print(fsid_str);
+ strcat(fsid_str, "\n");
+ ret = ::ftruncate(fsid_fd, 0);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to truncate fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str));
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ if (::fsync(fsid_fd) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": close failed: can't write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ dout(10) << __FUNC__ << ": fsid is " << fsid << dendl;
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+ fsid = old_fsid;
+ dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl;
+ }
+
+ // version stamp
+ ret = write_version_stamp();
+ if (ret < 0) {
+ derr << __FUNC__ << ": write_version_stamp() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // superblock
+ superblock.omap_backend = cct->_conf->filestore_omap_backend;
+ ret = write_superblock();
+ if (ret < 0) {
+ derr << __FUNC__ << ": write_superblock() failed: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ struct statfs basefs;
+ ret = ::fstatfs(basedir_fd, &basefs);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": cannot fstatfs basedir "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+#if defined(__linux__)
+ if (basefs.f_type == BTRFS_SUPER_MAGIC &&
+ !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+ derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+ ret = -EPERM;
+ goto close_fsid_fd;
+ }
+#endif
+
+ create_backend(basefs.f_type);
+
+ ret = backend->create_current();
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ // write initial op_seq
+ {
+ uint64_t initial_seq = 0;
+ int fd = read_op_seq(&initial_seq);
+ if (fd < 0) {
+ ret = fd;
+ derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ if (initial_seq == 0) {
+ ret = write_op_seq(fd, 1);
+ if (ret < 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (backend->can_checkpoint()) {
+ // create snap_1 too
+ current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+ ceph_assert(current_fd >= 0);
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull);
+ ret = backend->create_checkpoint(s, nullptr);
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ if (ret < 0 && ret != -EEXIST) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+ }
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ ret = KeyValueDB::test_init(superblock.omap_backend, omap_dir);
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl;
+ goto close_fsid_fd;
+ }
+ // create fsid under omap
+ // open+lock fsid
+ int omap_fsid_fd;
+ char omap_fsid_fn[PATH_MAX];
+ snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str());
+ omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT|O_CLOEXEC, 0644);
+ if (omap_fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) {
+ ceph_assert(!fsid.is_zero());
+ fsid.print(fsid_str);
+ strcat(fsid_str, "\n");
+ ret = ::ftruncate(omap_fsid_fd, 0);
+ if (ret < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to truncate fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str));
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl;
+ if (::fsync(omap_fsid_fd) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": close failed: can't write fsid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_omap_fsid_fd;
+ }
+ dout(10) << "mkfs omap fsid is " << fsid << dendl;
+ } else {
+ if (fsid != old_omap_fsid) {
+ derr << __FUNC__ << ": " << omap_fsid_fn
+ << " has existed omap fsid " << old_omap_fsid
+ << " != expected osd fsid " << fsid
+ << dendl;
+ ret = -EINVAL;
+ goto close_omap_fsid_fd;
+ }
+ dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl;
+ }
+
+ dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl;
+
+ // journal?
+ ret = mkjournal();
+ if (ret)
+ goto close_omap_fsid_fd;
+
+ ret = write_meta("type", "filestore");
+ if (ret)
+ goto close_omap_fsid_fd;
+
+ dout(1) << "mkfs done in " << basedir << dendl;
+ ret = 0;
+
+ close_omap_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ delete backend;
+ backend = nullptr;
+ return ret;
+}
+
+int FileStore::mkjournal()
+{
+ // read fsid
+ int ret;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ ret = read_fsid(fd, &fsid);
+ if (ret < 0) {
+ derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+ ret = 0;
+
+ new_journal();
+ if (journal) {
+ ret = journal->check();
+ if (ret < 0) {
+ ret = journal->create();
+ if (ret)
+ derr << __FUNC__ << ": error creating journal on " << journalpath
+ << ": " << cpp_strerror(ret) << dendl;
+ else
+ dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl;
+ }
+ delete journal;
+ journal = nullptr;
+ }
+ return ret;
+}
+
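+// read the fsid from the given fd; accepts either the legacy 8-byte binary
+// fsid (mirrored into both halves of the uuid) or a uuid string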
+int FileStore::read_fsid(int fd, uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0)
+ return ret;
+ if (ret == 8) {
+ // old 64-bit fsid... mirror it.
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str;
+ *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str;
+ return 0;
+ }
+
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str))
+ return -EINVAL;
+ return 0;
+}
+
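+// take a non-blocking exclusive write lock (F_SETLK/F_WRLCK) on the fsid
+// file; failure usually means another ceph-osd has this store mounted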
+int FileStore::lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? "
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool FileStore::test_mount_in_use()
+{
+ dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl;
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str());
+
+ // verify fs isn't in use
+
+ fsid_fd = ::open(fn, O_RDWR|O_CLOEXEC, 0644);
+ if (fsid_fd < 0)
+ return false; // no fsid, ok.
+ bool inuse = lock_fsid() < 0;
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ return inuse;
+}
+
+bool FileStore::is_rotational()
+{
+ bool rotational;
+ if (backend) {
+ rotational = backend->is_rotational();
+ } else {
+ int fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return true;
+ struct statfs st;
+ int r = ::fstatfs(fd, &st);
+ ::close(fd);
+ if (r < 0) {
+ return true;
+ }
+ create_backend(st.f_type);
+ rotational = backend->is_rotational();
+ delete backend;
+ backend = nullptr;
+ }
+ dout(10) << __func__ << " " << (int)rotational << dendl;
+ return rotational;
+}
+
+bool FileStore::is_journal_rotational()
+{
+ bool journal_rotational;
+ if (backend) {
+ journal_rotational = backend->is_journal_rotational();
+ } else {
+ int fd = ::open(journalpath.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return true;
+ struct statfs st;
+ int r = ::fstatfs(fd, &st);
+ ::close(fd);
+ if (r < 0) {
+ return true;
+ }
+ create_backend(st.f_type);
+ journal_rotational = backend->is_journal_rotational();
+ delete backend;
+ backend = nullptr;
+ }
+ dout(10) << __func__ << " " << (int)journal_rotational << dendl;
+ return journal_rotational;
+}
+
+int FileStore::_detect_fs()
+{
+ struct statfs st;
+ int r = ::fstatfs(basedir_fd, &st);
+ if (r < 0)
+ return -errno;
+
+ blk_size = st.f_bsize;
+
+#if defined(__linux__)
+ if (st.f_type == BTRFS_SUPER_MAGIC &&
+ !g_ceph_context->check_experimental_feature_enabled("btrfs")) {
+ derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl;
+ return -EPERM;
+ }
+#endif
+
+ create_backend(st.f_type);
+
+ r = backend->detect_features();
+ if (r < 0) {
+ derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // vdo
+ {
+ char dev_node[PATH_MAX];
+ if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
+ vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
+ if (vdo_fd >= 0) {
+ dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
+ << dendl;
+ }
+ }
+ }
+
+ // test xattrs
+ char fn[PATH_MAX];
+ int x = rand();
+ int y = x+1;
+ snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str());
+ int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0700);
+ if (tmpfd < 0) {
+ int ret = -errno;
+ derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x));
+ if (ret >= 0)
+ ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y));
+ if ((ret < 0) || (x != y)) {
+ derr << "Extended attributes don't appear to work. ";
+ if (ret)
+ *_dout << "Got error " + cpp_strerror(ret) + ". ";
+ *_dout << "If you are using ext3 or ext4, be sure to mount the underlying "
+ << "file system with the 'user_xattr' option." << dendl;
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+ return -ENOTSUP;
+ }
+
+ char buf[1000];
+ memset(buf, 0, sizeof(buf)); // shut up valgrind
+ chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf));
+ chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
+ ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
+ if (ret == -ENOSPC) {
+ dout(0) << "limited size xattrs" << dendl;
+ }
+ chain_fremovexattr(tmpfd, "user.test");
+ chain_fremovexattr(tmpfd, "user.test2");
+ chain_fremovexattr(tmpfd, "user.test3");
+ chain_fremovexattr(tmpfd, "user.test4");
+ chain_fremovexattr(tmpfd, "user.test5");
+
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(tmpfd));
+
+ return 0;
+}
+
+int FileStore::_sanity_check_fs()
+{
+ // sanity check(s)
+
+ if (((int)m_filestore_journal_writeahead +
+ (int)m_filestore_journal_parallel +
+ (int)m_filestore_journal_trailing) > 1) {
+ dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n"
+ << " is enabled in ceph.conf. You must choose a single journal mode."
+ << TEXT_NORMAL << std::endl;
+ return -EINVAL;
+ }
+
+ if (!backend->can_checkpoint()) {
+ if (!journal || !m_filestore_journal_writeahead) {
+ dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl;
+ cerr << TEXT_RED
+ << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n"
+ << " For non-btrfs volumes, a writeahead journal is required to\n"
+ << " maintain on-disk consistency in the event of a crash. Your conf\n"
+ << " should include something like:\n"
+ << " osd journal = /path/to/journal_device_or_file\n"
+ << " filestore journal writeahead = true\n"
+ << TEXT_NORMAL;
+ }
+ }
+
+ if (!journal) {
+ dout(0) << "mount WARNING: no journal" << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: No osd journal is configured: write latency may be high.\n"
+ << " If you will not be using an osd journal, write latency may be\n"
+ << " relatively high. It can be reduced somewhat by lowering\n"
+ << " filestore_max_sync_interval, but lower values mean lower write\n"
+ << " throughput, especially with spinning disks.\n"
+ << TEXT_NORMAL;
+ }
+
+ return 0;
+}
+
+int FileStore::write_superblock()
+{
+ bufferlist bl;
+ encode(superblock, bl);
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length(), 0600);
+}
+
+int FileStore::read_superblock()
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // If the file doesn't exist write initial CompatSet
+ return write_superblock();
+ }
+ return ret;
+ }
+
+ bufferlist bl;
+ bl.push_back(std::move(bp));
+ auto i = bl.cbegin();
+ decode(superblock, i);
+ return 0;
+}
+
+int FileStore::update_version_stamp()
+{
+ return write_version_stamp();
+}
+
+int FileStore::version_stamp_is_valid(uint32_t *version)
+{
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ return ret;
+ }
+ bufferlist bl;
+ bl.push_back(std::move(bp));
+ auto i = bl.cbegin();
+ decode(*version, i);
+ dout(10) << __FUNC__ << ": was " << *version << " vs target "
+ << target_version << dendl;
+ if (*version == target_version)
+ return 1;
+ else
+ return 0;
+}
+
+int FileStore::flush_cache(ostream *os)
+{
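+ // drop the kernel page, dentry, and inode caches by writing "3" to
+ // /proc/sys/vm/drop_caches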
+ string drop_caches_file = "/proc/sys/vm/drop_caches";
+ int drop_caches_fd = ::open(drop_caches_file.c_str(), O_WRONLY|O_CLOEXEC), ret = 0;
+ char buf[2] = "3";
+ size_t len = strlen(buf);
+
+ if (drop_caches_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+ if (os) {
+ *os << "FileStore flush_cache: failed to open " << drop_caches_file << ": " << cpp_strerror(ret);
+ }
+ return ret;
+ }
+
+ if (::write(drop_caches_fd, buf, len) < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to write to " << drop_caches_file << ": " << cpp_strerror(ret) << dendl;
+ if (os) {
+ *os << "FileStore flush_cache: failed to write to " << drop_caches_file << ": " << cpp_strerror(ret);
+ }
+ goto out;
+ }
+
+out:
+ ::close(drop_caches_fd);
+ return ret;
+}
+
+int FileStore::write_version_stamp()
+{
+ dout(1) << __FUNC__ << ": " << target_version << dendl;
+ bufferlist bl;
+ encode(target_version, bl);
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length(), 0600);
+}
+
+int FileStore::upgrade()
+{
+ dout(1) << __FUNC__ << dendl;
+ uint32_t version;
+ int r = version_stamp_is_valid(&version);
+
+ if (r == -ENOENT) {
+ derr << "The store_version file doesn't exist." << dendl;
+ return -EINVAL;
+ }
+ if (r < 0)
+ return r;
+ if (r == 1)
+ return 0;
+
+ if (version < 3) {
+ derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl;
+ return -EINVAL;
+ }
+
+ // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to
+ // open up DBObjectMap with the do_upgrade flag, which we already did.
+ update_version_stamp();
+ return 0;
+}
+
+int FileStore::read_op_seq(uint64_t *seq)
+{
+ int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR|O_CLOEXEC, 0644);
+ if (op_fd < 0) {
+ int r = -errno;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ char s[40];
+ memset(s, 0, sizeof(s));
+ int ret = safe_read(op_fd, s, sizeof(s) - 1);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+ }
+ *seq = atoll(s);
+ return op_fd;
+}
+
+int FileStore::write_op_seq(int fd, uint64_t seq)
+{
+ char s[30];
+ snprintf(s, sizeof(s), "%" PRId64 "\n", seq);
+ int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0));
+ if (ret < 0) {
+ ret = -errno;
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ }
+ return ret;
+}
+
+int FileStore::mount()
+{
+ int ret;
+ char buf[PATH_MAX];
+ uint64_t initial_op_seq;
+ uuid_d omap_fsid;
+ set<string> cluster_snaps;
+ CompatSet supported_compat_set = get_fs_supported_compat_set();
+
+ dout(5) << "basedir " << basedir << " journal " << journalpath << dendl;
+
+ ret = set_throttle_params();
+ if (ret != 0)
+ goto done;
+
+ // make sure global base dir exists
+ if (::access(basedir.c_str(), R_OK | W_OK)) {
+ ret = -errno;
+ derr << __FUNC__ << ": unable to access basedir '" << basedir << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ // get fsid
+ snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
+ fsid_fd = ::open(buf, O_RDWR|O_CLOEXEC, 0644);
+ if (fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening '" << buf << "': "
+ << cpp_strerror(ret) << dendl;
+ goto done;
+ }
+
+ ret = read_fsid(fsid_fd, &fsid);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret)
+ << dendl;
+ goto close_fsid_fd;
+ }
+
+ if (lock_fsid() < 0) {
+ derr << __FUNC__ << ": lock_fsid failed" << dendl;
+ ret = -EBUSY;
+ goto close_fsid_fd;
+ }
+
+ dout(10) << "mount fsid is " << fsid << dendl;
+
+
+ uint32_t version_stamp;
+ ret = version_stamp_is_valid(&version_stamp);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in version_stamp_is_valid: "
+ << cpp_strerror(ret) << dendl;
+ goto close_fsid_fd;
+ } else if (ret == 0) {
+ if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) {
+ derr << __FUNC__ << ": stale version stamp detected: "
+ << version_stamp
+ << ". Proceeding, do_update "
+ << "is set, performing disk format upgrade."
+ << dendl;
+ do_update = true;
+ } else {
+ ret = -EINVAL;
+ derr << __FUNC__ << ": stale version stamp " << version_stamp
+ << ". Please run the FileStore update script before starting the "
+ << "OSD, or set filestore_update_to to " << target_version
+ << " (currently " << cct->_conf->filestore_update_to << ")"
+ << dendl;
+ goto close_fsid_fd;
+ }
+ }
+
+ ret = read_superblock();
+ if (ret < 0) {
+ goto close_fsid_fd;
+ }
+
+ // Check if this FileStore supports all the necessary features to mount
+ if (supported_compat_set.compare(superblock.compat_features) == -1) {
+ derr << __FUNC__ << ": Incompatible features set "
+ << superblock.compat_features << dendl;
+ ret = -EINVAL;
+ goto close_fsid_fd;
+ }
+
+ // open some dir handles
+ basedir_fd = ::open(basedir.c_str(), O_RDONLY|O_CLOEXEC);
+ if (basedir_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to open " << basedir << ": "
+ << cpp_strerror(ret) << dendl;
+ basedir_fd = -1;
+ goto close_fsid_fd;
+ }
+
+ // test for btrfs, xattrs, etc.
+ ret = _detect_fs();
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in _detect_fs: "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ {
+ list<string> ls;
+ ret = backend->list_checkpoints(ls);
+ if (ret < 0) {
+ derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ long long unsigned c, prev = 0;
+ char clustersnap[NAME_MAX];
+ for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
+ if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
+ ceph_assert(c > prev);
+ prev = c;
+ snaps.push_back(c);
+ } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
+ cluster_snaps.insert(*it);
+ }
+ }
+
+ if (m_osd_rollback_to_cluster_snap.length() &&
+ cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) {
+ derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl;
+ ret = -ENOENT;
+ goto close_basedir_fd;
+ }
+
+ char nosnapfn[200];
+ snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str());
+
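+ // on checkpoint-capable backends (btrfs), roll current/ back to the newest
+ // consistent snapshot (or a requested cluster snapshot) before replaying the journal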
+ if (backend->can_checkpoint()) {
+ if (snaps.empty()) {
+ dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
+ } else {
+ char s[NAME_MAX];
+ uint64_t curr_seq = 0;
+
+ if (m_osd_rollback_to_cluster_snap.length()) {
+ derr << TEXT_RED
+ << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
+ << TEXT_NORMAL
+ << dendl;
+ ceph_assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
+ } else {
+ {
+ int fd = read_op_seq(&curr_seq);
+ if (fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+ }
+ if (curr_seq)
+ dout(10) << " current/ seq was " << curr_seq << dendl;
+ else
+ dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
+
+ uint64_t cp = snaps.back();
+ dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
+
+ // if current/ is marked as non-snapshotted, refuse to roll
+ // back (without clear direction) to avoid throwing out new
+ // data.
+ struct stat st;
+ if (::stat(nosnapfn, &st) == 0) {
+ if (!m_osd_use_stale_snap) {
+ derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
+ derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
+ derr << "config option for --osd-use-stale-snap startup argument." << dendl;
+ ret = -ENOTSUP;
+ goto close_basedir_fd;
+ }
+ derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
+ << ", newest snap is " << cp << dendl;
+ cerr << TEXT_YELLOW
+ << " ** WARNING: forcing the use of stale snapshot data **"
+ << TEXT_NORMAL << std::endl;
+ }
+
+ dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl;
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ }
+
+ // drop current?
+ ret = backend->rollback_to(s);
+ if (ret) {
+ derr << __FUNC__ << ": error rolling back to " << s << ": "
+ << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+ }
+ }
+ initial_op_seq = 0;
+
+ current_fd = ::open(current_fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (current_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl;
+ goto close_basedir_fd;
+ }
+
+ ceph_assert(current_fd >= 0);
+
+ op_fd = read_op_seq(&initial_op_seq);
+ if (op_fd < 0) {
+ ret = op_fd;
+ derr << __FUNC__ << ": read_op_seq failed" << dendl;
+ goto close_current_fd;
+ }
+
+ dout(5) << "mount op_seq is " << initial_op_seq << dendl;
+ if (initial_op_seq == 0) {
+ derr << "mount initial op seq is 0; something is wrong" << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+
+ if (!backend->can_checkpoint()) {
+ // mark current/ as non-snapshotted so that we don't roll back away
+ // from it.
+ int r = ::creat(nosnapfn, 0644);
+ if (r < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": failed to create current/nosnap" << dendl;
+ goto close_current_fd;
+ }
+ VOID_TEMP_FAILURE_RETRY(::close(r));
+ } else {
+ // clear nosnap marker, if present.
+ ::unlink(nosnapfn);
+ }
+
+ // check fsid with omap
+ // get omap fsid
+ char omap_fsid_buf[PATH_MAX];
+ struct ::stat omap_fsid_stat;
+ snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str());
+ // if osd_uuid does not exist, assume this omap matches the corresponding osd
+ if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0) {
+ dout(10) << __FUNC__ << ": osd_uuid not found under omap, "
+ << "assuming it matches."
+ << dendl;
+ } else {
+ int omap_fsid_fd;
+ // if osd_uuid exists, compare it with the osd fsid
+ omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY|O_CLOEXEC, 0644);
+ if (omap_fsid_fd < 0) {
+ ret = -errno;
+ derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': "
+ << cpp_strerror(ret)
+ << dendl;
+ goto close_current_fd;
+ }
+ ret = read_fsid(omap_fsid_fd, &omap_fsid);
+ VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd));
+ if (ret < 0) {
+ derr << __FUNC__ << ": error reading omap_fsid_fd"
+ << ", omap_fsid = " << omap_fsid
+ << ": " << cpp_strerror(ret)
+ << dendl;
+ goto close_current_fd;
+ }
+ if (fsid != omap_fsid) {
+ derr << __FUNC__ << ": " << omap_fsid_buf
+ << " has existed omap fsid " << omap_fsid
+ << " != expected osd fsid " << fsid
+ << dendl;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ }
+
+ dout(0) << "start omap initiation" << dendl;
+ if (!(generic_flags & SKIP_MOUNT_OMAP)) {
+ KeyValueDB * omap_store = KeyValueDB::create(cct,
+ superblock.omap_backend,
+ omap_dir);
+ if (!omap_store)
+ {
+ derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ if (superblock.omap_backend == "rocksdb")
+ ret = omap_store->init(cct->_conf->filestore_rocksdb_options);
+ else
+ ret = omap_store->init();
+
+ if (ret < 0) {
+ derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl;
+ goto close_current_fd;
+ }
+
+ stringstream err;
+ if (omap_store->create_and_open(err)) {
+ delete omap_store;
+ omap_store = nullptr;
+ derr << __FUNC__ << ": Error initializing " << superblock.omap_backend
+ << " : " << err.str() << dendl;
+ ret = -1;
+ goto close_current_fd;
+ }
+
+ DBObjectMap *dbomap = new DBObjectMap(cct, omap_store);
+ ret = dbomap->init(do_update);
+ if (ret < 0) {
+ delete dbomap;
+ dbomap = nullptr;
+ derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl;
+ goto close_current_fd;
+ }
+ stringstream err2;
+
+ if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
+ derr << err2.str() << dendl;
+ delete dbomap;
+ dbomap = nullptr;
+ ret = -EINVAL;
+ goto close_current_fd;
+ }
+ object_map.reset(dbomap);
+ }
+
+ // journal
+ new_journal();
+
+ // select journal mode?
+ if (journal) {
+ if (!m_filestore_journal_writeahead &&
+ !m_filestore_journal_parallel &&
+ !m_filestore_journal_trailing) {
+ if (!backend->can_checkpoint()) {
+ m_filestore_journal_writeahead = true;
+ dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
+ } else {
+ m_filestore_journal_parallel = true;
+ dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
+ }
+ } else {
+ if (m_filestore_journal_writeahead)
+ dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_parallel)
+ dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl;
+ if (m_filestore_journal_trailing)
+ dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl;
+ }
+ if (m_filestore_journal_writeahead)
+ journal->set_wait_on_full(true);
+ } else {
+ dout(0) << __FUNC__ << ": no journal" << dendl;
+ }
+
+ ret = _sanity_check_fs();
+ if (ret) {
+ derr << __FUNC__ << ": _sanity_check_fs failed with error "
+ << ret << dendl;
+ goto close_current_fd;
+ }
+
+ // Cleanup possibly invalid collections
+ {
+ vector<coll_t> collections;
+ ret = list_collections(collections, true);
+ if (ret < 0) {
+ derr << "Error " << ret << " while listing collections" << dendl;
+ goto close_current_fd;
+ }
+ for (vector<coll_t>::iterator i = collections.begin();
+ i != collections.end();
+ ++i) {
+ Index index;
+ ret = get_index(*i, &index);
+ if (ret < 0) {
+ derr << "Unable to mount index " << *i
+ << " with error: " << ret << dendl;
+ goto close_current_fd;
+ }
+ ceph_assert(index.index);
+ std::unique_lock l{(index.index)->access_lock};
+
+ index->cleanup();
+ }
+ }
+ if (!m_disable_wbthrottle) {
+ wbthrottle.start();
+ } else {
+ dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl;
+ if (cct->_conf->filestore_odsync_write) {
+ dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl;
+ }
+ }
+ sync_thread.create("filestore_sync");
+
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
+ ret = journal_replay(initial_op_seq);
+ if (ret < 0) {
+ derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
+ if (ret == -ENOTTY) {
+ derr << "maybe journal is not pointing to a block device and its size "
+ << "wasn't configured?" << dendl;
+ }
+
+ goto stop_sync;
+ }
+ }
+
+ {
+ stringstream err2;
+ if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) {
+ derr << err2.str() << dendl;
+ ret = -EINVAL;
+ goto stop_sync;
+ }
+ }
+
+ init_temp_collections();
+
+ journal_start();
+
+ op_tp.start();
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->start();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->start();
+ }
+
+ timer.init();
+
+ // upgrade?
+ if (cct->_conf->filestore_update_to >= (int)get_target_version()) {
+ int err = upgrade();
+ if (err < 0) {
+ derr << "error converting store" << dendl;
+ umount();
+ return err;
+ }
+ }
+
+ // all okay.
+ return 0;
+
+stop_sync:
+ // stop sync thread
+ {
+ std::lock_guard l{lock};
+ stop = true;
+ sync_cond.notify_all();
+ }
+ sync_thread.join();
+ if (!m_disable_wbthrottle) {
+ wbthrottle.stop();
+ }
+close_current_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+close_basedir_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+close_fsid_fd:
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+done:
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ delete backend;
+ backend = nullptr;
+ object_map.reset();
+ return ret;
+}
+
+void FileStore::init_temp_collections()
+{
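+ // create a missing <cid>_temp collection for every regular collection and
+ // remove any stray temp collections left over from a previous run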
+ dout(10) << __FUNC__ << dendl;
+ vector<coll_t> ls;
+ int r = list_collections(ls, true);
+ ceph_assert(r >= 0);
+
+ dout(20) << " ls " << ls << dendl;
+
+ SequencerPosition spos;
+
+ set<coll_t> temps;
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p)
+ if (p->is_temp())
+ temps.insert(*p);
+ dout(20) << " temps " << temps << dendl;
+
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ if (p->is_temp())
+ continue;
+ coll_map[*p] = ceph::make_ref<OpSequencer>(cct, ++next_osr_id, *p);
+ if (p->is_meta())
+ continue;
+ coll_t temp = p->get_temp();
+ if (temps.count(temp)) {
+ temps.erase(temp);
+ } else {
+ dout(10) << __FUNC__ << ": creating " << temp << dendl;
+ r = _create_collection(temp, 0, spos);
+ ceph_assert(r == 0);
+ }
+ }
+
+ for (set<coll_t>::iterator p = temps.begin(); p != temps.end(); ++p) {
+ dout(10) << __FUNC__ << ": removing stray " << *p << dendl;
+ r = _collection_remove_recursive(*p, spos);
+ ceph_assert(r == 0);
+ }
+}
+
+int FileStore::umount()
+{
+ dout(5) << __FUNC__ << ": " << basedir << dendl;
+
+ flush();
+ sync();
+ do_force_sync();
+
+ {
+ std::lock_guard l(coll_lock);
+ coll_map.clear();
+ }
+
+ {
+ std::lock_guard l{lock};
+ stop = true;
+ sync_cond.notify_all();
+ }
+ sync_thread.join();
+ if (!m_disable_wbthrottle){
+ wbthrottle.stop();
+ }
+ op_tp.stop();
+
+ journal_stop();
+ if (!(generic_flags & SKIP_JOURNAL_REPLAY))
+ journal_write_close();
+
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->stop();
+ }
+
+ if (vdo_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
+ vdo_fd = -1;
+ }
+ if (fsid_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+ }
+ if (op_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+ op_fd = -1;
+ }
+ if (current_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+ current_fd = -1;
+ }
+ if (basedir_fd >= 0) {
+ VOID_TEMP_FAILURE_RETRY(::close(basedir_fd));
+ basedir_fd = -1;
+ }
+
+ force_sync = false;
+
+ delete backend;
+ backend = nullptr;
+
+ object_map.reset();
+
+ {
+ std::lock_guard l{sync_entry_timeo_lock};
+ timer.shutdown();
+ }
+
+ // nothing
+ return 0;
+}
+
+
+/// -----------------------------
+
+// keep OpSequencer handles alive for all time so that a sequence
+// that removes a collection and creates a new one will not allow
+// two sequencers for the same collection to be alive at once.
+
+ObjectStore::CollectionHandle FileStore::open_collection(const coll_t& c)
+{
+ std::lock_guard l{coll_lock};
+ auto p = coll_map.find(c);
+ if (p == coll_map.end()) {
+ return CollectionHandle();
+ }
+ return p->second;
+}
+
+ObjectStore::CollectionHandle FileStore::create_new_collection(const coll_t& c)
+{
+ std::lock_guard l{coll_lock};
+ auto p = coll_map.find(c);
+ if (p == coll_map.end()) {
+ auto r = ceph::make_ref<OpSequencer>(cct, ++next_osr_id, c);
+ coll_map[c] = r;
+ return r;
+ } else {
+ return p->second;
+ }
+}
+
+
+/// -----------------------------
+
+FileStore::Op *FileStore::build_op(vector<Transaction>& tls,
+ Context *onreadable,
+ Context *onreadable_sync,
+ TrackedOpRef osd_op)
+{
+ uint64_t bytes = 0, ops = 0;
+ for (vector<Transaction>::iterator p = tls.begin();
+ p != tls.end();
+ ++p) {
+ bytes += (*p).get_num_bytes();
+ ops += (*p).get_num_ops();
+ }
+
+ Op *o = new Op;
+ o->start = ceph_clock_now();
+ o->tls = std::move(tls);
+ o->onreadable = onreadable;
+ o->onreadable_sync = onreadable_sync;
+ o->ops = ops;
+ o->bytes = bytes;
+ o->osd_op = osd_op;
+ return o;
+}
+
+
+
+void FileStore::queue_op(OpSequencer *osr, Op *o)
+{
+ // queue op on sequencer, then queue sequencer for the threadpool,
+ // so that regardless of which order the threads pick up the
+ // sequencer, the op order will be preserved.
+
+ osr->queue(o);
+ o->trace.event("queued");
+
+ logger->inc(l_filestore_ops);
+ logger->inc(l_filestore_bytes, o->bytes);
+
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op
+ << " " << *osr
+ << " " << o->bytes << " bytes"
+ << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)"
+ << dendl;
+ op_wq.queue(osr);
+}
+
+void FileStore::op_queue_reserve_throttle(Op *o)
+{
+ throttle_ops.get();
+ throttle_bytes.get(o->bytes);
+
+ logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+ logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::op_queue_release_throttle(Op *o)
+{
+ throttle_ops.put();
+ throttle_bytes.put(o->bytes);
+ logger->set(l_filestore_op_queue_ops, throttle_ops.get_current());
+ logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current());
+}
+
+void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
+{
+ if (!m_disable_wbthrottle) {
+ wbthrottle.throttle();
+ }
+ // inject a stall?
+ if (cct->_conf->filestore_inject_stall) {
+ int orig = cct->_conf->filestore_inject_stall;
+ dout(5) << __FUNC__ << ": filestore_inject_stall " << orig << ", sleeping" << dendl;
+ sleep(orig);
+ cct->_conf.set_val("filestore_inject_stall", "0");
+ dout(5) << __FUNC__ << ": done stalling" << dendl;
+ }
+
+ osr->apply_lock.lock();
+ Op *o = osr->peek_queue();
+ o->trace.event("op_apply_start");
+ apply_manager.op_apply_start(o->op);
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " start" << dendl;
+ o->trace.event("_do_transactions start");
+ int r = _do_transactions(o->tls, o->op, &handle, osr->osr_name);
+ o->trace.event("op_apply_finish");
+ apply_manager.op_apply_finish(o->op);
+ dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " r = " << r
+ << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl;
+}
+
+void FileStore::_finish_op(OpSequencer *osr)
+{
+ list<Context*> to_queue;
+ Op *o = osr->dequeue(&to_queue);
+
+ o->tls.clear();
+
+ utime_t lat = ceph_clock_now();
+ lat -= o->start;
+
+ dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " lat " << lat << dendl;
+ osr->apply_lock.unlock(); // locked in _do_op
+ o->trace.event("_finish_op");
+
+ // called with tp lock held
+ op_queue_release_throttle(o);
+
+ logger->tinc(l_filestore_apply_latency, lat);
+
+ if (o->onreadable_sync) {
+ o->onreadable_sync->complete(0);
+ }
+ if (o->onreadable) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
+ }
+ if (!to_queue.empty()) {
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
+ }
+ delete o;
+ o = nullptr;
+}
+
+struct C_JournaledAhead : public Context {
+ FileStore *fs;
+ FileStore::OpSequencer *osr;
+ FileStore::Op *o;
+ Context *ondisk;
+
+ C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk):
+ fs(f), osr(os), o(o), ondisk(ondisk) { }
+ void finish(int r) override {
+ fs->_journaled_ahead(osr, o, ondisk);
+ }
+};
+
+int FileStore::queue_transactions(CollectionHandle& ch, vector<Transaction>& tls,
+ TrackedOpRef osd_op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+
+ if (cct->_conf->objectstore_blackhole) {
+ dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction"
+ << dendl;
+ delete ondisk;
+ ondisk = nullptr;
+ delete onreadable;
+ onreadable = nullptr;
+ delete onreadable_sync;
+ onreadable_sync = nullptr;
+ return 0;
+ }
+
+ utime_t start = ceph_clock_now();
+
+ OpSequencer *osr = static_cast<OpSequencer*>(ch.get());
+ dout(5) << __FUNC__ << ": osr " << osr << " " << *osr << dendl;
+
+ ZTracer::Trace trace;
+ if (osd_op && osd_op->pg_trace) {
+ osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace);
+ trace = osd_op->store_trace;
+ }
+
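+ // writeahead/parallel journal modes: encode the transactions, take the op
+ // and journal throttles, then submit to the journal (and, for parallel mode,
+ // queue for apply at the same time)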
+ if (journal && journal->is_writeable() && !m_filestore_journal_trailing) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+
+ // prepare and encode the transaction data outside the lock
+ bufferlist tbl;
+ int orig_len = journal->prepare_entry(o->tls, &tbl);
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+ journal->reserve_throttle_and_backoff(tbl.length());
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+ trace.keyval("opnum", op_num);
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ if (m_filestore_journal_parallel) {
+ dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl;
+
+ trace.keyval("journal mode", "parallel");
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
+
+ // queue inside submit_manager op submission lock
+ queue_op(osr, o);
+ trace.event("op queued");
+ } else if (m_filestore_journal_writeahead) {
+ dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl;
+
+ osr->queue_journal(o);
+
+ trace.keyval("journal mode", "writeahead");
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, o->op,
+ new C_JournaledAhead(this, osr, o, ondisk),
+ osd_op);
+ } else {
+ ceph_abort();
+ }
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
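+ // no journal configured: queue the op for apply directly; the ondisk
+ // callback is completed by the next filesystem sync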
+ if (!journal) {
+ Op *o = build_op(tls, onreadable, onreadable_sync, osd_op);
+ dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl;
+
+ if (handle)
+ handle->suspend_tp_timeout();
+
+ op_queue_reserve_throttle(o);
+
+ if (handle)
+ handle->reset_tp_timeout();
+
+ uint64_t op_num = submit_manager.op_submit_start();
+ o->op = op_num;
+
+ if (m_filestore_do_dump)
+ dump_transactions(o->tls, o->op, osr);
+
+ queue_op(osr, o);
+ trace.keyval("opnum", op_num);
+ trace.keyval("journal mode", "none");
+ trace.event("op queued");
+
+ if (ondisk)
+ apply_manager.add_waiter(op_num, ondisk);
+ submit_manager.op_submit_finish(op_num);
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return 0;
+ }
+
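+ // trailing journal mode: apply the transactions to the filesystem first,
+ // then write them to the journal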
+ ceph_assert(journal);
+ // prepare and encode the transaction data outside the lock
+ bufferlist tbl;
+ int orig_len = -1;
+ if (journal->is_writeable()) {
+ orig_len = journal->prepare_entry(tls, &tbl);
+ }
+ uint64_t op = submit_manager.op_submit_start();
+ dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl;
+
+ if (m_filestore_do_dump)
+ dump_transactions(tls, op, osr);
+
+ trace.event("op_apply_start");
+ trace.keyval("opnum", op);
+ trace.keyval("journal mode", "trailing");
+ apply_manager.op_apply_start(op);
+ trace.event("do_transactions");
+ int r = do_transactions(tls, op);
+
+ if (r >= 0) {
+ trace.event("journal started");
+ _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
+ } else {
+ delete ondisk;
+ ondisk = nullptr;
+ }
+
+ // start on_readable finisher after we queue journal item, as on_readable callback
+ // is allowed to delete the Transaction
+ if (onreadable_sync) {
+ onreadable_sync->complete(r);
+ }
+ apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
+
+ submit_manager.op_submit_finish(op);
+ trace.event("op_apply_finish");
+ apply_manager.op_apply_finish(op);
+
+ utime_t end = ceph_clock_now();
+ logger->tinc(l_filestore_queue_transaction_latency_avg, end - start);
+ return r;
+}
+
+void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
+{
+ dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl;
+
+ o->trace.event("writeahead journal finished");
+
+ // this should queue in order because the journal does its completions in order.
+ queue_op(osr, o);
+
+ list<Context*> to_queue;
+ osr->dequeue_journal(&to_queue);
+
+ // do ondisk completions async, to prevent any onreadable_sync completions
+ // getting blocked behind an ondisk completion.
+ if (ondisk) {
+ dout(10) << " queueing ondisk " << ondisk << dendl;
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
+ }
+ if (!to_queue.empty()) {
+ ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
+ }
+}
+
+int FileStore::_do_transactions(
+ vector<Transaction> &tls,
+ uint64_t op_seq,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name)
+{
+ int trans_num = 0;
+
+ for (vector<Transaction>::iterator p = tls.begin();
+ p != tls.end();
+ ++p, trans_num++) {
+ _do_transaction(*p, op_seq, trans_num, handle, osr_name);
+ if (handle)
+ handle->reset_tp_timeout();
+ }
+
+ return 0;
+}
+
+void FileStore::_set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ // sync all previous operations on this sequencer
+ int ret = object_map->sync();
+ if (ret < 0) {
+ derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+ ret = sync_filesystem(basedir_fd);
+ if (ret < 0) {
+ derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_set_global_replay_guard failed");
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v;
+ encode(spos, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
+ << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
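+// returns 1 if replay may proceed (collection missing, no guard xattr, or
+// spos is at/after the recorded guard), -1 if the guard is ahead of spos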
+int FileStore::_check_global_replay_guard(const coll_t& cid,
+ const SequencerPosition& spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": no xattr" << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ auto p = bl.cbegin();
+ decode(opos, p);
+
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return spos >= opos ? 1 : -1;
+}
+
+
+void FileStore::_set_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos,
+ bool in_progress=false)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_set_replay_guard failed");
+ }
+ _set_replay_guard(fd, spos, 0, in_progress);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+
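+// record (spos, in_progress) in the REPLAY_GUARD xattr on fd, fsyncing before
+// and after so the guard is durable relative to the guarded update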
+void FileStore::_set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *hoid,
+ bool in_progress)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl;
+
+ _inject_failure();
+
+ // first make sure the previous operation commits
+ int r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ if (!in_progress) {
+ // sync object_map too. even if this object has no header or keys now,
+ // it may have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+ }
+
+ _inject_failure();
+
+ // then record that we did it
+ bufferlist v(40);
+ encode(spos, v);
+ encode(in_progress, v);
+ r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+void FileStore::_close_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ int err = errno;
+ derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("_close_replay_guard failed");
+ }
+ _close_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *hoid)
+{
+ if (backend->can_checkpoint())
+ return;
+
+ dout(10) << __FUNC__ << ": " << spos << dendl;
+
+ _inject_failure();
+
+ // sync object_map too. even if this object has no header or keys now,
+ // it may have had them in the past and then removed them, so always
+ // sync.
+ object_map->sync(hoid, &spos);
+
+ // then record that we are done with this operation
+ bufferlist v(40);
+ encode(spos, v);
+ bool in_progress = false;
+ encode(in_progress, v);
+ int r = chain_fsetxattr<true, true>(
+ fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+ if (r < 0) {
+ derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
+ ceph_abort_msg("fsetxattr failed");
+ }
+
+ // and make sure our xattr is durable.
+ r = ::fsync(fd);
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ _inject_failure();
+
+ dout(10) << __FUNC__ << ": " << spos << " done" << dendl;
+}
+
+int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid,
+ const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ int r = _check_global_replay_guard(cid, spos);
+ if (r < 0)
+ return r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl;
+ return 1; // if file does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(**fd, spos);
+ lfn_close(fd);
+ return ret;
+}
+
+int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char fn[PATH_MAX];
+ get_cdir(cid, fn, sizeof(fn));
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ dout(10) << __FUNC__ << ": " << cid << " dne" << dendl;
+ return 1; // if collection does not exist, there is no guard, and we can replay.
+ }
+ int ret = _check_replay_guard(fd, spos);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return ret;
+}
+
+int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos)
+{
+ if (!replaying || backend->can_checkpoint())
+ return 1;
+
+ char buf[100];
+ int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf));
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": no xattr" << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return 1; // no xattr
+ }
+ bufferlist bl;
+ bl.append(buf, r);
+
+ SequencerPosition opos;
+ auto p = bl.cbegin();
+ decode(opos, p);
+ bool in_progress = false;
+ if (!p.end()) // older journals don't have this
+ decode(in_progress, p);
+ if (opos > spos) {
+ dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos
+ << ", now or in future, SKIPPING REPLAY" << dendl;
+ return -1;
+ } else if (opos == spos) {
+ if (in_progress) {
+ dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+ << ", in_progress=true, CONDITIONAL REPLAY" << dendl;
+ return 0;
+ } else {
+ dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos
+ << ", in_progress=false, SKIPPING REPLAY" << dendl;
+ return -1;
+ }
+ } else {
+ dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos
+ << ", in past, will replay" << dendl;
+ return 1;
+ }
+}
+
+void FileStore::_do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name)
+{
+ dout(10) << __FUNC__ << ": on " << &t << dendl;
+
+ Transaction::iterator i = t.begin();
+
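+ // the SequencerPosition identifies where this transaction sits in the op
+ // sequence; replay guards compare against it to skip ops that were already
+ // applied before a crash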
+ SequencerPosition spos(op_seq, trans_num, 0);
+ while (i.have_op()) {
+ if (handle)
+ handle->reset_tp_timeout();
+
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ _inject_failure();
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_TOUCH:
+ case Transaction::OP_CREATE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, touch_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _touch(cid, oid);
+ tracepoint(objectstore, touch_exit, r);
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, write_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ tracepoint(objectstore, write_exit, r);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, zero_enter, osr_name, off, len);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _zero(cid, oid, off, len);
+ tracepoint(objectstore, zero_exit, r);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ tracepoint(objectstore, truncate_enter, osr_name, off);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _truncate(cid, oid, off);
+ tracepoint(objectstore, truncate_exit, r);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, remove_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _remove(cid, oid, spos);
+ tracepoint(objectstore, remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, setattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0) {
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set, spos);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid
+ << " name " << name << " size " << bl.length() << dendl;
+ }
+ tracepoint(objectstore, setattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, setattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _setattrs(cid, oid, aset, spos);
+ tracepoint(objectstore, setattrs_exit, r);
+ if (r == -ENOSPC)
+ dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl;
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string name = i.decode_string();
+ tracepoint(objectstore, rmattr_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattr(cid, oid, name.c_str(), spos);
+ tracepoint(objectstore, rmattr_exit, r);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, rmattrs_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _rmattrs(cid, oid, spos);
+ tracepoint(objectstore, rmattrs_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ tracepoint(objectstore, clone_enter, osr_name);
+ r = _clone(cid, oid, noid, spos);
+ tracepoint(objectstore, clone_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+ _cid : _cid.get_temp();
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ tracepoint(objectstore, clone_range_enter, osr_name, len);
+ r = _clone_range(cid, oid, ncid, noid, off, len, off, spos);
+ tracepoint(objectstore, clone_range_exit, r);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const ghobject_t &noid = i.get_oid(op->dest_oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ?
+ _cid : _cid.get_temp();
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ tracepoint(objectstore, clone_range2_enter, osr_name, len);
+ r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos);
+ tracepoint(objectstore, clone_range2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ tracepoint(objectstore, mkcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _create_collection(cid, op->split_bits, spos);
+ tracepoint(objectstore, mkcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ int bits = op->split_bits;
+ r = _collection_set_bits(cid, bits);
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ uint32_t type = op->hint;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ if (_check_replay_guard(cid, spos) > 0) {
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos);
+ }
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ const coll_t &cid = i.get_cid(op->cid);
+ tracepoint(objectstore, rmcoll_enter, osr_name);
+ if (_check_replay_guard(cid, spos) > 0)
+ r = _destroy_collection(cid);
+ tracepoint(objectstore, rmcoll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ const coll_t &ocid = i.get_cid(op->cid);
+ const coll_t &ncid = i.get_cid(op->dest_cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+
+ ceph_assert(oid.hobj.pool >= -1);
+
+ // always followed by OP_COLL_REMOVE
+ Transaction::Op *op2 = i.decode_op();
+ const coll_t &ocid2 = i.get_cid(op2->cid);
+ const ghobject_t &oid2 = i.get_oid(op2->oid);
+ ceph_assert(op2->op == Transaction::OP_COLL_REMOVE);
+ ceph_assert(ocid2 == ocid);
+ ceph_assert(oid2 == oid);
+
+ tracepoint(objectstore, coll_add_enter);
+ r = _collection_add(ncid, ocid, oid, spos);
+ tracepoint(objectstore, coll_add_exit, r);
+ spos.op++;
+ if (r < 0)
+ break;
+ tracepoint(objectstore, coll_remove_enter, osr_name);
+ if (_check_replay_guard(ocid, oid, spos) > 0)
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_remove_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ {
+ // WARNING: this is deprecated and buggy; only here to replay old journals.
+ const coll_t &ocid = i.get_cid(op->cid);
+ const coll_t &ncid = i.get_cid(op->dest_cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ tracepoint(objectstore, coll_move_enter);
+ r = _collection_add(ocid, ncid, oid, spos);
+ if (r == 0 &&
+ (_check_replay_guard(ocid, oid, spos) > 0))
+ r = _remove(ocid, oid, spos);
+ tracepoint(objectstore, coll_move_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ const coll_t &_oldcid = i.get_cid(op->cid);
+ const ghobject_t &oldoid = i.get_oid(op->oid);
+ const coll_t &_newcid = i.get_cid(op->dest_cid);
+ const ghobject_t &newoid = i.get_oid(op->dest_oid);
+ const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ?
+ _oldcid : _oldcid.get_temp();
+ const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ?
+          _newcid : _newcid.get_temp();
+ tracepoint(objectstore, coll_move_rename_enter);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos);
+ tracepoint(objectstore, coll_move_rename_exit, r);
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oldoid = i.get_oid(op->oid);
+ const ghobject_t &newoid = i.get_oid(op->dest_oid);
+ const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ?
+ _cid : _cid.get_temp();
+ const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, coll_try_rename_enter);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true);
+ tracepoint(objectstore, coll_try_rename_exit, r);
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ case Transaction::OP_COLL_RMATTR:
+ ceph_abort_msg("collection attr methods no longer implemented");
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ r = -EOPNOTSUPP;
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ tracepoint(objectstore, omap_clear_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_clear(cid, oid, spos);
+ tracepoint(objectstore, omap_clear_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ map<string, bufferlist> aset;
+ i.decode_attrset(aset);
+ tracepoint(objectstore, omap_setkeys_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setkeys(cid, oid, aset, spos);
+ tracepoint(objectstore, omap_setkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ set<string> keys;
+ i.decode_keyset(keys);
+ tracepoint(objectstore, omap_rmkeys_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeys(cid, oid, keys, spos);
+ tracepoint(objectstore, omap_rmkeys_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ tracepoint(objectstore, omap_rmkeyrange_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_rmkeyrange(cid, oid, first, last, spos);
+ tracepoint(objectstore, omap_rmkeyrange_exit, r);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ bufferlist bl;
+ i.decode_bl(bl);
+ tracepoint(objectstore, omap_setheader_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _omap_setheader(cid, oid, bl, spos);
+ tracepoint(objectstore, omap_setheader_exit, r);
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ {
+ ceph_abort_msg("not legacy journal; upgrade to firefly first");
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ tracepoint(objectstore, split_coll2_enter, osr_name);
+ r = _split_collection(cid, bits, rem, dest, spos);
+ tracepoint(objectstore, split_coll2_exit, r);
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ tracepoint(objectstore, merge_coll_enter, osr_name);
+ r = _merge_collection(cid, bits, dest, spos);
+ tracepoint(objectstore, merge_coll_exit, r);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ const coll_t &_cid = i.get_cid(op->cid);
+ const ghobject_t &oid = i.get_oid(op->oid);
+ const coll_t &cid = !_need_temp_object_collection(_cid, oid) ?
+ _cid : _cid.get_temp();
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ tracepoint(objectstore, setallochint_enter, osr_name);
+ if (_check_replay_guard(cid, oid, spos) > 0)
+ r = _set_alloc_hint(cid, oid, expected_object_size,
+ expected_write_size);
+ tracepoint(objectstore, setallochint_exit, r);
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
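+    // Error policy: tolerate errors the code knows are benign -- ENOENT on
+    // ops that may legitimately target a missing object, ENODATA, advisory
+    // OP_SETALLOCHINT failures, and EEXIST/ERANGE/ENOENT seen while replaying
+    // the journal without filesystem checkpoints.  Anything else dumps the
+    // transaction as JSON and aborts.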
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD ||
+ op->op == Transaction::OP_SETATTR ||
+ op->op == Transaction::OP_SETATTRS ||
+ op->op == Transaction::OP_RMATTR ||
+ op->op == Transaction::OP_OMAP_SETKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYS ||
+ op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+ op->op == Transaction::OP_OMAP_SETHEADER))
+ // -ENOENT is normally okay
+ // ...including on a replayed OP_RMCOLL with checkpoint mode
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (op->op == Transaction::OP_SETALLOCHINT)
+ // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most
+ // cases means invalid hint size (e.g. too big, not a multiple
+ // of block size, etc) or, at least on xfs, an attempt to set
+ // or change it when the file is not empty. However,
+ // OP_SETALLOCHINT is advisory, so ignore all errors.
+ ok = true;
+
+ if (replaying && !backend->can_checkpoint()) {
+ if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) {
+ dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl;
+ ok = true;
+ }
+ if (r == -ERANGE) {
+ dout(10) << "tolerating ERANGE on replay" << dendl;
+ ok = true;
+ }
+ if (r == -ENOENT) {
+ dout(10) << "tolerating ENOENT on replay" << dendl;
+ ok = true;
+ }
+ }
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2)) {
+ msg = "ENOENT on clone suggests osd bug";
+ } else if (r == -ENOSPC) {
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from disk filesystem, misconfigured cluster";
+ } else if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ } else if (r == -EPERM) {
+ msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption";
+ }
+
+ derr << " error " << cpp_strerror(r) << " not handled on operation " << op
+ << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ if (r == -EMFILE) {
+ dump_open_fds(cct);
+ }
+
+ ceph_abort_msg("unexpected error");
+ }
+ }
+
+ spos.op++;
+ }
+
+ _inject_failure();
+}
+
+ /*********************************************/
+
+
+
+// --------------------
+// objects
+
+bool FileStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+ tracepoint(objectstore, exists_enter, ch->cid.c_str());
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ struct stat st;
+ bool retval = stat(ch, oid, &st) == 0;
+ tracepoint(objectstore, exists_exit, retval);
+ return retval;
+}
+
+int FileStore::stat(
+ CollectionHandle& ch, const ghobject_t& oid, struct stat *st, bool allow_eio)
+{
+ tracepoint(objectstore, stat_enter, ch->cid.c_str());
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ int r = lfn_stat(cid, oid, st);
+ ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+ << " = " << r << dendl;
+ } else {
+ dout(10) << __FUNC__ << ": " << ch->cid << "/" << oid
+ << " = " << r
+ << " (size " << st->st_size << ")" << dendl;
+ }
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, stat_exit, r);
+ return r;
+ }
+}
+
+int FileStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
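+// Read offset~len from an object.  offset == 0 && len == 0 means "read the
+// whole object"; client fadvise flags are forwarded to posix_fadvise when
+// available, and sloppy-CRC tracking (if enabled) verifies the read against
+// the CRCs recorded at write time.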
+int FileStore::read(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ int got;
+ tracepoint(objectstore, read_enter, ch->cid.c_str(), offset, len);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (offset == 0 && len == 0) {
+ struct stat st;
+ memset(&st, 0, sizeof(struct stat));
+ int r = ::fstat(**fd, &st);
+ ceph_assert(r == 0);
+ len = st.st_size;
+ }
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM);
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL);
+#endif
+
+ bufferptr bptr(len); // prealloc space for entire read
+ got = safe_pread(**fd, bptr.c_str(), len, offset);
+ if (got < 0) {
+ dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl;
+ lfn_close(fd);
+ return got;
+ }
+ bptr.set_length(got); // properly size the buffer
+ bl.clear();
+ bl.push_back(std::move(bptr)); // put it in the target bufferlist
+
+#ifdef HAVE_POSIX_FADVISE
+ if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED);
+ if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL))
+ posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL);
+#endif
+
+ if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) {
+ ostringstream ss;
+ int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss);
+ if (errors != 0) {
+ dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+ << got << " ... BAD CRC:\n" << ss.str() << dendl;
+ ceph_abort_msg("bad crc on read");
+ }
+ }
+
+ lfn_close(fd);
+
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~"
+ << got << "/" << len << dendl;
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_data_eio(oid)) {
+ return -EIO;
+ } else if (oid.hobj.pool > 0 && /* FIXME, see #23029 */
+ cct->_conf->filestore_debug_random_read_err &&
+ (rand() % (int)(cct->_conf->filestore_debug_random_read_err *
+ 100.0)) == 0) {
+ dout(0) << __func__ << ": inject random EIO" << dendl;
+ return -EIO;
+ } else {
+ tracepoint(objectstore, read_exit, got);
+ return got;
+ }
+}
+
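+// Build an offset -> length map of allocated extents in [offset, offset+len)
+// by issuing FIEMAP repeatedly: contiguous extents are merged, results are
+// clamped to the requested range, and the query resumes past the last extent
+// until FIEMAP_EXTENT_LAST is seen or the range is exhausted.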
+int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+ uint64_t i;
+ struct fiemap_extent *extent = nullptr;
+ struct fiemap *fiemap = nullptr;
+ int r = 0;
+
+more:
+ r = backend->do_fiemap(fd, offset, len, &fiemap);
+ if (r < 0)
+ return r;
+
+ if (fiemap->fm_mapped_extents == 0) {
+ free(fiemap);
+ return r;
+ }
+
+ extent = &fiemap->fm_extents[0];
+
+ /* start where we were asked to start */
+ if (extent->fe_logical < offset) {
+ extent->fe_length -= offset - extent->fe_logical;
+ extent->fe_logical = offset;
+ }
+
+ i = 0;
+
+ struct fiemap_extent *last = nullptr;
+ while (i < fiemap->fm_mapped_extents) {
+ struct fiemap_extent *next = extent + 1;
+
+ dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents
+ << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl;
+
+ /* try to merge extents */
+ while ((i < fiemap->fm_mapped_extents - 1) &&
+ (extent->fe_logical + extent->fe_length == next->fe_logical)) {
+ next->fe_length += extent->fe_length;
+ next->fe_logical = extent->fe_logical;
+ extent = next;
+ next = extent + 1;
+ i++;
+ }
+
+ if (extent->fe_logical + extent->fe_length > offset + len)
+ extent->fe_length = offset + len - extent->fe_logical;
+ (*m)[extent->fe_logical] = extent->fe_length;
+ i++;
+ last = extent++;
+ }
+ uint64_t xoffset = last->fe_logical + last->fe_length - offset;
+ offset = last->fe_logical + last->fe_length;
+ len -= xoffset;
+ const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0);
+ free(fiemap);
+ if (!is_last) {
+ goto more;
+ }
+
+ return r;
+}
+
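+// Same idea as _do_fiemap, but using lseek(SEEK_DATA)/lseek(SEEK_HOLE) to
+// walk the data regions; on platforms without SEEK_HOLE support the whole
+// range is reported as data.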
+int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ map<uint64_t, uint64_t> *m)
+{
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ off_t hole_pos, data_pos;
+ int r = 0;
+
+ // If lseek fails with errno setting to be ENXIO, this means the current
+ // file offset is beyond the end of the file.
+ off_t start = offset;
+ while(start < (off_t)(offset + len)) {
+ data_pos = lseek(fd, start, SEEK_DATA);
+ if (data_pos < 0) {
+ if (errno == ENXIO)
+ break;
+ else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ } else if (data_pos > (off_t)(offset + len)) {
+ break;
+ }
+
+ hole_pos = lseek(fd, data_pos, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == ENXIO) {
+ break;
+ } else {
+ r = -errno;
+ dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ if (hole_pos >= (off_t)(offset + len)) {
+ (*m)[data_pos] = offset + len - data_pos;
+ break;
+ }
+ (*m)[data_pos] = hole_pos - data_pos;
+ start = hole_pos;
+ }
+
+ return r;
+#else
+ (*m)[offset] = len;
+ return 0;
+#endif
+}
+
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ bufferlist& bl)
+{
+ map<uint64_t, uint64_t> exomap;
+ int r = fiemap(ch, oid, offset, len, exomap);
+ if (r >= 0) {
+ encode(exomap, bl);
+ }
+ return r;
+}
+
+int FileStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ map<uint64_t, uint64_t>& destmap)
+{
+ tracepoint(objectstore, fiemap_enter, ch->cid.c_str(), offset, len);
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ destmap.clear();
+
+ if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) ||
+ len <= (size_t)m_filestore_fiemap_threshold) {
+ destmap[offset] = len;
+ return 0;
+ }
+
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+    dout(10) << __FUNC__ << ": couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl;
+ goto done;
+ }
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_seek_hole_data(**fd, offset, len, &destmap);
+ } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ r = _do_fiemap(**fd, offset, len, &destmap);
+ }
+
+ lfn_close(fd);
+
+done:
+
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, fiemap_exit, r);
+ return r;
+}
+
+int FileStore::_remove(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+ int r = lfn_unlink(cid, oid, spos);
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl;
+ int r = lfn_truncate(cid, oid, size);
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ return r;
+ } else {
+ lfn_close(fd);
+ }
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len,
+ const bufferlist& bl, uint32_t fadvise_flags)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int r;
+
+ FDRef fd;
+ r = lfn_open(cid, oid, true, &fd);
+ if (r < 0) {
+ dout(0) << __FUNC__ << ": couldn't open " << cid << "/"
+ << oid << ": "
+ << cpp_strerror(r) << dendl;
+ goto out;
+ }
+
+ // write
+ r = bl.write_fd(**fd, offset);
+ if (r < 0) {
+ derr << __FUNC__ << ": write_fd on " << cid << "/" << oid
+ << " error: " << cpp_strerror(r) << dendl;
+ lfn_close(fd);
+ goto out;
+ }
+ r = bl.length();
+
+ if (r >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_write(**fd, offset, len, bl);
+ ceph_assert(rc >= 0);
+ }
+
+ if (replaying || m_disable_wbthrottle) {
+ if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) {
+#ifdef HAVE_POSIX_FADVISE
+ posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED);
+#endif
+ }
+ } else {
+ wbthrottle.queue_wb(fd, oid, offset, len,
+ fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ }
+
+ lfn_close(fd);
+
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
+ return r;
+}
+
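+// Zero offset~len.  Prefer punching a hole with
+// fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE), extending the file
+// if the zeroed range grows it; if hole punching is unavailable or returns
+// EOPNOTSUPP, fall back to writing literal zeros via _write().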
+int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl;
+ int ret = 0;
+
+ if (cct->_conf->filestore_punch_hole) {
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ // first try to punch a hole.
+ FDRef fd;
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0) {
+ goto out;
+ }
+
+ struct stat st;
+ ret = ::fstat(**fd, &st);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+
+ // first try fallocate
+ ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ offset, len);
+ if (ret < 0) {
+ ret = -errno;
+ } else {
+ // ensure we extend file size, if needed
+ if (len > 0 && offset + len > (uint64_t)st.st_size) {
+ ret = ::ftruncate(**fd, offset + len);
+ if (ret < 0) {
+ ret = -errno;
+ lfn_close(fd);
+ goto out;
+ }
+ }
+ }
+ lfn_close(fd);
+
+ if (ret >= 0 && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_zero(**fd, offset, len);
+ ceph_assert(rc >= 0);
+ }
+
+ if (ret == 0)
+ goto out; // yay!
+ if (ret != -EOPNOTSUPP)
+ goto out; // some other error
+# endif
+# endif
+#endif
+ }
+
+ // lame, kernel is old and doesn't support it.
+ // write zeros.. yuck!
+ dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl;
+ {
+ bufferlist bl;
+ bl.append_zero(len);
+ ret = _write(cid, oid, offset, len, bl);
+ }
+
+#ifdef CEPH_HAVE_FALLOCATE
+# if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef FALLOC_FL_KEEP_SIZE
+ out:
+# endif
+# endif
+#endif
+ dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl;
+ return ret;
+}
+
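+// Clone oldoid to newoid within cid: truncate the destination, copy the data
+// with _do_clone_range(), clone the omap, then copy the inline xattrs and the
+// spill-out marker.  Clone is non-idempotent, so a replay guard is recorded
+// on the new object afterwards.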
+int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl;
+
+ if (_check_replay_guard(cid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ {
+ Index index;
+ r = lfn_open(cid, oldoid, false, &o, &index);
+ if (r < 0) {
+ goto out2;
+ }
+ ceph_assert(index.index);
+ std::unique_lock l{(index.index)->access_lock};
+
+ r = lfn_open(cid, newoid, true, &n, &index);
+ if (r < 0) {
+ goto out;
+ }
+ r = ::ftruncate(**n, 0);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+ struct stat st;
+ r = ::fstat(**o, &st);
+ if (r < 0) {
+ r = -errno;
+ goto out3;
+ }
+
+ r = _do_clone_range(**o, **n, 0, st.st_size, 0);
+ if (r < 0) {
+ goto out3;
+ }
+
+ dout(20) << "objectmap clone" << dendl;
+ r = object_map->clone(oldoid, newoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ goto out3;
+ }
+
+ {
+ char buf[2];
+ map<string, bufferptr> aset;
+ r = _fgetattrs(**o, aset);
+ if (r < 0)
+ goto out3;
+
+ r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ } else {
+ r = chain_fsetxattr<true, true>(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+ if (r < 0)
+ goto out3;
+
+ r = _fsetattrs(**n, aset);
+ if (r < 0)
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+}
+
+int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl;
+ return backend->clone_range(from, to, srcoff, len, dstoff);
+}
+
+int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ map<uint64_t, uint64_t> exomap;
+ // fiemap doesn't allow zero length
+ if (len == 0)
+ return 0;
+
+ if (backend->has_seek_data_hole()) {
+ dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl;
+ r = _do_seek_hole_data(from, srcoff, len, &exomap);
+ } else if (backend->has_fiemap()) {
+    dout(15) << "fiemap ioctl " << from << " " << srcoff << "~" << len << dendl;
+ r = _do_fiemap(from, srcoff, len, &exomap);
+ }
+
+
+ int64_t written = 0;
+ if (r < 0)
+ goto out;
+
+ for (map<uint64_t, uint64_t>::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) {
+ uint64_t it_off = miter->first - srcoff + dstoff;
+ r = _do_copy_range(from, to, miter->first, miter->second, it_off, true);
+ if (r < 0) {
+ derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second
+ << " to " << it_off << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ written += miter->second;
+ }
+
+ if (r >= 0) {
+ if (m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ ceph_assert(rc >= 0);
+ }
+ struct stat st;
+ r = ::fstat(to, &st);
+ if (r < 0) {
+ r = -errno;
+ derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ if (st.st_size < (int)(dstoff + len)) {
+ r = ::ftruncate(to, dstoff + len);
+ if (r < 0) {
+ r = -errno;
+ derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl;
+ goto out;
+ }
+ }
+ r = written;
+ }
+
+ out:
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
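+// Generic byte copy from one fd to another.  With splice support the data is
+// moved through a pipe in 64 KiB chunks without a userspace buffer; otherwise
+// it falls back to lseek64 + read/write.  A short source read is only
+// tolerated while replaying the journal.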
+int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc)
+{
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl;
+ int r = 0;
+ loff_t pos = srcoff;
+ loff_t end = srcoff + len;
+  int buflen = 4096 * 16; // 64 KiB chunks: the default pipe capacity (see fcntl F_GETPIPE_SZ)
+
+#ifdef CEPH_HAVE_SPLICE
+ if (backend->has_splice()) {
+ int pipefd[2];
+ if (pipe_cloexec(pipefd, 0) < 0) {
+ int e = errno;
+ derr << " pipe " << " got " << cpp_strerror(e) << dendl;
+ return -e;
+ }
+
+ loff_t dstpos = dstoff;
+ while (pos < end) {
+ int l = std::min<int>(end-pos, buflen);
+ r = safe_splice(from, &pos, pipefd[1], nullptr, l, SPLICE_F_NONBLOCK);
+ dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << __FUNC__ << ": got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+
+ r = safe_splice(pipefd[0], nullptr, to, &dstpos, r, 0);
+ dout(10) << " safe_splice write to " << to << " len " << r
+ << " got " << r << dendl;
+ if (r < 0) {
+ derr << __FUNC__ << ": write error at " << pos << "~"
+ << r << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ close(pipefd[0]);
+ close(pipefd[1]);
+ } else
+#endif
+ {
+ int64_t actual;
+
+ actual = ::lseek64(from, srcoff, SEEK_SET);
+ if (actual != (int64_t)srcoff) {
+ if (actual < 0)
+ r = -errno;
+ else
+ r = -EINVAL;
+ derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ actual = ::lseek64(to, dstoff, SEEK_SET);
+ if (actual != (int64_t)dstoff) {
+ if (actual < 0)
+ r = -errno;
+ else
+ r = -EINVAL;
+ derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ char buf[buflen];
+ while (pos < end) {
+ int l = std::min<int>(end-pos, buflen);
+ r = ::read(from, buf, l);
+ dout(25) << " read from " << pos << "~" << l << " got " << r << dendl;
+ if (r < 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ r = -errno;
+ derr << __FUNC__ << ": read error at " << pos << "~" << len
+ << ", " << cpp_strerror(r) << dendl;
+ break;
+ }
+ }
+ if (r == 0) {
+ // hrm, bad source range, wtf.
+ r = -ERANGE;
+ derr << __FUNC__ << ": got short read result at " << pos
+ << " of fd " << from << " len " << len << dendl;
+ break;
+ }
+ int op = 0;
+ while (op < r) {
+ int r2 = safe_write(to, buf+op, r-op);
+ dout(25) << " write to " << to << " len " << (r-op)
+ << " got " << r2 << dendl;
+ if (r2 < 0) {
+ r = r2;
+ derr << __FUNC__ << ": write error at " << pos << "~"
+ << r-op << ", " << cpp_strerror(r) << dendl;
+
+ break;
+ }
+ op += (r-op);
+ }
+ if (r < 0)
+ break;
+ pos += r;
+ }
+ }
+
+ if (r < 0 && replaying) {
+ ceph_assert(r == -ERANGE);
+ derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl;
+ r = len;
+ }
+ ceph_assert(replaying || pos == end);
+ if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) {
+ int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff);
+ ceph_assert(rc >= 0);
+ }
+ dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl;
+
+ if (_check_replay_guard(newcid, newoid, spos) < 0)
+ return 0;
+
+ int r;
+ FDRef o, n;
+ r = lfn_open(oldcid, oldoid, false, &o);
+ if (r < 0) {
+ goto out2;
+ }
+ r = lfn_open(newcid, newoid, true, &n);
+ if (r < 0) {
+ goto out;
+ }
+ r = _do_clone_range(**o, **n, srcoff, len, dstoff);
+ if (r < 0) {
+ goto out3;
+ }
+
+ // clone is non-idempotent; record our work.
+ _set_replay_guard(**n, spos, &newoid);
+
+ out3:
+ lfn_close(n);
+ out:
+ lfn_close(o);
+ out2:
+ dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " "
+ << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl;
+ return r;
+}
+
+class SyncEntryTimeout : public Context {
+public:
+ CephContext* cct;
+ explicit SyncEntryTimeout(CephContext* cct, int commit_timeo)
+ : cct(cct), m_commit_timeo(commit_timeo)
+ {
+ }
+
+ void finish(int r) override {
+ BackTrace *bt = new BackTrace(1);
+ generic_dout(-1) << "FileStore: sync_entry timed out after "
+ << m_commit_timeo << " seconds.\n";
+ bt->print(*_dout);
+ *_dout << dendl;
+ delete bt;
+ bt = nullptr;
+ ceph_abort();
+ }
+private:
+ int m_commit_timeo;
+};
+
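+// Background commit loop: wait between the min and max sync intervals (or
+// until force_sync), pause the op thread pool, then either take a filesystem
+// checkpoint (when the backend supports one) or sync the object_map and
+// syncfs() and persist the committed op_seq, trim old checkpoints, and wake
+// any sync waiters.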
+void FileStore::sync_entry()
+{
+ std::unique_lock l{lock};
+ while (!stop) {
+ auto min_interval = ceph::make_timespan(m_filestore_min_sync_interval);
+ auto max_interval = ceph::make_timespan(m_filestore_max_sync_interval);
+ auto startwait = ceph::real_clock::now();
+ if (!force_sync) {
+ dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl;
+ sync_cond.wait_for(l, max_interval);
+ } else {
+ dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl;
+ }
+
+ if (force_sync) {
+ dout(20) << __FUNC__ << ": force_sync set" << dendl;
+ force_sync = false;
+ } else if (stop) {
+ dout(20) << __FUNC__ << ": stop set" << dendl;
+ break;
+ } else {
+ // wait for at least the min interval
+ auto woke = ceph::real_clock::now() - startwait;
+ dout(20) << __FUNC__ << ": woke after " << woke << dendl;
+ if (woke < min_interval) {
+ auto t = min_interval - woke;
+ dout(20) << __FUNC__ << ": waiting for another " << t
+ << " to reach min interval " << min_interval << dendl;
+ sync_cond.wait_for(l, t);
+ }
+ }
+
+ list<Context*> fin;
+ again:
+ fin.swap(sync_waiters);
+ l.unlock();
+
+ op_tp.pause();
+ if (apply_manager.commit_start()) {
+ auto start = ceph::real_clock::now();
+ uint64_t cp = apply_manager.get_committing_seq();
+
+ sync_entry_timeo_lock.lock();
+ SyncEntryTimeout *sync_entry_timeo =
+ new SyncEntryTimeout(cct, m_filestore_commit_timeout);
+ if (!timer.add_event_after(m_filestore_commit_timeout,
+ sync_entry_timeo)) {
+ sync_entry_timeo = nullptr;
+ }
+ sync_entry_timeo_lock.unlock();
+
+ logger->set(l_filestore_committing, 1);
+
+ dout(15) << __FUNC__ << ": committing " << cp << dendl;
+ stringstream errstream;
+ if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) {
+ derr << errstream.str() << dendl;
+ ceph_abort();
+ }
+
+ if (backend->can_checkpoint()) {
+ int err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
+ uint64_t cid = 0;
+ err = backend->create_checkpoint(s, &cid);
+ if (err < 0) {
+ int err = errno;
+ derr << "snap create '" << s << "' got error " << err << dendl;
+ ceph_assert(err == 0);
+ }
+
+ snaps.push_back(cp);
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ if (cid > 0) {
+ dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl;
+ err = backend->sync_checkpoint(cid);
+ if (err < 0) {
+ derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("wait_sync got error");
+ }
+ dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl;
+ }
+ } else {
+ apply_manager.commit_started();
+ op_tp.unpause();
+
+ int err = object_map->sync();
+ if (err < 0) {
+ derr << "object_map sync got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("object_map sync returned error");
+ }
+
+ err = backend->syncfs();
+ if (err < 0) {
+ derr << "syncfs got " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("syncfs returned error");
+ }
+
+ err = write_op_seq(op_fd, cp);
+ if (err < 0) {
+ derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during write_op_seq");
+ }
+ err = ::fsync(op_fd);
+ if (err < 0) {
+ derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl;
+ ceph_abort_msg("error during fsync of op_seq");
+ }
+ }
+
+ auto done = ceph::real_clock::now();
+ auto lat = done - start;
+ auto dur = done - startwait;
+ dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl;
+ utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat);
+ if (max_pause_lat < utime_t{dur - lat}) {
+ logger->tinc(l_filestore_sync_pause_max_lat, dur - lat);
+ }
+
+ logger->inc(l_filestore_commitcycle);
+ logger->tinc(l_filestore_commitcycle_latency, lat);
+ logger->tinc(l_filestore_commitcycle_interval, dur);
+
+ apply_manager.commit_finish();
+ if (!m_disable_wbthrottle) {
+ wbthrottle.clear();
+ }
+
+ logger->set(l_filestore_committing, 0);
+
+ // remove old snaps?
+ if (backend->can_checkpoint()) {
+ char s[NAME_MAX];
+ while (snaps.size() > 2) {
+ snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front());
+ snaps.pop_front();
+ dout(10) << "removing snap '" << s << "'" << dendl;
+ int r = backend->destroy_checkpoint(s);
+ if (r) {
+ int err = errno;
+ derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl;
+ }
+ }
+ }
+
+ dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl;
+
+ if (sync_entry_timeo) {
+ std::lock_guard lock{sync_entry_timeo_lock};
+ timer.cancel_event(sync_entry_timeo);
+ }
+ } else {
+ op_tp.unpause();
+ }
+
+ l.lock();
+ finish_contexts(cct, fin, 0);
+ fin.clear();
+ if (!sync_waiters.empty()) {
+ dout(10) << __FUNC__ << ": more waiters, committing again" << dendl;
+ goto again;
+ }
+ if (!stop && journal && journal->should_commit_now()) {
+ dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl;
+ goto again;
+ }
+ }
+ stop = false;
+}
+
+void FileStore::do_force_sync()
+{
+ dout(10) << __FUNC__ << dendl;
+ std::lock_guard l{lock};
+ force_sync = true;
+ sync_cond.notify_all();
+}
+
+void FileStore::start_sync(Context *onsafe)
+{
+ std::lock_guard l{lock};
+ sync_waiters.push_back(onsafe);
+ sync_cond.notify_all();
+ force_sync = true;
+ dout(10) << __FUNC__ << dendl;
+}
+
+void FileStore::sync()
+{
+ ceph::mutex m = ceph::make_mutex("FileStore::sync");
+ ceph::condition_variable c;
+  bool done = false;
+ C_SafeCond *fin = new C_SafeCond(m, c, &done);
+
+ start_sync(fin);
+
+ std::unique_lock l{m};
+ c.wait(l, [&done, this] {
+ if (!done) {
+ dout(10) << "sync waiting" << dendl;
+ }
+ return done;
+ });
+ dout(10) << "sync done" << dendl;
+}
+
+void FileStore::_flush_op_queue()
+{
+ dout(10) << __FUNC__ << ": draining op tp" << dendl;
+ op_wq.drain();
+ dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl;
+ for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+}
+
+/*
+ * flush - make every queued write readable
+ */
+void FileStore::flush()
+{
+ dout(10) << __FUNC__ << dendl;
+
+ if (cct->_conf->filestore_blackhole) {
+ // wait forever
+ ceph::mutex lock = ceph::make_mutex("FileStore::flush::lock");
+ ceph::condition_variable cond;
+ std::unique_lock l{lock};
+ cond.wait(l, [] {return false;} );
+ ceph_abort();
+ }
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl;
+ for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+ (*it)->wait_for_empty();
+ }
+ }
+
+ _flush_op_queue();
+ dout(10) << __FUNC__ << ": complete" << dendl;
+}
+
+/*
+ * sync_and_flush - make every queued write readable AND committed to disk
+ */
+void FileStore::sync_and_flush()
+{
+ dout(10) << __FUNC__ << dendl;
+
+ if (m_filestore_journal_writeahead) {
+ if (journal)
+ journal->flush();
+ _flush_op_queue();
+ } else {
+ // includes m_filestore_journal_parallel
+ _flush_op_queue();
+ sync();
+ }
+ dout(10) << __FUNC__ << ": done" << dendl;
+}
+
+int FileStore::flush_journal()
+{
+ dout(10) << __FUNC__ << dendl;
+ sync_and_flush();
+ sync();
+ return 0;
+}
+
+int FileStore::snapshot(const string& name)
+{
+ dout(10) << __FUNC__ << ": " << name << dendl;
+ sync_and_flush();
+
+ if (!backend->can_checkpoint()) {
+ dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl;
+ return -EOPNOTSUPP;
+ }
+
+ char s[NAME_MAX];
+ snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str());
+
+ int r = backend->create_checkpoint(s, nullptr);
+ if (r) {
+ derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl;
+ }
+
+ return r;
+}
+
+// -------------------------------
+// attributes
+
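+// Object xattrs are stored through chain_xattr, which splits values that
+// exceed the per-xattr block limit across several raw xattrs.  _fgetattr
+// reads a value into a stack buffer and, on -ERANGE, retries with an
+// exactly-sized buffer.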
+int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp)
+{
+ char val[CHAIN_XATTR_MAX_BLOCK_LEN];
+ int l = chain_fgetxattr(fd, name, val, sizeof(val));
+ if (l >= 0) {
+ bp = ceph::buffer::create(l);
+ memcpy(bp.c_str(), val, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, name, 0, 0);
+ if (l > 0) {
+ bp = ceph::buffer::create(l);
+ l = chain_fgetxattr(fd, name, bp.c_str(), l);
+ }
+ }
+ ceph_assert(!m_filestore_fail_eio || l != -EIO);
+ return l;
+}
+
+int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
+{
+ // get attr list
+ char names1[100];
+ int len = chain_flistxattr(fd, names1, sizeof(names1)-1);
+ char *names2 = 0;
+ char *name = 0;
+ if (len == -ERANGE) {
+ len = chain_flistxattr(fd, 0, 0);
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ }
+ dout(10) << " -ERANGE, len is " << len << dendl;
+ names2 = new char[len+1];
+ len = chain_flistxattr(fd, names2, len);
+ dout(10) << " -ERANGE, got " << len << dendl;
+ if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ delete[] names2;
+ return len;
+ }
+ name = names2;
+ } else if (len < 0) {
+ ceph_assert(!m_filestore_fail_eio || len != -EIO);
+ return len;
+ } else {
+ name = names1;
+ }
+ name[len] = 0;
+
+ char *end = name + len;
+ while (name < end) {
+ char *attrname = name;
+ if (parse_attrname(&name)) {
+ if (*name) {
+ dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl;
+ int r = _fgetattr(fd, attrname, aset[name]);
+ if (r < 0) {
+ delete[] names2;
+ return r;
+ }
+ }
+ }
+ name += strlen(name) + 1;
+ }
+
+ delete[] names2;
+ return 0;
+}
+
+int FileStore::_fsetattrs(int fd, map<string, bufferptr> &aset)
+{
+ for (map<string, bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ const char *val;
+ if (p->second.length())
+ val = p->second.c_str();
+ else
+ val = "";
+ // ??? Why do we skip setting all the other attrs if one fails?
+ int r = chain_fsetxattr(fd, n, val, p->second.length());
+ if (r < 0) {
+      derr << __FUNC__ << ": chain_fsetxattr returned " << r << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+// debug EIO injection
+void FileStore::inject_data_error(const ghobject_t &oid) {
+ std::lock_guard l{read_error_lock};
+ dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+ data_error_set.insert(oid);
+}
+void FileStore::inject_mdata_error(const ghobject_t &oid) {
+ std::lock_guard l{read_error_lock};
+ dout(10) << __FUNC__ << ": init error on " << oid << dendl;
+ mdata_error_set.insert(oid);
+}
+
+void FileStore::debug_obj_on_delete(const ghobject_t &oid) {
+ std::lock_guard l{read_error_lock};
+ dout(10) << __FUNC__ << ": clear error on " << oid << dendl;
+ data_error_set.erase(oid);
+ mdata_error_set.erase(oid);
+}
+bool FileStore::debug_data_eio(const ghobject_t &oid) {
+ std::lock_guard l{read_error_lock};
+ if (data_error_set.count(oid)) {
+ dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+bool FileStore::debug_mdata_eio(const ghobject_t &oid) {
+ std::lock_guard l{read_error_lock};
+ if (mdata_error_set.count(oid)) {
+ dout(10) << __FUNC__ << ": inject error on " << oid << dendl;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+// objects
+
+int FileStore::getattr(CollectionHandle& ch, const ghobject_t& oid, const char *name, bufferptr &bp)
+{
+ tracepoint(objectstore, getattr_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = _fgetattr(**fd, n, bp);
+ lfn_close(fd);
+ if (r == -ENODATA) {
+ map<string, bufferlist> got;
+ set<string> to_get;
+ to_get.insert(string(name));
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, to_get, &got);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl;
+ goto out;
+ }
+ if (got.empty()) {
+ dout(10) << __FUNC__ << ": got.size() is 0" << dendl;
+ return -ENODATA;
+ }
+ bp = bufferptr(got.begin()->second.c_str(),
+ got.begin()->second.length());
+ r = bp.length();
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattr_exit, r);
+ return r < 0 ? r : 0;
+ }
+}
+
+int FileStore::getattrs(CollectionHandle& ch, const ghobject_t& oid, map<string,bufferptr>& aset)
+{
+ tracepoint(objectstore, getattrs_enter, ch->cid.c_str());
+ const coll_t& cid = !_need_temp_object_collection(ch->cid, oid) ? ch->cid : ch->cid.get_temp();
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+
+ FDRef fd;
+ bool spill_out = true;
+ char buf[2];
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = false;
+
+ r = _fgetattrs(**fd, aset);
+ lfn_close(fd);
+ fd = FDRef(); // defensive
+ if (r < 0) {
+ goto out;
+ }
+
+ if (!spill_out) {
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ }
+ ceph_assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ i != omap_aset.end();
+ ++i) {
+ string key(i->first);
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+
+ if (cct->_conf->filestore_debug_inject_read_err &&
+ debug_mdata_eio(oid)) {
+ return -EIO;
+ } else {
+ tracepoint(objectstore, getattrs_exit, r);
+ return r;
+ }
+}
+
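+// Set attrs, keeping small values inline as filesystem xattrs and spilling
+// values that exceed m_filestore_max_inline_xattr_size (or overflow
+// m_filestore_max_inline_xattrs) out to the object_map; XATTR_SPILL_OUT_NAME
+// records whether any attrs live in the omap so reads know where to look.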
+int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map<string,bufferptr>& aset,
+ const SequencerPosition &spos)
+{
+ map<string, bufferlist> omap_set;
+ set<string> omap_remove;
+ map<string, bufferptr> inline_set;
+ map<string, bufferptr> inline_to_set;
+ FDRef fd;
+ int spill_out = -1;
+ bool incomplete_inline = false;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)))
+ spill_out = 0;
+ else
+ spill_out = 1;
+
+ r = _fgetattrs(**fd, inline_set);
+ incomplete_inline = (r == -E2BIG);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid
+ << (incomplete_inline ? " (incomplete_inline, forcing omap)" : "")
+ << dendl;
+
+ for (map<string,bufferptr>::iterator p = aset.begin();
+ p != aset.end();
+ ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+
+ if (incomplete_inline) {
+ chain_fremovexattr(**fd, n); // ignore any error
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
+ if (inline_set.count(p->first)) {
+ inline_set.erase(p->first);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0)
+ goto out_close;
+ }
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
+ omap_set[p->first].push_back(p->second);
+ continue;
+ }
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
+
+ inline_to_set.insert(*p);
+ }
+
+ if (spill_out != 1 && !omap_set.empty()) {
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
+ sizeof(XATTR_SPILL_OUT));
+ }
+
+ r = _fsetattrs(**fd, inline_to_set);
+ if (r < 0)
+ goto out_close;
+
+ if (spill_out && !omap_remove.empty()) {
+ r = object_map->remove_xattrs(oid, omap_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
+ }
+ }
+
+ if (!omap_set.empty()) {
+ r = object_map->set_xattrs(oid, omap_set, &spos);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl;
+ FDRef fd;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r == -ENODATA && spill_out) {
+ Index index;
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ set<string> to_remove;
+ to_remove.insert(string(name));
+ r = object_map->remove_xattrs(oid, to_remove, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ }
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl;
+ return r;
+}
+
+int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl;
+
+ map<string,bufferptr> aset;
+ FDRef fd;
+ set<string> omap_attrs;
+ Index index;
+ bool spill_out = true;
+
+ int r = lfn_open(cid, oid, false, &fd);
+ if (r < 0) {
+ goto out;
+ }
+
+ char buf[2];
+ r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
+ if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
+ spill_out = false;
+ }
+
+ r = _fgetattrs(**fd, aset);
+ if (r >= 0) {
+ for (map<string,bufferptr>::iterator p = aset.begin(); p != aset.end(); ++p) {
+ char n[CHAIN_XATTR_MAX_NAME_LEN];
+ get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
+ r = chain_fremovexattr(**fd, n);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl;
+ goto out_close;
+ }
+ }
+ }
+
+ if (!spill_out) {
+ dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl;
+ goto out_close;
+ }
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": could not get index r = " << r << dendl;
+ goto out_close;
+ }
+ {
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ goto out_close;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl;
+ goto out_close;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
+ sizeof(XATTR_NO_SPILL_OUT));
+ }
+
+ out_close:
+ lfn_close(fd);
+ out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl;
+ return r;
+}
+
+
+
+
+int FileStore::_collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos)
+{
+ struct stat st;
+ int r = collection_stat(cid, &st);
+ if (r < 0) {
+ if (r == -ENOENT)
+ return 0;
+ return r;
+ }
+
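+ // drain the collection in batches of 300 objects, then remove the
+ // now-empty collection itself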
+ vector<ghobject_t> objects;
+ ghobject_t max;
+ while (!max.is_max()) {
+ r = collection_list(cid, max, ghobject_t::get_max(),
+ 300, &objects, &max);
+ if (r < 0)
+ return r;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ ceph_assert(_check_replay_guard(cid, *i, spos));
+ r = _remove(cid, *i, spos);
+ if (r < 0)
+ return r;
+ }
+ objects.clear();
+ }
+ return _destroy_collection(cid);
+}
+
+// --------------------------
+// collections
+
+int FileStore::list_collections(vector<coll_t>& ls)
+{
+ return list_collections(ls, false);
+}
+
+int FileStore::list_collections(vector<coll_t>& ls, bool include_temp)
+{
+ tracepoint(objectstore, list_collections_enter);
+ dout(10) << __FUNC__ << dendl;
+
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/current", basedir.c_str());
+
+ int r = 0;
+ DIR *dir = ::opendir(fn);
+ if (!dir) {
+ r = -errno;
+ derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+
+ struct dirent *de = nullptr;
+ while (true) {
+ errno = 0;
+ de = ::readdir(dir);
+ if (de == nullptr) {
+ if (errno != 0) {
+ r = -errno;
+ derr << "readdir failed " << fn << ": " << cpp_strerror(-r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ }
+ break;
+ }
+ if (de->d_type == DT_UNKNOWN) {
+ // d_type not supported (non-ext[234], btrfs), must stat
+ struct stat sb;
+ char filename[PATH_MAX];
+ if (int n = snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name);
+ n >= static_cast<int>(sizeof(filename))) {
+ derr << __func__ << " path length overrun: " << n << dendl;
+ ceph_abort();
+ }
+
+ r = ::stat(filename, &sb);
+ if (r < 0) {
+ r = -errno;
+ derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ break;
+ }
+ if (!S_ISDIR(sb.st_mode)) {
+ continue;
+ }
+ } else if (de->d_type != DT_DIR) {
+ continue;
+ }
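+ // skip the omap directory, '.'/'..', and any entry that does not parse
+ // as a collection name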
+ if (strcmp(de->d_name, "omap") == 0) {
+ continue;
+ }
+ if (de->d_name[0] == '.' &&
+ (de->d_name[1] == '\0' ||
+ (de->d_name[1] == '.' &&
+ de->d_name[2] == '\0')))
+ continue;
+ coll_t cid;
+ if (!cid.parse(de->d_name)) {
+ derr << "ignoring invalid collection '" << de->d_name << "'" << dendl;
+ continue;
+ }
+ if (!cid.is_temp() || include_temp)
+ ls.push_back(cid);
+ }
+
+ if (r > 0) {
+ derr << "trying readdir " << fn << ": " << cpp_strerror(r) << dendl;
+ r = -r;
+ }
+
+ ::closedir(dir);
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, list_collections_exit, r);
+ return r;
+}
+
+int FileStore::collection_stat(const coll_t& c, struct stat *st)
+{
+ tracepoint(objectstore, collection_stat_enter, c.c_str());
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r = ::stat(fn, st);
+ if (r < 0)
+ r = -errno;
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ tracepoint(objectstore, collection_stat_exit, r);
+ return r;
+}
+
+bool FileStore::collection_exists(const coll_t& c)
+{
+ tracepoint(objectstore, collection_exists_enter, c.c_str());
+ struct stat st;
+ bool ret = collection_stat(c, &st) == 0;
+ tracepoint(objectstore, collection_exists_exit, ret);
+ return ret;
+}
+
+int FileStore::collection_empty(const coll_t& cid, bool *empty)
+{
+ tracepoint(objectstore, collection_empty_enter, cid.c_str());
+ dout(15) << __FUNC__ << ": " << cid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+
+ vector<ghobject_t> ls;
+ r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(),
+ 1, &ls, nullptr);
+ if (r < 0) {
+ derr << __FUNC__ << ": collection_list_partial returned: "
+ << cpp_strerror(r) << dendl;
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ *empty = ls.empty();
+ tracepoint(objectstore, collection_empty_exit, *empty);
+ return 0;
+}
+
+int FileStore::_collection_set_bits(const coll_t& c, int bits)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(10) << __FUNC__ << ": " << fn << " " << bits << dendl;
+ char n[PATH_MAX];
+ int r;
+ int32_t v = bits;
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ r = -errno;
+ goto out;
+ }
+ get_attrname("bits", n, PATH_MAX);
+ r = chain_fsetxattr(fd, n, (char*)&v, sizeof(v));
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ out:
+ dout(10) << __FUNC__ << ": " << fn << " " << bits << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::collection_bits(CollectionHandle& ch)
+{
+ char fn[PATH_MAX];
+ get_cdir(ch->cid, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r;
+ char n[PATH_MAX];
+ int32_t bits;
+ int fd = ::open(fn, O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ bits = r = -errno;
+ goto out;
+ }
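+ // the collection's split level is persisted as a 32-bit 'bits' xattr on
+ // its directory (see _collection_set_bits)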
+ get_attrname("bits", n, PATH_MAX);
+ r = chain_fgetxattr(fd, n, (char*)&bits, sizeof(bits));
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ if (r < 0) {
+ bits = r;
+ goto out;
+ }
+ out:
+ dout(10) << __FUNC__ << ": " << fn << " = " << bits << dendl;
+ return bits;
+}
+
+int FileStore::collection_list(const coll_t& c,
+ const ghobject_t& orig_start,
+ const ghobject_t& end,
+ int max,
+ vector<ghobject_t> *ls, ghobject_t *next)
+{
+ ghobject_t start = orig_start;
+ if (start.is_max())
+ return 0;
+
+ ghobject_t temp_next;
+ if (!next)
+ next = &temp_next;
+ // figure out the pool id. we need this in order to generate a
+ // meaningful 'next' value.
+ int64_t pool = -1;
+ shard_id_t shard;
+ {
+ spg_t pgid;
+ if (c.is_temp(&pgid)) {
+ pool = -2 - pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_pg(&pgid)) {
+ pool = pgid.pool();
+ shard = pgid.shard;
+ } else if (c.is_meta()) {
+ pool = -1;
+ shard = shard_id_t::NO_SHARD;
+ } else {
+ // hrm, the caller is test code! we should kill it off. for now,
+ // tolerate it.
+ pool = 0;
+ shard = shard_id_t::NO_SHARD;
+ }
+ dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard
+ << " pgid " << pgid << dendl;
+ }
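+ // 'sep' acts as the boundary between the temp and regular object
+ // namespaces for this shard: if the requested start precedes it, list the
+ // parallel temp collection first and then fall through to this one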
+ ghobject_t sep;
+ sep.hobj.pool = -1;
+ sep.set_shard(shard);
+ if (!c.is_temp() && !c.is_meta()) {
+ if (start < sep) {
+ dout(10) << __FUNC__ << ": first checking temp pool" << dendl;
+ coll_t temp = c.get_temp();
+ int r = collection_list(temp, start, end, max, ls, next);
+ if (r < 0)
+ return r;
+ if (*next != ghobject_t::get_max())
+ return r;
+ start = sep;
+ dout(10) << __FUNC__ << ": fall through to non-temp collection, start "
+ << start << dendl;
+ } else {
+ dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl;
+ }
+ }
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+
+ r = index->collection_list_partial(start, end, max, ls, next);
+
+ if (r < 0) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ dout(20) << "objects: " << *ls << dendl;
+
+ // HashIndex doesn't know the pool when constructing a 'next' value
+ if (!next->is_max()) {
+ next->hobj.pool = pool;
+ next->set_shard(shard);
+ dout(20) << " next " << *next << dendl;
+ }
+
+ return 0;
+}
+
+int FileStore::omap_get(CollectionHandle& ch, const ghobject_t &hoid,
+ bufferlist *header,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get(hoid, header, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_get_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_header(
+ CollectionHandle& ch,
+ const ghobject_t &hoid,
+ bufferlist *bl,
+ bool allow_eio)
+{
+ tracepoint(objectstore, omap_get_header_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_header(hoid, bl);
+ if (r < 0 && r != -ENOENT) {
+ ceph_assert(allow_eio || !m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ tracepoint(objectstore, omap_get_header_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_keys(CollectionHandle& ch, const ghobject_t &hoid, set<string> *keys)
+{
+ tracepoint(objectstore, omap_get_keys_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->get_keys(hoid, keys);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_get_keys_exit, 0);
+ return 0;
+}
+
+int FileStore::omap_get_values(CollectionHandle& ch, const ghobject_t &hoid,
+ const set<string> &keys,
+ map<string, bufferlist> *out)
+{
+ tracepoint(objectstore, omap_get_values_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ const char *where = "()";
+ int r = get_index(c, &index);
+ if (r < 0) {
+ where = " (get_index)";
+ goto out;
+ }
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ where = " (lfn_find)";
+ goto out;
+ }
+ }
+ r = object_map->get_values(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ where = " (get_values)";
+ goto out;
+ }
+ r = 0;
+ out:
+ tracepoint(objectstore, omap_get_values_exit, r);
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r
+ << where << dendl;
+ return r;
+}
+
+int FileStore::omap_check_keys(CollectionHandle& ch, const ghobject_t &hoid,
+ const set<string> &keys,
+ set<string> *out)
+{
+ tracepoint(objectstore, omap_check_keys_enter, ch->cid.c_str());
+ const coll_t& c = !_need_temp_object_collection(ch->cid, hoid) ? ch->cid : ch->cid.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(hoid);
+
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->check_keys(hoid, keys, out);
+ if (r < 0 && r != -ENOENT) {
+ if (r == -EIO && m_filestore_fail_eio) handle_eio();
+ return r;
+ }
+ tracepoint(objectstore, omap_check_keys_exit, 0);
+ return 0;
+}
+
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(
+ CollectionHandle& ch,
+ const ghobject_t &oid)
+{
+ auto osr = static_cast<OpSequencer*>(ch.get());
+ osr->wait_for_apply(oid);
+ return get_omap_iterator(ch->cid, oid);
+}
+
+ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c,
+ const ghobject_t &hoid)
+{
+ tracepoint(objectstore, get_omap_iterator, _c.c_str());
+ const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp();
+ dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(c, &index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+ << "(get_index failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 "
+ << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ }
+ return object_map->get_iterator(hoid);
+}
+
+int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": collection: " << c << " pg number: "
+ << pg_num << " expected number of objects: " << expected_num_objs << dendl;
+
+ bool empty;
+ int ret = collection_empty(c, &empty);
+ if (ret < 0)
+ return ret;
+ if (!empty && !replaying) {
+ dout(0) << "Failed to give an expected number of objects hint to collection : "
+ << c << ", only an empty collection can take this type of hint. " << dendl;
+ return 0;
+ }
+
+ Index index;
+ ret = get_index(c, &index);
+ if (ret < 0)
+ return ret;
+ // Pre-hash the collection
+ ret = index->pre_hash_collection(pg_num, expected_num_objs);
+ dout(10) << "pre_hash_collection " << c << " = " << ret << dendl;
+ if (ret < 0)
+ return ret;
+ _set_replay_guard(c, spos);
+
+ return 0;
+}
+
+int FileStore::_create_collection(
+ const coll_t& c,
+ int bits,
+ const SequencerPosition &spos)
+{
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ int r = ::mkdir(fn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r == -EEXIST && replaying)
+ r = 0;
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+
+ if (r < 0)
+ return r;
+ r = init_index(c);
+ if (r < 0)
+ return r;
+ r = _collection_set_bits(c, bits);
+ if (r < 0)
+ return r;
+ // create parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ r = _create_collection(temp, 0, spos);
+ if (r < 0)
+ return r;
+ }
+
+ _set_replay_guard(c, spos);
+ return 0;
+}
+
+int FileStore::_destroy_collection(const coll_t& c)
+{
+ int r = 0;
+ char fn[PATH_MAX];
+ get_cdir(c, fn, sizeof(fn));
+ dout(15) << __FUNC__ << ": " << fn << dendl;
+ {
+ Index from;
+ r = get_index(c, &from);
+ if (r < 0)
+ goto out;
+ ceph_assert(from.index);
+ std::unique_lock l{(from.index)->access_lock};
+
+ r = from->prep_delete();
+ if (r < 0)
+ goto out;
+ }
+ r = ::rmdir(fn);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+
+ out:
+ // destroy parallel temp collection, too
+ if (!c.is_meta() && !c.is_temp()) {
+ coll_t temp = c.get_temp();
+ int r2 = _destroy_collection(temp);
+ if (r2 < 0) {
+ r = r2;
+ goto out_final;
+ }
+ }
+
+ out_final:
+ dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl;
+ return r;
+}
+
+
+int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o,
+ const SequencerPosition& spos)
+{
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl;
+
+ int dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ int srccmp = _check_replay_guard(oldcid, o, spos);
+ if (srccmp < 0)
+ return 0;
+
+ // open a guard on the object so we don't replay any previous operations
+ // on the new name that would modify the source inode.
+ FDRef fd;
+ int r = lfn_open(oldcid, o, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ ceph_assert(replaying);
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << o << " (dne, continue replay) " << dendl;
+ return 0;
+ }
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, o, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ _inject_failure();
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos);
+ }
+ lfn_close(fd);
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool allow_enoent)
+{
+ dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl;
+ int r = 0;
+ int dstcmp, srccmp;
+
+ if (replaying) {
+ /* If the destination collection doesn't exist during replay,
+ * we need to delete the src object and continue on
+ */
+ if (!collection_exists(c))
+ goto out_rm_src;
+ }
+
+ dstcmp = _check_replay_guard(c, o, spos);
+ if (dstcmp < 0)
+ goto out_rm_src;
+
+ // check the src name too; it might have a newer guard, and we don't
+ // want to clobber it
+ srccmp = _check_replay_guard(oldcid, oldoid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ {
+ // open a guard on the object so we don't replay any previous operations
+ // on the new name that would modify the source inode.
+ FDRef fd;
+ r = lfn_open(oldcid, oldoid, 0, &fd);
+ if (r < 0) {
+ // the source collection/object does not exist. If we are replaying, we
+ // should be safe, so just return 0 and move on.
+ if (replaying) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl;
+ } else if (allow_enoent) {
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from "
+ << oldcid << "/" << oldoid << " (dne, ignoring enoent)"
+ << dendl;
+ } else {
+ ceph_abort_msg("ERROR: source must exist");
+ }
+
+ if (!replaying) {
+ return 0;
+ }
+ if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started.
+ return 0;
+ }
+
+ r = 0; // don't know if object_map was cloned
+ } else {
+ if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress"
+ _set_replay_guard(**fd, spos, &o, true);
+ }
+
+ r = lfn_link(oldcid, c, oldoid, o);
+ if (replaying && !backend->can_checkpoint() &&
+ r == -EEXIST) // crashed between link() and set_replay_guard()
+ r = 0;
+
+ lfn_close(fd);
+ fd = FDRef();
+
+ _inject_failure();
+ }
+
+ if (r == 0) {
+ // the name changed; link the omap content
+ r = object_map->rename(oldoid, o, &spos);
+ if (r == -ENOENT)
+ r = 0;
+ }
+
+ _inject_failure();
+
+ if (r == 0)
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+
+ if (r == 0)
+ r = lfn_open(c, o, 0, &fd);
+
+ // close guard on object so we don't do this again
+ if (r == 0) {
+ _close_replay_guard(**fd, spos, &o);
+ lfn_close(fd);
+ }
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+
+ out_rm_src:
+ // remove source
+ if (_check_replay_guard(oldcid, oldoid, spos) > 0) {
+ r = lfn_unlink(oldcid, oldoid, spos, true);
+ }
+
+ dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid
+ << " = " << r << dendl;
+ return r;
+}
+
+void FileStore::_inject_failure()
+{
+ if (m_filestore_kill_at) {
+ int final = --m_filestore_kill_at;
+ dout(5) << __FUNC__ << ": " << (final+1) << " -> " << final << dendl;
+ if (final == 0) {
+ derr << __FUNC__ << ": KILLING" << dendl;
+ cct->_log->flush();
+ _exit(1);
+ }
+ }
+}
+
+int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ r = object_map->clear_keys_header(hoid, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid,
+ const map<string, bufferlist> &aset,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+ // treat pgmeta as a logical object; skip the existence check
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0) {
+ dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+skip:
+ if (g_conf()->subsys.should_gather<ceph_subsys_filestore, 20>()) {
+ for (auto& p : aset) {
+ dout(20) << __FUNC__ << ": set " << p.first << dendl;
+ }
+ }
+ r = object_map->set_keys(hoid, aset, &spos);
+ dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid,
+ const set<string> &keys,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r;
+ // treat pgmeta as a logical object; skip the existence check
+ if (hoid.is_pgmeta())
+ goto skip;
+
+ r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+skip:
+ r = object_map->rm_keys(hoid, keys, &spos);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ return 0;
+}
+
+int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid,
+ const string& first, const string& last,
+ const SequencerPosition &spos) {
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl;
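+ // collect every key in [first, last) via the omap iterator, then remove
+ // them in a single _omap_rmkeys call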
+ set<string> keys;
+ {
+ ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid);
+ if (!iter)
+ return -ENOENT;
+ for (iter->lower_bound(first); iter->valid() && iter->key() < last;
+ iter->next()) {
+ keys.insert(iter->key());
+ }
+ }
+ return _omap_rmkeys(cid, hoid, keys, spos);
+}
+
+int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid,
+ const bufferlist &bl,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0)
+ return r;
+ {
+ ceph_assert(index.index);
+ std::shared_lock l{(index.index)->access_lock};
+ r = lfn_find(hoid, index);
+ if (r < 0)
+ return r;
+ }
+ return object_map->set_header(hoid, bl, &spos);
+}
+
+int FileStore::_merge_collection(const coll_t& cid,
+ uint32_t bits,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ dout(15) << __FUNC__ << ": " << cid << " " << dest
+ << " bits " << bits << dendl;
+ int r = 0;
+
+ if (!collection_exists(cid)) {
+ dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+ if (!collection_exists(dest)) {
+ dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+
+ // set bits
+ if (_check_replay_guard(cid, spos) > 0)
+ _collection_set_bits(dest, bits);
+
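+ // the destination must be a pg collection; its pgid is used below to
+ // verify object placement when filestore_debug_verify_split is set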
+ spg_t pgid;
+ bool is_pg = dest.is_pg(&pgid);
+ ceph_assert(is_pg);
+
+ int dstcmp = _check_replay_guard(dest, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(cid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_global_replay_guard(cid, spos);
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ // main collection
+ {
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ std::unique_lock l1{(from.index)->access_lock};
+
+ ceph_assert(to.index);
+ std::unique_lock l2{(to.index)->access_lock};
+
+ r = from->merge(bits, to.index);
+ }
+ }
+
+ // temp too
+ {
+ Index from;
+ r = get_index(cid.get_temp(), &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest.get_temp(), &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ std::unique_lock l1{(from.index)->access_lock};
+
+ ceph_assert(to.index);
+ std::unique_lock l2{(to.index)->access_lock};
+
+ r = from->merge(bits, to.index);
+ }
+ }
+
+ // remove source
+ _destroy_collection(cid);
+
+ _close_replay_guard(dest, spos);
+ _close_replay_guard(dest.get_temp(), spos);
+ // no need to close guards on cid... it's removed.
+
+ if (!r && cct->_conf->filestore_debug_verify_split) {
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ while (1) {
+ collection_list(
+ dest,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (!i->match(bits, pgid.pgid.ps())) {
+ dout(20) << __FUNC__ << ": " << *i << " does not belong in "
+		 << dest << dendl;
+ ceph_assert(i->match(bits, pgid.pgid.ps()));
+ }
+ }
+ objects.clear();
+ }
+ }
+
+ dout(15) << __FUNC__ << ": " << cid << " " << dest << " bits " << bits
+ << " = " << r << dendl;
+ return r;
+}
+
+int FileStore::_split_collection(const coll_t& cid,
+ uint32_t bits,
+ uint32_t rem,
+ coll_t dest,
+ const SequencerPosition &spos)
+{
+ int r;
+ {
+ dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl;
+ if (!collection_exists(cid)) {
+ dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+ if (!collection_exists(dest)) {
+ dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl;
+ ceph_assert(replaying);
+ return 0;
+ }
+
+ int dstcmp = _check_replay_guard(dest, spos);
+ if (dstcmp < 0)
+ return 0;
+
+ int srccmp = _check_replay_guard(cid, spos);
+ if (srccmp < 0)
+ return 0;
+
+ _set_global_replay_guard(cid, spos);
+ _set_replay_guard(cid, spos, true);
+ _set_replay_guard(dest, spos, true);
+
+ Index from;
+ r = get_index(cid, &from);
+
+ Index to;
+ if (!r)
+ r = get_index(dest, &to);
+
+ if (!r) {
+ ceph_assert(from.index);
+ std::unique_lock l1{(from.index)->access_lock};
+
+ ceph_assert(to.index);
+ std::unique_lock l2{(to.index)->access_lock};
+
+ r = from->split(rem, bits, to.index);
+ }
+
+ _close_replay_guard(cid, spos);
+ _close_replay_guard(dest, spos);
+ }
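+ // record the new split level on the source collection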
+ _collection_set_bits(cid, bits);
+ if (!r && cct->_conf->filestore_debug_verify_split) {
+ vector<ghobject_t> objects;
+ ghobject_t next;
+ while (1) {
+ collection_list(
+ cid,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ dout(20) << __FUNC__ << ": " << *i << " still in source "
+ << cid << dendl;
+ ceph_assert(!i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ next = ghobject_t();
+ while (1) {
+ collection_list(
+ dest,
+ next, ghobject_t::get_max(),
+ get_ideal_list_max(),
+ &objects,
+ &next);
+ if (objects.empty())
+ break;
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ dout(20) << __FUNC__ << ": " << *i << " now in dest "
+		 << dest << dendl;
+ ceph_assert(i->match(bits, rem));
+ }
+ objects.clear();
+ }
+ }
+ return r;
+}
+
+int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size)
+{
+ dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl;
+
+ FDRef fd;
+ int ret = 0;
+
+ if (expected_object_size == 0 || expected_write_size == 0)
+ goto out;
+
+ ret = lfn_open(cid, oid, false, &fd);
+ if (ret < 0)
+ goto out;
+
+ {
+ // TODO: a more elaborate hint calculation
+ uint64_t hint = std::min<uint64_t>(expected_write_size, m_filestore_max_alloc_hint_size);
+
+ ret = backend->set_alloc_hint(**fd, hint);
+ dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl;
+ }
+
+ lfn_close(fd);
+out:
+ dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl;
+ ceph_assert(!m_filestore_fail_eio || ret != -EIO);
+ return ret;
+}
+
+const char** FileStore::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_max_inline_xattr_size",
+ "filestore_max_inline_xattr_size_xfs",
+ "filestore_max_inline_xattr_size_btrfs",
+ "filestore_max_inline_xattr_size_other",
+ "filestore_max_inline_xattrs",
+ "filestore_max_inline_xattrs_xfs",
+ "filestore_max_inline_xattrs_btrfs",
+ "filestore_max_inline_xattrs_other",
+ "filestore_max_xattr_value_size",
+ "filestore_max_xattr_value_size_xfs",
+ "filestore_max_xattr_value_size_btrfs",
+ "filestore_max_xattr_value_size_other",
+ "filestore_min_sync_interval",
+ "filestore_max_sync_interval",
+ "filestore_queue_max_ops",
+ "filestore_queue_max_bytes",
+ "filestore_expected_throughput_bytes",
+ "filestore_expected_throughput_ops",
+ "filestore_queue_low_threshhold",
+ "filestore_queue_high_threshhold",
+ "filestore_queue_high_delay_multiple",
+ "filestore_queue_max_delay_multiple",
+ "filestore_commit_timeout",
+ "filestore_dump_file",
+ "filestore_kill_at",
+ "filestore_fail_eio",
+ "filestore_fadvise",
+ "filestore_sloppy_crc",
+ "filestore_sloppy_crc_block_size",
+ "filestore_max_alloc_hint_size",
+ NULL
+ };
+ return KEYS;
+}
+
+void FileStore::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other") ||
+ changed.count("filestore_max_xattr_value_size") ||
+ changed.count("filestore_max_xattr_value_size_xfs") ||
+ changed.count("filestore_max_xattr_value_size_btrfs") ||
+ changed.count("filestore_max_xattr_value_size_other")) {
+ if (backend) {
+ std::lock_guard l(lock);
+ set_xattr_limits_via_conf();
+ }
+ }
+
+ if (changed.count("filestore_queue_max_bytes") ||
+ changed.count("filestore_queue_max_ops") ||
+ changed.count("filestore_expected_throughput_bytes") ||
+ changed.count("filestore_expected_throughput_ops") ||
+ changed.count("filestore_queue_low_threshhold") ||
+ changed.count("filestore_queue_high_threshhold") ||
+ changed.count("filestore_queue_high_delay_multiple") ||
+ changed.count("filestore_queue_max_delay_multiple")) {
+ std::lock_guard l(lock);
+ set_throttle_params();
+ }
+
+ if (changed.count("filestore_min_sync_interval") ||
+ changed.count("filestore_max_sync_interval") ||
+ changed.count("filestore_kill_at") ||
+ changed.count("filestore_fail_eio") ||
+ changed.count("filestore_sloppy_crc") ||
+ changed.count("filestore_sloppy_crc_block_size") ||
+ changed.count("filestore_max_alloc_hint_size") ||
+ changed.count("filestore_fadvise")) {
+ std::lock_guard l(lock);
+ m_filestore_min_sync_interval = conf->filestore_min_sync_interval;
+ m_filestore_max_sync_interval = conf->filestore_max_sync_interval;
+ m_filestore_kill_at = conf->filestore_kill_at;
+ m_filestore_fail_eio = conf->filestore_fail_eio;
+ m_filestore_fadvise = conf->filestore_fadvise;
+ m_filestore_sloppy_crc = conf->filestore_sloppy_crc;
+ m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size;
+ m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size;
+ }
+ if (changed.count("filestore_commit_timeout")) {
+ std::lock_guard l(sync_entry_timeo_lock);
+ m_filestore_commit_timeout = conf->filestore_commit_timeout;
+ }
+ if (changed.count("filestore_dump_file")) {
+ if (conf->filestore_dump_file.length() &&
+ conf->filestore_dump_file != "-") {
+ dump_start(conf->filestore_dump_file);
+ } else {
+ dump_stop();
+ }
+ }
+}
+
+int FileStore::set_throttle_params()
+{
+ stringstream ss;
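+ // the generic high/max delay multiples, when non-zero, take precedence
+ // over the bytes- and ops-specific values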
+ bool valid = throttle_bytes.set_params(
+ cct->_conf->filestore_queue_low_threshhold,
+ cct->_conf->filestore_queue_high_threshhold,
+ cct->_conf->filestore_expected_throughput_bytes,
+ cct->_conf->filestore_queue_high_delay_multiple?
+ cct->_conf->filestore_queue_high_delay_multiple:
+ cct->_conf->filestore_queue_high_delay_multiple_bytes,
+ cct->_conf->filestore_queue_max_delay_multiple?
+ cct->_conf->filestore_queue_max_delay_multiple:
+ cct->_conf->filestore_queue_max_delay_multiple_bytes,
+ cct->_conf->filestore_queue_max_bytes,
+ &ss);
+
+ valid &= throttle_ops.set_params(
+ cct->_conf->filestore_queue_low_threshhold,
+ cct->_conf->filestore_queue_high_threshhold,
+ cct->_conf->filestore_expected_throughput_ops,
+ cct->_conf->filestore_queue_high_delay_multiple?
+ cct->_conf->filestore_queue_high_delay_multiple:
+ cct->_conf->filestore_queue_high_delay_multiple_ops,
+ cct->_conf->filestore_queue_max_delay_multiple?
+ cct->_conf->filestore_queue_max_delay_multiple:
+ cct->_conf->filestore_queue_max_delay_multiple_ops,
+ cct->_conf->filestore_queue_max_ops,
+ &ss);
+
+ logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max());
+ logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max());
+
+ if (!valid) {
+ derr << "tried to set invalid params: "
+ << ss.str()
+ << dendl;
+ }
+ return valid ? 0 : -EINVAL;
+}
+
+void FileStore::dump_start(const std::string& file)
+{
+ dout(10) << __FUNC__ << ": " << file << dendl;
+ if (m_filestore_do_dump) {
+ dump_stop();
+ }
+ m_filestore_dump_fmt.reset();
+ m_filestore_dump_fmt.open_array_section("dump");
+ m_filestore_dump.open(file.c_str());
+ m_filestore_do_dump = true;
+}
+
+void FileStore::dump_stop()
+{
+ dout(10) << __FUNC__ << dendl;
+ m_filestore_do_dump = false;
+ if (m_filestore_dump.is_open()) {
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+ m_filestore_dump.close();
+ }
+}
+
+void FileStore::dump_transactions(vector<ObjectStore::Transaction>& ls, uint64_t seq, OpSequencer *osr)
+{
+ m_filestore_dump_fmt.open_array_section("transactions");
+ unsigned trans_num = 0;
+ for (vector<ObjectStore::Transaction>::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) {
+ m_filestore_dump_fmt.open_object_section("transaction");
+ m_filestore_dump_fmt.dump_stream("osr") << osr->cid;
+ m_filestore_dump_fmt.dump_unsigned("seq", seq);
+ m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num);
+ (*i).dump(&m_filestore_dump_fmt);
+ m_filestore_dump_fmt.close_section();
+ }
+ m_filestore_dump_fmt.close_section();
+ m_filestore_dump_fmt.flush(m_filestore_dump);
+ m_filestore_dump.flush();
+}
+
+void FileStore::get_db_statistics(Formatter* f)
+{
+ object_map->db->get_statistics(f);
+}
+
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+ uint32_t fs_xattr_max_value_size;
+
+ switch (m_fs_type) {
+#if defined(__linux__)
+ case XFS_SUPER_MAGIC:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_xfs;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_xfs;
+ break;
+ case BTRFS_SUPER_MAGIC:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_btrfs;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_btrfs;
+ break;
+#endif
+ default:
+ fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = cct->_conf->filestore_max_inline_xattrs_other;
+ fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_other;
+ break;
+ }
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = cct->_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = cct->_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+
+ // Use override value if set
+ if (cct->_conf->filestore_max_xattr_value_size)
+ m_filestore_max_xattr_value_size = cct->_conf->filestore_max_xattr_value_size;
+ else
+ m_filestore_max_xattr_value_size = fs_xattr_max_value_size;
+
+ if (m_filestore_max_xattr_value_size < cct->_conf->osd_max_object_name_len) {
+ derr << "WARNING: max attr value size ("
+ << m_filestore_max_xattr_value_size
+ << ") is smaller than osd_max_object_name_len ("
+ << cct->_conf->osd_max_object_name_len
+ << "). Your backend filesystem appears to not support attrs large "
+ << "enough to handle the configured max rados name size. You may get "
+ << "unexpected ENAMETOOLONG errors on rados operations or buggy "
+ << "behavior"
+ << dendl;
+ }
+}
+
+uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects)
+{
+ uint64_t res = num_objects * blk_size / 2; // assume that, on average, each object uses an additional 1/2 block due to FS allocation granularity
+ return res;
+}
+
+int FileStore::apply_layout_settings(const coll_t &cid, int target_level)
+{
+ dout(20) << __FUNC__ << ": " << cid << " target level: "
+ << target_level << dendl;
+ Index index;
+ int r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ return index->apply_layout_settings(target_level);
+}
+
+
+// -- FSSuperblock --
+
+void FSSuperblock::encode(bufferlist &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ compat_features.encode(bl);
+ encode(omap_backend, bl);
+ ENCODE_FINISH(bl);
+}
+
+void FSSuperblock::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ compat_features.decode(bl);
+ if (struct_v >= 2)
+ decode(omap_backend, bl);
+ else
+ omap_backend = "leveldb";
+ DECODE_FINISH(bl);
+}
+
+void FSSuperblock::dump(Formatter *f) const
+{
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->dump_string("omap_backend", omap_backend);
+ f->close_section();
+}
+
+void FSSuperblock::generate_test_instances(list<FSSuperblock*>& o)
+{
+ FSSuperblock z;
+ o.push_back(new FSSuperblock(z));
+ CompatSet::FeatureSet feature_compat;
+ CompatSet::FeatureSet feature_ro_compat;
+ CompatSet::FeatureSet feature_incompat;
+ feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS);
+ z.compat_features = CompatSet(feature_compat, feature_ro_compat,
+ feature_incompat);
+ o.push_back(new FSSuperblock(z));
+ z.omap_backend = "rocksdb";
+ o.push_back(new FSSuperblock(z));
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore.osr(" << this << ") "
+
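+// record each object touched by this op in the 'applying' hash multimap so
+// that readers can block on it via wait_for_apply()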
+void FileStore::OpSequencer::_register_apply(Op *o)
+{
+ if (o->registered_apply) {
+ dout(20) << __func__ << " " << o << " already registered" << dendl;
+ return;
+ }
+ o->registered_apply = true;
+ for (auto& t : o->tls) {
+ for (auto& i : t.get_object_index()) {
+ uint32_t key = i.first.hobj.get_hash();
+ applying.emplace(make_pair(key, &i.first));
+ dout(20) << __func__ << " " << o << " " << i.first << " ("
+ << &i.first << ")" << dendl;
+ }
+ }
+}
+
+void FileStore::OpSequencer::_unregister_apply(Op *o)
+{
+ ceph_assert(o->registered_apply);
+ for (auto& t : o->tls) {
+ for (auto& i : t.get_object_index()) {
+ uint32_t key = i.first.hobj.get_hash();
+ auto p = applying.find(key);
+ bool removed = false;
+ while (p != applying.end() &&
+ p->first == key) {
+ if (p->second == &i.first) {
+ dout(20) << __func__ << " " << o << " " << i.first << " ("
+ << &i.first << ")" << dendl;
+ applying.erase(p);
+ removed = true;
+ break;
+ }
+ ++p;
+ }
+ ceph_assert(removed);
+ }
+ }
+}
+
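+// block until no in-flight (registered but not yet dequeued) op in this
+// sequencer touches oid; read paths call this to observe applied data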
+void FileStore::OpSequencer::wait_for_apply(const ghobject_t& oid)
+{
+ std::unique_lock l{qlock};
+ uint32_t key = oid.hobj.get_hash();
+retry:
+ while (true) {
+ // search all items in hash slot for a matching object
+ auto p = applying.find(key);
+ while (p != applying.end() &&
+ p->first == key) {
+ if (*p->second == oid) {
+ dout(20) << __func__ << " " << oid << " waiting on " << p->second
+ << dendl;
+ cond.wait(l);
+ goto retry;
+ }
+ ++p;
+ }
+ break;
+ }
+ dout(20) << __func__ << " " << oid << " done" << dendl;
+}
diff --git a/src/os/filestore/FileStore.h b/src/os/filestore/FileStore.h
new file mode 100644
index 000000000..324dbbe4d
--- /dev/null
+++ b/src/os/filestore/FileStore.h
@@ -0,0 +1,944 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILESTORE_H
+#define CEPH_FILESTORE_H
+
+#include "include/types.h"
+
+#include <map>
+#include <deque>
+#include <atomic>
+#include <fstream>
+
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/unordered_map.h"
+
+#include "include/ceph_assert.h"
+
+#include "os/ObjectStore.h"
+#include "JournalingObjectStore.h"
+
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/perf_counters.h"
+#include "common/zipkin_trace.h"
+
+#include "common/ceph_mutex.h"
+#include "HashIndex.h"
+#include "IndexManager.h"
+#include "os/ObjectMap.h"
+#include "SequencerPosition.h"
+#include "FDCache.h"
+#include "WBThrottle.h"
+
+#include "include/uuid.h"
+
+#if defined(__linux__)
+# ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683EUL
+# endif
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342UL
+# endif
+# ifndef ZFS_SUPER_MAGIC
+#define ZFS_SUPER_MAGIC 0x2fc12fc1UL
+# endif
+#endif
+
+
+class FileStoreBackend;
+
+#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
+
+enum {
+ l_filestore_first = 84000,
+ l_filestore_journal_queue_ops,
+ l_filestore_journal_queue_bytes,
+ l_filestore_journal_ops,
+ l_filestore_journal_bytes,
+ l_filestore_journal_latency,
+ l_filestore_journal_wr,
+ l_filestore_journal_wr_bytes,
+ l_filestore_journal_full,
+ l_filestore_committing,
+ l_filestore_commitcycle,
+ l_filestore_commitcycle_interval,
+ l_filestore_commitcycle_latency,
+ l_filestore_op_queue_max_ops,
+ l_filestore_op_queue_ops,
+ l_filestore_ops,
+ l_filestore_op_queue_max_bytes,
+ l_filestore_op_queue_bytes,
+ l_filestore_bytes,
+ l_filestore_apply_latency,
+ l_filestore_queue_transaction_latency_avg,
+ l_filestore_sync_pause_max_lat,
+ l_filestore_last,
+};
+
+class FSSuperblock {
+public:
+ CompatSet compat_features;
+ std::string omap_backend;
+
+ FSSuperblock() { }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<FSSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(FSSuperblock)
+
+inline std::ostream& operator<<(std::ostream& out, const FSSuperblock& sb)
+{
+ return out << "sb(" << sb.compat_features << "): "
+ << sb.omap_backend;
+}
+
+class FileStore : public JournalingObjectStore,
+ public md_config_obs_t
+{
+ static const uint32_t target_version = 4;
+public:
+ uint32_t get_target_version() {
+ return target_version;
+ }
+
+ static int get_block_device_fsid(CephContext* cct, const std::string& path,
+ uuid_d *fsid);
+ struct FSPerfTracker {
+ PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+ PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+ objectstore_perf_stat_t get_cur_stats() const {
+ objectstore_perf_stat_t ret;
+ ret.os_commit_latency_ns = os_commit_latency_ns.current_avg();
+ ret.os_apply_latency_ns = os_apply_latency_ns.current_avg();
+ return ret;
+ }
+
+ void update_from_perfcounters(PerfCounters &logger);
+ } perf_tracker;
+ objectstore_perf_stat_t get_cur_stats() override {
+ perf_tracker.update_from_perfcounters(*logger);
+ return perf_tracker.get_cur_stats();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+
+private:
+ std::string internal_name; ///< internal name, used to name the perfcounter instance
+ std::string basedir, journalpath;
+ osflagbits_t generic_flags;
+ std::string current_fn;
+ std::string current_op_seq_fn;
+ std::string omap_dir;
+ uuid_d fsid;
+
+ size_t blk_size; ///< fs block size
+
+ int fsid_fd, op_fd, basedir_fd, current_fd;
+
+ FileStoreBackend *backend;
+
+ void create_backend(unsigned long f_type);
+
+ std::string devname;
+
+ int vdo_fd = -1;
+ std::string vdo_name;
+
+ deque<uint64_t> snaps;
+
+ // Indexed Collections
+ IndexManager index_manager;
+ int get_index(const coll_t& c, Index *index);
+ int init_index(const coll_t& c);
+
+ bool _need_temp_object_collection(const coll_t& cid, const ghobject_t& oid) {
+ // - normal temp case: cid is pg, object is temp (pool < -1)
+ // - hammer temp case: cid is pg (or already temp), object pool is -1
+ return cid.is_pg() && oid.hobj.pool <= -1;
+ }
+ void init_temp_collections();
+
+ void handle_eio();
+
+ // ObjectMap
+ boost::scoped_ptr<ObjectMap> object_map;
+
+ // helper fns
+ int get_cdir(const coll_t& cid, char *s, int len);
+
+ /// read a uuid from fd
+ int read_fsid(int fd, uuid_d *uuid);
+
+ /// lock fsid_fd
+ int lock_fsid();
+
+ // sync thread
+ ceph::mutex lock = ceph::make_mutex("FileStore::lock");
+ bool force_sync;
+ ceph::condition_variable sync_cond;
+
+ ceph::mutex sync_entry_timeo_lock = ceph::make_mutex("FileStore::sync_entry_timeo_lock");
+ SafeTimer timer;
+
+ std::list<Context*> sync_waiters;
+ bool stop;
+ void sync_entry();
+ struct SyncThread : public Thread {
+ FileStore *fs;
+ explicit SyncThread(FileStore *f) : fs(f) {}
+ void *entry() override {
+ fs->sync_entry();
+ return 0;
+ }
+ } sync_thread;
+
+ // -- op workqueue --
+ struct Op {
+ utime_t start;
+ uint64_t op;
+ std::vector<Transaction> tls;
+ Context *onreadable, *onreadable_sync;
+ uint64_t ops, bytes;
+ TrackedOpRef osd_op;
+ ZTracer::Trace trace;
+ bool registered_apply = false;
+ };
+ class OpSequencer : public CollectionImpl {
+ CephContext *cct;
+ // protects q, for the benefit of flush() (peek/dequeue are additionally serialized by apply_lock)
+ ceph::mutex qlock =
+ ceph::make_mutex("FileStore::OpSequencer::qlock", false);
+ std::list<Op*> q;
+ std::list<uint64_t> jq;
+ std::list<std::pair<uint64_t, Context*> > flush_commit_waiters;
+ ceph::condition_variable cond;
+ std::string osr_name_str;
+ /// hash of pointers to ghobject_t's for in-flight writes
+ std::unordered_multimap<uint32_t,const ghobject_t*> applying;
+ public:
+ // for apply mutual exclusion
+ ceph::mutex apply_lock =
+ ceph::make_mutex("FileStore::OpSequencer::apply_lock", false);
+ int id;
+ const char *osr_name;
+
+ /// get_max_uncompleted
+ bool _get_max_uncompleted(
+ uint64_t *seq ///< [out] max uncompleted seq
+ ) {
+ ceph_assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.back()->op;
+ if (!jq.empty() && jq.back() > *seq)
+ *seq = jq.back();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ /// get_min_uncompleted
+ bool _get_min_uncompleted(
+ uint64_t *seq ///< [out] min uncompleted seq
+ ) {
+ ceph_assert(seq);
+ *seq = 0;
+ if (q.empty() && jq.empty())
+ return true;
+
+ if (!q.empty())
+ *seq = q.front()->op;
+ if (!jq.empty() && jq.front() < *seq)
+ *seq = jq.front();
+
+ return false;
+ } /// @returns true if both queues are empty
+
+ void _wake_flush_waiters(std::list<Context*> *to_queue) {
+ uint64_t seq;
+ if (_get_min_uncompleted(&seq))
+ seq = -1;
+
+ for (auto i = flush_commit_waiters.begin();
+ i != flush_commit_waiters.end() && i->first < seq;
+ flush_commit_waiters.erase(i++)) {
+ to_queue->push_back(i->second);
+ }
+ }
+
+ void queue_journal(Op *o) {
+ std::lock_guard l{qlock};
+ jq.push_back(o->op);
+ _register_apply(o);
+ }
+ void dequeue_journal(std::list<Context*> *to_queue) {
+ std::lock_guard l{qlock};
+ jq.pop_front();
+ cond.notify_all();
+ _wake_flush_waiters(to_queue);
+ }
+ void queue(Op *o) {
+ std::lock_guard l{qlock};
+ q.push_back(o);
+ _register_apply(o);
+ o->trace.keyval("queue depth", q.size());
+ }
+ void _register_apply(Op *o);
+ void _unregister_apply(Op *o);
+ void wait_for_apply(const ghobject_t& oid);
+ Op *peek_queue() {
+ std::lock_guard l{qlock};
+ ceph_assert(ceph_mutex_is_locked(apply_lock));
+ return q.front();
+ }
+
+ Op *dequeue(std::list<Context*> *to_queue) {
+ ceph_assert(to_queue);
+ ceph_assert(ceph_mutex_is_locked(apply_lock));
+ std::lock_guard l{qlock};
+ Op *o = q.front();
+ q.pop_front();
+ cond.notify_all();
+ _unregister_apply(o);
+ _wake_flush_waiters(to_queue);
+ return o;
+ }
+
+ void flush() override {
+ std::unique_lock l{qlock};
+ // wait forever
+ cond.wait(l, [this] { return !cct->_conf->filestore_blackhole; });
+
+ // get max for journal _or_ op queues
+ uint64_t seq = 0;
+ if (!q.empty())
+ seq = q.back()->op;
+ if (!jq.empty() && jq.back() > seq)
+ seq = jq.back();
+
+ if (seq) {
+ // wait for everything prior to our watermark to drain through either/both queues
+ cond.wait(l, [seq, this] {
+ return ((q.empty() || q.front()->op > seq) &&
+ (jq.empty() || jq.front() > seq));
+ });
+ }
+ }
+ bool flush_commit(Context *c) override {
+ std::lock_guard l{qlock};
+ uint64_t seq = 0;
+ if (_get_max_uncompleted(&seq)) {
+ return true;
+ } else {
+ flush_commit_waiters.push_back(std::make_pair(seq, c));
+ return false;
+ }
+ }
+
+ private:
+ FRIEND_MAKE_REF(OpSequencer);
+ OpSequencer(CephContext* cct, int i, coll_t cid)
+ : CollectionImpl(cct, cid),
+ cct(cct),
+ osr_name_str(stringify(cid)),
+ id(i),
+ osr_name(osr_name_str.c_str()) {}
+ ~OpSequencer() override {
+ ceph_assert(q.empty());
+ }
+ };
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ ceph::mutex coll_lock = ceph::make_mutex("FileStore::coll_lock");
+ std::map<coll_t,OpSequencerRef> coll_map;
+
+ friend std::ostream& operator<<(std::ostream& out, const OpSequencer& s);
+
+ FDCache fdcache;
+ WBThrottle wbthrottle;
+
+ std::atomic<int64_t> next_osr_id = { 0 };
+ bool m_disable_wbthrottle;
+ deque<OpSequencer*> op_queue;
+ BackoffThrottle throttle_ops, throttle_bytes;
+ const int m_ondisk_finisher_num;
+ const int m_apply_finisher_num;
+ std::vector<Finisher*> ondisk_finishers;
+ std::vector<Finisher*> apply_finishers;
+
+ ThreadPool op_tp;
+ struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
+ FileStore *store;
+ OpWQ(FileStore *fs,
+ ceph::timespan timeout,
+ ceph::timespan suicide_timeout,
+ ThreadPool *tp)
+ : ThreadPool::WorkQueue<OpSequencer>("FileStore::OpWQ",
+ timeout, suicide_timeout, tp),
+ store(fs) {}
+
+ bool _enqueue(OpSequencer *osr) override {
+ store->op_queue.push_back(osr);
+ return true;
+ }
+ void _dequeue(OpSequencer *o) override {
+ ceph_abort();
+ }
+ bool _empty() override {
+ return store->op_queue.empty();
+ }
+ OpSequencer *_dequeue() override {
+ if (store->op_queue.empty())
+ return nullptr;
+ OpSequencer *osr = store->op_queue.front();
+ store->op_queue.pop_front();
+ return osr;
+ }
+ void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) override {
+ store->_do_op(osr, handle);
+ }
+ void _process_finish(OpSequencer *osr) override {
+ store->_finish_op(osr);
+ }
+ void _clear() override {
+ ceph_assert(store->op_queue.empty());
+ }
+ } op_wq;
+
+ void _do_op(OpSequencer *o, ThreadPool::TPHandle &handle);
+ void _finish_op(OpSequencer *o);
+ Op *build_op(std::vector<Transaction>& tls,
+ Context *onreadable, Context *onreadable_sync,
+ TrackedOpRef osd_op);
+ void queue_op(OpSequencer *osr, Op *o);
+ void op_queue_reserve_throttle(Op *o);
+ void op_queue_release_throttle(Op *o);
+ void _journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk);
+ friend struct C_JournaledAhead;
+
+ void new_journal();
+
+ PerfCounters *logger;
+
+ ZTracer::Endpoint trace_endpoint;
+
+public:
+ int lfn_find(const ghobject_t& oid, const Index& index,
+ IndexedPath *path = nullptr);
+ int lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length);
+ int lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf);
+ int lfn_open(
+ const coll_t& cid,
+ const ghobject_t& oid,
+ bool create,
+ FDRef *outfd,
+ Index *index = nullptr);
+
+ void lfn_close(FDRef fd);
+ int lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) ;
+ int lfn_unlink(const coll_t& cid, const ghobject_t& o, const SequencerPosition &spos,
+ bool force_clear_omap=false);
+
+public:
+ FileStore(CephContext* cct, const std::string &base, const std::string &jdev,
+ osflagbits_t flags = 0,
+ const char *internal_name = "filestore", bool update_to=false);
+ ~FileStore() override;
+
+ std::string get_type() override {
+ return "filestore";
+ }
+
+ int _detect_fs();
+ int _sanity_check_fs();
+
+ bool test_mount_in_use() override;
+ int read_op_seq(uint64_t *seq);
+ int write_op_seq(int, uint64_t seq);
+ int mount() override;
+ int umount() override;
+
+ int validate_hobject_key(const hobject_t &obj) const override;
+
+ unsigned get_max_attr_name_length() override {
+ // xattr limit is 128; leave room for our prefixes (user.ceph._),
+ // some margin, and cap at 100
+ return 100;
+ }
+ int mkfs() override;
+ int mkjournal() override;
+ bool wants_journal() override {
+ return true;
+ }
+ bool allows_journal() override {
+ return true;
+ }
+ bool needs_journal() override {
+ return false;
+ }
+
+ bool is_sync_onreadable() const override {
+ return false;
+ }
+
+ bool is_rotational() override;
+ bool is_journal_rotational() override;
+
+ void dump_perf_counters(ceph::Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+
+ int flush_cache(std::ostream *os = NULL) override;
+ int write_version_stamp();
+ int version_stamp_is_valid(uint32_t *version);
+ int update_version_stamp();
+ int upgrade() override;
+
+ bool can_sort_nibblewise() override {
+ return true; // i support legacy sort order
+ }
+
+ void collect_metadata(std::map<std::string,std::string> *pm) override;
+ int get_devices(std::set<std::string> *ls) override;
+
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap) override;
+
+ int _do_transactions(
+ std::vector<Transaction> &tls, uint64_t op_seq,
+ ThreadPool::TPHandle *handle,
+ const char *osr_name);
+ int do_transactions(std::vector<Transaction> &tls, uint64_t op_seq) override {
+ return _do_transactions(tls, op_seq, nullptr, "replay");
+ }
+ void _do_transaction(
+ Transaction& t, uint64_t op_seq, int trans_num,
+ ThreadPool::TPHandle *handle, const char *osr_name);
+
+ CollectionHandle open_collection(const coll_t& c) override;
+ CollectionHandle create_new_collection(const coll_t& c) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ int queue_transactions(CollectionHandle& ch, std::vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = nullptr) override;
+
+ /**
+ * set replay guard xattr on given file
+ *
+ * This will ensure that we will not replay this (or any previous) operation
+ * against this particular inode/object.
+ *
+ * @param fd open file descriptor for the file/object
+ * @param spos sequencer position of the last operation we should not replay
+ */
+ void _set_replay_guard(int fd,
+ const SequencerPosition& spos,
+ const ghobject_t *oid=0,
+ bool in_progress=false);
+ void _set_replay_guard(const coll_t& cid,
+ const SequencerPosition& spos,
+ bool in_progress);
+ void _set_global_replay_guard(const coll_t& cid,
+ const SequencerPosition &spos);
+
+ /// close a replay guard opened with in_progress=true
+ void _close_replay_guard(int fd, const SequencerPosition& spos,
+ const ghobject_t *oid=0);
+ void _close_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+
+ /**
+ * check replay guard xattr on given file
+ *
+ * Check the current position against any marker on the file that
+ * indicates which operations have already been applied. If the
+ * current or a newer operation has been marked as applied, we
+ * should not replay the current operation again.
+ *
+ * If we are not replaying the journal, we always return true. It
+ * is only on replay that we might return false, indicating that the
+ * operation should not be performed (again).
+ *
+ * @param fd open fd on the file/object in question
+ * @param spos sequencer position for an operation we could apply/replay
+ * @return 1 if we can apply (maybe replay) this operation, -1 if spos has already been applied, 0 if it was in progress
+ */
+ int _check_replay_guard(int fd, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const SequencerPosition& spos);
+ int _check_replay_guard(const coll_t& cid, const ghobject_t &oid, const SequencerPosition& pos);
+ int _check_global_replay_guard(const coll_t& cid, const SequencerPosition& spos);
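+
+  // Typical usage (sketch): in _do_transaction(), operations are roughly
+  // gated on the guard as
+  //   if (_check_replay_guard(cid, oid, spos) > 0)
+  //     r = _touch(cid, oid);
+  // so an op at or before an already-applied SequencerPosition is
+  // silently skipped during journal replay.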
+
+ // ------------------
+ // objects
+ int pick_object_revision_lt(ghobject_t& oid) {
+ return 0;
+ }
+ using ObjectStore::exists;
+ bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+ using ObjectStore::stat;
+ int stat(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override;
+ using ObjectStore::set_collection_opts;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ using ObjectStore::read;
+ int read(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) override;
+ int _do_fiemap(int fd, uint64_t offset, size_t len,
+ std::map<uint64_t, uint64_t> *m);
+ int _do_seek_hole_data(int fd, uint64_t offset, size_t len,
+ std::map<uint64_t, uint64_t> *m);
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& bl) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
+ int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid,
+ const SequencerPosition& spos);
+ int _clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff,
+ const SequencerPosition& spos);
+ int _do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc=false);
+ int _remove(const coll_t& cid, const ghobject_t& oid, const SequencerPosition &spos);
+
+ int _fgetattr(int fd, const char *name, ceph::bufferptr& bp);
+ int _fgetattrs(int fd, std::map<std::string, ceph::bufferptr>& aset);
+ int _fsetattrs(int fd, std::map<std::string, ceph::bufferptr> &aset);
+
+ void do_force_sync();
+ void start_sync(Context *onsafe);
+ void sync();
+ void _flush_op_queue();
+ void flush();
+ void sync_and_flush();
+
+ int flush_journal() override;
+ int dump_journal(std::ostream& out) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override { return fsid; }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override;
+
+ // DEBUG read error injection; an object is removed from both sets on delete()
+ ceph::mutex read_error_lock = ceph::make_mutex("FileStore::read_error_lock");
+ std::set<ghobject_t> data_error_set; // read() will return -EIO
+ std::set<ghobject_t> mdata_error_set; // getattr(),stat() will return -EIO
+ void inject_data_error(const ghobject_t &oid) override;
+ void inject_mdata_error(const ghobject_t &oid) override;
+
+ void compact() override {
+ ceph_assert(object_map);
+ object_map->compact();
+ }
+
+ bool has_builtin_csum() const override {
+ return false;
+ }
+
+ void debug_obj_on_delete(const ghobject_t &oid);
+ bool debug_data_eio(const ghobject_t &oid);
+ bool debug_mdata_eio(const ghobject_t &oid);
+
+ int snapshot(const std::string& name) override;
+
+ // attrs
+ using ObjectStore::getattr;
+ using ObjectStore::getattrs;
+ int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::bufferptr &bp) override;
+ int getattrs(CollectionHandle& c, const ghobject_t& oid, std::map<std::string,ceph::bufferptr>& aset) override;
+
+ int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::bufferptr>& aset,
+ const SequencerPosition &spos);
+ int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name,
+ const SequencerPosition &spos);
+ int _rmattrs(const coll_t& cid, const ghobject_t& oid,
+ const SequencerPosition &spos);
+
+ int _collection_remove_recursive(const coll_t &cid,
+ const SequencerPosition &spos);
+
+ int _collection_set_bits(const coll_t& cid, int bits);
+
+ // collections
+ using ObjectStore::collection_list;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(CollectionHandle& c,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next) override {
+ c->flush();
+ return collection_list(c->cid, start, end, max, ls, next);
+ }
+ int collection_list(const coll_t& cid,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next);
+ int list_collections(std::vector<coll_t>& ls) override;
+ int list_collections(std::vector<coll_t>& ls, bool include_temp);
+ int collection_stat(const coll_t& c, struct stat *st);
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override {
+ c->flush();
+ return collection_empty(c->cid, empty);
+ }
+ int collection_empty(const coll_t& cid, bool *empty);
+
+ // omap (see ObjectStore.h for documentation)
+ using ObjectStore::omap_get;
+ int omap_get(CollectionHandle& c, const ghobject_t &oid, ceph::buffer::list *header,
+ std::map<std::string, ceph::buffer::list> *out) override;
+ using ObjectStore::omap_get_header;
+ int omap_get_header(
+ CollectionHandle& c,
+ const ghobject_t &oid,
+ ceph::buffer::list *out,
+ bool allow_eio = false) override;
+ using ObjectStore::omap_get_keys;
+ int omap_get_keys(CollectionHandle& c, const ghobject_t &oid, std::set<std::string> *keys) override;
+ using ObjectStore::omap_get_values;
+ int omap_get_values(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
+ std::map<std::string, ceph::buffer::list> *out) override;
+ using ObjectStore::omap_check_keys;
+ int omap_check_keys(CollectionHandle& c, const ghobject_t &oid, const std::set<std::string> &keys,
+ std::set<std::string> *out) override;
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(CollectionHandle& c, const ghobject_t &oid) override;
+ ObjectMap::ObjectMapIterator get_omap_iterator(const coll_t& cid, const ghobject_t &oid);
+
+ int _create_collection(const coll_t& c, int bits,
+ const SequencerPosition &spos);
+ int _destroy_collection(const coll_t& c);
+ /**
+ * Give an expected number of objects hint to the collection.
+ *
+ * @param c - collection id.
+ * @param pg_num - pg number of the pool this collection belongs to
+ * @param expected_num_objs - expected number of objects in this collection
+ * @param spos - sequence position
+ *
+ * @return 0 on success, an error code otherwise
+ */
+ int _collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num,
+ uint64_t expected_num_objs,
+ const SequencerPosition &spos);
+ int _collection_add(const coll_t& c, const coll_t& ocid, const ghobject_t& oid,
+ const SequencerPosition& spos);
+ int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t c, const ghobject_t& o,
+ const SequencerPosition& spos,
+ bool ignore_enoent = false);
+
+ int _set_alloc_hint(const coll_t& cid, const ghobject_t& oid,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+ void dump_start(const std::string& file);
+ void dump_stop();
+ void dump_transactions(std::vector<Transaction>& ls, uint64_t seq, OpSequencer *osr);
+
+ virtual int apply_layout_settings(const coll_t &cid, int target_level);
+
+ void get_db_statistics(ceph::Formatter* f) override;
+
+private:
+ void _inject_failure();
+
+ // omap
+ int _omap_clear(const coll_t& cid, const ghobject_t &oid,
+ const SequencerPosition &spos);
+ int _omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+ const std::map<std::string, ceph::buffer::list> &aset,
+ const SequencerPosition &spos);
+ int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, const std::set<std::string> &keys,
+ const SequencerPosition &spos);
+ int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const std::string& first, const std::string& last,
+ const SequencerPosition &spos);
+ int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl,
+ const SequencerPosition &spos);
+ int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest,
+ const SequencerPosition &spos);
+ int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest,
+ const SequencerPosition &spos);
+
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+ int set_throttle_params();
+ float m_filestore_commit_timeout;
+ bool m_filestore_journal_parallel;
+ bool m_filestore_journal_trailing;
+ bool m_filestore_journal_writeahead;
+ int m_filestore_fiemap_threshold;
+ double m_filestore_max_sync_interval;
+ double m_filestore_min_sync_interval;
+ bool m_filestore_fail_eio;
+ bool m_filestore_fadvise;
+ int do_update;
+ bool m_journal_dio, m_journal_aio, m_journal_force_aio;
+ std::string m_osd_rollback_to_cluster_snap;
+ bool m_osd_use_stale_snap;
+ bool m_filestore_do_dump;
+ std::ofstream m_filestore_dump;
+ ceph::JSONFormatter m_filestore_dump_fmt;
+ std::atomic<int64_t> m_filestore_kill_at = { 0 };
+ bool m_filestore_sloppy_crc;
+ int m_filestore_sloppy_crc_block_size;
+ uint64_t m_filestore_max_alloc_hint_size;
+ unsigned long m_fs_type;
+
+ // Determine xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+ uint32_t m_filestore_max_xattr_value_size;
+
+ FSSuperblock superblock;
+
+ /**
+ * write_superblock()
+ *
+ * Write superblock to persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int write_superblock();
+
+ /**
+ * read_superblock()
+ *
+ * Fill in FileStore::superblock by reading persistent storage
+ *
+ * return value: 0 on success, otherwise negative errno
+ */
+ int read_superblock();
+
+ friend class FileStoreBackend;
+ friend class TestFileStore;
+};
+
+std::ostream& operator<<(std::ostream& out, const FileStore::OpSequencer& s);
+
+struct fiemap;
+
+class FileStoreBackend {
+private:
+ FileStore *filestore;
+protected:
+ int get_basedir_fd() {
+ return filestore->basedir_fd;
+ }
+ int get_current_fd() {
+ return filestore->current_fd;
+ }
+ int get_op_fd() {
+ return filestore->op_fd;
+ }
+ size_t get_blksize() {
+ return filestore->blk_size;
+ }
+ const std::string& get_basedir_path() {
+ return filestore->basedir;
+ }
+ const std::string& get_journal_path() {
+ return filestore->journalpath;
+ }
+ const std::string& get_current_path() {
+ return filestore->current_fn;
+ }
+ int _copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) {
+ if (has_fiemap() || has_seek_data_hole()) {
+ return filestore->_do_sparse_copy_range(from, to, srcoff, len, dstoff);
+ } else {
+ return filestore->_do_copy_range(from, to, srcoff, len, dstoff);
+ }
+ }
+ int get_crc_block_size() {
+ return filestore->m_filestore_sloppy_crc_block_size;
+ }
+
+public:
+ explicit FileStoreBackend(FileStore *fs) : filestore(fs) {}
+ virtual ~FileStoreBackend() {}
+
+ CephContext* cct() const {
+ return filestore->cct;
+ }
+
+ static FileStoreBackend *create(unsigned long f_type, FileStore *fs);
+
+ virtual const char *get_name() = 0;
+ virtual int detect_features() = 0;
+ virtual int create_current() = 0;
+ virtual bool can_checkpoint() = 0;
+ virtual int list_checkpoints(std::list<std::string>& ls) = 0;
+ virtual int create_checkpoint(const std::string& name, uint64_t *cid) = 0;
+ virtual int sync_checkpoint(uint64_t id) = 0;
+ virtual int rollback_to(const std::string& name) = 0;
+ virtual int destroy_checkpoint(const std::string& name) = 0;
+ virtual int syncfs() = 0;
+ virtual bool has_fiemap() = 0;
+ virtual bool has_seek_data_hole() = 0;
+ virtual bool is_rotational() = 0;
+ virtual bool is_journal_rotational() = 0;
+ virtual int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) = 0;
+ virtual int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) = 0;
+ virtual int set_alloc_hint(int fd, uint64_t hint) = 0;
+ virtual bool has_splice() const = 0;
+
+ // hooks for (sloppy) crc tracking
+ virtual int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) = 0;
+ virtual int _crc_update_truncate(int fd, loff_t off) = 0;
+ virtual int _crc_update_zero(int fd, loff_t off, size_t len) = 0;
+ virtual int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) = 0;
+ virtual int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl,
+ std::ostream *out) = 0;
+};
+
+#endif
diff --git a/src/os/filestore/GenericFileStoreBackend.cc b/src/os/filestore/GenericFileStoreBackend.cc
new file mode 100644
index 000000000..d264622b7
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.cc
@@ -0,0 +1,475 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "GenericFileStoreBackend.h"
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+#include "common/blkdev.h"
+
+#include "common/SloppyCRCMap.h"
+#include "os/filestore/chain_xattr.h"
+
+#define SLOPPY_CRC_XATTR "user.cephos.scrc"
+
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
+
+#define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
+#define ALIGNED(x, by) (!((x) % (by)))
+#define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
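+// Worked example: ALIGN_DOWN(10, 4) == 8, ALIGN_UP(10, 4) == 12, and
+// ALIGN_UP(8, 4) == 8 (already-aligned values are left unchanged).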
+
+using std::ostream;
+using std::ostringstream;
+using std::string;
+
+using ceph::bufferptr;
+using ceph::bufferlist;
+
+GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
+ FileStoreBackend(fs),
+ ioctl_fiemap(false),
+ seek_data_hole(false),
+ use_splice(false),
+ m_filestore_fiemap(cct()->_conf->filestore_fiemap),
+ m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
+ m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
+ m_filestore_splice(cct()->_conf->filestore_splice)
+{
+ // rotational?
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_basedir_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ return;
+ }
+ BlkDev blkdev(fd);
+ m_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " basedir " << fn
+ << " rotational " << (int)m_rotational << dendl;
+ ::close(fd);
+ }
+ // journal rotational?
+ {
+ // NOTE: the below won't work on btrfs; we'll assume rotational.
+ string fn = get_journal_path();
+ int fd = ::open(fn.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ return;
+ }
+ BlkDev blkdev(fd);
+ m_journal_rotational = blkdev.is_rotational();
+ dout(20) << __func__ << " journal filename " << fn.c_str()
+ << " journal rotational " << (int)m_journal_rotational << dendl;
+ ::close(fd);
+ }
+}
+
+int GenericFileStoreBackend::detect_features()
+{
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
+
+ int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, 0644);
+ if (fd < 0) {
+ fd = -errno;
+ derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
+ return fd;
+ }
+
+ // ext4 has a bug in older kernels where fiemap will return an empty
+ // result in some cases. this is a file layout that triggers the bug
+ // on 2.6.34-rc5.
+ int v[] = {
+ 0x0000000000016000, 0x0000000000007000,
+ 0x000000000004a000, 0x0000000000007000,
+ 0x0000000000060000, 0x0000000000001000,
+ 0x0000000000061000, 0x0000000000008000,
+ 0x0000000000069000, 0x0000000000007000,
+ 0x00000000000a3000, 0x000000000000c000,
+ 0x000000000024e000, 0x000000000000c000,
+ 0x000000000028b000, 0x0000000000009000,
+ 0x00000000002b1000, 0x0000000000003000,
+ 0, 0
+ };
+ for (int i=0; v[i]; i++) {
+ int off = v[i++];
+ int len = v[i];
+
+ // write a large extent
+ char buf[len];
+ memset(buf, 1, sizeof(buf));
+ int r = ::lseek(fd, off, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ r = write(fd, buf, sizeof(buf));
+ if (r < 0) {
+ r = -errno;
+ derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return r;
+ }
+ }
+
+ // fiemap an extent inside that
+ if (!m_filestore_fiemap) {
+ dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ struct fiemap *fiemap;
+ int r = do_fiemap(fd, 2430421, 59284, &fiemap);
+ if (r < 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ if (fiemap->fm_mapped_extents == 0) {
+ dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
+ ioctl_fiemap = false;
+ } else {
+ dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
+ ioctl_fiemap = true;
+ }
+ free(fiemap);
+ }
+ }
+
+ // SEEK_DATA/SEEK_HOLE detection
+ if (!m_filestore_seek_data_hole) {
+ dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
+ seek_data_hole = false;
+ } else {
+#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
+ // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
+ // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
+ // Fall back to use fiemap.
+ off_t hole_pos;
+
+ hole_pos = lseek(fd, 0, SEEK_HOLE);
+ if (hole_pos < 0) {
+ if (errno == EINVAL) {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
+ seek_data_hole = false;
+ } else {
+ derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ return -errno;
+ }
+ } else {
+ dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
+ seek_data_hole = true;
+ }
+#endif
+ }
+
+ //splice detection
+#ifdef CEPH_HAVE_SPLICE
+ if (!m_filestore_splice) {
+ dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
+ use_splice = false;
+ } else {
+ int pipefd[2];
+ loff_t off_in = 0;
+ int r;
+ if (pipe_cloexec(pipefd, 0) < 0) {
+ int e = errno;
+ dout(0) << "detect_features: splice pipe met error " << cpp_strerror(e) << dendl;
+ } else {
+ lseek(fd, 0, SEEK_SET);
+ r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
+ if (!(r < 0 && errno == EINVAL)) {
+ use_splice = true;
+ dout(0) << "detect_features: splice is supported" << dendl;
+ } else
+ dout(0) << "detect_features: splice is NOT supported" << dendl;
+ close(pipefd[0]);
+ close(pipefd[1]);
+ }
+ }
+#endif
+ ::unlink(fn);
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+
+
+ bool have_syncfs = false;
+#ifdef HAVE_SYS_SYNCFS
+ if (::syncfs(get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
+ }
+#elif defined(SYS_syncfs)
+ if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#elif defined(__NR_syncfs)
+ if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
+ have_syncfs = true;
+ } else {
+ dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
+ }
+#endif
+ if (!have_syncfs) {
+ dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
+ } else {
+ dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
+ dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int GenericFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
+ ret = -EINVAL;
+ }
+ } else {
+ ret = ::mkdir(get_current_path().c_str(), 0755);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
+ }
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::syncfs()
+{
+ int ret;
+ if (m_filestore_fsync_flushes_journal_data) {
+ dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
+ // make the file system's journal commit.
+ // this works with ext3, but NOT ext4
+ ret = ::fsync(get_op_fd());
+ if (ret < 0)
+ ret = -errno;
+ } else {
+ dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
+ ret = sync_filesystem(get_current_fd());
+ }
+ return ret;
+}
+
+int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
+{
+ struct fiemap *fiemap = NULL;
+ struct fiemap *_realloc_fiemap = NULL;
+ int size;
+ int ret;
+
+ fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
+ if (!fiemap)
+ return -ENOMEM;
+ /*
+ * There is a bug in xfs fiemap. Given (offset=3990, len=4096),
+ * the result is (logical=4096, len=4096), which misses the range
+ * [3990, 4096). Commit "xfs: fix rounding error of fiemap length parameter"
+ * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7) fixes this bug.
+ * Here, we align the offset down to CEPH_PAGE_SIZE to avoid this bug.
+ */
+ fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
+ fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
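+  // For example, assuming CEPH_PAGE_SIZE is 4096: start=3990, len=4096
+  // becomes fm_start=0 and fm_length=8086, so the aligned query still
+  // covers the whole original range [3990, 8086).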
+ fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+#endif
+ size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
+
+ _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
+ if (!_realloc_fiemap) {
+ ret = -ENOMEM;
+ goto done_err;
+ } else {
+ fiemap = _realloc_fiemap;
+ }
+
+ memset(fiemap->fm_extents, 0, size);
+
+ fiemap->fm_extent_count = fiemap->fm_mapped_extents;
+ fiemap->fm_mapped_extents = 0;
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ ret = -ENOTSUP;
+ goto done_err;
+#else
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
+ ret = -errno;
+ goto done_err;
+ }
+ *pfiemap = fiemap;
+#endif
+ return 0;
+
+done_err:
+ *pfiemap = NULL;
+ free(fiemap);
+ return ret;
+}
+
+
+int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
+{
+ char buf[100];
+ bufferptr bp;
+ int r = 0;
+ int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
+ if (l == -ENODATA) {
+ return 0;
+ }
+ if (l >= 0) {
+ bp = ceph::buffer::create(l);
+ memcpy(bp.c_str(), buf, l);
+ } else if (l == -ERANGE) {
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
+ if (l > 0) {
+ bp = ceph::buffer::create(l);
+ l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
+ }
+ }
+ bufferlist bl;
+ bl.append(std::move(bp));
+ auto p = bl.cbegin();
+ try {
+ decode(*cm, p);
+ }
+ catch (ceph::buffer::error &e) {
+ r = -EIO;
+ }
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
+{
+ bufferlist bl;
+ encode(*cm, bl);
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm.write(off, len, bl, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.truncate(off);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ scm.zero(off, len);
+ r = _crc_save(fd, &scm);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff)
+{
+ SloppyCRCMap scm_src(get_crc_block_size());
+ SloppyCRCMap scm_dst(get_crc_block_size());
+ int r = _crc_load_or_init(srcfd, &scm_src);
+ if (r < 0)
+ return r;
+ r = _crc_load_or_init(destfd, &scm_dst);
+ if (r < 0)
+ return r;
+ ostringstream ss;
+ scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
+ dout(30) << __func__ << "\n" << ss.str() << dendl;
+ r = _crc_save(destfd, &scm_dst);
+ return r;
+}
+
+int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
+ ostream *out)
+{
+ SloppyCRCMap scm(get_crc_block_size());
+ int r = _crc_load_or_init(fd, &scm);
+ if (r < 0)
+ return r;
+ return scm.read(off, len, bl, out);
+}
diff --git a/src/os/filestore/GenericFileStoreBackend.h b/src/os/filestore/GenericFileStoreBackend.h
new file mode 100644
index 000000000..de10bf948
--- /dev/null
+++ b/src/os/filestore/GenericFileStoreBackend.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_GENERICFILESTOREBACKEND_H
+#define CEPH_GENERICFILESTOREBACKEND_H
+
+#include "FileStore.h"
+
+class SloppyCRCMap;
+
+class GenericFileStoreBackend : public FileStoreBackend {
+private:
+ bool ioctl_fiemap;
+ bool seek_data_hole;
+ bool use_splice;
+ bool m_filestore_fiemap;
+ bool m_filestore_seek_data_hole;
+ bool m_filestore_fsync_flushes_journal_data;
+ bool m_filestore_splice;
+ bool m_rotational = true;
+ bool m_journal_rotational = true;
+public:
+ explicit GenericFileStoreBackend(FileStore *fs);
+ ~GenericFileStoreBackend() override {}
+
+ const char *get_name() override {
+ return "generic";
+ }
+ int detect_features() override;
+ int create_current() override;
+ bool can_checkpoint() override { return false; }
+ bool is_rotational() override {
+ return m_rotational;
+ }
+ bool is_journal_rotational() override {
+ return m_journal_rotational;
+ }
+ int list_checkpoints(std::list<std::string>& ls) override { return 0; }
+ int create_checkpoint(const std::string& name, uint64_t *cid) override { return -EOPNOTSUPP; }
+ int sync_checkpoint(uint64_t id) override { return -EOPNOTSUPP; }
+ int rollback_to(const std::string& name) override { return -EOPNOTSUPP; }
+ int destroy_checkpoint(const std::string& name) override { return -EOPNOTSUPP; }
+ int syncfs() override;
+ bool has_fiemap() override { return ioctl_fiemap; }
+ bool has_seek_data_hole() override { return seek_data_hole; }
+ int do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap) override;
+ int clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) override {
+ return _copy_range(from, to, srcoff, len, dstoff);
+ }
+ int set_alloc_hint(int fd, uint64_t hint) override { return -EOPNOTSUPP; }
+ bool has_splice() const override { return use_splice; }
+private:
+ int _crc_load_or_init(int fd, SloppyCRCMap *cm);
+ int _crc_save(int fd, SloppyCRCMap *cm);
+public:
+ int _crc_update_write(int fd, loff_t off, size_t len, const ceph::buffer::list& bl) override;
+ int _crc_update_truncate(int fd, loff_t off) override;
+ int _crc_update_zero(int fd, loff_t off, size_t len) override;
+ int _crc_update_clone_range(int srcfd, int destfd,
+ loff_t srcoff, size_t len, loff_t dstoff) override;
+ int _crc_verify_read(int fd, loff_t off, size_t len, const ceph::buffer::list& bl,
+ std::ostream *out) override;
+};
+#endif
diff --git a/src/os/filestore/HashIndex.cc b/src/os/filestore/HashIndex.cc
new file mode 100644
index 000000000..75c3d1b67
--- /dev/null
+++ b/src/os/filestore/HashIndex.cc
@@ -0,0 +1,1226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/osd_types.h"
+#include <errno.h>
+
+#include "HashIndex.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferptr;
+using ceph::bufferlist;
+
+const string HashIndex::SUBDIR_ATTR = "contents";
+const string HashIndex::SETTINGS_ATTR = "settings";
+const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
+
+/// hex digit to integer value
+int hex_to_int(char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 10;
+ ceph_abort();
+}
+
+/// int value to hex digit
+char int_to_hex(int v)
+{
+ ceph_assert(v < 16);
+ if (v < 10)
+ return '0' + v;
+ return 'A' + v - 10;
+}
+
+/// reverse bits in a nibble (0..15)
+int reverse_nibble_bits(int in)
+{
+ ceph_assert(in < 16);
+ return
+ ((in & 8) >> 3) |
+ ((in & 4) >> 1) |
+ ((in & 2) << 1) |
+ ((in & 1) << 3);
+}
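+// e.g. reverse_nibble_bits(0x1 /*0001*/) == 0x8 /*1000*/ and
+// reverse_nibble_bits(0xA /*1010*/) == 0x5 /*0101*/.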
+
+/// reverse nibble bits in a hex digit
+char reverse_hexdigit_bits(char c)
+{
+ return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
+}
+
+/// reverse nibble bits in a hex string
+string reverse_hexdigit_bits_string(string s)
+{
+ for (unsigned i=0; i<s.size(); ++i)
+ s[i] = reverse_hexdigit_bits(s[i]);
+ return s;
+}
+
+/// compare hex digit (as length 1 string) bitwise
+bool cmp_hexdigit_bitwise(const string& l, const string& r)
+{
+ ceph_assert(l.length() == 1 && r.length() == 1);
+ int lv = hex_to_int(l[0]);
+ int rv = hex_to_int(r[0]);
+ ceph_assert(lv < 16);
+ ceph_assert(rv < 16);
+ return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
+}
+
+/// compare hex digit string bitwise
+bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
+{
+ string ll = reverse_hexdigit_bits_string(l);
+ string rr = reverse_hexdigit_bits_string(r);
+ return ll < rr;
+}
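+// Under this ordering comparison is on the bit-reversed (LSB-first) value
+// rather than the plain hex value: '4' (0100 -> 0010 = 2) sorts before
+// '2' (0010 -> 0100 = 4), for example.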
+
+int HashIndex::cleanup() {
+ bufferlist bl;
+ int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0) {
+ // No in progress operations!
+ return 0;
+ }
+ auto i = bl.cbegin();
+ InProgressOp in_progress(i);
+ subdir_info_s info;
+ r = get_info(in_progress.path, &info);
+ if (r == -ENOENT) {
+ return end_split_or_merge(in_progress.path);
+ } else if (r < 0) {
+ return r;
+ }
+
+ if (in_progress.is_split())
+ return complete_split(in_progress.path, info);
+ else if (in_progress.is_merge())
+ return complete_merge(in_progress.path, info);
+ else if (in_progress.is_col_split()) {
+ for (vector<string>::iterator i = in_progress.path.begin();
+ i != in_progress.path.end();
+ ++i) {
+ vector<string> path(in_progress.path.begin(), i);
+ int r = reset_attr(path);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+ }
+ else
+ return -EINVAL;
+}
+
+int HashIndex::reset_attr(
+ const vector<string> &path)
+{
+ int exists = 0;
+ int r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists)
+ return 0;
+ map<string, ghobject_t> objects;
+ vector<string> subdirs;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ subdir_info_s info;
+ info.hash_level = path.size();
+ info.objs = objects.size();
+ info.subdirs = subdirs.size();
+ return set_info(path, info);
+}
+
+int HashIndex::col_split_level(
+ HashIndex &from,
+ HashIndex &to,
+ const vector<string> &path,
+ uint32_t inbits,
+ uint32_t match,
+ unsigned *mkdirred)
+{
+ /* For each subdir, move, recurse, or ignore based on comparing the low order
+ * bits of the hash represented by the subdir path with inbits, match passed
+ * in.
+ */
+ vector<string> subdirs;
+ int r = from.list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+
+ set<string> to_move;
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ uint32_t bits = 0;
+ uint32_t hash = 0;
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(*i);
+ path_to_hobject_hash_prefix(sub_path, &bits, &hash);
+ if (bits < inbits) {
+ if (hobject_t::match_hash(hash, bits, match)) {
+ r = col_split_level(
+ from,
+ to,
+ sub_path,
+ inbits,
+ match,
+ mkdirred);
+ if (r < 0)
+ return r;
+ if (*mkdirred > path.size())
+ *mkdirred = path.size();
+ } // else, skip, doesn't need to be moved or recursed into
+ } else {
+ if (hobject_t::match_hash(hash, inbits, match)) {
+ to_move.insert(*i);
+ }
+ } // else, skip, doesn't need to be moved or recursed into
+ }
+
+ /* Then, do the same for each object */
+ map<string, ghobject_t> objs_to_move;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->second.match(inbits, match)) {
+ objs_to_move.insert(*i);
+ }
+ }
+
+ if (objs_to_move.empty() && to_move.empty())
+ return 0;
+
+ // Make parent directories as needed
+ while (*mkdirred < path.size()) {
+ ++*mkdirred;
+ int exists = 0;
+ vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
+ r = to.path_exists(creating_path, &exists);
+ if (r < 0)
+ return r;
+ if (exists)
+ continue;
+ subdir_info_s info;
+ info.objs = 0;
+ info.subdirs = 0;
+ info.hash_level = creating_path.size();
+ if (*mkdirred < path.size() - 1)
+ info.subdirs = 1;
+ r = to.start_col_split(creating_path);
+ if (r < 0)
+ return r;
+ r = to.create_path(creating_path);
+ if (r < 0)
+ return r;
+ r = to.set_info(creating_path, info);
+ if (r < 0)
+ return r;
+ r = to.end_split_or_merge(creating_path);
+ if (r < 0)
+ return r;
+ }
+
+ subdir_info_s from_info;
+ subdir_info_s to_info;
+ r = from.get_info(path, &from_info);
+ if (r < 0)
+ return r;
+ r = to.get_info(path, &to_info);
+ if (r < 0)
+ return r;
+
+ from.start_col_split(path);
+ to.start_col_split(path);
+
+ // Do subdir moves
+ for (set<string>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ from_info.subdirs--;
+ to_info.subdirs++;
+ r = move_subdir(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+ for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
+ i != objs_to_move.end();
+ ++i) {
+ from_info.objs--;
+ to_info.objs++;
+ r = move_object(from, to, path, *i);
+ if (r < 0)
+ return r;
+ }
+
+
+ r = to.set_info(path, to_info);
+ if (r < 0)
+ return r;
+ r = from.set_info(path, from_info);
+ if (r < 0)
+ return r;
+ from.end_split_or_merge(path);
+ to.end_split_or_merge(path);
+ return 0;
+}
+
+int HashIndex::_merge(
+ uint32_t bits,
+ CollectionIndex* dest) {
+ dout(20) << __func__ << " bits " << bits << dendl;
+ ceph_assert(collection_version() == dest->collection_version());
+
+ vector<string> emptypath;
+
+ // pre-split to the common/target level so that any shared prefix DIR_?
+ // directories already exist at the destination. Since each directory
+ // corresponds to one nibble (4 bits) of the hash, the shared level is
+ // bits / 4.
+ unsigned shared = bits / 4;
+ dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
+ if (shared) {
+ split_dirs(emptypath, shared);
+ ((HashIndex*)dest)->split_dirs(emptypath, shared);
+ }
+
+ // now merge the contents
+ _merge_dirs(*this, *(HashIndex*)dest, emptypath);
+
+ return 0;
+}
+
+int HashIndex::_merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const vector<string>& path)
+{
+ dout(20) << __func__ << " path " << path << dendl;
+ int r;
+
+ vector<string> src_subs, dst_subs;
+ r = from.list_subdirs(path, &src_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_subdirs"
+ << dendl;
+ return r;
+ }
+ r = to.list_subdirs(path, &dst_subs);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "to.list_subdirs"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : src_subs) {
+ if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
+ // move it
+ r = move_subdir(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_subdir(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ } else {
+ // common, recurse!
+ vector<string> nested = path;
+ nested.push_back(i);
+ r = _merge_dirs(from, to, nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "rec _merge_dirs"
+ << dendl;
+ return r;
+ }
+
+ // now remove it
+ r = remove_path(nested);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "remove_path "
+ << nested
+ << dendl;
+ return r;
+ }
+ }
+ }
+
+ // objects
+ map<string, ghobject_t> objects;
+ r = from.list_objects(path, 0, 0, &objects);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "from.list_objects"
+ << dendl;
+ return r;
+ }
+
+ for (auto& i : objects) {
+ r = move_object(from, to, path, i);
+ if (r < 0) {
+ lgeneric_subdout(g_ceph_context,filestore,20) << __func__
+ << " r " << r << " from "
+ << "move_object(...,"
+ << path << "," << i << ")"
+ << dendl;
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
+int HashIndex::_split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest) {
+ ceph_assert(collection_version() == dest->collection_version());
+ unsigned mkdirred = 0;
+
+ return col_split_level(
+ *this,
+ *static_cast<HashIndex*>(dest),
+ vector<string>(),
+ bits,
+ match,
+ &mkdirred);
+}
+
+int HashIndex::split_dirs(const vector<string> &path, int target_level) {
+ dout(20) << __func__ << " " << path << " target level: "
+ << target_level << dendl;
+ subdir_info_s info;
+ int r = get_info(path, &info);
+ if (r < 0) {
+ dout(10) << "error looking up info for " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ if (must_split(info, target_level)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, " << info.hash_level
+ << " level, starting split in pg " << coll() << "." << dendl;
+ r = initiate_split(path, info);
+ if (r < 0) {
+ dout(10) << "error initiating split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ r = complete_split(path, info);
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
+ << dendl;
+ if (r < 0) {
+ dout(10) << "error completing split on " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0) {
+ dout(10) << "error listing subdirs of " << path << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ for (vector<string>::const_iterator it = subdirs.begin();
+ it != subdirs.end(); ++it) {
+ vector<string> subdir_path(path);
+ subdir_path.push_back(*it);
+ r = split_dirs(subdir_path, target_level);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ return r;
+}
+
+int HashIndex::apply_layout_settings(int target_level) {
+ vector<string> path;
+ dout(10) << __func__ << " split multiple = " << split_multiplier
+ << " merge threshold = " << merge_threshold
+ << " split rand factor = " << cct->_conf->filestore_split_rand_factor
+ << " target level = " << target_level
+ << dendl;
+ int r = write_settings();
+ if (r < 0)
+ return r;
+ return split_dirs(path, target_level);
+}
+
+int HashIndex::_init() {
+ subdir_info_s info;
+ vector<string> path;
+ int r = set_info(path, info);
+ if (r < 0)
+ return r;
+ return write_settings();
+}
+
+int HashIndex::write_settings() {
+ if (cct->_conf->filestore_split_rand_factor > 0) {
+ settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
+ } else {
+ settings.split_rand_factor = 0;
+ }
+ vector<string> path;
+ bufferlist bl;
+ settings.encode(bl);
+ return add_attr_path(path, SETTINGS_ATTR, bl);
+}
+
+int HashIndex::read_settings() {
+ vector<string> path;
+ bufferlist bl;
+ int r = get_attr_path(path, SETTINGS_ATTR, bl);
+ if (r == -ENODATA)
+ return 0;
+ if (r < 0) {
+ derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ auto it = bl.cbegin();
+ settings.decode(it);
+ dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
+ return 0;
+}
+
+/* LFNIndex virtual method implementations */
+int HashIndex::_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ subdir_info_s info;
+ int r;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs++;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_split(info)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, starting split in pg " << coll() << "." << dendl;
+ int r = initiate_split(path, info);
+ if (r < 0) {
+ derr << __func__ << " error starting split " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ } else {
+ r = complete_split(path, info);
+ if (r < 0) {
+ derr << __func__ << " error completing split " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ }
+ dout(1) << __func__ << " " << path << " split completed in pg " << coll()
+ << "." << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int HashIndex::_remove(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name) {
+ int r;
+ r = remove_object(path, oid);
+ if (r < 0)
+ return r;
+ subdir_info_s info;
+ r = get_info(path, &info);
+ if (r < 0)
+ return r;
+ info.objs--;
+ r = set_info(path, info);
+ if (r < 0)
+ return r;
+
+ if (must_merge(info)) {
+ dout(1) << __func__ << " " << path << " has " << info.objs
+ << " objects, starting merge in pg " << coll() << "." << dendl;
+ r = initiate_merge(path, info);
+ if (r < 0) {
+ derr << __func__ << " error starting merge " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ } else {
+ r = complete_merge(path, info);
+ if (r < 0) {
+ derr << __func__ << " error completing merge " << path << " in pg "
+ << coll() << ": " << cpp_strerror(r) << dendl;
+ ceph_assert(!cct->_conf->filestore_fail_eio);
+ }
+ dout(1) << __func__ << " " << path << " merge completed in pg " << coll()
+ << "." << dendl;
+ }
+ }
+
+ return 0;
+}
+
+int HashIndex::_lookup(const ghobject_t &oid,
+ vector<string> *path,
+ string *mangled_name,
+ int *hardlink) {
+ vector<string> path_comp;
+ get_path_components(oid, &path_comp);
+ vector<string>::iterator next = path_comp.begin();
+ int exists;
+ while (1) {
+ int r = path_exists(*path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ if (path->empty())
+ return -ENOENT;
+ path->pop_back();
+ break;
+ }
+ if (next == path_comp.end())
+ break;
+ path->push_back(*(next++));
+ }
+ return get_mangled_name(*path, oid, mangled_name, hardlink);
+}
+
+int HashIndex::_collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next) {
+ vector<string> path;
+ ghobject_t _next;
+ if (!next)
+ next = &_next;
+ *next = start;
+ dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
+ return list_by_hash(path, end, max_count, next, ls);
+}
+
+int HashIndex::prep_delete() {
+ return recursive_remove(vector<string>());
+}
+
+int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
+ int ret;
+ vector<string> path;
+ subdir_info_s root_info;
+ // Make sure there are neither objects nor sub-folders
+ // in this collection
+ ret = get_info(path, &root_info);
+ if (ret < 0)
+ return ret;
+
+ // Do the folder splitting first
+ ret = pre_split_folder(pg_num, expected_num_objs);
+ if (ret < 0)
+ return ret;
+ // Initialize the folder info starting from root
+ return init_split_folder(path, 0);
+}
+
+int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ // If folder merging is enabled (by setting the threshold positive),
+ // no need to split
+ if (merge_threshold > 0)
+ return 0;
+ const coll_t c = coll();
+ // Do not split if the expected number of objects in this collection is zero (by default)
+ if (expected_num_objs == 0)
+ return 0;
+
+ // Calculate the number of leaf folders (which actually store files)
+ // that need to be created
+ const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
+ uint64_t leavies = expected_num_objs / objs_per_folder ;
+ // No need to split
+ if (leavies == 0 || expected_num_objs == objs_per_folder)
+ return 0;
+
+ spg_t spgid;
+ if (!c.is_pg_prefix(&spgid))
+ return -EINVAL;
+ const ps_t ps = spgid.pgid.ps();
+
+ // the number of bits needed to represent pg_num - 1 (the highest pg id)
+ const int pg_num_bits = calc_num_bits(pg_num - 1);
+ ps_t tmp_id = ps;
+ // calculate the number of levels at which we only create one sub folder
+ int num = pg_num_bits / 4;
+ // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
+ // so that splitting starts at level 3
+ if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
+ --num;
+ }
+
+ int ret;
+ // Start with the levels that each have only one subfolder
+ vector<string> paths;
+ int dump_num = num;
+ while (num-- > 0) {
+ ps_t v = tmp_id & 0x0000000f;
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ tmp_id = tmp_id >> 4;
+ }
+
+ // Starting from here, we can split by creating multiple subfolders
+ const int left_bits = pg_num_bits - dump_num * 4;
+ // this variable denotes how many bits (for this level) can be
+ // used for sub folder splitting
+ int split_bits = 4 - left_bits;
+ // the logic below is inspired by rados.h#ceph_stable_mod;
+ // it basically determines how many sub-folders we should
+ // create for splitting
+ ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
+ if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
+ ++split_bits;
+ }
+ const uint32_t subs = (1 << split_bits);
+ // Calculate how many levels we create starting from here
+ int level = 0;
+ int level_limit = MAX_HASH_LEVEL - dump_num - 1;
+ uint64_t actual_leaves = subs;
+ while (actual_leaves < leavies && level < level_limit) {
+ ++level;
+ actual_leaves <<= 4;
+ }
+ for (uint32_t i = 0; i < subs; ++i) {
+ ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
+ int v = tmp_id | (i << ((4 - split_bits) % 4));
+ paths.push_back(to_hex(v));
+ ret = create_path(paths);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(paths, level);
+ if (ret < 0)
+ return ret;
+ paths.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
+{
+ // Get the number of sub directories for the current path
+ vector<string> subdirs;
+ int ret = list_subdirs(path, &subdirs);
+ if (ret < 0)
+ return ret;
+ subdir_info_s info;
+ info.subdirs = subdirs.size();
+ info.hash_level = hash_level;
+ ret = set_info(path, info);
+ if (ret < 0)
+ return ret;
+ ret = fsync_dir(path);
+ if (ret < 0)
+ return ret;
+
+ // Do the same for subdirs
+ vector<string>::const_iterator iter;
+ for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
+ path.push_back(*iter);
+ ret = init_split_folder(path, hash_level + 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_create_path(vector<string>& path, int level)
+{
+ if (level == 0)
+ return 0;
+ for (int i = 0; i < 16; ++i) {
+ path.push_back(to_hex(i));
+ int ret = create_path(path);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+ ret = recursive_create_path(path, level - 1);
+ if (ret < 0)
+ return ret;
+ path.pop_back();
+ }
+ return 0;
+}
+
+int HashIndex::recursive_remove(const vector<string> &path) {
+ return _recursive_remove(path, true);
+}
+
+int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
+ vector<string> subdirs;
+ dout(20) << __func__ << " path=" << path << dendl;
+ int r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+ map<string, ghobject_t> objects;
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ if (!objects.empty())
+ return -ENOTEMPTY;
+ vector<string> subdir(path);
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ subdir.push_back(*i);
+ r = _recursive_remove(subdir, false);
+ if (r < 0)
+ return r;
+ subdir.pop_back();
+ }
+ if (top)
+ return 0;
+ else
+ return remove_path(path);
+}
+
+int HashIndex::start_col_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_split(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::SPLIT, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::start_merge(const vector<string> &path) {
+ bufferlist bl;
+ InProgressOp op_tag(InProgressOp::MERGE, path);
+ op_tag.encode(bl);
+ int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
+ if (r < 0)
+ return r;
+ return fsync_dir(vector<string>());
+}
+
+int HashIndex::end_split_or_merge(const vector<string> &path) {
+ return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
+}
+
+int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
+ bufferlist buf;
+ int r = get_attr_path(path, SUBDIR_ATTR, buf);
+ if (r < 0)
+ return r;
+ auto bufiter = buf.cbegin();
+ info->decode(bufiter);
+ ceph_assert(path.size() == (unsigned)info->hash_level);
+ return 0;
+}
+
+int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
+ bufferlist buf;
+ ceph_assert(path.size() == (unsigned)info.hash_level);
+ info.encode(buf);
+ return add_attr_path(path, SUBDIR_ATTR, buf);
+}
+
+bool HashIndex::must_merge(const subdir_info_s &info) {
+ return (info.hash_level > 0 &&
+ merge_threshold > 0 &&
+ info.objs < (unsigned)merge_threshold &&
+ info.subdirs == 0);
+}
+
+bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
+ // target_level is used by ceph-objectstore-tool to split dirs offline.
+ // If it is set (default is 0) and the current hash level < target_level,
+ // this dir will be split no matter how many objects it has.
+ return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
+ ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
+ (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
+}
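+// Rough illustration with made-up values (the real ones come from the
+// filestore config): with merge_threshold = 10, split_multiplier = 2 and
+// split_rand_factor = 0, a directory is split once it holds more than
+// (10 * 2 + 0) * 16 = 320 objects, and a non-root leaf directory with
+// fewer than 10 objects and no subdirs becomes a merge candidate.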
+
+int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
+ return start_merge(path);
+}
+
+int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
+ vector<string> dst = path;
+ dst.pop_back();
+ subdir_info_s dstinfo;
+ int r, exists;
+ r = path_exists(path, &exists);
+ if (r < 0)
+ return r;
+ r = get_info(dst, &dstinfo);
+ if (r < 0)
+ return r;
+ if (exists) {
+ r = move_objects(path, dst);
+ if (r < 0)
+ return r;
+ r = reset_attr(dst);
+ if (r < 0)
+ return r;
+ r = remove_path(path);
+ if (r < 0)
+ return r;
+ }
+ if (must_merge(dstinfo)) {
+ r = initiate_merge(dst, dstinfo);
+ if (r < 0)
+ return r;
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return complete_merge(dst, dstinfo);
+ }
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
+ return start_split(path);
+}
+
+int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
+ int level = info.hash_level;
+ map<string, ghobject_t> objects;
+ vector<string> dst = path;
+ int r;
+ dst.push_back("");
+ r = list_objects(path, 0, 0, &objects);
+ if (r < 0)
+ return r;
+ vector<string> subdirs_vec;
+ r = list_subdirs(path, &subdirs_vec);
+ if (r < 0)
+ return r;
+ set<string> subdirs;
+ subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
+ map<string, map<string, ghobject_t> > mapped;
+ map<string, ghobject_t> moved;
+ int num_moved = 0;
+ for (map<string, ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ vector<string> new_path;
+ get_path_components(i->second, &new_path);
+ mapped[new_path[level]][i->first] = i->second;
+ }
+ for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
+ i != mapped.end();
+ ) {
+ dst[level] = i->first;
+ /* If the info already exists, it must be correct;
+ * we may be picking up a partially finished split */
+ subdir_info_s temp;
+ // subdir has already been fully copied
+ if (subdirs.count(i->first) && !get_info(dst, &temp)) {
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ }
+ ++i;
+ continue;
+ }
+
+ subdir_info_s info_new;
+ info_new.objs = i->second.size();
+ info_new.subdirs = 0;
+ info_new.hash_level = level + 1;
+ if (must_merge(info_new) && !subdirs.count(i->first)) {
+ mapped.erase(i++);
+ continue;
+ }
+
+ // Subdir doesn't yet exist
+ if (!subdirs.count(i->first)) {
+ info.subdirs += 1;
+ r = create_path(dst);
+ if (r < 0)
+ return r;
+ } // else subdir has been created but only partially copied
+
+ for (map<string, ghobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ moved[j->first] = j->second;
+ num_moved++;
+ objects.erase(j->first);
+ r = link_object(path, dst, j->second, j->first);
+ // May be a partially finished split
+ if (r < 0 && r != -EEXIST) {
+ return r;
+ }
+ }
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ // Presence of info must imply that all objects have been copied
+ r = set_info(dst, info_new);
+ if (r < 0)
+ return r;
+
+ r = fsync_dir(dst);
+ if (r < 0)
+ return r;
+
+ ++i;
+ }
+ r = remove_objects(path, moved, &objects);
+ if (r < 0)
+ return r;
+ info.objs = objects.size();
+ r = reset_attr(path);
+ if (r < 0)
+ return r;
+ r = fsync_dir(path);
+ if (r < 0)
+ return r;
+ return end_split_or_merge(path);
+}
+
+void HashIndex::get_path_components(const ghobject_t &oid,
+ vector<string> *path) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
+
+ // Path components are the hex characters of oid.hobj.hash, least
+ // significant first
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ path->push_back(string(&buf[i], 1));
+ }
+}
+
+string HashIndex::get_hash_str(uint32_t hash) {
+ char buf[MAX_HASH_LEVEL + 1];
+ snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
+ string retval;
+ for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
+ retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
+ }
+ return retval;
+}
+
+string HashIndex::get_path_str(const ghobject_t &oid) {
+ ceph_assert(!oid.is_max());
+ return get_hash_str(oid.hobj.get_hash());
+}
+
+uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
+ while (prefix.size() < sizeof(uint32_t) * 2) {
+ prefix.push_back('0');
+ }
+ uint32_t hash;
+ sscanf(prefix.c_str(), "%x", &hash);
+ // nibble reverse
+ hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
+ hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
+ hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
+ return hash;
+}
+
+int HashIndex::get_path_contents_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t *next_object,
+ set<string, CmpHexdigitStringBitwise> *hash_prefixes,
+ set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
+{
+ map<string, ghobject_t> rev_objects;
+ int r;
+ r = list_objects(path, 0, 0, &rev_objects);
+ if (r < 0)
+ return r;
+ // bitwise sort
+ for (map<string, ghobject_t>::iterator i = rev_objects.begin();
+ i != rev_objects.end();
+ ++i) {
+ if (next_object && i->second < *next_object)
+ continue;
+ string hash_prefix = get_path_str(i->second);
+ hash_prefixes->insert(hash_prefix);
+ objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
+ }
+ vector<string> subdirs;
+ r = list_subdirs(path, &subdirs);
+ if (r < 0)
+ return r;
+
+ // sort subdirs bitwise (by reversing hex digit nibbles)
+ std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
+
+ // Local to this function, we will convert the prefix strings
+ // (previously simply the reversed hex digits) to also have each
+ // digit's nibbles reversed. This will make the strings sort
+ // bitwise.
+ string cur_prefix;
+ for (vector<string>::const_iterator i = path.begin();
+ i != path.end();
+ ++i) {
+ cur_prefix.append(reverse_hexdigit_bits_string(*i));
+ }
+ string next_object_string;
+ if (next_object)
+ next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
+ for (vector<string>::iterator i = subdirs.begin();
+ i != subdirs.end();
+ ++i) {
+ string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
+ if (next_object) {
+ if (next_object->is_max())
+ continue;
+ if (candidate < next_object_string.substr(0, candidate.size()))
+ continue;
+ }
+ // re-reverse the hex digit nibbles for the caller
+ hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
+ }
+ return 0;
+}
+
+int HashIndex::list_by_hash(const vector<string> &path,
+ const ghobject_t &end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ ceph_assert(out);
+ return list_by_hash_bitwise(path, end, max_count, next, out);
+}
+
+int HashIndex::list_by_hash_bitwise(
+ const vector<string> &path,
+ const ghobject_t& end,
+ int max_count,
+ ghobject_t *next,
+ vector<ghobject_t> *out)
+{
+ vector<string> next_path = path;
+ next_path.push_back("");
+ set<string, CmpHexdigitStringBitwise> hash_prefixes;
+ set<pair<string, ghobject_t>, CmpPairBitwise> objects;
+ int r = get_path_contents_by_hash_bitwise(path,
+ next,
+ &hash_prefixes,
+ &objects);
+ if (r < 0)
+ return r;
+ for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
+ i != hash_prefixes.end();
+ ++i) {
+ dout(20) << __func__ << " prefix " << *i << dendl;
+ set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
+ make_pair(*i, ghobject_t()));
+ if (j == objects.end() || j->first != *i) {
+ *(next_path.rbegin()) = *(i->rbegin());
+ ghobject_t next_recurse;
+ if (next)
+ next_recurse = *next;
+ r = list_by_hash_bitwise(next_path,
+ end,
+ max_count,
+ &next_recurse,
+ out);
+
+ if (r < 0)
+ return r;
+ if (!next_recurse.is_max()) {
+ if (next)
+ *next = next_recurse;
+ return 0;
+ }
+ } else {
+ while (j != objects.end() && j->first == *i) {
+ if (max_count > 0 && out->size() == (unsigned)max_count) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (j->second >= end) {
+ if (next)
+ *next = j->second;
+ return 0;
+ }
+ if (!next || j->second >= *next) {
+ dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
+ out->push_back(j->second);
+ }
+ ++j;
+ }
+ }
+ }
+ if (next)
+ *next = ghobject_t::get_max();
+ return 0;
+}
+
+
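Everything above hinges on one convention: the object hash is written as eight hex digits and consumed least-significant nibble first, so a directory path like F/F/F/F/A/B holds the objects whose hash ends in ...BAFFFF. The standalone sketch below (plain C++, not part of this patch; the demo_* helpers are our own stand-ins for get_hash_str()/get_path_components() and hash_prefix_to_hash()) walks that mapping both ways:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Our own stand-ins for the HashIndex hash/path helpers; names and
// scope are illustrative only, not Ceph API.
static std::vector<std::string> demo_path_components(uint32_t hash) {
  char buf[9];
  std::snprintf(buf, sizeof(buf), "%08X", hash);
  std::vector<std::string> path;
  for (int i = 7; i >= 0; --i)              // least-significant nibble first
    path.emplace_back(1, buf[i]);
  return path;
}

static uint32_t demo_prefix_to_hash(std::string prefix) {
  prefix.resize(8, '0');                    // pad to a full 32-bit value
  uint32_t h = 0;
  std::sscanf(prefix.c_str(), "%x", &h);
  // reverse the nibble order, exactly as hash_prefix_to_hash() does
  h = ((h & 0x0f0f0f0f) << 4) | ((h & 0xf0f0f0f0) >> 4);
  h = ((h & 0x00ff00ff) << 8) | ((h & 0xff00ff00) >> 8);
  h = (h << 16) | (h >> 16);
  return h;
}

int main() {
  uint32_t hash = 0x01234567;
  std::vector<std::string> path = demo_path_components(hash); // "7","6","5",...
  std::string prefix = path[0] + path[1] + path[2];            // "765"
  std::printf("%s -> %08X\n", prefix.c_str(), demo_prefix_to_hash(prefix));
  return 0;                                 // prints: 765 -> 00000567
}

Running it prints 765 -> 00000567, i.e. the prefix recovers exactly the low 12 bits of the original hash, which is the property the split/collection-matching helpers above build on.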
diff --git a/src/os/filestore/HashIndex.h b/src/os/filestore/HashIndex.h
new file mode 100644
index 000000000..9ba50c56d
--- /dev/null
+++ b/src/os/filestore/HashIndex.h
@@ -0,0 +1,460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_HASHINDEX_H
+#define CEPH_HASHINDEX_H
+
+#include "include/buffer_fwd.h"
+#include "include/encoding.h"
+#include "LFNIndex.h"
+
+extern std::string reverse_hexdigit_bits_string(std::string l);
+
+/**
+ * Implements collection prehashing.
+ *
+ * @verbatim
+ * (root) - 0 - 0
+ * - 1
+ * - E
+ * - 1
+ * - 2 - D - 0
+ * .
+ * .
+ * .
+ * - F - 0
+ * @endverbatim
+ *
+ * A file is located at the longest existing directory from the root
+ * given by the hex characters in the hash beginning with the least
+ * significant.
+ *
+ * ex: ghobject_t("object", CEPH_NO_SNAP, 0xA4CEE0D2)
+ * would be located in (root)/2/D/0/
+ *
+ * Subdirectories are created when the number of objects in a
+ * directory exceeds 16 * (abs(merge_threshold) * split_multiplier +
+ * split_rand_factor). The number of objects in a directory is encoded
+ * as subdir_info_s in an xattr on the directory.
+ */
+class HashIndex : public LFNIndex {
+private:
+ /// Attribute name for storing subdir info @see subdir_info_s
+ static const std::string SUBDIR_ATTR;
+ /// Attribute name for storing index-wide settings
+ static const std::string SETTINGS_ATTR;
+ /// Attribute name for storing in progress op tag
+ static const std::string IN_PROGRESS_OP_TAG;
+ /// Size (bits) in object hash
+ static const int PATH_HASH_LEN = 32;
+ /// Max length of hashed path
+ static const int MAX_HASH_LEVEL = (PATH_HASH_LEN/4);
+
+ /**
+ * Merges occur when the number of objects drops below
+ * merge_threshold and splits occur when the number of objects
+ * exceeds:
+ *
+ * 16 * (abs(merge_threshold) * split_multiplier + split_rand_factor)
+ *
+ * Note that if merge_threshold is less than zero, merging will
+ * never occur.
+ */
+ int merge_threshold;
+ int split_multiplier;
+
+ /// Encodes current subdir state for determining when to split/merge.
+ struct subdir_info_s {
+ uint64_t objs; ///< Objects in subdir.
+ uint32_t subdirs; ///< Subdirs in subdir.
+ uint32_t hash_level; ///< Hashlevel of subdir.
+
+ subdir_info_s() : objs(0), subdirs(0), hash_level(0) {}
+
+ void encode(ceph::buffer::list &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(objs, bl);
+ encode(subdirs, bl);
+ encode(hash_level, bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(objs, bl);
+ decode(subdirs, bl);
+ decode(hash_level, bl);
+ }
+ };
+
+ struct settings_s {
+ uint32_t split_rand_factor; ///< random factor added to split threshold (only on root of collection)
+ settings_s() : split_rand_factor(0) {}
+ void encode(ceph::buffer::list &bl) const
+ {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(split_rand_factor, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl)
+ {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ decode(split_rand_factor, bl);
+ }
+ } settings;
+
+ /// Encodes in progress split or merge
+ struct InProgressOp {
+ static const int SPLIT = 0;
+ static const int MERGE = 1;
+ static const int COL_SPLIT = 2;
+ int op;
+ std::vector<std::string> path;
+
+ InProgressOp(int op, const std::vector<std::string> &path)
+ : op(op), path(path) {}
+
+ explicit InProgressOp(ceph::buffer::list::const_iterator &bl) {
+ decode(bl);
+ }
+
+ bool is_split() const { return op == SPLIT; }
+ bool is_col_split() const { return op == COL_SPLIT; }
+ bool is_merge() const { return op == MERGE; }
+
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(op, bl);
+ encode(path, bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(op, bl);
+ decode(path, bl);
+ }
+ };
+
+
+public:
+ /// Constructor.
+ HashIndex(
+ CephContext* cct,
+ coll_t collection, ///< [in] Collection
+ const char *base_path, ///< [in] Path to the index root.
+ int merge_at, ///< [in] Merge threshold.
+ int split_multiple, ///< [in] Split threshold.
+ uint32_t index_version,///< [in] Index version
+ double retry_probability=0) ///< [in] retry probability
+ : LFNIndex(cct, collection, base_path, index_version, retry_probability),
+ merge_threshold(merge_at),
+ split_multiplier(split_multiple)
+ {}
+
+ int read_settings() override;
+
+ /// @see CollectionIndex
+ uint32_t collection_version() override { return index_version; }
+
+ /// @see CollectionIndex
+ int cleanup() override;
+
+ /// @see CollectionIndex
+ int prep_delete() override;
+
+ /// @see CollectionIndex
+ int _split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ /// @see CollectionIndex
+ int _merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override;
+
+ int _merge_dirs(
+ HashIndex& from,
+ HashIndex& to,
+ const std::vector<std::string>& path);
+
+ /// @see CollectionIndex
+ int apply_layout_settings(int target_level) override;
+
+protected:
+ int _init() override;
+
+ int _created(
+ const std::vector<std::string> &path,
+ const ghobject_t &oid,
+ const std::string &mangled_name
+ ) override;
+ int _remove(
+ const std::vector<std::string> &path,
+ const ghobject_t &oid,
+ const std::string &mangled_name
+ ) override;
+ int _lookup(
+ const ghobject_t &oid,
+ std::vector<std::string> *path,
+ std::string *mangled_name,
+ int *hardlink
+ ) override;
+
+ /**
+ * Pre-hash the collection to create folders according to the expected number
+ * of objects in this collection.
+ */
+ int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+private:
+ /// Internal helper to recursively remove path and its subdirs
+ int _recursive_remove(
+ const std::vector<std::string> &path, ///< [in] path to remove
+ bool top ///< [in] internal tracking of first caller
+ ); /// @return Error Code, 0 on success
+ /// Recursively remove path and its subdirs
+ int recursive_remove(
+ const std::vector<std::string> &path ///< [in] path to remove
+ ); /// @return Error Code, 0 on success
+ /// Tag root directory at beginning of col_split
+ int start_col_split(
+ const std::vector<std::string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of split
+ int start_split(
+ const std::vector<std::string> &path ///< [in] path to split
+ ); ///< @return Error Code, 0 on success
+ /// Tag root directory at beginning of merge
+ int start_merge(
+ const std::vector<std::string> &path ///< [in] path to merge
+ ); ///< @return Error Code, 0 on success
+ /// Remove tag at end of split or merge
+ int end_split_or_merge(
+ const std::vector<std::string> &path ///< [in] path being split or merged
+ ); ///< @return Error Code, 0 on success
+ /// Gets info from the xattr on the subdir represented by path
+ int get_info(
+ const std::vector<std::string> &path, ///< [in] Path from which to read attribute.
+ subdir_info_s *info ///< [out] Attribute value
+ ); /// @return Error Code, 0 on success
+
+ /// Sets info to the xattr on the subdir represented by path
+ int set_info(
+ const std::vector<std::string> &path, ///< [in] Path on which to set attribute.
+ const subdir_info_s &info ///< [in] Value to set
+ ); /// @return Error Code, 0 on success
+
+ /// Encapsulates logic for when to merge.
+ bool must_merge(
+ const subdir_info_s &info ///< [in] Info to check
+ ); /// @return True if info must be merged, False otherwise
+
+ /// Encapsulates logic for when to split.
+ bool must_split(
+ const subdir_info_s &info, ///< [in] Info to check
+ int target_level = 0
+ ); /// @return True if info must be split, False otherwise
+
+ /// Initiates merge
+ int initiate_merge(
+ const std::vector<std::string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes merge
+ int complete_merge(
+ const std::vector<std::string> &path, ///< [in] Subdir to merge
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Resets attr to match actual subdir contents
+ int reset_attr(
+ const std::vector<std::string> &path ///< [in] path to cleanup
+ );
+
+ /// Initiate Split
+ int initiate_split(
+ const std::vector<std::string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Completes Split
+ int complete_split(
+ const std::vector<std::string> &path, ///< [in] Subdir to split
+ subdir_info_s info ///< [in] Info attached to path
+ ); /// @return Error Code, 0 on success
+
+ /// Determine path components from hoid hash
+ void get_path_components(
+ const ghobject_t &oid, ///< [in] Object for which to get path components
+ std::vector<std::string> *path ///< [out] Path components for hoid.
+ );
+
+ /// Pre-hash and split folders to avoid runtime splitting
+ /// according to the given expected object number.
+ int pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs);
+
+ /// Initialize the folder (dir info) with the given hash
+ /// level and number of its subdirs.
+ int init_split_folder(std::vector<std::string> &path, uint32_t hash_level);
+
+ /// do collection split for path
+ static int col_split_level(
+ HashIndex &from, ///< [in] from index
+ HashIndex &dest, ///< [in] to index
+ const std::vector<std::string> &path, ///< [in] path to split
+ uint32_t bits, ///< [in] num bits to match
+ uint32_t match, ///< [in] bits to match
+ unsigned *mkdirred ///< [in,out] path[:mkdirred] has been mkdirred
+ );
+
+
+ /**
+ * Get std::string representation of ghobject_t/hash
+ *
+ * e.g: 0x01234567 -> "76543210"
+ */
+ static std::string get_path_str(
+ const ghobject_t &oid ///< [in] Object to get hash std::string for
+ ); ///< @return Hash std::string for hoid.
+
+ /// Get std::string from hash, @see get_path_str
+ static std::string get_hash_str(
+ uint32_t hash ///< [in] Hash to convert to a string.
+ ); ///< @return std::string representation of hash
+
+ /// Get hash from a (nibble-reversed) hash prefix std::string, e.g. "FFFFAB" -> 0x00BAFFFF
+ static uint32_t hash_prefix_to_hash(
+ std::string prefix ///< [in] std::string to convert
+ ); ///< @return Hash
+
+ /// Get hash mod from path
+ static void path_to_hobject_hash_prefix(
+ const std::vector<std::string> &path,///< [in] path to convert
+ uint32_t *bits, ///< [out] bits
+ uint32_t *hash ///< [out] hash
+ ) {
+ std::string hash_str;
+ for (auto i = path.begin(); i != path.end(); ++i) {
+ hash_str.push_back(*i->begin());
+ }
+ uint32_t rev_hash = hash_prefix_to_hash(hash_str);
+ if (hash)
+ *hash = rev_hash;
+ if (bits)
+ *bits = path.size() * 4;
+ }
+
+ /// Calculate the number of bits.
+ static int calc_num_bits(uint64_t n) {
+ int ret = 0;
+ while (n > 0) {
+ n = n >> 1;
+ ret++;
+ }
+ return ret;
+ }
+
+ /// Convert a number to hex std::string (upper case).
+ static std::string to_hex(int n) {
+ ceph_assert(n >= 0 && n < 16);
+ char c = (n <= 9 ? ('0' + n) : ('A' + n - 10));
+ std::string str;
+ str.append(1, c);
+ return str;
+ }
+
+ struct CmpPairBitwise {
+ bool operator()(const std::pair<std::string, ghobject_t>& l,
+ const std::pair<std::string, ghobject_t>& r) const
+ {
+ if (l.first < r.first)
+ return true;
+ if (l.first > r.first)
+ return false;
+ if (cmp(l.second, r.second) < 0)
+ return true;
+ return false;
+ }
+ };
+
+ struct CmpHexdigitStringBitwise {
+ bool operator()(const std::string& l, const std::string& r) const {
+ return reverse_hexdigit_bits_string(l) < reverse_hexdigit_bits_string(r);
+ }
+ };
+
+ /// Get path contents by hash
+ int get_path_contents_by_hash_bitwise(
+ const std::vector<std::string> &path, /// [in] Path to list
+ const ghobject_t *next_object, /// [in] list > *next_object
+ std::set<std::string, CmpHexdigitStringBitwise> *hash_prefixes, /// [out] prefixes in dir
+ std::set<std::pair<std::string, ghobject_t>, CmpPairBitwise> *objects /// [out] objects
+ );
+
+ /// List objects in collection in ghobject_t order
+ int list_by_hash(
+ const std::vector<std::string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ std::vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+ /// List objects in collection in ghobject_t order
+ int list_by_hash_bitwise(
+ const std::vector<std::string> &path, /// [in] Path to list
+ const ghobject_t &end, /// [in] List only objects < end
+ int max_count, /// [in] List at most max_count
+ ghobject_t *next, /// [in,out] List objects >= *next
+ std::vector<ghobject_t> *out /// [out] Listed objects
+ ); ///< @return Error Code, 0 on success
+
+ /// Create the given levels of subdirectories from the given root.
+ /// The contents of *path* are not changed by calling this function.
+ int recursive_create_path(std::vector<std::string>& path, int level);
+
+ /// split each dir below the given path
+ int split_dirs(const std::vector<std::string> &path, int target_level = 0);
+
+ int write_settings();
+};
+
+#endif
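To get a feel for the thresholds in must_split()/must_merge(), the sketch below evaluates the same expressions with made-up parameter values (it deliberately ignores the MAX_HASH_LEVEL cap and the offline target_level path, and the numbers are not Ceph's shipped defaults):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Mirrors the arithmetic in HashIndex::must_merge()/must_split(); the
// concrete values used below are hypothetical, not filestore defaults.
struct DemoThresholds {
  int merge_threshold;
  int split_multiplier;
  uint32_t split_rand_factor;

  unsigned split_point() const {
    return (unsigned)(std::abs(merge_threshold) * split_multiplier +
                      split_rand_factor) * 16;
  }
  bool would_merge(uint64_t objs, uint32_t subdirs) const {
    return merge_threshold > 0 && objs < (unsigned)merge_threshold && subdirs == 0;
  }
  bool would_split(uint64_t objs) const {
    return objs > split_point();
  }
};

int main() {
  DemoThresholds t{10, 2, 0};   // example: merge below 10, split above 320
  std::printf("split point = %u objects\n", t.split_point());
  std::printf("600 objs -> split? %d; 3 objs, 0 subdirs -> merge? %d\n",
              t.would_split(600), t.would_merge(3, 0));
  return 0;
}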
diff --git a/src/os/filestore/IndexManager.cc b/src/os/filestore/IndexManager.cc
new file mode 100644
index 000000000..7d3dbfc84
--- /dev/null
+++ b/src/os/filestore/IndexManager.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/unordered_map.h"
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include <errno.h>
+
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+
+#include "IndexManager.h"
+#include "HashIndex.h"
+#include "CollectionIndex.h"
+
+#include "chain_xattr.h"
+
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+
+static int set_version(const char *path, uint32_t version) {
+ bufferlist bl;
+ encode(version, bl);
+ return chain_setxattr<true, true>(
+ path, "user.cephos.collection_version", bl.c_str(),
+ bl.length());
+}
+
+static int get_version(const char *path, uint32_t *version) {
+ bufferptr bp(PATH_MAX);
+ int r = chain_getxattr(path, "user.cephos.collection_version",
+ bp.c_str(), bp.length());
+ if (r < 0) {
+ if (r != -ENOENT) {
+ *version = 0;
+ return 0;
+ } else {
+ return r;
+ }
+ }
+ bp.set_length(r);
+ bufferlist bl;
+ bl.push_back(bp);
+ auto i = bl.cbegin();
+ decode(*version, i);
+ return 0;
+}
+
+IndexManager::~IndexManager() {
+
+ for (ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.begin();
+ it != col_indices.end(); ++it) {
+
+ delete it->second;
+ it->second = NULL;
+ }
+ col_indices.clear();
+}
+
+
+int IndexManager::init_index(coll_t c, const char *path, uint32_t version) {
+ std::unique_lock l{lock};
+ int r = set_version(path, version);
+ if (r < 0)
+ return r;
+ HashIndex index(cct, c, path, cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ version,
+ cct->_conf->filestore_index_retry_probability);
+ r = index.init();
+ if (r < 0)
+ return r;
+ return index.read_settings();
+}
+
+int IndexManager::build_index(coll_t c, const char *path, CollectionIndex **index) {
+ if (upgrade) {
+ // Need to check the collection generation
+ int r;
+ uint32_t version = 0;
+ r = get_version(path, &version);
+ if (r < 0)
+ return r;
+
+ switch (version) {
+ case CollectionIndex::FLAT_INDEX_TAG:
+ case CollectionIndex::HASH_INDEX_TAG: // fall through
+ case CollectionIndex::HASH_INDEX_TAG_2: // fall through
+ case CollectionIndex::HOBJECT_WITH_POOL: {
+ // Must be a HashIndex
+ *index = new HashIndex(cct, c, path,
+ cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ version);
+ return (*index)->read_settings();
+ }
+ default: ceph_abort();
+ }
+
+ } else {
+ // No need to check
+ *index = new HashIndex(cct, c, path, cct->_conf->filestore_merge_threshold,
+ cct->_conf->filestore_split_multiple,
+ CollectionIndex::HOBJECT_WITH_POOL,
+ cct->_conf->filestore_index_retry_probability);
+ return (*index)->read_settings();
+ }
+}
+
+bool IndexManager::get_index_optimistic(coll_t c, Index *index) {
+ std::shared_lock l{lock};
+ ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+ if (it == col_indices.end())
+ return false;
+ index->index = it->second;
+ return true;
+}
+
+int IndexManager::get_index(coll_t c, const string& baseDir, Index *index) {
+ if (get_index_optimistic(c, index))
+ return 0;
+ std::unique_lock l{lock};
+ ceph::unordered_map<coll_t, CollectionIndex* > ::iterator it = col_indices.find(c);
+ if (it == col_indices.end()) {
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current/%s", baseDir.c_str(), c.to_str().c_str());
+ CollectionIndex* colIndex = NULL;
+ int r = build_index(c, path, &colIndex);
+ if (r < 0)
+ return r;
+ col_indices[c] = colIndex;
+ index->index = colIndex;
+ } else {
+ index->index = it->second;
+ }
+ return 0;
+}
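get_index() above is a double-checked cache: an optimistic probe under the shared lock, then a rebuild under the exclusive lock with a recheck. A minimal standalone sketch of the same shape with std::shared_mutex (the key and value types are ours, chosen only to keep the example self-contained):

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <string>

// Stand-in cache showing the shared-then-exclusive lookup pattern used
// by IndexManager::get_index(); the value type is arbitrary here.
class DemoIndexCache {
  std::shared_mutex lock;
  std::map<std::string, std::shared_ptr<int>> cache;

public:
  std::shared_ptr<int> get(const std::string &key) {
    {
      std::shared_lock l(lock);        // fast path: concurrent readers
      auto it = cache.find(key);
      if (it != cache.end())
        return it->second;
    }
    std::unique_lock l(lock);          // slow path: recheck, then build
    auto &slot = cache[key];
    if (!slot)
      slot = std::make_shared<int>(0); // stand-in for build_index()
    return slot;
  }
};

The real code keeps raw CollectionIndex pointers and frees them in ~IndexManager(); the shared_ptr here is only to keep the sketch self-contained.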
diff --git a/src/os/filestore/IndexManager.h b/src/os/filestore/IndexManager.h
new file mode 100644
index 000000000..ef4f72cab
--- /dev/null
+++ b/src/os/filestore/IndexManager.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef OS_INDEXMANAGER_H
+#define OS_INDEXMANAGER_H
+
+#include "include/unordered_map.h"
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "CollectionIndex.h"
+#include "HashIndex.h"
+
+
+/// Public type for Index
+struct Index {
+ CollectionIndex *index;
+
+ Index() : index(NULL) {}
+ explicit Index(CollectionIndex* index) : index(index) {}
+
+ CollectionIndex *operator->() { return index; }
+ CollectionIndex &operator*() { return *index; }
+};
+
+
+/**
+ * Encapsulates mutual exclusion for CollectionIndexes.
+ *
+ * Allowing a modification (removal or addition of an object) to occur
+ * while a read is occurring (lookup of an object's path and use of
+ * that path) may result in the path becoming invalid. Thus, during
+ * the lifetime of a CollectionIndex object and any paths returned
+ * by it, no other concurrent accesses may be allowed.
+ * This is enforced by using CollectionIndex::access_lock
+ */
+class IndexManager {
+ CephContext* cct;
+ /// Lock for Index Manager
+ ceph::shared_mutex lock = ceph::make_shared_mutex("IndexManager lock");
+ bool upgrade;
+ ceph::unordered_map<coll_t, CollectionIndex* > col_indices;
+
+ /**
+ * Index factory
+ *
+ * Encapsulates logic for handling legacy FileStore
+ * layouts
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] path Path to collection
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int build_index(coll_t c, const char *path, CollectionIndex **index);
+ bool get_index_optimistic(coll_t c, Index *index);
+public:
+ /// Constructor
+ explicit IndexManager(CephContext* cct,
+ bool upgrade) : cct(cct),
+ upgrade(upgrade) {}
+
+ ~IndexManager();
+
+ /**
+ * Reserve and return index for c
+ *
+ * @param [in] c Collection for which to get index
+ * @param [in] baseDir base directory of collections
+ * @param [out] index Index for c
+ * @return error code
+ */
+ int get_index(coll_t c, const std::string& baseDir, Index *index);
+
+ /**
+ * Initialize index for collection c at path
+ *
+ * @param [in] c Collection for which to init Index
+ * @param [in] path Path to collection
+ * @param [in] filestore_version version of containing FileStore
+ * @return error code
+ */
+ int init_index(coll_t c, const char *path, uint32_t filestore_version);
+};
+
+#endif
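As a rough illustration of how a caller is expected to use the manager (the function name, the version lookup, and the error handling are ours; FileStore's real call sites do more, and the include path is assumed):

#include <string>
#include "os/filestore/IndexManager.h"   // assumed include path

// Hypothetical caller: resolve the cached index for a collection and
// read its layout version through the Index handle.
static int demo_lookup_version(IndexManager &mgr, coll_t cid,
                               const std::string &osd_data, uint32_t *ver) {
  Index idx;                                   // starts with index == NULL
  int r = mgr.get_index(cid, osd_data, &idx);  // builds or reuses a HashIndex
  if (r < 0)
    return r;                                  // e.g. the collection dir is missing
  *ver = idx->collection_version();            // operator-> forwards to CollectionIndex
  return 0;
}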
diff --git a/src/os/filestore/Journal.h b/src/os/filestore/Journal.h
new file mode 100644
index 000000000..0c14730b0
--- /dev/null
+++ b/src/os/filestore/Journal.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
+
+#include <errno.h>
+
+#include "include/buffer_fwd.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+#include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
+#include "common/zipkin_trace.h"
+
+
+class Journal {
+protected:
+ uuid_d fsid;
+ Finisher *finisher;
+public:
+ CephContext* cct;
+ PerfCounters *logger;
+protected:
+ ceph::condition_variable *do_sync_cond;
+ bool wait_on_full;
+
+public:
+ Journal(CephContext* cct, uuid_d f, Finisher *fin, ceph::condition_variable *c=0) :
+ fsid(f), finisher(fin), cct(cct), logger(NULL),
+ do_sync_cond(c),
+ wait_on_full(false) { }
+ virtual ~Journal() { }
+
+ virtual int check() = 0; ///< check if journal appears valid
+ virtual int create() = 0; ///< create a fresh journal
+ virtual int open(uint64_t fs_op_seq) = 0; ///< open an existing journal
+ virtual void close() = 0; ///< close an open journal
+
+ virtual void flush() = 0;
+
+ virtual void get_devices(std::set<std::string> *ls) {}
+ virtual void collect_metadata(std::map<std::string,std::string> *pm) {}
+ /**
+ * reserve_throttle_and_backoff
+ *
+ * Implementation may throttle or backoff based on ops
+ * reserved here but not yet released using committed_thru.
+ */
+ virtual void reserve_throttle_and_backoff(uint64_t count) = 0;
+
+ virtual int dump(std::ostream& out) { return -EOPNOTSUPP; }
+
+ void set_wait_on_full(bool b) { wait_on_full = b; }
+
+ // writes
+ virtual bool is_writeable() = 0;
+ virtual int make_writeable() = 0;
+ virtual void submit_entry(uint64_t seq, ceph::buffer::list& e, uint32_t orig_len,
+ Context *oncommit,
+ TrackedOpRef osd_op = TrackedOpRef()) = 0;
+ virtual void commit_start(uint64_t seq) = 0;
+ virtual void committed_thru(uint64_t seq) = 0;
+
+ /// Read next journal entry - asserts on invalid journal
+ virtual bool read_entry(
+ ceph::buffer::list &bl, ///< [out] payload on successful read
+ uint64_t &seq ///< [in,out] sequence number on last successful read
+ ) = 0; ///< @return true on successful read, false on journal end
+
+ virtual bool should_commit_now() = 0;
+
+ virtual int prepare_entry(std::vector<ObjectStore::Transaction>& tls, ceph::buffer::list* tbl) = 0;
+
+ virtual off64_t get_journal_size_estimate() { return 0; }
+
+ // reads/recovery
+
+};
+
+#endif
diff --git a/src/os/filestore/JournalThrottle.cc b/src/os/filestore/JournalThrottle.cc
new file mode 100644
index 000000000..dc1b34d07
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "JournalThrottle.h"
+#include "include/ceph_assert.h"
+
+bool JournalThrottle::set_params(
+ double _low_threshhold,
+ double _high_threshhold,
+ double _expected_throughput,
+ double _high_multiple,
+ double _max_multiple,
+ uint64_t _throttle_max,
+ std::ostream *errstream)
+{
+ return throttle.set_params(
+ _low_threshhold,
+ _high_threshhold,
+ _expected_throughput,
+ _high_multiple,
+ _max_multiple,
+ _throttle_max,
+ errstream);
+}
+
+std::chrono::duration<double> JournalThrottle::get(uint64_t c)
+{
+ return throttle.get(c);
+}
+
+uint64_t JournalThrottle::take(uint64_t c)
+{
+ return throttle.take(c);
+}
+
+void JournalThrottle::register_throttle_seq(uint64_t seq, uint64_t c)
+{
+ locker l(lock);
+ journaled_ops.push_back(std::make_pair(seq, c));
+}
+
+std::pair<uint64_t, uint64_t> JournalThrottle::flush(uint64_t mono_id)
+{
+ uint64_t to_put_bytes = 0;
+ uint64_t to_put_ops = 0;
+ {
+ locker l(lock);
+ while (!journaled_ops.empty() &&
+ journaled_ops.front().first <= mono_id) {
+ to_put_bytes += journaled_ops.front().second;
+ to_put_ops++;
+ journaled_ops.pop_front();
+ }
+ }
+ throttle.put(to_put_bytes);
+ return std::make_pair(to_put_ops, to_put_bytes);
+}
+
+uint64_t JournalThrottle::get_current()
+{
+ return throttle.get_current();
+}
+
+uint64_t JournalThrottle::get_max()
+{
+ return throttle.get_max();
+}
diff --git a/src/os/filestore/JournalThrottle.h b/src/os/filestore/JournalThrottle.h
new file mode 100644
index 000000000..f32f5d734
--- /dev/null
+++ b/src/os/filestore/JournalThrottle.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_THROTTLE_H
+#define CEPH_JOURNAL_THROTTLE_H
+
+#include "common/Throttle.h"
+
+#include <list>
+#include <deque>
+#include <condition_variable>
+#include <thread>
+#include <vector>
+#include <chrono>
+#include <iostream>
+
+/**
+ * JournalThrottle
+ *
+ * Throttle designed to implement dynamic throttling as the journal fills
+ * up. The goal is to not delay ops at all when the journal is relatively
+ * empty, delay ops somewhat as the journal begins to fill (with the delay
+ * getting linearly longer as the journal fills up to a high water mark),
+ * and to delay much more aggressively (though still linearly with usage)
+ * until we hit the max value.
+ *
+ * The implementation simply wraps BackoffThrottle with a queue of
+ * journaled but not synced ops.
+ *
+ * The usage pattern is as follows:
+ * 1) Call get(bytes) before taking the op_queue_throttle, then
+ *    register_throttle_seq(seq, bytes) once the entry is journaled
+ * 2) Once the journal is flushed, flush(max_op_id_flushed)
+ */
+class JournalThrottle {
+ BackoffThrottle throttle;
+
+ std::mutex lock;
+ /// deque<id, count>
+ std::deque<std::pair<uint64_t, uint64_t> > journaled_ops;
+ using locker = std::unique_lock<std::mutex>;
+
+public:
+ /**
+ * set_params
+ *
+ * Sets params. If the params are invalid, returns false
+ * and populates errstream (if non-null) with a user-comprehensible
+ * explanation.
+ */
+ bool set_params(
+ double low_threshhold,
+ double high_threshhold,
+ double expected_throughput,
+ double high_multiple,
+ double max_multiple,
+ uint64_t throttle_max,
+ std::ostream *errstream);
+
+ /**
+ * Gets the specified amount of throttle, waiting as necessary
+ *
+ * @param c [in] amount to take
+ * @return duration waited
+ */
+ std::chrono::duration<double> get(uint64_t c);
+
+ /**
+ * take
+ *
+ * Takes specified throttle without waiting
+ */
+ uint64_t take(uint64_t c);
+
+ /**
+ * register_throttle_seq
+ *
+ * Registers a sequence number with an amount of throttle to
+ * release upon flush()
+ *
+ * @param seq [in] seq
+ */
+ void register_throttle_seq(uint64_t seq, uint64_t c);
+
+
+ /**
+ * Releases throttle held by ids <= mono_id
+ *
+ * @param mono_id [in] id up to which to flush
+ * @returns pair<ops_flushed, bytes_flushed>
+ */
+ std::pair<uint64_t, uint64_t> flush(uint64_t mono_id);
+
+ uint64_t get_current();
+ uint64_t get_max();
+
+ JournalThrottle(
+ CephContext *cct,
+ unsigned expected_concurrency ///< [in] determines size of conds
+ ) : throttle(cct, "filestore_journal", expected_concurrency) {}
+};
+
+#endif
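Putting the documented usage pattern together, here is a compressed sketch of one op's trip through the throttle (the sequence number and byte count are invented, the include path is assumed, and the journal submission and sync in between are elided):

#include <cstdint>
#include <utility>
#include "os/filestore/JournalThrottle.h"   // assumed include path

// Hypothetical driver showing only the JournalThrottle call order.
void demo_throttle_cycle(JournalThrottle &jt) {
  const uint64_t seq = 42;              // invented op sequence number
  const uint64_t bytes = 4096;          // invented journal entry size

  jt.get(bytes);                        // may sleep as the journal fills
  // ... submit the journal entry for `seq` here ...
  jt.register_throttle_seq(seq, bytes); // remember what to release later

  // ... later, once the journal has been flushed through `seq` ...
  std::pair<uint64_t, uint64_t> freed = jt.flush(seq); // ops/bytes released
  (void)freed;
}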
diff --git a/src/os/filestore/JournalingObjectStore.cc b/src/os/filestore/JournalingObjectStore.cc
new file mode 100644
index 000000000..c1555f343
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.cc
@@ -0,0 +1,279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "JournalingObjectStore.h"
+
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_journal
+#undef dout_prefix
+#define dout_prefix *_dout << "journal "
+
+using std::map;
+using std::vector;
+
+using ceph::bufferptr;
+using ceph::bufferlist;
+
+void JournalingObjectStore::journal_start()
+{
+ dout(10) << "journal_start" << dendl;
+ finisher.start();
+}
+
+void JournalingObjectStore::journal_stop()
+{
+ dout(10) << "journal_stop" << dendl;
+ finisher.wait_for_empty();
+ finisher.stop();
+}
+
+// A journal_replay() makes the journal writeable; this closes that out.
+void JournalingObjectStore::journal_write_close()
+{
+ if (journal) {
+ journal->close();
+ delete journal;
+ journal = 0;
+ }
+ apply_manager.reset();
+}
+
+int JournalingObjectStore::journal_replay(uint64_t fs_op_seq)
+{
+ dout(10) << "journal_replay fs op_seq " << fs_op_seq << dendl;
+
+ if (cct->_conf->journal_replay_from) {
+ dout(0) << "journal_replay forcing replay from "
+ << cct->_conf->journal_replay_from
+ << " instead of " << fs_op_seq << dendl;
+ // the previous op is the last one committed
+ fs_op_seq = cct->_conf->journal_replay_from - 1;
+ }
+
+ uint64_t op_seq = fs_op_seq;
+ apply_manager.init_seq(fs_op_seq);
+
+ if (!journal) {
+ submit_manager.set_op_seq(op_seq);
+ return 0;
+ }
+
+ int err = journal->open(op_seq);
+ if (err < 0) {
+ dout(3) << "journal_replay open failed with "
+ << cpp_strerror(err) << dendl;
+ delete journal;
+ journal = 0;
+ return err;
+ }
+
+ replaying = true;
+
+ int count = 0;
+ while (1) {
+ bufferlist bl;
+ uint64_t seq = op_seq + 1;
+ if (!journal->read_entry(bl, seq)) {
+ dout(3) << "journal_replay: end of journal, done." << dendl;
+ break;
+ }
+
+ if (seq <= op_seq) {
+ dout(3) << "journal_replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
+ continue;
+ }
+ ceph_assert(op_seq == seq-1);
+
+ dout(3) << "journal_replay: applying op seq " << seq << dendl;
+ auto p = bl.cbegin();
+ vector<ObjectStore::Transaction> tls;
+ while (!p.end()) {
+ tls.emplace_back(Transaction(p));
+ }
+
+ apply_manager.op_apply_start(seq);
+ int r = do_transactions(tls, seq);
+ apply_manager.op_apply_finish(seq);
+
+ op_seq = seq;
+ count++;
+
+ dout(3) << "journal_replay: r = " << r << ", op_seq now " << op_seq << dendl;
+ }
+
+ if (count)
+ dout(3) << "journal_replay: total = " << count << dendl;
+
+ replaying = false;
+
+ submit_manager.set_op_seq(op_seq);
+
+ // done reading, make writeable.
+ err = journal->make_writeable();
+ if (err < 0)
+ return err;
+
+ if (!count)
+ journal->committed_thru(fs_op_seq);
+
+ return count;
+}
+
+
+// ------------------------------------
+
+uint64_t JournalingObjectStore::ApplyManager::op_apply_start(uint64_t op)
+{
+ std::unique_lock l{apply_lock};
+ blocked_cond.wait(l, [this] {
+ if (blocked) {
+ dout(10) << "op_apply_start blocked, waiting" << dendl;
+ }
+ return !blocked;
+ });
+ dout(10) << "op_apply_start " << op << " open_ops " << open_ops << " -> "
+ << (open_ops+1) << dendl;
+ ceph_assert(!blocked);
+ ceph_assert(op > committed_seq);
+ open_ops++;
+ return op;
+}
+
+void JournalingObjectStore::ApplyManager::op_apply_finish(uint64_t op)
+{
+ std::lock_guard l{apply_lock};
+ dout(10) << "op_apply_finish " << op << " open_ops " << open_ops << " -> "
+ << (open_ops-1) << ", max_applied_seq " << max_applied_seq << " -> "
+ << std::max(op, max_applied_seq) << dendl;
+ --open_ops;
+ ceph_assert(open_ops >= 0);
+
+ // signal a blocked commit_start
+ if (blocked) {
+ blocked_cond.notify_all();
+ }
+
+ // there can be multiple applies in flight; track the max value we
+ // have seen. Note that we can't _read_ this value and learn anything
+ // meaningful unless/until we've quiesced all in-flight applies.
+ if (op > max_applied_seq)
+ max_applied_seq = op;
+}
+
+uint64_t JournalingObjectStore::SubmitManager::op_submit_start()
+{
+ lock.lock();
+ uint64_t op = ++op_seq;
+ dout(10) << "op_submit_start " << op << dendl;
+ return op;
+}
+
+void JournalingObjectStore::SubmitManager::op_submit_finish(uint64_t op)
+{
+ dout(10) << "op_submit_finish " << op << dendl;
+ if (op != op_submitted + 1) {
+ dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+ << ", OUT OF ORDER" << dendl;
+ ceph_abort_msg("out of order op_submit_finish");
+ }
+ op_submitted = op;
+ lock.unlock();
+}
+
+
+// ------------------------------------------
+
+void JournalingObjectStore::ApplyManager::add_waiter(uint64_t op, Context *c)
+{
+ std::lock_guard l{com_lock};
+ ceph_assert(c);
+ commit_waiters[op].push_back(c);
+}
+
+bool JournalingObjectStore::ApplyManager::commit_start()
+{
+ bool ret = false;
+
+ {
+ std::unique_lock l{apply_lock};
+ dout(10) << "commit_start max_applied_seq " << max_applied_seq
+ << ", open_ops " << open_ops << dendl;
+ blocked = true;
+ blocked_cond.wait(l, [this] {
+ if (open_ops > 0) {
+ dout(10) << "commit_start waiting for " << open_ops
+ << " open ops to drain" << dendl;
+ }
+ return open_ops == 0;
+ });
+ ceph_assert(open_ops == 0);
+ dout(10) << "commit_start blocked, all open_ops have completed" << dendl;
+ {
+ std::lock_guard l{com_lock};
+ if (max_applied_seq == committed_seq) {
+ dout(10) << "commit_start nothing to do" << dendl;
+ blocked = false;
+ ceph_assert(commit_waiters.empty());
+ goto out;
+ }
+
+ committing_seq = max_applied_seq;
+
+ dout(10) << "commit_start committing " << committing_seq
+ << ", still blocked" << dendl;
+ }
+ }
+ ret = true;
+
+ if (journal)
+ journal->commit_start(committing_seq); // tell the journal too
+ out:
+ return ret;
+}
+
+void JournalingObjectStore::ApplyManager::commit_started()
+{
+ std::lock_guard l{apply_lock};
+ // allow new ops. (underlying fs should now be committing all prior ops)
+ dout(10) << "commit_started committing " << committing_seq << ", unblocking"
+ << dendl;
+ blocked = false;
+ blocked_cond.notify_all();
+}
+
+void JournalingObjectStore::ApplyManager::commit_finish()
+{
+ std::lock_guard l{com_lock};
+ dout(10) << "commit_finish thru " << committing_seq << dendl;
+
+ if (journal)
+ journal->committed_thru(committing_seq);
+
+ committed_seq = committing_seq;
+
+ map<version_t, vector<Context*> >::iterator p = commit_waiters.begin();
+ while (p != commit_waiters.end() &&
+ p->first <= committing_seq) {
+ finisher.queue(p->second);
+ commit_waiters.erase(p++);
+ }
+}
+
+void JournalingObjectStore::_op_journal_transactions(
+ bufferlist& tbl, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op)
+{
+ if (osd_op.get())
+ dout(10) << "op_journal_transactions " << op << " reqid_t "
+ << (static_cast<OpRequest *>(osd_op.get()))->get_reqid() << dendl;
+ else
+ dout(10) << "op_journal_transactions " << op << dendl;
+
+ if (journal && journal->is_writeable()) {
+ journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
+ } else if (onjournal) {
+ apply_manager.add_waiter(op, onjournal);
+ }
+}
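The apply/commit handshake implemented by ApplyManager above is the interesting part: appliers count themselves in and out, and the committer blocks new appliers, drains the in-flight ones, snapshots the commit point, then unblocks before the actual sync. Below is a self-contained model of just that handshake (our own names; it leaves out the journal, the waiter list, and the apply_lock/com_lock split):

#include <condition_variable>
#include <cstdint>
#include <mutex>

// Standalone model of the ApplyManager handshake; not Ceph code.
struct DemoApplyGate {
  std::mutex lock;
  std::condition_variable cond;
  bool blocked = false;
  int open_ops = 0;
  uint64_t max_applied_seq = 0, committing_seq = 0, committed_seq = 0;

  void op_apply_start(uint64_t) {
    std::unique_lock l(lock);
    cond.wait(l, [this] { return !blocked; }); // wait out a commit_start()
    ++open_ops;
  }
  void op_apply_finish(uint64_t seq) {
    std::lock_guard l(lock);
    --open_ops;
    if (seq > max_applied_seq)
      max_applied_seq = seq;
    cond.notify_all();                         // wake a draining committer
  }
  bool commit_start() {
    std::unique_lock l(lock);
    blocked = true;                            // no new applies from here on
    cond.wait(l, [this] { return open_ops == 0; });
    if (max_applied_seq == committed_seq) {    // nothing newly applied
      blocked = false;
      cond.notify_all();
      return false;
    }
    committing_seq = max_applied_seq;          // snapshot the commit point
    return true;                               // caller syncs, then finishes
  }
  void commit_started() {                      // sync has begun: unblock appliers
    std::lock_guard l(lock);
    blocked = false;
    cond.notify_all();
  }
  void commit_finished() {                     // sync is durable
    std::lock_guard l(lock);
    committed_seq = committing_seq;
  }
};

The ordering a syncer thread is expected to follow matches the three functions above: commit_start(), then commit_started() once the backing filesystem has begun committing, then commit_finish() when the commit is durable.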
diff --git a/src/os/filestore/JournalingObjectStore.h b/src/os/filestore/JournalingObjectStore.h
new file mode 100644
index 000000000..de4378b00
--- /dev/null
+++ b/src/os/filestore/JournalingObjectStore.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_JOURNALINGOBJECTSTORE_H
+#define CEPH_JOURNALINGOBJECTSTORE_H
+
+#include "os/ObjectStore.h"
+#include "Journal.h"
+#include "FileJournal.h"
+#include "common/RWLock.h"
+#include "osd/OpRequest.h"
+
+class JournalingObjectStore : public ObjectStore {
+protected:
+ Journal *journal;
+ Finisher finisher;
+
+
+ class SubmitManager {
+ CephContext* cct;
+ ceph::mutex lock = ceph::make_mutex("JOS::SubmitManager::lock");
+ uint64_t op_seq;
+ uint64_t op_submitted;
+ public:
+ SubmitManager(CephContext* cct) :
+ cct(cct),
+ op_seq(0), op_submitted(0)
+ {}
+ uint64_t op_submit_start();
+ void op_submit_finish(uint64_t op);
+ void set_op_seq(uint64_t seq) {
+ std::lock_guard l{lock};
+ op_submitted = op_seq = seq;
+ }
+ uint64_t get_op_seq() {
+ return op_seq;
+ }
+ } submit_manager;
+
+ class ApplyManager {
+ CephContext* cct;
+ Journal *&journal;
+ Finisher &finisher;
+
+ ceph::mutex apply_lock = ceph::make_mutex("JOS::ApplyManager::apply_lock");
+ bool blocked;
+ ceph::condition_variable blocked_cond;
+ int open_ops;
+ uint64_t max_applied_seq;
+
+ ceph::mutex com_lock = ceph::make_mutex("JOS::ApplyManager::com_lock");
+ std::map<version_t, std::vector<Context*> > commit_waiters;
+ uint64_t committing_seq, committed_seq;
+
+ public:
+ ApplyManager(CephContext* cct, Journal *&j, Finisher &f) :
+ cct(cct), journal(j), finisher(f),
+ blocked(false),
+ open_ops(0),
+ max_applied_seq(0),
+ committing_seq(0), committed_seq(0) {}
+ void reset() {
+ ceph_assert(open_ops == 0);
+ ceph_assert(blocked == false);
+ max_applied_seq = 0;
+ committing_seq = 0;
+ committed_seq = 0;
+ }
+ void add_waiter(uint64_t, Context*);
+ uint64_t op_apply_start(uint64_t op);
+ void op_apply_finish(uint64_t op);
+ bool commit_start();
+ void commit_started();
+ void commit_finish();
+ bool is_committing() {
+ std::lock_guard l{com_lock};
+ return committing_seq != committed_seq;
+ }
+ uint64_t get_committed_seq() {
+ std::lock_guard l{com_lock};
+ return committed_seq;
+ }
+ uint64_t get_committing_seq() {
+ std::lock_guard l{com_lock};
+ return committing_seq;
+ }
+ void init_seq(uint64_t fs_op_seq) {
+ {
+ std::lock_guard l{com_lock};
+ committed_seq = fs_op_seq;
+ committing_seq = fs_op_seq;
+ }
+ {
+ std::lock_guard l{apply_lock};
+ max_applied_seq = fs_op_seq;
+ }
+ }
+ } apply_manager;
+
+ bool replaying;
+
+protected:
+ void journal_start();
+ void journal_stop();
+ void journal_write_close();
+ int journal_replay(uint64_t fs_op_seq);
+
+ void _op_journal_transactions(ceph::buffer::list& tls, uint32_t orig_len, uint64_t op,
+ Context *onjournal, TrackedOpRef osd_op);
+
+ virtual int do_transactions(std::vector<ObjectStore::Transaction>& tls, uint64_t op_seq) = 0;
+
+public:
+ bool is_committing() {
+ return apply_manager.is_committing();
+ }
+ uint64_t get_committed_seq() {
+ return apply_manager.get_committed_seq();
+ }
+
+public:
+ JournalingObjectStore(CephContext* cct, const std::string& path)
+ : ObjectStore(cct, path),
+ journal(NULL),
+ finisher(cct, "JournalObjectStore", "fn_jrn_objstore"),
+ submit_manager(cct),
+ apply_manager(cct, journal, finisher),
+ replaying(false) {}
+
+ ~JournalingObjectStore() override {
+ }
+};
+
+#endif
diff --git a/src/os/filestore/LFNIndex.cc b/src/os/filestore/LFNIndex.cc
new file mode 100644
index 000000000..cc4fbad95
--- /dev/null
+++ b/src/os/filestore/LFNIndex.cc
@@ -0,0 +1,1438 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <errno.h>
+#include <string.h>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/buffer.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "chain_xattr.h"
+
+#include "LFNIndex.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "LFNIndex(" << get_base_path() << ") "
+
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::crypto::SHA1;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+
+const string LFNIndex::LFN_ATTR = "user.cephos.lfn";
+const string LFNIndex::PHASH_ATTR_PREFIX = "user.cephos.phash.";
+const string LFNIndex::SUBDIR_PREFIX = "DIR_";
+const string LFNIndex::FILENAME_COOKIE = "long";
+const int LFNIndex::FILENAME_PREFIX_LEN = FILENAME_SHORT_LEN - FILENAME_HASH_LEN -
+ FILENAME_COOKIE.size() -
+ FILENAME_EXTRA;
+void LFNIndex::maybe_inject_failure()
+{
+ if (error_injection_enabled) {
+ if (current_failure > last_failure &&
+ (((double)(rand() % 10000))/((double)(10000))
+ < error_injection_probability)) {
+ last_failure = current_failure;
+ current_failure = 0;
+ throw RetryException();
+ }
+ ++current_failure;
+ }
+}
+
+// Helper to close fd's when we leave scope. This is useful when used
+// in combination with RetryException, thrown by the above.
+struct FDCloser {
+ int fd;
+ explicit FDCloser(int f) : fd(f) {}
+ ~FDCloser() {
+ VOID_TEMP_FAILURE_RETRY(::close(fd));
+ }
+};
+
+
+/* Public methods */
+
+uint64_t LFNIndex::get_max_escaped_name_len(const hobject_t &obj)
+{
+ ghobject_t ghobj(obj);
+ ghobj.shard_id = shard_id_t(0);
+ ghobj.generation = 0;
+ ghobj.hobj.snap = 0;
+ return lfn_generate_object_name_current(ghobj).size();
+}
+
+int LFNIndex::init()
+{
+ return _init();
+}
+
+int LFNIndex::created(const ghobject_t &oid, const char *path)
+{
+ WRAP_RETRY(
+ vector<string> path_comp;
+ string short_name;
+ r = decompose_full_path(path, &path_comp, 0, &short_name);
+ if (r < 0)
+ goto out;
+ r = lfn_created(path_comp, oid, short_name);
+ if (r < 0) {
+ if (failed) {
+ /* This is hacky, but the only way we get ENOENT from lfn_created here is
+ * if we did a failure injection in _created below AND actually started the
+ * split or merge. In that case, lfn_created already succeeded, and
+ * WRAP_RETRY already cleaned it up and we are actually done. In a real
+ * failure, the filestore itself would have ended up calling this with
+ * the new path, not the old one, so we'd find it.
+ */
+ r = 0;
+ }
+ goto out;
+ }
+ r = _created(path_comp, oid, short_name);
+ if (r < 0)
+ goto out;
+ );
+}
+
+int LFNIndex::unlink(const ghobject_t &oid)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, NULL);
+ if (r < 0) {
+ goto out;
+ }
+ r = _remove(path, oid, short_name);
+ if (r < 0) {
+ goto out;
+ }
+ );
+}
+
+int LFNIndex::lookup(const ghobject_t &oid,
+ IndexedPath *out_path,
+ int *hardlink)
+{
+ WRAP_RETRY(
+ vector<string> path;
+ string short_name;
+ r = _lookup(oid, &path, &short_name, hardlink);
+ if (r < 0)
+ goto out;
+ string full_path = get_full_path(path, short_name);
+ *out_path = std::make_shared<Path>(full_path, this);
+ r = 0;
+ );
+}
+
+int LFNIndex::pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs)
+{
+ return _pre_hash_collection(pg_num, expected_num_objs);
+}
+
+
+int LFNIndex::collection_list_partial(const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ vector<ghobject_t> *ls,
+ ghobject_t *next)
+{
+ return _collection_list_partial(start, end, max_count, ls, next);
+}
+
+/* Derived class utility methods */
+
+int LFNIndex::fsync_dir(const vector<string> &path)
+{
+ maybe_inject_failure();
+ int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ maybe_inject_failure();
+ int r = ::fsync(fd);
+ maybe_inject_failure();
+ if (r < 0) {
+ derr << __func__ << " fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+ return 0;
+}
+
+int LFNIndex::link_object(const vector<string> &from,
+ const vector<string> &to,
+ const ghobject_t &oid,
+ const string &from_short_name)
+{
+ int r;
+ string from_path = get_full_path(from, from_short_name);
+ string to_path;
+ maybe_inject_failure();
+ r = lfn_get_name(to, oid, 0, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_objects(const vector<string> &dir,
+ const map<string, ghobject_t> &to_remove,
+ map<string, ghobject_t> *remaining)
+{
+ set<string> clean_chains;
+ for (map<string, ghobject_t>::const_iterator to_clean = to_remove.begin();
+ to_clean != to_remove.end();
+ ++to_clean) {
+ if (!lfn_is_hashed_filename(to_clean->first)) {
+ maybe_inject_failure();
+ int r = ::unlink(get_full_path(dir, to_clean->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ if (clean_chains.count(lfn_get_short_name(to_clean->second, 0)))
+ continue;
+ set<int> holes;
+ map<int, pair<string, ghobject_t> > chain;
+ for (int i = 0; ; ++i) {
+ string short_name = lfn_get_short_name(to_clean->second, i);
+ if (remaining->count(short_name)) {
+ chain[i] = *(remaining->find(short_name));
+ } else if (to_remove.count(short_name)) {
+ holes.insert(i);
+ } else {
+ break;
+ }
+ }
+
+ map<int, pair<string, ghobject_t > >::reverse_iterator candidate = chain.rbegin();
+ for (set<int>::iterator i = holes.begin();
+ i != holes.end();
+ ++i) {
+ if (candidate == chain.rend() || *i > candidate->first) {
+ string remove_path_name =
+ get_full_path(dir, lfn_get_short_name(to_clean->second, *i));
+ maybe_inject_failure();
+ int r = ::unlink(remove_path_name.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ continue;
+ }
+ string from = get_full_path(dir, candidate->second.first);
+ string to = get_full_path(dir, lfn_get_short_name(candidate->second.second, *i));
+ maybe_inject_failure();
+ int r = ::rename(from.c_str(), to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ remaining->erase(candidate->second.first);
+ remaining->insert(pair<string, ghobject_t>(
+ lfn_get_short_name(candidate->second.second, *i),
+ candidate->second.second));
+ ++candidate;
+ }
+ if (!holes.empty())
+ clean_chains.insert(lfn_get_short_name(to_clean->second, 0));
+ }
+ return 0;
+}
+
+int LFNIndex::move_objects(const vector<string> &from,
+ const vector<string> &to)
+{
+ map<string, ghobject_t> to_move;
+ int r;
+ r = list_objects(from, 0, NULL, &to_move);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ string from_path = get_full_path(from, i->first);
+ string to_path, to_name;
+ r = lfn_get_name(to, i->second, &to_name, &to_path, 0);
+ if (r < 0)
+ return r;
+ maybe_inject_failure();
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0 && errno != EEXIST)
+ return -errno;
+ maybe_inject_failure();
+ r = lfn_created(to, i->second, to_name);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ }
+ r = fsync_dir(to);
+ if (r < 0)
+ return r;
+ for (map<string,ghobject_t>::iterator i = to_move.begin();
+ i != to_move.end();
+ ++i) {
+ maybe_inject_failure();
+ r = ::unlink(get_full_path(from, i->first).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ return fsync_dir(from);
+}
+
+int LFNIndex::remove_object(const vector<string> &from,
+ const ghobject_t &oid)
+{
+ string short_name;
+ int r, exist;
+ maybe_inject_failure();
+ r = get_mangled_name(from, oid, &short_name, &exist);
+ maybe_inject_failure();
+ if (r < 0)
+ return r;
+ if (exist == 0)
+ return -ENOENT;
+ return lfn_unlink(from, oid, short_name);
+}
+
+int LFNIndex::get_mangled_name(const vector<string> &from,
+ const ghobject_t &oid,
+ string *mangled_name, int *hardlink)
+{
+ return lfn_get_name(from, oid, mangled_name, 0, hardlink);
+}
+
+int LFNIndex::move_subdir(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ string dir
+ )
+{
+ vector<string> sub_path(path.begin(), path.end());
+ sub_path.push_back(dir);
+ string from_path(from.get_full_path_subdir(sub_path));
+ string to_path(dest.get_full_path_subdir(sub_path));
+ int r = ::rename(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return -errno;
+ return 0;
+}
+
+int LFNIndex::move_object(
+ LFNIndex &from,
+ LFNIndex &dest,
+ const vector<string> &path,
+ const pair<string, ghobject_t> &obj
+ )
+{
+ string from_path(from.get_full_path(path, obj.first));
+ string to_path;
+ string to_name;
+ int exists;
+ int r = dest.lfn_get_name(path, obj.second, &to_name, &to_path, &exists);
+ if (r < 0)
+ return r;
+ if (!exists) {
+ r = ::link(from_path.c_str(), to_path.c_str());
+ if (r < 0)
+ return r;
+ }
+ r = dest.lfn_created(path, obj.second, to_name);
+ if (r < 0)
+ return r;
+ r = dest.fsync_dir(path);
+ if (r < 0)
+ return r;
+ r = from.remove_object(path, obj.second);
+ if (r < 0)
+ return r;
+ return from.fsync_dir(path);
+}
+
+
+static int get_hobject_from_oinfo(const char *dir, const char *file,
+ ghobject_t *o)
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/%s", dir, file);
+ // Hack, user.ceph._ is the attribute used to store the object info
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ path,
+ "user.ceph._",
+ &bp);
+ if (r < 0)
+ return r;
+ bufferlist bl;
+ if (r > 0)
+ bl.push_back(bp);
+ object_info_t oi(bl);
+ *o = ghobject_t(oi.soid);
+ return 0;
+}
+
+
+int LFNIndex::list_objects(const vector<string> &to_list, int max_objs,
+ long *handle, map<string, ghobject_t> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ if (!dir) {
+ return -errno;
+ }
+
+ if (handle && *handle) {
+ seekdir(dir, *handle);
+ }
+
+ struct dirent *de = nullptr;
+ int r = 0;
+ int listed = 0;
+ bool end = true;
+ while (true) {
+ errno = 0;
+ de = ::readdir(dir);
+ if (de == nullptr) {
+ if (errno != 0) {
+ r = -errno;
+ dout(0) << "readdir failed " << to_list_path << ": "
+ << cpp_strerror(-r) << dendl;
+ goto cleanup;
+ }
+ break;
+ }
+ end = false;
+ if (max_objs > 0 && listed >= max_objs) {
+ break;
+ }
+ if (de->d_name[0] == '.')
+ continue;
+ string short_name(de->d_name);
+ ghobject_t obj;
+ if (lfn_is_object(short_name)) {
+ r = lfn_translate(to_list, short_name, &obj);
+ if (r == -EINVAL) {
+ continue;
+ } else if (r < 0) {
+ goto cleanup;
+ } else {
+ string long_name = lfn_generate_object_name(obj);
+ if (!lfn_must_hash(long_name)) {
+ ceph_assert(long_name == short_name);
+ }
+ if (index_version == HASH_INDEX_TAG)
+ get_hobject_from_oinfo(to_list_path.c_str(), short_name.c_str(), &obj);
+
+ out->insert(pair<string, ghobject_t>(short_name, obj));
+ ++listed;
+ }
+ }
+ }
+
+ if (handle && !end) {
+ *handle = telldir(dir);
+ }
+
+ r = 0;
+ cleanup:
+ ::closedir(dir);
+ return r;
+}
+
+int LFNIndex::list_subdirs(const vector<string> &to_list,
+ vector<string> *out)
+{
+ string to_list_path = get_full_path_subdir(to_list);
+ DIR *dir = ::opendir(to_list_path.c_str());
+ if (!dir)
+ return -errno;
+
+ struct dirent *de = nullptr;
+ int r = 0;
+ while (true) {
+ errno = 0;
+ de = ::readdir(dir);
+ if (de == nullptr) {
+ if (errno != 0) {
+ r = -errno;
+ dout(0) << "readdir failed " << to_list_path << ": "
+ << cpp_strerror(-r) << dendl;
+ }
+ break;
+ }
+ string short_name(de->d_name);
+ string demangled_name;
+ if (lfn_is_subdir(short_name, &demangled_name)) {
+ out->push_back(demangled_name);
+ }
+ }
+
+ ::closedir(dir);
+ return r;
+}
+
+int LFNIndex::create_path(const vector<string> &to_create)
+{
+ maybe_inject_failure();
+ int r = ::mkdir(get_full_path_subdir(to_create).c_str(), 0777);
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::remove_path(const vector<string> &to_remove)
+{
+ maybe_inject_failure();
+ int r = ::rmdir(get_full_path_subdir(to_remove).c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ else
+ return 0;
+}
+
+int LFNIndex::path_exists(const vector<string> &to_check, int *exists)
+{
+ string full_path = get_full_path_subdir(to_check);
+ struct stat buf;
+ if (::stat(full_path.c_str(), &buf)) {
+ int r = -errno;
+ if (r == -ENOENT) {
+ *exists = 0;
+ return 0;
+ } else {
+ return r;
+ }
+ } else {
+ *exists = 1;
+ return 0;
+ }
+}
+
+int LFNIndex::add_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ maybe_inject_failure();
+ return chain_setxattr<false, true>(
+ full_path.c_str(), mangle_attr_name(attr_name).c_str(),
+ reinterpret_cast<void *>(attr_value.c_str()),
+ attr_value.length());
+}
+
+int LFNIndex::get_attr_path(const vector<string> &path,
+ const string &attr_name,
+ bufferlist &attr_value)
+{
+ string full_path = get_full_path_subdir(path);
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ mangle_attr_name(attr_name).c_str(),
+ &bp);
+ if (r > 0)
+ attr_value.push_back(bp);
+ return r;
+}
+
+int LFNIndex::remove_attr_path(const vector<string> &path,
+ const string &attr_name)
+{
+ string full_path = get_full_path_subdir(path);
+ string mangled_attr_name = mangle_attr_name(attr_name);
+ maybe_inject_failure();
+ return chain_removexattr(full_path.c_str(), mangled_attr_name.c_str());
+}
+
+string LFNIndex::lfn_generate_object_name_keyless(const ghobject_t &oid)
+{
+ char s[FILENAME_MAX_LEN];
+ char *end = s + sizeof(s);
+ char *t = s;
+
+ ceph_assert(oid.generation == ghobject_t::NO_GEN);
+ const char *i = oid.hobj.oid.name.c_str();
+ // Escape subdir prefix
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ *t++ = '\\';
+ *t++ = 'd';
+ i += 4;
+ }
+ while (*i && t < end) {
+ if (*i == '\\') {
+ *t++ = '\\';
+ *t++ = '\\';
+ } else if (*i == '.' && i == oid.hobj.oid.name.c_str()) { // only escape leading .
+ *t++ = '\\';
+ *t++ = '.';
+ } else if (*i == '/') {
+ *t++ = '\\';
+ *t++ = 's';
+ } else
+ *t++ = *i;
+ i++;
+ }
+
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "_head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "_snapdir");
+ else
+ t += snprintf(t, end - t, "_%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+
+ return string(s);
+}
+
+static void append_escaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ out->append("\\\\");
+ } else if (*i == '/') {
+ out->append("\\s");
+ } else if (*i == '_') {
+ out->append("\\u");
+ } else if (*i == '\0') {
+ out->append("\\n");
+ } else {
+ out->append(i, i+1);
+ }
+ }
+}
+
+string LFNIndex::lfn_generate_object_name_current(const ghobject_t &oid)
+{
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char buf[PATH_MAX];
+ char *t = buf;
+ const char *end = t + sizeof(buf);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ t += snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name.append(buf, t);
+ full_name.append("_");
+
+ append_escaped(oid.hobj.nspace.begin(), oid.hobj.nspace.end(), &full_name);
+ full_name.append("_");
+
+ t = buf;
+ if (oid.hobj.pool == -1)
+ t += snprintf(t, end - t, "none");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.pool);
+ full_name.append(buf, t);
+
+ if (oid.generation != ghobject_t::NO_GEN ||
+ oid.shard_id != shard_id_t::NO_SHARD) {
+ full_name.append("_");
+
+ t = buf;
+ t += snprintf(t, end - buf, "%llx", (long long unsigned)oid.generation);
+ full_name.append(buf, t);
+
+ full_name.append("_");
+
+ t = buf;
+ t += snprintf(t, end - buf, "%x", (int)oid.shard_id);
+ full_name.append(buf, t);
+ }
+
+ return full_name;
+}
+
+string LFNIndex::lfn_generate_object_name_poolless(const ghobject_t &oid)
+{
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+
+ ceph_assert(oid.generation == ghobject_t::NO_GEN);
+ string full_name;
+ string::const_iterator i = oid.hobj.oid.name.begin();
+ if (oid.hobj.oid.name.substr(0, 4) == "DIR_") {
+ full_name.append("\\d");
+ i += 4;
+ } else if (oid.hobj.oid.name[0] == '.') {
+ full_name.append("\\.");
+ ++i;
+ }
+ append_escaped(i, oid.hobj.oid.name.end(), &full_name);
+ full_name.append("_");
+ append_escaped(oid.hobj.get_key().begin(), oid.hobj.get_key().end(), &full_name);
+ full_name.append("_");
+
+ char snap_with_hash[PATH_MAX];
+ char *t = snap_with_hash;
+ char *end = t + sizeof(snap_with_hash);
+ if (oid.hobj.snap == CEPH_NOSNAP)
+ t += snprintf(t, end - t, "head");
+ else if (oid.hobj.snap == CEPH_SNAPDIR)
+ t += snprintf(t, end - t, "snapdir");
+ else
+ t += snprintf(t, end - t, "%llx", (long long unsigned)oid.hobj.snap);
+ snprintf(t, end - t, "_%.*X", (int)(sizeof(oid.hobj.get_hash())*2), oid.hobj.get_hash());
+ full_name += string(snap_with_hash);
+ return full_name;
+}
+
+int LFNIndex::lfn_get_name(const vector<string> &path,
+ const ghobject_t &oid,
+ string *mangled_name, string *out_path,
+ int *hardlink)
+{
+ string full_name = lfn_generate_object_name(oid);
+ int r;
+
+ if (!lfn_must_hash(full_name)) {
+ if (mangled_name)
+ *mangled_name = full_name;
+ if (out_path)
+ *out_path = get_full_path(path, full_name);
+ if (hardlink) {
+ struct stat buf;
+ string full_path = get_full_path(path, full_name);
+ maybe_inject_failure();
+ r = ::stat(full_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = buf.st_nlink;
+ }
+ }
+ return 0;
+ }
+
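+ // The long name does not fit in a filename, so it is stored hashed. Probe
+ // short-name candidates lfn_get_short_name(oid, 0), (oid, 1), ... until one
+ // whose lfn (or alt lfn) xattr matches full_name is found, or a free slot
+ // (missing file or missing attr) is hit.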
+ int i = 0;
+ string candidate;
+ string candidate_path;
+ for ( ; ; ++i) {
+ candidate = lfn_get_short_name(oid, i);
+ candidate_path = get_full_path(path, candidate);
+ bufferptr bp;
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r < 0) {
+ if (errno != ENODATA && errno != ENOENT)
+ return -errno;
+ if (errno == ENODATA) {
+ // Left over from incomplete transaction, it'll be replayed
+ maybe_inject_failure();
+ r = ::unlink(candidate_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = 0;
+ return 0;
+ }
+ ceph_assert(r > 0);
+ string lfn(bp.c_str(), bp.length());
+ if (lfn == full_name) {
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink) {
+ struct stat st;
+ r = ::stat(candidate_path.c_str(), &st);
+ if (r < 0) {
+ if (errno == ENOENT)
+ *hardlink = 0;
+ else
+ return -errno;
+ } else {
+ *hardlink = st.st_nlink;
+ }
+ }
+ return 0;
+ }
+ bp = bufferptr();
+ r = chain_getxattr_buf(
+ candidate_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ // only consider alt name if nlink > 1
+ struct stat st;
+ int rc = ::stat(candidate_path.c_str(), &st);
+ if (rc < 0)
+ return -errno;
+ if (st.st_nlink <= 1) {
+ // left over from incomplete unlink, remove
+ maybe_inject_failure();
+ dout(20) << __func__ << " found extra alt attr for " << candidate_path
+ << ", long name " << string(bp.c_str(), bp.length()) << dendl;
+ rc = chain_removexattr(candidate_path.c_str(),
+ get_alt_lfn_attr().c_str());
+ maybe_inject_failure();
+ if (rc < 0)
+ return rc;
+ continue;
+ }
+ string lfn(bp.c_str(), bp.length());
+ if (lfn == full_name) {
+ dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+ if (mangled_name)
+ *mangled_name = candidate;
+ if (out_path)
+ *out_path = candidate_path;
+ if (hardlink)
+ *hardlink = st.st_nlink;
+ return 0;
+ }
+ }
+ }
+ ceph_abort(); // Unreachable
+ return 0;
+}
+
+int LFNIndex::lfn_created(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name))
+ return 0;
+ string full_path = get_full_path(path, mangled_name);
+ string full_name = lfn_generate_object_name(oid);
+ maybe_inject_failure();
+
+ // if the main attr exists and is different, move it to the alt attr.
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ string lfn(bp.c_str(), bp.length());
+ if (lfn != full_name) {
+ dout(20) << __func__ << " " << mangled_name
+ << " moving old name to alt attr "
+ << lfn
+ << ", new name is " << full_name << dendl;
+ r = chain_setxattr<false, true>(
+ full_path.c_str(), get_alt_lfn_attr().c_str(),
+ bp.c_str(), bp.length());
+ if (r < 0)
+ return r;
+ }
+ }
+
+ return chain_setxattr<false, true>(
+ full_path.c_str(), get_lfn_attr().c_str(),
+ full_name.c_str(), full_name.size());
+}
+
+int LFNIndex::lfn_unlink(const vector<string> &path,
+ const ghobject_t &oid,
+ const string &mangled_name)
+{
+ if (!lfn_is_hashed_filename(mangled_name)) {
+ string full_path = get_full_path(path, mangled_name);
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ return 0;
+ }
+
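+ // Hashed short names for an object form a dense chain indexed 0, 1, 2, ...
+ // Find the index being removed and the end of the chain so that the last
+ // entry can be renamed into the vacated slot, keeping the chain dense.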
+ int i = 0;
+ for ( ; ; ++i) {
+ string candidate = lfn_get_short_name(oid, i);
+ if (candidate == mangled_name)
+ break;
+ }
+ int removed_index = i;
+ ++i;
+ for ( ; ; ++i) {
+ struct stat buf;
+ string to_check = lfn_get_short_name(oid, i);
+ string to_check_path = get_full_path(path, to_check);
+ int r = ::stat(to_check_path.c_str(), &buf);
+ if (r < 0) {
+ if (errno == ENOENT) {
+ break;
+ } else {
+ return -errno;
+ }
+ }
+ }
+ string full_path = get_full_path(path, mangled_name);
+ int fd = ::open(full_path.c_str(), O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+ FDCloser f(fd);
+ if (i == removed_index + 1) {
+ maybe_inject_failure();
+ int r = ::unlink(full_path.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ } else {
+ string& rename_to = full_path;
+ string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+ maybe_inject_failure();
+ int r = ::rename(rename_from.c_str(), rename_to.c_str());
+ maybe_inject_failure();
+ if (r < 0)
+ return -errno;
+ }
+ struct stat st;
+ int r = ::fstat(fd, &st);
+ if (r == 0 && st.st_nlink > 0) {
+ // remove alt attr
+ dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+ fsync_dir(path);
+ chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+ }
+ return r;
+}
+
+int LFNIndex::lfn_translate(const vector<string> &path,
+ const string &short_name,
+ ghobject_t *out)
+{
+ if (!lfn_is_hashed_filename(short_name)) {
+ return lfn_parse_object_name(short_name, out);
+ }
+ string full_path = get_full_path(path, short_name);
+ // First, check alt attr
+ bufferptr bp;
+ int r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_alt_lfn_attr().c_str(),
+ &bp);
+ if (r > 0) {
+ // There is an alt attr, does it match?
+ string lfn(bp.c_str(), bp.length());
+ if (short_name_matches(short_name.c_str(), lfn.c_str())) {
+ return lfn_parse_object_name(lfn, out);
+ }
+ }
+
+ // Get lfn_attr
+ bp = bufferptr();
+ r = chain_getxattr_buf(
+ full_path.c_str(),
+ get_lfn_attr().c_str(),
+ &bp);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+
+ string long_name(bp.c_str(), bp.length());
+ return lfn_parse_object_name(long_name, out);
+}
+
+bool LFNIndex::lfn_is_object(const string &short_name)
+{
+ return lfn_is_hashed_filename(short_name) || !lfn_is_subdir(short_name, 0);
+}
+
+bool LFNIndex::lfn_is_subdir(const string &name, string *demangled)
+{
+ if (name.substr(0, SUBDIR_PREFIX.size()) == SUBDIR_PREFIX) {
+ if (demangled)
+ *demangled = demangle_path_component(name);
+ return 1;
+ }
+ return 0;
+}
+
+static int parse_object(const char *s, ghobject_t& o)
+{
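+ // Keyless (HASH_INDEX_TAG) format: <escaped name>_<snap>_<hash>, where the
+ // name may carry a "\d" prefix standing in for a literal "DIR_".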
+ const char *hash = s + strlen(s) - 1;
+ while (*hash != '_' &&
+ hash > s)
+ hash--;
+ const char *bar = hash - 1;
+ while (*bar != '_' &&
+ bar > s)
+ bar--;
+ if (*bar == '_') {
+ char buf[bar-s + 1];
+ char *t = buf;
+ const char *i = s;
+ while (i < bar) {
+ if (*i == '\\') {
+ i++;
+ switch (*i) {
+ case '\\': *t++ = '\\'; break;
+ case '.': *t++ = '.'; break;
+ case 's': *t++ = '/'; break;
+ case 'd': {
+ *t++ = 'D';
+ *t++ = 'I';
+ *t++ = 'R';
+ *t++ = '_';
+ break;
+ }
+ default: ceph_abort();
+ }
+ } else {
+ *t++ = *i;
+ }
+ i++;
+ }
+ *t = 0;
+ o.hobj.oid.name = string(buf, t-buf);
+ if (strncmp(bar+1, "head", 4) == 0)
+ o.hobj.snap = CEPH_NOSNAP;
+ else if (strncmp(bar+1, "snapdir", 7) == 0)
+ o.hobj.snap = CEPH_SNAPDIR;
+ else
+ o.hobj.snap = strtoull(bar+1, NULL, 16);
+
+ uint32_t hobject_hash_input;
+ sscanf(hash, "_%X", &hobject_hash_input);
+ o.hobj.set_hash(hobject_hash_input);
+
+ return 1;
+ }
+ return 0;
+}
+
+int LFNIndex::lfn_parse_object_name_keyless(const string &long_name, ghobject_t *out)
+{
+ int r = parse_object(long_name.c_str(), *out);
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ out->hobj.pool = pool;
+ if (!r) return -EINVAL;
+ string temp = lfn_generate_object_name(*out);
+ return 0;
+}
+
+static bool append_unescaped(string::const_iterator begin,
+ string::const_iterator end,
+ string *out)
+{
+ for (string::const_iterator i = begin; i != end; ++i) {
+ if (*i == '\\') {
+ ++i;
+ if (*i == '\\')
+ out->append("\\");
+ else if (*i == 's')
+ out->append("/");
+ else if (*i == 'n')
+ (*out) += '\0';
+ else if (*i == 'u')
+ out->append("_");
+ else
+ return false;
+ } else {
+ out->append(i, i+1);
+ }
+ }
+ return true;
+}
+
+int LFNIndex::lfn_parse_object_name_poolless(const string &long_name,
+ ghobject_t *out)
+{
+ string name;
+ string key;
+ uint32_t hash;
+ snapid_t snap;
+
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return -EINVAL;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &name))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &key))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return -EINVAL;
+ string hash_str(current, end);
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+
+ int64_t pool = -1;
+ spg_t pg;
+ if (coll().is_pg_prefix(&pg))
+ pool = (int64_t)pg.pgid.pool();
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, pool, ""));
+ return 0;
+}
+
+
+int LFNIndex::lfn_parse_object_name(const string &long_name, ghobject_t *out)
+{
+ string name;
+ string key;
+ string ns;
+ uint32_t hash;
+ snapid_t snap;
+ uint64_t pool;
+ gen_t generation = ghobject_t::NO_GEN;
+ shard_id_t shard_id = shard_id_t::NO_SHARD;
+
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_parse_object_name_keyless(long_name, out);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_parse_object_name_poolless(long_name, out);
+
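+ // Current format: <name>_<key>_<snap>_<hash>_<namespace>_<pool>[_<gen>_<shard>],
+ // '_'-separated with backslash escaping inside each field.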
+ string::const_iterator current = long_name.begin();
+ if (*current == '\\') {
+ ++current;
+ if (current == long_name.end()) {
+ return -EINVAL;
+ } else if (*current == 'd') {
+ name.append("DIR_");
+ ++current;
+ } else if (*current == '.') {
+ name.append(".");
+ ++current;
+ } else {
+ --current;
+ }
+ }
+
+ string::const_iterator end = current;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &name))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &key))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string snap_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ string hash_str(current, end);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ if (!append_unescaped(current, end, &ns))
+ return -EINVAL;
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ string pstring(current, end);
+
+ // Optional generation/shard_id
+ string genstring, shardstring;
+ if (end != long_name.end()) {
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end == long_name.end())
+ return -EINVAL;
+ genstring = string(current, end);
+
+ generation = (gen_t)strtoull(genstring.c_str(), NULL, 16);
+
+ current = ++end;
+ for ( ; end != long_name.end() && *end != '_'; ++end) ;
+ if (end != long_name.end())
+ return -EINVAL;
+ shardstring = string(current, end);
+
+ shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16);
+ }
+
+ if (snap_str == "head")
+ snap = CEPH_NOSNAP;
+ else if (snap_str == "snapdir")
+ snap = CEPH_SNAPDIR;
+ else
+ snap = strtoull(snap_str.c_str(), NULL, 16);
+ sscanf(hash_str.c_str(), "%X", &hash);
+
+ if (pstring == "none")
+ pool = (uint64_t)-1;
+ else
+ pool = strtoull(pstring.c_str(), NULL, 16);
+
+ (*out) = ghobject_t(hobject_t(name, key, snap, hash, (int64_t)pool, ns), generation, shard_id);
+ return 0;
+}
+
+bool LFNIndex::lfn_is_hashed_filename(const string &name)
+{
+ if (name.size() < (unsigned)FILENAME_SHORT_LEN) {
+ return 0;
+ }
+ if (name.substr(name.size() - FILENAME_COOKIE.size(), FILENAME_COOKIE.size())
+ == FILENAME_COOKIE) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+bool LFNIndex::lfn_must_hash(const string &long_name)
+{
+ return (int)long_name.size() >= FILENAME_SHORT_LEN;
+}
+
+static inline void buf_to_hex(const unsigned char *buf, int len, char *str)
+{
+ int i;
+ str[0] = '\0';
+ for (i = 0; i < len; i++) {
+ sprintf(&str[i*2], "%02x", (int)buf[i]);
+ }
+}
+
+int LFNIndex::hash_filename(const char *filename, char *hash, int buf_len)
+{
+ if (buf_len < FILENAME_HASH_LEN + 1)
+ return -EINVAL;
+
+ char buf[FILENAME_LFN_DIGEST_SIZE];
+ char hex[FILENAME_LFN_DIGEST_SIZE * 2];
+
+ SHA1 h;
+ h.Update((const unsigned char *)filename, strlen(filename));
+ h.Final((unsigned char *)buf);
+
+ buf_to_hex((unsigned char *)buf, (FILENAME_HASH_LEN + 1) / 2, hex);
+ strncpy(hash, hex, FILENAME_HASH_LEN);
+ hash[FILENAME_HASH_LEN] = '\0';
+ return 0;
+}
+
+void LFNIndex::build_filename(const char *old_filename, int i, char *filename, int len)
+{
+ char hash[FILENAME_HASH_LEN + 1];
+
+ ceph_assert(len >= FILENAME_SHORT_LEN + 4);
+
+ strncpy(filename, old_filename, FILENAME_PREFIX_LEN);
+ filename[FILENAME_PREFIX_LEN] = '\0';
+ if ((int)strlen(filename) < FILENAME_PREFIX_LEN)
+ return;
+ if (old_filename[FILENAME_PREFIX_LEN] == '\0')
+ return;
+
+ hash_filename(old_filename, hash, sizeof(hash));
+ int ofs = FILENAME_PREFIX_LEN;
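+ // Shorten the prefix one character at a time until the whole
+ // "<prefix>_<hash>_<i>_<cookie>" name fits within FILENAME_SHORT_LEN.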
+ while (1) {
+ int suffix_len = sprintf(filename + ofs, "_%s_%d_%s", hash, i, FILENAME_COOKIE.c_str());
+ if (ofs + suffix_len <= FILENAME_SHORT_LEN || !ofs)
+ break;
+ ofs--;
+ }
+}
+
+bool LFNIndex::short_name_matches(const char *short_name, const char *cand_long_name)
+{
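+ // Extract the trailing "_<index>_<cookie>" from short_name, then rebuild the
+ // short name from cand_long_name with that index and compare the results.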
+ const char *end = short_name;
+ while (*end) ++end;
+ const char *suffix = end;
+ if (suffix > short_name) --suffix; // last char
+ while (suffix > short_name && *suffix != '_') --suffix; // back to first _
+ if (suffix > short_name) --suffix; // one behind that
+ while (suffix > short_name && *suffix != '_') --suffix; // back to second _
+
+ int index = -1;
+ char buf[FILENAME_SHORT_LEN + 4];
+ ceph_assert((end - suffix) < (int)sizeof(buf));
+ int r = sscanf(suffix, "_%d_%s", &index, buf);
+ if (r < 2)
+ return false;
+ if (strcmp(buf, FILENAME_COOKIE.c_str()) != 0)
+ return false;
+ build_filename(cand_long_name, index, buf, sizeof(buf));
+ return strcmp(short_name, buf) == 0;
+}
+
+string LFNIndex::lfn_get_short_name(const ghobject_t &oid, int i)
+{
+ string long_name = lfn_generate_object_name(oid);
+ ceph_assert(lfn_must_hash(long_name));
+ char buf[FILENAME_SHORT_LEN + 4];
+ build_filename(long_name.c_str(), i, buf, sizeof(buf));
+ return string(buf);
+}
+
+const string &LFNIndex::get_base_path()
+{
+ return base_path;
+}
+
+string LFNIndex::get_full_path_subdir(const vector<string> &rel)
+{
+ string retval = get_base_path();
+ for (vector<string>::const_iterator i = rel.begin();
+ i != rel.end();
+ ++i) {
+ retval += "/";
+ retval += mangle_path_component(*i);
+ }
+ return retval;
+}
+
+string LFNIndex::get_full_path(const vector<string> &rel, const string &name)
+{
+ return get_full_path_subdir(rel) + "/" + name;
+}
+
+string LFNIndex::mangle_path_component(const string &component)
+{
+ return SUBDIR_PREFIX + component;
+}
+
+string LFNIndex::demangle_path_component(const string &component)
+{
+ return component.substr(SUBDIR_PREFIX.size(), component.size() - SUBDIR_PREFIX.size());
+}
+
+int LFNIndex::decompose_full_path(const char *in, vector<string> *out,
+ ghobject_t *oid, string *shortname)
+{
+ const char *beginning = in + get_base_path().size();
+ const char *end = beginning;
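+ // Walk "/<subdir>/.../<filename>": each pass skips the leading '/', scans one
+ // component, and demangles it; the final component is the object's filename.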
+ while (1) {
+ end++;
+ beginning = end++;
+ for ( ; *end != '\0' && *end != '/'; ++end) ;
+ if (*end != '\0') {
+ out->push_back(demangle_path_component(string(beginning, end - beginning)));
+ continue;
+ } else {
+ break;
+ }
+ }
+ *shortname = string(beginning, end - beginning);
+ if (oid) {
+ int r = lfn_translate(*out, *shortname, oid);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
+string LFNIndex::mangle_attr_name(const string &attr)
+{
+ return PHASH_ATTR_PREFIX + attr;
+}
diff --git a/src/os/filestore/LFNIndex.h b/src/os/filestore/LFNIndex.h
new file mode 100644
index 000000000..23a546480
--- /dev/null
+++ b/src/os/filestore/LFNIndex.h
@@ -0,0 +1,614 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef OS_LFNINDEX_H
+#define OS_LFNINDEX_H
+
+#include <string>
+#include <map>
+#include <set>
+#include <vector>
+#include <exception>
+
+#include "osd/osd_types.h"
+#include "include/object.h"
+#include "common/ceph_crypto.h"
+
+#include "CollectionIndex.h"
+
+/**
+ * LFNIndex also encapsulates logic for manipulating
+ * subdirectories of a collection as well as the long filename
+ * logic.
+ *
+ * The protected methods provide machinery for derived classes to
+ * manipulate subdirectories and objects.
+ *
+ * The virtual methods are to be overridden to provide the actual
+ * hashed layout.
+ *
+ * The user must call created() whenever an object is created.
+ *
+ * Synchronization: Calling code must ensure that there are no object
+ * creations or deletions during the lifetime of a Path object (except
+ * for the object at that path).
+ *
+ * Unless otherwise noted, methods which return an int return 0 on success
+ * and a negative error code on failure.
+ */
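+/**
+ * Retry wrapper used by split()/merge(): runs the body, and whenever the
+ * failure-injection machinery throws RetryException it calls cleanup() and
+ * retries; the body exits via 'goto out' with its result in r.
+ */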
+#define WRAP_RETRY(x) { \
+ bool failed = false; \
+ int r = 0; \
+ init_inject_failure(); \
+ while (1) { \
+ try { \
+ if (failed) { \
+ r = cleanup(); \
+ ceph_assert(r == 0); \
+ } \
+ { x } \
+ out: \
+ complete_inject_failure(); \
+ return r; \
+ } catch (RetryException&) { \
+ failed = true; \
+ } catch (...) { \
+ ceph_abort(); \
+ } \
+ } \
+ return -1; \
+ } \
+
+
+
+class LFNIndex : public CollectionIndex {
+ /// Hash digest output size.
+ static const int FILENAME_LFN_DIGEST_SIZE = CEPH_CRYPTO_SHA1_DIGESTSIZE;
+ /// Length of filename hash.
+ static const int FILENAME_HASH_LEN = FILENAME_LFN_DIGEST_SIZE;
+ /// Max filename size.
+ static const int FILENAME_MAX_LEN = 4096;
+ /// Length of hashed filename.
+ static const int FILENAME_SHORT_LEN = 255;
+ /// Length of hashed filename prefix.
+ static const int FILENAME_PREFIX_LEN;
+ /// Length of hashed filename cookie.
+ static const int FILENAME_EXTRA = 4;
+ /// Lfn cookie value.
+ static const std::string FILENAME_COOKIE;
+ /// Name of LFN attribute for storing full name.
+ static const std::string LFN_ATTR;
+ /// Prefix for subdir index attributes.
+ static const std::string PHASH_ATTR_PREFIX;
+ /// Prefix for index subdirectories.
+ static const std::string SUBDIR_PREFIX;
+
+ /// Path to Index base.
+ const std::string base_path;
+
+protected:
+ const uint32_t index_version;
+
+ /// Thrown by the failure-injection machinery to make WRAP_RETRY retry.
+ struct RetryException : public std::exception {};
+ bool error_injection_enabled; ///< true while retry injection is active
+ bool error_injection_on;
+ double error_injection_probability;
+ uint64_t last_failure;
+ uint64_t current_failure;
+ void init_inject_failure() {
+ if (error_injection_on) {
+ error_injection_enabled = true;
+ last_failure = current_failure = 0;
+ }
+ }
+ void maybe_inject_failure();
+ void complete_inject_failure() {
+ error_injection_enabled = false;
+ }
+
+private:
+ std::string lfn_attribute, lfn_alt_attribute;
+ coll_t collection;
+
+public:
+ /// Constructor
+ LFNIndex(
+ CephContext* cct,
+ coll_t collection,
+ const char *base_path, ///< [in] path to Index root
+ uint32_t index_version,
+ double _error_injection_probability=0)
+ : CollectionIndex(cct, collection),
+ base_path(base_path),
+ index_version(index_version),
+ error_injection_enabled(false),
+ error_injection_on(_error_injection_probability != 0),
+ error_injection_probability(_error_injection_probability),
+ last_failure(0), current_failure(0),
+ collection(collection) {
+ if (index_version == HASH_INDEX_TAG) {
+ lfn_attribute = LFN_ATTR;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", index_version);
+ lfn_attribute = LFN_ATTR + std::string(buf);
+ lfn_alt_attribute = LFN_ATTR + std::string(buf) + "-alt";
+ }
+ }
+
+ coll_t coll() const override { return collection; }
+
+ /// Virtual destructor
+ ~LFNIndex() override {}
+
+ /// @see CollectionIndex
+ int init() override;
+
+ /// @see CollectionIndex
+ int cleanup() override = 0;
+
+ /// @see CollectionIndex
+ int created(
+ const ghobject_t &oid,
+ const char *path
+ ) override;
+
+ /// @see CollectionIndex
+ int unlink(
+ const ghobject_t &oid
+ ) override;
+
+ /// @see CollectionIndex
+ int lookup(
+ const ghobject_t &oid,
+ IndexedPath *path,
+ int *hardlink
+ ) override;
+
+ /// @see CollectionIndex;
+ int pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) override;
+
+ /// @see CollectionIndex
+ int collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) override;
+
+ virtual int _split(
+ uint32_t match, //< [in] value to match
+ uint32_t bits, //< [in] bits to check
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+ virtual int _merge(
+ uint32_t bits, //< [in] bits for target
+ CollectionIndex* dest //< [in] destination index
+ ) = 0;
+
+ /// @see CollectionIndex
+ int split(
+ uint32_t match,
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _split(match, bits, dest);
+ goto out;
+ );
+ }
+
+ /// @see CollectionIndex
+ int merge(
+ uint32_t bits,
+ CollectionIndex* dest
+ ) override {
+ WRAP_RETRY(
+ r = _merge(bits, dest);
+ goto out;
+ );
+ }
+
+ /**
+ * Returns the length of the longest escaped name which could result
+ * from any clone, shard, or rollback object of this object
+ */
+ static uint64_t get_max_escaped_name_len(const hobject_t &obj);
+
+protected:
+ virtual int _init() = 0;
+
+ /// Will be called upon object creation
+ virtual int _created(
+ const std::vector<std::string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object created.
+ const std::string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Will be called to remove an object
+ virtual int _remove(
+ const std::vector<std::string> &path, ///< [in] Path to subdir.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const std::string &mangled_name ///< [in] Mangled filename.
+ ) = 0;
+
+ /// Return the path and mangled_name for oid.
+ virtual int _lookup(
+ const ghobject_t &oid,///< [in] Object for lookup.
+ std::vector<std::string> *path, ///< [out] Path to the object.
+ std::string *mangled_name, ///< [out] Mangled filename.
+ int *exists ///< [out] True if the object exists.
+ ) = 0;
+
+ /// Pre-hash the collection with the given pg number and
+ /// expected number of objects in the collection.
+ virtual int _pre_hash_collection(
+ uint32_t pg_num,
+ uint64_t expected_num_objs
+ ) = 0;
+
+ /// @see CollectionIndex
+ virtual int _collection_list_partial(
+ const ghobject_t &start,
+ const ghobject_t &end,
+ int max_count,
+ std::vector<ghobject_t> *ls,
+ ghobject_t *next
+ ) = 0;
+
+protected:
+
+ /* Non-virtual utility methods */
+
+ /// Sync a subdirectory
+ int fsync_dir(
+ const std::vector<std::string> &path ///< [in] Path to sync
+ ); ///< @return Error Code, 0 on success
+
+ /// Link an object from from into to
+ int link_object(
+ const std::vector<std::string> &from, ///< [in] Source subdirectory.
+ const std::vector<std::string> &to, ///< [in] Dest subdirectory.
+ const ghobject_t &oid, ///< [in] Object to move.
+ const std::string &from_short_name ///< [in] Mangled filename of oid.
+ ); ///< @return Error Code, 0 on success
+
+ /**
+ * Efficiently remove objects from a subdirectory
+ *
+ * remove_object invalidates mangled names in the directory, requiring
+ * the mangled name of each additional object to be looked up a second
+ * time. remove_objects avoids those additional lookups.
+ *
+ * @param [in] dir Directory from which to remove.
+ * @param [in] to_remove Map of mangled filenames to the objects to remove.
+ * @param [in,out] remaining Map of the remaining mangled filenames to objects.
+ * @return Error Code, 0 on success.
+ */
+ int remove_objects(
+ const std::vector<std::string> &dir,
+ const std::map<std::string, ghobject_t> &to_remove,
+ std::map<std::string, ghobject_t> *remaining
+ );
+
+
+ /**
+ * Moves contents of from into to.
+ *
+ * Invalidates mangled names in to. If interrupted, all objects will be
+ * present in to before objects are removed from from. Ignores EEXIST
+ * while linking into to.
+ * @return Error Code, 0 on success
+ */
+ int move_objects(
+ const std::vector<std::string> &from, ///< [in] Source subdirectory.
+ const std::vector<std::string> &to ///< [in] Dest subdirectory.
+ );
+
+ /**
+ * Remove an object from from.
+ *
+ * Invalidates mangled names in from.
+ * @return Error Code, 0 on success
+ */
+ int remove_object(
+ const std::vector<std::string> &from, ///< [in] Directory from which to remove.
+ const ghobject_t &to_remove ///< [in] Object to remove.
+ );
+
+ /**
+ * Gets the filename corresponding to oid in from.
+ *
+ * The filename may differ between subdirectories. Furthermore,
+ * file creations or removals in from may invalidate the name.
+ * @return Error code on failure, 0 on success
+ */
+ int get_mangled_name(
+ const std::vector<std::string> &from, ///< [in] Subdirectory
+ const ghobject_t &oid, ///< [in] Object
+ std::string *mangled_name, ///< [out] Filename
+ int *hardlink ///< [out] hard link count for this file; 0 means it does not exist
+ );
+
+ /// do move subdir from from to dest
+ static int move_subdir(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const std::vector<std::string> &path, ///< [in] path containing dir
+ std::string dir ///< [in] dir to move
+ );
+
+ /// do move object from from to dest
+ static int move_object(
+ LFNIndex &from, ///< [in] from index
+ LFNIndex &dest, ///< [in] to index
+ const std::vector<std::string> &path, ///< [in] path containing obj
+ const std::pair<std::string, ghobject_t> &obj ///< [in] obj to move
+ );
+
+ /**
+ * Lists objects in to_list.
+ *
+ * @param [in] to_list Directory to list.
+ * @param [in] max_objects Max number to list.
+ * @param [in,out] handle Cookie for continuing the listing.
+ * Initialize to zero to start at the beginning of the directory.
+ * @param [out] out Mapping of listed object filenames to objects.
+ * @return Error code on failure, 0 on success
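+ *
+ * A sketch of listing a subdirectory in batches (illustrative only; 'subdir'
+ * is a hypothetical path vector):
+ * @code
+ *   long handle = 0;
+ *   std::map<std::string, ghobject_t> batch;
+ *   do {
+ *     batch.clear();
+ *     int r = list_objects(subdir, 100, &handle, &batch);
+ *     if (r < 0)
+ *       return r;
+ *     // process batch ...
+ *   } while (!batch.empty());
+ * @endcode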
+ */
+ int list_objects(
+ const std::vector<std::string> &to_list,
+ int max_objects,
+ long *handle,
+ std::map<std::string, ghobject_t> *out
+ );
+
+ /// Lists subdirectories.
+ int list_subdirs(
+ const std::vector<std::string> &to_list, ///< [in] Directory to list.
+ std::vector<std::string> *out ///< [out] Subdirectories listed.
+ );
+
+ /// Create subdirectory.
+ int create_path(
+ const std::vector<std::string> &to_create ///< [in] Subdirectory to create.
+ );
+
+ /// Remove subdirectory.
+ int remove_path(
+ const std::vector<std::string> &to_remove ///< [in] Subdirectory to remove.
+ );
+
+ /// Check whether to_check exists.
+ int path_exists(
+ const std::vector<std::string> &to_check, ///< [in] Subdirectory to check.
+ int *exists ///< [out] 1 if it exists, 0 else
+ );
+
+ /// Save attr_value to attr_name attribute on path.
+ int add_attr_path(
+ const std::vector<std::string> &path, ///< [in] Path to modify.
+ const std::string &attr_name, ///< [in] Name of attribute.
+ ceph::buffer::list &attr_value ///< [in] Value to save.
+ );
+
+ /// Read into attr_value attribute attr_name on path.
+ int get_attr_path(
+ const std::vector<std::string> &path, ///< [in] Path to read.
+ const std::string &attr_name, ///< [in] Attribute to read.
+ ceph::buffer::list &attr_value ///< [out] Attribute value read.
+ );
+
+ /// Remove attr from path
+ int remove_attr_path(
+ const std::vector<std::string> &path, ///< [in] path from which to remove attr
+ const std::string &attr_name ///< [in] attr to remove
+ ); ///< @return Error code, 0 on success
+
+private:
+ /* lfn translation functions */
+
+ /**
+ * Gets the version specific lfn attribute tag
+ */
+ const std::string &get_lfn_attr() const {
+ return lfn_attribute;
+ }
+ const std::string &get_alt_lfn_attr() const {
+ return lfn_alt_attribute;
+ }
+
+ /**
+ * Gets the filename corresponding to oid in path.
+ *
+ * @param [in] path Path in which to get filename for oid.
+ * @param [in] oid Object for which to get filename.
+ * @param [out] mangled_name Filename for oid, pass NULL if not needed.
+ * @param [out] full_path Fullpath for oid, pass NULL if not needed.
+ * @param [out] hardlink Hard link count of this file, 0 means it does not
+ * exist; pass NULL if not needed.
+ * @return Error Code, 0 on success.
+ */
+ int lfn_get_name(
+ const std::vector<std::string> &path,
+ const ghobject_t &oid,
+ std::string *mangled_name,
+ std::string *full_path,
+ int *hardlink
+ );
+
+ /// Adjusts path contents when oid is created at name mangled_name.
+ int lfn_created(
+ const std::vector<std::string> &path, ///< [in] Path to adjust.
+ const ghobject_t &oid, ///< [in] Object created.
+ const std::string &mangled_name ///< [in] Filename of created object.
+ );
+
+ /// Removes oid from path while adjusting path contents
+ int lfn_unlink(
+ const std::vector<std::string> &path, ///< [in] Path containing oid.
+ const ghobject_t &oid, ///< [in] Object to remove.
+ const std::string &mangled_name ///< [in] Filename of object to remove.
+ );
+
+ /// Translate a file into a ghobject_t.
+ int lfn_translate(
+ const std::vector<std::string> &path, ///< [in] Path containing the file.
+ const std::string &short_name, ///< [in] Filename to translate.
+ ghobject_t *out ///< [out] Object found.
+ ); ///< @return Negative error code on error, 0 on success.
+
+ /* manglers/demanglers */
+ /// Filters object filenames
+ bool lfn_is_object(
+ const std::string &short_name ///< [in] Filename to check
+ ); ///< @return True if short_name is an object, false otherwise.
+
+ /// Filters subdir filenames
+ bool lfn_is_subdir(
+ const std::string &short_name, ///< [in] Filename to check.
+ std::string *demangled_name ///< [out] Demangled subdir name.
+ ); ///< @return True if short_name is a subdir, false otherwise
+
+ /// Generate object name
+ std::string lfn_generate_object_name_keyless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ std::string lfn_generate_object_name_poolless(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ static std::string lfn_generate_object_name_current(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ); ///< @return Generated object name.
+
+ /// Generate object name
+ std::string lfn_generate_object_name(
+ const ghobject_t &oid ///< [in] Object for which to generate.
+ ) {
+ if (index_version == HASH_INDEX_TAG)
+ return lfn_generate_object_name_keyless(oid);
+ if (index_version == HASH_INDEX_TAG_2)
+ return lfn_generate_object_name_poolless(oid);
+ else
+ return lfn_generate_object_name_current(oid);
+ } ///< @return Generated object name.
+
+ /// Parse object name
+ int lfn_parse_object_name_keyless(
+ const std::string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, negative error code otherwise.
+
+ /// Parse object name
+ int lfn_parse_object_name_poolless(
+ const std::string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, negative error code otherwise.
+
+ /// Parse object name
+ int lfn_parse_object_name(
+ const std::string &long_name, ///< [in] Name to parse
+ ghobject_t *out ///< [out] Resulting Object
+ ); ///< @return 0 on success, negative error code otherwise.
+
+ /// Checks whether short_name is a hashed filename.
+ bool lfn_is_hashed_filename(
+ const std::string &short_name ///< [in] Name to check.
+ ); ///< @return True if short_name is hashed, False otherwise.
+
+ /// Checks whether long_name must be hashed.
+ bool lfn_must_hash(
+ const std::string &long_name ///< [in] Name to check.
+ ); ///< @return True if long_name must be hashed, False otherwise.
+
+ /// Generate hashed name.
+ std::string lfn_get_short_name(
+ const ghobject_t &oid, ///< [in] Object for which to generate.
+ int i ///< [in] Index of hashed name to generate.
+ ); ///< @return Hashed filename.
+
+ /* other common methods */
+ /// Gets the base path
+ const std::string &get_base_path(); ///< @return Index base_path
+
+ /// Get full path the subdir
+ std::string get_full_path_subdir(
+ const std::vector<std::string> &rel ///< [in] The subdir.
+ ); ///< @return Full path to rel.
+
+ /// Get full path to object
+ std::string get_full_path(
+ const std::vector<std::string> &rel, ///< [in] Path to object.
+ const std::string &name ///< [in] Filename of object.
+ ); ///< @return Fullpath to object at name in rel.
+
+ /// Get mangled path component
+ std::string mangle_path_component(
+ const std::string &component ///< [in] Component to mangle
+ ); ///< @return Mangled component.
+
+ /// Demangle component
+ std::string demangle_path_component(
+ const std::string &component ///< [in] Subdir name to demangle
+ ); ///< @return Demangled path component.
+
+ /// Decompose full path into object name and filename.
+ int decompose_full_path(
+ const char *in, ///< [in] Full path to object.
+ std::vector<std::string> *out, ///< [out] Path to object at in.
+ ghobject_t *oid, ///< [out] Object at in.
+ std::string *shortname ///< [out] Filename of object at in.
+ ); ///< @return Error Code, 0 on success.
+
+ /// Mangle attribute name
+ std::string mangle_attr_name(
+ const std::string &attr ///< [in] Attribute to mangle.
+ ); ///< @return Mangled attribute name.
+
+ /// checks whether long_name could hash to short_name
+ bool short_name_matches(
+ const char *short_name, ///< [in] name to check against
+ const char *cand_long_name ///< [in] candidate long name
+ );
+
+ /// Builds hashed filename
+ void build_filename(
+ const char *old_filename, ///< [in] Filename to convert.
+ int i, ///< [in] Index of hash.
+ char *filename, ///< [out] Resulting filename.
+ int len ///< [in] Size of buffer for filename
+ ); ///< @return Error Code, 0 on success
+
+ /// Get hash of filename
+ int hash_filename(
+ const char *filename, ///< [in] Filename to hash.
+ char *hash, ///< [out] Hash of filename.
+ int len ///< [in] Size of hash buffer.
+ ); ///< @return Error Code, 0 on success.
+
+ friend class TestWrapLFNIndex;
+};
+typedef LFNIndex::IndexedPath IndexedPath;
+
+#endif
diff --git a/src/os/filestore/SequencerPosition.h b/src/os/filestore/SequencerPosition.h
new file mode 100644
index 000000000..789854317
--- /dev/null
+++ b/src/os/filestore/SequencerPosition.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * transaction and op offset
+ */
+struct SequencerPosition {
+ uint64_t seq; ///< seq
+ uint32_t trans; ///< transaction in that seq (0-based)
+ uint32_t op; ///< op in that transaction (0-based)
+
+ SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(seq, bl);
+ encode(trans, bl);
+ encode(op, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(seq, p);
+ decode(trans, p);
+ decode(op, p);
+ DECODE_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("trans", trans);
+ f->dump_unsigned("op", op);
+ }
+ static void generate_test_instances(std::list<SequencerPosition*>& o) {
+ o.push_back(new SequencerPosition);
+ o.push_back(new SequencerPosition(1, 2, 3));
+ o.push_back(new SequencerPosition(4, 5, 6));
+ }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+inline std::ostream& operator<<(std::ostream& out, const SequencerPosition& t) {
+ return out << t.seq << "." << t.trans << "." << t.op;
+}
+
+WRITE_EQ_OPERATORS_3(SequencerPosition, seq, trans, op)
+WRITE_CMP_OPERATORS_3(SequencerPosition, seq, trans, op)
+
+
+#endif
diff --git a/src/os/filestore/WBThrottle.cc b/src/os/filestore/WBThrottle.cc
new file mode 100644
index 000000000..252004335
--- /dev/null
+++ b/src/os/filestore/WBThrottle.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+
+#include "os/filestore/WBThrottle.h"
+#include "common/perf_counters.h"
+#include "common/errno.h"
+
+using std::pair;
+using std::string;
+
+WBThrottle::WBThrottle(CephContext *cct) :
+ cur_ios(0), cur_size(0),
+ cct(cct),
+ logger(NULL),
+ stopping(true),
+ fs(XFS)
+{
+ {
+ std::lock_guard l{lock};
+ set_from_conf();
+ }
+ ceph_assert(cct);
+ PerfCountersBuilder b(
+ cct, string("WBThrottle"),
+ l_wbthrottle_first, l_wbthrottle_last);
+ b.add_u64(l_wbthrottle_bytes_dirtied, "bytes_dirtied", "Dirty data", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64(l_wbthrottle_bytes_wb, "bytes_wb", "Written data", NULL, 0, unit_t(UNIT_BYTES));
+ b.add_u64(l_wbthrottle_ios_dirtied, "ios_dirtied", "Dirty operations");
+ b.add_u64(l_wbthrottle_ios_wb, "ios_wb", "Written operations");
+ b.add_u64(l_wbthrottle_inodes_dirtied, "inodes_dirtied", "Entries waiting for write");
+ b.add_u64(l_wbthrottle_inodes_wb, "inodes_wb", "Written entries");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ for (unsigned i = l_wbthrottle_first + 1; i != l_wbthrottle_last; ++i)
+ logger->set(i, 0);
+
+ cct->_conf.add_observer(this);
+}
+
+WBThrottle::~WBThrottle() {
+ ceph_assert(cct);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ cct->_conf.remove_observer(this);
+}
+
+void WBThrottle::start()
+{
+ {
+ std::lock_guard l{lock};
+ stopping = false;
+ }
+ create("wb_throttle");
+}
+
+void WBThrottle::stop()
+{
+ {
+ std::lock_guard l{lock};
+ stopping = true;
+ cond.notify_all();
+ }
+
+ join();
+}
+
+const char** WBThrottle::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "filestore_wbthrottle_btrfs_bytes_start_flusher",
+ "filestore_wbthrottle_btrfs_bytes_hard_limit",
+ "filestore_wbthrottle_btrfs_ios_start_flusher",
+ "filestore_wbthrottle_btrfs_ios_hard_limit",
+ "filestore_wbthrottle_btrfs_inodes_start_flusher",
+ "filestore_wbthrottle_btrfs_inodes_hard_limit",
+ "filestore_wbthrottle_xfs_bytes_start_flusher",
+ "filestore_wbthrottle_xfs_bytes_hard_limit",
+ "filestore_wbthrottle_xfs_ios_start_flusher",
+ "filestore_wbthrottle_xfs_ios_hard_limit",
+ "filestore_wbthrottle_xfs_inodes_start_flusher",
+ "filestore_wbthrottle_xfs_inodes_hard_limit",
+ NULL
+ };
+ return KEYS;
+}
+
+void WBThrottle::set_from_conf()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (fs == BTRFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_btrfs_inodes_hard_limit;
+ } else if (fs == XFS) {
+ size_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_start_flusher;
+ size_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_bytes_hard_limit;
+ io_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_ios_start_flusher;
+ io_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_ios_hard_limit;
+ fd_limits.first =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_start_flusher;
+ fd_limits.second =
+ cct->_conf->filestore_wbthrottle_xfs_inodes_hard_limit;
+ } else {
+ ceph_abort_msg("invalid value for fs");
+ }
+ cond.notify_all();
+}
+
+void WBThrottle::handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ std::lock_guard l{lock};
+ for (const char** i = get_tracked_conf_keys(); *i; ++i) {
+ if (changed.count(*i)) {
+ set_from_conf();
+ return;
+ }
+ }
+}
+
+bool WBThrottle::get_next_should_flush(
+ std::unique_lock<ceph::mutex>& locker,
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(next);
+ {
+ cond.wait(locker, [this] {
+ return stopping || (beyond_limit() && !pending_wbs.empty());
+ });
+ }
+ if (stopping)
+ return false;
+ ceph_assert(!pending_wbs.empty());
+ ghobject_t obj(pop_object());
+
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(obj);
+ *next = boost::make_tuple(obj, i->second.second, i->second.first);
+ pending_wbs.erase(i);
+ return true;
+}
+
+
+void *WBThrottle::entry()
+{
+ std::unique_lock l{lock};
+ boost::tuple<ghobject_t, FDRef, PendingWB> wb;
+ while (get_next_should_flush(l, &wb)) {
+ clearing = wb.get<0>();
+ cur_ios -= wb.get<2>().ios;
+ logger->dec(l_wbthrottle_ios_dirtied, wb.get<2>().ios);
+ logger->inc(l_wbthrottle_ios_wb, wb.get<2>().ios);
+ cur_size -= wb.get<2>().size;
+ logger->dec(l_wbthrottle_bytes_dirtied, wb.get<2>().size);
+ logger->inc(l_wbthrottle_bytes_wb, wb.get<2>().size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+ logger->inc(l_wbthrottle_inodes_wb);
+ l.unlock();
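+ // drop the lock during the flush so writers can keep queueing work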
+#if defined(HAVE_FDATASYNC)
+ int r = ::fdatasync(**wb.get<1>());
+#else
+ int r = ::fsync(**wb.get<1>());
+#endif
+ if (r < 0) {
+ lderr(cct) << "WBThrottle fsync failed: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise && wb.get<2>().nocache) {
+ int fa_r = posix_fadvise(**wb.get<1>(), 0, 0, POSIX_FADV_DONTNEED);
+ ceph_assert(fa_r == 0);
+ }
+#endif
+ l.lock();
+ clearing = ghobject_t();
+ cond.notify_all();
+ wb = boost::tuple<ghobject_t, FDRef, PendingWB>();
+ }
+ return 0;
+}
+
+void WBThrottle::queue_wb(
+ FDRef fd, const ghobject_t &hoid, uint64_t offset, uint64_t len,
+ bool nocache)
+{
+ std::lock_guard l{lock};
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator wbiter =
+ pending_wbs.find(hoid);
+ if (wbiter == pending_wbs.end()) {
+ wbiter = pending_wbs.insert(
+ make_pair(hoid,
+ make_pair(
+ PendingWB(),
+ fd))).first;
+ logger->inc(l_wbthrottle_inodes_dirtied);
+ } else {
+ remove_object(hoid);
+ }
+
+ cur_ios++;
+ logger->inc(l_wbthrottle_ios_dirtied);
+ cur_size += len;
+ logger->inc(l_wbthrottle_bytes_dirtied, len);
+
+ wbiter->second.first.add(nocache, len, 1);
+ insert_object(hoid);
+ if (beyond_limit())
+ cond.notify_all();
+}
+
+void WBThrottle::clear()
+{
+ std::lock_guard l{lock};
+ for (ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.begin();
+ i != pending_wbs.end();
+ ++i) {
+#ifdef HAVE_POSIX_FADVISE
+ if (cct->_conf->filestore_fadvise && i->second.first.nocache) {
+ int fa_r = posix_fadvise(**i->second.second, 0, 0, POSIX_FADV_DONTNEED);
+ ceph_assert(fa_r == 0);
+ }
+#endif
+
+ }
+ cur_ios = cur_size = 0;
+ logger->set(l_wbthrottle_ios_dirtied, 0);
+ logger->set(l_wbthrottle_bytes_dirtied, 0);
+ logger->set(l_wbthrottle_inodes_dirtied, 0);
+ pending_wbs.clear();
+ lru.clear();
+ rev_lru.clear();
+ cond.notify_all();
+}
+
+void WBThrottle::clear_object(const ghobject_t &hoid)
+{
+ std::unique_lock l{lock};
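+ // wait for any in-progress flush of this object in entry() to finish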
+ cond.wait(l, [hoid, this] { return clearing != hoid; });
+ ceph::unordered_map<ghobject_t, pair<PendingWB, FDRef> >::iterator i =
+ pending_wbs.find(hoid);
+ if (i == pending_wbs.end())
+ return;
+
+ cur_ios -= i->second.first.ios;
+ logger->dec(l_wbthrottle_ios_dirtied, i->second.first.ios);
+ cur_size -= i->second.first.size;
+ logger->dec(l_wbthrottle_bytes_dirtied, i->second.first.size);
+ logger->dec(l_wbthrottle_inodes_dirtied);
+
+ pending_wbs.erase(i);
+ remove_object(hoid);
+ cond.notify_all();
+}
+
+void WBThrottle::throttle()
+{
+ std::unique_lock l{lock};
+ cond.wait(l, [this] { return stopping || !need_flush(); });
+}
diff --git a/src/os/filestore/WBThrottle.h b/src/os/filestore/WBThrottle.h
new file mode 100644
index 000000000..1af8ba5d5
--- /dev/null
+++ b/src/os/filestore/WBThrottle.h
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef WBTHROTTLE_H
+#define WBTHROTTLE_H
+
+#include "include/unordered_map.h"
+#include <boost/tuple/tuple.hpp>
+#include "common/Formatter.h"
+#include "common/hobject.h"
+#include "include/interval_set.h"
+#include "include/common_fwd.h"
+#include "FDCache.h"
+#include "common/Thread.h"
+#include "common/ceph_context.h"
+
+enum {
+ l_wbthrottle_first = 999090,
+ l_wbthrottle_bytes_dirtied,
+ l_wbthrottle_bytes_wb,
+ l_wbthrottle_ios_dirtied,
+ l_wbthrottle_ios_wb,
+ l_wbthrottle_inodes_dirtied,
+ l_wbthrottle_inodes_wb,
+ l_wbthrottle_last
+};
+
+/**
+ * WBThrottle
+ *
+ * Tracks, throttles, and flushes outstanding IO
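+ *
+ * Typical use, as a sketch (error handling omitted; 'fd', 'oid', 'offset' and
+ * 'len' are hypothetical placeholders):
+ * @code
+ *   WBThrottle wbt(cct);
+ *   wbt.set_fs(WBThrottle::XFS);
+ *   wbt.start();
+ *   wbt.queue_wb(fd, oid, offset, len, false); // after a buffered write
+ *   wbt.throttle();                            // block while over the hard limit
+ *   // ...
+ *   wbt.stop();
+ * @endcode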
+ */
+class WBThrottle : Thread, public md_config_obs_t {
+ ghobject_t clearing;
+ /* *_limits.first is the start_flusher limit and
+ * *_limits.second is the hard limit
+ */
+
+ /// Limits on unflushed bytes
+ std::pair<uint64_t, uint64_t> size_limits;
+
+ /// Limits on unflushed ios
+ std::pair<uint64_t, uint64_t> io_limits;
+
+ /// Limits on unflushed objects
+ std::pair<uint64_t, uint64_t> fd_limits;
+
+ uint64_t cur_ios; /// Currently unflushed IOs
+ uint64_t cur_size; /// Currently unflushed bytes
+
+ /**
+ * PendingWB tracks the ios pending on an object.
+ */
+ class PendingWB {
+ public:
+ bool nocache;
+ uint64_t size;
+ uint64_t ios;
+ PendingWB() : nocache(true), size(0), ios(0) {}
+ void add(bool _nocache, uint64_t _size, uint64_t _ios) {
+ if (!_nocache)
+ nocache = false; // only nocache if all writes are nocache
+ size += _size;
+ ios += _ios;
+ }
+ };
+
+ CephContext *cct;
+ PerfCounters *logger;
+ bool stopping;
+ ceph::mutex lock = ceph::make_mutex("WBThrottle::lock");
+ ceph::condition_variable cond;
+
+
+ /**
+ * Flush objects in lru order
+ */
+ std::list<ghobject_t> lru;
+ ceph::unordered_map<ghobject_t, std::list<ghobject_t>::iterator> rev_lru;
+ void remove_object(const ghobject_t &oid) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph::unordered_map<ghobject_t, std::list<ghobject_t>::iterator>::iterator iter =
+ rev_lru.find(oid);
+ if (iter == rev_lru.end())
+ return;
+
+ lru.erase(iter->second);
+ rev_lru.erase(iter);
+ }
+ ghobject_t pop_object() {
+ ceph_assert(!lru.empty());
+ ghobject_t oid(lru.front());
+ lru.pop_front();
+ rev_lru.erase(oid);
+ return oid;
+ }
+ void insert_object(const ghobject_t &oid) {
+ ceph_assert(rev_lru.find(oid) == rev_lru.end());
+ lru.push_back(oid);
+ rev_lru.insert(make_pair(oid, --lru.end()));
+ }
+
+ ceph::unordered_map<ghobject_t, std::pair<PendingWB, FDRef> > pending_wbs;
+
+ /// get next flush to perform
+ bool get_next_should_flush(
+ std::unique_lock<ceph::mutex>& locker,
+ boost::tuple<ghobject_t, FDRef, PendingWB> *next ///< [out] next to flush
+ ); ///< @return false if we are shutting down
+public:
+ enum FS {
+ BTRFS,
+ XFS
+ };
+
+private:
+ FS fs;
+
+ void set_from_conf();
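+ /// true once any soft (start_flusher) limit is hit: time to start flushing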
+ bool beyond_limit() const {
+ if (cur_ios < io_limits.first &&
+ pending_wbs.size() < fd_limits.first &&
+ cur_size < size_limits.first)
+ return false;
+ else
+ return true;
+ }
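+ /// true once any hard limit is hit: throttle() blocks until we drop below it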
+ bool need_flush() const {
+ if (cur_ios < io_limits.second &&
+ pending_wbs.size() < fd_limits.second &&
+ cur_size < size_limits.second)
+ return false;
+ else
+ return true;
+ }
+
+public:
+ explicit WBThrottle(CephContext *cct);
+ ~WBThrottle() override;
+
+ void start();
+ void stop();
+ /// Set fs as XFS or BTRFS
+ void set_fs(FS new_fs) {
+ std::lock_guard l{lock};
+ fs = new_fs;
+ set_from_conf();
+ }
+
+ /// Queue wb on oid, fd taking throttle (does not block)
+ void queue_wb(
+ FDRef fd, ///< [in] FDRef to oid
+ const ghobject_t &oid, ///< [in] object
+ uint64_t offset, ///< [in] offset written
+ uint64_t len, ///< [in] length written
+ bool nocache ///< [in] try to clear out of cache after write
+ );
+
+ /// Clear all wb (probably due to sync)
+ void clear();
+
+ /// Clear object
+ void clear_object(const ghobject_t &oid);
+
+ /// Block until there is throttle available
+ void throttle();
+
+ /// md_config_obs_t
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) override;
+
+ /// Thread
+ void *entry() override;
+};
+
+#endif
diff --git a/src/os/filestore/XfsFileStoreBackend.cc b/src/os/filestore/XfsFileStoreBackend.cc
new file mode 100644
index 000000000..1081d146a
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XfsFileStoreBackend.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/utsname.h>
+
+#include <xfs/xfs.h>
+
+#include "common/errno.h"
+#include "common/linux_version.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "xfsfilestorebackend(" << get_basedir_path() << ") "
+
+XfsFileStoreBackend::XfsFileStoreBackend(FileStore *fs):
+ GenericFileStoreBackend(fs), m_has_extsize(false) { }
+
+/*
+ * Set the extsize attr on a file to val. This would be a free-standing
+ * function, but dout_prefix expands to a call to the protected member
+ * function get_basedir_path(), which prevents that.
+ */
+int XfsFileStoreBackend::set_extsize(int fd, unsigned int val)
+{
+ struct fsxattr fsx;
+ struct stat sb;
+ int ret;
+
+ if (fstat(fd, &sb) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: fstat: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ dout(0) << "set_extsize: invalid target file type" << dendl;
+ return -EINVAL;
+ }
+
+ if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSGETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ // already set?
+ if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+ return 0;
+
+ // xfs won't change extent size if any extents are allocated
+ if (fsx.fsx_nextents != 0)
+ return 0;
+
+ fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+ fsx.fsx_extsize = val;
+
+ if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+ ret = -errno;
+ dout(0) << "set_extsize: FSSETXATTR: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int XfsFileStoreBackend::detect_features()
+{
+ int ret;
+
+ ret = GenericFileStoreBackend::detect_features();
+ if (ret < 0)
+ return ret;
+
+ // extsize?
+ int fd = ::openat(get_basedir_fd(), "extsize_test", O_CREAT|O_WRONLY, 0600);
+ if (fd < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to create test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out;
+ }
+ if (::unlinkat(get_basedir_fd(), "extsize_test", 0) < 0) {
+ ret = -errno;
+ dout(0) << "detect_feature: failed to unlink test file for extsize attr: "
+ << cpp_strerror(ret) << dendl;
+ goto out_close;
+ }
+
+ if (cct()->_conf->filestore_xfs_extsize) {
+ ret = set_extsize(fd, 1U << 15); // a few pages
+ if (ret) {
+ ret = 0;
+ dout(0) << "detect_feature: failed to set test file extsize, assuming extsize is NOT supported" << dendl;
+ goto out_close;
+ }
+
+ // make sure we have 3.5 or newer, which includes this fix
+ // aff3a9edb7080f69f07fe76a8bd089b3dfa4cb5d
+ // for this set_extsize bug
+ // http://oss.sgi.com/bugzilla/show_bug.cgi?id=874
+ int ver = get_linux_version();
+ if (ver == 0) {
+ dout(0) << __func__ << ": couldn't verify extsize not buggy, disabling extsize" << dendl;
+ m_has_extsize = false;
+ } else if (ver < KERNEL_VERSION(3, 5, 0)) {
+ dout(0) << __func__ << ": disabling extsize, your kernel < 3.5 and has buggy extsize ioctl" << dendl;
+ m_has_extsize = false;
+ } else {
+ dout(0) << __func__ << ": extsize is supported and your kernel >= 3.5" << dendl;
+ m_has_extsize = true;
+ }
+ } else {
+ dout(0) << "detect_feature: extsize is disabled by conf" << dendl;
+ }
+
+out_close:
+ TEMP_FAILURE_RETRY(::close(fd));
+out:
+ return ret;
+}
+
+int XfsFileStoreBackend::set_alloc_hint(int fd, uint64_t hint)
+{
+ if (!m_has_extsize)
+ return -EOPNOTSUPP;
+
+ ceph_assert(hint < UINT_MAX);
+ return set_extsize(fd, hint);
+}
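
`detect_features()` above uses a create-then-unlink probe: the scratch file is removed immediately, but the still-open fd remains usable for the feature test, so nothing is left behind even if the process dies mid-probe. A standalone sketch of that pattern (the file name and the omitted feature test are placeholders):

```cpp
#include <fcntl.h>
#include <unistd.h>
#include <cerrno>

// Probe a filesystem feature without leaving files behind: create a scratch
// file relative to the store's base directory, unlink it right away, and run
// the feature-specific test on the still-open descriptor.
int probe_feature(int basedir_fd) {
  int fd = ::openat(basedir_fd, "feature_probe", O_CREAT | O_WRONLY, 0600);
  if (fd < 0)
    return -errno;
  ::unlinkat(basedir_fd, "feature_probe", 0);   // fd stays valid until close
  int r = 0;
  // ... issue the feature-specific ioctl / fallocate / xattr call on fd ...
  ::close(fd);
  return r;
}
```
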
diff --git a/src/os/filestore/XfsFileStoreBackend.h b/src/os/filestore/XfsFileStoreBackend.h
new file mode 100644
index 000000000..e8b81f9a1
--- /dev/null
+++ b/src/os/filestore/XfsFileStoreBackend.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_XFSFILESTOREBACKEND_H
+#define CEPH_XFSFILESTOREBACKEND_H
+
+#include "GenericFileStoreBackend.h"
+
+#include "include/int_types.h"
+
+class XfsFileStoreBackend : public GenericFileStoreBackend {
+private:
+ bool m_has_extsize;
+ int set_extsize(int fd, unsigned int val);
+public:
+ explicit XfsFileStoreBackend(FileStore *fs);
+ ~XfsFileStoreBackend() override {}
+ const char *get_name() override {
+ return "xfs";
+ }
+ int detect_features() override;
+ int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif /* CEPH_XFSFILESTOREBACKEND_H */
diff --git a/src/os/filestore/ZFSFileStoreBackend.cc b/src/os/filestore/ZFSFileStoreBackend.cc
new file mode 100644
index 000000000..e85dbd526
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/types.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+
+#include "include/compat.h"
+#include "include/linux_fiemap.h"
+#include "include/color.h"
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/config.h"
+#include "common/sync_filesystem.h"
+
+#include "ZFSFileStoreBackend.h"
+
+#define dout_context cct()
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zfsfilestorebackend(" << get_basedir_path() << ") "
+
+ZFSFileStoreBackend::ZFSFileStoreBackend(FileStore *fs) :
+ GenericFileStoreBackend(fs), base_zh(NULL), current_zh(NULL),
+ m_filestore_zfs_snap(cct()->_conf->filestore_zfs_snap)
+{
+ int ret = zfs.init();
+ if (ret < 0) {
+ dout(0) << "ZFSFileStoreBackend: failed to init libzfs" << dendl;
+ return;
+ }
+
+ base_zh = zfs.path_to_zhandle(get_basedir_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (!base_zh) {
+ dout(0) << "ZFSFileStoreBackend: failed to get zfs handler for basedir" << dendl;
+ return;
+ }
+
+ update_current_zh();
+}
+
+ZFSFileStoreBackend::~ZFSFileStoreBackend()
+{
+ if (base_zh)
+ zfs.close(base_zh);
+ if (current_zh)
+ zfs.close(current_zh);
+}
+
+int ZFSFileStoreBackend::update_current_zh()
+{
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ZFS::Handle *zh = zfs.open(path, ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ char *mnt;
+ if (zfs.is_mounted(zh, &mnt)) {
+ int ret = get_current_path() == mnt;
+ free(mnt);
+ if (ret) {
+ current_zh = zh;
+ return 0;
+ }
+ } else {
+ int ret = zfs.mount(zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+ dout(0) << "update_current_zh: zfs_mount '" << zfs.get_name(zh)
+ << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+ zfs.close(zh);
+ } else {
+ dout(0) << "update_current_zh: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ zh = zfs.path_to_zhandle(get_current_path().c_str(), ZFS::TYPE_FILESYSTEM);
+ if (zh) {
+ if (strcmp(zfs.get_name(base_zh), zfs.get_name(zh))) {
+ current_zh = zh;
+ return 0;
+ }
+ zfs.close(zh);
+    dout(0) << "update_current_zh: basedir and current/ are on the same filesystem" << dendl;
+ } else {
+    dout(0) << "update_current_zh: current/ does not exist" << dendl;
+ }
+ return -ENOENT;
+}
+
+int ZFSFileStoreBackend::detect_features()
+{
+ if (!current_zh)
+ dout(0) << "detect_features: null zfs handle for current/" << dendl;
+ return 0;
+}
+
+bool ZFSFileStoreBackend::can_checkpoint()
+{
+ return m_filestore_zfs_snap && current_zh != NULL;
+}
+
+int ZFSFileStoreBackend::create_current()
+{
+ struct stat st;
+ int ret = ::stat(get_current_path().c_str(), &st);
+ if (ret == 0) {
+ // current/ exists
+ if (!S_ISDIR(st.st_mode)) {
+ dout(0) << "create_current: current/ exists but is not a directory" << dendl;
+ return -ENOTDIR;
+ }
+ return 0;
+ } else if (errno != ENOENT) {
+ ret = -errno;
+ dout(0) << "create_current: cannot stat current/ " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s/current", zfs.get_name(base_zh));
+ ret = zfs.create(path, ZFS::TYPE_FILESYSTEM);
+ if (ret < 0 && errno != EEXIST) {
+ ret = -errno;
+ dout(0) << "create_current: zfs_create '" << path << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ ret = update_current_zh();
+ return ret;
+}
+
+static int list_checkpoints_callback(ZFS::Handle *zh, void *data)
+{
+ list<string> *ls = static_cast<list<string> *>(data);
+ string str = ZFS::get_name(zh);
+ size_t pos = str.find('@');
+ ceph_assert(pos != string::npos && pos + 1 != str.length());
+ ls->push_back(str.substr(pos + 1));
+ return 0;
+}
+
+int ZFSFileStoreBackend::list_checkpoints(list<string>& ls)
+{
+ dout(10) << "list_checkpoints:" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ list<string> snaps;
+ int ret = zfs.iter_snapshots_sorted(current_zh, list_checkpoints_callback, &snaps);
+ if (ret < 0) {
+ ret = -errno;
+    dout(0) << "list_checkpoints: zfs_iter_snapshots_sorted got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ ls.swap(snaps);
+ return 0;
+}
+
+int ZFSFileStoreBackend::create_checkpoint(const string& name, uint64_t *cid)
+{
+ dout(10) << "create_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // looks like zfsonlinux doesn't flush dirty data when taking snapshot
+ int ret = sync_filesystem(get_current_fd());
+ if (ret < 0) {
+ ret = -errno;
+    dout(0) << "create_checkpoint: sync_filesystem got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+ ret = zfs.snapshot(path, false);
+ if (ret < 0) {
+ ret = -errno;
+    dout(0) << "create_checkpoint: zfs_snapshot '" << path << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (cid)
+ *cid = 0;
+ return 0;
+}
+
+int ZFSFileStoreBackend::rollback_to(const string& name)
+{
+ dout(10) << "rollback_to: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ // umount current to avoid triggering online rollback deadlock
+ int ret;
+ if (zfs.is_mounted(current_zh, NULL)) {
+ ret = zfs.umount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+      dout(0) << "rollback_to: zfs_umount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl;
+ }
+ }
+
+ char path[PATH_MAX];
+ snprintf(path, sizeof(path), "%s@%s", zfs.get_name(current_zh), name.c_str());
+
+ ZFS::Handle *snap_zh = zfs.open(path, ZFS::TYPE_SNAPSHOT);
+ if (!snap_zh) {
+ dout(0) << "rollback_to: zfs_open '" << path << "' got NULL" << dendl;
+ return -ENOENT;
+ }
+
+ ret = zfs.rollback(current_zh, snap_zh, false);
+ if (ret < 0) {
+ ret = -errno;
+    dout(0) << "rollback_to: zfs_rollback '" << zfs.get_name(snap_zh) << "' got " << cpp_strerror(ret) << dendl;
+ }
+
+ if (!zfs.is_mounted(current_zh, NULL)) {
+ int ret = zfs.mount(current_zh, NULL, 0);
+ if (ret < 0) {
+ ret = -errno;
+      dout(0) << "rollback_to: zfs_mount '" << zfs.get_name(current_zh) << "' got " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ zfs.close(snap_zh);
+ return ret;
+}
+
+int ZFSFileStoreBackend::destroy_checkpoint(const string& name)
+{
+ dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
+ if (!current_zh)
+ return -EINVAL;
+
+ int ret = zfs.destroy_snaps(current_zh, name.c_str(), true);
+ if (ret < 0) {
+ ret = -errno;
+    dout(0) << "destroy_checkpoint: zfs_destroy_snaps '" << name << "' got " << cpp_strerror(ret) << dendl;
+ }
+ return ret;
+}
diff --git a/src/os/filestore/ZFSFileStoreBackend.h b/src/os/filestore/ZFSFileStoreBackend.h
new file mode 100644
index 000000000..b1fa98874
--- /dev/null
+++ b/src/os/filestore/ZFSFileStoreBackend.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFSFILESTOREBACKEND_H
+#define CEPH_ZFSFILESTOREBACKEND_H
+
+#ifdef HAVE_LIBZFS
+#include "GenericFileStoreBackend.h"
+#include "os/fs/ZFS.h"
+
+class ZFSFileStoreBackend : public GenericFileStoreBackend {
+private:
+ ZFS zfs;
+ ZFS::Handle *base_zh;
+ ZFS::Handle *current_zh;
+ bool m_filestore_zfs_snap;
+ int update_current_zh();
+public:
+ explicit ZFSFileStoreBackend(FileStore *fs);
+ ~ZFSFileStoreBackend();
+ const char *get_name() override {
+ return "zfs";
+ }
+ int detect_features();
+ bool can_checkpoint();
+ int create_current();
+ int list_checkpoints(list<string>& ls);
+ int create_checkpoint(const string& name, uint64_t *cid);
+ int rollback_to(const string& name);
+ int destroy_checkpoint(const string& name);
+};
+#endif
+#endif
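
The backend above maps ObjectStore checkpoints onto ZFS snapshots of the `current` dataset: `create_checkpoint()` syncs and then snapshots, `rollback_to()` unmounts, rolls back and remounts, and `destroy_checkpoint()` deletes the snapshot. A hedged usage sketch of that lifecycle (the driver function and the snapshot name are illustrative; error handling is trimmed):

```cpp
#include <list>
#include <string>
// relies on ZFSFileStoreBackend.h above

void checkpoint_roundtrip(ZFSFileStoreBackend &backend) {
  if (!backend.can_checkpoint())
    return;                                  // filestore_zfs_snap off, or no zfs handle

  uint64_t cid = 0;
  backend.create_checkpoint("snap_example", &cid);   // sync, then current@snap_example

  std::list<std::string> snaps;
  backend.list_checkpoints(snaps);           // names after the '@', in snapshot order

  backend.rollback_to("snap_example");       // umount, zfs rollback, remount
  backend.destroy_checkpoint("snap_example");
}
```
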
diff --git a/src/os/filestore/chain_xattr.cc b/src/os/filestore/chain_xattr.cc
new file mode 100644
index 000000000..a7087e566
--- /dev/null
+++ b/src/os/filestore/chain_xattr.cc
@@ -0,0 +1,415 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "chain_xattr.h"
+#include <errno.h> // for ERANGE, ENODATA, ENOMEM
+#include <stdio.h> // for size_t, snprintf
+#include <stdlib.h> // for free, malloc
+#include <string.h> // for strcpy, strlen
+#include "include/ceph_assert.h" // for assert
+#include "include/buffer.h"
+
+#if defined(__linux__)
+#include <linux/fs.h>
+#endif
+
+using ceph::bufferptr;
+
+/*
+ * chaining xattrs
+ *
+ * In order to support xattrs that are larger than the xattr size limit that some file systems
+ * impose, we use multiple xattrs to store the value of a single xattr. The xattr keys
+ * are set as follows:
+ * The first xattr in the chain has a key that holds the original xattr name, with any '@' char
+ * escaped ("@@").
+ * The chained keys have the first xattr's key (with the escaping) plus a suffix "@<id>",
+ * where <id> is the index of the xattr in the chain.
+ */
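
For illustration, the raw keys the scheme above produces for a name containing '@' look like this (a throwaway snippet, not part of the patch; it relies on get_raw_xattr_name(), defined just below):

```cpp
#include <cstdio>

void show_chain_names() {
  char raw[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
  for (int i = 0; i < 3; i++) {
    get_raw_xattr_name("odd@name", i, raw, sizeof(raw));
    printf("chunk %d -> %s\n", i, raw);
    // chunk 0 -> odd@@name
    // chunk 1 -> odd@@name@1
    // chunk 2 -> odd@@name@2
  }
}
```
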
+
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len)
+{
+ int pos = 0;
+
+ while (*name) {
+ switch (*name) {
+ case '@': /* escape it */
+ pos += 2;
+      ceph_assert(pos < raw_len - 1);
+ *raw_name = '@';
+ raw_name++;
+ *raw_name = '@';
+ break;
+ default:
+ pos++;
+ ceph_assert(pos < raw_len - 1);
+ *raw_name = *name;
+ break;
+ }
+ name++;
+ raw_name++;
+ }
+
+ if (!i) {
+ *raw_name = '\0';
+ } else {
+ int r = snprintf(raw_name, raw_len - pos, "@%d", i);
+ ceph_assert(r < raw_len - pos);
+ }
+}
+
+static int translate_raw_name(const char *raw_name, char *name, int name_len, bool *is_first)
+{
+ int pos = 0;
+
+ *is_first = true;
+ while (*raw_name) {
+ switch (*raw_name) {
+ case '@': /* escape it */
+ raw_name++;
+ if (!*raw_name)
+ break;
+ if (*raw_name != '@') {
+ *is_first = false;
+ goto done;
+ }
+
+ /* fall through */
+ default:
+ *name = *raw_name;
+ break;
+ }
+ pos++;
+ ceph_assert(pos < name_len);
+ name++;
+ raw_name++;
+ }
+done:
+ *name = '\0';
+ return pos;
+}
+
+
+// getxattr
+
+static int getxattr_len(const char *fn, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return getxattr_len(fn, name);
+
+ do {
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_getxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+    /* is there another chunk? that can happen if the last read spanned
+       exactly one full block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_getxattr(fn, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+int chain_getxattr_buf(const char *fn, const char *name, bufferptr *bp)
+{
+ size_t size = 1024; // Initial
+ while (1) {
+ bufferptr buf(size);
+ int r = chain_getxattr(
+ fn,
+ name,
+ buf.c_str(),
+ size);
+ if (r > 0) {
+ buf.set_length(r);
+ if (bp)
+ bp->swap(buf);
+ return r;
+ } else if (r == 0) {
+ return 0;
+ } else {
+ if (r == -ERANGE) {
+ size *= 2;
+ } else {
+ return r;
+ }
+ }
+ }
+ ceph_abort_msg("unreachable");
+ return 0;
+}
+
+static int chain_fgetxattr_len(int fd, const char *name)
+{
+ int i = 0, total = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (!i && r < 0)
+ return r;
+ if (r < 0)
+ break;
+ total += r;
+ i++;
+ } while (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN);
+
+ return total;
+}
+
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ int r;
+ size_t chunk_size;
+
+ if (!size)
+ return chain_fgetxattr_len(fd, name);
+
+ do {
+ chunk_size = size;
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+
+ r = sys_fgetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (i && r == -ENODATA) {
+ ret = pos;
+ break;
+ }
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+
+ if (r > 0) {
+ pos += r;
+ size -= r;
+ }
+
+ i++;
+ } while (size && (r == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ r == CHAIN_XATTR_SHORT_BLOCK_LEN));
+
+ if (r >= 0) {
+ ret = pos;
+    /* is there another chunk? that can happen if the last read spanned
+       exactly one full block */
+ if (chunk_size == CHAIN_XATTR_MAX_BLOCK_LEN ||
+ chunk_size == CHAIN_XATTR_SHORT_BLOCK_LEN) {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fgetxattr(fd, raw_name, 0, 0);
+ if (r > 0) { // there's another chunk.. the original buffer was too small
+ ret = -ERANGE;
+ }
+ }
+ }
+ return ret;
+}
+
+
+// setxattr
+
+int get_xattr_block_size(size_t size)
+{
+ if (size <= CHAIN_XATTR_SHORT_LEN_THRESHOLD)
+ // this may fit in the inode; stripe over short attrs so that XFS
+ // won't kick it out.
+ return CHAIN_XATTR_SHORT_BLOCK_LEN;
+ return CHAIN_XATTR_MAX_BLOCK_LEN;
+}
+
+// removexattr
+
+int chain_removexattr(const char *fn, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+int chain_fremovexattr(int fd, const char *name)
+{
+ int i = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int r;
+
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (!i && r < 0) {
+ return r;
+ }
+ i++;
+ } while (r >= 0);
+ return 0;
+}
+
+
+// listxattr
+
+int chain_listxattr(const char *fn, char *names, size_t len) {
+ int r;
+
+ if (!len)
+ return sys_listxattr(fn, names, len) * 2;
+
+ r = sys_listxattr(fn, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_listxattr(fn, full_buf, total_len);
+ if (r < 0) {
+ free(full_buf);
+ return r;
+ }
+
+ char *p = full_buf;
+ const char *end = full_buf + r;
+ char *dest = names;
+ char *dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
+
+int chain_flistxattr(int fd, char *names, size_t len) {
+ int r;
+ char *p;
+ const char * end;
+ char *dest;
+ char *dest_end;
+
+ if (!len)
+ return sys_flistxattr(fd, names, len) * 2;
+
+ r = sys_flistxattr(fd, 0, 0);
+ if (r < 0)
+ return r;
+
+ size_t total_len = r * 2; // should be enough
+ char *full_buf = (char *)malloc(total_len);
+ if (!full_buf)
+ return -ENOMEM;
+
+ r = sys_flistxattr(fd, full_buf, total_len);
+ if (r < 0)
+ goto done;
+
+ p = full_buf;
+ end = full_buf + r;
+ dest = names;
+ dest_end = names + len;
+
+ while (p < end) {
+ char name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int attr_len = strlen(p);
+ bool is_first;
+ int name_len = translate_raw_name(p, name, sizeof(name), &is_first);
+ if (is_first) {
+ if (dest + name_len > dest_end) {
+ r = -ERANGE;
+ goto done;
+ }
+ strcpy(dest, name);
+ dest += name_len + 1;
+ }
+ p += attr_len + 1;
+ }
+ r = dest - names;
+
+done:
+ free(full_buf);
+ return r;
+}
diff --git a/src/os/filestore/chain_xattr.h b/src/os/filestore/chain_xattr.h
new file mode 100644
index 000000000..25f75cfaa
--- /dev/null
+++ b/src/os/filestore/chain_xattr.h
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OSD_CHAIN_XATTR_H
+#define __CEPH_OSD_CHAIN_XATTR_H
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
+#include "os_xattr.h"
+#include "include/ceph_assert.h"
+#include "include/buffer_fwd.h"
+
+#if defined(__linux__)
+#include <linux/limits.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
+#elif defined(__APPLE__)
+#include <sys/xattr.h>
+#define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_MAXNAMELEN + 1) / 2)
+#else
+#define CHAIN_XATTR_MAX_NAME_LEN 128
+#endif
+
+#define CHAIN_XATTR_MAX_BLOCK_LEN 2048
+
+/*
+ * XFS will only inline xattrs < 255 bytes, so for xattrs that are
+ * likely to fit in the inode, stripe over short xattrs.
+ */
+#define CHAIN_XATTR_SHORT_BLOCK_LEN 250
+#define CHAIN_XATTR_SHORT_LEN_THRESHOLD 1000
+
+// wrappers to hide annoying errno handling.
+
+inline int sys_fgetxattr(int fd, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_fgetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+inline int sys_getxattr(const char *fn, const char *name, void *val, size_t size)
+{
+ int r = ::ceph_os_getxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+inline int sys_setxattr(const char *fn, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_setxattr(fn, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+inline int sys_fsetxattr(int fd, const char *name, const void *val, size_t size)
+{
+ int r = ::ceph_os_fsetxattr(fd, name, val, size);
+ return (r < 0 ? -errno : r);
+}
+
+inline int sys_listxattr(const char *fn, char *names, size_t len)
+{
+ int r = ::ceph_os_listxattr(fn, names, len);
+ return (r < 0 ? -errno : r);
+}
+inline int sys_flistxattr(int fd, char *names, size_t len)
+{
+ int r = ::ceph_os_flistxattr(fd, names, len);
+ return (r < 0 ? -errno : r);
+}
+
+inline int sys_removexattr(const char *fn, const char *name)
+{
+ int r = ::ceph_os_removexattr(fn, name);
+ return (r < 0 ? -errno : r);
+}
+inline int sys_fremovexattr(int fd, const char *name)
+{
+ int r = ::ceph_os_fremovexattr(fd, name);
+ return (r < 0 ? -errno : r);
+}
+
+
+// wrappers to chain large values across multiple xattrs
+
+int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
+int chain_getxattr_buf(const char *fn, const char *name, ceph::buffer::ptr *bp);
+int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
+
+int get_xattr_block_size(size_t size);
+void get_raw_xattr_name(const char *name, int i, char *raw_name, int raw_len);
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_setxattr(
+ const char *fn, const char *name, const void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size =
+ ensure_single_attr ? size : get_xattr_block_size(size);
+
+ static_assert(
+ !skip_chain_cleanup || ensure_single_attr,
+ "skip_chain_cleanup must imply ensure_single_attr");
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_setxattr(fn, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ ceph_assert(size == 0 || !ensure_single_attr);
+ } while (size);
+
+ if (ret >= 0 && !skip_chain_cleanup) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_removexattr(fn, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+template <bool skip_chain_cleanup=false, bool ensure_single_attr=false>
+int chain_fsetxattr(
+ int fd, const char *name, const void *val, size_t size)
+{
+ int i = 0, pos = 0;
+ char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
+ int ret = 0;
+ size_t max_chunk_size =
+ ensure_single_attr ? size : get_xattr_block_size(size);
+
+ static_assert(
+ !skip_chain_cleanup || ensure_single_attr,
+ "skip_chain_cleanup must imply ensure_single_attr");
+
+ do {
+ size_t chunk_size = (size < max_chunk_size ? size : max_chunk_size);
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ size -= chunk_size;
+
+ int r = sys_fsetxattr(fd, raw_name, (char *)val + pos, chunk_size);
+ if (r < 0) {
+ ret = r;
+ break;
+ }
+ pos += chunk_size;
+ ret = pos;
+ i++;
+ ceph_assert(size == 0 || !ensure_single_attr);
+ } while (size);
+
+ if (ret >= 0 && !skip_chain_cleanup) {
+ int r;
+ do {
+ get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
+ r = sys_fremovexattr(fd, raw_name);
+ if (r < 0 && r != -ENODATA)
+ ret = r;
+ i++;
+ } while (r != -ENODATA);
+ }
+
+ return ret;
+}
+
+int chain_listxattr(const char *fn, char *names, size_t len);
+int chain_flistxattr(int fd, char *names, size_t len);
+int chain_removexattr(const char *fn, const char *name);
+int chain_fremovexattr(int fd, const char *name);
+
+#endif
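
A hedged usage sketch of the chaining wrappers declared above: a 6000-byte value exceeds CHAIN_XATTR_SHORT_LEN_THRESHOLD, so `get_xattr_block_size()` picks 2048-byte chunks and the value is stored under `"user.ceph.test"`, `"user.ceph.test@1"`, `"user.ceph.test@2"`. The path and attribute name are placeholders:

```cpp
#include <string>
#include <vector>
// relies on chain_xattr.h above

int chain_roundtrip(const char *path) {
  std::string big(6000, 'x');
  int r = chain_setxattr(path, "user.ceph.test", big.data(), big.size());
  if (r < 0)
    return r;                                 // e.g. -EOPNOTSUPP, -ENOSPC

  std::vector<char> buf(big.size());
  r = chain_getxattr(path, "user.ceph.test", buf.data(), buf.size());
  // on success r is the reassembled length; -ERANGE means buf was too small
  return r;
}
```
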
diff --git a/src/os/filestore/os_xattr.c b/src/os/filestore/os_xattr.c
new file mode 100644
index 000000000..8de0e1d92
--- /dev/null
+++ b/src/os/filestore/os_xattr.c
@@ -0,0 +1,278 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#if defined(__FreeBSD__)
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <sys/extattr.h>
+#elif defined(__linux__)
+#include <sys/types.h>
+#include <sys/xattr.h>
+#elif defined(__APPLE__)
+#include <errno.h>
+#include <sys/xattr.h>
+#else
+#error "Your system is not supported!"
+#endif
+
+#include "os_xattr.h"
+
+/*
+ * Sets extended attribute on a file.
+ * Returns 0 on success, -1 on failure.
+ */
+int
+ceph_os_setxattr(const char *path, const char *name,
+ const void *value, size_t size)
+{
+ int error = -1;
+
+#if defined(__FreeBSD__)
+ error = extattr_set_file(path, EXTATTR_NAMESPACE_USER, name, value,
+ size);
+ if (error > 0)
+ error = 0;
+#elif defined(__linux__)
+ error = setxattr(path, name, value, size, 0);
+#elif defined(__APPLE__)
+ error = setxattr(path, name, value, size, 0 /* position */, 0);
+#endif
+
+ return (error);
+}
+
+int
+ceph_os_fsetxattr(int fd, const char *name, const void *value,
+ size_t size)
+{
+ int error = -1;
+
+#if defined(__FreeBSD__)
+ error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size);
+ if (error > 0)
+ error = 0;
+#elif defined(__linux__)
+ error = fsetxattr(fd, name, value, size, 0);
+#elif defined(__APPLE__)
+ error = fsetxattr(fd, name, value, size, 0, 0 /* no options should be identical to Linux */ );
+#endif
+
+ return (error);
+}
+
+ssize_t
+ceph_os_getxattr(const char *path, const char *name,
+void *value, size_t size)
+{
+ ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+ if (value == NULL || size == 0) {
+ error = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, value,
+ size);
+ } else {
+ error = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, NULL,
+ 0);
+ if (error > 0) {
+ if (error > size) {
+ errno = ERANGE;
+ error = -1;
+ } else {
+ error = extattr_get_file(path, EXTATTR_NAMESPACE_USER,
+ name, value, size);
+ }
+ }
+ }
+#elif defined(__linux__)
+ error = getxattr(path, name, value, size);
+#elif defined(__APPLE__)
+ error = getxattr(path, name, value, size, 0 /* position */, 0);
+ /* ENOATTR and ENODATA have different values */
+ if (error < 0 && errno == ENOATTR)
+ errno = ENODATA;
+#endif
+
+ return (error);
+}
+
+ssize_t
+ceph_os_fgetxattr(int fd, const char *name, void *value,
+ size_t size)
+{
+ ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+ if (value == NULL || size == 0) {
+ error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, name, value,
+ size);
+ } else {
+ error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, name, NULL,
+ 0);
+ if (error > 0) {
+ if (error > size) {
+ errno = ERANGE;
+ error = -1;
+ } else {
+ error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER,
+ name, value, size);
+ }
+ }
+ }
+#elif defined(__linux__)
+ error = fgetxattr(fd, name, value, size);
+#elif defined(__APPLE__)
+ error = fgetxattr(fd, name, value, size, 0, 0 /* no options */);
+ /* ENOATTR and ENODATA have different values */
+ if (error < 0 && errno == ENOATTR)
+ errno = ENODATA;
+#endif
+
+ return (error);
+}
+
+ssize_t
+ceph_os_listxattr(const char *path, char *list, size_t size)
+{
+ ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+ /*
+ * XXX. The format of the list FreeBSD returns differs
+ * from the Linux one. We have to perform the conversion. :-(
+ */
+ char *newlist, *p, *p1;
+
+ if (size != 0) {
+ newlist = malloc(size);
+ if (newlist != NULL) {
+ error = extattr_list_file(path, EXTATTR_NAMESPACE_USER,
+ newlist, size);
+ if (error > 0) {
+ p = newlist;
+ p1 = list;
+ while ((p - newlist) < error) {
+ uint8_t len = *(uint8_t *)p;
+ p++;
+ if ((p + len - newlist) > error)
+ break;
+ if (len > 0) {
+ bcopy(p, p1, len);
+ p += len;
+ p1 += len;
+ *p1++ = '\0';
+ }
+ }
+ error = p1 - list;
+ }
+ free(newlist);
+ }
+ } else {
+ error = extattr_list_file(path, EXTATTR_NAMESPACE_USER,
+ list, size);
+ }
+#elif defined(__linux__)
+ error = listxattr(path, list, size);
+#elif defined(__APPLE__)
+ error = listxattr(path, list, size, 0);
+#endif
+
+ return (error);
+}
+
+ssize_t
+ceph_os_flistxattr(int fd, char *list, size_t size)
+{
+ ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+ /*
+ * XXX. The format of the list FreeBSD returns differs
+ * from the Linux one. We have to perform the conversion. :-(
+ */
+ char *newlist, *p, *p1;
+
+ if (size != 0) {
+ newlist = malloc(size);
+ if (newlist != NULL) {
+ error = extattr_list_fd(fd, EXTATTR_NAMESPACE_USER,
+ newlist, size);
+ if (error > 0) {
+ p = newlist;
+ p1 = list;
+ while ((p - newlist) < error) {
+ uint8_t len = *(uint8_t *)p;
+ p++;
+ if ((p + len - newlist) > error)
+ break;
+ if (len > 0) {
+ bcopy(p, p1, len);
+ p += len;
+ p1 += len;
+ *p1++ = '\0';
+ }
+ }
+ error = p1 - list;
+ }
+ free(newlist);
+ }
+ } else {
+ error = extattr_list_fd(fd, EXTATTR_NAMESPACE_USER,
+ list, size);
+ }
+#elif defined(__linux__)
+ error = flistxattr(fd, list, size);
+#elif defined(__APPLE__)
+ error = flistxattr(fd, list, size, 0);
+#endif
+
+ return (error);
+}
+
+int
+ceph_os_removexattr(const char *path, const char *name)
+{
+ int error = -1;
+
+#if defined(__FreeBSD__)
+ error = extattr_delete_file(path, EXTATTR_NAMESPACE_USER, name);
+#elif defined(__linux__)
+ error = removexattr(path, name);
+#elif defined(__APPLE__)
+ error = removexattr(path, name, 0);
+ /* ENOATTR and ENODATA have different values */
+ if (error < 0 && errno == ENOATTR)
+ errno = ENODATA;
+#endif
+
+ return (error);
+}
+
+int
+ceph_os_fremovexattr(int fd, const char *name)
+{
+ int error = -1;
+
+#if defined(__FreeBSD__)
+ error = extattr_delete_fd(fd, EXTATTR_NAMESPACE_USER, name);
+#elif defined(__linux__)
+ error = fremovexattr(fd, name);
+#elif defined(__APPLE__)
+ error = fremovexattr(fd, name, 0);
+ /* ENOATTR and ENODATA have different values */
+ if (error < 0 && errno == ENOATTR)
+ errno = ENODATA;
+#endif
+
+ return (error);
+}
diff --git a/src/os/filestore/os_xattr.h b/src/os/filestore/os_xattr.h
new file mode 100644
index 000000000..cf98394b2
--- /dev/null
+++ b/src/os/filestore/os_xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_EXTATTR_H
+#define CEPH_EXTATTR_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Almost everyone defines ENOATTR, except for Linux,
+// which does #define ENOATTR ENODATA. It seems that occasionally that
+// isn't defined, though, so let's make sure.
+#ifndef ENOATTR
+# define ENOATTR ENODATA
+#endif
+
+#include <stddef.h>
+
+int ceph_os_setxattr(const char *path, const char *name,
+ const void *value, size_t size);
+int ceph_os_fsetxattr(int fd, const char *name, const void *value,
+ size_t size);
+ssize_t ceph_os_getxattr(const char *path, const char *name,
+ void *value, size_t size);
+ssize_t ceph_os_fgetxattr(int fd, const char *name, void *value,
+ size_t size);
+ssize_t ceph_os_listxattr(const char *path, char *list, size_t size);
+ssize_t ceph_os_flistxattr(int fd, char *list, size_t size);
+int ceph_os_removexattr(const char *path, const char *name);
+int ceph_os_fremovexattr(int fd, const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !CEPH_EXTATTR_H */
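
These wrappers keep the native C convention (return -1 and set errno), with the Apple paths above remapping ENOATTR so callers can test for ENODATA on a missing attribute. A small hedged example (path and attribute name are placeholders):

```cpp
#include <cerrno>
#include <cstdio>
// relies on os_xattr.h above

void read_attr_example(const char *path) {
  char buf[256];
  ssize_t n = ceph_os_getxattr(path, "user.example", buf, sizeof(buf));
  if (n < 0) {
    if (errno == ENODATA)
      printf("attribute not present\n");
    else
      perror("ceph_os_getxattr");
    return;
  }
  printf("got %zd bytes\n", n);
}
```
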
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
new file mode 100644
index 000000000..a7d085402
--- /dev/null
+++ b/src/os/fs/FS.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef __linux__
+#include <linux/falloc.h>
+#endif
+
+#include "FS.h"
+
+#include "acconfig.h"
+
+#ifdef HAVE_LIBXFS
+#include "XFS.h"
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/mount.h>
+#else
+#include <sys/vfs.h>
+#endif
+#include "include/compat.h"
+
+// ---------------
+
+FS *FS::create(uint64_t f_type)
+{
+ switch (f_type) {
+#ifdef HAVE_LIBXFS
+ case XFS_SUPER_MAGIC:
+ return new XFS;
+#endif
+ default:
+ return new FS;
+ }
+}
+
+FS *FS::create_by_fd(int fd)
+{
+ struct statfs st;
+ ::fstatfs(fd, &st);
+ return create(st.f_type);
+}
+
+// ---------------
+
+int FS::set_alloc_hint(int fd, uint64_t hint)
+{
+ return 0; // no-op
+}
+
+#ifdef HAVE_NAME_TO_HANDLE_AT
+int FS::get_handle(int fd, std::string *h)
+{
+ char buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+ struct file_handle *fh = (struct file_handle *)buf;
+ int mount_id;
+
+ fh->handle_bytes = MAX_HANDLE_SZ;
+ int r = name_to_handle_at(fd, "", fh, &mount_id, AT_EMPTY_PATH);
+ if (r < 0) {
+ return -errno;
+ }
+ *h = std::string(buf, fh->handle_bytes + sizeof(struct file_handle));
+ return 0;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+ if (h.length() < sizeof(struct file_handle)) {
+ return -EINVAL;
+ }
+ struct file_handle *fh = (struct file_handle *)h.data();
+ if (fh->handle_bytes > h.length()) {
+ return -ERANGE;
+ }
+ int fd = open_by_handle_at(mount_fd, fh, flags);
+ if (fd < 0)
+ return -errno;
+ return fd;
+}
+
+#else // HAVE_NAME_TO_HANDLE_AT
+
+int FS::get_handle(int fd, std::string *h)
+{
+ return -EOPNOTSUPP;
+}
+
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif // HAVE_NAME_TO_HANDLE_AT
+
+int FS::copy_file_range(int to_fd, uint64_t to_offset,
+ int from_fd,
+ uint64_t from_offset, uint64_t from_len)
+{
+ ceph_abort_msg("write me");
+}
+
+int FS::zero(int fd, uint64_t offset, uint64_t length)
+{
+ int r;
+
+ /*
+
+ From the fallocate(2) man page:
+
+ Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
+ in mode deallocates space (i.e., creates a hole) in the byte range
+ starting at offset and continuing for len bytes. Within the specified
+ range, partial filesystem blocks are zeroed, and whole filesystem
+ blocks are removed from the file. After a successful call, subsequent
+ reads from this range will return zeroes.
+
+ The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE in
+ mode; in other words, even when punching off the end of the file, the
+ file size (as reported by stat(2)) does not change.
+
+ Not all filesystems support FALLOC_FL_PUNCH_HOLE; if a filesystem
+ doesn't support the operation, an error is returned. The operation is
+ supported on at least the following filesystems:
+
+ * XFS (since Linux 2.6.38)
+
+ * ext4 (since Linux 3.0)
+
+ * Btrfs (since Linux 3.7)
+
+ * tmpfs (since Linux 3.5)
+
+  So: we only do this if PUNCH_HOLE *and* KEEP_SIZE are defined.
+
+ */
+#if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef CEPH_HAVE_FALLOCATE
+# ifdef FALLOC_FL_KEEP_SIZE
+ // first try fallocate
+ r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
+ if (r < 0) {
+ r = -errno;
+ }
+ if (r != -EOPNOTSUPP) {
+    goto out; // success, or a real error; either way we are done
+  }
+  // fallocate returned -EOPNOTSUPP: fall back to writing zeros.
+# endif
+# endif
+#endif
+
+ {
+ // fall back to writing zeros
+ ceph::bufferlist bl;
+ bl.append_zero(length);
+ r = ::lseek64(fd, offset, SEEK_SET);
+ if (r < 0) {
+ r = -errno;
+ goto out;
+ }
+ r = bl.write_fd(fd);
+ }
+
+ out:
+ return r;
+}
+
+// ---------------
+
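
`FS::zero()` above tries to punch a hole and falls back to writing literal zeros when the filesystem refuses. A standalone, Linux-only sketch of the same strategy (the buffer size and helper name are arbitrary):

```cpp
#include <algorithm>
#include <cerrno>
#include <fcntl.h>
#include <unistd.h>
#include <vector>
#ifdef __linux__
#include <linux/falloc.h>
#endif

int zero_range(int fd, off_t offset, off_t length) {
#if defined(__linux__) && defined(FALLOC_FL_PUNCH_HOLE)
  // Preferred path: deallocate the range without changing the file size.
  if (::fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  offset, length) == 0)
    return 0;
  if (errno != EOPNOTSUPP)
    return -errno;                      // a real error
#endif
  // Fallback: write literal zeros in chunks.
  std::vector<char> zeros(64 * 1024, 0);
  off_t end = offset + length;
  for (off_t pos = offset; pos < end; ) {
    size_t n = static_cast<size_t>(
        std::min<off_t>(end - pos, static_cast<off_t>(zeros.size())));
    ssize_t w = ::pwrite(fd, zeros.data(), n, pos);
    if (w < 0)
      return -errno;
    pos += w;
  }
  return 0;
}
```
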
diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h
new file mode 100644
index 000000000..a1852f49f
--- /dev/null
+++ b/src/os/fs/FS.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_FS_H
+#define CEPH_OS_FS_H
+
+#include <errno.h>
+#include <time.h>
+
+#include <string>
+
+#include "include/types.h"
+#include "common/Cond.h"
+
+class FS {
+public:
+ virtual ~FS() { }
+
+ static FS *create(uint64_t f_type);
+ static FS *create_by_fd(int fd);
+
+ virtual const char *get_name() {
+ return "generic";
+ }
+
+ virtual int set_alloc_hint(int fd, uint64_t hint);
+
+ virtual int get_handle(int fd, std::string *h);
+ virtual int open_handle(int mount_fd, const std::string& h, int flags);
+
+ virtual int copy_file_range(int to_fd, uint64_t to_offset,
+ int from_fd,
+ uint64_t from_offset, uint64_t from_len);
+ virtual int zero(int fd, uint64_t offset, uint64_t length);
+
+ // -- aio --
+};
+
+#endif
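
A brief hedged usage sketch of the factory above: `create_by_fd()` inspects `f_type` via `fstatfs()` and returns the XFS helper when available, otherwise the generic no-op implementation. The fds and sizes here are placeholders:

```cpp
#include <memory>
// relies on FS.h above

void fs_helper_example(int dir_fd, int object_fd) {
  std::unique_ptr<FS> fs(FS::create_by_fd(dir_fd));  // "xfs" or "generic"
  fs->set_alloc_hint(object_fd, 4 * 1024 * 1024);    // no-op on the generic FS
  fs->zero(object_fd, 0, 65536);                     // punch hole, else write zeros
}
```
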
diff --git a/src/os/fs/XFS.cc b/src/os/fs/XFS.cc
new file mode 100644
index 000000000..c72ee1a08
--- /dev/null
+++ b/src/os/fs/XFS.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XFS.h"
+
+#include <xfs/xfs.h>
+
+int XFS::set_alloc_hint(int fd, uint64_t val)
+{
+ struct fsxattr fsx;
+ struct stat sb;
+ int ret;
+
+ if (fstat(fd, &sb) < 0) {
+ ret = -errno;
+ return ret;
+ }
+ if (!S_ISREG(sb.st_mode)) {
+ return -EINVAL;
+ }
+
+ if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+ ret = -errno;
+ return ret;
+ }
+
+ // already set?
+ if ((fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val)
+ return 0;
+
+ // xfs won't change extent size if any extents are allocated
+ if (fsx.fsx_nextents != 0)
+ return 0;
+
+ fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+ fsx.fsx_extsize = val;
+
+ if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+ ret = -errno;
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h
new file mode 100644
index 000000000..f0ea717e3
--- /dev/null
+++ b/src/os/fs/XFS.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_XFS_H
+#define CEPH_OS_XFS_H
+
+#include "FS.h"
+
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342
+# endif
+
+class XFS : public FS {
+ const char *get_name() override {
+ return "xfs";
+ }
+ int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif
diff --git a/src/os/fs/ZFS.cc b/src/os/fs/ZFS.cc
new file mode 100644
index 000000000..02520796c
--- /dev/null
+++ b/src/os/fs/ZFS.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#define HAVE_IOCTL_IN_SYS_IOCTL_H
+#include <libzfs.h>
+#include "ZFS.h"
+
+const int ZFS::TYPE_FILESYSTEM = ZFS_TYPE_FILESYSTEM;
+const int ZFS::TYPE_SNAPSHOT = ZFS_TYPE_SNAPSHOT;
+const int ZFS::TYPE_VOLUME = ZFS_TYPE_VOLUME;
+const int ZFS::TYPE_DATASET = ZFS_TYPE_DATASET;
+
+ZFS::~ZFS()
+{
+ if (g_zfs)
+ ::libzfs_fini((libzfs_handle_t*)g_zfs);
+}
+
+int ZFS::init()
+{
+ g_zfs = ::libzfs_init();
+ return g_zfs ? 0 : -EINVAL;
+}
+
+ZFS::Handle *ZFS::open(const char *n, int t)
+{
+ return (ZFS::Handle*)::zfs_open((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t);
+}
+
+void ZFS::close(ZFS::Handle *h)
+{
+ ::zfs_close((zfs_handle_t*)h);
+}
+
+const char *ZFS::get_name(ZFS::Handle *h)
+{
+ return ::zfs_get_name((zfs_handle_t*)h);
+}
+
+ZFS::Handle *ZFS::path_to_zhandle(const char *p, int t)
+{
+ return ::zfs_path_to_zhandle((libzfs_handle_t*)g_zfs, (char *)p, (zfs_type_t)t);
+}
+
+int ZFS::create(const char *n, int t)
+{
+ return ::zfs_create((libzfs_handle_t*)g_zfs, n, (zfs_type_t)t, NULL);
+}
+
+int ZFS::snapshot(const char *n, bool r)
+{
+ return ::zfs_snapshot((libzfs_handle_t*)g_zfs, n, (boolean_t)r, NULL);
+}
+
+int ZFS::rollback(ZFS::Handle *h, ZFS::Handle *snap, bool f)
+{
+ return ::zfs_rollback((zfs_handle_t*)h, (zfs_handle_t*)snap, (boolean_t)f);
+}
+
+int ZFS::destroy_snaps(ZFS::Handle *h, const char *n, bool d)
+{
+ return ::zfs_destroy_snaps((zfs_handle_t*)h, (char *)n, (boolean_t)d);
+}
+
+bool ZFS::is_mounted(ZFS::Handle *h, char **p)
+{
+ return (bool)::zfs_is_mounted((zfs_handle_t*)h, p);
+}
+
+int ZFS::mount(ZFS::Handle *h, const char *o, int f)
+{
+ return ::zfs_mount((zfs_handle_t*)h, o, f);
+}
+
+int ZFS::umount(ZFS::Handle *h, const char *o, int f)
+{
+ return ::zfs_unmount((zfs_handle_t*)h, o, f);
+}
+
+int ZFS::iter_snapshots_sorted(ZFS::Handle *h, ZFS::iter_func f, void *d)
+{
+ return ::zfs_iter_snapshots_sorted((zfs_handle_t*)h, (zfs_iter_f)f, d);
+}
diff --git a/src/os/fs/ZFS.h b/src/os/fs/ZFS.h
new file mode 100644
index 000000000..3ebe11107
--- /dev/null
+++ b/src/os/fs/ZFS.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFS_H
+#define CEPH_ZFS_H
+
+// Simple wrapper to hide libzfs.h. (it conflicts with standard linux headers)
+class ZFS {
+ void *g_zfs;
+public:
+
+ static const int TYPE_FILESYSTEM;
+ static const int TYPE_SNAPSHOT;
+ static const int TYPE_VOLUME;
+ static const int TYPE_POOL;
+ static const int TYPE_DATASET;
+
+ typedef void Handle;
+ typedef int (*iter_func)(Handle *, void *);
+
+ static const char *get_name(Handle *);
+
+ ZFS() : g_zfs(NULL) {}
+ ~ZFS();
+ int init();
+ Handle *open(const char *, int);
+ void close(Handle *);
+ Handle *path_to_zhandle(const char *, int);
+ int create(const char *, int);
+ int snapshot(const char *, bool);
+ int rollback(Handle *, Handle *, bool);
+ int destroy_snaps(Handle *, const char *, bool);
+ int iter_snapshots_sorted(Handle *, iter_func, void *);
+ int mount(Handle *, const char *, int);
+ int umount(Handle *, const char *, int);
+ bool is_mounted(Handle *, char **);
+};
+
+#endif
diff --git a/src/os/fs/btrfs_ioctl.h b/src/os/fs/btrfs_ioctl.h
new file mode 100644
index 000000000..277498ca8
--- /dev/null
+++ b/src/os/fs/btrfs_ioctl.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2007 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __IOCTL_
+#define __IOCTL_
+
+#if defined(__linux__)
+#include <linux/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioctl.h>
+#endif
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+
+/* this should be 4k */
+#define BTRFS_PATH_NAME_MAX 4087
+struct btrfs_ioctl_vol_args {
+ __s64 fd;
+ char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
+#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
+
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
+ __s64 fd;
+ __u64 transid;
+ __u64 flags;
+ __u64 unused[4];
+ char name[BTRFS_SUBVOL_NAME_MAX + 1];
+};
+
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+ __u64 treeid;
+ __u64 objectid;
+ char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+struct btrfs_ioctl_search_key {
+ /* which root are we searching. 0 is the tree of tree roots */
+ __u64 tree_id;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_objectid;
+ __u64 max_objectid;
+
+ /* keys returned will be >= min and <= max */
+ __u64 min_offset;
+ __u64 max_offset;
+
+ /* max and min transids to search for */
+ __u64 min_transid;
+ __u64 max_transid;
+
+ /* keys returned will be >= min and <= max */
+ __u32 min_type;
+ __u32 max_type;
+
+ /*
+ * how many items did userland ask for, and how many are we
+ * returning
+ */
+ __u32 nr_items;
+
+ /* align to 64 bits */
+ __u32 unused;
+
+ /* some extra for later */
+ __u64 unused1;
+ __u64 unused2;
+ __u64 unused3;
+ __u64 unused4;
+};
+
+struct btrfs_ioctl_search_header {
+ __u64 transid;
+ __u64 objectid;
+ __u64 offset;
+ __u32 type;
+ __u32 len;
+};
+
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+ struct btrfs_ioctl_search_key key;
+ char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
+struct btrfs_ioctl_clone_range_args {
+ __s64 src_fd;
+ __u64 src_offset, src_length;
+ __u64 dest_offset;
+};
+
+/* flags for the defrag range ioctl */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+ * already defragged. Use 0 to take the kernel default
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /* spare for later */
+ __u32 unused[5];
+};
+
+struct btrfs_ioctl_space_info {
+ __u64 flags;
+ __u64 total_bytes;
+ __u64 used_bytes;
+};
+
+struct btrfs_ioctl_space_args {
+ __u64 space_slots;
+ __u64 total_spaces;
+ struct btrfs_ioctl_space_info spaces[0];
+};
+
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+ struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+
+#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+ struct btrfs_ioctl_vol_args)
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+ struct btrfs_ioctl_clone_range_args)
+
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+ struct btrfs_ioctl_defrag_range_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+ struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+ struct btrfs_ioctl_ino_lookup_args)
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+ struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
+ struct btrfs_ioctl_vol_args_v2)
+#endif
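
A hedged sketch of how the snapshot ioctl declared above is typically driven (this mirrors the btrfs backend's pattern: the ioctl targets the destination directory fd, while `args.fd` names the source subvolume; the helper and its arguments are illustrative):

```cpp
#include <cerrno>
#include <cstring>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/types.h>   // __s64/__u64 used by btrfs_ioctl.h
#include "btrfs_ioctl.h"

// Snapshot the subvolume open at src_fd into a child of dest_dir_fd.
int create_btrfs_snapshot(int dest_dir_fd, int src_fd, const char *snap_name) {
  struct btrfs_ioctl_vol_args args;
  memset(&args, 0, sizeof(args));
  args.fd = src_fd;
  strncpy(args.name, snap_name, sizeof(args.name) - 1);
  if (ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE, &args) < 0)
    return -errno;
  return 0;
}
```
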
diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc
new file mode 100644
index 000000000..8f6b19bda
--- /dev/null
+++ b/src/os/kstore/KStore.cc
@@ -0,0 +1,3408 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#include "KStore.h"
+#include "osd/osd_types.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/Formatter.h"
+#include "common/pretty_binary.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_kstore
+
+/*
+
+ TODO:
+
+ * superblock, features
+ * refcounted extents (for efficient clone)
+
+ */
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::JSONFormatter;
+
+const string PREFIX_SUPER = "S"; // field -> value
+const string PREFIX_COLL = "C"; // collection name -> (nothing)
+const string PREFIX_OBJ = "O"; // object name -> onode
+const string PREFIX_DATA = "D"; // nid + offset -> data
+const string PREFIX_OMAP = "M"; // u64 + keyname -> value
+
+/*
+ * object name key structure
+ *
+ * 2 chars: shard (-- for none, or hex digit, so that we sort properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * 1 char: '.'
+ *
+ * escaped string: namespace
+ *
+ * 1 char: '<', '=', or '>'. if =, then object key == object name, and
+ * we are followed just by the key. otherwise, we are followed by
+ * the key and then the object name.
+ * escaped string: key
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ */
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does. We do this by escaping anything <= to '#' with #
+ * plus a 2 digit hex string, and anything >= '~' with ~ plus the two
+ * hex digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ */
+
+static void append_escaped(const string &in, string *out)
+{
+ char hexbyte[8];
+ for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
+ if ((unsigned char)*i <= '#') {
+ snprintf(hexbyte, sizeof(hexbyte), "#%02x", (uint8_t)*i);
+ out->append(hexbyte);
+ } else if ((unsigned char)*i >= '~') {
+ snprintf(hexbyte, sizeof(hexbyte), "~%02x", (uint8_t)*i);
+ out->append(hexbyte);
+ } else {
+ out->push_back(*i);
+ }
+ }
+ out->push_back('!');
+}
+
+static int decode_escaped(const char *p, string *out)
+{
+ const char *orig_p = p;
+ while (*p && *p != '!') {
+ if (*p == '#' || *p == '~') {
+ unsigned hex;
+ int r = sscanf(++p, "%2x", &hex);
+ if (r < 1)
+ return -EINVAL;
+ out->push_back((char)hex);
+ p += 2;
+ } else {
+ out->push_back(*p++);
+ }
+ }
+ return p - orig_p;
+}
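
A worked illustration of the escaping rule described above (throwaway snippet relying on the two static helpers just defined): bytes <= '#' (0x23) become '#' plus two hex digits, bytes >= '~' (0x7e) become '~' plus two hex digits, and '!' terminates the string, which is safe because a literal '!' would itself have been escaped.

```cpp
void escape_example() {
  std::string out;
  append_escaped("a#b~c", &out);       // out == "a#23b~7ec!"
  std::string back;
  decode_escaped(out.c_str(), &back);  // back == "a#b~c"
}
```
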
+
+static void _key_encode_shard(shard_id_t shard, string *key)
+{
+ // make field ordering match with ghobject_t compare operations
+ if (shard == shard_id_t::NO_SHARD) {
+ // otherwise ff will sort *after* 0, not before.
+ key->append("--");
+ } else {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%02x", (int)shard);
+ key->append(buf);
+ }
+}
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+ if (key[0] == '-') {
+ *pshard = shard_id_t::NO_SHARD;
+ } else {
+ unsigned shard;
+ int r = sscanf(key, "%x", &shard);
+ if (r < 1)
+ return NULL;
+ *pshard = shard_id_t(shard);
+ }
+ return key + 2;
+}
+
+static void get_coll_key_range(const coll_t& cid, int bits,
+ string *temp_start, string *temp_end,
+ string *start, string *end)
+{
+ temp_start->clear();
+ temp_end->clear();
+ start->clear();
+ end->clear();
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ _key_encode_shard(pgid.shard, start);
+ *end = *start;
+ *temp_start = *start;
+ *temp_end = *start;
+
+ _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
+ _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
+ _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), start);
+ _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), temp_start);
+ start->append(".");
+ temp_start->append(".");
+
+ _key_encode_u64(pgid.pool() + 0x8000000000000000ull, end);
+ _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_end);
+
+ uint64_t end_hash =
+ hobject_t::_reverse_bits(pgid.ps()) + (1ull << (32-bits));
+ if (end_hash <= 0xffffffffull) {
+ _key_encode_u32(end_hash, end);
+ _key_encode_u32(end_hash, temp_end);
+ end->append(".");
+ temp_end->append(".");
+ } else {
+ _key_encode_u32(0xffffffff, end);
+ _key_encode_u32(0xffffffff, temp_end);
+ end->append(":");
+ temp_end->append(":");
+ }
+ } else {
+ _key_encode_shard(shard_id_t::NO_SHARD, start);
+ _key_encode_u64(-1ull + 0x8000000000000000ull, start);
+ *end = *start;
+ _key_encode_u32(0, start);
+ start->append(".");
+ _key_encode_u32(0xffffffff, end);
+ end->append(":");
+
+ // no separate temp section
+ *temp_start = *end;
+ *temp_end = *end;
+ }
+}
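+// Illustrative range (hedged example): for a PG in pool 1, the regular
+// object range above is keyed under pool 1 + 2^63 while its temp objects
+// are keyed under pool (-2 - 1) + 2^63 = -3 + 2^63; both ranges are then
+// narrowed by the bit-reversed PG hash prefix, with '.' and ':' bounding
+// what may follow the hash.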
+
+static int get_key_object(const string& key, ghobject_t *oid);
+
+static void get_object_key(CephContext* cct, const ghobject_t& oid,
+ string *key)
+{
+ key->clear();
+
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+ key->append(".");
+
+ append_escaped(oid.hobj.nspace, key);
+
+ if (oid.hobj.get_key().length()) {
+ // is a key... could be < = or >.
+ // (ASCII chars < = and > sort in that order, yay)
+ if (oid.hobj.get_key() < oid.hobj.oid.name) {
+ key->append("<");
+ append_escaped(oid.hobj.get_key(), key);
+ append_escaped(oid.hobj.oid.name, key);
+ } else if (oid.hobj.get_key() > oid.hobj.oid.name) {
+ key->append(">");
+ append_escaped(oid.hobj.get_key(), key);
+ append_escaped(oid.hobj.oid.name, key);
+ } else {
+ // same as no key
+ key->append("=");
+ append_escaped(oid.hobj.oid.name, key);
+ }
+ } else {
+ // no key
+ key->append("=");
+ append_escaped(oid.hobj.oid.name, key);
+ }
+
+ _key_encode_u64(oid.hobj.snap, key);
+ _key_encode_u64(oid.generation, key);
+
+ // sanity check
+ if (true) {
+ ghobject_t t;
+ int r = get_key_object(*key, &t);
+ if (r || t != oid) {
+ derr << " r " << r << dendl;
+ derr << "key " << pretty_binary_string(*key) << dendl;
+ derr << "oid " << oid << dendl;
+ derr << " t " << t << dendl;
+ ceph_assert(t == oid);
+ }
+ }
+}
+
+static int get_key_object(const string& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ p = _key_decode_shard(p, &oid->shard_id);
+
+ uint64_t pool;
+ p = _key_decode_u64(p, &pool);
+ oid->hobj.pool = pool - 0x8000000000000000ull;
+
+ unsigned hash;
+ p = _key_decode_u32(p, &hash);
+ oid->hobj.set_bitwise_key_u32(hash);
+ if (*p != '.')
+ return -5;
+ ++p;
+
+ r = decode_escaped(p, &oid->hobj.nspace);
+ if (r < 0)
+ return -6;
+ p += r + 1;
+
+ if (*p == '=') {
+ // no key
+ ++p;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -7;
+ p += r + 1;
+ } else if (*p == '<' || *p == '>') {
+ // key + name
+ ++p;
+ string okey;
+ r = decode_escaped(p, &okey);
+ if (r < 0)
+ return -8;
+ p += r + 1;
+ r = decode_escaped(p, &oid->hobj.oid.name);
+ if (r < 0)
+ return -9;
+ p += r + 1;
+ oid->hobj.set_key(okey);
+ } else {
+ // malformed
+ return -10;
+ }
+
+ p = _key_decode_u64(p, &oid->hobj.snap.val);
+ p = _key_decode_u64(p, &oid->generation);
+ if (*p) {
+ // if we get something other than a null terminator here,
+ // something went wrong.
+ return -12;
+ }
+
+ return 0;
+}
+
+
+static void get_data_key(uint64_t nid, uint64_t offset, string *out)
+{
+ _key_encode_u64(nid, out);
+ _key_encode_u64(offset, out);
+}
+
+// '-' < '.' < '~'
+static void get_omap_header(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('-');
+}
+
+// No need to escape the user key: the fixed-length encoded id plus '.'
+// already forms a clean prefix, and the user key is the last component.
+static void get_omap_key(uint64_t id, const string& key, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('.');
+ out->append(key);
+}
+
+static void rewrite_omap_key(uint64_t id, string old, string *out)
+{
+ _key_encode_u64(id, out);
+ out->append(old.substr(out->length()));
+}
+
+static void decode_omap_key(const string& key, string *user_key)
+{
+ *user_key = key.substr(sizeof(uint64_t) + 1);
+}
+
+static void get_omap_tail(uint64_t id, string *out)
+{
+ _key_encode_u64(id, out);
+ out->push_back('~');
+}
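+// Illustrative omap key layout for id 42 (hedged example):
+//   header: encode_u64(42) + '-'
+//   key:    encode_u64(42) + '.' + <user key>
+//   tail:   encode_u64(42) + '~'
+// Because '-' < '.' < '~', iterating from the header and stopping at the
+// tail visits exactly this object's omap header and keys.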
+
+
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.onode(" << this << ") "
+
+void KStore::Onode::flush()
+{
+ std::unique_lock<std::mutex> l(flush_lock);
+ dout(20) << __func__ << " " << flush_txns << dendl;
+ while (!flush_txns.empty())
+ flush_cond.wait(l);
+ dout(20) << __func__ << " done" << dendl;
+}
+
+// OnodeHashLRU
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.lru(" << this << ") "
+
+void KStore::OnodeHashLRU::_touch(OnodeRef o)
+{
+ lru_list_t::iterator p = lru.iterator_to(*o);
+ lru.erase(p);
+ lru.push_front(*o);
+}
+
+void KStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << " " << oid << " " << o << dendl;
+ ceph_assert(onode_map.count(oid) == 0);
+ onode_map[oid] = o;
+ lru.push_front(*o);
+}
+
+KStore::OnodeRef KStore::OnodeHashLRU::lookup(const ghobject_t& oid)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
+ if (p == onode_map.end()) {
+ dout(30) << __func__ << " " << oid << " miss" << dendl;
+ return OnodeRef();
+ }
+ dout(30) << __func__ << " " << oid << " hit " << p->second << dendl;
+ _touch(p->second);
+ return p->second;
+}
+
+void KStore::OnodeHashLRU::clear()
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(10) << __func__ << dendl;
+ lru.clear();
+ onode_map.clear();
+}
+
+void KStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
+ const ghobject_t& new_oid)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+ po = onode_map.find(old_oid);
+ pn = onode_map.find(new_oid);
+
+ ceph_assert(po != onode_map.end());
+ if (pn != onode_map.end()) {
+ lru_list_t::iterator p = lru.iterator_to(*pn->second);
+ lru.erase(p);
+ onode_map.erase(pn);
+ }
+ OnodeRef o = po->second;
+
+ // install a non-existent onode in its place
+ po->second.reset(new Onode(cct, old_oid, o->key));
+ lru.push_back(*po->second);
+
+ // fix oid, key
+ onode_map.insert(make_pair(new_oid, o));
+ _touch(o);
+ o->oid = new_oid;
+ get_object_key(cct, new_oid, &o->key);
+}
+
+bool KStore::OnodeHashLRU::get_next(
+ const ghobject_t& after,
+ pair<ghobject_t,OnodeRef> *next)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(20) << __func__ << " after " << after << dendl;
+
+ if (after == ghobject_t()) {
+ if (lru.empty()) {
+ return false;
+ }
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.begin();
+ ceph_assert(p != onode_map.end());
+ next->first = p->first;
+ next->second = p->second;
+ return true;
+ }
+
+ ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(after);
+ ceph_assert(p != onode_map.end()); // for now
+ lru_list_t::iterator pi = lru.iterator_to(*p->second);
+ ++pi;
+ if (pi == lru.end()) {
+ return false;
+ }
+ next->first = pi->oid;
+ next->second = onode_map[pi->oid];
+ return true;
+}
+
+int KStore::OnodeHashLRU::trim(int max)
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(20) << __func__ << " max " << max
+ << " size " << onode_map.size() << dendl;
+ int trimmed = 0;
+ int num = onode_map.size() - max;
+ if (onode_map.size() == 0 || num <= 0)
+ return 0; // don't even try
+
+ lru_list_t::iterator p = lru.end();
+ if (num)
+ --p;
+ while (num > 0) {
+ Onode *o = &*p;
+ int refs = o->nref.load();
+ if (refs > 1) {
+ dout(20) << __func__ << " " << o->oid << " has " << refs
+ << " refs; stopping with " << num << " left to trim" << dendl;
+ break;
+ }
+ dout(30) << __func__ << " trim " << o->oid << dendl;
+ if (p != lru.begin()) {
+ lru.erase(p--);
+ } else {
+ lru.erase(p);
+ ceph_assert(num == 1);
+ }
+ o->get(); // paranoia
+ onode_map.erase(o->oid);
+ o->put();
+ --num;
+ ++trimmed;
+ }
+ return trimmed;
+}
+
+// =======================================================
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << store->path << ").collection(" << cid << ") "
+
+KStore::Collection::Collection(KStore *ns, coll_t cid)
+ : CollectionImpl(ns->cct, cid),
+ store(ns),
+ osr(new OpSequencer()),
+ onode_map(store->cct)
+{
+}
+
+void KStore::Collection::flush()
+{
+ osr->flush();
+}
+
+bool KStore::Collection::flush_commit(Context *c)
+{
+ return osr->flush_commit(c);
+}
+
+
+KStore::OnodeRef KStore::Collection::get_onode(
+ const ghobject_t& oid,
+ bool create)
+{
+ ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
+
+ spg_t pgid;
+ if (cid.is_pg(&pgid)) {
+ if (!oid.match(cnode.bits, pgid.ps())) {
+ lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+ << pgid << " bits " << cnode.bits << dendl;
+ ceph_abort();
+ }
+ }
+
+ OnodeRef o = onode_map.lookup(oid);
+ if (o)
+ return o;
+
+ string key;
+ get_object_key(store->cct, oid, &key);
+
+ ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+ << pretty_binary_string(key) << dendl;
+
+ bufferlist v;
+ int r = store->db->get(PREFIX_OBJ, key, &v);
+ ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+ Onode *on;
+ if (v.length() == 0) {
+ ceph_assert(r == -ENOENT);
+ if (!create)
+ return OnodeRef();
+
+ // new
+ on = new Onode(store->cct, oid, key);
+ on->dirty = true;
+ } else {
+ // loaded
+ ceph_assert(r >= 0);
+ on = new Onode(store->cct, oid, key);
+ on->exists = true;
+ auto p = v.cbegin();
+ decode(on->onode, p);
+ }
+ o.reset(on);
+ onode_map.add(oid, o);
+ return o;
+}
+
+
+
+// =======================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << path << ") "
+
+KStore::KStore(CephContext *cct, const string& path)
+ : ObjectStore(cct, path),
+ db(NULL),
+ basedir(path),
+ path_fd(-1),
+ fsid_fd(-1),
+ mounted(false),
+ nid_last(0),
+ nid_max(0),
+ throttle_ops(cct, "kstore_max_ops", cct->_conf->kstore_max_ops),
+ throttle_bytes(cct, "kstore_max_bytes", cct->_conf->kstore_max_bytes),
+ finisher(cct),
+ kv_sync_thread(this),
+ kv_stop(false),
+ logger(nullptr)
+{
+ _init_logger();
+}
+
+KStore::~KStore()
+{
+ _shutdown_logger();
+ ceph_assert(!mounted);
+ ceph_assert(db == NULL);
+ ceph_assert(fsid_fd < 0);
+}
+
+void KStore::_init_logger()
+{
+ // XXX
+ PerfCountersBuilder b(cct, "KStore",
+ l_kstore_first, l_kstore_last);
+ b.add_time_avg(l_kstore_state_prepare_lat, "state_prepare_lat", "Average prepare state latency");
+ b.add_time_avg(l_kstore_state_kv_queued_lat, "state_kv_queued_lat", "Average kv_queued state latency");
+ b.add_time_avg(l_kstore_state_kv_done_lat, "state_kv_done_lat", "Average kv_done state latency");
+ b.add_time_avg(l_kstore_state_finishing_lat, "state_finishing_lat", "Average finishing state latency");
+ b.add_time_avg(l_kstore_state_done_lat, "state_done_lat", "Average done state latency");
+ logger = b.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+}
+
+void KStore::_shutdown_logger()
+{
+ // XXX
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+}
+
+int KStore::_open_path()
+{
+ ceph_assert(path_fd < 0);
+ path_fd = ::open(path.c_str(), O_DIRECTORY|O_CLOEXEC);
+ if (path_fd < 0) {
+ int r = -errno;
+ derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void KStore::_close_path()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+ path_fd = -1;
+}
+
+int KStore::_open_fsid(bool create)
+{
+ ceph_assert(fsid_fd < 0);
+ int flags = O_RDWR;
+ if (create)
+ flags |= O_CREAT;
+ fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+ if (fsid_fd < 0) {
+ int err = -errno;
+ derr << __func__ << " " << cpp_strerror(err) << dendl;
+ return err;
+ }
+ return 0;
+}
+
+int KStore::_read_fsid(uuid_d *uuid)
+{
+ char fsid_str[40];
+ memset(fsid_str, 0, sizeof(fsid_str));
+ int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+ if (ret < 0) {
+ derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ if (ret > 36)
+ fsid_str[36] = 0;
+ else
+ fsid_str[ret] = 0;
+ if (!uuid->parse(fsid_str)) {
+ derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int KStore::_write_fsid()
+{
+ int r = ::ftruncate(fsid_fd, 0);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ string str = stringify(fsid) + "\n";
+ r = safe_write(fsid_fd, str.c_str(), str.length());
+ if (r < 0) {
+ derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = ::fsync(fsid_fd);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void KStore::_close_fsid()
+{
+ VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+ fsid_fd = -1;
+}
+
+int KStore::_lock_fsid()
+{
+ struct flock l;
+ memset(&l, 0, sizeof(l));
+ l.l_type = F_WRLCK;
+ l.l_whence = SEEK_SET;
+ l.l_start = 0;
+ l.l_len = 0;
+ int r = ::fcntl(fsid_fd, F_SETLK, &l);
+ if (r < 0) {
+ int err = errno;
+ derr << __func__ << " failed to lock " << path << "/fsid"
+ << " (is another ceph-osd still running?)"
+ << cpp_strerror(err) << dendl;
+ return -err;
+ }
+ return 0;
+}
+
+bool KStore::test_mount_in_use()
+{
+ // most error conditions mean the mount is not in use (e.g., because
+ // it doesn't exist). only if we fail to lock do we conclude it is
+ // in use.
+ bool ret = false;
+ int r = _open_path();
+ if (r < 0)
+ return false;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+ r = _lock_fsid();
+ if (r < 0)
+ ret = true; // if we can't lock, it is in use
+ _close_fsid();
+ out_path:
+ _close_path();
+ return ret;
+}
+
+int KStore::_open_db(bool create)
+{
+ int r;
+ ceph_assert(!db);
+ char fn[PATH_MAX];
+ snprintf(fn, sizeof(fn), "%s/db", path.c_str());
+
+ string kv_backend;
+ if (create) {
+ kv_backend = cct->_conf->kstore_backend;
+ } else {
+ r = read_meta("kv_backend", &kv_backend);
+ if (r < 0) {
+ derr << __func__ << " uanble to read 'kv_backend' meta" << dendl;
+ return -EIO;
+ }
+ }
+ dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+ if (create) {
+ int r = ::mkdir(fn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+
+ // wal_dir, too!
+ char walfn[PATH_MAX];
+ snprintf(walfn, sizeof(walfn), "%s/db.wal", path.c_str());
+ r = ::mkdir(walfn, 0755);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " failed to create " << walfn
+ << ": " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ }
+
+ db = KeyValueDB::create(cct, kv_backend, fn);
+ if (!db) {
+ derr << __func__ << " error creating db" << dendl;
+ return -EIO;
+ }
+ string options;
+ if (kv_backend == "rocksdb")
+ options = cct->_conf->kstore_rocksdb_options;
+ db->init(options);
+ stringstream err;
+ if (create)
+ r = db->create_and_open(err);
+ else
+ r = db->open(err);
+ if (r) {
+ derr << __func__ << " erroring opening db: " << err.str() << dendl;
+ delete db;
+ db = NULL;
+ return -EIO;
+ }
+ dout(1) << __func__ << " opened " << kv_backend
+ << " path " << fn << " options " << options << dendl;
+ return 0;
+}
+
+void KStore::_close_db()
+{
+ ceph_assert(db);
+ delete db;
+ db = NULL;
+}
+
+int KStore::_open_collections(int *errors)
+{
+ ceph_assert(coll_map.empty());
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+ for (it->upper_bound(string());
+ it->valid();
+ it->next()) {
+ coll_t cid;
+ if (cid.parse(it->key())) {
+ auto c = ceph::make_ref<Collection>(this, cid);
+ bufferlist bl = it->value();
+ auto p = bl.cbegin();
+ try {
+ decode(c->cnode, p);
+ } catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode cnode, key:"
+ << pretty_binary_string(it->key()) << dendl;
+ return -EIO;
+ }
+ dout(20) << __func__ << " opened " << cid << dendl;
+ coll_map[cid] = c;
+ } else {
+ derr << __func__ << " unrecognized collection " << it->key() << dendl;
+ if (errors)
+ (*errors)++;
+ }
+ }
+ return 0;
+}
+
+int KStore::mkfs()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+ int r;
+ uuid_d old_fsid;
+
+ r = _open_path();
+ if (r < 0)
+ return r;
+
+ r = _open_fsid(true);
+ if (r < 0)
+ goto out_path_fd;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = _read_fsid(&old_fsid);
+ if (r < 0 || old_fsid.is_zero()) {
+ if (fsid.is_zero()) {
+ fsid.generate_random();
+ dout(1) << __func__ << " generated fsid " << fsid << dendl;
+ } else {
+ dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+ }
+ // we'll write it last.
+ } else {
+ if (!fsid.is_zero() && fsid != old_fsid) {
+ derr << __func__ << " on-disk fsid " << old_fsid
+ << " != provided " << fsid << dendl;
+ r = -EINVAL;
+ goto out_close_fsid;
+ }
+ fsid = old_fsid;
+ dout(1) << __func__ << " already created, fsid is " << fsid << dendl;
+ goto out_close_fsid;
+ }
+
+ r = _open_db(true);
+ if (r < 0)
+ goto out_close_fsid;
+
+ r = write_meta("kv_backend", cct->_conf->kstore_backend);
+ if (r < 0)
+ goto out_close_db;
+
+ r = write_meta("type", "kstore");
+ if (r < 0)
+ goto out_close_db;
+
+ // indicate mkfs completion/success by writing the fsid file
+ r = _write_fsid();
+ if (r == 0)
+ dout(10) << __func__ << " success" << dendl;
+ else
+ derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+
+ out_close_db:
+ _close_db();
+ out_close_fsid:
+ _close_fsid();
+ out_path_fd:
+ _close_path();
+ return r;
+}
+
+int KStore::mount()
+{
+ dout(1) << __func__ << " path " << path << dendl;
+
+ if (cct->_conf->kstore_fsck_on_mount) {
+ int rc = fsck(cct->_conf->kstore_fsck_on_mount_deep);
+ if (rc < 0)
+ return rc;
+ }
+
+ int r = _open_path();
+ if (r < 0)
+ return r;
+ r = _open_fsid(false);
+ if (r < 0)
+ goto out_path;
+
+ r = _read_fsid(&fsid);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _lock_fsid();
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_db(false);
+ if (r < 0)
+ goto out_fsid;
+
+ r = _open_super_meta();
+ if (r < 0)
+ goto out_db;
+
+ r = _open_collections();
+ if (r < 0)
+ goto out_db;
+
+ finisher.start();
+ kv_sync_thread.create("kstore_kv_sync");
+
+ mounted = true;
+ return 0;
+
+ out_db:
+ _close_db();
+ out_fsid:
+ _close_fsid();
+ out_path:
+ _close_path();
+ return r;
+}
+
+int KStore::umount()
+{
+ ceph_assert(mounted);
+ dout(1) << __func__ << dendl;
+
+ _sync();
+ _reap_collections();
+ coll_map.clear();
+
+ dout(20) << __func__ << " stopping kv thread" << dendl;
+ _kv_stop();
+ dout(20) << __func__ << " draining finisher" << dendl;
+ finisher.wait_for_empty();
+ dout(20) << __func__ << " stopping finisher" << dendl;
+ finisher.stop();
+ dout(20) << __func__ << " closing" << dendl;
+
+ mounted = false;
+ _close_db();
+ _close_fsid();
+ _close_path();
+ return 0;
+}
+
+int KStore::fsck(bool deep)
+{
+ dout(1) << __func__ << dendl;
+ int errors = 0;
+ dout(1) << __func__ << " finish with " << errors << " errors" << dendl;
+ return errors;
+}
+
+void KStore::_sync()
+{
+ dout(10) << __func__ << dendl;
+
+ std::unique_lock<std::mutex> l(kv_lock);
+ while (!kv_committing.empty() ||
+ !kv_queue.empty()) {
+ dout(20) << " waiting for kv to commit" << dendl;
+ kv_sync_cond.wait(l);
+ }
+
+ dout(10) << __func__ << " done" << dendl;
+}
+
+int KStore::statfs(struct store_statfs_t* buf0, osd_alert_list_t* alerts)
+{
+ struct statfs buf;
+ buf0->reset();
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ if (::statfs(basedir.c_str(), &buf) < 0) {
+ int r = -errno;
+ ceph_assert(r != -ENOENT);
+ return r;
+ }
+
+ buf0->total = buf.f_blocks * buf.f_bsize;
+ buf0->available = buf.f_bavail * buf.f_bsize;
+
+ return 0;
+}
+
+ObjectStore::CollectionHandle KStore::open_collection(const coll_t& cid)
+{
+ return _get_collection(cid);
+}
+
+ObjectStore::CollectionHandle KStore::create_new_collection(const coll_t& cid)
+{
+ auto c = ceph::make_ref<Collection>(this, cid);
+ std::unique_lock l{coll_lock};
+ new_coll_map[cid] = c;
+ return c;
+}
+
+int KStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap)
+{
+ return -ENOTSUP;
+}
+
+// ---------------
+// cache
+
+KStore::CollectionRef KStore::_get_collection(coll_t cid)
+{
+ std::shared_lock l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+void KStore::_queue_reap_collection(CollectionRef& c)
+{
+ dout(10) << __func__ << " " << c->cid << dendl;
+ std::lock_guard<std::mutex> l(reap_lock);
+ removed_collections.push_back(c);
+}
+
+void KStore::_reap_collections()
+{
+ list<CollectionRef> removed_colls;
+ std::lock_guard<std::mutex> l(reap_lock);
+ removed_colls.swap(removed_collections);
+
+ for (list<CollectionRef>::iterator p = removed_colls.begin();
+ p != removed_colls.end();
+ ++p) {
+ CollectionRef c = *p;
+ dout(10) << __func__ << " " << c->cid << dendl;
+ {
+ pair<ghobject_t,OnodeRef> next;
+ while (c->onode_map.get_next(next.first, &next)) {
+ ceph_assert(!next.second->exists);
+ if (!next.second->flush_txns.empty()) {
+ dout(10) << __func__ << " " << c->cid << " " << next.second->oid
+ << " flush_txns " << next.second->flush_txns << dendl;
+ return;
+ }
+ }
+ }
+ c->onode_map.clear();
+ dout(10) << __func__ << " " << c->cid << " done" << dendl;
+ }
+
+ dout(10) << __func__ << " all reaped" << dendl;
+}
+
+// ---------------
+// read operations
+
+bool KStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return false;
+ return true;
+}
+
+int KStore::stat(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists)
+ return -ENOENT;
+ st->st_size = o->onode.size;
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ return 0;
+}
+
+int KStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
+int KStore::read(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ uint32_t op_flags)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid
+ << " " << offset << "~" << length
+ << dendl;
+ bl.clear();
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+
+ int r;
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (offset == length && offset == 0)
+ length = o->onode.size;
+
+ r = _do_read(o, offset, length, bl, false, op_flags);
+
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_read(
+ OnodeRef o,
+ uint64_t offset,
+ size_t length,
+ bufferlist& bl,
+ bool do_cache,
+ uint32_t op_flags)
+{
+ int r = 0;
+ uint64_t stripe_size = o->onode.stripe_size;
+ uint64_t stripe_off;
+
+ dout(20) << __func__ << " " << offset << "~" << length << " size "
+ << o->onode.size << " nid " << o->onode.nid << dendl;
+ bl.clear();
+
+ if (offset > o->onode.size) {
+ goto out;
+ }
+ if (offset + length > o->onode.size) {
+ length = o->onode.size - offset;
+ }
+ if (stripe_size == 0) {
+ bl.append_zero(length);
+ r = length;
+ goto out;
+ }
+
+ o->flush();
+
+ stripe_off = offset % stripe_size;
+ while (length > 0) {
+ bufferlist stripe;
+ _do_read_stripe(o, offset - stripe_off, &stripe, do_cache);
+ dout(30) << __func__ << " stripe " << offset - stripe_off << " got "
+ << stripe.length() << dendl;
+ unsigned swant = std::min<unsigned>(stripe_size - stripe_off, length);
+ if (stripe.length()) {
+ if (swant == stripe.length()) {
+ bl.claim_append(stripe);
+ dout(30) << __func__ << " taking full stripe" << dendl;
+ } else {
+ unsigned l = 0;
+ if (stripe_off < stripe.length()) {
+ l = std::min<uint64_t>(stripe.length() - stripe_off, swant);
+ bufferlist t;
+ t.substr_of(stripe, stripe_off, l);
+ bl.claim_append(t);
+ dout(30) << __func__ << " taking " << stripe_off << "~" << l << dendl;
+ }
+ if (l < swant) {
+ bl.append_zero(swant - l);
+ dout(30) << __func__ << " adding " << swant - l << " zeros" << dendl;
+ }
+ }
+ } else {
+ dout(30) << __func__ << " generating " << swant << " zeros" << dendl;
+ bl.append_zero(swant);
+ }
+ offset += swant;
+ length -= swant;
+ stripe_off = 0;
+ }
+ r = bl.length();
+ dout(30) << " result:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+
+ out:
+ return r;
+}
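+// Illustrative read (hedged example, assuming stripe_size = 65536): a read
+// of 10000 bytes at offset 70000 has stripe_off = 4464, so _do_read reads
+// the stripe keyed at 65536, takes min(65536 - 4464, 10000) = 10000 bytes
+// from it, and zero-fills whatever portion the stored stripe does not
+// cover.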
+
+int KStore::fiemap(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ bufferlist& bl)
+{
+ map<uint64_t, uint64_t> m;
+ int r = fiemap(ch, oid, offset, len, m);
+ if (r >= 0) {
+ encode(m, bl);
+ }
+ return r;
+}
+
+int KStore::fiemap(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ map<uint64_t, uint64_t>& destmap)
+{
+ CollectionRef c = static_cast<Collection*>(ch.get());
+ if (!c)
+ return -ENOENT;
+ std::shared_lock l{c->lock};
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ return -ENOENT;
+ }
+
+ if (offset > o->onode.size)
+ goto out;
+
+ if (offset + len > o->onode.size) {
+ len = o->onode.size - offset;
+ }
+
+ dout(20) << __func__ << " " << offset << "~" << len << " size "
+ << o->onode.size << dendl;
+
+ // FIXME: do something smarter here
+ destmap[0] = o->onode.size;
+
+ out:
+ dout(20) << __func__ << " " << offset << "~" << len
+ << " size = 0 (" << destmap << ")" << dendl;
+ return 0;
+}
+
+int KStore::getattr(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ const char *name,
+ bufferptr& value)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid << " " << name << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r;
+ string k(name);
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ if (!o->onode.attrs.count(k)) {
+ r = -ENODATA;
+ goto out;
+ }
+ value = o->onode.attrs[k];
+ r = 0;
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid << " " << name
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::getattrs(
+ CollectionHandle& ch,
+ const ghobject_t& oid,
+ map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r;
+
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ aset = o->onode.attrs;
+ r = 0;
+ out:
+ dout(10) << __func__ << " " << ch->cid << " " << oid
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::list_collections(vector<coll_t>& ls)
+{
+ std::shared_lock l{coll_lock};
+ for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p)
+ ls.push_back(p->first);
+ return 0;
+}
+
+bool KStore::collection_exists(const coll_t& c)
+{
+ std::shared_lock l{coll_lock};
+ return coll_map.count(c);
+}
+
+int KStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ vector<ghobject_t> ls;
+ ghobject_t next;
+ int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+ &ls, &next);
+ if (r < 0) {
+ derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+ << dendl;
+ return r;
+ }
+ *empty = ls.empty();
+ dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+ return 0;
+}
+
+int KStore::collection_bits(CollectionHandle& ch)
+{
+ dout(15) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ dout(10) << __func__ << " " << ch->cid << " = " << c->cnode.bits << dendl;
+ return c->cnode.bits;
+}
+
+int KStore::collection_list(
+ CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ c->flush();
+ dout(15) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max << dendl;
+ int r;
+ {
+ std::shared_lock l{c->lock};
+ r = _collection_list(c, start, end, max, ls, pnext);
+ }
+
+ dout(10) << __func__ << " " << c->cid
+ << " start " << start << " end " << end << " max " << max
+ << " = " << r << ", ls.size() = " << ls->size()
+ << ", next = " << (pnext ? *pnext : ghobject_t()) << dendl;
+ return r;
+}
+
+int KStore::_collection_list(
+ Collection* c, const ghobject_t& start, const ghobject_t& end, int max,
+ vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+ int r = 0;
+ KeyValueDB::Iterator it;
+ string temp_start_key, temp_end_key;
+ string start_key, end_key;
+ bool set_next = false;
+ string pend;
+ bool temp;
+
+ ghobject_t static_next;
+ if (!pnext)
+ pnext = &static_next;
+
+ if (start == ghobject_t::get_max() ||
+ start.hobj.is_max()) {
+ goto out;
+ }
+ get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
+ &start_key, &end_key);
+ dout(20) << __func__
+ << " range " << pretty_binary_string(temp_start_key)
+ << " to " << pretty_binary_string(temp_end_key)
+ << " and " << pretty_binary_string(start_key)
+ << " to " << pretty_binary_string(end_key)
+ << " start " << start << dendl;
+ it = db->get_iterator(PREFIX_OBJ);
+ if (start == ghobject_t() || start == c->cid.get_min_hobj()) {
+ it->upper_bound(temp_start_key);
+ temp = true;
+ } else {
+ string k;
+ get_object_key(cct, start, &k);
+ if (start.hobj.is_temp()) {
+ temp = true;
+ ceph_assert(k >= temp_start_key && k < temp_end_key);
+ } else {
+ temp = false;
+ ceph_assert(k >= start_key && k < end_key);
+ }
+ dout(20) << " start from " << pretty_binary_string(k)
+ << " temp=" << (int)temp << dendl;
+ it->lower_bound(k);
+ }
+ if (end.hobj.is_max()) {
+ pend = temp ? temp_end_key : end_key;
+ } else {
+ if (end.hobj.is_temp()) {
+ if (temp)
+ get_object_key(cct, end, &pend);
+ else
+ goto out;
+ } else {
+ if (temp)
+ pend = temp_end_key;
+ else
+ get_object_key(cct, end, &pend);
+ }
+ }
+ dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ while (true) {
+ if (!it->valid() || it->key() >= pend) {
+ if (!it->valid())
+ dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+ else
+ dout(20) << __func__ << " key " << pretty_binary_string(it->key())
+ << " > " << end << dendl;
+ if (temp) {
+ if (end.hobj.is_temp()) {
+ if (it->valid() && it->key() < temp_end_key) {
+ int r = get_key_object(it->key(), pnext);
+ ceph_assert(r == 0);
+ set_next = true;
+ }
+ break;
+ }
+ dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+ temp = false;
+ it->upper_bound(start_key);
+ if (end.hobj.is_max())
+ pend = end_key;
+ else
+ get_object_key(cct, end, &pend);
+ dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ continue;
+ }
+ if (it->valid() && it->key() < end_key) {
+ int r = get_key_object(it->key(), pnext);
+ ceph_assert(r == 0);
+ set_next = true;
+ }
+ break;
+ }
+ dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ ceph_assert(r == 0);
+ if (ls->size() >= (unsigned)max) {
+ dout(20) << __func__ << " reached max " << max << dendl;
+ *pnext = oid;
+ set_next = true;
+ break;
+ }
+ ls->push_back(oid);
+ it->next();
+ }
+out:
+ if (!set_next) {
+ *pnext = ghobject_t::get_max();
+ }
+ return r;
+}
+
+// omap reads
+
+KStore::OmapIteratorImpl::OmapIteratorImpl(
+ CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+ : c(c), o(o), it(it)
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head) {
+ get_omap_key(o->onode.omap_head, string(), &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ }
+}
+
+int KStore::OmapIteratorImpl::seek_to_first()
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head) {
+ it->lower_bound(head);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+int KStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head) {
+ string key;
+ get_omap_key(o->onode.omap_head, after, &key);
+ it->upper_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+int KStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head) {
+ string key;
+ get_omap_key(o->onode.omap_head, to, &key);
+ it->lower_bound(key);
+ } else {
+ it = KeyValueDB::Iterator();
+ }
+ return 0;
+}
+
+bool KStore::OmapIteratorImpl::valid()
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head && it->valid() && it->raw_key().second <= tail) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int KStore::OmapIteratorImpl::next()
+{
+ std::shared_lock l{c->lock};
+ if (o->onode.omap_head) {
+ it->next();
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+string KStore::OmapIteratorImpl::key()
+{
+ std::shared_lock l{c->lock};
+ ceph_assert(it->valid());
+ string db_key = it->raw_key().second;
+ string user_key;
+ decode_omap_key(db_key, &user_key);
+ return user_key;
+}
+
+bufferlist KStore::OmapIteratorImpl::value()
+{
+ std::shared_lock l{c->lock};
+ ceph_assert(it->valid());
+ return it->value();
+}
+
+int KStore::omap_get(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ map<string, bufferlist> *out ///< [out] Key to value map
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_header(o->onode.omap_head, &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() == head) {
+ dout(30) << __func__ << " got header" << dendl;
+ *header = it->value();
+ } else if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(30) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ ceph_assert(it->key() < tail);
+ (*out)[user_key] = it->value();
+ }
+ it->next();
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_header(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ bufferlist *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ string head;
+ get_omap_header(o->onode.omap_head, &head);
+ if (db->get(PREFIX_OMAP, head, header) >= 0) {
+ dout(30) << __func__ << " got header" << dendl;
+ } else {
+ dout(30) << __func__ << " no header" << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ set<string> *keys ///< [out] Keys defined on oid
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ {
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_key(o->onode.omap_head, string(), &head);
+ get_omap_tail(o->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ }
+ string user_key;
+ decode_omap_key(it->key(), &user_key);
+ dout(30) << __func__ << " got " << pretty_binary_string(it->key())
+ << " -> " << user_key << dendl;
+ ceph_assert(it->key() < tail);
+ keys->insert(user_key);
+ it->next();
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_get_values(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to get
+ map<string, bufferlist> *out ///< [out] Returned keys and values
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ string key;
+ get_omap_key(o->onode.omap_head, *p, &key);
+ bufferlist val;
+ if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+ dout(30) << __func__ << " got " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ out->insert(make_pair(*p, val));
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::omap_check_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const set<string> &keys, ///< [in] Keys to check
+ set<string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ int r = 0;
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ r = -ENOENT;
+ goto out;
+ }
+ if (!o->onode.omap_head)
+ goto out;
+ o->flush();
+ for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p) {
+ string key;
+ get_omap_key(o->onode.omap_head, *p, &key);
+ bufferlist val;
+ if (db->get(PREFIX_OMAP, key, &val) >= 0) {
+ dout(30) << __func__ << " have " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ out->insert(*p);
+ } else {
+ dout(30) << __func__ << " miss " << pretty_binary_string(key)
+ << " -> " << *p << dendl;
+ }
+ }
+ out:
+ dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+ return r;
+}
+
+ObjectMap::ObjectMapIterator KStore::get_omap_iterator(
+ CollectionHandle& ch, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ )
+{
+
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ OnodeRef o = c->get_onode(oid, false);
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl;
+ return ObjectMap::ObjectMapIterator();
+ }
+ o->flush();
+ dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl;
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+
+// -----------------
+// write helpers
+
+int KStore::_open_super_meta()
+{
+ // nid
+ {
+ nid_max = 0;
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "nid_max", &bl);
+ auto p = bl.cbegin();
+ try {
+ decode(nid_max, p);
+ } catch (ceph::buffer::error& e) {
+ }
+ dout(10) << __func__ << " old nid_max " << nid_max << dendl;
+ nid_last = nid_max;
+ }
+ return 0;
+}
+
+void KStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+ if (o->onode.nid)
+ return;
+ std::lock_guard<std::mutex> l(nid_lock);
+ o->onode.nid = ++nid_last;
+ dout(20) << __func__ << " " << o->oid << " nid " << o->onode.nid << dendl;
+ if (nid_last > nid_max) {
+ nid_max += cct->_conf->kstore_nid_prealloc;
+ bufferlist bl;
+ encode(nid_max, bl);
+ txc->t->set(PREFIX_SUPER, "nid_max", bl);
+ dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+ }
+}
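+// Note on nid allocation (derived from the code above and
+// _open_super_meta): nids are handed out in batches -- "nid_max" is bumped
+// by kstore_nid_prealloc and persisted under PREFIX_SUPER, and a fresh
+// mount resumes nid_last from the stored nid_max, so some nids may be
+// skipped after a restart but none are ever reused.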
+
+KStore::TransContext *KStore::_txc_create(OpSequencer *osr)
+{
+ TransContext *txc = new TransContext(osr);
+ txc->t = db->get_transaction();
+ osr->queue_new(txc);
+ dout(20) << __func__ << " osr " << osr << " = " << txc << dendl;
+ return txc;
+}
+
+void KStore::_txc_state_proc(TransContext *txc)
+{
+ while (true) {
+ dout(10) << __func__ << " txc " << txc
+ << " " << txc->get_state_name() << dendl;
+ switch (txc->state) {
+ case TransContext::STATE_PREPARE:
+ txc->log_state_latency(logger, l_kstore_state_prepare_lat);
+ txc->state = TransContext::STATE_KV_QUEUED;
+ if (!cct->_conf->kstore_sync_transaction) {
+ std::lock_guard<std::mutex> l(kv_lock);
+ if (cct->_conf->kstore_sync_submit_transaction) {
+ int r = db->submit_transaction(txc->t);
+ ceph_assert(r == 0);
+ }
+ kv_queue.push_back(txc);
+ kv_cond.notify_one();
+ return;
+ }
+ {
+ int r = db->submit_transaction_sync(txc->t);
+ ceph_assert(r == 0);
+ }
+ break;
+
+ case TransContext::STATE_KV_QUEUED:
+ txc->log_state_latency(logger, l_kstore_state_kv_queued_lat);
+ txc->state = TransContext::STATE_KV_DONE;
+ _txc_finish_kv(txc);
+ // ** fall-thru **
+
+ case TransContext::STATE_KV_DONE:
+ txc->log_state_latency(logger, l_kstore_state_kv_done_lat);
+ txc->state = TransContext::STATE_FINISHING;
+ // ** fall-thru **
+
+ case TransContext::STATE_FINISHING:
+ txc->log_state_latency(logger, l_kstore_state_finishing_lat);
+ _txc_finish(txc);
+ return;
+
+ default:
+ derr << __func__ << " unexpected txc " << txc
+ << " state " << txc->get_state_name() << dendl;
+ ceph_abort_msg("unexpected txc state");
+ return;
+ }
+ }
+}
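+// Transaction state flow, as implemented above: STATE_PREPARE ->
+// STATE_KV_QUEUED (queued for _kv_sync_thread, or submitted synchronously
+// when kstore_sync_transaction is set) -> STATE_KV_DONE -> STATE_FINISHING
+// -> STATE_DONE (set in _txc_finish and reaped by _osr_reap_done).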
+
+void KStore::_txc_finalize(OpSequencer *osr, TransContext *txc)
+{
+ dout(20) << __func__ << " osr " << osr << " txc " << txc
+ << " onodes " << txc->onodes << dendl;
+
+ // finalize onodes
+ for (set<OnodeRef>::iterator p = txc->onodes.begin();
+ p != txc->onodes.end();
+ ++p) {
+ bufferlist bl;
+ encode((*p)->onode, bl);
+ dout(20) << " onode size is " << bl.length() << dendl;
+ txc->t->set(PREFIX_OBJ, (*p)->key, bl);
+
+ std::lock_guard<std::mutex> l((*p)->flush_lock);
+ (*p)->flush_txns.insert(txc);
+ }
+}
+
+void KStore::_txc_finish_kv(TransContext *txc)
+{
+ dout(20) << __func__ << " txc " << txc << dendl;
+
+ // warning: we're calling onreadable_sync inside the sequencer lock
+ if (txc->onreadable_sync) {
+ txc->onreadable_sync->complete(0);
+ txc->onreadable_sync = NULL;
+ }
+ if (txc->onreadable) {
+ finisher.queue(txc->onreadable);
+ txc->onreadable = NULL;
+ }
+ if (txc->oncommit) {
+ finisher.queue(txc->oncommit);
+ txc->oncommit = NULL;
+ }
+ if (!txc->oncommits.empty()) {
+ finisher.queue(txc->oncommits);
+ }
+
+ throttle_ops.put(txc->ops);
+ throttle_bytes.put(txc->bytes);
+}
+
+void KStore::_txc_finish(TransContext *txc)
+{
+ dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+ ceph_assert(txc->state == TransContext::STATE_FINISHING);
+
+ for (set<OnodeRef>::iterator p = txc->onodes.begin();
+ p != txc->onodes.end();
+ ++p) {
+ std::lock_guard<std::mutex> l((*p)->flush_lock);
+ dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns
+ << dendl;
+ ceph_assert((*p)->flush_txns.count(txc));
+ (*p)->flush_txns.erase(txc);
+ if ((*p)->flush_txns.empty()) {
+ (*p)->flush_cond.notify_all();
+ (*p)->clear_pending_stripes();
+ }
+ }
+
+ // clear out refs
+ txc->onodes.clear();
+
+ while (!txc->removed_collections.empty()) {
+ _queue_reap_collection(txc->removed_collections.front());
+ txc->removed_collections.pop_front();
+ }
+
+ OpSequencerRef osr = txc->osr;
+ {
+ std::lock_guard<std::mutex> l(osr->qlock);
+ txc->state = TransContext::STATE_DONE;
+ }
+
+ _osr_reap_done(osr.get());
+}
+
+void KStore::_osr_reap_done(OpSequencer *osr)
+{
+ std::lock_guard<std::mutex> l(osr->qlock);
+ dout(20) << __func__ << " osr " << osr << dendl;
+ while (!osr->q.empty()) {
+ TransContext *txc = &osr->q.front();
+ dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name()
+ << dendl;
+ if (txc->state != TransContext::STATE_DONE) {
+ break;
+ }
+
+ if (txc->first_collection) {
+ txc->first_collection->onode_map.trim(cct->_conf->kstore_onode_map_size);
+ }
+
+ osr->q.pop_front();
+ txc->log_state_latency(logger, l_kstore_state_done_lat);
+ delete txc;
+ osr->qcond.notify_all();
+ if (osr->q.empty())
+ dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+ }
+}
+
+void KStore::_kv_sync_thread()
+{
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock<std::mutex> l(kv_lock);
+ while (true) {
+ ceph_assert(kv_committing.empty());
+ if (kv_queue.empty()) {
+ if (kv_stop)
+ break;
+ dout(20) << __func__ << " sleep" << dendl;
+ kv_sync_cond.notify_all();
+ kv_cond.wait(l);
+ dout(20) << __func__ << " wake" << dendl;
+ } else {
+ dout(20) << __func__ << " committing " << kv_queue.size() << dendl;
+ kv_committing.swap(kv_queue);
+ utime_t start = ceph_clock_now();
+ l.unlock();
+
+ dout(30) << __func__ << " committing txc " << kv_committing << dendl;
+
+ // one transaction to force a sync
+ KeyValueDB::Transaction t = db->get_transaction();
+ if (!cct->_conf->kstore_sync_submit_transaction) {
+ for (std::deque<TransContext *>::iterator it = kv_committing.begin();
+ it != kv_committing.end();
+ ++it) {
+ int r = db->submit_transaction((*it)->t);
+ ceph_assert(r == 0);
+ }
+ }
+ int r = db->submit_transaction_sync(t);
+ ceph_assert(r == 0);
+ utime_t finish = ceph_clock_now();
+ utime_t dur = finish - start;
+ dout(20) << __func__ << " committed " << kv_committing.size()
+ << " in " << dur << dendl;
+ while (!kv_committing.empty()) {
+ TransContext *txc = kv_committing.front();
+ _txc_state_proc(txc);
+ kv_committing.pop_front();
+ }
+
+ // this is as good a place as any ...
+ _reap_collections();
+
+ l.lock();
+ }
+ }
+ dout(10) << __func__ << " finish" << dendl;
+}
+
+
+// ---------------------------
+// transactions
+
+int KStore::queue_transactions(
+ CollectionHandle& ch,
+ vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ Context *onreadable;
+ Context *ondisk;
+ Context *onreadable_sync;
+ ObjectStore::Transaction::collect_contexts(
+ tls, &onreadable, &ondisk, &onreadable_sync);
+
+ // set up the sequencer
+ Collection *c = static_cast<Collection*>(ch.get());
+ OpSequencer *osr = c->osr.get();
+ dout(10) << __func__ << " ch " << ch.get() << " " << c->cid << dendl;
+
+ // prepare
+ TransContext *txc = _txc_create(osr);
+ txc->onreadable = onreadable;
+ txc->onreadable_sync = onreadable_sync;
+ txc->oncommit = ondisk;
+
+ for (vector<Transaction>::iterator p = tls.begin(); p != tls.end(); ++p) {
+ txc->ops += (*p).get_num_ops();
+ txc->bytes += (*p).get_num_bytes();
+ _txc_add_transaction(txc, &(*p));
+ }
+
+ _txc_finalize(osr, txc);
+
+ throttle_ops.get(txc->ops);
+ throttle_bytes.get(txc->bytes);
+
+ // execute (start)
+ _txc_state_proc(txc);
+ return 0;
+}
+
+void KStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+ Transaction::iterator i = t->begin();
+
+ dout(30) << __func__ << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+
+ vector<CollectionRef> cvec(i.colls.size());
+ unsigned j = 0;
+ for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+ ++p, ++j) {
+ cvec[j] = _get_collection(*p);
+
+ // note first collection we reference
+ if (!j && !txc->first_collection)
+ txc->first_collection = cvec[j];
+ }
+ vector<OnodeRef> ovec(i.objects.size());
+
+ for (int pos = 0; i.have_op(); ++pos) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ // no coll or obj
+ if (op->op == Transaction::OP_NOP)
+ continue;
+
+ // collection operations
+ CollectionRef &c = cvec[op->cid];
+ switch (op->op) {
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _remove_collection(txc, cid, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ ceph_assert(!c);
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(txc, cid, op->split_bits, &c);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ uint32_t bits = op->split_bits;
+ r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+ if (!r)
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ uint32_t type = op->hint;
+ bufferlist hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ dout(10) << __func__ << " collection hint objects is a no-op, "
+ << " pg_num " << pg_num << " num_objects " << num_objs
+ << dendl;
+ } else {
+ // Ignore the hint
+ dout(10) << __func__ << " unknown collection hint " << type << dendl;
+ }
+ continue;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ r = -EOPNOTSUPP;
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ ceph_abort_msg("not implemented");
+ break;
+ }
+ if (r < 0) {
+ derr << " error " << cpp_strerror(r)
+ << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+
+ // object operations
+ std::unique_lock l{c->lock};
+ OnodeRef &o = ovec[op->oid];
+ if (!o) {
+ // these operations implicitly create the object
+ bool create = false;
+ if (op->op == Transaction::OP_TOUCH ||
+ op->op == Transaction::OP_CREATE ||
+ op->op == Transaction::OP_WRITE ||
+ op->op == Transaction::OP_ZERO) {
+ create = true;
+ }
+ ghobject_t oid = i.get_oid(op->oid);
+ o = c->get_onode(oid, create);
+ if (!create) {
+ if (!o || !o->exists) {
+ dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+ << oid << dendl;
+ r = -ENOENT;
+ goto endop;
+ }
+ }
+ }
+
+ switch (op->op) {
+ case Transaction::OP_TOUCH:
+ case Transaction::OP_CREATE:
+ r = _touch(txc, c, o);
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _write(txc, c, o, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(txc, c, o, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ uint64_t off = op->off;
+ r = _truncate(txc, c, o, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ r = _remove(txc, c, o);
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ string name = i.decode_string();
+ bufferlist bl;
+ i.decode_bl(bl);
+ map<string, bufferptr> to_set;
+ to_set[name] = bufferptr(bl.c_str(), bl.length());
+ r = _setattrs(txc, c, o, to_set);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ map<string, bufferptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(txc, c, o, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ string name = i.decode_string();
+ r = _rmattr(txc, c, o, name);
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ r = _rmattrs(txc, c, o);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _clone(txc, c, o, no);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ ceph_abort_msg("not implemented");
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ ceph_assert(op->cid == op->dest_cid);
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _rename(txc, c, o, no, noid);
+ o.reset();
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ const ghobject_t& noid = i.get_oid(op->dest_oid);
+ OnodeRef no = c->get_onode(noid, true);
+ r = _rename(txc, c, o, no, noid);
+ if (r == -ENOENT)
+ r = 0;
+ o.reset();
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ r = _omap_clear(txc, c, o);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ bufferlist aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(txc, c, o, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ bufferlist keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(txc, c, o, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkey_range(txc, c, o, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ bufferlist bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(txc, c, o, bl);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ uint64_t expected_object_size = op->expected_object_size;
+ uint64_t expected_write_size = op->expected_write_size;
+ uint32_t flags = op->hint;
+ r = _setallochint(txc, c, o,
+ expected_object_size,
+ expected_write_size,
+ flags);
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ endop:
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from key value store, misconfigured cluster";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ }
+
+ dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t->dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+ }
+ }
+}
+
+
+
+// -----------------
+// write operations
+
+int KStore::_touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ o->exists = true;
+ _assign_nid(txc, o);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void KStore::_dump_onode(OnodeRef o)
+{
+ dout(30) << __func__ << " " << o
+ << " nid " << o->onode.nid
+ << " size " << o->onode.size
+ << " expected_object_size " << o->onode.expected_object_size
+ << " expected_write_size " << o->onode.expected_write_size
+ << dendl;
+ for (map<string,bufferptr>::iterator p = o->onode.attrs.begin();
+ p != o->onode.attrs.end();
+ ++p) {
+ dout(30) << __func__ << " attr " << p->first
+ << " len " << p->second.length() << dendl;
+ }
+}
+
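+// Read one stripe of object data.  With do_cache the stripe is served from
+// (and added to) the onode's pending_stripes, so stripes written but not yet
+// committed remain visible and repeated kv lookups are avoided.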
+void KStore::_do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl, bool do_cache)
+{
+ if (!do_cache) {
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ db->get(PREFIX_DATA, key, pbl);
+ return;
+ }
+
+ map<uint64_t,bufferlist>::iterator p = o->pending_stripes.find(offset);
+ if (p == o->pending_stripes.end()) {
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ db->get(PREFIX_DATA, key, pbl);
+ o->pending_stripes[offset] = *pbl;
+ } else {
+ *pbl = p->second;
+ }
+}
+
+void KStore::_do_write_stripe(TransContext *txc, OnodeRef o,
+ uint64_t offset, bufferlist& bl)
+{
+ o->pending_stripes[offset] = bl;
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ txc->t->set(PREFIX_DATA, key, bl);
+}
+
+void KStore::_do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+ o->pending_stripes.erase(offset);
+ string key;
+ get_data_key(o->onode.nid, offset, &key);
+ txc->t->rmkey(PREFIX_DATA, key);
+}
+
+int KStore::_do_write(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ bufferlist& orig_bl,
+ uint32_t fadvise_flags)
+{
+ int r = 0;
+
+ dout(20) << __func__
+ << " " << o->oid << " " << offset << "~" << length
+ << " - have " << o->onode.size
+ << " bytes, nid " << o->onode.nid << dendl;
+ _dump_onode(o);
+ o->exists = true;
+
+ if (length == 0) {
+ return 0;
+ }
+
+ uint64_t stripe_size = o->onode.stripe_size;
+ if (!stripe_size) {
+ o->onode.stripe_size = cct->_conf->kstore_default_stripe_size;
+ stripe_size = o->onode.stripe_size;
+ }
+
+ unsigned bl_off = 0;
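+  // Apply the write stripe by stripe: a fully covered stripe is overwritten
+  // outright; a partially covered stripe is read back, merged with the new
+  // data (zero-filling any gap before the write offset) and rewritten.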
+ while (length > 0) {
+ uint64_t offset_rem = offset % stripe_size;
+ uint64_t end_rem = (offset + length) % stripe_size;
+ if (offset_rem == 0 && end_rem == 0) {
+ bufferlist bl;
+ bl.substr_of(orig_bl, bl_off, stripe_size);
+ dout(30) << __func__ << " full stripe " << offset << dendl;
+ _do_write_stripe(txc, o, offset, bl);
+ offset += stripe_size;
+ length -= stripe_size;
+ bl_off += stripe_size;
+ continue;
+ }
+ uint64_t stripe_off = offset - offset_rem;
+ bufferlist prev;
+ _do_read_stripe(o, stripe_off, &prev, true);
+ dout(20) << __func__ << " read previous stripe " << stripe_off
+ << ", got " << prev.length() << dendl;
+ bufferlist bl;
+ if (offset_rem) {
+ unsigned p = std::min<uint64_t>(prev.length(), offset_rem);
+ if (p) {
+ dout(20) << __func__ << " reuse leading " << p << " bytes" << dendl;
+ bl.substr_of(prev, 0, p);
+ }
+ if (p < offset_rem) {
+ dout(20) << __func__ << " add leading " << offset_rem - p << " zeros" << dendl;
+ bl.append_zero(offset_rem - p);
+ }
+ }
+ unsigned use = stripe_size - offset_rem;
+ if (use > length)
+ use -= stripe_size - end_rem;
+ dout(20) << __func__ << " using " << use << " for this stripe" << dendl;
+ bufferlist t;
+ t.substr_of(orig_bl, bl_off, use);
+ bl.claim_append(t);
+ bl_off += use;
+ if (end_rem) {
+ if (end_rem < prev.length()) {
+ unsigned l = prev.length() - end_rem;
+ dout(20) << __func__ << " reuse trailing " << l << " bytes" << dendl;
+ bufferlist t;
+ t.substr_of(prev, end_rem, l);
+ bl.claim_append(t);
+ }
+ }
+ dout(30) << " writing:\n";
+ bl.hexdump(*_dout);
+ *_dout << dendl;
+ _do_write_stripe(txc, o, stripe_off, bl);
+ offset += use;
+ length -= use;
+ }
+
+ if (offset > o->onode.size) {
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ o->onode.size = offset;
+ }
+
+ return r;
+}
+
+int KStore::_write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length,
+ bufferlist& bl,
+ uint32_t fadvise_flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << dendl;
+ _assign_nid(txc, o);
+ int r = _do_write(txc, o, offset, length, bl, fadvise_flags);
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t length)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << dendl;
+ int r = 0;
+ o->exists = true;
+
+ _dump_onode(o);
+ _assign_nid(txc, o);
+
+ uint64_t stripe_size = o->onode.stripe_size;
+ if (stripe_size) {
+ uint64_t end = offset + length;
+ uint64_t pos = offset;
+ uint64_t stripe_off = pos % stripe_size;
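+    // Stripes only partially covered by the zeroed range are read back,
+    // trimmed or zero-filled and rewritten; stripes fully inside the range
+    // are simply removed.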
+ while (pos < offset + length) {
+ if (stripe_off || end - pos < stripe_size) {
+ bufferlist stripe;
+ _do_read_stripe(o, pos - stripe_off, &stripe, true);
+ dout(30) << __func__ << " stripe " << pos - stripe_off << " got "
+ << stripe.length() << dendl;
+ bufferlist bl;
+ bl.substr_of(stripe, 0, std::min<uint64_t>(stripe.length(), stripe_off));
+ if (end >= pos - stripe_off + stripe_size ||
+ end >= o->onode.size) {
+ dout(20) << __func__ << " truncated stripe " << pos - stripe_off
+ << " to " << bl.length() << dendl;
+ } else {
+ auto len = end - (pos - stripe_off + bl.length());
+ bl.append_zero(len);
+ dout(20) << __func__ << " adding " << len << " of zeros" << dendl;
+ if (stripe.length() > bl.length()) {
+ unsigned l = stripe.length() - bl.length();
+ bufferlist t;
+ t.substr_of(stripe, stripe.length() - l, l);
+ dout(20) << __func__ << " keeping tail " << l << " of stripe" << dendl;
+ bl.claim_append(t);
+ }
+ }
+ _do_write_stripe(txc, o, pos - stripe_off, bl);
+ pos += stripe_size - stripe_off;
+ stripe_off = 0;
+ } else {
+ dout(20) << __func__ << " rm stripe " << pos << dendl;
+ _do_remove_stripe(txc, o, pos - stripe_off);
+ pos += stripe_size;
+ }
+ }
+ }
+ if (offset + length > o->onode.size) {
+ o->onode.size = offset + length;
+ dout(20) << __func__ << " extending size to " << offset + length
+ << dendl;
+ }
+ txc->write_onode(o);
+
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset << "~" << length
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+ uint64_t stripe_size = o->onode.stripe_size;
+
+ o->flush();
+
+ // trim down stripes
+ if (stripe_size) {
+ uint64_t pos = offset;
+ uint64_t stripe_off = pos % stripe_size;
+ while (pos < o->onode.size) {
+ if (stripe_off) {
+ bufferlist stripe;
+ _do_read_stripe(o, pos - stripe_off, &stripe, true);
+ dout(30) << __func__ << " stripe " << pos - stripe_off << " got "
+ << stripe.length() << dendl;
+ bufferlist t;
+ t.substr_of(stripe, 0, std::min<uint64_t>(stripe_off, stripe.length()));
+ _do_write_stripe(txc, o, pos - stripe_off, t);
+ dout(20) << __func__ << " truncated stripe " << pos - stripe_off
+ << " to " << t.length() << dendl;
+ pos += stripe_size - stripe_off;
+ stripe_off = 0;
+ } else {
+ dout(20) << __func__ << " rm stripe " << pos << dendl;
+ _do_remove_stripe(txc, o, pos - stripe_off);
+ pos += stripe_size;
+ }
+ }
+
+ // trim down cached tail
+ if (o->tail_bl.length()) {
+ if (offset / stripe_size != o->onode.size / stripe_size) {
+ dout(20) << __func__ << " clear cached tail" << dendl;
+ o->clear_tail();
+ }
+ }
+ }
+
+ o->onode.size = offset;
+ dout(10) << __func__ << " truncate size to " << offset << dendl;
+
+ txc->write_onode(o);
+ return 0;
+}
+
+int KStore::_truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset
+ << dendl;
+ int r = _do_truncate(txc, o, offset);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << offset
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_do_remove(TransContext *txc,
+ OnodeRef o)
+{
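+  // Drop the object's data stripes, clear any omap keys, then delete the
+  // onode record itself.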
+ string key;
+
+ _do_truncate(txc, o, 0);
+
+ o->onode.size = 0;
+ if (o->onode.omap_head) {
+ _do_omap_clear(txc, o->onode.omap_head);
+ }
+ o->exists = false;
+ o->onode = kstore_onode_t();
+ txc->onodes.erase(o);
+ get_object_key(cct, o->oid, &key);
+ txc->t->rmkey(PREFIX_OBJ, key);
+ return 0;
+}
+
+int KStore::_remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = _do_remove(txc, o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name,
+ bufferptr& val)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << dendl;
+ int r = 0;
+ o->onode.attrs[name] = val;
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " (" << val.length() << " bytes)"
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const map<string,bufferptr>& aset)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << dendl;
+ int r = 0;
+ for (map<string,bufferptr>::const_iterator p = aset.begin();
+ p != aset.end(); ++p) {
+ if (p->second.is_partial())
+ o->onode.attrs[p->first] = bufferptr(p->second.c_str(), p->second.length());
+ else
+ o->onode.attrs[p->first] = p->second;
+ }
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << aset.size() << " keys"
+ << " = " << r << dendl;
+ return r;
+}
+
+
+int KStore::_rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& name)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << dendl;
+ int r = 0;
+ o->onode.attrs.erase(name);
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " " << name << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ o->onode.attrs.clear();
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+void KStore::_do_omap_clear(TransContext *txc, uint64_t id)
+{
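+  // Remove every key belonging to this omap id: scan from the omap header
+  // key up to (but not including) the omap tail key.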
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string prefix, tail;
+ get_omap_header(id, &prefix);
+ get_omap_tail(id, &tail);
+ it->lower_bound(prefix);
+ while (it->valid()) {
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " stop at " << tail << dendl;
+ break;
+ }
+ txc->t->rmkey(PREFIX_OMAP, it->key());
+ dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
+ it->next();
+ }
+}
+
+int KStore::_omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ if (o->onode.omap_head != 0) {
+ _do_omap_clear(txc, o->onode.omap_head);
+ }
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ bufferlist &bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ auto p = bl.cbegin();
+ __u32 num;
+ if (!o->onode.omap_head) {
+ o->onode.omap_head = o->onode.nid;
+ txc->write_onode(o);
+ }
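+  // bl carries an encoded count followed by (key, value) pairs; each key is
+  // rewritten with the omap_head prefix before being stored.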
+ decode(num, p);
+ while (num--) {
+ string key;
+ bufferlist value;
+ decode(key, p);
+ decode(value, p);
+ string final_key;
+ get_omap_key(o->onode.omap_head, key, &final_key);
+ dout(30) << __func__ << " " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->set(PREFIX_OMAP, final_key, value);
+ }
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef &o,
+ bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r;
+ string key;
+ if (!o->onode.omap_head) {
+ o->onode.omap_head = o->onode.nid;
+ txc->write_onode(o);
+ }
+ get_omap_header(o->onode.omap_head, &key);
+ txc->t->set(PREFIX_OMAP, key, bl);
+ r = 0;
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const bufferlist& bl)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ int r = 0;
+ auto p = bl.cbegin();
+ __u32 num;
+
+ if (!o->onode.omap_head) {
+ r = 0;
+ goto out;
+ }
+ decode(num, p);
+ while (num--) {
+ string key;
+ decode(key, p);
+ string final_key;
+ get_omap_key(o->onode.omap_head, key, &final_key);
+ dout(30) << __func__ << " rm " << pretty_binary_string(final_key)
+ << " <- " << key << dendl;
+ txc->t->rmkey(PREFIX_OMAP, final_key);
+ }
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const string& first, const string& last)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+ KeyValueDB::Iterator it;
+ string key_first, key_last;
+ int r = 0;
+
+ if (!o->onode.omap_head) {
+ goto out;
+ }
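+  // Delete every omap key in [first, last) by scanning the object's omap
+  // prefix between the two bounds.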
+ it = db->get_iterator(PREFIX_OMAP);
+ get_omap_key(o->onode.omap_head, first, &key_first);
+ get_omap_key(o->onode.omap_head, last, &key_last);
+ it->lower_bound(key_first);
+ while (it->valid()) {
+ if (it->key() >= key_last) {
+ dout(30) << __func__ << " stop at " << pretty_binary_string(key_last)
+ << dendl;
+ break;
+ }
+ txc->t->rmkey(PREFIX_OMAP, it->key());
+ dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl;
+ it->next();
+ }
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_setallochint(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags)
+{
+ dout(15) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " flags " << flags
+ << dendl;
+ int r = 0;
+ o->onode.expected_object_size = expected_object_size;
+ o->onode.expected_write_size = expected_write_size;
+ o->onode.alloc_hint_flags = flags;
+
+ txc->write_onode(o);
+ dout(10) << __func__ << " " << c->cid << " " << o->oid
+ << " object_size " << expected_object_size
+ << " write_size " << expected_write_size
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << dendl;
+ int r = 0;
+ if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+ derr << __func__ << " mismatched hash on " << oldo->oid
+ << " and " << newo->oid << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist bl;
+ newo->exists = true;
+ _assign_nid(txc, newo);
+
+ // data
+ oldo->flush();
+
+ r = _do_read(oldo, 0, oldo->onode.size, bl, true, 0);
+ if (r < 0)
+ goto out;
+
+ // truncate any old data
+ r = _do_truncate(txc, newo, 0);
+ if (r < 0)
+ goto out;
+
+ r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0);
+ if (r < 0)
+ goto out;
+
+ newo->onode.attrs = oldo->onode.attrs;
+
+ // clone omap
+ if (newo->onode.omap_head) {
+ dout(20) << __func__ << " clearing old omap data" << dendl;
+ _do_omap_clear(txc, newo->onode.omap_head);
+ }
+ if (oldo->onode.omap_head) {
+ dout(20) << __func__ << " copying omap data" << dendl;
+ if (!newo->onode.omap_head) {
+ newo->onode.omap_head = newo->onode.nid;
+ }
+ KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+ string head, tail;
+ get_omap_header(oldo->onode.omap_head, &head);
+ get_omap_tail(oldo->onode.omap_head, &tail);
+ it->lower_bound(head);
+ while (it->valid()) {
+ string key;
+ if (it->key() >= tail) {
+ dout(30) << __func__ << " reached tail" << dendl;
+ break;
+ } else {
+ dout(30) << __func__ << " got header/data "
+ << pretty_binary_string(it->key()) << dendl;
+ ceph_assert(it->key() < tail);
+ rewrite_omap_key(newo->onode.omap_head, it->key(), &key);
+ txc->t->set(PREFIX_OMAP, key, it->value());
+ }
+ it->next();
+ }
+ }
+
+ txc->write_onode(newo);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from " << srcoff << "~" << length
+ << " to offset " << dstoff << dendl;
+ int r = 0;
+
+ bufferlist bl;
+ newo->exists = true;
+ _assign_nid(txc, newo);
+
+ r = _do_read(oldo, srcoff, length, bl, true, 0);
+ if (r < 0)
+ goto out;
+
+ r = _do_write(txc, newo, dstoff, bl.length(), bl, 0);
+ if (r < 0)
+ goto out;
+
+ txc->write_onode(newo);
+
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << newo->oid << " from " << srcoff << "~" << length
+ << " to offset " << dstoff
+ << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid)
+{
+ dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+ << new_oid << dendl;
+ int r;
+ ghobject_t old_oid = oldo->oid;
+ bufferlist bl;
+ string old_key, new_key;
+
+ if (newo && newo->exists) {
+ // destination object already exists, remove it first
+ r = _do_remove(txc, newo);
+ if (r < 0)
+ goto out;
+ }
+
+ txc->t->rmkey(PREFIX_OBJ, oldo->key);
+ txc->write_onode(oldo);
+ c->onode_map.rename(old_oid, new_oid); // this adjusts oldo->{oid,key}
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+ << new_oid << " = " << r << dendl;
+ return r;
+}
+
+// collections
+
+int KStore::_create_collection(
+ TransContext *txc,
+ coll_t cid,
+ unsigned bits,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+ int r;
+ bufferlist bl;
+
+ {
+ std::unique_lock l{coll_lock};
+ if (*c) {
+ r = -EEXIST;
+ goto out;
+ }
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ *c = p->second;
+ ceph_assert((*c)->cid == cid);
+ (*c)->cnode.bits = bits;
+ coll_map[cid] = *c;
+ new_coll_map.erase(p);
+ }
+ encode((*c)->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(cid), bl);
+ r = 0;
+
+ out:
+ dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_remove_collection(TransContext *txc, coll_t cid,
+ CollectionRef *c)
+{
+ dout(15) << __func__ << " " << cid << dendl;
+ int r;
+
+ {
+ std::unique_lock l{coll_lock};
+ if (!*c) {
+ r = -ENOENT;
+ goto out;
+ }
+ size_t nonexistent_count = 0;
+ pair<ghobject_t,OnodeRef> next_onode;
+ while ((*c)->onode_map.get_next(next_onode.first, &next_onode)) {
+ if (next_onode.second->exists) {
+ r = -ENOTEMPTY;
+ goto out;
+ }
+ ++nonexistent_count;
+ }
+ vector<ghobject_t> ls;
+ ghobject_t next;
+    // Enumerate onodes in the db, up to nonexistent_count + 1, and then
+    // check whether all of them are marked as non-existent.
+    // Bypass the check if the returned number is greater than nonexistent_count.
+ r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+ nonexistent_count + 1, &ls, &next);
+ if (r >= 0) {
+ bool exists = false; //ls.size() > nonexistent_count;
+ for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+ dout(10) << __func__ << " oid " << *it << dendl;
+ auto onode = (*c)->onode_map.lookup(*it);
+ exists = !onode || onode->exists;
+ if (exists) {
+ dout(10) << __func__ << " " << *it
+ << " exists in db" << dendl;
+ }
+ }
+ if (!exists) {
+ coll_map.erase(cid);
+ txc->removed_collections.push_back(*c);
+ c->reset();
+ txc->t->rmkey(PREFIX_COLL, stringify(cid));
+ r = 0;
+ } else {
+ dout(10) << __func__ << " " << cid
+ << " is non-empty" << dendl;
+ r = -ENOTEMPTY;
+ }
+ }
+ }
+
+ out:
+ dout(10) << __func__ << " " << cid << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem)
+{
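+  // No object data moves on a split: membership is derived from the object
+  // hash and the collection's bit count (see Collection::contains), so it is
+  // enough to drop the in-memory onode caches and record the new bit count.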
+ dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ int r;
+ std::unique_lock l{c->lock};
+ std::unique_lock l2{d->lock};
+ c->onode_map.clear();
+ d->onode_map.clear();
+ c->cnode.bits = bits;
+ ceph_assert(d->cnode.bits == bits);
+ r = 0;
+
+ bufferlist bl;
+ encode(c->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+ dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+int KStore::_merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits)
+{
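+  // As with split, no object data moves: adopt the bit count on the
+  // destination, drop both in-memory onode caches, and remove the source
+  // collection's record.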
+ dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid << " "
+ << " bits " << bits << dendl;
+ int r;
+ std::scoped_lock l{(*c)->lock, d->lock};
+ (*c)->onode_map.clear();
+ d->onode_map.clear();
+ d->cnode.bits = bits;
+ r = 0;
+
+ coll_t cid = (*c)->cid;
+
+ bufferlist bl;
+ encode(d->cnode, bl);
+ txc->t->set(PREFIX_COLL, stringify(d->cid), bl);
+
+ coll_map.erase((*c)->cid);
+ txc->removed_collections.push_back(*c);
+ c->reset();
+ txc->t->rmkey(PREFIX_COLL, stringify(cid));
+
+ dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+ << " bits " << bits << " = " << r << dendl;
+ return r;
+}
+
+// ===========================================
diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h
new file mode 100644
index 000000000..b76b4dcfb
--- /dev/null
+++ b/src/os/kstore/KStore.h
@@ -0,0 +1,698 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_H
+#define CEPH_OSD_KSTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "include/ceph_assert.h"
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/RWLock.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "os/ObjectStore.h"
+#include "common/perf_counters.h"
+#include "os/fs/FS.h"
+#include "kv/KeyValueDB.h"
+
+#include "kstore_types.h"
+
+#include "boost/intrusive/list.hpp"
+
+enum {
+ l_kstore_first = 832430,
+ l_kstore_state_prepare_lat,
+ l_kstore_state_kv_queued_lat,
+ l_kstore_state_kv_done_lat,
+ l_kstore_state_finishing_lat,
+ l_kstore_state_done_lat,
+ l_kstore_last
+};
+
+class KStore : public ObjectStore {
+ // -----------------------------------------------------
+ // types
+public:
+
+ struct TransContext;
+
+ /// an in-memory object
+ struct Onode {
+ CephContext* cct;
+ std::atomic_int nref; ///< reference count
+
+ ghobject_t oid;
+ std::string key; ///< key under PREFIX_OBJ where we are stored
+ boost::intrusive::list_member_hook<> lru_item;
+
+ kstore_onode_t onode; ///< metadata stored as value in kv store
+ bool dirty; // ???
+ bool exists;
+
+ std::mutex flush_lock; ///< protect flush_txns
+ std::condition_variable flush_cond; ///< wait here for unapplied txns
+ std::set<TransContext*> flush_txns; ///< committing txns
+
+ uint64_t tail_offset;
+ ceph::buffer::list tail_bl;
+
+ std::map<uint64_t,ceph::buffer::list> pending_stripes; ///< unwritten stripes
+
+ Onode(CephContext* cct, const ghobject_t& o, const std::string& k)
+ : cct(cct),
+ nref(0),
+ oid(o),
+ key(k),
+ dirty(false),
+ exists(false),
+ tail_offset(0) {
+ }
+
+ void flush();
+ void get() {
+ ++nref;
+ }
+ void put() {
+ if (--nref == 0)
+ delete this;
+ }
+
+ void clear_tail() {
+ tail_offset = 0;
+ tail_bl.clear();
+ }
+ void clear_pending_stripes() {
+ pending_stripes.clear();
+ }
+ };
+ typedef boost::intrusive_ptr<Onode> OnodeRef;
+
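+  /// per-collection cache of in-memory onodes: a hash map for lookups plus
+  /// an LRU list used for trimming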
+ struct OnodeHashLRU {
+ CephContext* cct;
+ typedef boost::intrusive::list<
+ Onode,
+ boost::intrusive::member_hook<
+ Onode,
+ boost::intrusive::list_member_hook<>,
+ &Onode::lru_item> > lru_list_t;
+
+ std::mutex lock;
+ ceph::unordered_map<ghobject_t,OnodeRef> onode_map; ///< forward lookups
+ lru_list_t lru; ///< lru
+
+ OnodeHashLRU(CephContext* cct) : cct(cct) {}
+
+ void add(const ghobject_t& oid, OnodeRef o);
+ void _touch(OnodeRef o);
+ OnodeRef lookup(const ghobject_t& o);
+ void rename(const ghobject_t& old_oid, const ghobject_t& new_oid);
+ void clear();
+ bool get_next(const ghobject_t& after, std::pair<ghobject_t,OnodeRef> *next);
+ int trim(int max=-1);
+ };
+
+ class OpSequencer;
+ typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+ struct Collection : public CollectionImpl {
+ KStore *store;
+ kstore_cnode_t cnode;
+ ceph::shared_mutex lock =
+ ceph::make_shared_mutex("KStore::Collection::lock", true, false);
+
+ OpSequencerRef osr;
+
+ // cache onodes on a per-collection basis to avoid lock
+ // contention.
+ OnodeHashLRU onode_map;
+
+ OnodeRef get_onode(const ghobject_t& oid, bool create);
+
+ bool contains(const ghobject_t& oid) {
+ if (cid.is_meta())
+ return oid.hobj.pool == -1;
+ spg_t spgid;
+ if (cid.is_pg(&spgid))
+ return
+ spgid.pgid.contains(cnode.bits, oid) &&
+ oid.shard_id == spgid.shard;
+ return false;
+ }
+
+ void flush() override;
+ bool flush_commit(Context *c) override;
+
+ private:
+ FRIEND_MAKE_REF(Collection);
+ Collection(KStore *ns, coll_t c);
+ };
+ using CollectionRef = ceph::ref_t<Collection>;
+
+ class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ OnodeRef o;
+ KeyValueDB::Iterator it;
+ std::string head, tail;
+ public:
+ OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+ int seek_to_first() override;
+ int upper_bound(const std::string &after) override;
+ int lower_bound(const std::string &to) override;
+ bool valid() override;
+ int next() override;
+ std::string key() override;
+ ceph::buffer::list value() override;
+ int status() override {
+ return 0;
+ }
+ };
+
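+  /// carries one transaction through the prepare -> kv commit -> finish
+  /// state machine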
+ struct TransContext {
+ typedef enum {
+ STATE_PREPARE,
+ STATE_AIO_WAIT,
+ STATE_IO_DONE,
+ STATE_KV_QUEUED,
+ STATE_KV_COMMITTING,
+ STATE_KV_DONE,
+ STATE_FINISHING,
+ STATE_DONE,
+ } state_t;
+
+ state_t state;
+
+ const char *get_state_name() {
+ switch (state) {
+ case STATE_PREPARE: return "prepare";
+ case STATE_AIO_WAIT: return "aio_wait";
+ case STATE_IO_DONE: return "io_done";
+ case STATE_KV_QUEUED: return "kv_queued";
+ case STATE_KV_COMMITTING: return "kv_committing";
+ case STATE_KV_DONE: return "kv_done";
+ case STATE_FINISHING: return "finishing";
+ case STATE_DONE: return "done";
+ }
+ return "???";
+ }
+
+ void log_state_latency(PerfCounters *logger, int state) {
+ utime_t lat, now = ceph_clock_now();
+ lat = now - start;
+ logger->tinc(state, lat);
+ start = now;
+ }
+
+ CollectionRef ch;
+ OpSequencerRef osr;
+ boost::intrusive::list_member_hook<> sequencer_item;
+
+ uint64_t ops, bytes;
+
+ std::set<OnodeRef> onodes; ///< these onodes need to be updated/written
+ KeyValueDB::Transaction t; ///< then we will commit this
+ Context *oncommit; ///< signal on commit
+ Context *onreadable; ///< signal on readable
+ Context *onreadable_sync; ///< signal on readable
+ std::list<Context*> oncommits; ///< more commit completions
+ std::list<CollectionRef> removed_collections; ///< colls we removed
+
+ CollectionRef first_collection; ///< first referenced collection
+ utime_t start;
+ explicit TransContext(OpSequencer *o)
+ : state(STATE_PREPARE),
+ osr(o),
+ ops(0),
+ bytes(0),
+ oncommit(NULL),
+ onreadable(NULL),
+ onreadable_sync(NULL),
+ start(ceph_clock_now()){
+ //cout << "txc new " << this << std::endl;
+ }
+ ~TransContext() {
+ //cout << "txc del " << this << std::endl;
+ }
+
+ void write_onode(OnodeRef &o) {
+ onodes.insert(o);
+ }
+ };
+
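+  /// tracks the transactions queued against a collection so that flush() and
+  /// flush_commit() can wait for them to commit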
+ class OpSequencer : public RefCountedObject {
+ public:
+ std::mutex qlock;
+ std::condition_variable qcond;
+ typedef boost::intrusive::list<
+ TransContext,
+ boost::intrusive::member_hook<
+ TransContext,
+ boost::intrusive::list_member_hook<>,
+ &TransContext::sequencer_item> > q_list_t;
+ q_list_t q; ///< transactions
+
+ ~OpSequencer() {
+ ceph_assert(q.empty());
+ }
+
+ void queue_new(TransContext *txc) {
+ std::lock_guard<std::mutex> l(qlock);
+ q.push_back(*txc);
+ }
+
+ void flush() {
+ std::unique_lock<std::mutex> l(qlock);
+ while (!q.empty())
+ qcond.wait(l);
+ }
+
+ bool flush_commit(Context *c) {
+ std::lock_guard<std::mutex> l(qlock);
+ if (q.empty()) {
+ return true;
+ }
+ TransContext *txc = &q.back();
+ if (txc->state >= TransContext::STATE_KV_DONE) {
+ return true;
+ }
+ ceph_assert(txc->state < TransContext::STATE_KV_DONE);
+ txc->oncommits.push_back(c);
+ return false;
+ }
+ };
+
+ struct KVSyncThread : public Thread {
+ KStore *store;
+ explicit KVSyncThread(KStore *s) : store(s) {}
+ void *entry() override {
+ store->_kv_sync_thread();
+ return NULL;
+ }
+ };
+
+ // --------------------------------------------------------
+ // members
+private:
+ KeyValueDB *db;
+ uuid_d fsid;
+ std::string basedir;
+ int path_fd; ///< open handle to $path
+ int fsid_fd; ///< open handle (locked) to $path/fsid
+ bool mounted;
+
+ /// rwlock to protect coll_map
+ ceph::shared_mutex coll_lock = ceph::make_shared_mutex("KStore::coll_lock");
+ ceph::unordered_map<coll_t, CollectionRef> coll_map;
+ std::map<coll_t,CollectionRef> new_coll_map;
+
+ std::mutex nid_lock;
+ uint64_t nid_last;
+ uint64_t nid_max;
+
+ Throttle throttle_ops, throttle_bytes; ///< submit to commit
+
+ Finisher finisher;
+
+ KVSyncThread kv_sync_thread;
+ std::mutex kv_lock;
+ std::condition_variable kv_cond, kv_sync_cond;
+ bool kv_stop;
+ std::deque<TransContext*> kv_queue, kv_committing;
+
+ //Logger *logger;
+ PerfCounters *logger;
+ std::mutex reap_lock;
+ std::list<CollectionRef> removed_collections;
+
+
+ // --------------------------------------------------------
+ // private methods
+
+ void _init_logger();
+ void _shutdown_logger();
+
+ int _open_path();
+ void _close_path();
+ int _open_fsid(bool create);
+ int _lock_fsid();
+ int _read_fsid(uuid_d *f);
+ int _write_fsid();
+ void _close_fsid();
+ int _open_db(bool create);
+ void _close_db();
+ int _open_collections(int *errors=0);
+ void _close_collections();
+
+ int _open_super_meta();
+
+ CollectionRef _get_collection(coll_t cid);
+ void _queue_reap_collection(CollectionRef& c);
+ void _reap_collections();
+
+ void _assign_nid(TransContext *txc, OnodeRef o);
+
+ void _dump_onode(OnodeRef o);
+
+ TransContext *_txc_create(OpSequencer *osr);
+ void _txc_release(TransContext *txc, uint64_t offset, uint64_t length);
+ void _txc_add_transaction(TransContext *txc, Transaction *t);
+ void _txc_finalize(OpSequencer *osr, TransContext *txc);
+ void _txc_state_proc(TransContext *txc);
+ void _txc_finish_kv(TransContext *txc);
+ void _txc_finish(TransContext *txc);
+
+ void _osr_reap_done(OpSequencer *osr);
+
+ void _kv_sync_thread();
+ void _kv_stop() {
+ {
+ std::lock_guard<std::mutex> l(kv_lock);
+ kv_stop = true;
+ kv_cond.notify_all();
+ }
+ kv_sync_thread.join();
+ kv_stop = false;
+ }
+
+ void _do_read_stripe(OnodeRef o, uint64_t offset, ceph::buffer::list *pbl, bool do_cache);
+ void _do_write_stripe(TransContext *txc, OnodeRef o,
+ uint64_t offset, ceph::buffer::list& bl);
+ void _do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset);
+
+ int _collection_list(
+ Collection *c, const ghobject_t& start, const ghobject_t& end,
+ int max, std::vector<ghobject_t> *ls, ghobject_t *next);
+
+public:
+ KStore(CephContext *cct, const std::string& path);
+ ~KStore() override;
+
+ std::string get_type() override {
+ return "kstore";
+ }
+
+ bool needs_journal() override { return false; };
+ bool wants_journal() override { return false; };
+ bool allows_journal() override { return false; };
+
+ static int get_block_device_fsid(const std::string& path, uuid_d *fsid);
+
+ bool test_mount_in_use() override;
+
+ int mount() override;
+ int umount() override;
+ void _sync();
+
+ int fsck(bool deep) override;
+
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+ void dump_perf_counters(ceph::Formatter *f) override {
+ f->open_object_section("perf_counters");
+ logger->dump_formatted(f, false);
+ f->close_section();
+ }
+ void get_db_statistics(ceph::Formatter *f) override {
+ db->get_statistics(f);
+ }
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap) override;
+
+ CollectionHandle open_collection(const coll_t& c) override;
+ CollectionHandle create_new_collection(const coll_t& c) override;
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ using ObjectStore::exists;
+ bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+ using ObjectStore::stat;
+ int stat(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio = false) override; // struct stat?
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ using ObjectStore::read;
+ int read(
+ CollectionHandle& c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) override;
+ int _do_read(
+ OnodeRef o,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ bool do_cache,
+ uint32_t op_flags = 0);
+
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& outbl) override;
+ using ObjectStore::getattr;
+ int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::buffer::ptr& value) override;
+ using ObjectStore::getattrs;
+ int getattrs(CollectionHandle& c, const ghobject_t& oid, std::map<std::string,ceph::buffer::ptr>& aset) override;
+
+ int list_collections(std::vector<coll_t>& ls) override;
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(
+ CollectionHandle &c, const ghobject_t& start, const ghobject_t& end,
+ int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ using ObjectStore::omap_get;
+ int omap_get(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out ///< [out] key-to-value map
+ ) override;
+
+ using ObjectStore::omap_get_header;
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ using ObjectStore::omap_get_keys;
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ using ObjectStore::omap_get_values;
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) override;
+
+ using ObjectStore::omap_check_keys;
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle& c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override {
+ fsid = u;
+ }
+ uuid_d get_fsid() override {
+ return fsid;
+ }
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+ return num_objects * 300; //assuming per-object overhead is 300 bytes
+ }
+
+ objectstore_perf_stat_t get_cur_stats() override {
+ return objectstore_perf_stat_t();
+ }
+ const PerfCounters* get_perf_counters() const override {
+ return logger;
+ }
+
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ std::vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+
+ void compact () override {
+ ceph_assert(db);
+ db->compact();
+ }
+
+private:
+ // --------------------------------------------------------
+ // write ops
+
+ int _write(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len,
+ ceph::buffer::list& bl,
+ uint32_t fadvise_flags);
+ int _do_write(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset, uint64_t length,
+ ceph::buffer::list& bl,
+ uint32_t fadvise_flags);
+ int _touch(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _zero(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset, size_t len);
+ int _do_truncate(TransContext *txc,
+ OnodeRef o,
+ uint64_t offset);
+ int _truncate(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t offset);
+ int _remove(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _do_remove(TransContext *txc,
+ OnodeRef o);
+ int _setattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& name,
+ ceph::buffer::ptr& val);
+ int _setattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::map<std::string,ceph::buffer::ptr>& aset);
+ int _rmattr(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& name);
+ int _rmattrs(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ void _do_omap_clear(TransContext *txc, uint64_t id);
+ int _omap_clear(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o);
+ int _omap_setkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ ceph::buffer::list& bl);
+ int _omap_setheader(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ ceph::buffer::list& header);
+ int _omap_rmkeys(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const ceph::buffer::list& bl);
+ int _omap_rmkey_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ const std::string& first, const std::string& last);
+ int _setallochint(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+ int _clone(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo);
+ int _clone_range(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ uint64_t srcoff, uint64_t length, uint64_t dstoff);
+ int _rename(TransContext *txc,
+ CollectionRef& c,
+ OnodeRef& oldo,
+ OnodeRef& newo,
+ const ghobject_t& new_oid);
+ int _create_collection(TransContext *txc, coll_t cid, unsigned bits,
+ CollectionRef *c);
+ int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c);
+ int _split_collection(TransContext *txc,
+ CollectionRef& c,
+ CollectionRef& d,
+ unsigned bits, int rem);
+ int _merge_collection(TransContext *txc,
+ CollectionRef *c,
+ CollectionRef& d,
+ unsigned bits);
+
+};
+
+static inline void intrusive_ptr_add_ref(KStore::Onode *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(KStore::Onode *o) {
+ o->put();
+}
+
+static inline void intrusive_ptr_add_ref(KStore::OpSequencer *o) {
+ o->get();
+}
+static inline void intrusive_ptr_release(KStore::OpSequencer *o) {
+ o->put();
+}
+
+#endif
diff --git a/src/os/kstore/kstore_types.cc b/src/os/kstore/kstore_types.cc
new file mode 100644
index 000000000..885c52b60
--- /dev/null
+++ b/src/os/kstore/kstore_types.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "kstore_types.h"
+#include "common/Formatter.h"
+#include "include/stringify.h"
+
+using std::list;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// cnode_t
+
+void kstore_cnode_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(bits, bl);
+ ENCODE_FINISH(bl);
+}
+
+void kstore_cnode_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(bits, p);
+ DECODE_FINISH(p);
+}
+
+void kstore_cnode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("bits", bits);
+}
+
+void kstore_cnode_t::generate_test_instances(list<kstore_cnode_t*>& o)
+{
+ o.push_back(new kstore_cnode_t());
+ o.push_back(new kstore_cnode_t(0));
+ o.push_back(new kstore_cnode_t(123));
+}
+
+
+// kstore_onode_t
+
+void kstore_onode_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(nid, bl);
+ encode(size, bl);
+ encode(attrs, bl);
+ encode(omap_head, bl);
+ encode(stripe_size, bl);
+ encode(expected_object_size, bl);
+ encode(expected_write_size, bl);
+ encode(alloc_hint_flags, bl);
+ ENCODE_FINISH(bl);
+}
+
+void kstore_onode_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(nid, p);
+ decode(size, p);
+ decode(attrs, p);
+ decode(omap_head, p);
+ decode(stripe_size, p);
+ decode(expected_object_size, p);
+ decode(expected_write_size, p);
+ decode(alloc_hint_flags, p);
+ DECODE_FINISH(p);
+}
+
+void kstore_onode_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("nid", nid);
+ f->dump_unsigned("size", size);
+ f->open_object_section("attrs");
+ for (auto p = attrs.begin(); p != attrs.end(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first);
+ f->dump_unsigned("len", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("omap_head", omap_head);
+ f->dump_unsigned("stripe_size", stripe_size);
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+void kstore_onode_t::generate_test_instances(list<kstore_onode_t*>& o)
+{
+ o.push_back(new kstore_onode_t());
+ // FIXME
+}
diff --git a/src/os/kstore/kstore_types.h b/src/os/kstore/kstore_types.h
new file mode 100644
index 000000000..859ea8f7d
--- /dev/null
+++ b/src/os/kstore/kstore_types.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_TYPES_H
+#define CEPH_OSD_KSTORE_TYPES_H
+
+#include <ostream>
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+
+namespace ceph {
+ class Formatter;
+}
+/// collection metadata
+struct kstore_cnode_t {
+ uint32_t bits; ///< how many bits of coll pgid are significant
+
+ explicit kstore_cnode_t(int b=0) : bits(b) {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<kstore_cnode_t*>& o);
+};
+WRITE_CLASS_ENCODER(kstore_cnode_t)
+
+/// onode: per-object metadata
+struct kstore_onode_t {
+ uint64_t nid; ///< numeric id (locally unique)
+ uint64_t size; ///< object size
+ std::map<std::string, ceph::buffer::ptr> attrs; ///< attrs
+ uint64_t omap_head; ///< id for omap root node
+ uint32_t stripe_size; ///< stripe size
+
+ uint32_t expected_object_size;
+ uint32_t expected_write_size;
+ uint32_t alloc_hint_flags;
+
+ kstore_onode_t()
+ : nid(0),
+ size(0),
+ omap_head(0),
+ stripe_size(0),
+ expected_object_size(0),
+ expected_write_size(0),
+ alloc_hint_flags(0) {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<kstore_onode_t*>& o);
+};
+WRITE_CLASS_ENCODER(kstore_onode_t)
+
+#endif
diff --git a/src/os/kv.h b/src/os/kv.h
new file mode 100644
index 000000000..64048b088
--- /dev/null
+++ b/src/os/kv.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_KV_H
+#define CEPH_OS_KV_H
+
+#include <string>
+#include "include/byteorder.h"
+
+// some key encoding helpers
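+// (integers are appended in big-endian byte order so the lexicographic order
+// of encoded keys matches numeric order)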
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, T *key) {
+ uint32_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->append((char*)&bu, 4);
+}
+
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, size_t pos, T *key) {
+ uint32_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->replace(pos, sizeof(bu), (char*)&bu, sizeof(bu));
+}
+
+inline static const char *_key_decode_u32(const char *key, uint32_t *pu) {
+ uint32_t bu;
+ memcpy(&bu, key, 4);
+#ifdef CEPH_BIG_ENDIAN
+ *pu = bu;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ *pu = swab(bu);
+#else
+# error wtf
+#endif
+ return key + 4;
+}
+
+template<typename T>
+inline static void _key_encode_u64(uint64_t u, T *key) {
+ uint64_t bu;
+#ifdef CEPH_BIG_ENDIAN
+ bu = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ bu = swab(u);
+#else
+# error wtf
+#endif
+ key->append((char*)&bu, 8);
+}
+
+inline static const char *_key_decode_u64(const char *key, uint64_t *pu) {
+ uint64_t bu;
+ memcpy(&bu, key, 8);
+#ifdef CEPH_BIG_ENDIAN
+ *pu = bu;
+#elif defined(CEPH_LITTLE_ENDIAN)
+ *pu = swab(bu);
+#else
+# error wtf
+#endif
+ return key + 8;
+}
+
+#endif
diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc
new file mode 100644
index 000000000..dc29ab1b6
--- /dev/null
+++ b/src/os/memstore/MemStore.cc
@@ -0,0 +1,1800 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#include "include/types.h"
+#include "include/stringify.h"
+#include "include/unordered_map.h"
+#include "common/errno.h"
+#include "MemStore.h"
+#include "include/compat.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "memstore(" << path << ") "
+
+using ceph::decode;
+using ceph::encode;
+
+// for comparing collections for lock ordering
+bool operator>(const MemStore::CollectionRef& l,
+ const MemStore::CollectionRef& r)
+{
+ return (unsigned long)l.get() > (unsigned long)r.get();
+}
+
+
+int MemStore::mount()
+{
+ int r = _load();
+ if (r < 0)
+ return r;
+ finisher.start();
+ return 0;
+}
+
+int MemStore::umount()
+{
+ finisher.wait_for_empty();
+ finisher.stop();
+ return _save();
+}
+
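+// Persist the in-memory store: each collection is encoded into
+// $path/<cid>, and the set of collection ids into $path/collections.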
+int MemStore::_save()
+{
+ dout(10) << __func__ << dendl;
+ dump_all();
+ std::set<coll_t> collections;
+ for (auto p = coll_map.begin(); p != coll_map.end(); ++p) {
+ dout(20) << __func__ << " coll " << p->first << " " << p->second << dendl;
+ collections.insert(p->first);
+ ceph::buffer::list bl;
+ ceph_assert(p->second);
+ p->second->encode(bl);
+ std::string fn = path + "/" + stringify(p->first);
+ int r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ std::string fn = path + "/collections";
+ ceph::buffer::list bl;
+ encode(collections, bl);
+ int r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+void MemStore::dump_all()
+{
+ auto f = ceph::Formatter::create("json-pretty");
+ f->open_object_section("store");
+ dump(f);
+ f->close_section();
+ dout(0) << "dump:";
+ f->flush(*_dout);
+ *_dout << dendl;
+ delete f;
+}
+
+void MemStore::dump(ceph::Formatter *f)
+{
+ f->open_array_section("collections");
+ for (auto p = coll_map.begin(); p != coll_map.end(); ++p) {
+ f->open_object_section("collection");
+ f->dump_string("name", stringify(p->first));
+
+ f->open_array_section("xattrs");
+ for (auto q = p->second->xattr.begin();
+ q != p->second->xattr.end();
+ ++q) {
+ f->open_object_section("xattr");
+ f->dump_string("name", q->first);
+ f->dump_int("length", q->second.length());
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("objects");
+ for (auto q = p->second->object_map.begin();
+ q != p->second->object_map.end();
+ ++q) {
+ f->open_object_section("object");
+ f->dump_string("name", stringify(q->first));
+ if (q->second)
+ q->second->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->close_section();
+ }
+ f->close_section();
+}
+
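+// Reload the collections written by _save() and rebuild coll_map and the
+// used-bytes counter.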
+int MemStore::_load()
+{
+ dout(10) << __func__ << dendl;
+ ceph::buffer::list bl;
+ std::string fn = path + "/collections";
+ std::string err;
+ int r = bl.read_file(fn.c_str(), &err);
+ if (r < 0)
+ return r;
+
+ std::set<coll_t> collections;
+ auto p = bl.cbegin();
+ decode(collections, p);
+
+ for (auto q = collections.begin();
+ q != collections.end();
+ ++q) {
+ std::string fn = path + "/" + stringify(*q);
+ ceph::buffer::list cbl;
+ int r = cbl.read_file(fn.c_str(), &err);
+ if (r < 0)
+ return r;
+ auto c = ceph::make_ref<Collection>(cct, *q);
+ auto p = cbl.cbegin();
+ c->decode(p);
+ coll_map[*q] = c;
+ used_bytes += c->used_bytes();
+ }
+
+ dump_all();
+
+ return 0;
+}
+
+void MemStore::set_fsid(uuid_d u)
+{
+ int r = write_meta("fsid", stringify(u));
+ ceph_assert(r >= 0);
+}
+
+uuid_d MemStore::get_fsid()
+{
+ std::string fsid_str;
+ int r = read_meta("fsid", &fsid_str);
+ ceph_assert(r >= 0);
+ uuid_d uuid;
+ bool b = uuid.parse(fsid_str.c_str());
+ ceph_assert(b);
+ return uuid;
+}
+
+int MemStore::mkfs()
+{
+ std::string fsid_str;
+ int r = read_meta("fsid", &fsid_str);
+ if (r == -ENOENT) {
+ uuid_d fsid;
+ fsid.generate_random();
+ fsid_str = stringify(fsid);
+ r = write_meta("fsid", fsid_str);
+ if (r < 0)
+ return r;
+ dout(1) << __func__ << " new fsid " << fsid_str << dendl;
+ } else if (r < 0) {
+ return r;
+ } else {
+ dout(1) << __func__ << " had fsid " << fsid_str << dendl;
+ }
+
+ std::string fn = path + "/collections";
+ derr << path << dendl;
+ ceph::buffer::list bl;
+ std::set<coll_t> collections;
+ encode(collections, bl);
+ r = bl.write_file(fn.c_str());
+ if (r < 0)
+ return r;
+
+ r = write_meta("type", "memstore");
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int MemStore::statfs(struct store_statfs_t *st, osd_alert_list_t* alerts)
+{
+ dout(10) << __func__ << dendl;
+ if (alerts) {
+ alerts->clear(); // returns nothing for now
+ }
+ st->reset();
+ st->total = cct->_conf->memstore_device_bytes;
+ st->available = std::max<int64_t>(st->total - used_bytes, 0);
+ dout(10) << __func__ << ": used_bytes: " << used_bytes
+ << "/" << cct->_conf->memstore_device_bytes << dendl;
+ return 0;
+}
+
+int MemStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap)
+{
+ return -ENOTSUP;
+}
+
+objectstore_perf_stat_t MemStore::get_cur_stats()
+{
+ // fixme
+ return objectstore_perf_stat_t();
+}
+
+MemStore::CollectionRef MemStore::get_collection(const coll_t& cid)
+{
+ std::shared_lock l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return CollectionRef();
+ return cp->second;
+}
+
+ObjectStore::CollectionHandle MemStore::create_new_collection(const coll_t& cid)
+{
+ std::lock_guard l{coll_lock};
+ auto c = ceph::make_ref<Collection>(cct, cid);
+ new_coll_map[cid] = c;
+ return c;
+}
+
+
+// ---------------
+// read operations
+
+bool MemStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+ if (!c->exists)
+ return false;
+
+  // Perform the equivalent of c->get_object(oid) != NULL. In C++11 the
+  // shared_ptr needs to be compared to nullptr.
+ return (bool)c->get_object(oid);
+}
+
+int MemStore::stat(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ struct stat *st,
+ bool allow_eio)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ st->st_size = o->get_size();
+ st->st_blksize = 4096;
+ st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+ st->st_nlink = 1;
+ return 0;
+}
+
+int MemStore::set_collection_opts(
+ CollectionHandle& ch,
+ const pool_opts_t& opts)
+{
+ return -EOPNOTSUPP;
+}
+
+int MemStore::read(
+ CollectionHandle &c_,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << " "
+ << offset << "~" << len << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ if (offset >= o->get_size())
+ return 0;
+ size_t l = len;
+ if (l == 0 && offset == 0) // note: len == 0 means read the entire object
+ l = o->get_size();
+ else if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ bl.clear();
+ return o->read(offset, l, bl);
+}
+
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len, ceph::buffer::list& bl)
+{
+ std::map<uint64_t, uint64_t> destmap;
+ int r = fiemap(ch, oid, offset, len, destmap);
+ if (r >= 0)
+ encode(destmap, bl);
+ return r;
+}
+
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+ uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << " " << offset << "~"
+ << len << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ size_t l = len;
+ if (offset + l > o->get_size())
+ l = o->get_size() - offset;
+ if (offset >= o->get_size())
+ goto out;
+ destmap[offset] = l;
+ out:
+ return 0;
+}
+
+int MemStore::getattr(CollectionHandle &c_, const ghobject_t& oid,
+ const char *name, ceph::buffer::ptr& value)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+ if (!c->exists)
+ return -ENOENT;
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::string k(name);
+ std::lock_guard lock{o->xattr_mutex};
+ if (!o->xattr.count(k)) {
+ return -ENODATA;
+ }
+ value = o->xattr[k];
+ return 0;
+}
+
+int MemStore::getattrs(CollectionHandle &c_, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::ptr>& aset)
+{
+ Collection *c = static_cast<Collection*>(c_.get());
+ dout(10) << __func__ << " " << c->cid << " " << oid << dendl;
+ if (!c->exists)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ aset = o->xattr;
+ return 0;
+}
+
+int MemStore::list_collections(std::vector<coll_t>& ls)
+{
+ dout(10) << __func__ << dendl;
+ std::shared_lock l{coll_lock};
+ for (ceph::unordered_map<coll_t,CollectionRef>::iterator p = coll_map.begin();
+ p != coll_map.end();
+ ++p) {
+ ls.push_back(p->first);
+ }
+ return 0;
+}
+
+bool MemStore::collection_exists(const coll_t& cid)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::shared_lock l{coll_lock};
+ return coll_map.count(cid);
+}
+
+int MemStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+ dout(10) << __func__ << " " << ch->cid << dendl;
+ CollectionRef c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ *empty = c->object_map.empty();
+ return 0;
+}
+
+int MemStore::collection_bits(CollectionHandle& ch)
+{
+ dout(10) << __func__ << " " << ch->cid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+ return c->bits;
+}
+
+int MemStore::collection_list(CollectionHandle& ch,
+ const ghobject_t& start,
+ const ghobject_t& end,
+ int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next)
+{
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::shared_lock l{c->lock};
+
+ dout(10) << __func__ << " cid " << ch->cid << " start " << start
+ << " end " << end << dendl;
+ auto p = c->object_map.lower_bound(start);
+ while (p != c->object_map.end() &&
+ ls->size() < (unsigned)max &&
+ p->first < end) {
+ ls->push_back(p->first);
+ ++p;
+ }
+ if (next != NULL) {
+ if (p == c->object_map.end())
+ *next = ghobject_t::get_max();
+ else
+ *next = p->first;
+ }
+ dout(10) << __func__ << " cid " << ch->cid << " got " << ls->size() << dendl;
+ return 0;
+}
+
+int MemStore::omap_get(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+  std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ *header = o->omap_header;
+ *out = o->omap;
+ return 0;
+}
+
+int MemStore::omap_get_header(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ bool allow_eio ///< [in] don't assert on eio
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ *header = o->omap_header;
+ return 0;
+}
+
+int MemStore::omap_get_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (auto p = o->omap.begin(); p != o->omap.end(); ++p)
+ keys->insert(p->first);
+ return 0;
+}
+
+int MemStore::omap_get_values(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (auto p = keys.begin(); p != keys.end(); ++p) {
+ auto q = o->omap.find(*p);
+ if (q != o->omap.end())
+ out->insert(*q);
+ }
+ return 0;
+}
+
+int MemStore::omap_check_keys(
+ CollectionHandle& ch, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ )
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ for (auto p = keys.begin(); p != keys.end(); ++p) {
+ auto q = o->omap.find(*p);
+ if (q != o->omap.end())
+ out->insert(*p);
+ }
+ return 0;
+}
+
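+// in-memory omap iterator: holds refs to the collection and object so they
+// stay alive, and takes omap_mutex around each accessor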
+class MemStore::OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+ CollectionRef c;
+ ObjectRef o;
+ std::map<std::string,ceph::buffer::list>::iterator it;
+public:
+ OmapIteratorImpl(CollectionRef c, ObjectRef o)
+ : c(c), o(o), it(o->omap.begin()) {}
+
+ int seek_to_first() override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.begin();
+ return 0;
+ }
+ int upper_bound(const std::string &after) override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.upper_bound(after);
+ return 0;
+ }
+ int lower_bound(const std::string &to) override {
+ std::lock_guard lock{o->omap_mutex};
+ it = o->omap.lower_bound(to);
+ return 0;
+ }
+ bool valid() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it != o->omap.end();
+ }
+ int next() override {
+ std::lock_guard lock{o->omap_mutex};
+ ++it;
+ return 0;
+ }
+ std::string key() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it->first;
+ }
+ ceph::buffer::list value() override {
+ std::lock_guard lock{o->omap_mutex};
+ return it->second;
+ }
+ int status() override {
+ return 0;
+ }
+};
+
+ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(
+ CollectionHandle& ch,
+ const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+ Collection *c = static_cast<Collection*>(ch.get());
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return ObjectMap::ObjectMapIterator();
+ return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o));
+}
+
+
+// ---------------
+// write operations
+
+int MemStore::queue_transactions(
+ CollectionHandle& ch,
+ std::vector<Transaction>& tls,
+ TrackedOpRef op,
+ ThreadPool::TPHandle *handle)
+{
+ // because memstore operations are synchronous, we can implement the
+ // Sequencer with a mutex. this guarantees ordering on a given sequencer,
+ // while allowing operations on different sequencers to happen in parallel
+ Collection *c = static_cast<Collection*>(ch.get());
+ std::unique_lock lock{c->sequencer_mutex};
+
+ for (auto p = tls.begin(); p != tls.end(); ++p) {
+ // poke the TPHandle heartbeat just to exercise that code path
+ if (handle)
+ handle->reset_tp_timeout();
+
+ _do_transaction(*p);
+ }
+
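+  // every transaction was applied synchronously above, so the completion
+  // contexts can fire right away (apply_sync inline, the rest via finisher)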
+ Context *on_apply = NULL, *on_apply_sync = NULL, *on_commit = NULL;
+ ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit,
+ &on_apply_sync);
+ if (on_apply_sync)
+ on_apply_sync->complete(0);
+ if (on_apply)
+ finisher.queue(on_apply);
+ if (on_commit)
+ finisher.queue(on_commit);
+ return 0;
+}
+
+void MemStore::_do_transaction(Transaction& t)
+{
+ Transaction::iterator i = t.begin();
+ int pos = 0;
+
+ while (i.have_op()) {
+ Transaction::Op *op = i.decode_op();
+ int r = 0;
+
+ switch (op->op) {
+ case Transaction::OP_NOP:
+ break;
+ case Transaction::OP_TOUCH:
+ case Transaction::OP_CREATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _touch(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_WRITE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ uint32_t fadvise_flags = i.get_fadvise_flags();
+ ceph::buffer::list bl;
+ i.decode_bl(bl);
+ r = _write(cid, oid, off, len, bl, fadvise_flags);
+ }
+ break;
+
+ case Transaction::OP_ZERO:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _zero(cid, oid, off, len);
+ }
+ break;
+
+ case Transaction::OP_TRIMCACHE:
+ {
+ // deprecated, no-op
+ }
+ break;
+
+ case Transaction::OP_TRUNCATE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ uint64_t off = op->off;
+ r = _truncate(cid, oid, off);
+ }
+ break;
+
+ case Transaction::OP_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_SETATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ ceph::buffer::list bl;
+ i.decode_bl(bl);
+ std::map<std::string, ceph::buffer::ptr> to_set;
+ to_set[name] = ceph::buffer::ptr(bl.c_str(), bl.length());
+ r = _setattrs(cid, oid, to_set);
+ }
+ break;
+
+ case Transaction::OP_SETATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::map<std::string, ceph::buffer::ptr> aset;
+ i.decode_attrset(aset);
+ r = _setattrs(cid, oid, aset);
+ }
+ break;
+
+ case Transaction::OP_RMATTR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string name = i.decode_string();
+ r = _rmattr(cid, oid, name.c_str());
+ }
+ break;
+
+ case Transaction::OP_RMATTRS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _rmattrs(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_CLONE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ r = _clone(cid, oid, noid);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t off = op->off;
+ uint64_t len = op->len;
+ r = _clone_range(cid, oid, noid, off, len, off);
+ }
+ break;
+
+ case Transaction::OP_CLONERANGE2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ghobject_t noid = i.get_oid(op->dest_oid);
+ uint64_t srcoff = op->off;
+ uint64_t len = op->len;
+ uint64_t dstoff = op->dest_off;
+ r = _clone_range(cid, oid, noid, srcoff, len, dstoff);
+ }
+ break;
+
+ case Transaction::OP_MKCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _create_collection(cid, op->split_bits);
+ }
+ break;
+
+ case Transaction::OP_COLL_HINT:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t type = op->hint;
+ ceph::buffer::list hint;
+ i.decode_bl(hint);
+ auto hiter = hint.cbegin();
+ if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+ uint32_t pg_num;
+ uint64_t num_objs;
+ decode(pg_num, hiter);
+ decode(num_objs, hiter);
+ r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+ } else {
+ // Ignore the hint
+ dout(10) << "Unrecognized collection hint type: " << type << dendl;
+ }
+ }
+ break;
+
+ case Transaction::OP_RMCOLL:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ r = _destroy_collection(cid);
+ }
+ break;
+
+ case Transaction::OP_COLL_ADD:
+ {
+ coll_t ocid = i.get_cid(op->cid);
+ coll_t ncid = i.get_cid(op->dest_cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _collection_add(ncid, ocid, oid);
+ }
+ break;
+
+ case Transaction::OP_COLL_REMOVE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _remove(cid, oid);
+ }
+ break;
+
+ case Transaction::OP_COLL_MOVE:
+ ceph_abort_msg("deprecated");
+ break;
+
+ case Transaction::OP_COLL_MOVE_RENAME:
+ {
+ coll_t oldcid = i.get_cid(op->cid);
+ ghobject_t oldoid = i.get_oid(op->oid);
+ coll_t newcid = i.get_cid(op->dest_cid);
+ ghobject_t newoid = i.get_oid(op->dest_oid);
+ r = _collection_move_rename(oldcid, oldoid, newcid, newoid);
+ if (r == -ENOENT)
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_TRY_RENAME:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oldoid = i.get_oid(op->oid);
+ ghobject_t newoid = i.get_oid(op->dest_oid);
+ r = _collection_move_rename(cid, oldoid, cid, newoid);
+ if (r == -ENOENT)
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_COLL_SETATTR:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_COLL_RMATTR:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_COLL_RENAME:
+ {
+ ceph_abort_msg("not implemented");
+ }
+ break;
+
+ case Transaction::OP_OMAP_CLEAR:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ r = _omap_clear(cid, oid);
+ }
+ break;
+ case Transaction::OP_OMAP_SETKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ceph::buffer::list aset_bl;
+ i.decode_attrset_bl(&aset_bl);
+ r = _omap_setkeys(cid, oid, aset_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYS:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ceph::buffer::list keys_bl;
+ i.decode_keyset_bl(&keys_bl);
+ r = _omap_rmkeys(cid, oid, keys_bl);
+ }
+ break;
+ case Transaction::OP_OMAP_RMKEYRANGE:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ std::string first, last;
+ first = i.decode_string();
+ last = i.decode_string();
+ r = _omap_rmkeyrange(cid, oid, first, last);
+ }
+ break;
+ case Transaction::OP_OMAP_SETHEADER:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ ghobject_t oid = i.get_oid(op->oid);
+ ceph::buffer::list bl;
+ i.decode_bl(bl);
+ r = _omap_setheader(cid, oid, bl);
+ }
+ break;
+ case Transaction::OP_SPLIT_COLLECTION:
+ ceph_abort_msg("deprecated");
+ break;
+ case Transaction::OP_SPLIT_COLLECTION2:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ uint32_t rem = op->split_rem;
+ coll_t dest = i.get_cid(op->dest_cid);
+ r = _split_collection(cid, bits, rem, dest);
+ }
+ break;
+ case Transaction::OP_MERGE_COLLECTION:
+ {
+ coll_t cid = i.get_cid(op->cid);
+ uint32_t bits = op->split_bits;
+ coll_t dest = i.get_cid(op->dest_cid);
+ r = _merge_collection(cid, bits, dest);
+ }
+ break;
+
+ case Transaction::OP_SETALLOCHINT:
+ {
+ r = 0;
+ }
+ break;
+
+ case Transaction::OP_COLL_SET_BITS:
+ {
+ r = 0;
+ }
+ break;
+
+ default:
+ derr << "bad op " << op->op << dendl;
+ ceph_abort();
+ }
+
+ if (r < 0) {
+ bool ok = false;
+
+ if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2 ||
+ op->op == Transaction::OP_COLL_ADD))
+ // -ENOENT is usually okay
+ ok = true;
+ if (r == -ENODATA)
+ ok = true;
+
+ if (!ok) {
+ const char *msg = "unexpected error code";
+
+ if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+ op->op == Transaction::OP_CLONE ||
+ op->op == Transaction::OP_CLONERANGE2))
+ msg = "ENOENT on clone suggests osd bug";
+
+ if (r == -ENOSPC)
+ // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+ // by partially applying transactions.
+ msg = "ENOSPC from MemStore, misconfigured cluster or insufficient memory";
+
+ if (r == -ENOTEMPTY) {
+ msg = "ENOTEMPTY suggests garbage data in osd data dir";
+ dump_all();
+ }
+
+ derr << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+ << " (op " << pos << ", counting from 0)" << dendl;
+ dout(0) << msg << dendl;
+ dout(0) << " transaction dump:\n";
+ ceph::JSONFormatter f(true);
+ f.open_object_section("transaction");
+ t.dump(&f);
+ f.close_section();
+ f.flush(*_dout);
+ *_dout << dendl;
+ ceph_abort_msg("unexpected error");
+ }
+ }
+
+ ++pos;
+ }
+}
+
+int MemStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ c->get_or_create_object(oid);
+ return 0;
+}
+
+int MemStore::_write(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len, const ceph::buffer::list& bl,
+ uint32_t fadvise_flags)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " "
+ << offset << "~" << len << dendl;
+ ceph_assert(len == bl.length());
+
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_or_create_object(oid);
+ if (len > 0 && !cct->_conf->memstore_debug_omit_block_device_write) {
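+    // track how much the object grew; used_bytes feeds statfs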
+ const ssize_t old_size = o->get_size();
+ o->write(offset, bl);
+ used_bytes += (o->get_size() - old_size);
+ }
+
+ return 0;
+}
+
+int MemStore::_zero(const coll_t& cid, const ghobject_t& oid,
+ uint64_t offset, size_t len)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
+ << len << dendl;
+ ceph::buffer::list bl;
+ bl.append_zero(len);
+ return _write(cid, oid, offset, len, bl);
+}
+
+int MemStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << size << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ if (cct->_conf->memstore_debug_omit_block_device_write)
+ return 0;
+ const ssize_t old_size = o->get_size();
+ int r = o->truncate(size);
+ used_bytes += (o->get_size() - old_size);
+ return r;
+}
+
+int MemStore::_remove(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ std::lock_guard l{c->lock};
+
+ auto i = c->object_hash.find(oid);
+ if (i == c->object_hash.end())
+ return -ENOENT;
+ used_bytes -= i->second->get_size();
+ c->object_hash.erase(i);
+ c->object_map.erase(oid);
+
+ return 0;
+}
+
+int MemStore::_setattrs(const coll_t& cid, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::ptr>& aset)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ for (auto p = aset.begin(); p != aset.end(); ++p)
+ o->xattr[p->first] = p->second;
+ return 0;
+}
+
+int MemStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << name << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ auto i = o->xattr.find(name);
+ if (i == o->xattr.end())
+ return -ENODATA;
+ o->xattr.erase(i);
+ return 0;
+}
+
+int MemStore::_rmattrs(const coll_t& cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->xattr_mutex};
+ o->xattr.clear();
+ return 0;
+}
+
+int MemStore::_clone(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid)
+{
+ dout(10) << __func__ << " " << cid << " " << oldoid
+ << " -> " << newoid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef oo = c->get_object(oldoid);
+ if (!oo)
+ return -ENOENT;
+ ObjectRef no = c->get_or_create_object(newoid);
+ used_bytes += oo->get_size() - no->get_size();
+ no->clone(oo.get(), 0, oo->get_size(), 0);
+
+  // take the xattr and omap locks of both objects with one scoped_lock
+ std::scoped_lock l{oo->xattr_mutex,
+ no->xattr_mutex,
+ oo->omap_mutex,
+ no->omap_mutex};
+
+ no->omap_header = oo->omap_header;
+ no->omap = oo->omap;
+ no->xattr = oo->xattr;
+ return 0;
+}
+
+int MemStore::_clone_range(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+ dout(10) << __func__ << " " << cid << " "
+ << oldoid << " " << srcoff << "~" << len << " -> "
+ << newoid << " " << dstoff << "~" << len
+ << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef oo = c->get_object(oldoid);
+ if (!oo)
+ return -ENOENT;
+ ObjectRef no = c->get_or_create_object(newoid);
+ if (srcoff >= oo->get_size())
+ return 0;
+ if (srcoff + len >= oo->get_size())
+ len = oo->get_size() - srcoff;
+
+ const ssize_t old_size = no->get_size();
+ no->clone(oo.get(), srcoff, len, dstoff);
+ used_bytes += (no->get_size() - old_size);
+
+ return len;
+}
+
+int MemStore::_omap_clear(const coll_t& cid, const ghobject_t &oid)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ o->omap.clear();
+ o->omap_header.clear();
+ return 0;
+}
+
+int MemStore::_omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+ ceph::buffer::list& aset_bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ auto p = aset_bl.cbegin();
+ __u32 num;
+ decode(num, p);
+ while (num--) {
+ std::string key;
+ decode(key, p);
+ decode(o->omap[key], p);
+ }
+ return 0;
+}
+
+int MemStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &oid,
+ ceph::buffer::list& keys_bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ auto p = keys_bl.cbegin();
+ __u32 num;
+ decode(num, p);
+ while (num--) {
+ std::string key;
+ decode(key, p);
+ o->omap.erase(key);
+ }
+ return 0;
+}
+
+int MemStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const std::string& first, const std::string& last)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << " " << first
+ << " " << last << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ auto p = o->omap.lower_bound(first);
+ auto e = o->omap.lower_bound(last);
+ o->omap.erase(p, e);
+ return 0;
+}
+
+int MemStore::_omap_setheader(const coll_t& cid, const ghobject_t &oid,
+ const ceph::buffer::list &bl)
+{
+ dout(10) << __func__ << " " << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+
+ ObjectRef o = c->get_object(oid);
+ if (!o)
+ return -ENOENT;
+ std::lock_guard lock{o->omap_mutex};
+ o->omap_header = bl;
+ return 0;
+}
+
+int MemStore::_create_collection(const coll_t& cid, int bits)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::lock_guard l{coll_lock};
+ auto result = coll_map.insert(std::make_pair(cid, CollectionRef()));
+ if (!result.second)
+ return -EEXIST;
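+  // adopt the Collection staged in new_coll_map by create_new_collection()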
+ auto p = new_coll_map.find(cid);
+ ceph_assert(p != new_coll_map.end());
+ result.first->second = p->second;
+ result.first->second->bits = bits;
+ new_coll_map.erase(p);
+ return 0;
+}
+
+int MemStore::_destroy_collection(const coll_t& cid)
+{
+ dout(10) << __func__ << " " << cid << dendl;
+ std::lock_guard l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ if (cp == coll_map.end())
+ return -ENOENT;
+ {
+ std::shared_lock l2{cp->second->lock};
+ if (!cp->second->object_map.empty())
+ return -ENOTEMPTY;
+ cp->second->exists = false;
+ }
+ used_bytes -= cp->second->used_bytes();
+ coll_map.erase(cp);
+ return 0;
+}
+
+int MemStore::_collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << cid << " " << ocid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ CollectionRef oc = get_collection(ocid);
+ if (!oc)
+ return -ENOENT;
+
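+  // take both collection locks, ordered by address for consistency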
+ std::scoped_lock l{std::min(&(*c), &(*oc))->lock,
+ std::max(&(*c), &(*oc))->lock};
+
+ if (c->object_hash.count(oid))
+ return -EEXIST;
+ if (oc->object_hash.count(oid) == 0)
+ return -ENOENT;
+ ObjectRef o = oc->object_hash[oid];
+ c->object_map[oid] = o;
+ c->object_hash[oid] = o;
+ return 0;
+}
+
+int MemStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& oid)
+{
+ dout(10) << __func__ << " " << oldcid << " " << oldoid << " -> "
+ << cid << " " << oid << dendl;
+ CollectionRef c = get_collection(cid);
+ if (!c)
+ return -ENOENT;
+ CollectionRef oc = get_collection(oldcid);
+ if (!oc)
+ return -ENOENT;
+
+  // note: this implementation only handles the case where c and oc are
+  // the same collection
+  ceph_assert(&(*c) == &(*oc));
+
+ std::lock_guard l{c->lock};
+ if (c->object_hash.count(oid))
+ return -EEXIST;
+ if (oc->object_hash.count(oldoid) == 0)
+ return -ENOENT;
+ {
+ ObjectRef o = oc->object_hash[oldoid];
+ c->object_map[oid] = o;
+ c->object_hash[oid] = o;
+ oc->object_map.erase(oldoid);
+ oc->object_hash.erase(oldoid);
+ }
+ return 0;
+}
+
+int MemStore::_split_collection(const coll_t& cid, uint32_t bits, uint32_t match,
+ coll_t dest)
+{
+ dout(10) << __func__ << " " << cid << " " << bits << " " << match << " "
+ << dest << dendl;
+ CollectionRef sc = get_collection(cid);
+ if (!sc)
+ return -ENOENT;
+ CollectionRef dc = get_collection(dest);
+ if (!dc)
+ return -ENOENT;
+
+ std::scoped_lock l{std::min(&(*sc), &(*dc))->lock,
+ std::max(&(*sc), &(*dc))->lock};
+
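+  // move every object whose hash maps to the (bits, match) child into dest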
+ auto p = sc->object_map.begin();
+ while (p != sc->object_map.end()) {
+ if (p->first.match(bits, match)) {
+ dout(20) << " moving " << p->first << dendl;
+ dc->object_map.insert(std::make_pair(p->first, p->second));
+ dc->object_hash.insert(std::make_pair(p->first, p->second));
+ sc->object_hash.erase(p->first);
+ sc->object_map.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+
+ sc->bits = bits;
+ ceph_assert(dc->bits == (int)bits);
+
+ return 0;
+}
+
+int MemStore::_merge_collection(const coll_t& cid, uint32_t bits, coll_t dest)
+{
+ dout(10) << __func__ << " " << cid << " " << bits << " "
+ << dest << dendl;
+ CollectionRef sc = get_collection(cid);
+ if (!sc)
+ return -ENOENT;
+ CollectionRef dc = get_collection(dest);
+ if (!dc)
+ return -ENOENT;
+ {
+ std::scoped_lock l{std::min(&(*sc), &(*dc))->lock,
+ std::max(&(*sc), &(*dc))->lock};
+
+ auto p = sc->object_map.begin();
+ while (p != sc->object_map.end()) {
+ dout(20) << " moving " << p->first << dendl;
+ dc->object_map.insert(std::make_pair(p->first, p->second));
+ dc->object_hash.insert(std::make_pair(p->first, p->second));
+ sc->object_hash.erase(p->first);
+ sc->object_map.erase(p++);
+ }
+
+ dc->bits = bits;
+ }
+
+ {
+ std::lock_guard l{coll_lock};
+ ceph::unordered_map<coll_t,CollectionRef>::iterator cp = coll_map.find(cid);
+ ceph_assert(cp != coll_map.end());
+ used_bytes -= cp->second->used_bytes();
+ coll_map.erase(cp);
+ }
+
+ return 0;
+}
+
+namespace {
+struct BufferlistObject : public MemStore::Object {
+ ceph::spinlock mutex;
+ ceph::buffer::list data;
+
+ size_t get_size() const override { return data.length(); }
+
+ int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) override;
+ int write(uint64_t offset, const ceph::buffer::list &bl) override;
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) override;
+ int truncate(uint64_t offset) override;
+
+ void encode(ceph::buffer::list& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(data, bl);
+ encode_base(bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) override {
+ DECODE_START(1, p);
+ decode(data, p);
+ decode_base(p);
+ DECODE_FINISH(p);
+ }
+};
+}
+// BufferlistObject
+int BufferlistObject::read(uint64_t offset, uint64_t len,
+ ceph::buffer::list &bl)
+{
+ std::lock_guard<decltype(mutex)> lock(mutex);
+ bl.substr_of(data, offset, len);
+ return bl.length();
+}
+
+int BufferlistObject::write(uint64_t offset, const ceph::buffer::list &src)
+{
+ unsigned len = src.length();
+
+ std::lock_guard<decltype(mutex)> lock(mutex);
+
+ // before
+ ceph::buffer::list newdata;
+ if (get_size() >= offset) {
+ newdata.substr_of(data, 0, offset);
+ } else {
+ if (get_size()) {
+ newdata.substr_of(data, 0, get_size());
+ }
+ newdata.append_zero(offset - get_size());
+ }
+
+ newdata.append(src);
+
+ // after
+ if (get_size() > offset + len) {
+ ceph::buffer::list tail;
+ tail.substr_of(data, offset + len, get_size() - (offset + len));
+ newdata.append(tail);
+ }
+
+ data = std::move(newdata);
+ return 0;
+}
+
+int BufferlistObject::clone(Object *src, uint64_t srcoff,
+ uint64_t len, uint64_t dstoff)
+{
+ auto srcbl = dynamic_cast<BufferlistObject*>(src);
+ if (srcbl == nullptr)
+ return -ENOTSUP;
+
+ ceph::buffer::list bl;
+ {
+ std::lock_guard<decltype(srcbl->mutex)> lock(srcbl->mutex);
+ if (srcoff == dstoff && len == src->get_size()) {
+ data = srcbl->data;
+ return 0;
+ }
+ bl.substr_of(srcbl->data, srcoff, len);
+ }
+ return write(dstoff, bl);
+}
+
+int BufferlistObject::truncate(uint64_t size)
+{
+ std::lock_guard<decltype(mutex)> lock(mutex);
+ if (get_size() > size) {
+ ceph::buffer::list bl;
+ bl.substr_of(data, 0, size);
+ data = std::move(bl);
+ } else if (get_size() == size) {
+ // do nothing
+ } else {
+ data.append_zero(size - get_size());
+ }
+ return 0;
+}
+
+// PageSetObject
+
+struct MemStore::PageSetObject : public Object {
+ PageSet data;
+ uint64_t data_len;
+#if defined(__GLIBCXX__)
+ // use a thread-local vector for the pages returned by PageSet, so we
+ // can avoid allocations in read/write()
+ static thread_local PageSet::page_vector tls_pages;
+#endif
+
+ size_t get_size() const override { return data_len; }
+
+ int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) override;
+ int write(uint64_t offset, const ceph::buffer::list &bl) override;
+ int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) override;
+ int truncate(uint64_t offset) override;
+
+ void encode(ceph::buffer::list& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(data_len, bl);
+ data.encode(bl);
+ encode_base(bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) override {
+ DECODE_START(1, p);
+ decode(data_len, p);
+ data.decode(p);
+ decode_base(p);
+ DECODE_FINISH(p);
+ }
+
+private:
+ FRIEND_MAKE_REF(PageSetObject);
+ explicit PageSetObject(size_t page_size) : data(page_size), data_len(0) {}
+};
+
+#if defined(__GLIBCXX__)
+// use a thread-local vector for the pages returned by PageSet, so we
+// can avoid allocations in read/write()
+thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages;
+#define DEFINE_PAGE_VECTOR(name)
+#else
+#define DEFINE_PAGE_VECTOR(name) PageSet::page_vector name;
+#endif
+
+int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, ceph::buffer::list& bl)
+{
+ const auto start = offset;
+ const auto end = offset + len;
+ auto remaining = len;
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ data.get_range(offset, len, tls_pages);
+
+ // allocate a buffer for the data
+ ceph::buffer::ptr buf(len);
+
+ auto p = tls_pages.begin();
+ while (remaining) {
+ // no more pages in range
+ if (p == tls_pages.end() || (*p)->offset >= end) {
+ buf.zero(offset - start, remaining);
+ break;
+ }
+ auto page = *p;
+
+ // fill any holes between pages with zeroes
+ if (page->offset > offset) {
+ const auto count = std::min(remaining, page->offset - offset);
+ buf.zero(offset - start, count);
+ remaining -= count;
+ offset = page->offset;
+ if (!remaining)
+ break;
+ }
+
+ // read from page
+ const auto page_offset = offset - page->offset;
+ const auto count = std::min(remaining, data.get_page_size() - page_offset);
+
+ buf.copy_in(offset - start, count, page->data + page_offset);
+
+ remaining -= count;
+ offset += count;
+
+ ++p;
+ }
+
+ tls_pages.clear(); // drop page refs
+
+ bl.append(std::move(buf));
+ return len;
+}
+
+int MemStore::PageSetObject::write(uint64_t offset, const ceph::buffer::list &src)
+{
+ unsigned len = src.length();
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ // make sure the page range is allocated
+ data.alloc_range(offset, src.length(), tls_pages);
+
+ auto page = tls_pages.begin();
+
+ auto p = src.begin();
+ while (len > 0) {
+ unsigned page_offset = offset - (*page)->offset;
+ unsigned pageoff = data.get_page_size() - page_offset;
+ unsigned count = std::min(len, pageoff);
+ p.copy(count, (*page)->data + page_offset);
+ offset += count;
+ len -= count;
+ if (count == pageoff)
+ ++page;
+ }
+ if (data_len < offset)
+ data_len = offset;
+ tls_pages.clear(); // drop page refs
+ return 0;
+}
+
+int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff,
+ uint64_t len, uint64_t dstoff)
+{
+ const int64_t delta = dstoff - srcoff;
+
+ auto &src_data = static_cast<PageSetObject*>(src)->data;
+ const uint64_t src_page_size = src_data.get_page_size();
+
+ auto &dst_data = data;
+ const auto dst_page_size = dst_data.get_page_size();
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ PageSet::page_vector dst_pages;
+
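+  // copy in chunks of up to 16 source pages; ranges with no backing source
+  // page are holes and get zero-filled in the destination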
+ while (len) {
+ // limit to 16 pages at a time so tls_pages doesn't balloon in size
+ auto count = std::min(len, (uint64_t)src_page_size * 16);
+ src_data.get_range(srcoff, count, tls_pages);
+
+ // allocate the destination range
+ // TODO: avoid allocating pages for holes in the source range
+ dst_data.alloc_range(srcoff + delta, count, dst_pages);
+ auto dst_iter = dst_pages.begin();
+
+ for (auto &src_page : tls_pages) {
+ auto sbegin = std::max(srcoff, src_page->offset);
+ auto send = std::min(srcoff + count, src_page->offset + src_page_size);
+
+ // zero-fill holes before src_page
+ if (srcoff < sbegin) {
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(srcoff + delta, dst_page->offset);
+ auto dend = std::min(sbegin + delta, dst_page->offset + dst_page_size);
+ std::fill(dst_page->data + dbegin - dst_page->offset,
+ dst_page->data + dend - dst_page->offset, 0);
+ if (dend < dst_page->offset + dst_page_size)
+ break;
+ ++dst_iter;
+ }
+ const auto c = sbegin - srcoff;
+ count -= c;
+ len -= c;
+ }
+
+ // copy data from src page to dst pages
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(sbegin + delta, dst_page->offset);
+ auto dend = std::min(send + delta, dst_page->offset + dst_page_size);
+
+ std::copy(src_page->data + (dbegin - delta) - src_page->offset,
+ src_page->data + (dend - delta) - src_page->offset,
+ dst_page->data + dbegin - dst_page->offset);
+ if (dend < dst_page->offset + dst_page_size)
+ break;
+ ++dst_iter;
+ }
+
+ const auto c = send - sbegin;
+ count -= c;
+ len -= c;
+ srcoff = send;
+ dstoff = send + delta;
+ }
+ tls_pages.clear(); // drop page refs
+
+ // zero-fill holes after the last src_page
+ if (count > 0) {
+ while (dst_iter != dst_pages.end()) {
+ auto &dst_page = *dst_iter;
+ auto dbegin = std::max(dstoff, dst_page->offset);
+ auto dend = std::min(dstoff + count, dst_page->offset + dst_page_size);
+ std::fill(dst_page->data + dbegin - dst_page->offset,
+ dst_page->data + dend - dst_page->offset, 0);
+ ++dst_iter;
+ }
+ srcoff += count;
+ dstoff += count;
+ len -= count;
+ }
+ dst_pages.clear(); // drop page refs
+ }
+
+ // update object size
+ if (data_len < dstoff)
+ data_len = dstoff;
+ return 0;
+}
+
+int MemStore::PageSetObject::truncate(uint64_t size)
+{
+ data.free_pages_after(size);
+ data_len = size;
+
+ const auto page_size = data.get_page_size();
+ const auto page_offset = size & ~(page_size-1);
+ if (page_offset == size)
+ return 0;
+
+ DEFINE_PAGE_VECTOR(tls_pages);
+ // write zeroes to the rest of the last page
+ data.get_range(page_offset, page_size, tls_pages);
+ if (tls_pages.empty())
+ return 0;
+
+ auto page = tls_pages.begin();
+ auto data = (*page)->data;
+ std::fill(data + (size - page_offset), data + page_size, 0);
+ tls_pages.clear(); // drop page ref
+ return 0;
+}
+
+
+MemStore::ObjectRef MemStore::Collection::create_object() const {
+ if (use_page_set)
+ return ceph::make_ref<PageSetObject>(cct->_conf->memstore_page_size);
+ return new BufferlistObject();
+}
diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h
new file mode 100644
index 000000000..04c3e08d0
--- /dev/null
+++ b/src/os/memstore/MemStore.h
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MEMSTORE_H
+#define CEPH_MEMSTORE_H
+
+#include <mutex>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/RefCountedObj.h"
+#include "common/RWLock.h"
+#include "os/ObjectStore.h"
+#include "PageSet.h"
+#include "include/ceph_assert.h"
+
+class MemStore : public ObjectStore {
+public:
+ struct Object : public RefCountedObject {
+ ceph::mutex xattr_mutex{ceph::make_mutex("MemStore::Object::xattr_mutex")};
+ ceph::mutex omap_mutex{ceph::make_mutex("MemStore::Object::omap_mutex")};
+ std::map<std::string,ceph::buffer::ptr> xattr;
+ ceph::buffer::list omap_header;
+ std::map<std::string,ceph::buffer::list> omap;
+
+ using Ref = ceph::ref_t<Object>;
+
+ // interface for object data
+ virtual size_t get_size() const = 0;
+ virtual int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) = 0;
+ virtual int write(uint64_t offset, const ceph::buffer::list &bl) = 0;
+ virtual int clone(Object *src, uint64_t srcoff, uint64_t len,
+ uint64_t dstoff) = 0;
+ virtual int truncate(uint64_t offset) = 0;
+ virtual void encode(ceph::buffer::list& bl) const = 0;
+ virtual void decode(ceph::buffer::list::const_iterator& p) = 0;
+
+ void encode_base(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ encode(xattr, bl);
+ encode(omap_header, bl);
+ encode(omap, bl);
+ }
+ void decode_base(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(xattr, p);
+ decode(omap_header, p);
+ decode(omap, p);
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_int("data_len", get_size());
+ f->dump_int("omap_header_len", omap_header.length());
+
+ f->open_array_section("xattrs");
+ for (auto p = xattr.begin(); p != xattr.end(); ++p) {
+ f->open_object_section("xattr");
+ f->dump_string("name", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("omap");
+ for (auto p = omap.begin(); p != omap.end(); ++p) {
+ f->open_object_section("pair");
+ f->dump_string("key", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ }
+ protected:
+ Object() = default;
+ };
+ using ObjectRef = Object::Ref;
+
+ struct PageSetObject;
+ struct Collection : public CollectionImpl {
+ int bits = 0;
+ CephContext *cct;
+ bool use_page_set;
+ ceph::unordered_map<ghobject_t, ObjectRef> object_hash; ///< for lookup
+ std::map<ghobject_t, ObjectRef> object_map; ///< for iteration
+ std::map<std::string,ceph::buffer::ptr> xattr;
+ /// for object_{map,hash}
+ ceph::shared_mutex lock{
+ ceph::make_shared_mutex("MemStore::Collection::lock", true, false)};
+
+ bool exists = true;
+ ceph::mutex sequencer_mutex{
+ ceph::make_mutex("MemStore::Collection::sequencer_mutex")};
+
+ typedef boost::intrusive_ptr<Collection> Ref;
+
+ ObjectRef create_object() const;
+
+ // NOTE: The lock only needs to protect the object_map/hash, not the
+ // contents of individual objects. The osd is already sequencing
+ // reads and writes, so we will never see them concurrently at this
+ // level.
+
+ ObjectRef get_object(ghobject_t oid) {
+ std::shared_lock l{lock};
+ auto o = object_hash.find(oid);
+ if (o == object_hash.end())
+ return ObjectRef();
+ return o->second;
+ }
+
+ ObjectRef get_or_create_object(ghobject_t oid) {
+ std::lock_guard l{lock};
+ auto result = object_hash.emplace(oid, ObjectRef());
+ if (result.second)
+ object_map[oid] = result.first->second = create_object();
+ return result.first->second;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(xattr, bl);
+ encode(use_page_set, bl);
+ uint32_t s = object_map.size();
+ encode(s, bl);
+ for (auto p = object_map.begin(); p != object_map.end(); ++p) {
+ encode(p->first, bl);
+ p->second->encode(bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(xattr, p);
+ decode(use_page_set, p);
+ uint32_t s;
+ decode(s, p);
+ while (s--) {
+ ghobject_t k;
+ decode(k, p);
+ auto o = create_object();
+ o->decode(p);
+ object_map.insert(std::make_pair(k, o));
+ object_hash.insert(std::make_pair(k, o));
+ }
+ DECODE_FINISH(p);
+ }
+
+ uint64_t used_bytes() const {
+ uint64_t result = 0;
+ for (auto p = object_map.begin(); p != object_map.end(); ++p) {
+ result += p->second->get_size();
+ }
+
+ return result;
+ }
+
+ void flush() override {
+ }
+ bool flush_commit(Context *c) override {
+ return true;
+ }
+
+ private:
+ FRIEND_MAKE_REF(Collection);
+ explicit Collection(CephContext *cct, coll_t c)
+ : CollectionImpl(cct, c),
+ cct(cct),
+ use_page_set(cct->_conf->memstore_page_set) {}
+ };
+ typedef Collection::Ref CollectionRef;
+
+private:
+ class OmapIteratorImpl;
+
+
+ ceph::unordered_map<coll_t, CollectionRef> coll_map;
+ /// rwlock to protect coll_map
+ ceph::shared_mutex coll_lock{
+ ceph::make_shared_mutex("MemStore::coll_lock")};
+ std::map<coll_t,CollectionRef> new_coll_map;
+
+ CollectionRef get_collection(const coll_t& cid);
+
+ Finisher finisher;
+
+ uint64_t used_bytes;
+
+ void _do_transaction(Transaction& t);
+
+ int _touch(const coll_t& cid, const ghobject_t& oid);
+ int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+ const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
+ int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+ int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+ int _remove(const coll_t& cid, const ghobject_t& oid);
+ int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::buffer::ptr>& aset);
+ int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name);
+ int _rmattrs(const coll_t& cid, const ghobject_t& oid);
+ int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid);
+ int _clone_range(const coll_t& cid, const ghobject_t& oldoid,
+ const ghobject_t& newoid,
+ uint64_t srcoff, uint64_t len, uint64_t dstoff);
+ int _omap_clear(const coll_t& cid, const ghobject_t &oid);
+ int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, ceph::buffer::list& aset_bl);
+ int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, ceph::buffer::list& keys_bl);
+ int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+ const std::string& first, const std::string& last);
+ int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl);
+
+ int _collection_hint_expected_num_objs(const coll_t& cid, uint32_t pg_num,
+ uint64_t num_objs) const { return 0; }
+ int _create_collection(const coll_t& c, int bits);
+ int _destroy_collection(const coll_t& c);
+ int _collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid);
+ int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+ coll_t cid, const ghobject_t& o);
+ int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest);
+ int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest);
+
+ int _save();
+ int _load();
+
+ void dump(ceph::Formatter *f);
+ void dump_all();
+
+public:
+ MemStore(CephContext *cct, const std::string& path)
+ : ObjectStore(cct, path),
+ finisher(cct),
+ used_bytes(0) {}
+ ~MemStore() override { }
+
+ std::string get_type() override {
+ return "memstore";
+ }
+
+ bool test_mount_in_use() override {
+ return false;
+ }
+
+ int mount() override;
+ int umount() override;
+
+ int fsck(bool deep) override {
+ return 0;
+ }
+
+ int validate_hobject_key(const hobject_t &obj) const override {
+ return 0;
+ }
+ unsigned get_max_attr_name_length() override {
+ return 256; // arbitrary; there is no real limit internally
+ }
+
+ int mkfs() override;
+ int mkjournal() override {
+ return 0;
+ }
+ bool wants_journal() override {
+ return false;
+ }
+ bool allows_journal() override {
+ return false;
+ }
+ bool needs_journal() override {
+ return false;
+ }
+
+ int get_devices(std::set<std::string> *ls) override {
+ // no devices for us!
+ return 0;
+ }
+
+ int statfs(struct store_statfs_t *buf,
+ osd_alert_list_t* alerts = nullptr) override;
+ int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+ bool *per_pool_omap) override;
+
+ bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+ int stat(CollectionHandle &c, const ghobject_t& oid,
+ struct stat *st, bool allow_eio = false) override;
+ int set_collection_opts(
+ CollectionHandle& c,
+ const pool_opts_t& opts) override;
+ int read(
+ CollectionHandle &c,
+ const ghobject_t& oid,
+ uint64_t offset,
+ size_t len,
+ ceph::buffer::list& bl,
+ uint32_t op_flags = 0) override;
+ using ObjectStore::fiemap;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid,
+ uint64_t offset, size_t len, ceph::buffer::list& bl) override;
+ int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset,
+ size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+ int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+ ceph::buffer::ptr& value) override;
+ int getattrs(CollectionHandle &c, const ghobject_t& oid,
+ std::map<std::string,ceph::buffer::ptr>& aset) override;
+
+ int list_collections(std::vector<coll_t>& ls) override;
+
+ CollectionHandle open_collection(const coll_t& c) override {
+ return get_collection(c);
+ }
+ CollectionHandle create_new_collection(const coll_t& c) override;
+
+ void set_collection_commit_queue(const coll_t& cid,
+ ContextQueue *commit_queue) override {
+ }
+
+ bool collection_exists(const coll_t& c) override;
+ int collection_empty(CollectionHandle& c, bool *empty) override;
+ int collection_bits(CollectionHandle& c) override;
+ int collection_list(CollectionHandle& cid,
+ const ghobject_t& start, const ghobject_t& end, int max,
+ std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+ using ObjectStore::omap_get;
+ int omap_get(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Key to value map
+ ) override;
+
+ using ObjectStore::omap_get_header;
+ /// Get omap header
+ int omap_get_header(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ ceph::buffer::list *header, ///< [out] omap header
+ bool allow_eio = false ///< [in] don't assert on eio
+ ) override;
+
+ using ObjectStore::omap_get_keys;
+ /// Get keys defined on oid
+ int omap_get_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ std::set<std::string> *keys ///< [out] Keys defined on oid
+ ) override;
+
+ using ObjectStore::omap_get_values;
+ /// Get key values
+ int omap_get_values(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to get
+ std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+ ) override;
+
+ using ObjectStore::omap_check_keys;
+ /// Filters keys into out which are defined on oid
+ int omap_check_keys(
+ CollectionHandle& c, ///< [in] Collection containing oid
+ const ghobject_t &oid, ///< [in] Object containing omap
+ const std::set<std::string> &keys, ///< [in] Keys to check
+ std::set<std::string> *out ///< [out] Subset of keys defined on oid
+ ) override;
+
+ using ObjectStore::get_omap_iterator;
+ ObjectMap::ObjectMapIterator get_omap_iterator(
+ CollectionHandle& c, ///< [in] collection
+ const ghobject_t &oid ///< [in] object
+ ) override;
+
+ void set_fsid(uuid_d u) override;
+ uuid_d get_fsid() override;
+
+ uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+    return 0; // do not care
+ }
+
+ objectstore_perf_stat_t get_cur_stats() override;
+
+ const PerfCounters* get_perf_counters() const override {
+ return nullptr;
+ }
+
+
+ int queue_transactions(
+ CollectionHandle& ch,
+ std::vector<Transaction>& tls,
+ TrackedOpRef op = TrackedOpRef(),
+ ThreadPool::TPHandle *handle = NULL) override;
+};
+
+
+
+
+#endif
diff --git a/src/os/memstore/PageSet.h b/src/os/memstore/PageSet.h
new file mode 100644
index 000000000..71954e574
--- /dev/null
+++ b/src/os/memstore/PageSet.h
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAGESET_H
+#define CEPH_PAGESET_H
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+#include <boost/intrusive/avl_set.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/encoding.h"
+
+struct Page {
+ char *const data;
+ boost::intrusive::avl_set_member_hook<> hook;
+ uint64_t offset;
+
+ // avoid RefCountedObject because it has a virtual destructor
+ std::atomic<uint16_t> nrefs;
+ void get() { ++nrefs; }
+ void put() { if (--nrefs == 0) delete this; }
+
+ typedef boost::intrusive_ptr<Page> Ref;
+ friend void intrusive_ptr_add_ref(Page *p) { p->get(); }
+ friend void intrusive_ptr_release(Page *p) { p->put(); }
+
+ // key-value comparison functor for avl
+ struct Less {
+ bool operator()(uint64_t offset, const Page &page) const {
+ return offset < page.offset;
+ }
+ bool operator()(const Page &page, uint64_t offset) const {
+ return page.offset < offset;
+ }
+ bool operator()(const Page &lhs, const Page &rhs) const {
+ return lhs.offset < rhs.offset;
+ }
+ };
+ void encode(ceph::buffer::list &bl, size_t page_size) const {
+ using ceph::encode;
+ bl.append(ceph::buffer::copy(data, page_size));
+ encode(offset, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &p, size_t page_size) {
+ using ceph::decode;
+ p.copy(page_size, data);
+ decode(offset, p);
+ }
+
+ static Ref create(size_t page_size, uint64_t offset = 0) {
+ // ensure proper alignment of the Page
+ const auto align = alignof(Page);
+ page_size = (page_size + align - 1) & ~(align - 1);
+ // allocate the Page and its data in a single buffer
+ auto buffer = new char[page_size + sizeof(Page)];
+ // place the Page structure at the end of the buffer
+ return new (buffer + page_size) Page(buffer, offset);
+ }
+
+ // copy disabled
+ Page(const Page&) = delete;
+ const Page& operator=(const Page&) = delete;
+
+ private: // private constructor, use create() instead
+ Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {}
+
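+  // create() placed this Page at the tail of its data buffer, so freeing
+  // 'data' releases the page contents and the Page object together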
+ static void operator delete(void *p) {
+ delete[] reinterpret_cast<Page*>(p)->data;
+ }
+};
+
+class PageSet {
+ public:
+ // alloc_range() and get_range() return page refs in a vector
+ typedef std::vector<Page::Ref> page_vector;
+
+ private:
+ // store pages in a boost intrusive avl_set
+ typedef Page::Less page_cmp;
+ typedef boost::intrusive::member_hook<Page,
+ boost::intrusive::avl_set_member_hook<>,
+ &Page::hook> member_option;
+ typedef boost::intrusive::avl_set<Page,
+ boost::intrusive::compare<page_cmp>, member_option> page_set;
+
+ typedef typename page_set::iterator iterator;
+
+ page_set pages;
+ uint64_t page_size;
+
+ typedef std::mutex lock_type;
+ lock_type mutex;
+
+ void free_pages(iterator cur, iterator end) {
+ while (cur != end) {
+ Page *page = &*cur;
+ cur = pages.erase(cur);
+ page->put();
+ }
+ }
+
+ int count_pages(uint64_t offset, uint64_t len) const {
+ // count the overlapping pages
+ int count = 0;
+ if (offset % page_size) {
+ count++;
+ size_t rem = page_size - offset % page_size;
+ len = len <= rem ? 0 : len - rem;
+ }
+ count += len / page_size;
+ if (len % page_size)
+ count++;
+ return count;
+ }
+
+ public:
+ explicit PageSet(size_t page_size) : page_size(page_size) {}
+ PageSet(PageSet &&rhs)
+ : pages(std::move(rhs.pages)), page_size(rhs.page_size) {}
+ ~PageSet() {
+ free_pages(pages.begin(), pages.end());
+ }
+
+ // disable copy
+ PageSet(const PageSet&) = delete;
+ const PageSet& operator=(const PageSet&) = delete;
+
+ bool empty() const { return pages.empty(); }
+ size_t size() const { return pages.size(); }
+ size_t get_page_size() const { return page_size; }
+
+ // allocate all pages that intersect the range [offset,length)
+ void alloc_range(uint64_t offset, uint64_t length, page_vector &range) {
+ // loop in reverse so we can provide hints to avl_set::insert_check()
+ // and get O(1) insertions after the first
+ uint64_t position = offset + length - 1;
+
+ range.resize(count_pages(offset, length));
+ auto out = range.rbegin();
+
+ std::lock_guard<lock_type> lock(mutex);
+ iterator cur = pages.end();
+ while (length) {
+ const uint64_t page_offset = position & ~(page_size-1);
+
+ typename page_set::insert_commit_data commit;
+ auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit);
+ if (insert.second) {
+ auto page = Page::create(page_size, page_offset);
+ cur = pages.insert_commit(*page, commit);
+
+ // assume that the caller will write to the range [offset,length),
+ // so we only need to zero memory outside of this range
+
+ // zero end of page past offset + length
+ if (offset + length < page->offset + page_size)
+ std::fill(page->data + offset + length - page->offset,
+ page->data + page_size, 0);
+ // zero front of page between page_offset and offset
+ if (offset > page->offset)
+ std::fill(page->data, page->data + offset - page->offset, 0);
+ } else { // exists
+ cur = insert.first;
+ }
+ // add a reference to output vector
+ out->reset(&*cur);
+ ++out;
+
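+      // consume the in-range bytes of this page, then step back to the
+      // previous page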
+ auto c = std::min(length, (position & (page_size-1)) + 1);
+ position -= c;
+ length -= c;
+ }
+ // make sure we sized the vector correctly
+ ceph_assert(out == range.rend());
+ }
+
+ // return all allocated pages that intersect the range [offset,length)
+ void get_range(uint64_t offset, uint64_t length, page_vector &range) {
+ auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+ while (cur != pages.end() && cur->offset < offset + length)
+ range.push_back(&*cur++);
+ }
+
+ void free_pages_after(uint64_t offset) {
+ std::lock_guard<lock_type> lock(mutex);
+ auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+ if (cur == pages.end())
+ return;
+ if (cur->offset < offset)
+ cur++;
+ free_pages(cur, pages.end());
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ using ceph::encode;
+ encode(page_size, bl);
+ unsigned count = pages.size();
+ encode(count, bl);
+ for (auto p = pages.rbegin(); p != pages.rend(); ++p)
+ p->encode(bl, page_size);
+ }
+ void decode(ceph::buffer::list::const_iterator &p) {
+ using ceph::decode;
+ ceph_assert(empty());
+ decode(page_size, p);
+ unsigned count;
+ decode(count, p);
+ auto cur = pages.end();
+ for (unsigned i = 0; i < count; i++) {
+ auto page = Page::create(page_size);
+ page->decode(p, page_size);
+ cur = pages.insert_before(cur, *page);
+ }
+ }
+};
+
+#endif // CEPH_PAGESET_H
diff --git a/src/osd/CMakeLists.txt b/src/osd/CMakeLists.txt
new file mode 100644
index 000000000..373456fc6
--- /dev/null
+++ b/src/osd/CMakeLists.txt
@@ -0,0 +1,75 @@
+set(osdc_osd_srcs
+ ${CMAKE_SOURCE_DIR}/src/osdc/Objecter.cc
+ ${CMAKE_SOURCE_DIR}/src/osdc/Striper.cc)
+
+if(WITH_OSD_INSTRUMENT_FUNCTIONS AND CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+ add_compile_options(
+ -finstrument-functions
+ -finstrument-functions-exclude-function-list=_mm_loadu_si128,_mm_cmpeq_epi32,_mm_movemask_epi8)
+ set(osd_cyg_functions_src ${CMAKE_SOURCE_DIR}/src/tracing/cyg_profile_functions.c)
+endif()
+
+set(osd_srcs
+ OSD.cc
+ pg_scrubber.cc
+ scrub_machine.cc
+ PrimaryLogScrub.cc
+ Watch.cc
+ ClassHandler.cc
+ PG.cc
+ PGLog.cc
+ PrimaryLogPG.cc
+ ReplicatedBackend.cc
+ ECBackend.cc
+ ECTransaction.cc
+ PGBackend.cc
+ OSDCap.cc
+ Session.cc
+ SnapMapper.cc
+ ScrubStore.cc
+ osd_types.cc
+ ECUtil.cc
+ ExtentCache.cc
+ scheduler/OpScheduler.cc
+ scheduler/OpSchedulerItem.cc
+ scheduler/mClockScheduler.cc
+ PeeringState.cc
+ PGStateUtils.cc
+ recovery_types.cc
+ MissingLoc.cc
+ osd_perf_counters.cc
+ ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
+ ${CMAKE_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
+ ${osd_cyg_functions_src}
+ ${osdc_osd_srcs})
+if(HAS_VTA)
+  set_source_files_properties(OSDCap.cc
+ PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+add_library(osd STATIC ${osd_srcs})
+target_link_libraries(osd
+ PUBLIC dmclock::dmclock Boost::MPL
+ PRIVATE os heap_profiler cpu_profiler fmt::fmt ${CMAKE_DL_LIBS})
+if(WITH_LTTNG)
+ add_dependencies(osd osd-tp pg-tp)
+endif()
+if(WITH_EVENTTRACE)
+ add_dependencies(osd eventtrace_tp)
+endif()
+if(WITH_OSD_INSTRUMENT_FUNCTIONS)
+ add_dependencies(osd cyg_profile_tp)
+endif()
+
+# libcls_* are runtime dependencies
+add_dependencies(osd cls_journal cls_hello cls_lock cls_log cls_numops
+ cls_refcount cls_timeindex cls_user cls_version cls_cas cls_cmpomap)
+if(WITH_CEPHFS)
+ add_dependencies(osd cls_cephfs)
+endif()
+if(WITH_RBD)
+ add_dependencies(osd cls_rbd)
+endif()
+if(WITH_RADOSGW)
+ add_dependencies(osd cls_otp cls_rgw cls_queue cls_rgw_gc cls_2pc_queue cls_fifo)
+endif()
diff --git a/src/osd/ClassHandler.cc b/src/osd/ClassHandler.cc
new file mode 100644
index 000000000..d1e726408
--- /dev/null
+++ b/src/osd/ClassHandler.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "ClassHandler.h"
+#include "common/errno.h"
+#include "common/ceph_context.h"
+#include "include/dlfcn_compat.h"
+
+#include <map>
+
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#endif
+
+#include "common/config.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout
+
+
+#define CLS_PREFIX "libcls_"
+#define CLS_SUFFIX SHARED_LIB_SUFFIX
+
+using std::map;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+
+
+int ClassHandler::open_class(const string& cname, ClassData **pcls)
+{
+ std::lock_guard lock(mutex);
+ ClassData *cls = _get_class(cname, true);
+ if (!cls)
+ return -EPERM;
+ if (cls->status != ClassData::CLASS_OPEN) {
+ int r = _load_class(cls);
+ if (r)
+ return r;
+ }
+ *pcls = cls;
+ return 0;
+}
+
+int ClassHandler::open_all_classes()
+{
+ ldout(cct, 10) << __func__ << dendl;
+ DIR *dir = ::opendir(cct->_conf->osd_class_dir.c_str());
+ if (!dir)
+ return -errno;
+
+ struct dirent *pde = nullptr;
+ int r = 0;
+ while ((pde = ::readdir(dir))) {
+ if (pde->d_name[0] == '.')
+ continue;
+ if (strlen(pde->d_name) > sizeof(CLS_PREFIX) - 1 + sizeof(CLS_SUFFIX) - 1 &&
+ strncmp(pde->d_name, CLS_PREFIX, sizeof(CLS_PREFIX) - 1) == 0 &&
+ strcmp(pde->d_name + strlen(pde->d_name) - (sizeof(CLS_SUFFIX) - 1), CLS_SUFFIX) == 0) {
+ char cname[PATH_MAX + 1];
+ strncpy(cname, pde->d_name + sizeof(CLS_PREFIX) - 1, sizeof(cname) -1);
+ cname[strlen(cname) - (sizeof(CLS_SUFFIX) - 1)] = '\0';
+ ldout(cct, 10) << __func__ << " found " << cname << dendl;
+ ClassData *cls;
+ // skip classes that aren't in 'osd class load list'
+ r = open_class(cname, &cls);
+ if (r < 0 && r != -EPERM)
+ goto out;
+ }
+ }
+ out:
+ closedir(dir);
+ return r;
+}
+
+void ClassHandler::shutdown()
+{
+ for (auto& cls : classes) {
+ if (cls.second.handle) {
+ dlclose(cls.second.handle);
+ }
+ }
+ classes.clear();
+}
+
+/*
+ * Check if @cname is in the whitespace delimited list @list, or the @list
+ * contains the wildcard "*".
+ *
+ * This is expensive but doesn't consume memory for an index, and is performed
+ * only once when a class is loaded.
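+ *
+ * For example, with list "cephfs rbd rgw", in_class_list("rbd", list)
+ * returns true and in_class_list("lock", list) returns false; any cname
+ * matches when the list contains a "*" token.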
+ */
+bool ClassHandler::in_class_list(const std::string& cname,
+ const std::string& list)
+{
+ std::istringstream ss(list);
+ std::istream_iterator<std::string> begin{ss};
+ std::istream_iterator<std::string> end{};
+
+ const std::vector<std::string> targets{cname, "*"};
+
+ auto it = std::find_first_of(begin, end,
+ targets.begin(), targets.end());
+
+ return it != end;
+}
+
+ClassHandler::ClassData *ClassHandler::_get_class(const string& cname,
+ bool check_allowed)
+{
+ ClassData *cls;
+ map<string, ClassData>::iterator iter = classes.find(cname);
+
+ if (iter != classes.end()) {
+ cls = &iter->second;
+ } else {
+ if (check_allowed && !in_class_list(cname, cct->_conf->osd_class_load_list)) {
+ ldout(cct, 0) << "_get_class not permitted to load " << cname << dendl;
+ return NULL;
+ }
+ cls = &classes[cname];
+ ldout(cct, 10) << "_get_class adding new class name " << cname << " " << cls << dendl;
+ cls->name = cname;
+ cls->handler = this;
+ cls->allowed = in_class_list(cname, cct->_conf->osd_class_default_list);
+ }
+ return cls;
+}
+
+int ClassHandler::_load_class(ClassData *cls)
+{
+ // already open
+ if (cls->status == ClassData::CLASS_OPEN)
+ return 0;
+
+ if (cls->status == ClassData::CLASS_UNKNOWN ||
+ cls->status == ClassData::CLASS_MISSING) {
+ char fname[PATH_MAX];
+ snprintf(fname, sizeof(fname), "%s/" CLS_PREFIX "%s" CLS_SUFFIX,
+ cct->_conf->osd_class_dir.c_str(),
+ cls->name.c_str());
+ ldout(cct, 10) << "_load_class " << cls->name << " from " << fname << dendl;
+
+ cls->handle = dlopen(fname, RTLD_NOW);
+ if (!cls->handle) {
+ struct stat st;
+ int r = ::stat(fname, &st);
+ if (r < 0) {
+ r = -errno;
+ ldout(cct, 0) << __func__ << " could not stat class " << fname
+ << ": " << cpp_strerror(r) << dendl;
+ } else {
+ ldout(cct, 0) << "_load_class could not open class " << fname
+ << " (dlopen failed): " << dlerror() << dendl;
+ r = -EIO;
+ }
+ cls->status = ClassData::CLASS_MISSING;
+ return r;
+ }
+
+ cls_deps_t *(*cls_deps)();
+ cls_deps = (cls_deps_t *(*)())dlsym(cls->handle, "class_deps");
+ if (cls_deps) {
+ cls_deps_t *deps = cls_deps();
+ while (deps) {
+ if (!deps->name)
+ break;
+ ClassData *cls_dep = _get_class(deps->name, false);
+ cls->dependencies.insert(cls_dep);
+ if (cls_dep->status != ClassData::CLASS_OPEN)
+ cls->missing_dependencies.insert(cls_dep);
+ deps++;
+ }
+ }
+ }
+
+ // resolve dependencies
+ set<ClassData*>::iterator p = cls->missing_dependencies.begin();
+ while (p != cls->missing_dependencies.end()) {
+ ClassData *dc = *p;
+ int r = _load_class(dc);
+ if (r < 0) {
+ cls->status = ClassData::CLASS_MISSING_DEPS;
+ return r;
+ }
+
+ ldout(cct, 10) << "_load_class " << cls->name << " satisfied dependency " << dc->name << dendl;
+ cls->missing_dependencies.erase(p++);
+ }
+
+ // initialize
+ void (*cls_init)() = (void (*)())dlsym(cls->handle, "__cls_init");
+ if (cls_init) {
+ cls->status = ClassData::CLASS_INITIALIZING;
+ cls_init();
+ }
+
+ ldout(cct, 10) << "_load_class " << cls->name << " success" << dendl;
+ cls->status = ClassData::CLASS_OPEN;
+ return 0;
+}
+
+
+
+ClassHandler::ClassData *ClassHandler::register_class(const char *cname)
+{
+ ceph_assert(ceph_mutex_is_locked(mutex));
+
+ ClassData *cls = _get_class(cname, false);
+ ldout(cct, 10) << "register_class " << cname << " status " << cls->status << dendl;
+
+ if (cls->status != ClassData::CLASS_INITIALIZING) {
+ ldout(cct, 0) << "class " << cname << " isn't loaded; is the class registering under the wrong name?" << dendl;
+ return NULL;
+ }
+ return cls;
+}
+
+void ClassHandler::unregister_class(ClassHandler::ClassData *cls)
+{
+ /* FIXME: do we really need this one? */
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::register_method(const char *mname,
+ int flags,
+ cls_method_call_t func)
+{
+ /* no need for locking, called under the class_init mutex */
+ if (!flags) {
+ lderr(handler->cct) << "register_method " << name << "." << mname
+ << " flags " << flags << " " << (void*)func
+ << " FAILED -- flags must be non-zero" << dendl;
+ return NULL;
+ }
+ ldout(handler->cct, 10) << "register_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl;
+ [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this);
+ return &method->second;
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::register_cxx_method(const char *mname,
+ int flags,
+ cls_method_cxx_call_t func)
+{
+ /* no need for locking, called under the class_init mutex */
+ ldout(handler->cct, 10) << "register_cxx_method " << name << "." << mname << " flags " << flags << " " << (void*)func << dendl;
+ [[maybe_unused]] auto [method, added] = methods_map.try_emplace(mname, mname, func, flags, this);
+ return &method->second;
+}
+
+ClassHandler::ClassFilter *ClassHandler::ClassData::register_cxx_filter(
+ const std::string &filter_name,
+ cls_cxx_filter_factory_t fn)
+{
+ ClassFilter &filter = filters_map[filter_name];
+ filter.fn = fn;
+ filter.name = filter_name;
+ filter.cls = this;
+ return &filter;
+}
+
+ClassHandler::ClassMethod *ClassHandler::ClassData::_get_method(
+ const std::string& mname)
+{
+ if (auto iter = methods_map.find(mname); iter != methods_map.end()) {
+ return &(iter->second);
+ } else {
+ return nullptr;
+ }
+}
+
+int ClassHandler::ClassData::get_method_flags(const std::string& mname)
+{
+ std::lock_guard l(handler->mutex);
+ ClassMethod *method = _get_method(mname);
+ if (!method)
+ return -ENOENT;
+ return method->flags;
+}
+
+void ClassHandler::ClassData::unregister_method(ClassHandler::ClassMethod *method)
+{
+ /* no need for locking, called under the class_init mutex */
+ map<string, ClassMethod>::iterator iter = methods_map.find(method->name);
+ if (iter == methods_map.end())
+ return;
+ methods_map.erase(iter);
+}
+
+void ClassHandler::ClassMethod::unregister()
+{
+ cls->unregister_method(this);
+}
+
+void ClassHandler::ClassData::unregister_filter(ClassHandler::ClassFilter *filter)
+{
+ /* no need for locking, called under the class_init mutex */
+ map<string, ClassFilter>::iterator iter = filters_map.find(filter->name);
+ if (iter == filters_map.end())
+ return;
+ filters_map.erase(iter);
+}
+
+void ClassHandler::ClassFilter::unregister()
+{
+ cls->unregister_filter(this);
+}
+
+int ClassHandler::ClassMethod::exec(cls_method_context_t ctx, bufferlist& indata, bufferlist& outdata)
+{
+ int ret = 0;
+ std::visit([&](auto method) {
+ using method_t = decltype(method);
+ if constexpr (std::is_same_v<method_t, cls_method_cxx_call_t>) {
+ // C++ call version
+ ret = method(ctx, &indata, &outdata);
+ } else if constexpr (std::is_same_v<method_t, cls_method_call_t>) {
+ // C version
+ char *out = nullptr;
+ int olen = 0;
+ ret = method(ctx, indata.c_str(), indata.length(), &out, &olen);
+ if (out) {
+ // assume *out was allocated via cls_alloc (which calls malloc!)
+ ceph::buffer::ptr bp = ceph::buffer::claim_malloc(olen, out);
+ outdata.push_back(bp);
+ }
+ } else {
+ static_assert(std::is_same_v<method_t, void>);
+ }
+ }, func);
+ return ret;
+}
+
+ClassHandler& ClassHandler::get_instance()
+{
+#ifdef WITH_SEASTAR
+ // the context is being used solely for:
+ // 1. random number generation (cls_gen_random_bytes)
+ // 2. accessing the configuration
+ // 3. logging
+ static CephContext cct;
+ static ClassHandler single(&cct);
+#else
+ static ClassHandler single(g_ceph_context);
+#endif // WITH_SEASTAR
+ return single;
+}
diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h
new file mode 100644
index 000000000..fff61d5d2
--- /dev/null
+++ b/src/osd/ClassHandler.h
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_CLASSHANDLER_H
+#define CEPH_CLASSHANDLER_H
+
+#include <variant>
+
+#include "include/types.h"
+#include "include/common_fwd.h"
+#include "common/ceph_mutex.h"
+#include "objclass/objclass.h"
+
+// Loads rados object classes (libcls_* shared objects) and tracks their
+// registered methods and filters.
+class ClassHandler
+{
+public:
+ CephContext *cct;
+ struct ClassData;
+
+ struct ClassMethod {
+ const std::string name;
+ using func_t = std::variant<cls_method_cxx_call_t, cls_method_call_t>;
+ func_t func;
+ int flags = 0;
+ ClassData *cls = nullptr;
+
+ int exec(cls_method_context_t ctx,
+ ceph::bufferlist& indata,
+ ceph::bufferlist& outdata);
+ void unregister();
+
+ int get_flags() {
+ std::lock_guard l(cls->handler->mutex);
+ return flags;
+ }
+ ClassMethod(const char* name, func_t call, int flags, ClassData* cls)
+ : name{name}, func{call}, flags{flags}, cls{cls}
+ {}
+ };
+
+ struct ClassFilter {
+ ClassData *cls = nullptr;
+ std::string name;
+ cls_cxx_filter_factory_t fn = nullptr;
+
+ void unregister();
+ };
+
+ struct ClassData {
+ enum Status {
+ CLASS_UNKNOWN,
+ CLASS_MISSING, // missing
+ CLASS_MISSING_DEPS, // missing dependencies
+ CLASS_INITIALIZING, // calling init() right now
+ CLASS_OPEN, // initialized, usable
+ } status = CLASS_UNKNOWN;
+
+ std::string name;
+ ClassHandler *handler = nullptr;
+ void *handle = nullptr;
+
+ bool allowed = false;
+
+ std::map<std::string, ClassMethod> methods_map;
+ std::map<std::string, ClassFilter> filters_map;
+
+ std::set<ClassData *> dependencies; /* our dependencies */
+ std::set<ClassData *> missing_dependencies; /* only missing dependencies */
+
+ ClassMethod *_get_method(const std::string& mname);
+
+ ClassMethod *register_method(const char *mname,
+ int flags,
+ cls_method_call_t func);
+ ClassMethod *register_cxx_method(const char *mname,
+ int flags,
+ cls_method_cxx_call_t func);
+ void unregister_method(ClassMethod *method);
+
+ ClassFilter *register_cxx_filter(const std::string &filter_name,
+ cls_cxx_filter_factory_t fn);
+ void unregister_filter(ClassFilter *method);
+
+ ClassMethod *get_method(const std::string& mname) {
+ std::lock_guard l(handler->mutex);
+ return _get_method(mname);
+ }
+ int get_method_flags(const std::string& mname);
+
+ ClassFilter *get_filter(const std::string &filter_name) {
+ std::lock_guard l(handler->mutex);
+ if (auto i = filters_map.find(filter_name); i == filters_map.end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ };
+
+private:
+ std::map<std::string, ClassData> classes;
+
+ ClassData *_get_class(const std::string& cname, bool check_allowed);
+ int _load_class(ClassData *cls);
+
+ static bool in_class_list(const std::string& cname,
+ const std::string& list);
+
+ ceph::mutex mutex = ceph::make_mutex("ClassHandler");
+
+public:
+ explicit ClassHandler(CephContext *cct) : cct(cct) {}
+
+ int open_all_classes();
+ int open_class(const std::string& cname, ClassData **pcls);
+
+ ClassData *register_class(const char *cname);
+ void unregister_class(ClassData *cls);
+
+ void shutdown();
+
+ static ClassHandler& get_instance();
+};
+
+
+#endif
diff --git a/src/osd/DynamicPerfStats.h b/src/osd/DynamicPerfStats.h
new file mode 100644
index 000000000..1c6c26c71
--- /dev/null
+++ b/src/osd/DynamicPerfStats.h
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef DYNAMIC_PERF_STATS_H
+#define DYNAMIC_PERF_STATS_H
+
+#include "include/random.h"
+#include "messages/MOSDOp.h"
+#include "mgr/OSDPerfMetricTypes.h"
+#include "osd/OSD.h"
+#include "osd/OpRequest.h"
+
+class DynamicPerfStats {
+public:
+ DynamicPerfStats() {
+ }
+
+ DynamicPerfStats(const std::list<OSDPerfMetricQuery> &queries) {
+ for (auto &query : queries) {
+ data[query];
+ }
+ }
+
+ void merge(const DynamicPerfStats &dps) {
+ for (auto &query_it : dps.data) {
+ auto &query = query_it.first;
+ for (auto &key_it : query_it.second) {
+ auto &key = key_it.first;
+ auto counter_it = key_it.second.begin();
+ auto update_counter_fnc =
+ [&counter_it](const PerformanceCounterDescriptor &d,
+ PerformanceCounter *c) {
+ c->first += counter_it->first;
+ c->second += counter_it->second;
+ counter_it++;
+ };
+
+ ceph_assert(key_it.second.size() >= data[query][key].size());
+ query.update_counters(update_counter_fnc, &data[query][key]);
+ }
+ }
+ }
+
+ void set_queries(const std::list<OSDPerfMetricQuery> &queries) {
+ std::map<OSDPerfMetricQuery,
+ std::map<OSDPerfMetricKey, PerformanceCounters>> new_data;
+ for (auto &query : queries) {
+ std::swap(new_data[query], data[query]);
+ }
+ std::swap(data, new_data);
+ }
+
+ bool is_enabled() {
+ return !data.empty();
+ }
+
+ void add(const OSDService *osd, const pg_info_t &pg_info, const OpRequest& op,
+ uint64_t inb, uint64_t outb, const utime_t &latency) {
+
+ auto update_counter_fnc =
+ [&op, inb, outb, &latency](const PerformanceCounterDescriptor &d,
+ PerformanceCounter *c) {
+ ceph_assert(d.is_supported());
+
+ switch(d.type) {
+ case PerformanceCounterType::OPS:
+ c->first++;
+ return;
+ case PerformanceCounterType::WRITE_OPS:
+ if (op.may_write() || op.may_cache()) {
+ c->first++;
+ }
+ return;
+ case PerformanceCounterType::READ_OPS:
+ if (op.may_read()) {
+ c->first++;
+ }
+ return;
+ case PerformanceCounterType::BYTES:
+ c->first += inb + outb;
+ return;
+ case PerformanceCounterType::WRITE_BYTES:
+ if (op.may_write() || op.may_cache()) {
+ c->first += inb;
+ }
+ return;
+ case PerformanceCounterType::READ_BYTES:
+ if (op.may_read()) {
+ c->first += outb;
+ }
+ return;
+ case PerformanceCounterType::LATENCY:
+ c->first += latency.to_nsec();
+ c->second++;
+ return;
+ case PerformanceCounterType::WRITE_LATENCY:
+ if (op.may_write() || op.may_cache()) {
+ c->first += latency.to_nsec();
+ c->second++;
+ }
+ return;
+ case PerformanceCounterType::READ_LATENCY:
+ if (op.may_read()) {
+ c->first += latency.to_nsec();
+ c->second++;
+ }
+ return;
+ default:
+ ceph_abort_msg("unknown counter type");
+ }
+ };
+
+ auto get_subkey_fnc =
+ [&osd, &pg_info, &op](const OSDPerfMetricSubKeyDescriptor &d,
+ OSDPerfMetricSubKey *sub_key) {
+ ceph_assert(d.is_supported());
+
+ auto m = op.get_req<MOSDOp>();
+ std::string match_string;
+ switch(d.type) {
+ case OSDPerfMetricSubKeyType::CLIENT_ID:
+ match_string = stringify(m->get_reqid().name);
+ break;
+ case OSDPerfMetricSubKeyType::CLIENT_ADDRESS:
+ match_string = stringify(m->get_connection()->get_peer_addr());
+ break;
+ case OSDPerfMetricSubKeyType::POOL_ID:
+ match_string = stringify(m->get_spg().pool());
+ break;
+ case OSDPerfMetricSubKeyType::NAMESPACE:
+ match_string = m->get_hobj().nspace;
+ break;
+ case OSDPerfMetricSubKeyType::OSD_ID:
+ match_string = stringify(osd->get_nodeid());
+ break;
+ case OSDPerfMetricSubKeyType::PG_ID:
+ match_string = stringify(pg_info.pgid);
+ break;
+ case OSDPerfMetricSubKeyType::OBJECT_NAME:
+ match_string = m->get_oid().name;
+ break;
+ case OSDPerfMetricSubKeyType::SNAP_ID:
+ match_string = stringify(m->get_snapid());
+ break;
+ default:
+ ceph_abort_msg("unknown counter type");
+ }
+
+ std::smatch match;
+ if (!std::regex_search(match_string, match, d.regex)) {
+ return false;
+ }
+ if (match.size() <= 1) {
+ return false;
+ }
+ for (size_t i = 1; i < match.size(); i++) {
+ sub_key->push_back(match[i].str());
+ }
+ return true;
+ };
+
+ for (auto &it : data) {
+ auto &query = it.first;
+ OSDPerfMetricKey key;
+ if (query.get_key(get_subkey_fnc, &key)) {
+ query.update_counters(update_counter_fnc, &it.second[key]);
+ }
+ }
+ }
+
+ void add_to_reports(
+ const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &limits,
+ std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
+ for (auto &it : data) {
+ auto &query = it.first;
+ auto limit_it = limits.find(query);
+ if (limit_it == limits.end()) {
+ continue;
+ }
+ auto &query_limits = limit_it->second;
+ auto &counters = it.second;
+ auto &report = (*reports)[query];
+
+ query.get_performance_counter_descriptors(
+ &report.performance_counter_descriptors);
+
+ auto &descriptors = report.performance_counter_descriptors;
+ ceph_assert(descriptors.size() > 0);
+
+ if (!is_limited(query_limits, counters.size())) {
+ for (auto &it_counters : counters) {
+ auto &bl = report.group_packed_performance_counters[it_counters.first];
+ query.pack_counters(it_counters.second, &bl);
+ }
+ continue;
+ }
+
+ for (auto &limit : query_limits) {
+ size_t index = 0;
+ for (; index < descriptors.size(); index++) {
+ if (descriptors[index] == limit.order_by) {
+ break;
+ }
+ }
+ if (index == descriptors.size()) {
+ // should not happen
+ continue;
+ }
+
+ // Weighted Random Sampling (Algorithm A-Chao):
+ // Select the first [0, max_count) samples, randomly replace
+ // with samples from [max_count, end) using weighted
+ // probability, and return [0, max_count) as the result.
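+      // Each subsequent counter is accepted with probability proportional
+      // to its value at 'index' relative to the running weight sum; an
+      // accepted counter evicts a uniformly random entry from the
+      // max_count-sized reservoir.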
+
+ ceph_assert(limit.max_count < counters.size());
+ typedef std::map<OSDPerfMetricKey, PerformanceCounters>::iterator
+ Iterator;
+ std::vector<Iterator> counter_iterators;
+ counter_iterators.reserve(limit.max_count);
+
+ Iterator it_counters = counters.begin();
+ uint64_t wsum = 0;
+ for (size_t i = 0; i < limit.max_count; i++) {
+ wsum += it_counters->second[index].first;
+ counter_iterators.push_back(it_counters++);
+ }
+ for (; it_counters != counters.end(); it_counters++) {
+ wsum += it_counters->second[index].first;
+ if (ceph::util::generate_random_number(0, wsum) <=
+ it_counters->second[index].first) {
+ auto i = ceph::util::generate_random_number(0, limit.max_count - 1);
+ counter_iterators[i] = it_counters;
+ }
+ }
+
+ for (auto it_counters : counter_iterators) {
+ auto &bl =
+ report.group_packed_performance_counters[it_counters->first];
+ if (bl.length() == 0) {
+ query.pack_counters(it_counters->second, &bl);
+ }
+ }
+ }
+ }
+ }
+
+private:
+ static bool is_limited(const OSDPerfMetricLimits &limits,
+ size_t counters_size) {
+ if (limits.empty()) {
+ return false;
+ }
+
+ for (auto &limit : limits) {
+ if (limit.max_count >= counters_size) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ std::map<OSDPerfMetricQuery,
+ std::map<OSDPerfMetricKey, PerformanceCounters>> data;
+};
+
+#endif // DYNAMIC_PERF_STATS_H
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
new file mode 100644
index 000000000..b13a99fbc
--- /dev/null
+++ b/src/osd/ECBackend.cc
@@ -0,0 +1,2637 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include "ECBackend.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "ECMsgTypes.h"
+
+#include "PrimaryLogPG.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::dec;
+using std::hex;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::ostream;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferhash;
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::Formatter;
+
+static ostream& _prefix(std::ostream *_dout, ECBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+struct ECRecoveryHandle : public PGBackend::RecoveryHandle {
+ list<ECBackend::RecoveryOp> ops;
+};
+
+ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs) {
+ switch (rhs.pipeline_state) {
+ case ECBackend::pipeline_state_t::CACHE_VALID:
+ return lhs << "CACHE_VALID";
+ case ECBackend::pipeline_state_t::CACHE_INVALID:
+ return lhs << "CACHE_INVALID";
+ default:
+ ceph_abort_msg("invalid pipeline state");
+ }
+ return lhs; // unreachable
+}
+
+static ostream &operator<<(ostream &lhs, const map<pg_shard_t, bufferlist> &rhs)
+{
+ lhs << "[";
+ for (map<pg_shard_t, bufferlist>::const_iterator i = rhs.begin();
+ i != rhs.end();
+ ++i) {
+ if (i != rhs.begin())
+ lhs << ", ";
+ lhs << make_pair(i->first, i->second.length());
+ }
+ return lhs << "]";
+}
+
+static ostream &operator<<(ostream &lhs, const map<int, bufferlist> &rhs)
+{
+ lhs << "[";
+ for (map<int, bufferlist>::const_iterator i = rhs.begin();
+ i != rhs.end();
+ ++i) {
+ if (i != rhs.begin())
+ lhs << ", ";
+ lhs << make_pair(i->first, i->second.length());
+ }
+ return lhs << "]";
+}
+
+static ostream &operator<<(
+ ostream &lhs,
+ const boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &rhs)
+{
+ return lhs << "(" << rhs.get<0>() << ", "
+ << rhs.get<1>() << ", " << rhs.get<2>() << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::read_request_t &rhs)
+{
+ return lhs << "read_request_t(to_read=[" << rhs.to_read << "]"
+ << ", need=" << rhs.need
+ << ", want_attrs=" << rhs.want_attrs
+ << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::read_result_t &rhs)
+{
+ lhs << "read_result_t(r=" << rhs.r
+ << ", errors=" << rhs.errors;
+ if (rhs.attrs) {
+ lhs << ", attrs=" << *(rhs.attrs);
+ } else {
+ lhs << ", noattrs";
+ }
+ return lhs << ", returned=" << rhs.returned << ")";
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs)
+{
+ lhs << "ReadOp(tid=" << rhs.tid;
+ if (rhs.op && rhs.op->get_req()) {
+ lhs << ", op=";
+ rhs.op->get_req()->print(lhs);
+ }
+ return lhs << ", to_read=" << rhs.to_read
+ << ", complete=" << rhs.complete
+ << ", priority=" << rhs.priority
+ << ", obj_to_source=" << rhs.obj_to_source
+ << ", source_to_obj=" << rhs.source_to_obj
+ << ", in_progress=" << rhs.in_progress << ")";
+}
+
+void ECBackend::ReadOp::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ if (op && op->get_req()) {
+ f->dump_stream("op") << *(op->get_req());
+ }
+ f->dump_stream("to_read") << to_read;
+ f->dump_stream("complete") << complete;
+ f->dump_int("priority", priority);
+ f->dump_stream("obj_to_source") << obj_to_source;
+ f->dump_stream("source_to_obj") << source_to_obj;
+ f->dump_stream("in_progress") << in_progress;
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::Op &rhs)
+{
+ lhs << "Op(" << rhs.hoid
+ << " v=" << rhs.version
+ << " tt=" << rhs.trim_to
+ << " tid=" << rhs.tid
+ << " reqid=" << rhs.reqid;
+ if (rhs.client_op && rhs.client_op->get_req()) {
+ lhs << " client_op=";
+ rhs.client_op->get_req()->print(lhs);
+ }
+ lhs << " roll_forward_to=" << rhs.roll_forward_to
+ << " temp_added=" << rhs.temp_added
+ << " temp_cleared=" << rhs.temp_cleared
+ << " pending_read=" << rhs.pending_read
+ << " remote_read=" << rhs.remote_read
+ << " remote_read_result=" << rhs.remote_read_result
+ << " pending_apply=" << rhs.pending_apply
+ << " pending_commit=" << rhs.pending_commit
+ << " plan.to_read=" << rhs.plan.to_read
+ << " plan.will_write=" << rhs.plan.will_write
+ << ")";
+ return lhs;
+}
+
+ostream &operator<<(ostream &lhs, const ECBackend::RecoveryOp &rhs)
+{
+ return lhs << "RecoveryOp("
+ << "hoid=" << rhs.hoid
+ << " v=" << rhs.v
+ << " missing_on=" << rhs.missing_on
+ << " missing_on_shards=" << rhs.missing_on_shards
+ << " recovery_info=" << rhs.recovery_info
+ << " recovery_progress=" << rhs.recovery_progress
+ << " obc refcount=" << rhs.obc.use_count()
+ << " state=" << ECBackend::RecoveryOp::tostr(rhs.state)
+ << " waiting_on_pushes=" << rhs.waiting_on_pushes
+ << " extent_requested=" << rhs.extent_requested
+ << ")";
+}
+
+void ECBackend::RecoveryOp::dump(Formatter *f) const
+{
+ f->dump_stream("hoid") << hoid;
+ f->dump_stream("v") << v;
+ f->dump_stream("missing_on") << missing_on;
+ f->dump_stream("missing_on_shards") << missing_on_shards;
+ f->dump_stream("recovery_info") << recovery_info;
+ f->dump_stream("recovery_progress") << recovery_progress;
+ f->dump_stream("state") << tostr(state);
+ f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
+ f->dump_stream("extent_requested") << extent_requested;
+}
+
+ECBackend::ECBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct,
+ ErasureCodeInterfaceRef ec_impl,
+ uint64_t stripe_width)
+ : PGBackend(cct, pg, store, coll, ch),
+ ec_impl(ec_impl),
+ sinfo(ec_impl->get_data_chunk_count(), stripe_width) {
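+  // stripe_width must divide evenly into the data chunks: each of the
+  // k data chunks holds exactly stripe_width / k bytes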
+ ceph_assert((ec_impl->get_data_chunk_count() *
+ ec_impl->get_chunk_size(stripe_width)) == stripe_width);
+}
+
+PGBackend::RecoveryHandle *ECBackend::open_recovery_op()
+{
+ return new ECRecoveryHandle;
+}
+
+void ECBackend::_failed_push(const hobject_t &hoid,
+ pair<RecoveryMessages *, ECBackend::read_result_t &> &in)
+{
+ ECBackend::read_result_t &res = in.second;
+ dout(10) << __func__ << ": Read error " << hoid << " r="
+ << res.r << " errors=" << res.errors << dendl;
+ dout(10) << __func__ << ": canceling recovery op for obj " << hoid
+ << dendl;
+ ceph_assert(recovery_ops.count(hoid));
+ eversion_t v = recovery_ops[hoid].v;
+ recovery_ops.erase(hoid);
+
+ set<pg_shard_t> fl;
+ for (auto&& i : res.errors) {
+ fl.insert(i.first);
+ }
+ get_parent()->on_failed_pull(fl, hoid, v);
+}
+
+struct OnRecoveryReadComplete :
+ public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> {
+ ECBackend *pg;
+ hobject_t hoid;
+ OnRecoveryReadComplete(ECBackend *pg, const hobject_t &hoid)
+ : pg(pg), hoid(hoid) {}
+ void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override {
+ ECBackend::read_result_t &res = in.second;
+ if (!(res.r == 0 && res.errors.empty())) {
+ pg->_failed_push(hoid, in);
+ return;
+ }
+ ceph_assert(res.returned.size() == 1);
+ pg->handle_recovery_read_complete(
+ hoid,
+ res.returned.back(),
+ res.attrs,
+ in.first);
+ }
+};
+
+struct RecoveryMessages {
+ map<hobject_t,
+ ECBackend::read_request_t> reads;
+ map<hobject_t, set<int>> want_to_read;
+ void read(
+ ECBackend *ec,
+ const hobject_t &hoid, uint64_t off, uint64_t len,
+ set<int> &&_want_to_read,
+ const map<pg_shard_t, vector<pair<int, int>>> &need,
+ bool attrs) {
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ to_read.push_back(boost::make_tuple(off, len, 0));
+ ceph_assert(!reads.count(hoid));
+ want_to_read.insert(make_pair(hoid, std::move(_want_to_read)));
+ reads.insert(
+ make_pair(
+ hoid,
+ ECBackend::read_request_t(
+ to_read,
+ need,
+ attrs,
+ new OnRecoveryReadComplete(
+ ec,
+ hoid))));
+ }
+
+ map<pg_shard_t, vector<PushOp> > pushes;
+ map<pg_shard_t, vector<PushReplyOp> > push_replies;
+ ObjectStore::Transaction t;
+ RecoveryMessages() {}
+ ~RecoveryMessages() {}
+};
+
+void ECBackend::handle_recovery_push(
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair)
+{
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+ ceph_abort();
+ }
+
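+  // "oneshot" means this single push covers the whole object, so the data
+  // can be written directly to the final shard object instead of being
+  // staged in a temp recovery object and renamed into place at the end.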
+ bool oneshot = op.before_progress.first && op.after_progress.data_complete;
+ ghobject_t tobj;
+ if (oneshot) {
+ tobj = ghobject_t(op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard);
+ } else {
+ tobj = ghobject_t(get_parent()->get_temp_recovery_object(op.soid,
+ op.version),
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard);
+ if (op.before_progress.first) {
+ dout(10) << __func__ << ": Adding oid "
+ << tobj.hobj << " in the temp collection" << dendl;
+ add_temp_obj(tobj.hobj);
+ }
+ }
+
+ if (op.before_progress.first) {
+ m->t.remove(coll, tobj);
+ m->t.touch(coll, tobj);
+ }
+
+ if (!op.data_included.empty()) {
+ uint64_t start = op.data_included.range_start();
+ uint64_t end = op.data_included.range_end();
+ ceph_assert(op.data.length() == (end - start));
+
+ m->t.write(
+ coll,
+ tobj,
+ start,
+ op.data.length(),
+ op.data);
+ } else {
+ ceph_assert(op.data.length() == 0);
+ }
+
+ if (get_parent()->pg_is_remote_backfilling()) {
+ get_parent()->pg_add_local_num_bytes(op.data.length());
+ get_parent()->pg_add_num_bytes(op.data.length() * get_ec_data_chunk_count());
+ dout(10) << __func__ << " " << op.soid
+ << " add new actual data by " << op.data.length()
+ << " add new num_bytes by " << op.data.length() * get_ec_data_chunk_count()
+ << dendl;
+ }
+
+ if (op.before_progress.first) {
+ ceph_assert(op.attrset.count(string("_")));
+ m->t.setattrs(
+ coll,
+ tobj,
+ op.attrset);
+ }
+
+ if (op.after_progress.data_complete && !oneshot) {
+ dout(10) << __func__ << ": Removing oid "
+ << tobj.hobj << " from the temp collection" << dendl;
+ clear_temp_obj(tobj.hobj);
+ m->t.remove(coll, ghobject_t(
+ op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ m->t.collection_move_rename(
+ coll, tobj,
+ coll, ghobject_t(
+ op.soid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ }
+ if (op.after_progress.data_complete) {
+ if ((get_parent()->pgb_is_primary())) {
+ ceph_assert(recovery_ops.count(op.soid));
+ ceph_assert(recovery_ops[op.soid].obc);
+ if (get_parent()->pg_is_repair())
+ get_parent()->inc_osd_stat_repaired();
+ get_parent()->on_local_recover(
+ op.soid,
+ op.recovery_info,
+ recovery_ops[op.soid].obc,
+ false,
+ &m->t);
+ } else {
+ // If primary told us this is a repair, bump osd_stat_t::num_objects_repaired
+ if (is_repair)
+ get_parent()->inc_osd_stat_repaired();
+ get_parent()->on_local_recover(
+ op.soid,
+ op.recovery_info,
+ ObjectContextRef(),
+ false,
+ &m->t);
+ if (get_parent()->pg_is_remote_backfilling()) {
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(op.soid, ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard), &st);
+ if (r == 0) {
+ get_parent()->pg_sub_local_num_bytes(st.st_size);
+ // XXX: This can be way overestimated for small objects
+ get_parent()->pg_sub_num_bytes(st.st_size * get_ec_data_chunk_count());
+ dout(10) << __func__ << " " << op.soid
+ << " sub actual data by " << st.st_size
+ << " sub num_bytes by " << st.st_size * get_ec_data_chunk_count()
+ << dendl;
+ }
+ }
+ }
+ }
+ m->push_replies[get_parent()->primary_shard()].push_back(PushReplyOp());
+ m->push_replies[get_parent()->primary_shard()].back().soid = op.soid;
+}
+
+void ECBackend::handle_recovery_push_reply(
+ const PushReplyOp &op,
+ pg_shard_t from,
+ RecoveryMessages *m)
+{
+ if (!recovery_ops.count(op.soid))
+ return;
+ RecoveryOp &rop = recovery_ops[op.soid];
+ ceph_assert(rop.waiting_on_pushes.count(from));
+ rop.waiting_on_pushes.erase(from);
+ continue_recovery_op(rop, m);
+}
+
+void ECBackend::handle_recovery_read_complete(
+ const hobject_t &hoid,
+ boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
+ std::optional<map<string, bufferlist> > attrs,
+ RecoveryMessages *m)
+{
+ dout(10) << __func__ << ": returned " << hoid << " "
+ << "(" << to_read.get<0>()
+ << ", " << to_read.get<1>()
+ << ", " << to_read.get<2>()
+ << ")"
+ << dendl;
+ ceph_assert(recovery_ops.count(hoid));
+ RecoveryOp &op = recovery_ops[hoid];
+ ceph_assert(op.returned_data.empty());
+ map<int, bufferlist*> target;
+ for (set<shard_id_t>::iterator i = op.missing_on_shards.begin();
+ i != op.missing_on_shards.end();
+ ++i) {
+ target[*i] = &(op.returned_data[*i]);
+ }
+ map<int, bufferlist> from;
+ for(map<pg_shard_t, bufferlist>::iterator i = to_read.get<2>().begin();
+ i != to_read.get<2>().end();
+ ++i) {
+ from[i->first.shard] = std::move(i->second);
+ }
+ dout(10) << __func__ << ": " << from << dendl;
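+  // reconstruct the chunks for the missing shards from the chunks that
+  // were read successfully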
+ int r;
+ r = ECUtil::decode(sinfo, ec_impl, from, target);
+ ceph_assert(r == 0);
+ if (attrs) {
+ op.xattrs.swap(*attrs);
+
+ if (!op.obc) {
+      // The attrs only reference the original bufferlist (decoded from the
+      // ECSubReadReply message), which is much larger than the attrs needed
+      // for recovery. If the obc caches them (get_obc may cache the attrs),
+      // the whole original bufferlist cannot be freed until the obc is
+      // evicted from the obc cache. So rebuild the bufferlists before
+      // caching them.
+ for (map<string, bufferlist>::iterator it = op.xattrs.begin();
+ it != op.xattrs.end();
+ ++it) {
+ it->second.rebuild();
+ }
+ // Need to remove ECUtil::get_hinfo_key() since it should not leak out
+ // of the backend (see bug #12983)
+ map<string, bufferlist> sanitized_attrs(op.xattrs);
+ sanitized_attrs.erase(ECUtil::get_hinfo_key());
+ op.obc = get_parent()->get_obc(hoid, sanitized_attrs);
+ ceph_assert(op.obc);
+ op.recovery_info.size = op.obc->obs.oi.size;
+ op.recovery_info.oi = op.obc->obs.oi;
+ }
+
+ ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
+ if (op.obc->obs.oi.size > 0) {
+ ceph_assert(op.xattrs.count(ECUtil::get_hinfo_key()));
+ auto bp = op.xattrs[ECUtil::get_hinfo_key()].cbegin();
+ decode(hinfo, bp);
+ }
+ op.hinfo = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo);
+ }
+ ceph_assert(op.xattrs.size());
+ ceph_assert(op.obc);
+ continue_recovery_op(op, m);
+}
+
+struct SendPushReplies : public Context {
+ PGBackend::Listener *l;
+ epoch_t epoch;
+ map<int, MOSDPGPushReply*> replies;
+ SendPushReplies(
+ PGBackend::Listener *l,
+ epoch_t epoch,
+ map<int, MOSDPGPushReply*> &in) : l(l), epoch(epoch) {
+ replies.swap(in);
+ }
+ void finish(int) override {
+ std::vector<std::pair<int, Message*>> messages;
+ messages.reserve(replies.size());
+ for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
+ i != replies.end();
+ ++i) {
+ messages.push_back(std::make_pair(i->first, i->second));
+ }
+ if (!messages.empty()) {
+ l->send_message_osd_cluster(messages, epoch);
+ }
+ replies.clear();
+ }
+ ~SendPushReplies() override {
+ for (map<int, MOSDPGPushReply*>::iterator i = replies.begin();
+ i != replies.end();
+ ++i) {
+ i->second->put();
+ }
+ replies.clear();
+ }
+};
+
+void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
+{
+ for (map<pg_shard_t, vector<PushOp> >::iterator i = m.pushes.begin();
+ i != m.pushes.end();
+ m.pushes.erase(i++)) {
+ MOSDPGPush *msg = new MOSDPGPush();
+ msg->set_priority(priority);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
+ msg->pushes.swap(i->second);
+ msg->compute_cost(cct);
+ msg->is_repair = get_parent()->pg_is_repair();
+ get_parent()->send_message(
+ i->first.osd,
+ msg);
+ }
+ map<int, MOSDPGPushReply*> replies;
+ for (map<pg_shard_t, vector<PushReplyOp> >::iterator i =
+ m.push_replies.begin();
+ i != m.push_replies.end();
+ m.push_replies.erase(i++)) {
+ MOSDPGPushReply *msg = new MOSDPGPushReply();
+ msg->set_priority(priority);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
+ msg->replies.swap(i->second);
+ msg->compute_cost(cct);
+ replies.insert(make_pair(i->first.osd, msg));
+ }
+
+ if (!replies.empty()) {
+ (m.t).register_on_complete(
+ get_parent()->bless_context(
+ new SendPushReplies(
+ get_parent(),
+ get_osdmap_epoch(),
+ replies)));
+ get_parent()->queue_transaction(std::move(m.t));
+ }
+
+ if (m.reads.empty())
+ return;
+ start_read_op(
+ priority,
+ m.want_to_read,
+ m.reads,
+ OpRequestRef(),
+ false, true);
+}
+
+void ECBackend::continue_recovery_op(
+ RecoveryOp &op,
+ RecoveryMessages *m)
+{
+ dout(10) << __func__ << ": continuing " << op << dendl;
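+  // Recovery state machine: IDLE issues reads for the missing shards,
+  // READING turns the decoded data into push ops, and WRITING either loops
+  // back to IDLE for the next chunk or marks the op COMPLETE.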
+ while (1) {
+ switch (op.state) {
+ case RecoveryOp::IDLE: {
+ // start read
+ op.state = RecoveryOp::READING;
+ ceph_assert(!op.recovery_progress.data_complete);
+ set<int> want(op.missing_on_shards.begin(), op.missing_on_shards.end());
+ uint64_t from = op.recovery_progress.data_recovered_to;
+ uint64_t amount = get_recovery_chunk_size();
+
+ if (op.recovery_progress.first && op.obc) {
+ /* We've got the attrs and the hinfo, might as well use them */
+ op.hinfo = get_hash_info(op.hoid);
+ if (!op.hinfo) {
+ derr << __func__ << ": " << op.hoid << " has inconsistent hinfo"
+ << dendl;
+ ceph_assert(recovery_ops.count(op.hoid));
+ eversion_t v = recovery_ops[op.hoid].v;
+ recovery_ops.erase(op.hoid);
+ get_parent()->on_failed_pull({get_parent()->whoami_shard()},
+ op.hoid, v);
+ return;
+ }
+ op.xattrs = op.obc->attr_cache;
+ encode(*(op.hinfo), op.xattrs[ECUtil::get_hinfo_key()]);
+ }
+
+ map<pg_shard_t, vector<pair<int, int>>> to_read;
+ int r = get_min_avail_to_read_shards(
+ op.hoid, want, true, false, &to_read);
+ if (r != 0) {
+ // we must have lost a recovery source
+ ceph_assert(!op.recovery_progress.first);
+ dout(10) << __func__ << ": canceling recovery op for obj " << op.hoid
+ << dendl;
+ get_parent()->cancel_pull(op.hoid);
+ recovery_ops.erase(op.hoid);
+ return;
+ }
+ m->read(
+ this,
+ op.hoid,
+ op.recovery_progress.data_recovered_to,
+ amount,
+ std::move(want),
+ to_read,
+ op.recovery_progress.first && !op.obc);
+ op.extent_requested = make_pair(
+ from,
+ amount);
+ dout(10) << __func__ << ": IDLE return " << op << dendl;
+ return;
+ }
+ case RecoveryOp::READING: {
+ // read completed, start write
+ ceph_assert(op.xattrs.size());
+ ceph_assert(op.returned_data.size());
+ op.state = RecoveryOp::WRITING;
+ ObjectRecoveryProgress after_progress = op.recovery_progress;
+ after_progress.data_recovered_to += op.extent_requested.second;
+ after_progress.first = false;
+ if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
+ after_progress.data_recovered_to =
+ sinfo.logical_to_next_stripe_offset(
+ op.obc->obs.oi.size);
+ after_progress.data_complete = true;
+ }
+ for (set<pg_shard_t>::iterator mi = op.missing_on.begin();
+ mi != op.missing_on.end();
+ ++mi) {
+ ceph_assert(op.returned_data.count(mi->shard));
+ m->pushes[*mi].push_back(PushOp());
+ PushOp &pop = m->pushes[*mi].back();
+ pop.soid = op.hoid;
+ pop.version = op.v;
+ pop.data = op.returned_data[mi->shard];
+ dout(10) << __func__ << ": before_progress=" << op.recovery_progress
+ << ", after_progress=" << after_progress
+ << ", pop.data.length()=" << pop.data.length()
+ << ", size=" << op.obc->obs.oi.size << dendl;
+ ceph_assert(
+ pop.data.length() ==
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ after_progress.data_recovered_to -
+ op.recovery_progress.data_recovered_to)
+ );
+ if (pop.data.length())
+ pop.data_included.insert(
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ op.recovery_progress.data_recovered_to),
+ pop.data.length()
+ );
+ if (op.recovery_progress.first) {
+ pop.attrset = op.xattrs;
+ }
+ pop.recovery_info = op.recovery_info;
+ pop.before_progress = op.recovery_progress;
+ pop.after_progress = after_progress;
+ if (*mi != get_parent()->primary_shard())
+ get_parent()->begin_peer_recover(
+ *mi,
+ op.hoid);
+ }
+ op.returned_data.clear();
+ op.waiting_on_pushes = op.missing_on;
+ op.recovery_progress = after_progress;
+ dout(10) << __func__ << ": READING return " << op << dendl;
+ return;
+ }
+ case RecoveryOp::WRITING: {
+ if (op.waiting_on_pushes.empty()) {
+ if (op.recovery_progress.data_complete) {
+ op.state = RecoveryOp::COMPLETE;
+ for (set<pg_shard_t>::iterator i = op.missing_on.begin();
+ i != op.missing_on.end();
+ ++i) {
+ if (*i != get_parent()->primary_shard()) {
+ dout(10) << __func__ << ": on_peer_recover on " << *i
+ << ", obj " << op.hoid << dendl;
+ get_parent()->on_peer_recover(
+ *i,
+ op.hoid,
+ op.recovery_info);
+ }
+ }
+ object_stat_sum_t stat;
+ stat.num_bytes_recovered = op.recovery_info.size;
+ stat.num_keys_recovered = 0; // ??? op ... omap_entries.size(); ?
+ stat.num_objects_recovered = 1;
+ if (get_parent()->pg_is_repair())
+ stat.num_objects_repaired = 1;
+ get_parent()->on_global_recover(op.hoid, stat, false);
+ dout(10) << __func__ << ": WRITING return " << op << dendl;
+ recovery_ops.erase(op.hoid);
+ return;
+ } else {
+ op.state = RecoveryOp::IDLE;
+ dout(10) << __func__ << ": WRITING continue " << op << dendl;
+ continue;
+ }
+ }
+ return;
+ }
+ // should never be called once complete
+ case RecoveryOp::COMPLETE:
+ default: {
+ ceph_abort();
+ };
+ }
+ }
+}
+
+void ECBackend::run_recovery_op(
+ RecoveryHandle *_h,
+ int priority)
+{
+ ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
+ RecoveryMessages m;
+ for (list<RecoveryOp>::iterator i = h->ops.begin();
+ i != h->ops.end();
+ ++i) {
+ dout(10) << __func__ << ": starting " << *i << dendl;
+ ceph_assert(!recovery_ops.count(i->hoid));
+ RecoveryOp &op = recovery_ops.insert(make_pair(i->hoid, *i)).first->second;
+ continue_recovery_op(op, &m);
+ }
+
+ dispatch_recovery_messages(m, priority);
+ send_recovery_deletes(priority, h->deletes);
+ delete _h;
+}
+
+int ECBackend::recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h)
+{
+ ECRecoveryHandle *h = static_cast<ECRecoveryHandle*>(_h);
+ h->ops.push_back(RecoveryOp());
+ h->ops.back().v = v;
+ h->ops.back().hoid = hoid;
+ h->ops.back().obc = obc;
+ h->ops.back().recovery_info.soid = hoid;
+ h->ops.back().recovery_info.version = v;
+ if (obc) {
+ h->ops.back().recovery_info.size = obc->obs.oi.size;
+ h->ops.back().recovery_info.oi = obc->obs.oi;
+ }
+ if (hoid.is_snap()) {
+ if (obc) {
+ ceph_assert(obc->ssc);
+ h->ops.back().recovery_info.ss = obc->ssc->snapset;
+ } else if (head) {
+ ceph_assert(head->ssc);
+ h->ops.back().recovery_info.ss = head->ssc->snapset;
+ } else {
+ ceph_abort_msg("neither obc nor head set for a snap object");
+ }
+ }
+ h->ops.back().recovery_progress.omap_complete = true;
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ dout(10) << "checking " << *i << dendl;
+ if (get_parent()->get_shard_missing(*i).is_missing(hoid)) {
+ h->ops.back().missing_on.insert(*i);
+ h->ops.back().missing_on_shards.insert(i->shard);
+ }
+ }
+ dout(10) << __func__ << ": built op " << h->ops.back() << dendl;
+ return 0;
+}
+
+bool ECBackend::can_handle_while_inactive(
+ OpRequestRef _op)
+{
+ return false;
+}
+
+bool ECBackend::_handle_message(
+ OpRequestRef _op)
+{
+ dout(10) << __func__ << ": " << *_op->get_req() << dendl;
+ int priority = _op->get_req()->get_priority();
+ switch (_op->get_req()->get_type()) {
+ case MSG_OSD_EC_WRITE: {
+ // NOTE: this is non-const because handle_sub_write modifies the embedded
+ // ObjectStore::Transaction in place (and then std::move's it). It does
+ // not conflict with ECSubWrite's operator<<.
+ MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
+ _op->get_nonconst_req());
+ parent->maybe_preempt_replica_scrub(op->op.soid);
+ handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
+ return true;
+ }
+ case MSG_OSD_EC_WRITE_REPLY: {
+ const MOSDECSubOpWriteReply *op = static_cast<const MOSDECSubOpWriteReply*>(
+ _op->get_req());
+ handle_sub_write_reply(op->op.from, op->op, _op->pg_trace);
+ return true;
+ }
+ case MSG_OSD_EC_READ: {
+ auto op = _op->get_req<MOSDECSubOpRead>();
+ MOSDECSubOpReadReply *reply = new MOSDECSubOpReadReply;
+ reply->pgid = get_parent()->primary_spg_t();
+ reply->map_epoch = get_osdmap_epoch();
+ reply->min_epoch = get_parent()->get_interval_start_epoch();
+ handle_sub_read(op->op.from, op->op, &(reply->op), _op->pg_trace);
+ reply->trace = _op->pg_trace;
+ get_parent()->send_message_osd_cluster(
+ reply, _op->get_req()->get_connection());
+ return true;
+ }
+ case MSG_OSD_EC_READ_REPLY: {
+ // NOTE: this is non-const because handle_sub_read_reply steals resulting
+ // buffers. It does not conflict with ECSubReadReply operator<<.
+ MOSDECSubOpReadReply *op = static_cast<MOSDECSubOpReadReply*>(
+ _op->get_nonconst_req());
+ RecoveryMessages rm;
+ handle_sub_read_reply(op->op.from, op->op, &rm, _op->pg_trace);
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ case MSG_OSD_PG_PUSH: {
+ auto op = _op->get_req<MOSDPGPush>();
+ RecoveryMessages rm;
+ for (vector<PushOp>::const_iterator i = op->pushes.begin();
+ i != op->pushes.end();
+ ++i) {
+ handle_recovery_push(*i, &rm, op->is_repair);
+ }
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ case MSG_OSD_PG_PUSH_REPLY: {
+ const MOSDPGPushReply *op = static_cast<const MOSDPGPushReply *>(
+ _op->get_req());
+ RecoveryMessages rm;
+ for (vector<PushReplyOp>::const_iterator i = op->replies.begin();
+ i != op->replies.end();
+ ++i) {
+ handle_recovery_push_reply(*i, op->from, &rm);
+ }
+ dispatch_recovery_messages(rm, priority);
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
+struct SubWriteCommitted : public Context {
+ ECBackend *pg;
+ OpRequestRef msg;
+ ceph_tid_t tid;
+ eversion_t version;
+ eversion_t last_complete;
+ const ZTracer::Trace trace;
+ SubWriteCommitted(
+ ECBackend *pg,
+ OpRequestRef msg,
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace)
+ : pg(pg), msg(msg), tid(tid),
+ version(version), last_complete(last_complete), trace(trace) {}
+ void finish(int) override {
+ if (msg)
+ msg->mark_event("sub_op_committed");
+ pg->sub_write_committed(tid, version, last_complete, trace);
+ }
+};
+void ECBackend::sub_write_committed(
+ ceph_tid_t tid, eversion_t version, eversion_t last_complete,
+ const ZTracer::Trace &trace) {
+ if (get_parent()->pgb_is_primary()) {
+ ECSubWriteReply reply;
+ reply.tid = tid;
+ reply.last_complete = last_complete;
+ reply.committed = true;
+ reply.applied = true;
+ reply.from = get_parent()->whoami_shard();
+ handle_sub_write_reply(
+ get_parent()->whoami_shard(),
+ reply, trace);
+ } else {
+ get_parent()->update_last_complete_ondisk(last_complete);
+ MOSDECSubOpWriteReply *r = new MOSDECSubOpWriteReply;
+ r->pgid = get_parent()->primary_spg_t();
+ r->map_epoch = get_osdmap_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
+ r->op.tid = tid;
+ r->op.last_complete = last_complete;
+ r->op.committed = true;
+ r->op.applied = true;
+ r->op.from = get_parent()->whoami_shard();
+ r->set_priority(CEPH_MSG_PRIO_HIGH);
+ r->trace = trace;
+ r->trace.event("sending sub op commit");
+ get_parent()->send_message_osd_cluster(
+ get_parent()->primary_shard().osd, r, get_osdmap_epoch());
+ }
+}
+
+void ECBackend::handle_sub_write(
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace)
+{
+ if (msg)
+ msg->mark_event("sub_op_started");
+ trace.event("handle_sub_write");
+#ifdef HAVE_JAEGER
+ if (msg->osd_parent_span) {
+ auto ec_sub_trans = jaeger_tracing::child_span(__func__, msg->osd_parent_span);
+ }
+#endif
+ if (!get_parent()->pgb_is_primary())
+ get_parent()->update_stats(op.stats);
+ ObjectStore::Transaction localt;
+ if (!op.temp_added.empty()) {
+ add_temp_objs(op.temp_added);
+ }
+ if (op.backfill_or_async_recovery) {
+ for (set<hobject_t>::iterator i = op.temp_removed.begin();
+ i != op.temp_removed.end();
+ ++i) {
+ dout(10) << __func__ << ": removing object " << *i
+ << " since we won't get the transaction" << dendl;
+ localt.remove(
+ coll,
+ ghobject_t(
+ *i,
+ ghobject_t::NO_GEN,
+ get_parent()->whoami_shard().shard));
+ }
+ }
+ clear_temp_objs(op.temp_removed);
+ dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+ // flag set to true during async recovery
+ bool async = false;
+ pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
+ if (pmissing.is_missing(op.soid)) {
+ async = true;
+ dout(30) << __func__ << " is_missing " << pmissing.is_missing(op.soid) << dendl;
+ for (auto &&e: op.log_entries) {
+ dout(30) << " add_next_event entry " << e << dendl;
+ get_parent()->add_local_next_event(e);
+ dout(30) << " entry is_delete " << e.is_delete() << dendl;
+ }
+ }
+ get_parent()->log_operation(
+ std::move(op.log_entries),
+ op.updated_hit_set_history,
+ op.trim_to,
+ op.roll_forward_to,
+ op.roll_forward_to,
+ !op.backfill_or_async_recovery,
+ localt,
+ async);
+
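+  // Shards at index >= k hold coding (parity) chunks that are not read on
+  // the normal path, so hint the store to drop them from cache, unless the
+  // PG is undersized and the parity chunks may be needed to serve reads.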
+ if (!get_parent()->pg_is_undersized() &&
+ (unsigned)get_parent()->whoami_shard().shard >=
+ ec_impl->get_data_chunk_count())
+ op.t.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+
+ localt.register_on_commit(
+ get_parent()->bless_context(
+ new SubWriteCommitted(
+ this, msg, op.tid,
+ op.at_version,
+ get_parent()->get_info().last_complete, trace)));
+ vector<ObjectStore::Transaction> tls;
+ tls.reserve(2);
+ tls.push_back(std::move(op.t));
+ tls.push_back(std::move(localt));
+ get_parent()->queue_transactions(tls, msg);
+  dout(30) << __func__ << " missing after " << get_parent()->get_log().get_missing().get_items() << dendl;
+ if (op.at_version != eversion_t()) {
+ // dummy rollforward transaction doesn't get at_version (and doesn't advance it)
+ get_parent()->op_applied(op.at_version);
+ }
+}
+
+void ECBackend::handle_sub_read(
+ pg_shard_t from,
+ const ECSubRead &op,
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace)
+{
+ trace.event("handle sub read");
+ shard_id_t shard = get_parent()->whoami_shard().shard;
+ for(auto i = op.to_read.begin();
+ i != op.to_read.end();
+ ++i) {
+ int r = 0;
+ for (auto j = i->second.begin(); j != i->second.end(); ++j) {
+ bufferlist bl;
+ if ((op.subchunks.find(i->first)->second.size() == 1) &&
+ (op.subchunks.find(i->first)->second.front().second ==
+ ec_impl->get_sub_chunk_count())) {
+ dout(25) << __func__ << " case1: reading the complete chunk/shard." << dendl;
+ r = store->read(
+ ch,
+ ghobject_t(i->first, ghobject_t::NO_GEN, shard),
+ j->get<0>(),
+ j->get<1>(),
+ bl, j->get<2>()); // Allow EIO return
+ } else {
+ dout(25) << __func__ << " case2: going to do fragmented read." << dendl;
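+        // read only the requested subchunks within each chunk_size-wide
+        // stride of the shard and append them in order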
+ int subchunk_size =
+ sinfo.get_chunk_size() / ec_impl->get_sub_chunk_count();
+ bool error = false;
+ for (int m = 0; m < (int)j->get<1>() && !error;
+ m += sinfo.get_chunk_size()) {
+ for (auto &&k:op.subchunks.find(i->first)->second) {
+ bufferlist bl0;
+ r = store->read(
+ ch,
+ ghobject_t(i->first, ghobject_t::NO_GEN, shard),
+ j->get<0>() + m + (k.first)*subchunk_size,
+ (k.second)*subchunk_size,
+ bl0, j->get<2>());
+ if (r < 0) {
+ error = true;
+ break;
+ }
+ bl.claim_append(bl0);
+ }
+ }
+ }
+
+ if (r < 0) {
+ // if we are doing fast reads, it's possible for one of the shard
+ // reads to cross paths with another update and get a (harmless)
+ // ENOENT. Suppress the message to the cluster log in that case.
+ if (r == -ENOENT && get_parent()->get_pool().fast_read) {
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << i->first << ", fast read, probably ok"
+ << dendl;
+ } else {
+ get_parent()->clog_error() << "Error " << r
+ << " reading object "
+ << i->first;
+ dout(5) << __func__ << ": Error " << r
+ << " reading " << i->first << dendl;
+ }
+ goto error;
+ } else {
+ dout(20) << __func__ << " read request=" << j->get<1>() << " r=" << r << " len=" << bl.length() << dendl;
+ reply->buffers_read[i->first].push_back(
+ make_pair(
+ j->get<0>(),
+ bl)
+ );
+ }
+
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+        // We still need deep scrub: large enough objects are read in
+        // sections, so the digest check below is not performed for them.
+ // Do NOT check osd_read_eio_on_bad_digest here. We need to report
+ // the state of our chunk in case other chunks could substitute.
+ ECUtil::HashInfoRef hinfo;
+ hinfo = get_hash_info(i->first);
+ if (!hinfo) {
+ r = -EIO;
+ get_parent()->clog_error() << "Corruption detected: object "
+ << i->first
+ << " is missing hash_info";
+ dout(5) << __func__ << ": No hinfo for " << i->first << dendl;
+ goto error;
+ }
+ ceph_assert(hinfo->has_chunk_hash());
+ if ((bl.length() == hinfo->get_total_chunk_size()) &&
+ (j->get<0>() == 0)) {
+ dout(20) << __func__ << ": Checking hash of " << i->first << dendl;
+ bufferhash h(-1);
+ h << bl;
+ if (h.digest() != hinfo->get_chunk_hash(shard)) {
+ get_parent()->clog_error() << "Bad hash for " << i->first << " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec;
+ dout(5) << __func__ << ": Bad hash for " << i->first << " digest 0x"
+ << hex << h.digest() << " expected 0x" << hinfo->get_chunk_hash(shard) << dec << dendl;
+ r = -EIO;
+ goto error;
+ }
+ }
+ }
+ }
+ continue;
+error:
+ // Do NOT check osd_read_eio_on_bad_digest here. We need to report
+ // the state of our chunk in case other chunks could substitute.
+ reply->buffers_read.erase(i->first);
+ reply->errors[i->first] = r;
+ }
+ for (set<hobject_t>::iterator i = op.attrs_to_read.begin();
+ i != op.attrs_to_read.end();
+ ++i) {
+ dout(10) << __func__ << ": fulfilling attr request on "
+ << *i << dendl;
+ if (reply->errors.count(*i))
+ continue;
+ int r = store->getattrs(
+ ch,
+ ghobject_t(
+ *i, ghobject_t::NO_GEN, shard),
+ reply->attrs_read[*i]);
+ if (r < 0) {
+ // If we read error, we should not return the attrs too.
+ reply->attrs_read.erase(*i);
+ reply->buffers_read.erase(*i);
+ reply->errors[*i] = r;
+ }
+ }
+ reply->from = get_parent()->whoami_shard();
+ reply->tid = op.tid;
+}
+
+void ECBackend::handle_sub_write_reply(
+ pg_shard_t from,
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace)
+{
+ map<ceph_tid_t, Op>::iterator i = tid_to_op_map.find(op.tid);
+ ceph_assert(i != tid_to_op_map.end());
+ if (op.committed) {
+ trace.event("sub write committed");
+ ceph_assert(i->second.pending_commit.count(from));
+ i->second.pending_commit.erase(from);
+ if (from != get_parent()->whoami_shard()) {
+ get_parent()->update_peer_last_complete_ondisk(from, op.last_complete);
+ }
+ }
+ if (op.applied) {
+ trace.event("sub write applied");
+ ceph_assert(i->second.pending_apply.count(from));
+ i->second.pending_apply.erase(from);
+ }
+
+ if (i->second.pending_commit.empty() &&
+ i->second.on_all_commit &&
+ // also wait for apply, to preserve ordering with luminous peers.
+ i->second.pending_apply.empty()) {
+ dout(10) << __func__ << " Calling on_all_commit on " << i->second << dendl;
+ i->second.on_all_commit->complete(0);
+ i->second.on_all_commit = 0;
+ i->second.trace.event("ec write all committed");
+ }
+ check_ops();
+}
+
+void ECBackend::handle_sub_read_reply(
+ pg_shard_t from,
+ ECSubReadReply &op,
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace)
+{
+ trace.event("ec sub read reply");
+ dout(10) << __func__ << ": reply " << op << dendl;
+ map<ceph_tid_t, ReadOp>::iterator iter = tid_to_read_map.find(op.tid);
+ if (iter == tid_to_read_map.end()) {
+ //canceled
+ dout(20) << __func__ << ": dropped " << op << dendl;
+ return;
+ }
+ ReadOp &rop = iter->second;
+ for (auto i = op.buffers_read.begin();
+ i != op.buffers_read.end();
+ ++i) {
+ ceph_assert(!op.errors.count(i->first)); // If this object errored, we had better not have sent a buffer
+ if (!rop.to_read.count(i->first)) {
+ // We canceled this read! @see filter_read_op
+ dout(20) << __func__ << " to_read skipping" << dendl;
+ continue;
+ }
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator req_iter =
+ rop.to_read.find(i->first)->second.to_read.begin();
+ list<
+ boost::tuple<
+ uint64_t, uint64_t, map<pg_shard_t, bufferlist> > >::iterator riter =
+ rop.complete[i->first].returned.begin();
+ for (list<pair<uint64_t, bufferlist> >::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j, ++req_iter, ++riter) {
+ ceph_assert(req_iter != rop.to_read.find(i->first)->second.to_read.end());
+ ceph_assert(riter != rop.complete[i->first].returned.end());
+ pair<uint64_t, uint64_t> adjusted =
+ sinfo.aligned_offset_len_to_chunk(
+ make_pair(req_iter->get<0>(), req_iter->get<1>()));
+ ceph_assert(adjusted.first == j->first);
+ riter->get<2>()[from] = std::move(j->second);
+ }
+ }
+ for (auto i = op.attrs_read.begin();
+ i != op.attrs_read.end();
+ ++i) {
+ ceph_assert(!op.errors.count(i->first)); // If this object errored, we had better not have sent an attribute
+ if (!rop.to_read.count(i->first)) {
+ // We canceled this read! @see filter_read_op
+ dout(20) << __func__ << " to_read skipping" << dendl;
+ continue;
+ }
+ rop.complete[i->first].attrs = map<string, bufferlist>();
+ (*(rop.complete[i->first].attrs)).swap(i->second);
+ }
+ for (auto i = op.errors.begin();
+ i != op.errors.end();
+ ++i) {
+ rop.complete[i->first].errors.insert(
+ make_pair(
+ from,
+ i->second));
+ dout(20) << __func__ << " shard=" << from << " error=" << i->second << dendl;
+ }
+
+ map<pg_shard_t, set<ceph_tid_t> >::iterator siter =
+ shard_to_read_map.find(from);
+ ceph_assert(siter != shard_to_read_map.end());
+ ceph_assert(siter->second.count(op.tid));
+ siter->second.erase(op.tid);
+
+ ceph_assert(rop.in_progress.count(from));
+ rop.in_progress.erase(from);
+ unsigned is_complete = 0;
+ bool need_resend = false;
+ // For redundant reads, check for completion as each shard comes in;
+ // for a non-redundant read, check for completion once all the shards have replied.
+ if (rop.do_redundant_reads || rop.in_progress.empty()) {
+ for (map<hobject_t, read_result_t>::const_iterator iter =
+ rop.complete.begin();
+ iter != rop.complete.end();
+ ++iter) {
+ set<int> have;
+ for (map<pg_shard_t, bufferlist>::const_iterator j =
+ iter->second.returned.front().get<2>().begin();
+ j != iter->second.returned.front().get<2>().end();
+ ++j) {
+ have.insert(j->first.shard);
+ dout(20) << __func__ << " have shard=" << j->first.shard << dendl;
+ }
+ map<int, vector<pair<int, int>>> dummy_minimum;
+ int err;
+ if ((err = ec_impl->minimum_to_decode(rop.want_to_read[iter->first], have, &dummy_minimum)) < 0) {
+ dout(20) << __func__ << " minimum_to_decode failed" << dendl;
+ if (rop.in_progress.empty()) {
+ // If we don't have enough copies, try other pg_shard_ts if available.
+ // During recovery there may be multiple osds with copies of the same shard,
+ // so getting EIO from one may result in multiple passes through this code path.
+ if (!rop.do_redundant_reads) {
+ int r = send_all_remaining_reads(iter->first, rop);
+ if (r == 0) {
+ // We changed the rop's to_read and are not incrementing is_complete
+ need_resend = true;
+ continue;
+ }
+ // Couldn't read any additional shards so handle as completed with errors
+ }
+ // We don't want to confuse clients / RBD with objectstore error
+ // values in particular ENOENT. We may have different error returns
+ // from different shards, so we'll return minimum_to_decode() error
+ // (usually EIO) to reader. It is likely an error here is due to a
+ // damaged pg.
+ rop.complete[iter->first].r = err;
+ ++is_complete;
+ }
+ } else {
+ ceph_assert(rop.complete[iter->first].r == 0);
+ if (!rop.complete[iter->first].errors.empty()) {
+ if (cct->_conf->osd_read_ec_check_for_errors) {
+ dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl;
+ err = rop.complete[iter->first].errors.begin()->second;
+ rop.complete[iter->first].r = err;
+ } else {
+ get_parent()->clog_warn() << "Error(s) ignored for "
+ << iter->first << " enough copies available";
+ dout(10) << __func__ << " Error(s) ignored for " << iter->first
+ << " enough copies available" << dendl;
+ rop.complete[iter->first].errors.clear();
+ }
+ }
+ // avoid re-reads of completed objects, as we may send remaining reads for uncompleted objects
+ rop.to_read.at(iter->first).need.clear();
+ rop.to_read.at(iter->first).want_attrs = false;
+ ++is_complete;
+ }
+ }
+ }
+ if (need_resend) {
+ do_read_op(rop);
+ } else if (rop.in_progress.empty() ||
+ is_complete == rop.complete.size()) {
+ dout(20) << __func__ << " Complete: " << rop << dendl;
+ rop.trace.event("ec read complete");
+ complete_read_op(rop, m);
+ } else {
+ dout(10) << __func__ << " readop not complete: " << rop << dendl;
+ }
+}
+
+void ECBackend::complete_read_op(ReadOp &rop, RecoveryMessages *m)
+{
+ map<hobject_t, read_request_t>::iterator reqiter =
+ rop.to_read.begin();
+ map<hobject_t, read_result_t>::iterator resiter =
+ rop.complete.begin();
+ ceph_assert(rop.to_read.size() == rop.complete.size());
+ for (; reqiter != rop.to_read.end(); ++reqiter, ++resiter) {
+ if (reqiter->second.cb) {
+ pair<RecoveryMessages *, read_result_t &> arg(
+ m, resiter->second);
+ reqiter->second.cb->complete(arg);
+ reqiter->second.cb = nullptr;
+ }
+ }
+ // The read op is over; clean up all the data for this tid.
+ for (set<pg_shard_t>::iterator iter = rop.in_progress.begin();
+ iter != rop.in_progress.end();
+ iter++) {
+ shard_to_read_map[*iter].erase(rop.tid);
+ }
+ rop.in_progress.clear();
+ tid_to_read_map.erase(rop.tid);
+}
+
+struct FinishReadOp : public GenContext<ThreadPool::TPHandle&> {
+ ECBackend *ec;
+ ceph_tid_t tid;
+ FinishReadOp(ECBackend *ec, ceph_tid_t tid) : ec(ec), tid(tid) {}
+ void finish(ThreadPool::TPHandle &handle) override {
+ auto ropiter = ec->tid_to_read_map.find(tid);
+ ceph_assert(ropiter != ec->tid_to_read_map.end());
+ int priority = ropiter->second.priority;
+ RecoveryMessages rm;
+ ec->complete_read_op(ropiter->second, &rm);
+ ec->dispatch_recovery_messages(rm, priority);
+ }
+};
+
+void ECBackend::filter_read_op(
+ const OSDMapRef& osdmap,
+ ReadOp &op)
+{
+ set<hobject_t> to_cancel;
+ for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+ i != op.source_to_obj.end();
+ ++i) {
+ if (osdmap->is_down(i->first.osd)) {
+ to_cancel.insert(i->second.begin(), i->second.end());
+ op.in_progress.erase(i->first);
+ continue;
+ }
+ }
+
+ if (to_cancel.empty())
+ return;
+
+ for (map<pg_shard_t, set<hobject_t> >::iterator i = op.source_to_obj.begin();
+ i != op.source_to_obj.end();
+ ) {
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ) {
+ if (to_cancel.count(*j))
+ i->second.erase(j++);
+ else
+ ++j;
+ }
+ if (i->second.empty()) {
+ op.source_to_obj.erase(i++);
+ } else {
+ ceph_assert(!osdmap->is_down(i->first.osd));
+ ++i;
+ }
+ }
+
+ for (set<hobject_t>::iterator i = to_cancel.begin();
+ i != to_cancel.end();
+ ++i) {
+ get_parent()->cancel_pull(*i);
+
+ ceph_assert(op.to_read.count(*i));
+ read_request_t &req = op.to_read.find(*i)->second;
+ dout(10) << __func__ << ": canceling " << req
+ << " for obj " << *i << dendl;
+ ceph_assert(req.cb);
+ delete req.cb;
+ req.cb = nullptr;
+
+ op.to_read.erase(*i);
+ op.complete.erase(*i);
+ recovery_ops.erase(*i);
+ }
+
+ if (op.in_progress.empty()) {
+ get_parent()->schedule_recovery_work(
+ get_parent()->bless_unlocked_gencontext(
+ new FinishReadOp(this, op.tid)));
+ }
+}
+
+void ECBackend::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ set<ceph_tid_t> tids_to_filter;
+ for (map<pg_shard_t, set<ceph_tid_t> >::iterator
+ i = shard_to_read_map.begin();
+ i != shard_to_read_map.end();
+ ) {
+ if (osdmap->is_down(i->first.osd)) {
+ tids_to_filter.insert(i->second.begin(), i->second.end());
+ shard_to_read_map.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+ for (set<ceph_tid_t>::iterator i = tids_to_filter.begin();
+ i != tids_to_filter.end();
+ ++i) {
+ map<ceph_tid_t, ReadOp>::iterator j = tid_to_read_map.find(*i);
+ ceph_assert(j != tid_to_read_map.end());
+ filter_read_op(osdmap, j->second);
+ }
+}
+
+void ECBackend::on_change()
+{
+ dout(10) << __func__ << dendl;
+
+ completed_to = eversion_t();
+ committed_to = eversion_t();
+ pipeline_state.clear();
+ waiting_reads.clear();
+ waiting_state.clear();
+ waiting_commit.clear();
+ for (auto &&op: tid_to_op_map) {
+ cache.release_write_pin(op.second.pin);
+ }
+ tid_to_op_map.clear();
+
+ for (map<ceph_tid_t, ReadOp>::iterator i = tid_to_read_map.begin();
+ i != tid_to_read_map.end();
+ ++i) {
+ dout(10) << __func__ << ": cancelling " << i->second << dendl;
+ for (map<hobject_t, read_request_t>::iterator j =
+ i->second.to_read.begin();
+ j != i->second.to_read.end();
+ ++j) {
+ delete j->second.cb;
+ j->second.cb = nullptr;
+ }
+ }
+ tid_to_read_map.clear();
+ in_progress_client_reads.clear();
+ shard_to_read_map.clear();
+ clear_recovery_state();
+}
+
+void ECBackend::clear_recovery_state()
+{
+ recovery_ops.clear();
+}
+
+void ECBackend::dump_recovery_info(Formatter *f) const
+{
+ f->open_array_section("recovery_ops");
+ for (map<hobject_t, RecoveryOp>::const_iterator i = recovery_ops.begin();
+ i != recovery_ops.end();
+ ++i) {
+ f->open_object_section("op");
+ i->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("read_ops");
+ for (map<ceph_tid_t, ReadOp>::const_iterator i = tid_to_read_map.begin();
+ i != tid_to_read_map.end();
+ ++i) {
+ f->open_object_section("read_op");
+ i->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECBackend::submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef client_op
+ )
+{
+ ceph_assert(!tid_to_op_map.count(tid));
+ Op *op = &(tid_to_op_map[tid]);
+ op->hoid = hoid;
+ op->delta_stats = delta_stats;
+ op->version = at_version;
+ op->trim_to = trim_to;
+ op->roll_forward_to = std::max(min_last_complete_ondisk, committed_to);
+ op->log_entries = log_entries;
+ std::swap(op->updated_hit_set_history, hset_history);
+ op->on_all_commit = on_all_commit;
+ op->tid = tid;
+ op->reqid = reqid;
+ op->client_op = client_op;
+ if (client_op)
+ op->trace = client_op->pg_trace;
+
+#ifdef HAVE_JAEGER
+ if (client_op->osd_parent_span) {
+ auto ec_sub_trans = jaeger_tracing::child_span("ECBackend::submit_transaction", client_op->osd_parent_span);
+ }
+#endif
+ dout(10) << __func__ << ": op " << *op << " starting" << dendl;
+ start_rmw(op, std::move(t));
+}
+
+void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
+ if (!waiting_state.empty()) {
+ waiting_state.back().on_write.emplace_back(std::move(cb));
+ } else if (!waiting_reads.empty()) {
+ waiting_reads.back().on_write.emplace_back(std::move(cb));
+ } else {
+ // Nothing earlier in the pipeline, just call it
+ cb();
+ }
+}
+
+void ECBackend::get_all_avail_shards(
+ const hobject_t &hoid,
+ const set<pg_shard_t> &error_shards,
+ set<int> &have,
+ map<shard_id_t, pg_shard_t> &shards,
+ bool for_recovery)
+{
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_shards().begin();
+ i != get_parent()->get_acting_shards().end();
+ ++i) {
+ dout(10) << __func__ << ": checking acting " << *i << dendl;
+ const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ if (!missing.is_missing(hoid)) {
+ ceph_assert(!have.count(i->shard));
+ have.insert(i->shard);
+ ceph_assert(!shards.count(i->shard));
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+
+ if (for_recovery) {
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_backfill_shards().begin();
+ i != get_parent()->get_backfill_shards().end();
+ ++i) {
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ if (have.count(i->shard)) {
+ ceph_assert(shards.count(i->shard));
+ continue;
+ }
+ dout(10) << __func__ << ": checking backfill " << *i << dendl;
+ ceph_assert(!shards.count(i->shard));
+ const pg_info_t &info = get_parent()->get_shard_info(*i);
+ const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
+ if (hoid < info.last_backfill &&
+ !missing.is_missing(hoid)) {
+ have.insert(i->shard);
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+
+ map<hobject_t, set<pg_shard_t>>::const_iterator miter =
+ get_parent()->get_missing_loc_shards().find(hoid);
+ if (miter != get_parent()->get_missing_loc_shards().end()) {
+ for (set<pg_shard_t>::iterator i = miter->second.begin();
+ i != miter->second.end();
+ ++i) {
+ dout(10) << __func__ << ": checking missing_loc " << *i << dendl;
+ auto m = get_parent()->maybe_get_shard_missing(*i);
+ if (m) {
+ ceph_assert(!(*m).is_missing(hoid));
+ }
+ if (error_shards.find(*i) != error_shards.end())
+ continue;
+ have.insert(i->shard);
+ shards.insert(make_pair(i->shard, *i));
+ }
+ }
+ }
+}
+
+int ECBackend::get_min_avail_to_read_shards(
+ const hobject_t &hoid,
+ const set<int> &want,
+ bool for_recovery,
+ bool do_redundant_reads,
+ map<pg_shard_t, vector<pair<int, int>>> *to_read)
+{
+ // Make sure we don't do redundant reads for recovery
+ ceph_assert(!for_recovery || !do_redundant_reads);
+
+ set<int> have;
+ map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
+
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+ map<int, vector<pair<int, int>>> need;
+ int r = ec_impl->minimum_to_decode(want, have, &need);
+ if (r < 0)
+ return r;
+
+ if (do_redundant_reads) {
+ vector<pair<int, int>> subchunks_list;
+ subchunks_list.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
+ for (auto &&i: have) {
+ need[i] = subchunks_list;
+ }
+ }
+
+ if (!to_read)
+ return 0;
+
+ for (auto &&i:need) {
+ ceph_assert(shards.count(shard_id_t(i.first)));
+ to_read->insert(make_pair(shards[shard_id_t(i.first)], i.second));
+ }
+ return 0;
+}
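+
+// Illustrative note for get_min_avail_to_read_shards() above (assumes a
+// hypothetical k=4, m=2 profile; the exact shard choice is up to the
+// plugin): wanting the data shards {0,1,2,3} with shards {0,1,2,4,5}
+// available, a typical minimum_to_decode() fills `need` with four of the
+// available shards, e.g.
+//
+//   map<int, vector<pair<int, int>>> need;
+//   ec_impl->minimum_to_decode({0,1,2,3}, {0,1,2,4,5}, &need);
+//   // need: shard id -> sub-chunk ranges to read from that shard
+//
+// The result is then mapped back to the pg_shard_ts chosen by
+// get_all_avail_shards().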
+
+int ECBackend::get_remaining_shards(
+ const hobject_t &hoid,
+ const set<int> &avail,
+ const set<int> &want,
+ const read_result_t &result,
+ map<pg_shard_t, vector<pair<int, int>>> *to_read,
+ bool for_recovery)
+{
+ ceph_assert(to_read);
+
+ set<int> have;
+ map<shard_id_t, pg_shard_t> shards;
+ set<pg_shard_t> error_shards;
+ for (auto &p : result.errors) {
+ error_shards.insert(p.first);
+ }
+
+ get_all_avail_shards(hoid, error_shards, have, shards, for_recovery);
+
+ map<int, vector<pair<int, int>>> need;
+ int r = ec_impl->minimum_to_decode(want, have, &need);
+ if (r < 0) {
+ dout(0) << __func__ << " not enough shards left to try for " << hoid
+ << " read result was " << result << dendl;
+ return -EIO;
+ }
+
+ set<int> shards_left;
+ for (auto p : need) {
+ if (avail.find(p.first) == avail.end()) {
+ shards_left.insert(p.first);
+ }
+ }
+
+ vector<pair<int, int>> subchunks;
+ subchunks.push_back(make_pair(0, ec_impl->get_sub_chunk_count()));
+ for (set<int>::iterator i = shards_left.begin();
+ i != shards_left.end();
+ ++i) {
+ ceph_assert(shards.count(shard_id_t(*i)));
+ ceph_assert(avail.find(*i) == avail.end());
+ to_read->insert(make_pair(shards[shard_id_t(*i)], subchunks));
+ }
+ return 0;
+}
+
+void ECBackend::start_read_op(
+ int priority,
+ map<hobject_t, set<int>> &want_to_read,
+ map<hobject_t, read_request_t> &to_read,
+ OpRequestRef _op,
+ bool do_redundant_reads,
+ bool for_recovery)
+{
+ ceph_tid_t tid = get_parent()->get_tid();
+ ceph_assert(!tid_to_read_map.count(tid));
+ auto &op = tid_to_read_map.emplace(
+ tid,
+ ReadOp(
+ priority,
+ tid,
+ do_redundant_reads,
+ for_recovery,
+ _op,
+ std::move(want_to_read),
+ std::move(to_read))).first->second;
+ dout(10) << __func__ << ": starting " << op << dendl;
+ if (_op) {
+ op.trace = _op->pg_trace;
+ op.trace.event("start ec read");
+ }
+ do_read_op(op);
+}
+
+void ECBackend::do_read_op(ReadOp &op)
+{
+ int priority = op.priority;
+ ceph_tid_t tid = op.tid;
+
+ dout(10) << __func__ << ": starting read " << op << dendl;
+
+ map<pg_shard_t, ECSubRead> messages;
+ for (map<hobject_t, read_request_t>::iterator i = op.to_read.begin();
+ i != op.to_read.end();
+ ++i) {
+ bool need_attrs = i->second.want_attrs;
+
+ for (auto j = i->second.need.begin();
+ j != i->second.need.end();
+ ++j) {
+ if (need_attrs) {
+ messages[j->first].attrs_to_read.insert(i->first);
+ need_attrs = false;
+ }
+ messages[j->first].subchunks[i->first] = j->second;
+ op.obj_to_source[i->first].insert(j->first);
+ op.source_to_obj[j->first].insert(i->first);
+ }
+ for (list<boost::tuple<uint64_t, uint64_t, uint32_t> >::const_iterator j =
+ i->second.to_read.begin();
+ j != i->second.to_read.end();
+ ++j) {
+ pair<uint64_t, uint64_t> chunk_off_len =
+ sinfo.aligned_offset_len_to_chunk(make_pair(j->get<0>(), j->get<1>()));
+ for (auto k = i->second.need.begin();
+ k != i->second.need.end();
+ ++k) {
+ messages[k->first].to_read[i->first].push_back(
+ boost::make_tuple(
+ chunk_off_len.first,
+ chunk_off_len.second,
+ j->get<2>()));
+ }
+ ceph_assert(!need_attrs);
+ }
+ }
+
+ std::vector<std::pair<int, Message*>> m;
+ m.reserve(messages.size());
+ for (map<pg_shard_t, ECSubRead>::iterator i = messages.begin();
+ i != messages.end();
+ ++i) {
+ op.in_progress.insert(i->first);
+ shard_to_read_map[i->first].insert(op.tid);
+ i->second.tid = tid;
+ MOSDECSubOpRead *msg = new MOSDECSubOpRead;
+ msg->set_priority(priority);
+ msg->pgid = spg_t(
+ get_parent()->whoami_spg_t().pgid,
+ i->first.shard);
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_interval_start_epoch();
+ msg->op = i->second;
+ msg->op.from = get_parent()->whoami_shard();
+ msg->op.tid = tid;
+ if (op.trace) {
+ // initialize a child span for this shard
+ msg->trace.init("ec sub read", nullptr, &op.trace);
+ msg->trace.keyval("shard", i->first.shard.id);
+ }
+ m.push_back(std::make_pair(i->first.osd, msg));
+ }
+ if (!m.empty()) {
+ get_parent()->send_message_osd_cluster(m, get_osdmap_epoch());
+ }
+
+ dout(10) << __func__ << ": started " << op << dendl;
+}
+
+ECUtil::HashInfoRef ECBackend::get_hash_info(
+ const hobject_t &hoid, bool create, const map<string,bufferptr> *attrs)
+{
+ dout(10) << __func__ << ": Getting attr on " << hoid << dendl;
+ ECUtil::HashInfoRef ref = unstable_hashinfo_registry.lookup(hoid);
+ if (!ref) {
+ dout(10) << __func__ << ": not in cache " << hoid << dendl;
+ struct stat st;
+ int r = store->stat(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st);
+ ECUtil::HashInfo hinfo(ec_impl->get_chunk_count());
+ if (r >= 0) {
+ dout(10) << __func__ << ": found on disk, size " << st.st_size << dendl;
+ bufferlist bl;
+ if (attrs) {
+ map<string, bufferptr>::const_iterator k = attrs->find(ECUtil::get_hinfo_key());
+ if (k == attrs->end()) {
+ dout(5) << __func__ << " " << hoid << " missing hinfo attr" << dendl;
+ } else {
+ bl.push_back(k->second);
+ }
+ } else {
+ r = store->getattr(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ECUtil::get_hinfo_key(),
+ bl);
+ if (r < 0) {
+ dout(5) << __func__ << ": getattr failed: " << cpp_strerror(r) << dendl;
+ bl.clear(); // just in case
+ }
+ }
+ if (bl.length() > 0) {
+ auto bp = bl.cbegin();
+ try {
+ decode(hinfo, bp);
+ } catch(...) {
+ dout(0) << __func__ << ": Can't decode hinfo for " << hoid << dendl;
+ return ECUtil::HashInfoRef();
+ }
+ if (hinfo.get_total_chunk_size() != (uint64_t)st.st_size) {
+ dout(0) << __func__ << ": Mismatch of total_chunk_size "
+ << hinfo.get_total_chunk_size() << dendl;
+ return ECUtil::HashInfoRef();
+ }
+ } else if (st.st_size > 0) { // Non-empty object with no hinfo: fail (only an empty object gets a fresh hinfo)
+ return ECUtil::HashInfoRef();
+ }
+ } else if (r != -ENOENT || !create) {
+ derr << __func__ << ": stat " << hoid << " failed: " << cpp_strerror(r)
+ << dendl;
+ return ECUtil::HashInfoRef();
+ }
+ ref = unstable_hashinfo_registry.lookup_or_create(hoid, hinfo);
+ }
+ return ref;
+}
+
+void ECBackend::start_rmw(Op *op, PGTransactionUPtr &&t)
+{
+ ceph_assert(op);
+
+ op->plan = ECTransaction::get_write_plan(
+ sinfo,
+ std::move(t),
+ [&](const hobject_t &i) {
+ ECUtil::HashInfoRef ref = get_hash_info(i, true);
+ if (!ref) {
+ derr << __func__ << ": get_hash_info(" << i << ")"
+ << " returned a null pointer and there is no"
+ << " way to recover from such an error in this"
+ << " context" << dendl;
+ ceph_abort();
+ }
+ return ref;
+ },
+ get_parent()->get_dpp());
+
+ dout(10) << __func__ << ": " << *op << dendl;
+
+ waiting_state.push_back(*op);
+ check_ops();
+}
+
+bool ECBackend::try_state_to_reads()
+{
+ if (waiting_state.empty())
+ return false;
+
+ Op *op = &(waiting_state.front());
+ if (op->requires_rmw() && pipeline_state.cache_invalid()) {
+ ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
+ dout(20) << __func__ << ": blocking " << *op
+ << " because it requires an rmw and the cache is invalid "
+ << pipeline_state
+ << dendl;
+ return false;
+ }
+
+ if (!pipeline_state.caching_enabled()) {
+ op->using_cache = false;
+ } else if (op->invalidates_cache()) {
+ dout(20) << __func__ << ": invalidating cache after this op"
+ << dendl;
+ pipeline_state.invalidate();
+ }
+
+ waiting_state.pop_front();
+ waiting_reads.push_back(*op);
+
+ if (op->using_cache) {
+ cache.open_write_pin(op->pin);
+
+ extent_set empty;
+ for (auto &&hpair: op->plan.will_write) {
+ auto to_read_plan_iter = op->plan.to_read.find(hpair.first);
+ const extent_set &to_read_plan =
+ to_read_plan_iter == op->plan.to_read.end() ?
+ empty :
+ to_read_plan_iter->second;
+
+ extent_set remote_read = cache.reserve_extents_for_rmw(
+ hpair.first,
+ op->pin,
+ hpair.second,
+ to_read_plan);
+
+ extent_set pending_read = to_read_plan;
+ pending_read.subtract(remote_read);
+
+ if (!remote_read.empty()) {
+ op->remote_read[hpair.first] = std::move(remote_read);
+ }
+ if (!pending_read.empty()) {
+ op->pending_read[hpair.first] = std::move(pending_read);
+ }
+ }
+ } else {
+ op->remote_read = op->plan.to_read;
+ }
+
+ dout(10) << __func__ << ": " << *op << dendl;
+
+ if (!op->remote_read.empty()) {
+ ceph_assert(get_parent()->get_pool().allows_ecoverwrites());
+ objects_read_async_no_cache(
+ op->remote_read,
+ [this, op](map<hobject_t,pair<int, extent_map> > &&results) {
+ for (auto &&i: results) {
+ op->remote_read_result.emplace(i.first, i.second.second);
+ }
+ check_ops();
+ });
+ }
+
+ return true;
+}
+
+bool ECBackend::try_reads_to_commit()
+{
+ if (waiting_reads.empty())
+ return false;
+ Op *op = &(waiting_reads.front());
+ if (op->read_in_progress())
+ return false;
+ waiting_reads.pop_front();
+ waiting_commit.push_back(*op);
+
+ dout(10) << __func__ << ": starting commit on " << *op << dendl;
+ dout(20) << __func__ << ": " << cache << dendl;
+
+ get_parent()->apply_stats(
+ op->hoid,
+ op->delta_stats);
+
+ if (op->using_cache) {
+ for (auto &&hpair: op->pending_read) {
+ op->remote_read_result[hpair.first].insert(
+ cache.get_remaining_extents_for_rmw(
+ hpair.first,
+ op->pin,
+ hpair.second));
+ }
+ op->pending_read.clear();
+ } else {
+ ceph_assert(op->pending_read.empty());
+ }
+
+ map<shard_id_t, ObjectStore::Transaction> trans;
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ trans[i->shard];
+ }
+
+ op->trace.event("start ec write");
+
+ map<hobject_t,extent_map> written;
+ if (op->plan.t) {
+ ECTransaction::generate_transactions(
+ op->plan,
+ ec_impl,
+ get_parent()->get_info().pgid.pgid,
+ sinfo,
+ op->remote_read_result,
+ op->log_entries,
+ &written,
+ &trans,
+ &(op->temp_added),
+ &(op->temp_cleared),
+ get_parent()->get_dpp(),
+ get_osdmap()->require_osd_release);
+ }
+
+ dout(20) << __func__ << ": " << cache << dendl;
+ dout(20) << __func__ << ": written: " << written << dendl;
+ dout(20) << __func__ << ": op: " << *op << dendl;
+
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ for (auto &&i: op->log_entries) {
+ if (i.requires_kraken()) {
+ derr << __func__ << ": log entry " << i << " requires kraken"
+ << " but overwrites are not enabled!" << dendl;
+ ceph_abort();
+ }
+ }
+ }
+
+ map<hobject_t,extent_set> written_set;
+ for (auto &&i: written) {
+ written_set[i.first] = i.second.get_interval_set();
+ }
+ dout(20) << __func__ << ": written_set: " << written_set << dendl;
+ ceph_assert(written_set == op->plan.will_write);
+
+ if (op->using_cache) {
+ for (auto &&hpair: written) {
+ dout(20) << __func__ << ": " << hpair << dendl;
+ cache.present_rmw_update(hpair.first, op->pin, hpair.second);
+ }
+ }
+ op->remote_read.clear();
+ op->remote_read_result.clear();
+
+ ObjectStore::Transaction empty;
+ bool should_write_local = false;
+ ECSubWrite local_write_op;
+ std::vector<std::pair<int, Message*>> messages;
+ messages.reserve(get_parent()->get_acting_recovery_backfill_shards().size());
+ set<pg_shard_t> backfill_shards = get_parent()->get_backfill_shards();
+ for (set<pg_shard_t>::const_iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ op->pending_apply.insert(*i);
+ op->pending_commit.insert(*i);
+ map<shard_id_t, ObjectStore::Transaction>::iterator iter =
+ trans.find(i->shard);
+ ceph_assert(iter != trans.end());
+ bool should_send = get_parent()->should_send_op(*i, op->hoid);
+ const pg_stat_t &stats =
+ (should_send || !backfill_shards.count(*i)) ?
+ get_info().stats :
+ parent->get_shard_info().find(*i)->second.stats;
+
+ ECSubWrite sop(
+ get_parent()->whoami_shard(),
+ op->tid,
+ op->reqid,
+ op->hoid,
+ stats,
+ should_send ? iter->second : empty,
+ op->version,
+ op->trim_to,
+ op->roll_forward_to,
+ op->log_entries,
+ op->updated_hit_set_history,
+ op->temp_added,
+ op->temp_cleared,
+ !should_send);
+
+ ZTracer::Trace trace;
+ if (op->trace) {
+ // initialize a child span for this shard
+ trace.init("ec sub write", nullptr, &op->trace);
+ trace.keyval("shard", i->shard.id);
+ }
+
+ if (*i == get_parent()->whoami_shard()) {
+ should_write_local = true;
+ local_write_op.claim(sop);
+ } else {
+ MOSDECSubOpWrite *r = new MOSDECSubOpWrite(sop);
+ r->pgid = spg_t(get_parent()->primary_spg_t().pgid, i->shard);
+ r->map_epoch = get_osdmap_epoch();
+ r->min_epoch = get_parent()->get_interval_start_epoch();
+ r->trace = trace;
+ messages.push_back(std::make_pair(i->osd, r));
+ }
+ }
+
+#ifdef HAVE_JAEGER
+ if (op->client_op->osd_parent_span) {
+ auto sub_write_span = jaeger_tracing::child_span("EC sub write", op->client_op->osd_parent_span);
+ }
+#endif
+ if (!messages.empty()) {
+ get_parent()->send_message_osd_cluster(messages, get_osdmap_epoch());
+ }
+
+ if (should_write_local) {
+ handle_sub_write(
+ get_parent()->whoami_shard(),
+ op->client_op,
+ local_write_op,
+ op->trace);
+ }
+
+ for (auto i = op->on_write.begin();
+ i != op->on_write.end();
+ op->on_write.erase(i++)) {
+ (*i)();
+ }
+
+ return true;
+}
+
+bool ECBackend::try_finish_rmw()
+{
+ if (waiting_commit.empty())
+ return false;
+ Op *op = &(waiting_commit.front());
+ if (op->write_in_progress())
+ return false;
+ waiting_commit.pop_front();
+
+ dout(10) << __func__ << ": " << *op << dendl;
+ dout(20) << __func__ << ": " << cache << dendl;
+
+ if (op->roll_forward_to > completed_to)
+ completed_to = op->roll_forward_to;
+ if (op->version > committed_to)
+ committed_to = op->version;
+
+ if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ if (op->version > get_parent()->get_log().get_can_rollback_to() &&
+ waiting_reads.empty() &&
+ waiting_commit.empty()) {
+ // submit a dummy transaction to kick the rollforward
+ auto tid = get_parent()->get_tid();
+ Op *nop = &(tid_to_op_map[tid]);
+ nop->hoid = op->hoid;
+ nop->trim_to = op->trim_to;
+ nop->roll_forward_to = op->version;
+ nop->tid = tid;
+ nop->reqid = op->reqid;
+ waiting_reads.push_back(*nop);
+ }
+ }
+
+ if (op->using_cache) {
+ cache.release_write_pin(op->pin);
+ }
+ tid_to_op_map.erase(op->tid);
+
+ if (waiting_reads.empty() &&
+ waiting_commit.empty()) {
+ pipeline_state.clear();
+ dout(20) << __func__ << ": clearing pipeline_state "
+ << pipeline_state
+ << dendl;
+ }
+ return true;
+}
+
+void ECBackend::check_ops()
+{
+ while (try_state_to_reads() ||
+ try_reads_to_commit() ||
+ try_finish_rmw());
+}
+
+int ECBackend::objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ return -EOPNOTSUPP;
+}
+
+void ECBackend::objects_read_async(
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read)
+{
+ map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > >
+ reads;
+
+ uint32_t flags = 0;
+ extent_set es;
+ for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > >::const_iterator i =
+ to_read.begin();
+ i != to_read.end();
+ ++i) {
+ pair<uint64_t, uint64_t> tmp =
+ sinfo.offset_len_to_stripe_bounds(
+ make_pair(i->first.get<0>(), i->first.get<1>()));
+
+ es.union_insert(tmp.first, tmp.second);
+ flags |= i->first.get<2>();
+ }
+
+ if (!es.empty()) {
+ auto &offsets = reads[hoid];
+ for (auto j = es.begin();
+ j != es.end();
+ ++j) {
+ offsets.push_back(
+ boost::make_tuple(
+ j.get_start(),
+ j.get_len(),
+ flags));
+ }
+ }
+
+ struct cb {
+ ECBackend *ec;
+ hobject_t hoid;
+ list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > to_read;
+ unique_ptr<Context> on_complete;
+ cb(const cb&) = delete;
+ cb(cb &&) = default;
+ cb(ECBackend *ec,
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete)
+ : ec(ec),
+ hoid(hoid),
+ to_read(to_read),
+ on_complete(on_complete) {}
+ void operator()(map<hobject_t,pair<int, extent_map> > &&results) {
+ auto dpp = ec->get_parent()->get_dpp();
+ ldpp_dout(dpp, 20) << "objects_read_async_cb: got: " << results
+ << dendl;
+ ldpp_dout(dpp, 20) << "objects_read_async_cb: cache: " << ec->cache
+ << dendl;
+
+ auto &got = results[hoid];
+
+ int r = 0;
+ for (auto &&read: to_read) {
+ if (got.first < 0) {
+ if (read.second.second) {
+ read.second.second->complete(got.first);
+ }
+ if (r == 0)
+ r = got.first;
+ } else {
+ ceph_assert(read.second.first);
+ uint64_t offset = read.first.get<0>();
+ uint64_t length = read.first.get<1>();
+ auto range = got.second.get_containing_range(offset, length);
+ ceph_assert(range.first != range.second);
+ ceph_assert(range.first.get_off() <= offset);
+ ldpp_dout(dpp, 30) << "offset: " << offset << dendl;
+ ldpp_dout(dpp, 30) << "range offset: " << range.first.get_off() << dendl;
+ ldpp_dout(dpp, 30) << "length: " << length << dendl;
+ ldpp_dout(dpp, 30) << "range length: " << range.first.get_len() << dendl;
+ ceph_assert(
+ (offset + length) <=
+ (range.first.get_off() + range.first.get_len()));
+ read.second.first->substr_of(
+ range.first.get_val(),
+ offset - range.first.get_off(),
+ length);
+ if (read.second.second) {
+ read.second.second->complete(length);
+ read.second.second = nullptr;
+ }
+ }
+ }
+ to_read.clear();
+ if (on_complete) {
+ on_complete.release()->complete(r);
+ }
+ }
+ ~cb() {
+ for (auto &&i: to_read) {
+ delete i.second.second;
+ }
+ to_read.clear();
+ }
+ };
+ objects_read_and_reconstruct(
+ reads,
+ fast_read,
+ make_gen_lambda_context<
+ map<hobject_t,pair<int, extent_map> > &&, cb>(
+ cb(this,
+ hoid,
+ to_read,
+ on_complete)));
+}
+
+struct CallClientContexts :
+ public GenContext<pair<RecoveryMessages*, ECBackend::read_result_t& > &> {
+ hobject_t hoid;
+ ECBackend *ec;
+ ECBackend::ClientAsyncReadStatus *status;
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ CallClientContexts(
+ hobject_t hoid,
+ ECBackend *ec,
+ ECBackend::ClientAsyncReadStatus *status,
+ const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read)
+ : hoid(hoid), ec(ec), status(status), to_read(to_read) {}
+ void finish(pair<RecoveryMessages *, ECBackend::read_result_t &> &in) override {
+ ECBackend::read_result_t &res = in.second;
+ extent_map result;
+ if (res.r != 0)
+ goto out;
+ ceph_assert(res.returned.size() == to_read.size());
+ ceph_assert(res.errors.empty());
+ for (auto &&read: to_read) {
+ pair<uint64_t, uint64_t> adjusted =
+ ec->sinfo.offset_len_to_stripe_bounds(
+ make_pair(read.get<0>(), read.get<1>()));
+ ceph_assert(res.returned.front().get<0>() == adjusted.first &&
+ res.returned.front().get<1>() == adjusted.second);
+ map<int, bufferlist> to_decode;
+ bufferlist bl;
+ for (map<pg_shard_t, bufferlist>::iterator j =
+ res.returned.front().get<2>().begin();
+ j != res.returned.front().get<2>().end();
+ ++j) {
+ to_decode[j->first.shard] = std::move(j->second);
+ }
+ int r = ECUtil::decode(
+ ec->sinfo,
+ ec->ec_impl,
+ to_decode,
+ &bl);
+ if (r < 0) {
+ res.r = r;
+ goto out;
+ }
+ bufferlist trimmed;
+ trimmed.substr_of(
+ bl,
+ read.get<0>() - adjusted.first,
+ std::min(read.get<1>(),
+ bl.length() - (read.get<0>() - adjusted.first)));
+ result.insert(
+ read.get<0>(), trimmed.length(), std::move(trimmed));
+ res.returned.pop_front();
+ }
+out:
+ status->complete_object(hoid, res.r, std::move(result));
+ ec->kick_reads();
+ }
+};
+
+void ECBackend::objects_read_and_reconstruct(
+ const map<hobject_t,
+ std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
+ > &reads,
+ bool fast_read,
+ GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func)
+{
+ in_progress_client_reads.emplace_back(
+ reads.size(), std::move(func));
+ if (!reads.size()) {
+ kick_reads();
+ return;
+ }
+
+ map<hobject_t, set<int>> obj_want_to_read;
+ set<int> want_to_read;
+ get_want_to_read_shards(&want_to_read);
+
+ map<hobject_t, read_request_t> for_read_op;
+ for (auto &&to_read: reads) {
+ map<pg_shard_t, vector<pair<int, int>>> shards;
+ int r = get_min_avail_to_read_shards(
+ to_read.first,
+ want_to_read,
+ false,
+ fast_read,
+ &shards);
+ ceph_assert(r == 0);
+
+ CallClientContexts *c = new CallClientContexts(
+ to_read.first,
+ this,
+ &(in_progress_client_reads.back()),
+ to_read.second);
+ for_read_op.insert(
+ make_pair(
+ to_read.first,
+ read_request_t(
+ to_read.second,
+ shards,
+ false,
+ c)));
+ obj_want_to_read.insert(make_pair(to_read.first, want_to_read));
+ }
+
+ start_read_op(
+ CEPH_MSG_PRIO_DEFAULT,
+ obj_want_to_read,
+ for_read_op,
+ OpRequestRef(),
+ fast_read, false);
+ return;
+}
+
+
+int ECBackend::send_all_remaining_reads(
+ const hobject_t &hoid,
+ ReadOp &rop)
+{
+ set<int> already_read;
+ const set<pg_shard_t>& ots = rop.obj_to_source[hoid];
+ for (set<pg_shard_t>::iterator i = ots.begin(); i != ots.end(); ++i)
+ already_read.insert(i->shard);
+ dout(10) << __func__ << " have/error shards=" << already_read << dendl;
+ map<pg_shard_t, vector<pair<int, int>>> shards;
+ int r = get_remaining_shards(hoid, already_read, rop.want_to_read[hoid],
+ rop.complete[hoid], &shards, rop.for_recovery);
+ if (r)
+ return r;
+
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > offsets =
+ rop.to_read.find(hoid)->second.to_read;
+ GenContext<pair<RecoveryMessages *, read_result_t& > &> *c =
+ rop.to_read.find(hoid)->second.cb;
+
+ // (Note cuixf) If we needed to read attrs and the read failed, try to read them again.
+ bool want_attrs =
+ rop.to_read.find(hoid)->second.want_attrs &&
+ (!rop.complete[hoid].attrs || rop.complete[hoid].attrs->empty());
+ if (want_attrs) {
+ dout(10) << __func__ << " want attrs again" << dendl;
+ }
+
+ rop.to_read.erase(hoid);
+ rop.to_read.insert(make_pair(
+ hoid,
+ read_request_t(
+ offsets,
+ shards,
+ want_attrs,
+ c)));
+ return 0;
+}
+
+int ECBackend::objects_get_attrs(
+ const hobject_t &hoid,
+ map<string, bufferlist> *out)
+{
+ int r = store->getattrs(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ *out);
+ if (r < 0)
+ return r;
+
+ for (map<string, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ) {
+ if (ECUtil::is_hinfo_key_string(i->first))
+ out->erase(i++);
+ else
+ ++i;
+ }
+ return r;
+}
+
+void ECBackend::rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t)
+{
+ ceph_assert(old_size % sinfo.get_stripe_width() == 0);
+ t->truncate(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ sinfo.aligned_logical_offset_to_chunk_offset(
+ old_size));
+}
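+
+// Sketch of the offset conversion used by rollback_append() above
+// (assuming the usual layout of one chunk per data shard per stripe):
+// a stripe-aligned logical old_size maps to a per-shard offset of
+//
+//   chunk_off = old_size / stripe_width * chunk_size
+//
+// so truncating this shard's object at chunk_off undoes the append.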
+
+int ECBackend::be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
+{
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
+ int r;
+
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
+ }
+
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
+ uint64_t stride = cct->_conf->osd_deep_scrub_stride;
+ if (stride % sinfo.get_chunk_size())
+ stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
+
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (bl.length() % sinfo.get_chunk_size()) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, not chunk size " << sinfo.get_chunk_size() << " aligned"
+ << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == (int)stride) {
+ return -EINPROGRESS;
+ }
+
+ ECUtil::HashInfoRef hinfo = get_hash_info(poid, false, &o.attrs);
+ if (!hinfo) {
+ dout(0) << "_scan_list " << poid << " could not retrieve hash info" << dendl;
+ o.read_error = true;
+ o.digest_present = false;
+ return 0;
+ } else {
+ if (!get_parent()->get_pool().allows_ecoverwrites()) {
+ if (!hinfo->has_chunk_hash()) {
+ dout(0) << "_scan_list " << poid << " got invalid hash info" << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
+ if (hinfo->get_total_chunk_size() != (unsigned)pos.data_pos) {
+ dout(0) << "_scan_list " << poid << " got incorrect size on read 0x"
+ << std::hex << pos
+ << " expected 0x" << hinfo->get_total_chunk_size() << std::dec
+ << dendl;
+ o.ec_size_mismatch = true;
+ return 0;
+ }
+
+ if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) !=
+ pos.data_hash.digest()) {
+ dout(0) << "_scan_list " << poid << " got incorrect hash on read 0x"
+ << std::hex << pos.data_hash.digest() << " != expected 0x"
+ << hinfo->get_chunk_hash(get_parent()->whoami_shard().shard)
+ << std::dec << dendl;
+ o.ec_hash_mismatch = true;
+ return 0;
+ }
+
+ /* We checked above that we match our own stored hash. We cannot
+ * send a hash of the actual object, so instead we simply send
+ * our locally stored hash of shard 0 on the assumption that if
+ * we match our chunk hash and our recollection of the hash for
+ * chunk 0 matches that of our peers, there is likely no corruption.
+ */
+ o.digest = hinfo->get_chunk_hash(0);
+ o.digest_present = true;
+ } else {
+ /* Hack! We must be using partial overwrites, and partial overwrites
+ * don't support deep-scrub yet
+ */
+ o.digest = 0;
+ o.digest_present = true;
+ }
+ }
+
+ o.omap_digest = -1;
+ o.omap_digest_present = true;
+ return 0;
+}
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
new file mode 100644
index 000000000..45495376a
--- /dev/null
+++ b/src/osd/ECBackend.h
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECBACKEND_H
+#define ECBACKEND_H
+
+#include <boost/intrusive/set.hpp>
+#include <boost/intrusive/list.hpp>
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "erasure-code/ErasureCodeInterface.h"
+#include "ECUtil.h"
+#include "ECTransaction.h"
+#include "ExtentCache.h"
+
+//forward declaration
+struct ECSubWrite;
+struct ECSubWriteReply;
+struct ECSubRead;
+struct ECSubReadReply;
+
+struct RecoveryMessages;
+class ECBackend : public PGBackend {
+public:
+ RecoveryHandle *open_recovery_op() override;
+
+ void run_recovery_op(
+ RecoveryHandle *h,
+ int priority
+ ) override;
+
+ int recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ ) override;
+
+ bool _handle_message(
+ OpRequestRef op
+ ) override;
+ bool can_handle_while_inactive(
+ OpRequestRef op
+ ) override;
+ friend struct SubWriteApplied;
+ friend struct SubWriteCommitted;
+ void sub_write_committed(
+ ceph_tid_t tid,
+ eversion_t version,
+ eversion_t last_complete,
+ const ZTracer::Trace &trace);
+ void handle_sub_write(
+ pg_shard_t from,
+ OpRequestRef msg,
+ ECSubWrite &op,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_read(
+ pg_shard_t from,
+ const ECSubRead &op,
+ ECSubReadReply *reply,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_write_reply(
+ pg_shard_t from,
+ const ECSubWriteReply &op,
+ const ZTracer::Trace &trace
+ );
+ void handle_sub_read_reply(
+ pg_shard_t from,
+ ECSubReadReply &op,
+ RecoveryMessages *m,
+ const ZTracer::Trace &trace
+ );
+
+ /// @see ReadOp below
+ void check_recovery_sources(const OSDMapRef& osdmap) override;
+
+ void on_change() override;
+ void clear_recovery_state() override;
+
+ void dump_recovery_info(ceph::Formatter *f) const override;
+
+ void call_write_ordered(std::function<void(void)> &&cb) override;
+
+ void submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ std::vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef op
+ ) override;
+
+ int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ /**
+ * Async read mechanism
+ *
+ * Async reads use the same async read mechanism as does recovery.
+ * CallClientContexts is responsible for reconstructing the response
+ * buffer as well as for calling the callbacks.
+ *
+ * One tricky bit is that two reads may possibly not read from the same
+ * std::set of replicas. This could result in two reads completing in the
+ * wrong (from the interface user's point of view) order. Thus, we
+ * maintain a queue of in progress reads (@see in_progress_client_reads)
+ * to ensure that we always call the completion callback in order.
+ *
+ * Another subtlety is that while we may read a degraded object, we will
+ * still only perform a client read from shards in the acting std::set. This
+ * ensures that we won't ever have to restart a client initiated read in
+ * check_recovery_sources.
+ */
+ void objects_read_and_reconstruct(
+ const std::map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
+ > &reads,
+ bool fast_read,
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> &&func);
+
+ friend struct CallClientContexts;
+ struct ClientAsyncReadStatus {
+ unsigned objects_to_read;
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> func;
+ std::map<hobject_t,std::pair<int, extent_map> > results;
+ explicit ClientAsyncReadStatus(
+ unsigned objects_to_read,
+ GenContextURef<std::map<hobject_t,std::pair<int, extent_map> > &&> &&func)
+ : objects_to_read(objects_to_read), func(std::move(func)) {}
+ void complete_object(
+ const hobject_t &hoid,
+ int err,
+ extent_map &&buffers) {
+ ceph_assert(objects_to_read);
+ --objects_to_read;
+ ceph_assert(!results.count(hoid));
+ results.emplace(hoid, std::make_pair(err, std::move(buffers)));
+ }
+ bool is_complete() const {
+ return objects_to_read == 0;
+ }
+ void run() {
+ func.release()->complete(std::move(results));
+ }
+ };
+ std::list<ClientAsyncReadStatus> in_progress_client_reads;
+ void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read = false) override;
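+
+ /* Illustrative usage sketch (off, len, out_bl and fin are hypothetical):
+  *
+  *   std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+  *                       std::pair<ceph::buffer::list*, Context*>>> to_read;
+  *   to_read.push_back(std::make_pair(boost::make_tuple(off, len, 0u),
+  *                                    std::make_pair(&out_bl, (Context*)nullptr)));
+  *   ec->objects_read_async(hoid, to_read, fin);
+  *
+  * fin->complete(r) fires once the requested extents have been
+  * reconstructed; completion order follows submission order via
+  * in_progress_client_reads.
+  */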
+
+ template <typename Func>
+ void objects_read_async_no_cache(
+ const std::map<hobject_t,extent_set> &to_read,
+ Func &&on_complete) {
+ std::map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > _to_read;
+ for (auto &&hpair: to_read) {
+ auto &l = _to_read[hpair.first];
+ for (auto extent: hpair.second) {
+ l.emplace_back(extent.first, extent.second, 0);
+ }
+ }
+ objects_read_and_reconstruct(
+ _to_read,
+ false,
+ make_gen_lambda_context<
+ std::map<hobject_t,std::pair<int, extent_map> > &&, Func>(
+ std::forward<Func>(on_complete)));
+ }
+ void kick_reads() {
+ while (in_progress_client_reads.size() &&
+ in_progress_client_reads.front().is_complete()) {
+ in_progress_client_reads.front().run();
+ in_progress_client_reads.pop_front();
+ }
+ }
+
+private:
+ friend struct ECRecoveryHandle;
+ uint64_t get_recovery_chunk_size() const {
+ return round_up_to(cct->_conf->osd_recovery_max_chunk,
+ sinfo.get_stripe_width());
+ }
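+
+ // Worked example (hypothetical values): with osd_recovery_max_chunk =
+ // 8388608 (8 MiB) and a stripe width of 24576 bytes, round_up_to()
+ // returns 8404992, the next multiple of the stripe width.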
+
+ void get_want_to_read_shards(std::set<int> *want_to_read) const {
+ const std::vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+ for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
+ int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
+ want_to_read->insert(chunk);
+ }
+ }
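+
+ // For example (hypothetical profile): with 4 data chunks and an identity
+ // chunk mapping this inserts {0,1,2,3}; with a non-trivial mapping it
+ // inserts chunk_mapping[0..3] instead.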
+
+ /**
+ * Recovery
+ *
+ * Recovery uses the same underlying read mechanism as client reads
+ * with the slight difference that recovery reads may come from non
+ * acting shards. Thus, check_recovery_sources may wind up calling
+ * cancel_pull for a read originating with RecoveryOp.
+ *
+ * The recovery process is expressed as a state machine:
+ * - IDLE: Nothing is currently in progress, reads will be started and
+ * we will transition to READING
+ * - READING: We are awaiting a pending read op. Once complete, we will
+ * decode the buffers and proceed to WRITING
+ * - WRITING: We are awaiting a completed push. Once complete, we will
+ * either transition to COMPLETE or to IDLE to continue.
+ * - COMPLETE: complete
+ *
+ * We use the existing Push and PushReply messages and structures to
+ * handle actually shuffling the data over to the replicas. recovery_info
+ * and recovery_progress are expressed in terms of the logical offset
+ * space except for data_included which is in terms of the chunked object
+ * space (to match the passed buffer).
+ *
+ * xattrs are requested on the first read and used to initialize the
+ * object_context if missing on completion of the first read.
+ *
+ * In order to batch up reads and writes, we batch Push, PushReply,
+ * Transaction, and reads in a RecoveryMessages object which is passed
+ * among the recovery methods.
+ */
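+
+ /* Rough shape of the driver, as a sketch only (the actual logic lives in
+  * continue_recovery_op()):
+  *
+  *   switch (op.state) {
+  *   case IDLE:    issue reads for the next extent; state = READING; break;
+  *   case READING: decode the returned chunks into pushes; state = WRITING; break;
+  *   case WRITING: on the final push reply, advance recovery_progress and
+  *                 either return to IDLE or mark COMPLETE; break;
+  *   }
+  */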
+ struct RecoveryOp {
+ hobject_t hoid;
+ eversion_t v;
+ std::set<pg_shard_t> missing_on;
+ std::set<shard_id_t> missing_on_shards;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress recovery_progress;
+
+ enum state_t { IDLE, READING, WRITING, COMPLETE } state;
+
+ static const char* tostr(state_t state) {
+ switch (state) {
+ case ECBackend::RecoveryOp::IDLE:
+ return "IDLE";
+ case ECBackend::RecoveryOp::READING:
+ return "READING";
+ case ECBackend::RecoveryOp::WRITING:
+ return "WRITING";
+ case ECBackend::RecoveryOp::COMPLETE:
+ return "COMPLETE";
+ default:
+ ceph_abort();
+ return "";
+ }
+ }
+
+ // must be filled if state == WRITING
+ std::map<int, ceph::buffer::list> returned_data;
+ std::map<std::string, ceph::buffer::list> xattrs;
+ ECUtil::HashInfoRef hinfo;
+ ObjectContextRef obc;
+ std::set<pg_shard_t> waiting_on_pushes;
+
+ // valid in state READING
+ std::pair<uint64_t, uint64_t> extent_requested;
+
+ void dump(ceph::Formatter *f) const;
+
+ RecoveryOp() : state(IDLE) {}
+ };
+ friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
+ std::map<hobject_t, RecoveryOp> recovery_ops;
+
+ void continue_recovery_op(
+ RecoveryOp &op,
+ RecoveryMessages *m);
+ void dispatch_recovery_messages(RecoveryMessages &m, int priority);
+ friend struct OnRecoveryReadComplete;
+ void handle_recovery_read_complete(
+ const hobject_t &hoid,
+ boost::tuple<uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > &to_read,
+ std::optional<std::map<std::string, ceph::buffer::list> > attrs,
+ RecoveryMessages *m);
+ void handle_recovery_push(
+ const PushOp &op,
+ RecoveryMessages *m,
+ bool is_repair);
+ void handle_recovery_push_reply(
+ const PushReplyOp &op,
+ pg_shard_t from,
+ RecoveryMessages *m);
+ void get_all_avail_shards(
+ const hobject_t &hoid,
+ const std::set<pg_shard_t> &error_shards,
+ std::set<int> &have,
+ std::map<shard_id_t, pg_shard_t> &shards,
+ bool for_recovery);
+
+public:
+ /**
+ * Low level async read mechanism
+ *
+ * To avoid duplicating the logic for requesting and waiting for
+ * multiple object shards, there is a common async read mechanism
+ * taking a std::map of hobject_t->read_request_t which defines callbacks
+ * taking read_result_ts as arguments.
+ *
+ * tid_to_read_map gives open read ops. check_recovery_sources uses
+ * shard_to_read_map and ReadOp::source_to_obj to restart reads
+ * involving down osds.
+ *
+ * The user is responsible for specifying replicas on which to read
+ * and for reassembling the buffer on the other side since client
+ * reads require the original object buffer while recovery only needs
+ * the missing pieces.
+ *
+ * Rather than handling reads on the primary directly, we simply send
+ * ourselves a message. This avoids a dedicated primary path for that
+ * part.
+ */
+ struct read_result_t {
+ int r;
+ std::map<pg_shard_t, int> errors;
+ std::optional<std::map<std::string, ceph::buffer::list> > attrs;
+ std::list<
+ boost::tuple<
+ uint64_t, uint64_t, std::map<pg_shard_t, ceph::buffer::list> > > returned;
+ read_result_t() : r(0) {}
+ };
+ struct read_request_t {
+ const std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> need;
+ bool want_attrs;
+ GenContext<std::pair<RecoveryMessages *, read_result_t& > &> *cb;
+ read_request_t(
+ const std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read,
+ const std::map<pg_shard_t, std::vector<std::pair<int, int>>> &need,
+ bool want_attrs,
+ GenContext<std::pair<RecoveryMessages *, read_result_t& > &> *cb)
+ : to_read(to_read), need(need), want_attrs(want_attrs),
+ cb(cb) {}
+ };
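+
+ /* Illustrative flow (mirroring objects_read_and_reconstruct() in
+  * ECBackend.cc): pick shards with get_min_avail_to_read_shards(), wrap
+  * the extents plus a completion GenContext in a read_request_t per
+  * object, hand the whole map to start_read_op(), and let
+  * handle_sub_read_reply() match replies back up until complete_read_op()
+  * fires the callbacks.
+  */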
+ friend ostream &operator<<(ostream &lhs, const read_request_t &rhs);
+
+ struct ReadOp {
+ int priority;
+ ceph_tid_t tid;
+ OpRequestRef op; // may be null if not on behalf of a client
+ // True if redundant reads are issued, false otherwise;
+ // this is useful to trade off some resources (redundant ops) for
+ // low-latency reads, especially on a relatively idle cluster.
+ bool do_redundant_reads;
+ // True if reading for recovery, which may read only a subset
+ // of the available shards.
+ bool for_recovery;
+
+ ZTracer::Trace trace;
+
+ std::map<hobject_t, std::set<int>> want_to_read;
+ std::map<hobject_t, read_request_t> to_read;
+ std::map<hobject_t, read_result_t> complete;
+
+ std::map<hobject_t, std::set<pg_shard_t>> obj_to_source;
+ std::map<pg_shard_t, std::set<hobject_t> > source_to_obj;
+
+ void dump(ceph::Formatter *f) const;
+
+ std::set<pg_shard_t> in_progress;
+
+ ReadOp(
+ int priority,
+ ceph_tid_t tid,
+ bool do_redundant_reads,
+ bool for_recovery,
+ OpRequestRef op,
+ std::map<hobject_t, std::set<int>> &&_want_to_read,
+ std::map<hobject_t, read_request_t> &&_to_read)
+ : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
+ for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
+ to_read(std::move(_to_read)) {
+ for (auto &&hpair: to_read) {
+ auto &returned = complete[hpair.first].returned;
+ for (auto &&extent: hpair.second.to_read) {
+ returned.push_back(
+ boost::make_tuple(
+ extent.get<0>(),
+ extent.get<1>(),
+ std::map<pg_shard_t, ceph::buffer::list>()));
+ }
+ }
+ }
+ ReadOp() = delete;
+ ReadOp(const ReadOp &) = default;
+ ReadOp(ReadOp &&) = default;
+ };
+ friend struct FinishReadOp;
+ void filter_read_op(
+ const OSDMapRef& osdmap,
+ ReadOp &op);
+ void complete_read_op(ReadOp &rop, RecoveryMessages *m);
+ friend ostream &operator<<(ostream &lhs, const ReadOp &rhs);
+ std::map<ceph_tid_t, ReadOp> tid_to_read_map;
+ std::map<pg_shard_t, std::set<ceph_tid_t> > shard_to_read_map;
+ void start_read_op(
+ int priority,
+ std::map<hobject_t, std::set<int>> &want_to_read,
+ std::map<hobject_t, read_request_t> &to_read,
+ OpRequestRef op,
+ bool do_redundant_reads, bool for_recovery);
+
+ void do_read_op(ReadOp &rop);
+ int send_all_remaining_reads(
+ const hobject_t &hoid,
+ ReadOp &rop);
+
+
+ /**
+ * Client writes
+ *
+ * ECTransaction is responsible for generating a transaction for
+ * each shard to which we need to send the write. As required
+ * by the PGBackend interface, the ECBackend write mechanism
+ * passes trim information with the write and last_complete back
+ * with the reply.
+ *
+ * As with client reads, there is a possibility of out-of-order
+ * completions. Thus, callbacks and completion are called in order
+ * on the writing std::list.
+ */
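+
+ /* Pipeline sketch (see check_ops() in ECBackend.cc): an Op enters
+  * waiting_state via start_rmw(), moves to waiting_reads once
+  * try_state_to_reads() has issued any needed partial-stripe reads,
+  * moves to waiting_commit once try_reads_to_commit() has sent the
+  * per-shard sub-writes, and is retired by try_finish_rmw() when all
+  * commits (and, for pre-mimic peers, applies) have been acknowledged.
+  */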
+ struct Op : boost::intrusive::list_base_hook<> {
+ /// From submit_transaction caller, describes operation
+ hobject_t hoid;
+ object_stat_sum_t delta_stats;
+ eversion_t version;
+ eversion_t trim_to;
+ std::optional<pg_hit_set_history_t> updated_hit_set_history;
+ std::vector<pg_log_entry_t> log_entries;
+ ceph_tid_t tid;
+ osd_reqid_t reqid;
+ ZTracer::Trace trace;
+
+ eversion_t roll_forward_to; /// Soon to be generated internally
+
+ /// Ancillary also provided from submit_transaction caller
+ std::map<hobject_t, ObjectContextRef> obc_map;
+
+ /// see call_write_ordered
+ std::list<std::function<void(void)> > on_write;
+
+ /// Generated internally
+ std::set<hobject_t> temp_added;
+ std::set<hobject_t> temp_cleared;
+
+ ECTransaction::WritePlan plan;
+ bool requires_rmw() const { return !plan.to_read.empty(); }
+ bool invalidates_cache() const { return plan.invalidates_cache; }
+
+ // must be true if requires_rmw(), must be false if invalidates_cache()
+ bool using_cache = true;
+
+ /// In progress read state;
+ std::map<hobject_t,extent_set> pending_read; // subset already being read
+ std::map<hobject_t,extent_set> remote_read; // subset we must read
+ std::map<hobject_t,extent_map> remote_read_result;
+ bool read_in_progress() const {
+ return !remote_read.empty() && remote_read_result.empty();
+ }
+
+ /// In progress write state.
+ std::set<pg_shard_t> pending_commit;
+ // we need pending_apply for pre-mimic peers so that we don't issue a
+ // read on a remote shard before it has applied a previous write. We can
+ // remove this after nautilus.
+ std::set<pg_shard_t> pending_apply;
+ bool write_in_progress() const {
+ return !pending_commit.empty() || !pending_apply.empty();
+ }
+
+ /// optional, may be null, for tracking purposes
+ OpRequestRef client_op;
+
+ /// pin for cache
+ ExtentCache::write_pin pin;
+
+ /// Callbacks
+ Context *on_all_commit = nullptr;
+ ~Op() {
+ delete on_all_commit;
+ }
+ };
+ using op_list = boost::intrusive::list<Op>;
+ friend ostream &operator<<(ostream &lhs, const Op &rhs);
+
+ ExtentCache cache;
+ std::map<ceph_tid_t, Op> tid_to_op_map; /// Owns Op structure
+
+ /**
+   * We model the possible rmw states as a set of waitlists.
+   * All writes currently complete in order, so a write blocked
+   * at waiting_state blocks all writes behind it as well (and
+   * likewise for the other states).
+   *
+   * Future work: we can break this up into a per-object pipeline
+   * (almost). First, provide an ordering token to submit_transaction
+   * and require that all operations within a single transaction take
+   * place on a subset of hobject_t space partitioned by that token
+   * (the hashid seems about right to me -- it even works for temp
+   * objects, since a temp object created for object head foo will only
+   * ever be referenced by other transactions on foo and is never
+   * reused). Next, factor this part into a class and maintain one per
+   * ordering token. Next, fix up PrimaryLogPG's repop queue to be
+   * partitioned by ordering token. Finally, refactor the op pipeline
+   * so that the log entries passed into submit_transaction aren't
+   * versioned. We can't assign versions to them until we actually
+   * submit the operation; that's probably going to be the hard part.
+ */
+ class pipeline_state_t {
+ enum {
+ CACHE_VALID = 0,
+ CACHE_INVALID = 1
+ } pipeline_state = CACHE_VALID;
+ public:
+ bool caching_enabled() const {
+ return pipeline_state == CACHE_VALID;
+ }
+ bool cache_invalid() const {
+ return !caching_enabled();
+ }
+ void invalidate() {
+ pipeline_state = CACHE_INVALID;
+ }
+ void clear() {
+ pipeline_state = CACHE_VALID;
+ }
+ friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs);
+ } pipeline_state;
+
+
+  op_list waiting_state;        /// writes waiting on pipeline_state
+ op_list waiting_reads; /// writes waiting on partial stripe reads
+ op_list waiting_commit; /// writes waiting on initial commit
+ eversion_t completed_to;
+ eversion_t committed_to;
+ void start_rmw(Op *op, PGTransactionUPtr &&t);
+ bool try_state_to_reads();
+ bool try_reads_to_commit();
+ bool try_finish_rmw();
+ void check_ops();
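+
+  /* Assumed flow of a write through the waitlists above (a sketch based
+   * on the declarations in this header, not a verbatim trace):
+   *
+   *   start_rmw(op, std::move(t));   // op queued on waiting_state
+   *   check_ops();                   // repeatedly applies the three
+   *                                  // transitions below until none fire
+   *     try_state_to_reads();        // issue any partial-stripe reads,
+   *                                  // move op to waiting_reads
+   *     try_reads_to_commit();       // once reads complete, send the
+   *                                  // per-shard transactions, move op
+   *                                  // to waiting_commit
+   *     try_finish_rmw();            // once pending_commit/pending_apply
+   *                                  // drain, retire the op from
+   *                                  // tid_to_op_map
+   */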
+
+ ceph::ErasureCodeInterfaceRef ec_impl;
+
+
+ /**
+ * ECRecPred
+ *
+   * Determines whether _have is sufficient to recover an object
+ */
+ class ECRecPred : public IsPGRecoverablePredicate {
+ std::set<int> want;
+ ceph::ErasureCodeInterfaceRef ec_impl;
+ public:
+ explicit ECRecPred(ceph::ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
+ for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
+ want.insert(i);
+ }
+ }
+ bool operator()(const std::set<pg_shard_t> &_have) const override {
+ std::set<int> have;
+ for (std::set<pg_shard_t>::const_iterator i = _have.begin();
+ i != _have.end();
+ ++i) {
+ have.insert(i->shard);
+ }
+ std::map<int, std::vector<std::pair<int, int>>> min;
+ return ec_impl->minimum_to_decode(want, have, &min) == 0;
+ }
+ };
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
+ return new ECRecPred(ec_impl);
+ }
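+
+  /* Illustrative use of the recoverability predicate (hypothetical 2+1
+   * profile, so any two distinct shards suffice):
+   *
+   *   std::unique_ptr<IsPGRecoverablePredicate> pred(
+   *     get_is_recoverable_predicate());
+   *   std::set<pg_shard_t> have;
+   *   have.insert(pg_shard_t(0, shard_id_t(0)));
+   *   have.insert(pg_shard_t(3, shard_id_t(2)));
+   *   bool recoverable = (*pred)(have);   // expected to be true for 2+1
+   */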
+
+ int get_ec_data_chunk_count() const override {
+ return ec_impl->get_data_chunk_count();
+ }
+ int get_ec_stripe_chunk_size() const override {
+ return sinfo.get_chunk_size();
+ }
+
+ /**
+ * ECReadPred
+ *
+   * Determines whether _have is sufficient to read an object
+ */
+ class ECReadPred : public IsPGReadablePredicate {
+ pg_shard_t whoami;
+ ECRecPred rec_pred;
+ public:
+ ECReadPred(
+ pg_shard_t whoami,
+ ceph::ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
+ bool operator()(const std::set<pg_shard_t> &_have) const override {
+ return _have.count(whoami) && rec_pred(_have);
+ }
+ };
+ IsPGReadablePredicate *get_is_readable_predicate() const override {
+ return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
+ }
+
+
+ const ECUtil::stripe_info_t sinfo;
+ /// If modified, ensure that the ref is held until the update is applied
+ SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry;
+ ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool create = false,
+ const std::map<std::string, ceph::buffer::ptr> *attr = NULL);
+
+public:
+ ECBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct,
+ ceph::ErasureCodeInterfaceRef ec_impl,
+ uint64_t stripe_width);
+
+ /// Returns to_read replicas sufficient to reconstruct want
+ int get_min_avail_to_read_shards(
+ const hobject_t &hoid, ///< [in] object
+ const std::set<int> &want, ///< [in] desired shards
+ bool for_recovery, ///< [in] true if we may use non-acting replicas
+ bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read ///< [out] shards, corresponding subchunks to read
+ ); ///< @return error code, 0 on success
+
+ int get_remaining_shards(
+ const hobject_t &hoid,
+ const std::set<int> &avail,
+ const std::set<int> &want,
+ const read_result_t &result,
+ std::map<pg_shard_t, std::vector<std::pair<int, int>>> *to_read,
+ bool for_recovery);
+
+ int objects_get_attrs(
+ const hobject_t &hoid,
+ std::map<std::string, ceph::buffer::list> *out) override;
+
+ void rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t) override;
+
+ bool auto_repair_supported() const override { return true; }
+
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
+ uint64_t be_get_ondisk_size(uint64_t logical_size) override {
+ return sinfo.logical_to_next_chunk_offset(logical_size);
+ }
+ void _failed_push(const hobject_t &hoid,
+ std::pair<RecoveryMessages *, ECBackend::read_result_t &> &in);
+};
+ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs);
+
+#endif
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
new file mode 100644
index 000000000..a65676643
--- /dev/null
+++ b/src/osd/ECMsgTypes.cc
@@ -0,0 +1,393 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ECMsgTypes.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using ceph::bufferlist;
+using ceph::Formatter;
+
+void ECSubWrite::encode(bufferlist &bl) const
+{
+ ENCODE_START(4, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(reqid, bl);
+ encode(soid, bl);
+ encode(stats, bl);
+ encode(t, bl);
+ encode(at_version, bl);
+ encode(trim_to, bl);
+ encode(log_entries, bl);
+ encode(temp_added, bl);
+ encode(temp_removed, bl);
+ encode(updated_hit_set_history, bl);
+ encode(roll_forward_to, bl);
+ encode(backfill_or_async_recovery, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubWrite::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(4, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(reqid, bl);
+ decode(soid, bl);
+ decode(stats, bl);
+ decode(t, bl);
+ decode(at_version, bl);
+ decode(trim_to, bl);
+ decode(log_entries, bl);
+ decode(temp_added, bl);
+ decode(temp_removed, bl);
+ if (struct_v >= 2) {
+ decode(updated_hit_set_history, bl);
+ }
+ if (struct_v >= 3) {
+ decode(roll_forward_to, bl);
+ } else {
+ roll_forward_to = trim_to;
+ }
+ if (struct_v >= 4) {
+ decode(backfill_or_async_recovery, bl);
+ } else {
+ // The old protocol used an empty transaction to indicate backfill or async_recovery
+ backfill_or_async_recovery = t.empty();
+ }
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWrite &rhs)
+{
+ lhs << "ECSubWrite(tid=" << rhs.tid
+ << ", reqid=" << rhs.reqid
+ << ", at_version=" << rhs.at_version
+ << ", trim_to=" << rhs.trim_to
+ << ", roll_forward_to=" << rhs.roll_forward_to;
+ if (rhs.updated_hit_set_history)
+ lhs << ", has_updated_hit_set_history";
+ if (rhs.backfill_or_async_recovery)
+ lhs << ", backfill_or_async_recovery";
+ return lhs << ")";
+}
+
+void ECSubWrite::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("at_version") << at_version;
+ f->dump_stream("trim_to") << trim_to;
+ f->dump_stream("roll_forward_to") << roll_forward_to;
+ f->dump_bool("has_updated_hit_set_history",
+ static_cast<bool>(updated_hit_set_history));
+ f->dump_bool("backfill_or_async_recovery", backfill_or_async_recovery);
+}
+
+void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
+{
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 1;
+ o.back()->at_version = eversion_t(2, 100);
+ o.back()->trim_to = eversion_t(1, 40);
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 4;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
+ o.push_back(new ECSubWrite());
+ o.back()->tid = 9;
+ o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+ o.back()->at_version = eversion_t(10, 300);
+ o.back()->trim_to = eversion_t(5, 42);
+ o.back()->roll_forward_to = eversion_t(8, 250);
+}
+
+void ECSubWriteReply::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(last_complete, bl);
+ encode(committed, bl);
+ encode(applied, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubWriteReply::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(last_complete, bl);
+ decode(committed, bl);
+ decode(applied, bl);
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWriteReply &rhs)
+{
+ return lhs
+ << "ECSubWriteReply(tid=" << rhs.tid
+ << ", last_complete=" << rhs.last_complete
+ << ", committed=" << rhs.committed
+ << ", applied=" << rhs.applied << ")";
+}
+
+void ECSubWriteReply::dump(Formatter *f) const
+{
+ f->dump_unsigned("tid", tid);
+ f->dump_stream("last_complete") << last_complete;
+ f->dump_bool("committed", committed);
+ f->dump_bool("applied", applied);
+}
+
+void ECSubWriteReply::generate_test_instances(list<ECSubWriteReply*>& o)
+{
+ o.push_back(new ECSubWriteReply());
+ o.back()->tid = 20;
+ o.back()->last_complete = eversion_t(100, 2000);
+ o.back()->committed = true;
+ o.push_back(new ECSubWriteReply());
+ o.back()->tid = 80;
+ o.back()->last_complete = eversion_t(50, 200);
+ o.back()->applied = true;
+}
+
+void ECSubRead::encode(bufferlist &bl, uint64_t features) const
+{
+ if ((features & CEPH_FEATURE_OSD_FADVISE_FLAGS) == 0) {
+ ENCODE_START(2, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ map<hobject_t, list<pair<uint64_t, uint64_t> >> tmp;
+ for (auto m = to_read.cbegin(); m != to_read.cend(); ++m) {
+ list<pair<uint64_t, uint64_t> > tlist;
+ for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) {
+ tlist.push_back(std::make_pair(l->get<0>(), l->get<1>()));
+ }
+ tmp[m->first] = tlist;
+ }
+ encode(tmp, bl);
+ encode(attrs_to_read, bl);
+ encode(subchunks, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ ENCODE_START(3, 2, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(to_read, bl);
+ encode(attrs_to_read, bl);
+ encode(subchunks, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubRead::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(3, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ if (struct_v == 1) {
+ map<hobject_t, list<pair<uint64_t, uint64_t> >>tmp;
+ decode(tmp, bl);
+ for (auto m = tmp.cbegin(); m != tmp.cend(); ++m) {
+ list<boost::tuple<uint64_t, uint64_t, uint32_t> > tlist;
+ for (auto l = m->second.cbegin(); l != m->second.cend(); ++l) {
+ tlist.push_back(boost::make_tuple(l->first, l->second, 0));
+ }
+ to_read[m->first] = tlist;
+ }
+ } else {
+ decode(to_read, bl);
+ }
+ decode(attrs_to_read, bl);
+ if (struct_v > 2 && struct_v > struct_compat) {
+ decode(subchunks, bl);
+ } else {
+ for (auto &i : to_read) {
+ subchunks[i.first].push_back(make_pair(0, 1));
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubRead &rhs)
+{
+ return lhs
+ << "ECSubRead(tid=" << rhs.tid
+ << ", to_read=" << rhs.to_read
+ << ", subchunks=" << rhs.subchunks
+ << ", attrs_to_read=" << rhs.attrs_to_read << ")";
+}
+
+void ECSubRead::dump(Formatter *f) const
+{
+ f->dump_stream("from") << from;
+ f->dump_unsigned("tid", tid);
+ f->open_array_section("objects");
+ for (auto i = to_read.cbegin(); i != to_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("extents");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("extent");
+ f->dump_unsigned("off", j->get<0>());
+ f->dump_unsigned("len", j->get<1>());
+ f->dump_unsigned("flags", j->get<2>());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("object_attrs_requested");
+ for (auto i = attrs_to_read.cbegin(); i != attrs_to_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << *i;
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECSubRead::generate_test_instances(list<ECSubRead*>& o)
+{
+ hobject_t hoid1(sobject_t("asdf", 1));
+ hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP));
+ o.push_back(new ECSubRead());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 1;
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(100, 200, 0));
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->attrs_to_read.insert(hoid1);
+ o.push_back(new ECSubRead());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 300;
+ o.back()->to_read[hoid1].push_back(boost::make_tuple(300, 200, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(400, 600, 0));
+ o.back()->to_read[hoid2].push_back(boost::make_tuple(2000, 600, 0));
+ o.back()->attrs_to_read.insert(hoid2);
+}
+
+void ECSubReadReply::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(from, bl);
+ encode(tid, bl);
+ encode(buffers_read, bl);
+ encode(attrs_read, bl);
+ encode(errors, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECSubReadReply::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(from, bl);
+ decode(tid, bl);
+ decode(buffers_read, bl);
+ decode(attrs_read, bl);
+ decode(errors, bl);
+ DECODE_FINISH(bl);
+}
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubReadReply &rhs)
+{
+ return lhs
+ << "ECSubReadReply(tid=" << rhs.tid
+ << ", attrs_read=" << rhs.attrs_read.size()
+ << ")";
+}
+
+void ECSubReadReply::dump(Formatter *f) const
+{
+ f->dump_stream("from") << from;
+ f->dump_unsigned("tid", tid);
+ f->open_array_section("buffers_read");
+ for (auto i = buffers_read.cbegin(); i != buffers_read.cend(); ++i) {
+ f->open_object_section("object");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("data");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("extent");
+ f->dump_unsigned("off", j->first);
+ f->dump_unsigned("buf_len", j->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("attrs_returned");
+ for (auto i = attrs_read.cbegin(); i != attrs_read.cend(); ++i) {
+ f->open_object_section("object_attrs");
+ f->dump_stream("oid") << i->first;
+ f->open_array_section("attrs");
+ for (auto j = i->second.cbegin(); j != i->second.cend(); ++j) {
+ f->open_object_section("attr");
+ f->dump_string("attr", j->first);
+ f->dump_unsigned("val_len", j->second.length());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("errors");
+ for (auto i = errors.cbegin(); i != errors.cend(); ++i) {
+ f->open_object_section("error_pair");
+ f->dump_stream("oid") << i->first;
+ f->dump_int("error", i->second);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ECSubReadReply::generate_test_instances(list<ECSubReadReply*>& o)
+{
+ hobject_t hoid1(sobject_t("asdf", 1));
+ hobject_t hoid2(sobject_t("asdf2", CEPH_NOSNAP));
+ bufferlist bl;
+ bl.append_zero(100);
+ bufferlist bl2;
+ bl2.append_zero(200);
+ o.push_back(new ECSubReadReply());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 1;
+ o.back()->buffers_read[hoid1].push_back(make_pair(20, bl));
+ o.back()->buffers_read[hoid1].push_back(make_pair(2000, bl2));
+ o.back()->buffers_read[hoid2].push_back(make_pair(0, bl));
+ o.back()->attrs_read[hoid1]["foo"] = bl;
+ o.back()->attrs_read[hoid1]["_"] = bl2;
+ o.push_back(new ECSubReadReply());
+ o.back()->from = pg_shard_t(2, shard_id_t(-1));
+ o.back()->tid = 300;
+ o.back()->buffers_read[hoid2].push_back(make_pair(0, bl2));
+ o.back()->attrs_read[hoid2]["foo"] = bl;
+ o.back()->attrs_read[hoid2]["_"] = bl2;
+ o.back()->errors[hoid1] = -2;
+}
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
new file mode 100644
index 000000000..77b4222b2
--- /dev/null
+++ b/src/osd/ECMsgTypes.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECBMSGTYPES_H
+#define ECBMSGTYPES_H
+
+#include "osd_types.h"
+#include "include/buffer.h"
+#include "os/ObjectStore.h"
+#include "boost/tuple/tuple.hpp"
+
+struct ECSubWrite {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ osd_reqid_t reqid;
+ hobject_t soid;
+ pg_stat_t stats;
+ ObjectStore::Transaction t;
+ eversion_t at_version;
+ eversion_t trim_to;
+ eversion_t roll_forward_to;
+ std::vector<pg_log_entry_t> log_entries;
+ std::set<hobject_t> temp_added;
+ std::set<hobject_t> temp_removed;
+ std::optional<pg_hit_set_history_t> updated_hit_set_history;
+ bool backfill_or_async_recovery = false;
+ ECSubWrite() : tid(0) {}
+ ECSubWrite(
+ pg_shard_t from,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ hobject_t soid,
+ const pg_stat_t &stats,
+ const ObjectStore::Transaction &t,
+ eversion_t at_version,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ std::vector<pg_log_entry_t> log_entries,
+ std::optional<pg_hit_set_history_t> updated_hit_set_history,
+ const std::set<hobject_t> &temp_added,
+ const std::set<hobject_t> &temp_removed,
+ bool backfill_or_async_recovery)
+ : from(from), tid(tid), reqid(reqid),
+ soid(soid), stats(stats), t(t),
+ at_version(at_version),
+ trim_to(trim_to), roll_forward_to(roll_forward_to),
+ log_entries(log_entries),
+ temp_added(temp_added),
+ temp_removed(temp_removed),
+ updated_hit_set_history(updated_hit_set_history),
+ backfill_or_async_recovery(backfill_or_async_recovery)
+ {}
+ void claim(ECSubWrite &other) {
+ from = other.from;
+ tid = other.tid;
+ reqid = other.reqid;
+ soid = other.soid;
+ stats = other.stats;
+ t.swap(other.t);
+ at_version = other.at_version;
+ trim_to = other.trim_to;
+ roll_forward_to = other.roll_forward_to;
+ log_entries.swap(other.log_entries);
+ temp_added.swap(other.temp_added);
+ temp_removed.swap(other.temp_removed);
+ updated_hit_set_history = other.updated_hit_set_history;
+ backfill_or_async_recovery = other.backfill_or_async_recovery;
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubWrite*>& o);
+private:
+ // no outside copying -- slow
+ ECSubWrite(ECSubWrite& other);
+ const ECSubWrite& operator=(const ECSubWrite& other);
+};
+WRITE_CLASS_ENCODER(ECSubWrite)
+
+struct ECSubWriteReply {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ eversion_t last_complete;
+ bool committed;
+ bool applied;
+ ECSubWriteReply() : tid(0), committed(false), applied(false) {}
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubWriteReply*>& o);
+};
+WRITE_CLASS_ENCODER(ECSubWriteReply)
+
+struct ECSubRead {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ std::map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >> to_read;
+ std::set<hobject_t> attrs_to_read;
+ std::map<hobject_t, std::vector<std::pair<int, int>>> subchunks;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubRead*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(ECSubRead)
+
+struct ECSubReadReply {
+ pg_shard_t from;
+ ceph_tid_t tid;
+ std::map<hobject_t, std::list<std::pair<uint64_t, ceph::buffer::list> >> buffers_read;
+ std::map<hobject_t, std::map<std::string, ceph::buffer::list>> attrs_read;
+ std::map<hobject_t, int> errors;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ECSubReadReply*>& o);
+};
+WRITE_CLASS_ENCODER(ECSubReadReply)
+
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWrite &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubWriteReply &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubRead &rhs);
+std::ostream &operator<<(
+ std::ostream &lhs, const ECSubReadReply &rhs);
+
+#endif
diff --git a/src/osd/ECTransaction.cc b/src/osd/ECTransaction.cc
new file mode 100644
index 000000000..603f9af0e
--- /dev/null
+++ b/src/osd/ECTransaction.cc
@@ -0,0 +1,670 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <vector>
+#include <sstream>
+
+#include "ECTransaction.h"
+#include "ECUtil.h"
+#include "os/ObjectStore.h"
+#include "common/inline_variant.h"
+
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::ErasureCodeInterfaceRef;
+
+void encode_and_write(
+ pg_t pgid,
+ const hobject_t &oid,
+ const ECUtil::stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ecimpl,
+ const set<int> &want,
+ uint64_t offset,
+ bufferlist bl,
+ uint32_t flags,
+ ECUtil::HashInfoRef hinfo,
+ extent_map &written,
+ map<shard_id_t, ObjectStore::Transaction> *transactions,
+ DoutPrefixProvider *dpp) {
+ const uint64_t before_size = hinfo->get_total_logical_size(sinfo);
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(offset));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(bl.length()));
+ ceph_assert(bl.length());
+
+ map<int, bufferlist> buffers;
+ int r = ECUtil::encode(
+ sinfo, ecimpl, bl, want, &buffers);
+ ceph_assert(r == 0);
+
+ written.insert(offset, bl.length(), bl);
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " new_size "
+ << offset + bl.length()
+ << dendl;
+
+ if (offset >= before_size) {
+ ceph_assert(offset == before_size);
+ hinfo->append(
+ sinfo.aligned_logical_offset_to_chunk_offset(offset),
+ buffers);
+ }
+
+ for (auto &&i : *transactions) {
+ ceph_assert(buffers.count(i.first));
+ bufferlist &enc_bl = buffers[i.first];
+ if (offset >= before_size) {
+ i.second.set_alloc_hint(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ 0, 0,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE |
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY);
+ }
+ i.second.write(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ sinfo.logical_to_prev_chunk_offset(
+ offset),
+ enc_bl.length(),
+ enc_bl,
+ flags);
+ }
+}
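+
+/* Example of the offset translation above (hypothetical 2+1 profile,
+ * stripe_width = 8192, chunk_size = 4096): a stripe-aligned 8192-byte
+ * buffer written at logical offset 8192 encodes into one 4096-byte chunk
+ * per shard, and each shard's chunk lands at chunk offset
+ * logical_to_prev_chunk_offset(8192) = 4096 in its shard object. */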
+
+bool ECTransaction::requires_overwrite(
+ uint64_t prev_size,
+ const PGTransaction::ObjectOperation &op) {
+ // special handling for truncates to 0
+ if (op.truncate && op.truncate->first == 0)
+ return false;
+ return op.is_none() &&
+ ((!op.buffer_updates.empty() &&
+ (op.buffer_updates.begin().get_off() < prev_size)) ||
+ (op.truncate &&
+ (op.truncate->first < prev_size)));
+}
+
+void ECTransaction::generate_transactions(
+ WritePlan &plan,
+ ErasureCodeInterfaceRef &ecimpl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const map<hobject_t,extent_map> &partial_extents,
+ vector<pg_log_entry_t> &entries,
+ map<hobject_t,extent_map> *written_map,
+ map<shard_id_t, ObjectStore::Transaction> *transactions,
+ set<hobject_t> *temp_added,
+ set<hobject_t> *temp_removed,
+ DoutPrefixProvider *dpp,
+ const ceph_release_t require_osd_release)
+{
+ ceph_assert(written_map);
+ ceph_assert(transactions);
+ ceph_assert(temp_added);
+ ceph_assert(temp_removed);
+ ceph_assert(plan.t);
+ auto &t = *(plan.t);
+
+ auto &hash_infos = plan.hash_infos;
+
+ map<hobject_t, pg_log_entry_t*> obj_to_log;
+ for (auto &&i: entries) {
+ obj_to_log.insert(make_pair(i.soid, &i));
+ }
+
+ t.safe_create_traverse(
+ [&](pair<const hobject_t, PGTransaction::ObjectOperation> &opair) {
+ const hobject_t &oid = opair.first;
+ auto &op = opair.second;
+ auto &obc_map = t.obc_map;
+ auto &written = (*written_map)[oid];
+
+ auto iter = obj_to_log.find(oid);
+ pg_log_entry_t *entry = iter != obj_to_log.end() ? iter->second : nullptr;
+
+ ObjectContextRef obc;
+ auto obiter = t.obc_map.find(oid);
+ if (obiter != t.obc_map.end()) {
+ obc = obiter->second;
+ }
+ if (entry) {
+ ceph_assert(obc);
+ } else {
+ ceph_assert(oid.is_temp());
+ }
+
+ ECUtil::HashInfoRef hinfo;
+ {
+ auto iter = hash_infos.find(oid);
+ ceph_assert(iter != hash_infos.end());
+ hinfo = iter->second;
+ }
+
+ if (oid.is_temp()) {
+ if (op.is_fresh_object()) {
+ temp_added->insert(oid);
+ } else if (op.is_delete()) {
+ temp_removed->insert(oid);
+ }
+ }
+
+ if (entry &&
+ entry->is_modify() &&
+ op.updated_snaps) {
+ bufferlist bl(op.updated_snaps->second.size() * 8 + 8);
+ encode(op.updated_snaps->second, bl);
+ entry->snaps.swap(bl);
+ entry->snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ ldpp_dout(dpp, 20) << "generate_transactions: "
+ << opair.first
+ << ", current size is "
+ << hinfo->get_total_logical_size(sinfo)
+ << " buffers are "
+ << op.buffer_updates
+ << dendl;
+ if (op.truncate) {
+ ldpp_dout(dpp, 20) << "generate_transactions: "
+ << " truncate is "
+ << *(op.truncate)
+ << dendl;
+ }
+
+ if (entry && op.updated_snaps) {
+ entry->mod_desc.update_snaps(op.updated_snaps->first);
+ }
+
+ map<string, std::optional<bufferlist> > xattr_rollback;
+ ceph_assert(hinfo);
+ bufferlist old_hinfo;
+ encode(*hinfo, old_hinfo);
+ xattr_rollback[ECUtil::get_hinfo_key()] = old_hinfo;
+
+ if (op.is_none() && op.truncate && op.truncate->first == 0) {
+ ceph_assert(op.truncate->first == 0);
+ ceph_assert(op.truncate->first ==
+ op.truncate->second);
+ ceph_assert(entry);
+ ceph_assert(obc);
+
+ if (op.truncate->first != op.truncate->second) {
+ op.truncate->first = op.truncate->second;
+ } else {
+ op.truncate = std::nullopt;
+ }
+
+ op.delete_first = true;
+ op.init_type = PGTransaction::ObjectOperation::Init::Create();
+
+ if (obc) {
+ /* We need to reapply all of the cached xattrs.
+ * std::map insert fortunately only writes keys
+ * which don't already exist, so this should do
+ * the right thing. */
+ op.attr_updates.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ }
+ }
+
+ if (op.delete_first) {
+ /* We also want to remove the std::nullopt entries since
+       * the keys won't exist anyway */
+ for (auto j = op.attr_updates.begin();
+ j != op.attr_updates.end();
+ ) {
+ if (j->second) {
+ ++j;
+ } else {
+ op.attr_updates.erase(j++);
+ }
+ }
+ /* Fill in all current entries for xattr rollback */
+ if (obc) {
+ xattr_rollback.insert(
+ obc->attr_cache.begin(),
+ obc->attr_cache.end());
+ obc->attr_cache.clear();
+ }
+ if (entry) {
+ entry->mod_desc.rmobject(entry->version.version);
+ for (auto &&st: *transactions) {
+ st.second.collection_move_rename(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ }
+ } else {
+ for (auto &&st: *transactions) {
+ st.second.remove(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ }
+ hinfo->clear();
+ }
+
+ if (op.is_fresh_object() && entry) {
+ entry->mod_desc.create();
+ }
+
+ match(
+ op.init_type,
+ [&](const PGTransaction::ObjectOperation::Init::None &) {},
+ [&](const PGTransaction::ObjectOperation::Init::Create &op) {
+ for (auto &&st: *transactions) {
+ if (require_osd_release >= ceph_release_t::octopus) {
+ st.second.create(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ } else {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
+ for (auto &&st: *transactions) {
+ st.second.clone(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+
+ auto siter = hash_infos.find(op.source);
+ ceph_assert(siter != hash_infos.end());
+ hinfo->update_to(*(siter->second));
+
+ if (obc) {
+ auto cobciter = obc_map.find(op.source);
+ ceph_assert(cobciter != obc_map.end());
+ obc->attr_cache = cobciter->second->attr_cache;
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
+ ceph_assert(op.source.is_temp());
+ for (auto &&st: *transactions) {
+ st.second.collection_move_rename(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(op.source, ghobject_t::NO_GEN, st.first),
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first));
+ }
+ auto siter = hash_infos.find(op.source);
+ ceph_assert(siter != hash_infos.end());
+ hinfo->update_to(*(siter->second));
+ if (obc) {
+ auto cobciter = obc_map.find(op.source);
+ ceph_assert(cobciter == obc_map.end());
+ obc->attr_cache.clear();
+ }
+ });
+
+ // omap not supported (except 0, handled above)
+ ceph_assert(!(op.clear_omap));
+ ceph_assert(!(op.omap_header));
+ ceph_assert(op.omap_updates.empty());
+
+ if (!op.attr_updates.empty()) {
+ map<string, bufferlist> to_set;
+ for (auto &&j: op.attr_updates) {
+ if (j.second) {
+ to_set[j.first] = *(j.second);
+ } else {
+ for (auto &&st : *transactions) {
+ st.second.rmattr(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ j.first);
+ }
+ }
+ if (obc) {
+ auto citer = obc->attr_cache.find(j.first);
+ if (entry) {
+ if (citer != obc->attr_cache.end()) {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ j.first,
+ std::optional<bufferlist>(citer->second)));
+ } else {
+ // won't overwrite anything we put in earlier
+ xattr_rollback.insert(
+ make_pair(
+ j.first,
+ std::nullopt));
+ }
+ }
+ if (j.second) {
+ obc->attr_cache[j.first] = *(j.second);
+ } else if (citer != obc->attr_cache.end()) {
+ obc->attr_cache.erase(citer);
+ }
+ } else {
+ ceph_assert(!entry);
+ }
+ }
+ for (auto &&st : *transactions) {
+ st.second.setattrs(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ to_set);
+ }
+ ceph_assert(!xattr_rollback.empty());
+ }
+ if (entry && !xattr_rollback.empty()) {
+ entry->mod_desc.setattrs(xattr_rollback);
+ }
+
+ if (op.alloc_hint) {
+      /* logical_to_next_chunk_offset() scales down both aligned and
+       * unaligned offsets.
+       *
+       * We don't bother to roll this back at this time for two reasons:
+       * 1) it's advisory
+       * 2) we don't track the old value */
+ uint64_t object_size = sinfo.logical_to_next_chunk_offset(
+ op.alloc_hint->expected_object_size);
+ uint64_t write_size = sinfo.logical_to_next_chunk_offset(
+ op.alloc_hint->expected_write_size);
+
+ for (auto &&st : *transactions) {
+ st.second.set_alloc_hint(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ object_size,
+ write_size,
+ op.alloc_hint->flags);
+ }
+ }
+
+ extent_map to_write;
+ auto pextiter = partial_extents.find(oid);
+ if (pextiter != partial_extents.end()) {
+ to_write = pextiter->second;
+ }
+
+ vector<pair<uint64_t, uint64_t> > rollback_extents;
+ const uint64_t orig_size = hinfo->get_total_logical_size(sinfo);
+
+ uint64_t new_size = orig_size;
+ uint64_t append_after = new_size;
+ ldpp_dout(dpp, 20) << __func__ << ": new_size start " << new_size << dendl;
+ if (op.truncate && op.truncate->first < new_size) {
+ ceph_assert(!op.is_fresh_object());
+ new_size = sinfo.logical_to_next_stripe_offset(
+ op.truncate->first);
+ ldpp_dout(dpp, 20) << __func__ << ": new_size truncate down "
+ << new_size << dendl;
+ if (new_size != op.truncate->first) { // 0 the unaligned part
+ bufferlist bl;
+ bl.append_zero(new_size - op.truncate->first);
+ to_write.insert(
+ op.truncate->first,
+ bl.length(),
+ bl);
+ append_after = sinfo.logical_to_prev_stripe_offset(
+ op.truncate->first);
+ } else {
+ append_after = new_size;
+ }
+ to_write.erase(
+ new_size,
+ std::numeric_limits<uint64_t>::max() - new_size);
+
+ if (entry && !op.is_fresh_object()) {
+ uint64_t restore_from = sinfo.logical_to_prev_chunk_offset(
+ op.truncate->first);
+ uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
+ orig_size -
+ sinfo.logical_to_prev_stripe_offset(op.truncate->first));
+ ceph_assert(rollback_extents.empty());
+
+ ldpp_dout(dpp, 20) << __func__ << ": saving extent "
+ << make_pair(restore_from, restore_len)
+ << dendl;
+ ldpp_dout(dpp, 20) << __func__ << ": truncating to "
+ << new_size
+ << dendl;
+ rollback_extents.emplace_back(
+ make_pair(restore_from, restore_len));
+ for (auto &&st : *transactions) {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ st.second.clone_range(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, entry->version.version, st.first),
+ restore_from,
+ restore_len,
+ restore_from);
+
+ }
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << ": not saving extents, fresh object"
+ << dendl;
+ }
+ for (auto &&st : *transactions) {
+ st.second.truncate(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ sinfo.aligned_logical_offset_to_chunk_offset(new_size));
+ }
+ }
+
+ uint32_t fadvise_flags = 0;
+ for (auto &&extent: op.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ bufferlist bl;
+ match(
+ extent.get_val(),
+ [&](const BufferUpdate::Write &op) {
+ bl = op.buffer;
+ fadvise_flags |= op.fadvise_flags;
+ },
+ [&](const BufferUpdate::Zero &) {
+ bl.append_zero(extent.get_len());
+ },
+ [&](const BufferUpdate::CloneRange &) {
+ ceph_assert(
+ 0 ==
+ "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+ });
+
+ uint64_t off = extent.get_off();
+ uint64_t len = extent.get_len();
+ uint64_t end = off + len;
+ ldpp_dout(dpp, 20) << __func__ << ": adding buffer_update "
+ << make_pair(off, len)
+ << dendl;
+ ceph_assert(len > 0);
+ if (off > new_size) {
+ ceph_assert(off > append_after);
+ bl.prepend_zero(off - new_size);
+ len += off - new_size;
+ ldpp_dout(dpp, 20) << __func__ << ": prepending zeroes to align "
+ << off << "->" << new_size
+ << dendl;
+ off = new_size;
+ }
+ if (!sinfo.logical_offset_is_stripe_aligned(end) && (end > append_after)) {
+ uint64_t aligned_end = sinfo.logical_to_next_stripe_offset(
+ end);
+ uint64_t tail = aligned_end - end;
+ bl.append_zero(tail);
+ ldpp_dout(dpp, 20) << __func__ << ": appending zeroes to align end "
+ << end << "->" << end+tail
+ << ", len: " << len << "->" << len+tail
+ << dendl;
+ end += tail;
+ len += tail;
+ }
+
+ to_write.insert(off, len, bl);
+ if (end > new_size)
+ new_size = end;
+ }
+
+ if (op.truncate &&
+ op.truncate->second > new_size) {
+ ceph_assert(op.truncate->second > append_after);
+ uint64_t truncate_to =
+ sinfo.logical_to_next_stripe_offset(
+ op.truncate->second);
+ uint64_t zeroes = truncate_to - new_size;
+ bufferlist bl;
+ bl.append_zero(zeroes);
+ to_write.insert(
+ new_size,
+ zeroes,
+ bl);
+ new_size = truncate_to;
+ ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
+ << truncate_to
+ << dendl;
+ }
+
+ set<int> want;
+ for (unsigned i = 0; i < ecimpl->get_chunk_count(); ++i) {
+ want.insert(i);
+ }
+ auto to_overwrite = to_write.intersect(0, append_after);
+ ldpp_dout(dpp, 20) << __func__ << ": to_overwrite: "
+ << to_overwrite
+ << dendl;
+ for (auto &&extent: to_overwrite) {
+ ceph_assert(extent.get_off() + extent.get_len() <= append_after);
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
+ if (entry) {
+ uint64_t restore_from = sinfo.aligned_logical_offset_to_chunk_offset(
+ extent.get_off());
+ uint64_t restore_len = sinfo.aligned_logical_offset_to_chunk_offset(
+ extent.get_len());
+ ldpp_dout(dpp, 20) << __func__ << ": overwriting "
+ << restore_from << "~" << restore_len
+ << dendl;
+ if (rollback_extents.empty()) {
+ for (auto &&st : *transactions) {
+ st.second.touch(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, entry->version.version, st.first));
+ }
+ }
+ rollback_extents.emplace_back(make_pair(restore_from, restore_len));
+ for (auto &&st : *transactions) {
+ st.second.clone_range(
+ coll_t(spg_t(pgid, st.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, st.first),
+ ghobject_t(oid, entry->version.version, st.first),
+ restore_from,
+ restore_len,
+ restore_from);
+ }
+ }
+ encode_and_write(
+ pgid,
+ oid,
+ sinfo,
+ ecimpl,
+ want,
+ extent.get_off(),
+ extent.get_val(),
+ fadvise_flags,
+ hinfo,
+ written,
+ transactions,
+ dpp);
+ }
+
+ auto to_append = to_write.intersect(
+ append_after,
+ std::numeric_limits<uint64_t>::max() - append_after);
+ ldpp_dout(dpp, 20) << __func__ << ": to_append: "
+ << to_append
+ << dendl;
+ for (auto &&extent: to_append) {
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_off()));
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(extent.get_len()));
+ ldpp_dout(dpp, 20) << __func__ << ": appending "
+ << extent.get_off() << "~" << extent.get_len()
+ << dendl;
+ encode_and_write(
+ pgid,
+ oid,
+ sinfo,
+ ecimpl,
+ want,
+ extent.get_off(),
+ extent.get_val(),
+ fadvise_flags,
+ hinfo,
+ written,
+ transactions,
+ dpp);
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " resetting hinfo to logical size "
+ << new_size
+ << dendl;
+ if (!rollback_extents.empty() && entry) {
+ if (entry) {
+ ldpp_dout(dpp, 20) << __func__ << ": " << oid
+ << " marking rollback extents "
+ << rollback_extents
+ << dendl;
+ entry->mod_desc.rollback_extents(
+ entry->version.version, rollback_extents);
+ }
+ hinfo->set_total_chunk_size_clear_hash(
+ sinfo.aligned_logical_offset_to_chunk_offset(new_size));
+ } else {
+ ceph_assert(hinfo->get_total_logical_size(sinfo) == new_size);
+ }
+
+ if (entry && !to_append.empty()) {
+ ldpp_dout(dpp, 20) << __func__ << ": marking append "
+ << append_after
+ << dendl;
+ entry->mod_desc.append(append_after);
+ }
+
+ if (!op.is_delete()) {
+ bufferlist hbuf;
+ encode(*hinfo, hbuf);
+ for (auto &&i : *transactions) {
+ i.second.setattr(
+ coll_t(spg_t(pgid, i.first)),
+ ghobject_t(oid, ghobject_t::NO_GEN, i.first),
+ ECUtil::get_hinfo_key(),
+ hbuf);
+ }
+ }
+ });
+}
diff --git a/src/osd/ECTransaction.h b/src/osd/ECTransaction.h
new file mode 100644
index 000000000..5cb16261a
--- /dev/null
+++ b/src/osd/ECTransaction.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECTRANSACTION_H
+#define ECTRANSACTION_H
+
+#include "OSD.h"
+#include "PGBackend.h"
+#include "ECUtil.h"
+#include "erasure-code/ErasureCodeInterface.h"
+#include "PGTransaction.h"
+#include "ExtentCache.h"
+
+namespace ECTransaction {
+ struct WritePlan {
+ PGTransactionUPtr t;
+ bool invalidates_cache = false; // Yes, both are possible
+ std::map<hobject_t,extent_set> to_read;
+ std::map<hobject_t,extent_set> will_write; // superset of to_read
+
+ std::map<hobject_t,ECUtil::HashInfoRef> hash_infos;
+ };
+
+ bool requires_overwrite(
+ uint64_t prev_size,
+ const PGTransaction::ObjectOperation &op);
+
+ template <typename F>
+ WritePlan get_write_plan(
+ const ECUtil::stripe_info_t &sinfo,
+ PGTransactionUPtr &&t,
+ F &&get_hinfo,
+ DoutPrefixProvider *dpp) {
+ WritePlan plan;
+ t->safe_create_traverse(
+ [&](std::pair<const hobject_t, PGTransaction::ObjectOperation> &i) {
+ ECUtil::HashInfoRef hinfo = get_hinfo(i.first);
+ plan.hash_infos[i.first] = hinfo;
+
+ uint64_t projected_size =
+ hinfo->get_projected_total_logical_size(sinfo);
+
+ if (i.second.deletes_first()) {
+ ldpp_dout(dpp, 20) << __func__ << ": delete, setting projected size"
+ << " to 0" << dendl;
+ projected_size = 0;
+ }
+
+ hobject_t source;
+ if (i.second.has_source(&source)) {
+ plan.invalidates_cache = true;
+
+ ECUtil::HashInfoRef shinfo = get_hinfo(source);
+ projected_size = shinfo->get_projected_total_logical_size(sinfo);
+ plan.hash_infos[source] = shinfo;
+ }
+
+ auto &will_write = plan.will_write[i.first];
+ if (i.second.truncate &&
+ i.second.truncate->first < projected_size) {
+ if (!(sinfo.logical_offset_is_stripe_aligned(
+ i.second.truncate->first))) {
+ plan.to_read[i.first].union_insert(
+ sinfo.logical_to_prev_stripe_offset(i.second.truncate->first),
+ sinfo.get_stripe_width());
+
+ ldpp_dout(dpp, 20) << __func__ << ": unaligned truncate" << dendl;
+
+ will_write.union_insert(
+ sinfo.logical_to_prev_stripe_offset(i.second.truncate->first),
+ sinfo.get_stripe_width());
+ }
+ projected_size = sinfo.logical_to_next_stripe_offset(
+ i.second.truncate->first);
+ }
+
+ extent_set raw_write_set;
+ for (auto &&extent: i.second.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ if (boost::get<BufferUpdate::CloneRange>(&(extent.get_val()))) {
+ ceph_assert(
+ 0 ==
+ "CloneRange is not allowed, do_op should have returned ENOTSUPP");
+ }
+ raw_write_set.insert(extent.get_off(), extent.get_len());
+ }
+
+ auto orig_size = projected_size;
+ for (auto extent = raw_write_set.begin();
+ extent != raw_write_set.end();
+ ++extent) {
+ uint64_t head_start =
+ sinfo.logical_to_prev_stripe_offset(extent.get_start());
+ uint64_t head_finish =
+ sinfo.logical_to_next_stripe_offset(extent.get_start());
+ if (head_start > projected_size) {
+ head_start = projected_size;
+ }
+ if (head_start != head_finish &&
+ head_start < orig_size) {
+ ceph_assert(head_finish <= orig_size);
+ ceph_assert(head_finish - head_start == sinfo.get_stripe_width());
+ ldpp_dout(dpp, 20) << __func__ << ": reading partial head stripe "
+ << head_start << "~" << sinfo.get_stripe_width()
+ << dendl;
+ plan.to_read[i.first].union_insert(
+ head_start, sinfo.get_stripe_width());
+ }
+
+ uint64_t tail_start =
+ sinfo.logical_to_prev_stripe_offset(
+ extent.get_start() + extent.get_len());
+ uint64_t tail_finish =
+ sinfo.logical_to_next_stripe_offset(
+ extent.get_start() + extent.get_len());
+ if (tail_start != tail_finish &&
+ (head_start == head_finish || tail_start != head_start) &&
+ tail_start < orig_size) {
+ ceph_assert(tail_finish <= orig_size);
+ ceph_assert(tail_finish - tail_start == sinfo.get_stripe_width());
+ ldpp_dout(dpp, 20) << __func__ << ": reading partial tail stripe "
+ << tail_start << "~" << sinfo.get_stripe_width()
+ << dendl;
+ plan.to_read[i.first].union_insert(
+ tail_start, sinfo.get_stripe_width());
+ }
+
+ if (head_start != tail_finish) {
+ ceph_assert(
+ sinfo.logical_offset_is_stripe_aligned(
+ tail_finish - head_start)
+ );
+ will_write.union_insert(
+ head_start, tail_finish - head_start);
+ if (tail_finish > projected_size)
+ projected_size = tail_finish;
+ } else {
+ ceph_assert(tail_finish <= projected_size);
+ }
+ }
+
+ if (i.second.truncate &&
+ i.second.truncate->second > projected_size) {
+ uint64_t truncating_to =
+ sinfo.logical_to_next_stripe_offset(i.second.truncate->second);
+ ldpp_dout(dpp, 20) << __func__ << ": truncating out to "
+ << truncating_to
+ << dendl;
+ will_write.union_insert(projected_size,
+ truncating_to - projected_size);
+ projected_size = truncating_to;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << ": " << i.first
+ << " projected size "
+ << projected_size
+ << dendl;
+ hinfo->set_projected_total_logical_size(
+ sinfo,
+ projected_size);
+
+ /* validate post conditions:
+ * to_read should have an entry for i.first iff it isn't empty
+ * and if we are reading from i.first, we can't be renaming or
+ * cloning it */
+ ceph_assert(plan.to_read.count(i.first) == 0 ||
+ (!plan.to_read.at(i.first).empty() &&
+ !i.second.has_source()));
+ });
+ plan.t = std::move(t);
+ return plan;
+ }
+
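+  /* Worked example for the head/tail logic above (hypothetical numbers:
+   * stripe_width = 8192, existing object size = 16384, client writes
+   * 4096~8192, i.e. bytes 4096..12287):
+   *
+   *   head_start  = logical_to_prev_stripe_offset(4096)  = 0
+   *   head_finish = logical_to_next_stripe_offset(4096)  = 8192
+   *     -> read stripe 0~8192 to supply bytes 0..4095
+   *   tail_start  = logical_to_prev_stripe_offset(12288) = 8192
+   *   tail_finish = logical_to_next_stripe_offset(12288) = 16384
+   *     -> read stripe 8192~8192 to supply bytes 12288..16383
+   *
+   * will_write then covers 0~16384 (head_start..tail_finish) and
+   * projected_size stays 16384.
+   */
+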
+ void generate_transactions(
+ WritePlan &plan,
+ ceph::ErasureCodeInterfaceRef &ecimpl,
+ pg_t pgid,
+ const ECUtil::stripe_info_t &sinfo,
+ const std::map<hobject_t,extent_map> &partial_extents,
+ std::vector<pg_log_entry_t> &entries,
+ std::map<hobject_t,extent_map> *written,
+ std::map<shard_id_t, ObjectStore::Transaction> *transactions,
+ std::set<hobject_t> *temp_added,
+ std::set<hobject_t> *temp_removed,
+ DoutPrefixProvider *dpp,
+ const ceph_release_t require_osd_release = ceph_release_t::unknown);
+};
+
+#endif
diff --git a/src/osd/ECUtil.cc b/src/osd/ECUtil.cc
new file mode 100644
index 000000000..94b328458
--- /dev/null
+++ b/src/osd/ECUtil.cc
@@ -0,0 +1,248 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include <errno.h>
+#include "include/encoding.h"
+#include "ECUtil.h"
+
+using namespace std;
+using ceph::bufferlist;
+using ceph::ErasureCodeInterfaceRef;
+using ceph::Formatter;
+
+int ECUtil::decode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ map<int, bufferlist> &to_decode,
+ bufferlist *out) {
+ ceph_assert(to_decode.size());
+
+ uint64_t total_data_size = to_decode.begin()->second.length();
+ ceph_assert(total_data_size % sinfo.get_chunk_size() == 0);
+
+ ceph_assert(out);
+ ceph_assert(out->length() == 0);
+
+ for (map<int, bufferlist>::iterator i = to_decode.begin();
+ i != to_decode.end();
+ ++i) {
+ ceph_assert(i->second.length() == total_data_size);
+ }
+
+ if (total_data_size == 0)
+ return 0;
+
+ for (uint64_t i = 0; i < total_data_size; i += sinfo.get_chunk_size()) {
+ map<int, bufferlist> chunks;
+ for (map<int, bufferlist>::iterator j = to_decode.begin();
+ j != to_decode.end();
+ ++j) {
+ chunks[j->first].substr_of(j->second, i, sinfo.get_chunk_size());
+ }
+ bufferlist bl;
+ int r = ec_impl->decode_concat(chunks, &bl);
+ ceph_assert(r == 0);
+ ceph_assert(bl.length() == sinfo.get_stripe_width());
+ out->claim_append(bl);
+ }
+ return 0;
+}
+
+int ECUtil::decode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ map<int, bufferlist> &to_decode,
+ map<int, bufferlist*> &out) {
+
+ ceph_assert(to_decode.size());
+
+ for (auto &&i : to_decode) {
+ if(i.second.length() == 0)
+ return 0;
+ }
+
+ set<int> need;
+ for (map<int, bufferlist*>::iterator i = out.begin();
+ i != out.end();
+ ++i) {
+ ceph_assert(i->second);
+ ceph_assert(i->second->length() == 0);
+ need.insert(i->first);
+ }
+
+ set<int> avail;
+ for (auto &&i : to_decode) {
+ ceph_assert(i.second.length() != 0);
+ avail.insert(i.first);
+ }
+
+ map<int, vector<pair<int, int>>> min;
+ int r = ec_impl->minimum_to_decode(need, avail, &min);
+ ceph_assert(r == 0);
+
+ int chunks_count = 0;
+ int repair_data_per_chunk = 0;
+ int subchunk_size = sinfo.get_chunk_size()/ec_impl->get_sub_chunk_count();
+
+ for (auto &&i : to_decode) {
+ auto found = min.find(i.first);
+ if (found != min.end()) {
+ int repair_subchunk_count = 0;
+ for (auto& subchunks : min[i.first]) {
+ repair_subchunk_count += subchunks.second;
+ }
+ repair_data_per_chunk = repair_subchunk_count * subchunk_size;
+ chunks_count = (int)i.second.length() / repair_data_per_chunk;
+ break;
+ }
+ }
+
+ for (int i = 0; i < chunks_count; i++) {
+ map<int, bufferlist> chunks;
+ for (auto j = to_decode.begin();
+ j != to_decode.end();
+ ++j) {
+ chunks[j->first].substr_of(j->second,
+ i*repair_data_per_chunk,
+ repair_data_per_chunk);
+ }
+ map<int, bufferlist> out_bls;
+ r = ec_impl->decode(need, chunks, &out_bls, sinfo.get_chunk_size());
+ ceph_assert(r == 0);
+ for (auto j = out.begin(); j != out.end(); ++j) {
+ ceph_assert(out_bls.count(j->first));
+ ceph_assert(out_bls[j->first].length() == sinfo.get_chunk_size());
+ j->second->claim_append(out_bls[j->first]);
+ }
+ }
+ for (auto &&i : out) {
+ ceph_assert(i.second->length() == chunks_count * sinfo.get_chunk_size());
+ }
+ return 0;
+}
+
+int ECUtil::encode(
+ const stripe_info_t &sinfo,
+ ErasureCodeInterfaceRef &ec_impl,
+ bufferlist &in,
+ const set<int> &want,
+ map<int, bufferlist> *out) {
+
+ uint64_t logical_size = in.length();
+
+ ceph_assert(logical_size % sinfo.get_stripe_width() == 0);
+ ceph_assert(out);
+ ceph_assert(out->empty());
+
+ if (logical_size == 0)
+ return 0;
+
+ for (uint64_t i = 0; i < logical_size; i += sinfo.get_stripe_width()) {
+ map<int, bufferlist> encoded;
+ bufferlist buf;
+ buf.substr_of(in, i, sinfo.get_stripe_width());
+ int r = ec_impl->encode(want, buf, &encoded);
+ ceph_assert(r == 0);
+ for (map<int, bufferlist>::iterator i = encoded.begin();
+ i != encoded.end();
+ ++i) {
+ ceph_assert(i->second.length() == sinfo.get_chunk_size());
+ (*out)[i->first].claim_append(i->second);
+ }
+ }
+
+ for (map<int, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ++i) {
+ ceph_assert(i->second.length() % sinfo.get_chunk_size() == 0);
+ ceph_assert(
+ sinfo.aligned_chunk_offset_to_logical_offset(i->second.length()) ==
+ logical_size);
+ }
+ return 0;
+}
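+
+/* Example of the expected output shape (hypothetical 2+1 profile,
+ * stripe_width = 8192, chunk_size = 4096): encoding a 16384-byte input
+ * with want = {0, 1, 2} yields three bufferlists of 8192 bytes each --
+ * two stripes' worth of 4096-byte chunks appended per shard, matching the
+ * length asserts above. */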
+
+void ECUtil::HashInfo::append(uint64_t old_size,
+ map<int, bufferlist> &to_append) {
+ ceph_assert(old_size == total_chunk_size);
+ uint64_t size_to_append = to_append.begin()->second.length();
+ if (has_chunk_hash()) {
+ ceph_assert(to_append.size() == cumulative_shard_hashes.size());
+ for (map<int, bufferlist>::iterator i = to_append.begin();
+ i != to_append.end();
+ ++i) {
+ ceph_assert(size_to_append == i->second.length());
+ ceph_assert((unsigned)i->first < cumulative_shard_hashes.size());
+ uint32_t new_hash = i->second.crc32c(cumulative_shard_hashes[i->first]);
+ cumulative_shard_hashes[i->first] = new_hash;
+ }
+ }
+ total_chunk_size += size_to_append;
+}
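+
+/* Example (assumed sizes): with three shards, appending two successive
+ * 4096-byte chunk maps leaves each cumulative_shard_hashes[i] equal to
+ * crc32c(crc32c(-1, chunk0_i), chunk1_i) -- the running crc32c of that
+ * shard's byte stream -- while total_chunk_size advances 0 -> 4096 -> 8192. */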
+
+void ECUtil::HashInfo::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(total_chunk_size, bl);
+ encode(cumulative_shard_hashes, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ECUtil::HashInfo::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(total_chunk_size, bl);
+ decode(cumulative_shard_hashes, bl);
+ projected_total_chunk_size = total_chunk_size;
+ DECODE_FINISH(bl);
+}
+
+void ECUtil::HashInfo::dump(Formatter *f) const
+{
+ f->dump_unsigned("total_chunk_size", total_chunk_size);
+ f->open_array_section("cumulative_shard_hashes");
+ for (unsigned i = 0; i != cumulative_shard_hashes.size(); ++i) {
+ f->open_object_section("hash");
+ f->dump_unsigned("shard", i);
+ f->dump_unsigned("hash", cumulative_shard_hashes[i]);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+namespace ECUtil {
+std::ostream& operator<<(std::ostream& out, const HashInfo& hi)
+{
+ ostringstream hashes;
+ for (auto hash: hi.cumulative_shard_hashes)
+ hashes << " " << hex << hash;
+ return out << "tcs=" << hi.total_chunk_size << hashes.str();
+}
+}
+
+void ECUtil::HashInfo::generate_test_instances(list<HashInfo*>& o)
+{
+ o.push_back(new HashInfo(3));
+ {
+ bufferlist bl;
+ bl.append_zero(20);
+ map<int, bufferlist> buffers;
+ buffers[0] = bl;
+ buffers[1] = bl;
+ buffers[2] = bl;
+ o.back()->append(0, buffers);
+ o.back()->append(20, buffers);
+ }
+ o.push_back(new HashInfo(4));
+}
+
+const string HINFO_KEY = "hinfo_key";
+
+bool ECUtil::is_hinfo_key_string(const string &key)
+{
+ return key == HINFO_KEY;
+}
+
+const string &ECUtil::get_hinfo_key()
+{
+ return HINFO_KEY;
+}
diff --git a/src/osd/ECUtil.h b/src/osd/ECUtil.h
new file mode 100644
index 000000000..dce78b8a8
--- /dev/null
+++ b/src/osd/ECUtil.h
@@ -0,0 +1,169 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef ECUTIL_H
+#define ECUTIL_H
+
+#include <ostream>
+#include "erasure-code/ErasureCodeInterface.h"
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+namespace ECUtil {
+
+class stripe_info_t {
+ const uint64_t stripe_width;
+ const uint64_t chunk_size;
+public:
+ stripe_info_t(uint64_t stripe_size, uint64_t stripe_width)
+ : stripe_width(stripe_width),
+ chunk_size(stripe_width / stripe_size) {
+ ceph_assert(stripe_width % stripe_size == 0);
+ }
+ bool logical_offset_is_stripe_aligned(uint64_t logical) const {
+ return (logical % stripe_width) == 0;
+ }
+ uint64_t get_stripe_width() const {
+ return stripe_width;
+ }
+ uint64_t get_chunk_size() const {
+ return chunk_size;
+ }
+ uint64_t logical_to_prev_chunk_offset(uint64_t offset) const {
+ return (offset / stripe_width) * chunk_size;
+ }
+ uint64_t logical_to_next_chunk_offset(uint64_t offset) const {
+ return ((offset + stripe_width - 1)/ stripe_width) * chunk_size;
+ }
+ uint64_t logical_to_prev_stripe_offset(uint64_t offset) const {
+ return offset - (offset % stripe_width);
+ }
+ uint64_t logical_to_next_stripe_offset(uint64_t offset) const {
+ return ((offset % stripe_width) ?
+ (offset - (offset % stripe_width) + stripe_width) :
+ offset);
+ }
+ uint64_t aligned_logical_offset_to_chunk_offset(uint64_t offset) const {
+ ceph_assert(offset % stripe_width == 0);
+ return (offset / stripe_width) * chunk_size;
+ }
+ uint64_t aligned_chunk_offset_to_logical_offset(uint64_t offset) const {
+ ceph_assert(offset % chunk_size == 0);
+ return (offset / chunk_size) * stripe_width;
+ }
+ std::pair<uint64_t, uint64_t> aligned_offset_len_to_chunk(
+ std::pair<uint64_t, uint64_t> in) const {
+ return std::make_pair(
+ aligned_logical_offset_to_chunk_offset(in.first),
+ aligned_logical_offset_to_chunk_offset(in.second));
+ }
+ std::pair<uint64_t, uint64_t> offset_len_to_stripe_bounds(
+ std::pair<uint64_t, uint64_t> in) const {
+ uint64_t off = logical_to_prev_stripe_offset(in.first);
+ uint64_t len = logical_to_next_stripe_offset(
+ (in.first - off) + in.second);
+ return std::make_pair(off, len);
+ }
+};
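+
+/* Worked example of the mappings above (hypothetical layout with four
+ * data chunks and 4096-byte chunks: stripe_info_t(4, 16384), i.e.
+ * stripe_width = 16384, chunk_size = 4096):
+ *
+ *   logical_to_prev_chunk_offset(20000)   = (20000 / 16384) * 4096 = 4096
+ *   logical_to_next_chunk_offset(20000)   = 8192
+ *   logical_to_prev_stripe_offset(20000)  = 16384
+ *   logical_to_next_stripe_offset(20000)  = 32768
+ *   aligned_logical_offset_to_chunk_offset(32768) = 8192
+ */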
+
+int decode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ std::map<int, ceph::buffer::list> &to_decode,
+ ceph::buffer::list *out);
+
+int decode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ std::map<int, ceph::buffer::list> &to_decode,
+ std::map<int, ceph::buffer::list*> &out);
+
+int encode(
+ const stripe_info_t &sinfo,
+ ceph::ErasureCodeInterfaceRef &ec_impl,
+ ceph::buffer::list &in,
+ const std::set<int> &want,
+ std::map<int, ceph::buffer::list> *out);
+
+class HashInfo {
+ uint64_t total_chunk_size = 0;
+ std::vector<uint32_t> cumulative_shard_hashes;
+
+ // purely ephemeral, represents the size once all in-flight ops commit
+ uint64_t projected_total_chunk_size = 0;
+public:
+ HashInfo() {}
+ explicit HashInfo(unsigned num_chunks) :
+ cumulative_shard_hashes(num_chunks, -1) {}
+ void append(uint64_t old_size, std::map<int, ceph::buffer::list> &to_append);
+ void clear() {
+ total_chunk_size = 0;
+ cumulative_shard_hashes = std::vector<uint32_t>(
+ cumulative_shard_hashes.size(),
+ -1);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HashInfo*>& o);
+ uint32_t get_chunk_hash(int shard) const {
+ ceph_assert((unsigned)shard < cumulative_shard_hashes.size());
+ return cumulative_shard_hashes[shard];
+ }
+ uint64_t get_total_chunk_size() const {
+ return total_chunk_size;
+ }
+ uint64_t get_projected_total_chunk_size() const {
+ return projected_total_chunk_size;
+ }
+ uint64_t get_total_logical_size(const stripe_info_t &sinfo) const {
+ return get_total_chunk_size() *
+ (sinfo.get_stripe_width()/sinfo.get_chunk_size());
+ }
+ uint64_t get_projected_total_logical_size(const stripe_info_t &sinfo) const {
+ return get_projected_total_chunk_size() *
+ (sinfo.get_stripe_width()/sinfo.get_chunk_size());
+ }
+ void set_projected_total_logical_size(
+ const stripe_info_t &sinfo,
+ uint64_t logical_size) {
+ ceph_assert(sinfo.logical_offset_is_stripe_aligned(logical_size));
+ projected_total_chunk_size = sinfo.aligned_logical_offset_to_chunk_offset(
+ logical_size);
+ }
+ void set_total_chunk_size_clear_hash(uint64_t new_chunk_size) {
+ cumulative_shard_hashes.clear();
+ total_chunk_size = new_chunk_size;
+ }
+ bool has_chunk_hash() const {
+ return !cumulative_shard_hashes.empty();
+ }
+ void update_to(const HashInfo &rhs) {
+ auto ptcs = projected_total_chunk_size;
+ *this = rhs;
+ projected_total_chunk_size = ptcs;
+ }
+ friend std::ostream& operator<<(std::ostream& out, const HashInfo& hi);
+};
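+
+// Illustrative example (editorial sketch): with the hypothetical `sinfo`
+// above (stripe_width / chunk_size == 2), a HashInfo whose total_chunk_size
+// is 4096 reports get_total_logical_size(sinfo) == 8192, and
+// set_projected_total_logical_size(sinfo, 16384) sets the projected chunk
+// size to 8192.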
+
+typedef std::shared_ptr<HashInfo> HashInfoRef;
+
+bool is_hinfo_key_string(const std::string &key);
+const std::string &get_hinfo_key();
+
+WRITE_CLASS_ENCODER(ECUtil::HashInfo)
+}
+#endif
diff --git a/src/osd/ExtentCache.cc b/src/osd/ExtentCache.cc
new file mode 100644
index 000000000..3a8bbf11b
--- /dev/null
+++ b/src/osd/ExtentCache.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "ExtentCache.h"
+
+using std::ostream;
+
+using ceph::bufferlist;
+
+void ExtentCache::extent::_link_pin_state(pin_state &pin_state)
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(!parent_pin_state);
+ parent_pin_state = &pin_state;
+ pin_state.pin_list.push_back(*this);
+}
+
+void ExtentCache::extent::_unlink_pin_state()
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(parent_pin_state);
+ auto liter = pin_state::list::s_iterator_to(*this);
+ parent_pin_state->pin_list.erase(liter);
+ parent_pin_state = nullptr;
+}
+
+void ExtentCache::extent::unlink()
+{
+ ceph_assert(parent_extent_set);
+ ceph_assert(parent_pin_state);
+
+ _unlink_pin_state();
+
+ // remove from extent set
+ {
+ auto siter = object_extent_set::set::s_iterator_to(*this);
+ auto &set = object_extent_set::set::container_from_iterator(siter);
+ ceph_assert(&set == &(parent_extent_set->extent_set));
+ set.erase(siter);
+ }
+
+ parent_extent_set = nullptr;
+ ceph_assert(!parent_pin_state);
+}
+
+void ExtentCache::extent::link(
+ object_extent_set &extent_set,
+ pin_state &pin_state)
+{
+ ceph_assert(!parent_extent_set);
+ parent_extent_set = &extent_set;
+ extent_set.extent_set.insert(*this);
+
+ _link_pin_state(pin_state);
+}
+
+void ExtentCache::extent::move(
+ pin_state &to)
+{
+ _unlink_pin_state();
+ _link_pin_state(to);
+}
+
+void ExtentCache::remove_and_destroy_if_empty(object_extent_set &eset)
+{
+ if (eset.extent_set.empty()) {
+ auto siter = cache_set::s_iterator_to(eset);
+ auto &set = cache_set::container_from_iterator(siter);
+ ceph_assert(&set == &per_object_caches);
+
+ // per_object_caches owns eset
+ per_object_caches.erase(eset);
+ delete &eset;
+ }
+}
+
+ExtentCache::object_extent_set &ExtentCache::get_or_create(
+ const hobject_t &oid)
+{
+ cache_set::insert_commit_data data;
+ auto p = per_object_caches.insert_check(oid, Cmp(), data);
+ if (p.second) {
+ auto *eset = new object_extent_set(oid);
+ per_object_caches.insert_commit(*eset, data);
+ return *eset;
+ } else {
+ return *(p.first);
+ }
+}
+
+ExtentCache::object_extent_set *ExtentCache::get_if_exists(
+ const hobject_t &oid)
+{
+ cache_set::insert_commit_data data;
+ auto p = per_object_caches.insert_check(oid, Cmp(), data);
+ if (p.second) {
+ return nullptr;
+ } else {
+ return &*(p.first);
+ }
+}
+
+std::pair<
+ ExtentCache::object_extent_set::set::iterator,
+ ExtentCache::object_extent_set::set::iterator
+ > ExtentCache::object_extent_set::get_containing_range(
+ uint64_t off, uint64_t len)
+{
+ // fst is first iterator with end after off (may be end)
+ auto fst = extent_set.upper_bound(off, uint_cmp());
+ if (fst != extent_set.begin())
+ --fst;
+ if (fst != extent_set.end() && off >= (fst->offset + fst->get_length()))
+ ++fst;
+
+ // lst is first iterator with start >= off + len (may be end)
+ auto lst = extent_set.lower_bound(off + len, uint_cmp());
+ return std::make_pair(fst, lst);
+}
+
+extent_set ExtentCache::reserve_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_write,
+ const extent_set &to_read)
+{
+ if (to_write.empty() && to_read.empty()) {
+ return extent_set();
+ }
+ extent_set must_read;
+ auto &eset = get_or_create(oid);
+ extent_set missing;
+ for (auto &&res: to_write) {
+ eset.traverse_update(
+ pin,
+ res.first,
+ res.second,
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ action->action = object_extent_set::update_action::UPDATE_PIN;
+ if (!ext) {
+ missing.insert(off, len);
+ }
+ });
+ }
+ must_read.intersection_of(
+ to_read,
+ missing);
+ return must_read;
+}
+
+extent_map ExtentCache::get_remaining_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_get)
+{
+ if (to_get.empty()) {
+ return extent_map();
+ }
+ extent_map ret;
+ auto &eset = get_or_create(oid);
+ for (auto &&res: to_get) {
+ bufferlist bl;
+ uint64_t cur = res.first;
+ eset.traverse_update(
+ pin,
+ res.first,
+ res.second,
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ ceph_assert(off == cur);
+ cur = off + len;
+ action->action = object_extent_set::update_action::NONE;
+ ceph_assert(ext && ext->bl && ext->pinned_by_write());
+ bl.substr_of(
+ *(ext->bl),
+ off - ext->offset,
+ len);
+ ret.insert(off, len, bl);
+ });
+ }
+ return ret;
+}
+
+void ExtentCache::present_rmw_update(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_map &extents)
+{
+ if (extents.empty()) {
+ return;
+ }
+ auto &eset = get_or_create(oid);
+ for (auto &&res: extents) {
+ eset.traverse_update(
+ pin,
+ res.get_off(),
+ res.get_len(),
+ [&](uint64_t off, uint64_t len,
+ extent *ext, object_extent_set::update_action *action) {
+ action->action = object_extent_set::update_action::NONE;
+ ceph_assert(ext && ext->pinned_by_write());
+ action->bl = bufferlist();
+ action->bl->substr_of(
+ res.get_val(),
+ off - res.get_off(),
+ len);
+ });
+ }
+}
+
+ostream &ExtentCache::print(ostream &out) const
+{
+ out << "ExtentCache(" << std::endl;
+ for (auto esiter = per_object_caches.begin();
+ esiter != per_object_caches.end();
+ ++esiter) {
+ out << " Extents(" << esiter->oid << ")[" << std::endl;
+ for (auto exiter = esiter->extent_set.begin();
+ exiter != esiter->extent_set.end();
+ ++exiter) {
+ out << " Extent(" << exiter->offset
+ << "~" << exiter->get_length()
+ << ":" << exiter->pin_tid()
+ << ")" << std::endl;
+ }
+ }
+ return out << ")" << std::endl;
+}
+
+ostream &operator<<(ostream &lhs, const ExtentCache &cache)
+{
+ return cache.print(lhs);
+}
diff --git a/src/osd/ExtentCache.h b/src/osd/ExtentCache.h
new file mode 100644
index 000000000..972228cd0
--- /dev/null
+++ b/src/osd/ExtentCache.h
@@ -0,0 +1,489 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef EXTENT_CACHE_H
+#define EXTENT_CACHE_H
+
+#include <map>
+#include <list>
+#include <vector>
+#include <utility>
+#include <optional>
+#include <boost/intrusive/set.hpp>
+#include <boost/intrusive/list.hpp>
+#include "include/interval_set.h"
+#include "common/interval_map.h"
+#include "include/buffer.h"
+#include "common/hobject.h"
+
+/**
+ ExtentCache
+
+ The main purpose of this cache is to ensure that we can pipeline
+ overlapping partial overwrites.
+
+ To that end we need to ensure that an extent pinned for an operation is
+ live until that operation completes. However, a particular extent
+ might be pinned by multiple operations (several pipelined writes
+ on the same object).
+
+ 1) When we complete an operation, we only look at extents owned solely
+ by that operation.
+ 2) Per-extent overhead is fixed size.
+ 3) Per-operation metadata is fixed size.
+
+ This is simple enough to realize with two main structures:
+ - extent: contains a pointer to the pin owning it and intrusive list
+ pointers to other extents owned by the same pin
+ - pin_state: contains the list head for extents owned by it
+
+ This works as long as we only need to remember one "owner" for
+ each extent. To make this work, we'll need to leverage some
+ invariants guaranteed by higher layers:
+
+ 1) Writes on a particular object must be ordered
+ 2) A particular object will have outstanding reads or writes, but not
+ both (note that you can have a read while a write is committed, but
+ not applied).
+
+ Our strategy therefore will be to have whichever in-progress op will
+ finish "last" be the owner of a particular extent. For now, we won't
+ cache reads, so 2) simply means that we can assume that reads and
+ recovery operations imply no unstable extents on the object in
+ question.
+
+ Write: WaitRead -> WaitCommit -> Complete
+
+ Invariant 1) above actually indicates that we can't have writes
+ bypassing the WaitRead state while there are writes waiting on
+ Reads. Thus, the set of operations pinning a particular extent
+ must always complete in order of arrival.
+
+ This suggests that a particular extent may be in only the following
+ states:
+
+
+ 0) Empty (not in the map at all)
+ 1) Write Pending N
+ - Some write with reqid <= N is currently fetching the data for
+ this extent
+ - The extent must persist until Write reqid N completes
+ - All ops pinning this extent are writes in the WaitRead state of
+ the Write pipeline (there must be an in progress write, so no
+ reads can be in progress).
+ 2) Write Pinned N:
+ - This extent has data corresponding to some reqid M <= N
+ - The extent must persist until Write reqid N commits
+ - All ops pinning this extent are writes in some Write
+ state (all are possible). Reads are not possible
+ in this state (or the others) due to 2).
+
+ All of the above suggests that there are 3 things users can ask of
+ the cache, corresponding to the 3 states of the Write pipeline (see
+ the usage sketch below).
+ */
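+
+// Illustrative usage sketch (editorial, not part of this change).  `oid`,
+// `to_write`, `to_read` and `written` below are assumed to be supplied by
+// the caller (e.g. the EC backend); they are not defined in this file.
+//
+//   ExtentCache cache;
+//   ExtentCache::write_pin pin;
+//   cache.open_write_pin(pin);
+//
+//   // WaitRead: pin the extents we will write, learn what must be read
+//   extent_set must_read =
+//     cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);
+//
+//   // read `must_read` from the store; the rest is already cached
+//   extent_set cached_set = to_read;
+//   cached_set.subtract(must_read);
+//   extent_map cached =
+//     cache.get_remaining_extents_for_rmw(oid, pin, cached_set);
+//
+//   // WaitCommit: publish the buffers actually written
+//   cache.present_rmw_update(oid, pin, written);
+//
+//   // Complete: drop whatever only this operation was keeping alive
+//   cache.release_write_pin(pin);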
+
+/// If someone wants these types, but not ExtentCache, move to another file
+struct bl_split_merge {
+ ceph::buffer::list split(
+ uint64_t offset,
+ uint64_t length,
+ ceph::buffer::list &bl) const {
+ ceph::buffer::list out;
+ out.substr_of(bl, offset, length);
+ return out;
+ }
+ bool can_merge(const ceph::buffer::list &left, const ceph::buffer::list &right) const {
+ return true;
+ }
+ ceph::buffer::list merge(ceph::buffer::list &&left, ceph::buffer::list &&right) const {
+ ceph::buffer::list bl{std::move(left)};
+ bl.claim_append(right);
+ return bl;
+ }
+ uint64_t length(const ceph::buffer::list &b) const { return b.length(); }
+};
+using extent_set = interval_set<uint64_t>;
+using extent_map = interval_map<uint64_t, ceph::buffer::list, bl_split_merge>;
+
+class ExtentCache {
+ struct object_extent_set;
+ struct pin_state;
+private:
+
+ struct extent {
+ object_extent_set *parent_extent_set = nullptr;
+ pin_state *parent_pin_state = nullptr;
+ boost::intrusive::set_member_hook<> extent_set_member;
+ boost::intrusive::list_member_hook<> pin_list_member;
+
+ uint64_t offset;
+ uint64_t length;
+ std::optional<ceph::buffer::list> bl;
+
+ uint64_t get_length() const {
+ return length;
+ }
+
+ bool is_pending() const {
+ return bl == std::nullopt;
+ }
+
+ bool pinned_by_write() const {
+ ceph_assert(parent_pin_state);
+ return parent_pin_state->is_write();
+ }
+
+ uint64_t pin_tid() const {
+ ceph_assert(parent_pin_state);
+ return parent_pin_state->tid;
+ }
+
+ extent(uint64_t offset, ceph::buffer::list _bl)
+ : offset(offset), length(_bl.length()), bl(_bl) {}
+
+ extent(uint64_t offset, uint64_t length)
+ : offset(offset), length(length) {}
+
+ bool operator<(const extent &rhs) const {
+ return offset < rhs.offset;
+ }
+ private:
+ // can briefly violate the two link invariant, used in unlink() and move()
+ void _link_pin_state(pin_state &pin_state);
+ void _unlink_pin_state();
+ public:
+ void unlink();
+ void link(object_extent_set &parent_extent_set, pin_state &pin_state);
+ void move(pin_state &to);
+ };
+
+ struct object_extent_set : boost::intrusive::set_base_hook<> {
+ hobject_t oid;
+ explicit object_extent_set(const hobject_t &oid) : oid(oid) {}
+
+ using set_member_options = boost::intrusive::member_hook<
+ extent,
+ boost::intrusive::set_member_hook<>,
+ &extent::extent_set_member>;
+ using set = boost::intrusive::set<extent, set_member_options>;
+ set extent_set;
+
+ bool operator<(const object_extent_set &rhs) const {
+ return oid < rhs.oid;
+ }
+
+ struct uint_cmp {
+ bool operator()(uint64_t lhs, const extent &rhs) const {
+ return lhs < rhs.offset;
+ }
+ bool operator()(const extent &lhs, uint64_t rhs) const {
+ return lhs.offset < rhs;
+ }
+ };
+ std::pair<set::iterator, set::iterator> get_containing_range(
+ uint64_t offset, uint64_t length);
+
+ void erase(uint64_t offset, uint64_t length);
+
+ struct update_action {
+ enum type {
+ NONE,
+ UPDATE_PIN
+ };
+ type action = NONE;
+ std::optional<ceph::buffer::list> bl;
+ };
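+ /// traverse_update walks [offset, offset+length) and calls f once per
+ /// sub-range: with ext == nullptr for ranges not present in the cache, and
+ /// with the overlapping extent otherwise.  f replies through update_action:
+ /// UPDATE_PIN (re)pins that sub-range to `pin` (creating an extent for a
+ /// gap), NONE leaves ownership unchanged; action.bl, if set, supplies the
+ /// sub-range's data and must match its length.  Extents only partially
+ /// covered by [offset, offset+length) are split, and the untouched head
+ /// and tail stay with their original pin.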
+ template <typename F>
+ void traverse_update(
+ pin_state &pin,
+ uint64_t offset,
+ uint64_t length,
+ F &&f) {
+ auto range = get_containing_range(offset, length);
+
+ if (range.first == range.second || range.first->offset > offset) {
+ uint64_t extlen = range.first == range.second ?
+ length : range.first->offset - offset;
+
+ update_action action;
+ f(offset, extlen, nullptr, &action);
+ ceph_assert(!action.bl || action.bl->length() == extlen);
+ if (action.action == update_action::UPDATE_PIN) {
+ extent *ext = action.bl ?
+ new extent(offset, *action.bl) :
+ new extent(offset, extlen);
+ ext->link(*this, pin);
+ } else {
+ ceph_assert(!action.bl);
+ }
+ }
+
+ for (auto p = range.first; p != range.second;) {
+ extent *ext = &*p;
+ ++p;
+
+ uint64_t extoff = std::max(ext->offset, offset);
+ uint64_t extlen = std::min(
+ ext->length - (extoff - ext->offset),
+ offset + length - extoff);
+
+ update_action action;
+ f(extoff, extlen, ext, &action);
+ ceph_assert(!action.bl || action.bl->length() == extlen);
+ extent *final_extent = nullptr;
+ if (action.action == update_action::NONE) {
+ final_extent = ext;
+ } else {
+ pin_state *ps = ext->parent_pin_state;
+ ext->unlink();
+ if ((ext->offset < offset) &&
+ (ext->offset + ext->get_length() > offset)) {
+ extent *head = nullptr;
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ 0,
+ offset - ext->offset);
+ head = new extent(ext->offset, bl);
+ } else {
+ head = new extent(
+ ext->offset, offset - ext->offset);
+ }
+ head->link(*this, *ps);
+ }
+ if ((ext->offset + ext->length > offset + length) &&
+ (offset + length > ext->offset)) {
+ uint64_t nlen =
+ (ext->offset + ext->get_length()) - (offset + length);
+ extent *tail = nullptr;
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ ext->get_length() - nlen,
+ nlen);
+ tail = new extent(offset + length, bl);
+ } else {
+ tail = new extent(offset + length, nlen);
+ }
+ tail->link(*this, *ps);
+ }
+ if (action.action == update_action::UPDATE_PIN) {
+ if (ext->bl) {
+ ceph::buffer::list bl;
+ bl.substr_of(
+ *(ext->bl),
+ extoff - ext->offset,
+ extlen);
+ final_extent = new ExtentCache::extent(
+ extoff,
+ bl);
+ } else {
+ final_extent = new ExtentCache::extent(
+ extoff, extlen);
+ }
+ final_extent->link(*this, pin);
+ }
+ delete ext;
+ }
+
+ if (action.bl) {
+ ceph_assert(final_extent);
+ ceph_assert(final_extent->length == action.bl->length());
+ final_extent->bl = *(action.bl);
+ }
+
+ uint64_t next_off = p == range.second ?
+ offset + length : p->offset;
+ if (extoff + extlen < next_off) {
+ uint64_t tailoff = extoff + extlen;
+ uint64_t taillen = next_off - tailoff;
+
+ update_action action;
+ f(tailoff, taillen, nullptr, &action);
+ ceph_assert(!action.bl || action.bl->length() == taillen);
+ if (action.action == update_action::UPDATE_PIN) {
+ extent *ext = action.bl ?
+ new extent(tailoff, *action.bl) :
+ new extent(tailoff, taillen);
+ ext->link(*this, pin);
+ } else {
+ ceph_assert(!action.bl);
+ }
+ }
+ }
+ }
+ };
+ struct Cmp {
+ bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
+ return oid < rhs.oid;
+ }
+ bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
+ return lhs.oid < oid;
+ }
+ };
+
+ object_extent_set &get_or_create(const hobject_t &oid);
+ object_extent_set *get_if_exists(const hobject_t &oid);
+
+ void remove_and_destroy_if_empty(object_extent_set &set);
+ using cache_set = boost::intrusive::set<object_extent_set>;
+ cache_set per_object_caches;
+
+ uint64_t next_write_tid = 1;
+ uint64_t next_read_tid = 1;
+ struct pin_state {
+ uint64_t tid = 0;
+ enum pin_type_t {
+ NONE,
+ WRITE,
+ };
+ pin_type_t pin_type = NONE;
+ bool is_write() const { return pin_type == WRITE; }
+
+ pin_state(const pin_state &other) = delete;
+ pin_state &operator=(const pin_state &other) = delete;
+ pin_state(pin_state &&other) = delete;
+ pin_state() = default;
+
+ using list_member_options = boost::intrusive::member_hook<
+ extent,
+ boost::intrusive::list_member_hook<>,
+ &extent::pin_list_member>;
+ using list = boost::intrusive::list<extent, list_member_options>;
+ list pin_list;
+ ~pin_state() {
+ ceph_assert(pin_list.empty());
+ ceph_assert(tid == 0);
+ ceph_assert(pin_type == NONE);
+ }
+ void _open(uint64_t in_tid, pin_type_t in_type) {
+ ceph_assert(pin_type == NONE);
+ ceph_assert(in_tid > 0);
+ tid = in_tid;
+ pin_type = in_type;
+ }
+ };
+
+ void release_pin(pin_state &p) {
+ for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
+ std::unique_ptr<extent> extent(&*iter); // we now own this
+ iter++; // unlink will invalidate
+ ceph_assert(extent->parent_extent_set);
+ auto &eset = *(extent->parent_extent_set);
+ extent->unlink();
+ remove_and_destroy_if_empty(eset);
+ }
+ p.tid = 0;
+ p.pin_type = pin_state::NONE;
+ }
+
+public:
+ class write_pin : private pin_state {
+ friend class ExtentCache;
+ private:
+ void open(uint64_t in_tid) {
+ _open(in_tid, pin_state::WRITE);
+ }
+ public:
+ write_pin() : pin_state() {}
+ };
+
+ void open_write_pin(write_pin &pin) {
+ pin.open(next_write_tid++);
+ }
+
+ /**
+ * Reserves extents required for rmw, and learns
+ * which need to be read
+ *
+ * Pins all extents in to_write. Returns subset of to_read not
+ * currently present in the cache. Caller must obtain those
+ * extents before calling get_remaining_extents_for_rmw.
+ *
+ * Transition table:
+ * - Empty -> Write Pending pin.reqid
+ * - Write Pending N -> Write Pending pin.reqid
+ * - Write Pinned N -> Write Pinned pin.reqid
+ *
+ * @param oid [in] object undergoing rmw
+ * @param pin [in,out] pin to use (obtained from create_write_pin)
+ * @param to_write [in] extents which will be written
+ * @param to_read [in] extents to read prior to write (must be subset
+ * of to_write)
+ * @return subset of to_read which isn't already present or pending
+ */
+ extent_set reserve_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_write,
+ const extent_set &to_read);
+
+ /**
+ * Gets extents required for rmw not returned from
+ * reserve_extents_for_rmw
+ *
+ * Requested extents (to_get) must be the set to_read \ the set
+ * returned from reserve_extents_for_rmw. No transition table,
+ * all extents at this point must be present and already pinned
+ * for this pin by reserve_extents_for_rmw.
+ *
+ * @param oid [in] object
+ * @param pin [in,out] pin associated with this IO
+ * @param to_get [in] extents to get (see above for restrictions)
+ * @return map of buffers from to_get
+ */
+ extent_map get_remaining_extents_for_rmw(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_set &to_get);
+
+ /**
+ * Updates the cache to reflect the rmw write
+ *
+ * All presented extents must already have been specified in
+ * reserve_extents_for_rmw under to_write.
+ *
+ * Transition table:
+ * - Empty -> invalid, must call reserve_extents_for_rmw first
+ * - Write Pending N -> Write Pinned N, update buffer
+ * (assert N >= pin.reqid)
+ * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
+ *
+ * @param oid [in] object
+ * @param pin [in,out] pin associated with this IO
+ * @param extents [in] map of buffers to update
+ * @return void
+ */
+ void present_rmw_update(
+ const hobject_t &oid,
+ write_pin &pin,
+ const extent_map &extents);
+
+ /**
+ * Release all buffers pinned by pin
+ */
+ void release_write_pin(
+ write_pin &pin) {
+ release_pin(pin);
+ }
+
+ std::ostream &print(std::ostream &out) const;
+};
+
+std::ostream &operator <<(std::ostream &lhs, const ExtentCache &cache);
+
+#endif
diff --git a/src/osd/HitSet.cc b/src/osd/HitSet.cc
new file mode 100644
index 000000000..03475d36f
--- /dev/null
+++ b/src/osd/HitSet.cc
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "HitSet.h"
+#include "common/Formatter.h"
+
+using std::ostream;
+using std::list;
+using ceph::Formatter;
+
+// -- HitSet --
+
+HitSet::HitSet(const HitSet::Params& params)
+ : sealed(false)
+{
+ switch (params.get_type()) {
+ case TYPE_BLOOM:
+ {
+ BloomHitSet::Params *p =
+ static_cast<BloomHitSet::Params*>(params.impl.get());
+ impl.reset(new BloomHitSet(p));
+ }
+ break;
+
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet(static_cast<ExplicitHashHitSet::Params*>(params.impl.get())));
+ break;
+
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet(static_cast<ExplicitObjectHitSet::Params*>(params.impl.get())));
+ break;
+
+ default:
+ assert (0 == "unknown HitSet type");
+ }
+}
+
+void HitSet::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(sealed, bl);
+ if (impl) {
+ encode((__u8)impl->get_type(), bl);
+ impl->encode(bl);
+ } else {
+ encode((__u8)TYPE_NONE, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void HitSet::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(sealed, bl);
+ __u8 type;
+ decode(type, bl);
+ switch ((impl_type_t)type) {
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet);
+ break;
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet);
+ break;
+ case TYPE_BLOOM:
+ impl.reset(new BloomHitSet);
+ break;
+ case TYPE_NONE:
+ impl.reset(NULL);
+ break;
+ default:
+ throw ceph::buffer::malformed_input("unrecognized HitSet type");
+ }
+ if (impl)
+ impl->decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void HitSet::dump(Formatter *f) const
+{
+ f->dump_string("type", get_type_name());
+ f->dump_string("sealed", sealed ? "yes" : "no");
+ if (impl)
+ impl->dump(f);
+}
+
+void HitSet::generate_test_instances(list<HitSet*>& o)
+{
+ o.push_back(new HitSet);
+ o.push_back(new HitSet(new BloomHitSet(10, .1, 1)));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ o.push_back(new HitSet(new ExplicitHashHitSet));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ o.push_back(new HitSet(new ExplicitObjectHitSet));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+}
+
+HitSet::Params::Params(const Params& o) noexcept
+{
+ if (o.get_type() != TYPE_NONE) {
+ create_impl(o.get_type());
+ // it's annoying to write virtual operator= methods; use encode/decode
+ // instead.
+ ceph::buffer::list bl;
+ o.impl->encode(bl);
+ auto p = bl.cbegin();
+ impl->decode(p);
+ } // else we don't need to do anything
+}
+
+const HitSet::Params& HitSet::Params::operator=(const Params& o)
+{
+ create_impl(o.get_type());
+ if (o.impl) {
+ // it's annoying to write virtual operator= methods; use encode/decode
+ // instead.
+ ceph::buffer::list bl;
+ o.impl->encode(bl);
+ auto p = bl.cbegin();
+ impl->decode(p);
+ }
+ return *this;
+}
+
+void HitSet::Params::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ if (impl) {
+ encode((__u8)impl->get_type(), bl);
+ impl->encode(bl);
+ } else {
+ encode((__u8)TYPE_NONE, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+bool HitSet::Params::create_impl(impl_type_t type)
+{
+ switch ((impl_type_t)type) {
+ case TYPE_EXPLICIT_HASH:
+ impl.reset(new ExplicitHashHitSet::Params);
+ break;
+ case TYPE_EXPLICIT_OBJECT:
+ impl.reset(new ExplicitObjectHitSet::Params);
+ break;
+ case TYPE_BLOOM:
+ impl.reset(new BloomHitSet::Params);
+ break;
+ case TYPE_NONE:
+ impl.reset(NULL);
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
+void HitSet::Params::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ __u8 type;
+ decode(type, bl);
+ if (!create_impl((impl_type_t)type))
+ throw ceph::buffer::malformed_input("unrecognized HitSet type");
+ if (impl)
+ impl->decode(bl);
+ DECODE_FINISH(bl);
+}
+
+void HitSet::Params::dump(Formatter *f) const
+{
+ f->dump_string("type", HitSet::get_type_name(get_type()));
+ if (impl)
+ impl->dump(f);
+}
+
+void HitSet::Params::generate_test_instances(list<HitSet::Params*>& o)
+{
+#define loop_hitset_params(kind) \
+{ \
+ list<kind::Params*> params; \
+ kind::Params::generate_test_instances(params); \
+ for (list<kind::Params*>::iterator i = params.begin(); \
+ i != params.end(); ++i) \
+ o.push_back(new Params(*i)); \
+}
+ o.push_back(new Params);
+ o.push_back(new Params(new BloomHitSet::Params));
+ loop_hitset_params(BloomHitSet);
+ o.push_back(new Params(new ExplicitHashHitSet::Params));
+ loop_hitset_params(ExplicitHashHitSet);
+ o.push_back(new Params(new ExplicitObjectHitSet::Params));
+ loop_hitset_params(ExplicitObjectHitSet);
+}
+
+ostream& operator<<(ostream& out, const HitSet::Params& p) {
+ out << HitSet::get_type_name(p.get_type());
+ if (p.impl) {
+ out << "{";
+ p.impl->dump_stream(out);
+ out << "}";
+ }
+ return out;
+}
+
+
+void ExplicitHashHitSet::dump(Formatter *f) const {
+ f->dump_unsigned("insert_count", count);
+ f->open_array_section("hash_set");
+ for (ceph::unordered_set<uint32_t>::const_iterator p = hits.begin();
+ p != hits.end();
+ ++p)
+ f->dump_unsigned("hash", *p);
+ f->close_section();
+}
+
+void ExplicitObjectHitSet::dump(Formatter *f) const {
+ f->dump_unsigned("insert_count", count);
+ f->open_array_section("set");
+ for (ceph::unordered_set<hobject_t>::const_iterator p = hits.begin();
+ p != hits.end();
+ ++p) {
+ f->open_object_section("object");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void BloomHitSet::Params::dump(Formatter *f) const {
+ f->dump_float("false_positive_probability", get_fpp());
+ f->dump_int("target_size", target_size);
+ f->dump_int("seed", seed);
+}
+
+void BloomHitSet::dump(Formatter *f) const {
+ f->open_object_section("bloom_filter");
+ bloom.dump(f);
+ f->close_section();
+}
diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h
new file mode 100644
index 000000000..dedc45ed4
--- /dev/null
+++ b/src/osd/HitSet.h
@@ -0,0 +1,455 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_HITSET_H
+#define CEPH_OSD_HITSET_H
+
+#include <string_view>
+
+#include <boost/scoped_ptr.hpp>
+
+#include "include/encoding.h"
+#include "include/unordered_set.h"
+#include "common/bloom_filter.hpp"
+#include "common/hobject.h"
+
+/**
+ * generic container for a HitSet
+ *
+ * Encapsulate a HitSetImpl of any type. Expose a generic interface
+ * to users and wrap the encoded object with a type so that it can be
+ * safely decoded later.
+ */
+
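+// Illustrative usage sketch (editorial, not part of this change); `oid` is
+// assumed to be an hobject_t supplied by the caller:
+//
+//   HitSet::Params params(new BloomHitSet::Params(0.05, 1000, 123));
+//   HitSet hs(params);              // wraps a bloom filter implementation
+//   hs.insert(oid);
+//   bool hit = hs.contains(oid);    // true; bloom gives no false negatives
+//   hs.seal();                      // compress once inserts are finished
+//   ceph::buffer::list bl;
+//   hs.encode(bl);                  // tagged with TYPE_BLOOM for later decode
+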
+class HitSet {
+public:
+ typedef enum {
+ TYPE_NONE = 0,
+ TYPE_EXPLICIT_HASH = 1,
+ TYPE_EXPLICIT_OBJECT = 2,
+ TYPE_BLOOM = 3
+ } impl_type_t;
+
+ static std::string_view get_type_name(impl_type_t t) {
+ switch (t) {
+ case TYPE_NONE: return "none";
+ case TYPE_EXPLICIT_HASH: return "explicit_hash";
+ case TYPE_EXPLICIT_OBJECT: return "explicit_object";
+ case TYPE_BLOOM: return "bloom";
+ default: return "???";
+ }
+ }
+ std::string_view get_type_name() const {
+ if (impl)
+ return get_type_name(impl->get_type());
+ return get_type_name(TYPE_NONE);
+ }
+
+ /// abstract interface for a HitSet implementation
+ class Impl {
+ public:
+ virtual impl_type_t get_type() const = 0;
+ virtual bool is_full() const = 0;
+ virtual void insert(const hobject_t& o) = 0;
+ virtual bool contains(const hobject_t& o) const = 0;
+ virtual unsigned insert_count() const = 0;
+ virtual unsigned approx_unique_insert_count() const = 0;
+ virtual void encode(ceph::buffer::list &bl) const = 0;
+ virtual void decode(ceph::buffer::list::const_iterator& p) = 0;
+ virtual void dump(ceph::Formatter *f) const = 0;
+ virtual Impl* clone() const = 0;
+ virtual void seal() {}
+ virtual ~Impl() {}
+ };
+
+ boost::scoped_ptr<Impl> impl;
+ bool sealed;
+
+ class Params {
+ /// create an Impl* of the given type
+ bool create_impl(impl_type_t t);
+
+ public:
+ class Impl {
+ public:
+ virtual impl_type_t get_type() const = 0;
+ virtual HitSet::Impl *get_new_impl() const = 0;
+ virtual void encode(ceph::buffer::list &bl) const {}
+ virtual void decode(ceph::buffer::list::const_iterator& p) {}
+ virtual void dump(ceph::Formatter *f) const {}
+ virtual void dump_stream(std::ostream& o) const {}
+ virtual ~Impl() {}
+ };
+
+ Params() {}
+ explicit Params(Impl *i) : impl(i) {}
+ virtual ~Params() {}
+
+ boost::scoped_ptr<Params::Impl> impl;
+
+ impl_type_t get_type() const {
+ if (impl)
+ return impl->get_type();
+ return TYPE_NONE;
+ }
+
+ Params(const Params& o) noexcept;
+ const Params& operator=(const Params& o);
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HitSet::Params*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const HitSet::Params& p);
+ };
+
+ HitSet() : impl(NULL), sealed(false) {}
+ explicit HitSet(Impl *i) : impl(i), sealed(false) {}
+ explicit HitSet(const HitSet::Params& params);
+
+ HitSet(const HitSet& o) {
+ sealed = o.sealed;
+ if (o.impl)
+ impl.reset(o.impl->clone());
+ else
+ impl.reset(NULL);
+ }
+ const HitSet& operator=(const HitSet& o) {
+ sealed = o.sealed;
+ if (o.impl)
+ impl.reset(o.impl->clone());
+ else
+ impl.reset(NULL);
+ return *this;
+ }
+
+
+ bool is_full() const {
+ return impl->is_full();
+ }
+ /// insert a hash into the set
+ void insert(const hobject_t& o) {
+ impl->insert(o);
+ }
+ /// query whether a hash is in the set
+ bool contains(const hobject_t& o) const {
+ return impl->contains(o);
+ }
+
+ unsigned insert_count() const {
+ return impl->insert_count();
+ }
+ unsigned approx_unique_insert_count() const {
+ return impl->approx_unique_insert_count();
+ }
+ void seal() {
+ ceph_assert(!sealed);
+ sealed = true;
+ impl->seal();
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<HitSet*>& o);
+
+private:
+ void reset_to_type(impl_type_t type);
+};
+WRITE_CLASS_ENCODER(HitSet)
+WRITE_CLASS_ENCODER(HitSet::Params)
+
+typedef boost::shared_ptr<HitSet> HitSetRef;
+
+std::ostream& operator<<(std::ostream& out, const HitSet::Params& p);
+
+/**
+ * explicitly enumerate hash hits in the set
+ */
+class ExplicitHashHitSet : public HitSet::Impl {
+ uint64_t count;
+ ceph::unordered_set<uint32_t> hits;
+public:
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_HASH;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new ExplicitHashHitSet;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ }
+ };
+
+ ExplicitHashHitSet() : count(0) {}
+ explicit ExplicitHashHitSet(const ExplicitHashHitSet::Params *p) : count(0) {}
+ ExplicitHashHitSet(const ExplicitHashHitSet &o) : count(o.count),
+ hits(o.hits) {}
+
+ HitSet::Impl *clone() const override {
+ return new ExplicitHashHitSet(*this);
+ }
+
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_HASH;
+ }
+ bool is_full() const override {
+ return false;
+ }
+ void insert(const hobject_t& o) override {
+ hits.insert(o.get_hash());
+ ++count;
+ }
+ bool contains(const hobject_t& o) const override {
+ return hits.count(o.get_hash());
+ }
+ unsigned insert_count() const override {
+ return count;
+ }
+ unsigned approx_unique_insert_count() const override {
+ return hits.size();
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(count, bl);
+ encode(hits, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) override {
+ DECODE_START(1, bl);
+ decode(count, bl);
+ decode(hits, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<ExplicitHashHitSet*>& o) {
+ o.push_back(new ExplicitHashHitSet);
+ o.push_back(new ExplicitHashHitSet);
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(ExplicitHashHitSet)
+
+/**
+ * explicitly enumerate objects in the set
+ */
+class ExplicitObjectHitSet : public HitSet::Impl {
+ uint64_t count;
+ ceph::unordered_set<hobject_t> hits;
+public:
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_OBJECT;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new ExplicitObjectHitSet;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ }
+ };
+
+ ExplicitObjectHitSet() : count(0) {}
+ explicit ExplicitObjectHitSet(const ExplicitObjectHitSet::Params *p) : count(0) {}
+ ExplicitObjectHitSet(const ExplicitObjectHitSet &o) : count(o.count),
+ hits(o.hits) {}
+
+ HitSet::Impl *clone() const override {
+ return new ExplicitObjectHitSet(*this);
+ }
+
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_EXPLICIT_OBJECT;
+ }
+ bool is_full() const override {
+ return false;
+ }
+ void insert(const hobject_t& o) override {
+ hits.insert(o);
+ ++count;
+ }
+ bool contains(const hobject_t& o) const override {
+ return hits.count(o);
+ }
+ unsigned insert_count() const override {
+ return count;
+ }
+ unsigned approx_unique_insert_count() const override {
+ return hits.size();
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(count, bl);
+ encode(hits, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(count, bl);
+ decode(hits, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<ExplicitObjectHitSet*>& o) {
+ o.push_back(new ExplicitObjectHitSet);
+ o.push_back(new ExplicitObjectHitSet);
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(ExplicitObjectHitSet)
+
+/**
+ * use a bloom_filter to track hits to the set
+ */
+class BloomHitSet : public HitSet::Impl {
+ compressible_bloom_filter bloom;
+
+public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_BLOOM;
+ }
+
+ class Params : public HitSet::Params::Impl {
+ public:
+ HitSet::impl_type_t get_type() const override {
+ return HitSet::TYPE_BLOOM;
+ }
+ HitSet::Impl *get_new_impl() const override {
+ return new BloomHitSet;
+ }
+
+ uint32_t fpp_micro; ///< false positive probability / 1M
+ uint64_t target_size; ///< number of unique insertions we expect into this HitSet
+ uint64_t seed; ///< seed to use when initializing the bloom filter
+
+ Params()
+ : fpp_micro(0), target_size(0), seed(0) {}
+ Params(double fpp, uint64_t t, uint64_t s)
+ : fpp_micro(fpp * 1000000.0), target_size(t), seed(s) {}
+ Params(const Params &o)
+ : fpp_micro(o.fpp_micro),
+ target_size(o.target_size),
+ seed(o.seed) {}
+ ~Params() override {}
+
+ double get_fpp() const {
+ return (double)fpp_micro / 1000000.0;
+ }
+ void set_fpp(double f) {
+ fpp_micro = (unsigned)(llrintl(f * 1000000.0));
+ }
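+ // e.g. (illustrative): Params(0.05, 1000, 123) stores fpp_micro == 50000,
+ // so get_fpp() returns 0.05; set_fpp(0.01) stores fpp_micro == 10000.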
+
+ void encode(ceph::buffer::list& bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(fpp_micro, bl);
+ encode(target_size, bl);
+ encode(seed, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(fpp_micro, bl);
+ decode(target_size, bl);
+ decode(seed, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ void dump_stream(std::ostream& o) const override {
+ o << "false_positive_probability: "
+ << get_fpp() << ", target_size: " << target_size
+ << ", seed: " << seed;
+ }
+ static void generate_test_instances(std::list<Params*>& o) {
+ o.push_back(new Params);
+ o.push_back(new Params);
+ (*o.rbegin())->fpp_micro = 123456;
+ (*o.rbegin())->target_size = 300;
+ (*o.rbegin())->seed = 99;
+ }
+ };
+
+ BloomHitSet() {}
+ BloomHitSet(unsigned inserts, double fpp, int seed)
+ : bloom(inserts, fpp, seed)
+ {}
+ explicit BloomHitSet(const BloomHitSet::Params *p) : bloom(p->target_size,
+ p->get_fpp(),
+ p->seed)
+ {}
+
+ BloomHitSet(const BloomHitSet &o) {
+ // oh god
+ ceph::buffer::list bl;
+ o.encode(bl);
+ auto bli = std::cbegin(bl);
+ this->decode(bli);
+ }
+
+ HitSet::Impl *clone() const override {
+ return new BloomHitSet(*this);
+ }
+
+ bool is_full() const override {
+ return bloom.is_full();
+ }
+
+ void insert(const hobject_t& o) override {
+ bloom.insert(o.get_hash());
+ }
+ bool contains(const hobject_t& o) const override {
+ return bloom.contains(o.get_hash());
+ }
+ unsigned insert_count() const override {
+ return bloom.element_count();
+ }
+ unsigned approx_unique_insert_count() const override {
+ return bloom.approx_unique_element_count();
+ }
+ void seal() override {
+ // aim for a density of .5 (50% of bits set)
+ double pc = bloom.density() * 2.0;
+ if (pc < 1.0)
+ bloom.compress(pc);
+ }
+
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(bloom, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ decode(bloom, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const override;
+ static void generate_test_instances(std::list<BloomHitSet*>& o) {
+ o.push_back(new BloomHitSet);
+ o.push_back(new BloomHitSet(10, .1, 1));
+ o.back()->insert(hobject_t());
+ o.back()->insert(hobject_t("asdf", "", CEPH_NOSNAP, 123, 1, ""));
+ o.back()->insert(hobject_t("qwer", "", CEPH_NOSNAP, 456, 1, ""));
+ }
+};
+WRITE_CLASS_ENCODER(BloomHitSet)
+
+#endif
diff --git a/src/osd/MissingLoc.cc b/src/osd/MissingLoc.cc
new file mode 100644
index 000000000..d45220a82
--- /dev/null
+++ b/src/osd/MissingLoc.cc
@@ -0,0 +1,226 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "MissingLoc.h"
+
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix (gen_prefix(*_dout))
+#define dout_subsys ceph_subsys_osd
+
+using std::set;
+
+bool MissingLoc::readable_with_acting(
+ const hobject_t &hoid,
+ const set<pg_shard_t> &acting,
+ eversion_t* v) const {
+ if (!needs_recovery(hoid, v))
+ return true;
+ if (is_deleted(hoid))
+ return false;
+ auto missing_loc_entry = missing_loc.find(hoid);
+ if (missing_loc_entry == missing_loc.end())
+ return false;
+ const set<pg_shard_t> &locs = missing_loc_entry->second;
+ ldout(cct, 10) << __func__ << ": locs:" << locs << dendl;
+ set<pg_shard_t> have_acting;
+ for (auto i = locs.begin(); i != locs.end(); ++i) {
+ if (acting.count(*i))
+ have_acting.insert(*i);
+ }
+ return (*is_readable)(have_acting);
+}
+
+void MissingLoc::add_batch_sources_info(
+ const set<pg_shard_t> &sources,
+ HBHandle *handle)
+{
+ ldout(cct, 10) << __func__ << ": adding sources in batch "
+ << sources.size() << dendl;
+ unsigned loop = 0;
+ bool sources_updated = false;
+ for (auto i = needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) {
+ handle->reset_tp_timeout();
+ loop = 0;
+ }
+ if (i->second.is_delete())
+ continue;
+
+ auto p = missing_loc.find(i->first);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(i->first, set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ missing_loc[i->first].insert(sources.begin(), sources.end());
+ _inc_count(p->second);
+
+ if (!sources_updated) {
+ missing_loc_sources.insert(sources.begin(), sources.end());
+ sources_updated = true;
+ }
+ }
+}
+
+bool MissingLoc::add_source_info(
+ pg_shard_t fromosd,
+ const pg_info_t &oinfo,
+ const pg_missing_t &omissing,
+ HBHandle *handle)
+{
+ bool found_missing = false;
+ unsigned loop = 0;
+ bool sources_updated = false;
+ // found items?
+ for (auto p = needs_recovery_map.begin();
+ p != needs_recovery_map.end();
+ ++p) {
+ const hobject_t &soid(p->first);
+ eversion_t need = p->second.need;
+ if (handle && ++loop >= cct->_conf->osd_loop_before_reset_tphandle) {
+ handle->reset_tp_timeout();
+ loop = 0;
+ }
+ if (p->second.is_delete()) {
+ ldout(cct, 10) << __func__ << " " << soid
+ << " delete, ignoring source" << dendl;
+ continue;
+ }
+ if (oinfo.last_update < need) {
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd
+ << " (last_update " << oinfo.last_update
+ << " < needed " << need << ")" << dendl;
+ continue;
+ }
+ if (p->first >= oinfo.last_backfill) {
+ // FIXME: this is _probably_ true, although it could conceivably
+ // be in the undefined region! Hmm!
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd
+ << " (past last_backfill " << oinfo.last_backfill
+ << ")" << dendl;
+ continue;
+ }
+ if (omissing.is_missing(soid)) {
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " also missing on osd." << fromosd << dendl;
+ continue;
+ }
+
+ ldout(cct, 10) << "search_for_missing " << soid << " " << need
+ << " is on osd." << fromosd << dendl;
+
+ {
+ auto p = missing_loc.find(soid);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(soid, set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ p->second.insert(fromosd);
+ _inc_count(p->second);
+ }
+
+ if (!sources_updated) {
+ missing_loc_sources.insert(fromosd);
+ sources_updated = true;
+ }
+ found_missing = true;
+ }
+
+ ldout(cct, 20) << "needs_recovery_map missing " << needs_recovery_map
+ << dendl;
+ return found_missing;
+}
+
+void MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ set<pg_shard_t> now_down;
+ for (auto p = missing_loc_sources.begin();
+ p != missing_loc_sources.end();
+ ) {
+ if (osdmap->is_up(p->osd)) {
+ ++p;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " source osd." << *p << " now down" << dendl;
+ now_down.insert(*p);
+ missing_loc_sources.erase(p++);
+ }
+
+ if (now_down.empty()) {
+ ldout(cct, 10) << __func__ << " no source osds (" << missing_loc_sources << ") went down" << dendl;
+ } else {
+ ldout(cct, 10) << __func__ << " sources osds " << now_down << " now down, remaining sources are "
+ << missing_loc_sources << dendl;
+
+ // filter missing_loc
+ auto p = missing_loc.begin();
+ while (p != missing_loc.end()) {
+ auto q = p->second.begin();
+ bool changed = false;
+ while (q != p->second.end()) {
+ if (now_down.count(*q)) {
+ if (!changed) {
+ changed = true;
+ _dec_count(p->second);
+ }
+ p->second.erase(q++);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ missing_loc.erase(p++);
+ } else {
+ if (changed) {
+ _inc_count(p->second);
+ }
+ ++p;
+ }
+ }
+ }
+}
+
+void MissingLoc::remove_stray_recovery_sources(pg_shard_t stray)
+{
+ ldout(cct, 10) << __func__ << " remove osd " << stray << " from missing_loc" << dendl;
+ // filter missing_loc
+ auto p = missing_loc.begin();
+ while (p != missing_loc.end()) {
+ auto q = p->second.begin();
+ bool changed = false;
+ while (q != p->second.end()) {
+ if (*q == stray) {
+ if (!changed) {
+ changed = true;
+ _dec_count(p->second);
+ }
+ p->second.erase(q++);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ missing_loc.erase(p++);
+ } else {
+ if (changed) {
+ _inc_count(p->second);
+ }
+ ++p;
+ }
+ }
+ // filter missing_loc_sources
+ for (auto p = missing_loc_sources.begin(); p != missing_loc_sources.end();) {
+ if (*p != stray) {
+ ++p;
+ continue;
+ }
+ ldout(cct, 10) << __func__ << " remove osd" << stray << " from missing_loc_sources" << dendl;
+ missing_loc_sources.erase(p++);
+ }
+}
diff --git a/src/osd/MissingLoc.h b/src/osd/MissingLoc.h
new file mode 100644
index 000000000..9bce3ceda
--- /dev/null
+++ b/src/osd/MissingLoc.h
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <set>
+
+#include "OSDMap.h"
+#include "common/HBHandle.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "osd_types.h"
+
+class MissingLoc {
+ public:
+
+ class MappingInfo {
+ public:
+ virtual const std::set<pg_shard_t> &get_upset() const = 0;
+ virtual bool is_ec_pg() const = 0;
+ virtual int get_pg_size() const = 0;
+ virtual ~MappingInfo() {}
+ };
+
+ // a loc_count indicates how many locations we know in each of
+ // these distinct sets
+ struct loc_count_t {
+ int up = 0; ///< locations on shards in the up set
+ int other = 0; ///< locations on other shards
+
+ friend bool operator<(const loc_count_t& l,
+ const loc_count_t& r) {
+ return (l.up < r.up ||
+ (l.up == r.up &&
+ (l.other < r.other)));
+ }
+ friend std::ostream& operator<<(std::ostream& out, const loc_count_t& l) {
+ ceph_assert(l.up >= 0);
+ ceph_assert(l.other >= 0);
+ return out << "(" << l.up << "+" << l.other << ")";
+ }
+ };
+
+
+ using missing_by_count_t = std::map<shard_id_t, std::map<loc_count_t,int>>;
+ private:
+ loc_count_t _get_count(const std::set<pg_shard_t> &shards) {
+ loc_count_t r;
+ for (auto s : shards) {
+ if (mapping_info->get_upset().count(s)) {
+ r.up++;
+ } else {
+ r.other++;
+ }
+ }
+ return r;
+ }
+
+ std::map<hobject_t, pg_missing_item> needs_recovery_map;
+ std::map<hobject_t, std::set<pg_shard_t> > missing_loc;
+ std::set<pg_shard_t> missing_loc_sources;
+
+ // for every entry in missing_loc, we count how many of each type of shard we have,
+ // and maintain totals here. The sum of the values for this std::map will always equal
+ // missing_loc.size().
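+ //
+ // e.g. (illustrative): for a replicated pg with
+ //   missing_loc = { A -> {osd.0, osd.3}, B -> {osd.0} }
+ // where only osd.0 is in the up set, missing_by_count[NO_SHARD] ends up as
+ // { (1+0): 1, (1+1): 1 }, whose values sum to missing_loc.size() == 2.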
+ missing_by_count_t missing_by_count;
+
+ void pgs_by_shard_id(
+ const std::set<pg_shard_t>& s,
+ std::map<shard_id_t, std::set<pg_shard_t> >& pgsbs) {
+ if (mapping_info->is_ec_pg()) {
+ int num_shards = mapping_info->get_pg_size();
+ // For completely missing shards initialize with empty std::set<pg_shard_t>
+ for (int i = 0 ; i < num_shards ; ++i) {
+ shard_id_t shard(i);
+ pgsbs[shard];
+ }
+ for (auto pgs: s)
+ pgsbs[pgs.shard].insert(pgs);
+ } else {
+ pgsbs[shard_id_t::NO_SHARD] = s;
+ }
+ }
+
+ void _inc_count(const std::set<pg_shard_t>& s) {
+ std::map< shard_id_t, std::set<pg_shard_t> > pgsbs;
+ pgs_by_shard_id(s, pgsbs);
+ for (auto shard: pgsbs)
+ ++missing_by_count[shard.first][_get_count(shard.second)];
+ }
+ void _dec_count(const std::set<pg_shard_t>& s) {
+ std::map< shard_id_t, std::set<pg_shard_t> > pgsbs;
+ pgs_by_shard_id(s, pgsbs);
+ for (auto shard: pgsbs) {
+ auto p = missing_by_count[shard.first].find(_get_count(shard.second));
+ ceph_assert(p != missing_by_count[shard.first].end());
+ if (--p->second == 0) {
+ missing_by_count[shard.first].erase(p);
+ }
+ }
+ }
+
+ spg_t pgid;
+ MappingInfo *mapping_info;
+ DoutPrefixProvider *dpp;
+ CephContext *cct;
+ std::set<pg_shard_t> empty_set;
+ public:
+ boost::scoped_ptr<IsPGReadablePredicate> is_readable;
+ boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
+ explicit MissingLoc(
+ spg_t pgid,
+ MappingInfo *mapping_info,
+ DoutPrefixProvider *dpp,
+ CephContext *cct)
+ : pgid(pgid), mapping_info(mapping_info), dpp(dpp), cct(cct) { }
+ void set_backend_predicates(
+ IsPGReadablePredicate *_is_readable,
+ IsPGRecoverablePredicate *_is_recoverable) {
+ is_readable.reset(_is_readable);
+ is_recoverable.reset(_is_recoverable);
+ }
+ const IsPGRecoverablePredicate &get_recoverable_predicate() const {
+ return *is_recoverable;
+ }
+ std::ostream& gen_prefix(std::ostream& out) const {
+ return dpp->gen_prefix(out);
+ }
+ bool needs_recovery(
+ const hobject_t &hoid,
+ eversion_t *v = 0) const {
+ std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.find(hoid);
+ if (i == needs_recovery_map.end())
+ return false;
+ if (v)
+ *v = i->second.need;
+ return true;
+ }
+ bool is_deleted(const hobject_t &hoid) const {
+ auto i = needs_recovery_map.find(hoid);
+ if (i == needs_recovery_map.end())
+ return false;
+ return i->second.is_delete();
+ }
+ bool is_unfound(const hobject_t &hoid) const {
+ auto it = needs_recovery_map.find(hoid);
+ if (it == needs_recovery_map.end()) {
+ return false;
+ }
+ if (it->second.is_delete()) {
+ return false;
+ }
+ auto mit = missing_loc.find(hoid);
+ return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
+ }
+ bool readable_with_acting(
+ const hobject_t &hoid,
+ const std::set<pg_shard_t> &acting,
+ eversion_t* v = 0) const;
+ uint64_t num_unfound() const {
+ uint64_t ret = 0;
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (i->second.is_delete())
+ continue;
+ auto mi = missing_loc.find(i->first);
+ if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
+ ++ret;
+ }
+ return ret;
+ }
+
+ bool have_unfound() const {
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ needs_recovery_map.begin();
+ i != needs_recovery_map.end();
+ ++i) {
+ if (i->second.is_delete())
+ continue;
+ auto mi = missing_loc.find(i->first);
+ if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
+ return true;
+ }
+ return false;
+ }
+ void clear() {
+ needs_recovery_map.clear();
+ missing_loc.clear();
+ missing_loc_sources.clear();
+ missing_by_count.clear();
+ }
+
+ void add_location(const hobject_t &hoid, pg_shard_t location) {
+ auto p = missing_loc.find(hoid);
+ if (p == missing_loc.end()) {
+ p = missing_loc.emplace(hoid, std::set<pg_shard_t>()).first;
+ } else {
+ _dec_count(p->second);
+ }
+ p->second.insert(location);
+ _inc_count(p->second);
+ }
+ void remove_location(const hobject_t &hoid, pg_shard_t location) {
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ p->second.erase(location);
+ if (p->second.empty()) {
+ missing_loc.erase(p);
+ } else {
+ _inc_count(p->second);
+ }
+ }
+ }
+
+ void clear_location(const hobject_t &hoid) {
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ missing_loc.erase(p);
+ }
+ }
+
+ void add_active_missing(const pg_missing_t &missing) {
+ for (std::map<hobject_t, pg_missing_item>::const_iterator i =
+ missing.get_items().begin();
+ i != missing.get_items().end();
+ ++i) {
+ std::map<hobject_t, pg_missing_item>::const_iterator j =
+ needs_recovery_map.find(i->first);
+ if (j == needs_recovery_map.end()) {
+ needs_recovery_map.insert(*i);
+ } else {
+ if (i->second.need != j->second.need) {
+ lgeneric_dout(cct, 0) << this << " " << pgid << " unexpected need for "
+ << i->first << " have " << j->second
+ << " tried to add " << i->second << dendl;
+ ceph_assert(0 == "unexpected need for missing item");
+ }
+ }
+ }
+ }
+
+ void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
+ needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
+ }
+ void revise_need(const hobject_t &hoid, eversion_t need) {
+ auto it = needs_recovery_map.find(hoid);
+ ceph_assert(it != needs_recovery_map.end());
+ it->second.need = need;
+ }
+
+ /// Adds info about a possible recovery source
+ bool add_source_info(
+ pg_shard_t source, ///< [in] source
+ const pg_info_t &oinfo, ///< [in] info
+ const pg_missing_t &omissing, ///< [in] (optional) missing
+ HBHandle *handle ///< [in] ThreadPool handle
+ ); ///< @return whether a new object location was discovered
+
+ /// Adds recovery sources in batch
+ void add_batch_sources_info(
+ const std::set<pg_shard_t> &sources, ///< [in] a std::set of sources which can be used for all objects
+ HBHandle *handle ///< [in] ThreadPool handle
+ );
+
+ /// Uses osdmap to update structures for now down sources
+ void check_recovery_sources(const OSDMapRef& osdmap);
+
+ /// Remove stray from recovery sources
+ void remove_stray_recovery_sources(pg_shard_t stray);
+
+ /// Call when hoid is no longer missing in acting std::set
+ void recovered(const hobject_t &hoid) {
+ needs_recovery_map.erase(hoid);
+ auto p = missing_loc.find(hoid);
+ if (p != missing_loc.end()) {
+ _dec_count(p->second);
+ missing_loc.erase(p);
+ }
+ }
+
+ /// Call to update structures for hoid after a change
+ void rebuild(
+ const hobject_t &hoid,
+ pg_shard_t self,
+ const std::set<pg_shard_t> &to_recover,
+ const pg_info_t &info,
+ const pg_missing_t &missing,
+ const std::map<pg_shard_t, pg_missing_t> &pmissing,
+ const std::map<pg_shard_t, pg_info_t> &pinfo) {
+ recovered(hoid);
+ std::optional<pg_missing_item> item;
+ auto miter = missing.get_items().find(hoid);
+ if (miter != missing.get_items().end()) {
+ item = miter->second;
+ } else {
+ for (auto &&i: to_recover) {
+ if (i == self)
+ continue;
+ auto pmiter = pmissing.find(i);
+ ceph_assert(pmiter != pmissing.end());
+ miter = pmiter->second.get_items().find(hoid);
+ if (miter != pmiter->second.get_items().end()) {
+ item = miter->second;
+ break;
+ }
+ }
+ }
+ if (!item)
+ return; // recovered!
+
+ needs_recovery_map[hoid] = *item;
+ if (item->is_delete())
+ return;
+ auto mliter =
+ missing_loc.emplace(hoid, std::set<pg_shard_t>()).first;
+ ceph_assert(info.last_backfill.is_max());
+ ceph_assert(info.last_update >= item->need);
+ if (!missing.is_missing(hoid))
+ mliter->second.insert(self);
+ for (auto &&i: pmissing) {
+ if (i.first == self)
+ continue;
+ auto pinfoiter = pinfo.find(i.first);
+ ceph_assert(pinfoiter != pinfo.end());
+ if (item->need <= pinfoiter->second.last_update &&
+ hoid <= pinfoiter->second.last_backfill &&
+ !i.second.is_missing(hoid))
+ mliter->second.insert(i.first);
+ }
+ _inc_count(mliter->second);
+ }
+
+ const std::set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
+ auto it = missing_loc.find(hoid);
+ return it == missing_loc.end() ? empty_set : it->second;
+ }
+ const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_locs() const {
+ return missing_loc;
+ }
+ const std::map<hobject_t, pg_missing_item> &get_needs_recovery() const {
+ return needs_recovery_map;
+ }
+
+ const missing_by_count_t &get_missing_by_count() const {
+ return missing_by_count;
+ }
+};
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
new file mode 100644
index 000000000..4066a679f
--- /dev/null
+++ b/src/osd/OSD.cc
@@ -0,0 +1,11378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <signal.h>
+#include <time.h>
+#include <boost/scoped_ptr.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#include "osd/PG.h"
+#include "osd/scrub_machine.h"
+#include "osd/pg_scrubber.h"
+
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/random.h"
+
+#include "OSD.h"
+#include "OSDMap.h"
+#include "Watch.h"
+#include "osdc/Objecter.h"
+
+#include "common/errno.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_releases.h"
+#include "common/ceph_time.h"
+#include "common/version.h"
+#include "common/async/blocked_completion.h"
+#include "common/pick_address.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+
+#include "os/ObjectStore.h"
+#ifdef HAVE_LIBFUSE
+#include "os/FuseStore.h"
+#endif
+
+#include "PrimaryLogPG.h"
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "mon/MonClient.h"
+
+#include "messages/MLog.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDPGReadyToMerge.h"
+
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGNotify2.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGQuery2.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGInfo2.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDForceRecovery.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+#include "messages/MOSDPeeringOp.h"
+
+#include "messages/MOSDAlive.h"
+
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MOSDRepScrub.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "messages/MPGStats.h"
+
+#include "messages/MWatchNotify.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "common/perf_counters.h"
+#include "common/Timer.h"
+#include "common/LogClient.h"
+#include "common/AsyncReserver.h"
+#include "common/HeartbeatMap.h"
+#include "common/admin_socket.h"
+#include "common/ceph_context.h"
+
+#include "global/signal_handler.h"
+#include "global/pidfile.h"
+
+#include "include/color.h"
+#include "perfglue/cpu_profiler.h"
+#include "perfglue/heap_profiler.h"
+
+#include "osd/ClassHandler.h"
+#include "osd/OpRequest.h"
+
+#include "auth/AuthAuthorizeHandler.h"
+#include "auth/RotatingKeyRing.h"
+
+#include "objclass/objclass.h"
+
+#include "common/cmdparse.h"
+#include "include/str_list.h"
+#include "include/util.h"
+
+#include "include/ceph_assert.h"
+#include "common/config.h"
+#include "common/EventTrace.h"
+
+#include "json_spirit/json_spirit_reader.h"
+#include "json_spirit/json_spirit_writer.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/osd.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
+
+using std::deque;
+using std::list;
+using std::lock_guard;
+using std::make_pair;
+using std::make_tuple;
+using std::make_unique;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::fixed_u_to_string;
+using ceph::Formatter;
+using ceph::heartbeat_handle_d;
+using ceph::make_mutex;
+
+using namespace ceph::osd::scheduler;
+using TOPNSPC::common::cmd_getval;
+
+static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
+ return *_dout << "osd." << whoami << " " << epoch << " ";
+}
+
+// Initial features in new superblock.
+// Features here are also automatically upgraded
+CompatSet OSD::get_osd_initial_compat_set() {
+ CompatSet::FeatureSet ceph_osd_feature_compat;
+ CompatSet::FeatureSet ceph_osd_feature_ro_compat;
+ CompatSet::FeatureSet ceph_osd_feature_incompat;
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
+ return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
+ ceph_osd_feature_incompat);
+}
+
+// Features that this OSD supports are added here.
+CompatSet OSD::get_osd_compat_set() {
+ CompatSet compat = get_osd_initial_compat_set();
+ // Any features here can be set in code, but not in the initial superblock
+ compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ return compat;
+}
+
+OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
+ osd(osd),
+ cct(osd->cct),
+ whoami(osd->whoami), store(osd->store),
+ log_client(osd->log_client), clog(osd->clog),
+ pg_recovery_stats(osd->pg_recovery_stats),
+ cluster_messenger(osd->cluster_messenger),
+ client_messenger(osd->client_messenger),
+ logger(osd->logger),
+ recoverystate_perf(osd->recoverystate_perf),
+ monc(osd->monc),
+ osd_max_object_size(cct->_conf, "osd_max_object_size"),
+ osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
+ publish_lock{ceph::make_mutex("OSDService::publish_lock")},
+ pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
+ max_oldest_map(0),
+ scrubs_local(0),
+ scrubs_remote(0),
+ agent_valid_iterator(false),
+ agent_ops(0),
+ flush_mode_high_count(0),
+ agent_active(true),
+ agent_thread(this),
+ agent_stop_flag(false),
+ agent_timer(osd->client_messenger->cct, agent_timer_lock),
+ last_recalibrate(ceph_clock_now()),
+ promote_max_objects(0),
+ promote_max_bytes(0),
+ poolctx(poolctx),
+ objecter(make_unique<Objecter>(osd->client_messenger->cct,
+ osd->objecter_messenger,
+ osd->monc, poolctx)),
+ m_objecter_finishers(cct->_conf->osd_objecter_finishers),
+ watch_timer(osd->client_messenger->cct, watch_lock),
+ next_notif_id(0),
+ recovery_request_timer(cct, recovery_request_lock, false),
+ sleep_timer(cct, sleep_lock, false),
+ reserver_finisher(cct),
+ local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
+ cct->_conf->osd_min_recovery_priority),
+ remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
+ cct->_conf->osd_min_recovery_priority),
+ snap_reserver(cct, &reserver_finisher,
+ cct->_conf->osd_max_trimming_pgs),
+ recovery_ops_active(0),
+ recovery_ops_reserved(0),
+ recovery_paused(false),
+ map_cache(cct, cct->_conf->osd_map_cache_size),
+ map_bl_cache(cct->_conf->osd_map_cache_size),
+ map_bl_inc_cache(cct->_conf->osd_map_cache_size),
+ cur_state(NONE),
+ cur_ratio(0), physical_ratio(0),
+ boot_epoch(0), up_epoch(0), bind_epoch(0)
+{
+ objecter->init();
+
+ for (int i = 0; i < m_objecter_finishers; i++) {
+ ostringstream str;
+ str << "objecter-finisher-" << i;
+ auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
+ objecter_finishers.push_back(std::move(fin));
+ }
+}
+
+#ifdef PG_DEBUG_REFS
+void OSDService::add_pgid(spg_t pgid, PG *pg) {
+ std::lock_guard l(pgid_lock);
+ if (!pgid_tracker.count(pgid)) {
+ live_pgs[pgid] = pg;
+ }
+ pgid_tracker[pgid]++;
+}
+void OSDService::remove_pgid(spg_t pgid, PG *pg)
+{
+ std::lock_guard l(pgid_lock);
+ ceph_assert(pgid_tracker.count(pgid));
+ ceph_assert(pgid_tracker[pgid] > 0);
+ pgid_tracker[pgid]--;
+ if (pgid_tracker[pgid] == 0) {
+ pgid_tracker.erase(pgid);
+ live_pgs.erase(pgid);
+ }
+}
+void OSDService::dump_live_pgids()
+{
+ std::lock_guard l(pgid_lock);
+ derr << "live pgids:" << dendl;
+ for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
+ i != pgid_tracker.cend();
+ ++i) {
+ derr << "\t" << *i << dendl;
+ live_pgs[i->first]->dump_live_ids();
+ }
+}
+#endif
+
+
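+// Monotonic time elapsed since this OSD process started.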
+ceph::signedspan OSDService::get_mnow()
+{
+ return ceph::mono_clock::now() - osd->startup_time;
+}
+
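+// Walk the pool's recorded pg_num history between old_map and new_map to
+// collect the split children and merge participants that involve pgid.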
+void OSDService::identify_splits_and_merges(
+ OSDMapRef old_map,
+ OSDMapRef new_map,
+ spg_t pgid,
+ set<pair<spg_t,epoch_t>> *split_children,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ if (!old_map->have_pg_pool(pgid.pool())) {
+ return;
+ }
+ int old_pgnum = old_map->get_pg_num(pgid.pool());
+ auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
+ if (p == osd->pg_num_history.pg_nums.end()) {
+ return;
+ }
+ dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
+ << " to e" << new_map->get_epoch()
+ << " pg_nums " << p->second << dendl;
+ deque<spg_t> queue;
+ queue.push_back(pgid);
+ set<spg_t> did;
+ while (!queue.empty()) {
+ auto cur = queue.front();
+ queue.pop_front();
+ did.insert(cur);
+ unsigned pgnum = old_pgnum;
+ for (auto q = p->second.lower_bound(old_map->get_epoch());
+ q != p->second.end() &&
+ q->first <= new_map->get_epoch();
+ ++q) {
+ if (pgnum < q->second) {
+ // split?
+ if (cur.ps() < pgnum) {
+ set<spg_t> children;
+ if (cur.is_split(pgnum, q->second, &children)) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " children " << children << dendl;
+ for (auto i : children) {
+ split_children->insert(make_pair(i, q->first));
+ if (!did.count(i))
+ queue.push_back(i);
+ }
+ }
+ } else if (cur.ps() < q->second) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is a child" << dendl;
+ // normally we'd capture this from the parent, but it's
+ // possible the parent doesn't exist yet (it will be
+ // fabricated to allow an intervening merge). note this PG
+ // as a split child here to be sure we catch it.
+ split_children->insert(make_pair(cur, q->first));
+ } else {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is post-split, skipping" << dendl;
+ }
+ } else if (merge_pgs) {
+ // merge?
+ if (cur.ps() >= q->second) {
+ if (cur.ps() < pgnum) {
+ spg_t parent;
+ if (cur.is_merge_source(pgnum, q->second, &parent)) {
+ set<spg_t> children;
+ parent.is_split(q->second, pgnum, &children);
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is merge source, target " << parent
+ << ", source(s) " << children << dendl;
+ merge_pgs->insert(make_pair(parent, q->first));
+ if (!did.count(parent)) {
+ // queue (and re-scan) parent in case it might not exist yet
+ // and there are some future splits pending on it
+ queue.push_back(parent);
+ }
+ for (auto c : children) {
+ merge_pgs->insert(make_pair(c, q->first));
+ if (!did.count(c))
+ queue.push_back(c);
+ }
+ }
+ } else {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is beyond old pgnum, skipping" << dendl;
+ }
+ } else {
+ set<spg_t> children;
+ if (cur.is_split(q->second, pgnum, &children)) {
+ dout(20) << __func__ << " " << cur << " e" << q->first
+ << " pg_num " << pgnum << " -> " << q->second
+ << " is merge target, source " << children << dendl;
+ for (auto c : children) {
+ merge_pgs->insert(make_pair(c, q->first));
+ if (!did.count(c))
+ queue.push_back(c);
+ }
+ merge_pgs->insert(make_pair(cur, q->first));
+ }
+ }
+ }
+ pgnum = q->second;
+ }
+ }
+}
+
+void OSDService::need_heartbeat_peer_update()
+{
+ osd->need_heartbeat_peer_update();
+}
+
+HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
+{
+ std::lock_guard l(hb_stamp_lock);
+ if (peer >= hb_stamps.size()) {
+ hb_stamps.resize(peer + 1);
+ }
+ if (!hb_stamps[peer]) {
+ hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
+ }
+ return hb_stamps[peer];
+}
+
+void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
+{
+ osd->enqueue_peering_evt(
+ spgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ RenewLease())));
+}
+
+void OSDService::start_shutdown()
+{
+ {
+ std::lock_guard l(agent_timer_lock);
+ agent_timer.shutdown();
+ }
+
+ {
+ std::lock_guard l(sleep_lock);
+ sleep_timer.shutdown();
+ }
+
+ {
+ std::lock_guard l(recovery_request_lock);
+ recovery_request_timer.shutdown();
+ }
+}
+
+void OSDService::shutdown_reserver()
+{
+ reserver_finisher.wait_for_empty();
+ reserver_finisher.stop();
+}
+
+void OSDService::shutdown()
+{
+ mono_timer.suspend();
+
+ {
+ std::lock_guard l(watch_lock);
+ watch_timer.shutdown();
+ }
+
+ objecter->shutdown();
+ for (auto& f : objecter_finishers) {
+ f->wait_for_empty();
+ f->stop();
+ }
+
+ publish_map(OSDMapRef());
+ next_osdmap = OSDMapRef();
+}
+
+void OSDService::init()
+{
+ reserver_finisher.start();
+ for (auto& f : objecter_finishers) {
+ f->start();
+ }
+ objecter->set_client_incarnation(0);
+
+ // deprioritize objecter in daemonperf output
+ objecter->get_logger()->set_prio_adjust(-3);
+
+ watch_timer.init();
+ agent_timer.init();
+ mono_timer.resume();
+
+ agent_thread.create("osd_srv_agent");
+
+ if (cct->_conf->osd_recovery_delay_start)
+ defer_recovery(cct->_conf->osd_recovery_delay_start);
+}
+
+void OSDService::final_init()
+{
+ objecter->start(osdmap.get());
+}
+
+void OSDService::activate_map()
+{
+ // wake/unwake the tiering agent
+ std::lock_guard l{agent_lock};
+ agent_active =
+ !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
+ osd->is_active();
+ agent_cond.notify_all();
+}
+
+void OSDService::request_osdmap_update(epoch_t e)
+{
+ osd->osdmap_subscribe(e, false);
+}
+
+
+class AgentTimeoutCB : public Context {
+ PGRef pg;
+public:
+ explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
+ void finish(int) override {
+ pg->agent_choose_mode_restart();
+ }
+};
+
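+// Tiering agent worker loop: service the highest-priority tier queue, letting
+// its PGs do flush/evict work within the configured op quotas; a PG that
+// reports no work is re-evaluated after osd_agent_delay_time seconds.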
+void OSDService::agent_entry()
+{
+ dout(10) << __func__ << " start" << dendl;
+ std::unique_lock agent_locker{agent_lock};
+
+ while (!agent_stop_flag) {
+ if (agent_queue.empty()) {
+ dout(20) << __func__ << " empty queue" << dendl;
+ agent_cond.wait(agent_locker);
+ continue;
+ }
+ uint64_t level = agent_queue.rbegin()->first;
+ set<PGRef>& top = agent_queue.rbegin()->second;
+ dout(10) << __func__
+ << " tiers " << agent_queue.size()
+ << ", top is " << level
+ << " with pgs " << top.size()
+ << ", ops " << agent_ops << "/"
+ << cct->_conf->osd_agent_max_ops
+ << (agent_active ? " active" : " NOT ACTIVE")
+ << dendl;
+ dout(20) << __func__ << " oids " << agent_oids << dendl;
+ int max = cct->_conf->osd_agent_max_ops - agent_ops;
+ int agent_flush_quota = max;
+ if (!flush_mode_high_count)
+ agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
+ if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
+ agent_cond.wait(agent_locker);
+ continue;
+ }
+
+ if (!agent_valid_iterator || agent_queue_pos == top.end()) {
+ agent_queue_pos = top.begin();
+ agent_valid_iterator = true;
+ }
+ PGRef pg = *agent_queue_pos;
+ dout(10) << "high_count " << flush_mode_high_count
+ << " agent_ops " << agent_ops
+ << " flush_quota " << agent_flush_quota << dendl;
+ agent_locker.unlock();
+ if (!pg->agent_work(max, agent_flush_quota)) {
+ dout(10) << __func__ << " " << pg->pg_id
+ << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
+ << " seconds" << dendl;
+
+ logger->inc(l_osd_tier_delay);
+ // Queue a timer to call agent_choose_mode for this pg after osd_agent_delay_time seconds
+ std::lock_guard timer_locker{agent_timer_lock};
+ Context *cb = new AgentTimeoutCB(pg);
+ agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
+ }
+ agent_locker.lock();
+ }
+ dout(10) << __func__ << " finish" << dendl;
+}
+
+void OSDService::agent_stop()
+{
+ {
+ std::lock_guard l(agent_lock);
+
+ // By this time all ops should be cancelled
+ ceph_assert(agent_ops == 0);
+ // By this time all PGs are shutdown and dequeued
+ if (!agent_queue.empty()) {
+ set<PGRef>& top = agent_queue.rbegin()->second;
+ derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
+ ceph_abort_msg("agent queue not empty");
+ }
+
+ agent_stop_flag = true;
+ agent_cond.notify_all();
+ }
+ agent_thread.join();
+}
+
+// -------------------------------------
+
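+// Recalibrate promote_probability_millis from the observed promotion rate so
+// that promotions track the configured object/sec and byte/sec targets, and
+// set hard per-interval promotion limits.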
+void OSDService::promote_throttle_recalibrate()
+{
+ utime_t now = ceph_clock_now();
+ double dur = now - last_recalibrate;
+ last_recalibrate = now;
+ unsigned prob = promote_probability_millis;
+
+ uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
+ uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
+
+ unsigned min_prob = 1;
+
+ uint64_t attempts, obj, bytes;
+ promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
+ dout(10) << __func__ << " " << attempts << " attempts, promoted "
+ << obj << " objects and " << byte_u_t(bytes) << "; target "
+ << target_obj_sec << " obj/sec or "
+ << byte_u_t(target_bytes_sec) << "/sec"
+ << dendl;
+
+ // calculate what the probability *should* be, given the targets
+ unsigned new_prob;
+ if (attempts && dur > 0) {
+ uint64_t avg_size = 1;
+ if (obj)
+ avg_size = std::max<uint64_t>(bytes / obj, 1);
+ unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
+ unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
+ / (double)attempts;
+ dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
+ << avg_size << dendl;
+ if (target_obj_sec && target_bytes_sec)
+ new_prob = std::min(po, pb);
+ else if (target_obj_sec)
+ new_prob = po;
+ else if (target_bytes_sec)
+ new_prob = pb;
+ else
+ new_prob = 1000;
+ } else {
+ new_prob = 1000;
+ }
+ dout(20) << __func__ << " new_prob " << new_prob << dendl;
+
+ // correct for persistent skew between the target rate and the actual rate
+ double ratio = 1.0;
+ unsigned actual = 0;
+ if (attempts && obj) {
+ actual = obj * 1000 / attempts;
+ ratio = (double)actual / (double)prob;
+ new_prob = (double)new_prob / ratio;
+ }
+ new_prob = std::max(new_prob, min_prob);
+ new_prob = std::min(new_prob, 1000u);
+
+ // adjust
+ prob = (prob + new_prob) / 2;
+ prob = std::max(prob, min_prob);
+ prob = std::min(prob, 1000u);
+ dout(10) << __func__ << " actual " << actual
+ << ", actual/prob ratio " << ratio
+ << ", adjusted new_prob " << new_prob
+ << ", prob " << promote_probability_millis << " -> " << prob
+ << dendl;
+ promote_probability_millis = prob;
+
+ // set hard limits for this interval to mitigate stampedes
+ promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
+ promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
+}
+
+// -------------------------------------
+
+float OSDService::get_failsafe_full_ratio()
+{
+ float full_ratio = cct->_conf->osd_failsafe_full_ratio;
+ if (full_ratio > 1.0) full_ratio /= 100.0;
+ return full_ratio;
+}
+
+OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
+{
+ // The OSDMap ratios take precedence. So if the failsafe is .95 and
+ // the admin sets the cluster full to .96, the failsafe moves up to .96
+ // too. (Not that having failsafe == full is ideal, but it's better than
+ // dropping writes before the cluster appears full.)
+ OSDMapRef osdmap = get_osdmap();
+ if (!osdmap || osdmap->get_epoch() == 0) {
+ return NONE;
+ }
+ float nearfull_ratio = osdmap->get_nearfull_ratio();
+ float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
+ float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
+ float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
+
+ if (osdmap->require_osd_release < ceph_release_t::luminous) {
+ // use the failsafe for nearfull and full; the mon isn't using the
+ // flags anyway because we're mid-upgrade.
+ full_ratio = failsafe_ratio;
+ backfillfull_ratio = failsafe_ratio;
+ nearfull_ratio = failsafe_ratio;
+ } else if (full_ratio <= 0 ||
+ backfillfull_ratio <= 0 ||
+ nearfull_ratio <= 0) {
+ derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
+ // use failsafe flag. ick. the monitor did something wrong or the user
+ // did something stupid.
+ full_ratio = failsafe_ratio;
+ backfillfull_ratio = failsafe_ratio;
+ nearfull_ratio = failsafe_ratio;
+ }
+
+ if (injectfull_state > NONE && injectfull) {
+ inject = "(Injected)";
+ return injectfull_state;
+ } else if (pratio > failsafe_ratio) {
+ return FAILSAFE;
+ } else if (ratio > full_ratio) {
+ return FULL;
+ } else if (ratio > backfillfull_ratio) {
+ return BACKFILLFULL;
+ } else if (pratio > nearfull_ratio) {
+ return NEARFULL;
+ }
+ return NONE;
+}
+
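+// Record the latest utilization ratios, recompute the fullness state, and log
+// transitions into or out of the failsafe state.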
+void OSDService::check_full_status(float ratio, float pratio)
+{
+ std::lock_guard l(full_status_lock);
+
+ cur_ratio = ratio;
+ physical_ratio = pratio;
+
+ string inject;
+ s_names new_state;
+ new_state = recalc_full_state(ratio, pratio, inject);
+
+ dout(20) << __func__ << " cur ratio " << ratio
+ << ", physical ratio " << pratio
+ << ", new state " << get_full_state_name(new_state)
+ << " " << inject
+ << dendl;
+
+ // warn
+ if (cur_state != new_state) {
+ dout(10) << __func__ << " " << get_full_state_name(cur_state)
+ << " -> " << get_full_state_name(new_state) << dendl;
+ if (new_state == FAILSAFE) {
+ clog->error() << "full status failsafe engaged, dropping updates, now "
+ << (int)roundf(ratio * 100) << "% full";
+ } else if (cur_state == FAILSAFE) {
+ clog->error() << "full status failsafe disengaged, no longer dropping "
+ << "updates, now " << (int)roundf(ratio * 100) << "% full";
+ }
+ cur_state = new_state;
+ }
+}
+
+bool OSDService::need_fullness_update()
+{
+ OSDMapRef osdmap = get_osdmap();
+ s_names cur = NONE;
+ if (osdmap->exists(whoami)) {
+ if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
+ cur = FULL;
+ } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
+ cur = BACKFILLFULL;
+ } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
+ cur = NEARFULL;
+ }
+ }
+ s_names want = NONE;
+ if (is_full())
+ want = FULL;
+ else if (is_backfillfull())
+ want = BACKFILLFULL;
+ else if (is_nearfull())
+ want = NEARFULL;
+ return want != cur;
+}
+
+bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
+{
+ if (injectfull && injectfull_state >= type) {
+ // injectfull is either a count of how many more times to report the injected
+ // full state, or -1 to always report it
+ if (injectfull > 0)
+ --injectfull;
+ ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
+ << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
+{
+ std::lock_guard l(full_status_lock);
+
+ if (_check_inject_full(dpp, type))
+ return true;
+
+ if (cur_state >= type)
+ ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
+ << " physical " << physical_ratio << dendl;
+
+ return cur_state >= type;
+}
+
+bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
+{
+ ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
+ {
+ std::lock_guard l(full_status_lock);
+ if (_check_inject_full(dpp, type)) {
+ return true;
+ }
+ }
+
+ float pratio;
+ float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
+
+ string notused;
+ s_names tentative_state = recalc_full_state(ratio, pratio, notused);
+
+ if (tentative_state >= type)
+ ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
+
+ return tentative_state >= type;
+}
+
+bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, FAILSAFE);
+}
+
+bool OSDService::check_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, FULL);
+}
+
+bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
+{
+ return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
+}
+
+bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, BACKFILLFULL);
+}
+
+bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
+{
+ return _check_full(dpp, NEARFULL);
+}
+
+bool OSDService::is_failsafe_full() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state == FAILSAFE;
+}
+
+bool OSDService::is_full() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= FULL;
+}
+
+bool OSDService::is_backfillfull() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= BACKFILLFULL;
+}
+
+bool OSDService::is_nearfull() const
+{
+ std::lock_guard l(full_status_lock);
+ return cur_state >= NEARFULL;
+}
+
+void OSDService::set_injectfull(s_names type, int64_t count)
+{
+ std::lock_guard l(full_status_lock);
+ injectfull_state = type;
+ injectfull = count;
+}
+
+void OSDService::set_statfs(const struct store_statfs_t &stbuf,
+ osd_alert_list_t& alerts)
+{
+ uint64_t bytes = stbuf.total;
+ uint64_t avail = stbuf.available;
+ uint64_t used = stbuf.get_used_raw();
+
+ // For testing, fake the statfs values so it doesn't matter whether all
+ // OSDs are using the same partition.
+ if (cct->_conf->fake_statfs_for_testing) {
+ uint64_t total_num_bytes = 0;
+ vector<PGRef> pgs;
+ osd->_get_pgs(&pgs);
+ for (auto p : pgs) {
+ total_num_bytes += p->get_stats_num_bytes();
+ }
+ bytes = cct->_conf->fake_statfs_for_testing;
+ if (total_num_bytes < bytes)
+ avail = bytes - total_num_bytes;
+ else
+ avail = 0;
+ dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
+ << " adjust available " << avail
+ << dendl;
+ used = bytes - avail;
+ }
+
+ logger->set(l_osd_stat_bytes, bytes);
+ logger->set(l_osd_stat_bytes_used, used);
+ logger->set(l_osd_stat_bytes_avail, avail);
+
+ std::lock_guard l(stat_lock);
+ osd_stat.statfs = stbuf;
+ osd_stat.os_alerts.clear();
+ osd_stat.os_alerts[whoami].swap(alerts);
+ if (cct->_conf->fake_statfs_for_testing) {
+ osd_stat.statfs.total = bytes;
+ osd_stat.statfs.available = avail;
+ // For testing we don't want 'used' to go negative, so clear the reserved space
+ osd_stat.statfs.internally_reserved = 0;
+ }
+}
+
+osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
+ int num_pgs)
+{
+ utime_t now = ceph_clock_now();
+ auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
+ std::lock_guard l(stat_lock);
+ osd_stat.hb_peers.swap(hb_peers);
+ osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
+ osd_stat.num_pgs = num_pgs;
+ // Clean out heartbeat entries that have gone stale.
+ // This is called often enough that removing one at a time is sufficient
+ for (auto i: osd_stat.hb_pingtime) {
+ if (i.second.last_update == 0)
+ continue;
+ if (stale_time && now.sec() - i.second.last_update > stale_time) {
+ dout(20) << __func__ << " time out heartbeat for osd " << i.first
+ << " last_update " << i.second.last_update << dendl;
+ osd_stat.hb_pingtime.erase(i.first);
+ break;
+ }
+ }
+ return osd_stat;
+}
+
+void OSDService::inc_osd_stat_repaired()
+{
+ std::lock_guard l(stat_lock);
+ osd_stat.num_shards_repaired++;
+ return;
+}
+
+float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
+ uint64_t adjust_used)
+{
+ *pratio =
+ ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
+
+ if (adjust_used) {
+ dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
+ if (new_stat.statfs.available > adjust_used)
+ new_stat.statfs.available -= adjust_used;
+ else
+ new_stat.statfs.available = 0;
+ dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
+ }
+
+ // Check all pgs and adjust kb_used to include all pending backfill data
+ int backfill_adjusted = 0;
+ vector<PGRef> pgs;
+ osd->_get_pgs(&pgs);
+ for (auto p : pgs) {
+ backfill_adjusted += p->pg_stat_adjust(&new_stat);
+ }
+ if (backfill_adjusted) {
+ dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
+ }
+ return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
+}
+
+void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ m->put();
+ release_map(next_map);
+ return;
+ }
+ ConnectionRef peer_con;
+ if (peer == whoami) {
+ peer_con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ peer_con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(peer), false, true);
+ }
+ maybe_share_map(peer_con.get(), next_map);
+ peer_con->send_message(m);
+ release_map(next_map);
+}
+
+void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ for (auto& iter : messages) {
+ if (next_map->is_down(iter.first) ||
+ next_map->get_info(iter.first).up_from > from_epoch) {
+ iter.second->put();
+ continue;
+ }
+ ConnectionRef peer_con;
+ if (iter.first == whoami) {
+ peer_con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ peer_con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(iter.first), false, true);
+ }
+ maybe_share_map(peer_con.get(), next_map);
+ peer_con->send_message(iter.second);
+ }
+ release_map(next_map);
+}
+ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ release_map(next_map);
+ return NULL;
+ }
+ ConnectionRef con;
+ if (peer == whoami) {
+ con = osd->cluster_messenger->get_loopback_connection();
+ } else {
+ con = osd->cluster_messenger->connect_to_osd(
+ next_map->get_cluster_addrs(peer), false, true);
+ }
+ release_map(next_map);
+ return con;
+}
+
+pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
+{
+ OSDMapRef next_map = get_nextmap_reserved();
+ // service map is always newer/newest
+ ceph_assert(from_epoch <= next_map->get_epoch());
+
+ pair<ConnectionRef,ConnectionRef> ret;
+ if (next_map->is_down(peer) ||
+ next_map->get_info(peer).up_from > from_epoch) {
+ release_map(next_map);
+ return ret;
+ }
+ ret.first = osd->hb_back_client_messenger->connect_to_osd(
+ next_map->get_hb_back_addrs(peer));
+ ret.second = osd->hb_front_client_messenger->connect_to_osd(
+ next_map->get_hb_front_addrs(peer));
+ release_map(next_map);
+ return ret;
+}
+
+entity_name_t OSDService::get_cluster_msgr_name() const
+{
+ return cluster_messenger->get_myname();
+}
+
+void OSDService::queue_want_pg_temp(pg_t pgid,
+ const vector<int>& want,
+ bool forced)
+{
+ std::lock_guard l(pg_temp_lock);
+ auto p = pg_temp_pending.find(pgid);
+ if (p == pg_temp_pending.end() ||
+ p->second.acting != want ||
+ forced) {
+ pg_temp_wanted[pgid] = {want, forced};
+ }
+}
+
+void OSDService::remove_want_pg_temp(pg_t pgid)
+{
+ std::lock_guard l(pg_temp_lock);
+ pg_temp_wanted.erase(pgid);
+ pg_temp_pending.erase(pgid);
+}
+
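+// Move the wanted pg_temp entries onto the pending list once they have been
+// sent to the monitor.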
+void OSDService::_sent_pg_temp()
+{
+#ifdef HAVE_STDLIB_MAP_SPLICING
+ pg_temp_pending.merge(pg_temp_wanted);
+#else
+ pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
+ make_move_iterator(end(pg_temp_wanted)));
+#endif
+ pg_temp_wanted.clear();
+}
+
+void OSDService::requeue_pg_temp()
+{
+ std::lock_guard l(pg_temp_lock);
+ // wanted overrides pending. note that remove_want_pg_temp
+ // clears the item out of both.
+ unsigned old_wanted = pg_temp_wanted.size();
+ unsigned old_pending = pg_temp_pending.size();
+ _sent_pg_temp();
+ pg_temp_wanted.swap(pg_temp_pending);
+ dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
+ << pg_temp_wanted.size() << dendl;
+}
+
+std::ostream& operator<<(std::ostream& out,
+ const OSDService::pg_temp_t& pg_temp)
+{
+ out << pg_temp.acting;
+ if (pg_temp.forced) {
+ out << " (forced)";
+ }
+ return out;
+}
+
+void OSDService::send_pg_temp()
+{
+ std::lock_guard l(pg_temp_lock);
+ if (pg_temp_wanted.empty())
+ return;
+ dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
+ MOSDPGTemp *ms[2] = {nullptr, nullptr};
+ for (auto& [pgid, pg_temp] : pg_temp_wanted) {
+ auto& m = ms[pg_temp.forced];
+ if (!m) {
+ m = new MOSDPGTemp(osdmap->get_epoch());
+ m->forced = pg_temp.forced;
+ }
+ m->pg_temp.emplace(pgid, pg_temp.acting);
+ }
+ for (auto m : ms) {
+ if (m) {
+ monc->send_mon_message(m);
+ }
+ }
+ _sent_pg_temp();
+}
+
+void OSDService::send_pg_created(pg_t pgid)
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ if (o->require_osd_release >= ceph_release_t::luminous) {
+ pg_created.insert(pgid);
+ monc->send_mon_message(new MOSDPGCreated(pgid));
+ }
+}
+
+void OSDService::send_pg_created()
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ if (o->require_osd_release >= ceph_release_t::luminous) {
+ for (auto pgid : pg_created) {
+ monc->send_mon_message(new MOSDPGCreated(pgid));
+ }
+ }
+}
+
+void OSDService::prune_pg_created()
+{
+ std::lock_guard l(pg_created_lock);
+ dout(20) << __func__ << dendl;
+ auto o = get_osdmap();
+ auto i = pg_created.begin();
+ while (i != pg_created.end()) {
+ auto p = o->get_pg_pool(i->pool());
+ if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
+ dout(20) << __func__ << " pruning " << *i << dendl;
+ i = pg_created.erase(i);
+ } else {
+ dout(20) << __func__ << " keeping " << *i << dendl;
+ ++i;
+ }
+ }
+}
+
+
+// --------------------------------------
+// dispatch
+
+bool OSDService::can_inc_scrubs()
+{
+ bool can_inc = false;
+ std::lock_guard l(sched_scrub_lock);
+
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
+ << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
+ can_inc = true;
+ } else {
+ dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
+ << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+
+ return can_inc;
+}
+
+bool OSDService::inc_scrubs_local()
+{
+ bool result = false;
+ std::lock_guard l{sched_scrub_lock};
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+ result = true;
+ ++scrubs_local;
+ } else {
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+ return result;
+}
+
+void OSDService::dec_scrubs_local()
+{
+ std::lock_guard l{sched_scrub_lock};
+ dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
+ --scrubs_local;
+ ceph_assert(scrubs_local >= 0);
+}
+
+bool OSDService::inc_scrubs_remote()
+{
+ bool result = false;
+ std::lock_guard l{sched_scrub_lock};
+ if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ result = true;
+ ++scrubs_remote;
+ } else {
+ dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
+ }
+ return result;
+}
+
+void OSDService::dec_scrubs_remote()
+{
+ std::lock_guard l{sched_scrub_lock};
+ dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
+ << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
+ --scrubs_remote;
+ ceph_assert(scrubs_remote >= 0);
+}
+
+void OSDService::dump_scrub_reservations(Formatter *f)
+{
+ std::lock_guard l{sched_scrub_lock};
+ f->dump_int("scrubs_local", scrubs_local);
+ f->dump_int("scrubs_remote", scrubs_remote);
+ f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
+}
+
+void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
+ epoch_t *_bind_epoch) const
+{
+ std::lock_guard l(epoch_lock);
+ if (_boot_epoch)
+ *_boot_epoch = boot_epoch;
+ if (_up_epoch)
+ *_up_epoch = up_epoch;
+ if (_bind_epoch)
+ *_bind_epoch = bind_epoch;
+}
+
+void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
+ const epoch_t *_bind_epoch)
+{
+ std::lock_guard l(epoch_lock);
+ if (_boot_epoch) {
+ ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
+ boot_epoch = *_boot_epoch;
+ }
+ if (_up_epoch) {
+ ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
+ up_epoch = *_up_epoch;
+ }
+ if (_bind_epoch) {
+ ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
+ bind_epoch = *_bind_epoch;
+ }
+}
+
+bool OSDService::prepare_to_stop()
+{
+ std::unique_lock l(is_stopping_lock);
+ if (get_state() != NOT_STOPPING)
+ return false;
+
+ OSDMapRef osdmap = get_osdmap();
+ if (osdmap && osdmap->is_up(whoami)) {
+ dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
+ set_state(PREPARING_TO_STOP);
+ monc->send_mon_message(
+ new MOSDMarkMeDown(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_addrs(whoami),
+ osdmap->get_epoch(),
+ true, // request ack
+ true // mark as down and dead
+ ));
+ const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
+ is_stopping_cond.wait_for(l, timeout,
+ [this] { return get_state() == STOPPING; });
+ }
+
+ dout(0) << __func__ << " starting shutdown" << dendl;
+ set_state(STOPPING);
+ return true;
+}
+
+void OSDService::got_stop_ack()
+{
+ std::scoped_lock l(is_stopping_lock);
+ if (get_state() == PREPARING_TO_STOP) {
+ dout(0) << __func__ << " starting shutdown" << dendl;
+ set_state(STOPPING);
+ is_stopping_cond.notify_all();
+ } else {
+ dout(10) << __func__ << " ignoring msg" << dendl;
+ }
+}
+
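+// Build an MOSDMap with incremental maps from since+1 through to, falling back
+// to full maps where incrementals are missing, bounded by the configured
+// osd_map_message_max and osd_map_message_max_bytes limits.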
+MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
+ OSDSuperblock& sblock)
+{
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = max_oldest_map;
+ m->newest_map = sblock.newest_map;
+
+ int max = cct->_conf->osd_map_message_max;
+ ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
+
+ if (since < m->oldest_map) {
+ // we don't have the next map the target wants, so start with a
+ // full map.
+ bufferlist bl;
+ dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
+ << since << ", starting with full map" << dendl;
+ since = m->oldest_map;
+ if (!get_map_bl(since, bl)) {
+ derr << __func__ << " missing full map " << since << dendl;
+ goto panic;
+ }
+ max--;
+ max_bytes -= bl.length();
+ m->maps[since] = std::move(bl);
+ }
+ for (epoch_t e = since + 1; e <= to; ++e) {
+ bufferlist bl;
+ if (get_inc_map_bl(e, bl)) {
+ m->incremental_maps[e] = std::move(bl);
+ } else {
+ dout(10) << __func__ << " missing incremental map " << e << dendl;
+ if (!get_map_bl(e, bl)) {
+ derr << __func__ << " also missing full map " << e << dendl;
+ goto panic;
+ }
+ m->maps[e] = std::move(bl);
+ }
+ max--;
+ max_bytes -= bl.length();
+ if (max <= 0 || max_bytes <= 0) {
+ break;
+ }
+ }
+ return m;
+
+ panic:
+ if (!m->maps.empty() ||
+ !m->incremental_maps.empty()) {
+ // send what we have so far
+ return m;
+ }
+ // send something
+ bufferlist bl;
+ if (get_inc_map_bl(m->newest_map, bl)) {
+ m->incremental_maps[m->newest_map] = std::move(bl);
+ } else {
+ derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
+ if (!get_map_bl(m->newest_map, bl)) {
+ derr << __func__ << " unable to load latest full map " << m->newest_map
+ << dendl;
+ ceph_abort();
+ }
+ m->maps[m->newest_map] = std::move(bl);
+ }
+ return m;
+}
+
+void OSDService::send_map(MOSDMap *m, Connection *con)
+{
+ con->send_message(m);
+}
+
+void OSDService::send_incremental_map(epoch_t since, Connection *con,
+ const OSDMapRef& osdmap)
+{
+ epoch_t to = osdmap->get_epoch();
+ dout(10) << "send_incremental_map " << since << " -> " << to
+ << " to " << con << " " << con->get_peer_addr() << dendl;
+
+ MOSDMap *m = NULL;
+ while (!m) {
+ OSDSuperblock sblock(get_superblock());
+ if (since < sblock.oldest_map) {
+ // just send latest full map
+ MOSDMap *m = new MOSDMap(monc->get_fsid(),
+ osdmap->get_encoding_features());
+ m->oldest_map = max_oldest_map;
+ m->newest_map = sblock.newest_map;
+ get_map_bl(to, m->maps[to]);
+ send_map(m, con);
+ return;
+ }
+
+ if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
+ dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
+ << ", only sending most recent" << dendl;
+ since = to - cct->_conf->osd_map_share_max_epochs;
+ }
+
+ m = build_incremental_map_msg(since, to, sblock);
+ }
+ send_map(m, con);
+}
+
+bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
+{
+ bool found = map_bl_cache.lookup(e, &bl);
+ if (found) {
+ logger->inc(l_osd_map_bl_cache_hit);
+ return true;
+ }
+ logger->inc(l_osd_map_bl_cache_miss);
+ found = store->read(meta_ch,
+ OSD::get_osdmap_pobject_name(e), 0, 0, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
+ if (found) {
+ _add_map_bl(e, bl);
+ }
+ return found;
+}
+
+bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
+{
+ std::lock_guard l(map_cache_lock);
+ bool found = map_bl_inc_cache.lookup(e, &bl);
+ if (found) {
+ logger->inc(l_osd_map_bl_cache_hit);
+ return true;
+ }
+ logger->inc(l_osd_map_bl_cache_miss);
+ found = store->read(meta_ch,
+ OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
+ if (found) {
+ _add_map_inc_bl(e, bl);
+ }
+ return found;
+}
+
+void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
+{
+ dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
+ // cache a contiguous buffer
+ if (bl.get_num_buffers() > 1) {
+ bl.rebuild();
+ }
+ bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
+ map_bl_cache.add(e, bl);
+}
+
+void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
+{
+ dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
+ // cache a contiguous buffer
+ if (bl.get_num_buffers() > 1) {
+ bl.rebuild();
+ }
+ bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
+ map_bl_inc_cache.add(e, bl);
+}
+
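+// Insert a decoded OSDMap into the map cache, deduplicating shared structures
+// against a cached map at a nearby epoch when osd_map_dedup is enabled.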
+OSDMapRef OSDService::_add_map(OSDMap *o)
+{
+ epoch_t e = o->get_epoch();
+
+ if (cct->_conf->osd_map_dedup) {
+ // Dedup against an existing map at a nearby epoch
+ OSDMapRef for_dedup = map_cache.lower_bound(e);
+ if (for_dedup) {
+ OSDMap::dedup(for_dedup.get(), o);
+ }
+ }
+ bool existed;
+ OSDMapRef l = map_cache.add(e, o, &existed);
+ if (existed) {
+ delete o;
+ }
+ return l;
+}
+
+OSDMapRef OSDService::try_get_map(epoch_t epoch)
+{
+ std::lock_guard l(map_cache_lock);
+ OSDMapRef retval = map_cache.lookup(epoch);
+ if (retval) {
+ dout(30) << "get_map " << epoch << " -cached" << dendl;
+ logger->inc(l_osd_map_cache_hit);
+ return retval;
+ }
+ {
+ logger->inc(l_osd_map_cache_miss);
+ epoch_t lb = map_cache.cached_key_lower_bound();
+ if (epoch < lb) {
+ dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
+ logger->inc(l_osd_map_cache_miss_low);
+ logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
+ }
+ }
+
+ OSDMap *map = new OSDMap;
+ if (epoch > 0) {
+ dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
+ bufferlist bl;
+ if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
+ derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
+ delete map;
+ return OSDMapRef();
+ }
+ map->decode(bl);
+ } else {
+ dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
+ }
+ return _add_map(map);
+}
+
+// ops
+
+
+void OSDService::reply_op_error(OpRequestRef op, int err)
+{
+ reply_op_error(op, err, eversion_t(), 0, {});
+}
+
+void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
+ version_t uv,
+ vector<pg_log_op_return_item_t> op_returns)
+{
+ auto m = op->get_req<MOSDOp>();
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ int flags;
+ flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
+
+ MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
+ !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
+ reply->set_reply_versions(v, uv);
+ reply->set_op_returns(op_returns);
+ m->get_connection()->send_message(reply);
+}
+
+void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
+{
+ if (!cct->_conf->osd_debug_misdirected_ops) {
+ return;
+ }
+
+ auto m = op->get_req<MOSDOp>();
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+ ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
+
+ if (pg->is_ec_pg()) {
+ /**
+ * OSD recomputes op target based on current OSDMap. With an EC pg, we
+ * can get this result:
+ * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
+ * [CRUSH_ITEM_NONE, 2, 3]/3
+ * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
+ * [3, 2, 3]/3
+ * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
+ * -- misdirected op
+ * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
+ * it and fulfils it
+ *
+ * We can't compute the op target based on the sending map epoch due to
+ * splitting. The simplest thing is to detect such cases here and drop
+ * them without an error (the client will resend anyway).
+ */
+ ceph_assert(m->get_map_epoch() <= superblock.newest_map);
+ OSDMapRef opmap = try_get_map(m->get_map_epoch());
+ if (!opmap) {
+ dout(7) << __func__ << ": " << *pg << " no longer have map for "
+ << m->get_map_epoch() << ", dropping" << dendl;
+ return;
+ }
+ pg_t _pgid = m->get_raw_pg();
+ spg_t pgid;
+ if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
+ _pgid = opmap->raw_pg_to_pg(_pgid);
+ if (opmap->get_primary_shard(_pgid, &pgid) &&
+ pgid.shard != pg->pg_id.shard) {
+ dout(7) << __func__ << ": " << *pg << " primary changed since "
+ << m->get_map_epoch() << ", dropping" << dendl;
+ return;
+ }
+ }
+
+ dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
+ clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
+ << " pg " << m->get_raw_pg()
+ << " to osd." << whoami
+ << " not " << pg->get_acting()
+ << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
+}
+
+void OSDService::enqueue_back(OpSchedulerItem&& qi)
+{
+ osd->op_shardedwq.queue(std::move(qi));
+}
+
+void OSDService::enqueue_front(OpSchedulerItem&& qi)
+{
+ osd->op_shardedwq.queue_front(std::move(qi));
+}
+
+void OSDService::queue_recovery_context(
+ PG *pg,
+ GenContext<ThreadPool::TPHandle&> *c)
+{
+ epoch_t e = get_osdmap_epoch();
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGRecoveryContext(pg->get_pgid(), c, e)),
+ cct->_conf->osd_recovery_cost,
+ cct->_conf->osd_recovery_priority,
+ ceph_clock_now(),
+ 0,
+ e));
+}
+
+void OSDService::queue_for_snap_trim(PG *pg)
+{
+ dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
+ cct->_conf->osd_snap_trim_cost,
+ cct->_conf->osd_snap_trim_priority,
+ ceph_clock_now(),
+ 0,
+ pg->get_osdmap_epoch()));
+}
+
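+// Queue a scrub state-machine event of type MSG_TYPE for the given PG at the
+// requeue priority derived from with_priority.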
+template <class MSG_TYPE>
+void OSDService::queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ const auto epoch = pg->get_osdmap_epoch();
+ auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
+ dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
+ << ". Epoch: " << epoch << " token: " << act_token << dendl;
+
+ enqueue_back(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
+ pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
+}
+
+template <class MSG_TYPE>
+void OSDService::queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority)
+{
+ const auto epoch = pg->get_osdmap_epoch();
+ auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
+ dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
+
+ enqueue_back(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
+ pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
+}
+
+void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrub>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
+}
+
+void OSDService::queue_for_rep_scrub(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
+}
+
+void OSDService::queue_for_rep_scrub_resched(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token)
+{
+ // Resulting scrub event: 'SchedReplica'
+ queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
+ act_token);
+}
+
+void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'RemotesReserved'
+ queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
+}
+
+void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ReservationFailure'
+ queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
+}
+
+void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'InternalSchedScrub'
+ queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ActivePushesUpd'
+ queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'SelectedChunkFree'
+ queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ChunkIsBusy'
+ queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'Unblocked'
+ queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'DigestUpdate'
+ queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'IntLocalMapDone'
+ queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'GotReplicas'
+ queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'MapsCompared'
+ queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'ReplicaPushesUpd'
+ queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
+}
+
+void OSDService::queue_scrub_is_finished(PG *pg)
+{
+ // Resulting scrub event: 'ScrubFinished'
+ queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
+}
+
+void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
+{
+ // Resulting scrub event: 'NextChunk'
+ queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
+}
+
+void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
+{
+ dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
+ enqueue_back(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(
+ new PGDelete(pgid, e)),
+ cct->_conf->osd_pg_delete_cost,
+ cct->_conf->osd_pg_delete_priority,
+ ceph_clock_now(),
+ 0,
+ e));
+}
+
+bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
+{
+ return osd->try_finish_pg_delete(pg, old_pg_num);
+}
+
+// ---
+
+void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << pg->pg_id << dendl;
+ ready_to_merge_source[pg->pg_id.pgid] = version;
+ assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_ready_to_merge_target(PG *pg,
+ eversion_t version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << pg->pg_id << dendl;
+ ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
+ make_tuple(version,
+ last_epoch_started,
+ last_epoch_clean)));
+ assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_not_ready_to_merge_source(pg_t source)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << source << dendl;
+ not_ready_to_merge_source.insert(source);
+ assert(ready_to_merge_source.count(source) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
+{
+ std::lock_guard l(merge_lock);
+ dout(10) << __func__ << " " << target << " source " << source << dendl;
+ not_ready_to_merge_target[target] = source;
+ assert(ready_to_merge_target.count(target) == 0);
+ _send_ready_to_merge();
+}
+
+void OSDService::send_ready_to_merge()
+{
+ std::lock_guard l(merge_lock);
+ _send_ready_to_merge();
+}
+
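+// Report merge readiness to the monitor: not-ready sources/targets are
+// reported immediately, ready sources once their target is also ready;
+// sent_ready_to_merge_source prevents duplicate reports.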
+void OSDService::_send_ready_to_merge()
+{
+  dout(20) << __func__
+           << " ready_to_merge_source " << ready_to_merge_source
+           << " not_ready_to_merge_source " << not_ready_to_merge_source
+           << " ready_to_merge_target " << ready_to_merge_target
+           << " not_ready_to_merge_target " << not_ready_to_merge_target
+           << " sent_ready_to_merge_source " << sent_ready_to_merge_source
+           << dendl;
+  for (auto src : not_ready_to_merge_source) {
+    if (sent_ready_to_merge_source.count(src) == 0) {
+      monc->send_mon_message(new MOSDPGReadyToMerge(
+          src,
+          {}, {}, 0, 0,
+          false,
+          osdmap->get_epoch()));
+      sent_ready_to_merge_source.insert(src);
+    }
+  }
+  for (auto p : not_ready_to_merge_target) {
+    if (sent_ready_to_merge_source.count(p.second) == 0) {
+      monc->send_mon_message(new MOSDPGReadyToMerge(
+          p.second,
+          {}, {}, 0, 0,
+          false,
+          osdmap->get_epoch()));
+      sent_ready_to_merge_source.insert(p.second);
+    }
+  }
+  for (auto src : ready_to_merge_source) {
+    if (not_ready_to_merge_source.count(src.first) ||
+        not_ready_to_merge_target.count(src.first.get_parent())) {
+      continue;
+    }
+    auto p = ready_to_merge_target.find(src.first.get_parent());
+    if (p != ready_to_merge_target.end() &&
+        sent_ready_to_merge_source.count(src.first) == 0) {
+      monc->send_mon_message(new MOSDPGReadyToMerge(
+          src.first,               // source pgid
+          src.second,              // src version
+          std::get<0>(p->second),  // target version
+          std::get<1>(p->second),  // PG's last_epoch_started
+          std::get<2>(p->second),  // PG's last_epoch_clean
+          true,
+          osdmap->get_epoch()));
+      sent_ready_to_merge_source.insert(src.first);
+    }
+  }
+}
+
+void OSDService::clear_ready_to_merge(PG *pg)
+{
+  std::lock_guard l(merge_lock);
+  dout(10) << __func__ << " " << pg->pg_id << dendl;
+  ready_to_merge_source.erase(pg->pg_id.pgid);
+  ready_to_merge_target.erase(pg->pg_id.pgid);
+  not_ready_to_merge_source.erase(pg->pg_id.pgid);
+  not_ready_to_merge_target.erase(pg->pg_id.pgid);
+  sent_ready_to_merge_source.erase(pg->pg_id.pgid);
+}
+
+void OSDService::clear_sent_ready_to_merge()
+{
+  std::lock_guard l(merge_lock);
+  sent_ready_to_merge_source.clear();
+}
+
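+// Drop sent-ready markers for source pgs that no longer exist in the given
+// osdmap.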
+void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
+{
+  std::lock_guard l(merge_lock);
+  auto i = sent_ready_to_merge_source.begin();
+  while (i != sent_ready_to_merge_source.end()) {
+    if (!osdmap->pg_exists(*i)) {
+      dout(10) << __func__ << " " << *i << dendl;
+      i = sent_ready_to_merge_source.erase(i);
+    } else {
+      ++i;
+    }
+  }
+}
+
+// ---
+
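+// Expects recovery_lock to be held by the caller (asserted below). Enqueues a
+// PGRecovery work item for the given pg/epoch carrying the number of reserved
+// pushes, costed and prioritized per osd_recovery_cost / osd_recovery_priority.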
+void OSDService::_queue_for_recovery(
+  std::pair<epoch_t, PGRef> p,
+  uint64_t reserved_pushes)
+{
+  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
+  enqueue_back(
+    OpSchedulerItem(
+      unique_ptr<OpSchedulerItem::OpQueueable>(
+        new PGRecovery(
+          p.second->get_pgid(), p.first, reserved_pushes)),
+      cct->_conf->osd_recovery_cost,
+      cct->_conf->osd_recovery_priority,
+      ceph_clock_now(),
+      0,
+      p.first));
+}
+
+// ====================================================================
+// OSD
+
+#undef dout_prefix
+#define dout_prefix *_dout
+
+// Commands shared between OSD's console and admin console:
+namespace ceph::osd_cmds {
+
+int heap(CephContext& cct,
+         const cmdmap_t& cmdmap,
+         std::ostream& outos,
+         std::ostream& erros);
+
+} // namespace ceph::osd_cmds
+
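+// Format a fresh object store for this OSD: run the store's mkfs and mount
+// it, then either validate an existing OSDSuperblock (whoami/fsid must match)
+// or create one in the meta collection, and finally record the plain-text
+// metadata via write_meta(). The store object is deleted before returning.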
+int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
+{
+ int ret;
+
+ OSDSuperblock sb;
+ bufferlist sbbl;
+ ObjectStore::CollectionHandle ch;
+
+ // if we are fed a uuid for this osd, use it.
+ store->set_fsid(cct->_conf->osd_uuid);
+
+ ret = store->mkfs();
+ if (ret) {
+ derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
+ << cpp_strerror(ret) << dendl;
+ goto free_store;
+ }
+
+ store->set_cache_shards(1); // doesn't matter for mkfs!
+
+ ret = store->mount();
+ if (ret) {
+ derr << "OSD::mkfs: couldn't mount ObjectStore: error "
+ << cpp_strerror(ret) << dendl;
+ goto free_store;
+ }
+
+ ch = store->open_collection(coll_t::meta());
+ if (ch) {
+ ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
+ if (ret < 0) {
+ derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
+ goto free_store;
+ }
+    /* a superblock already exists; verify its contents match what we were given */
+ dout(0) << " have superblock" << dendl;
+ auto p = sbbl.cbegin();
+ decode(sb, p);
+ if (whoami != sb.whoami) {
+ derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
+ << dendl;
+ ret = -EINVAL;
+ goto umount_store;
+ }
+ if (fsid != sb.cluster_fsid) {
+ derr << "provided cluster fsid " << fsid
+ << " != superblock's " << sb.cluster_fsid << dendl;
+ ret = -EINVAL;
+ goto umount_store;
+ }
+ } else {
+ // create superblock
+ sb.cluster_fsid = fsid;
+ sb.osd_fsid = store->get_fsid();
+ sb.whoami = whoami;
+ sb.compat_features = get_osd_initial_compat_set();
+
+ bufferlist bl;
+ encode(sb, bl);
+
+ ObjectStore::CollectionHandle ch = store->create_new_collection(
+ coll_t::meta());
+ ObjectStore::Transaction t;
+ t.create_collection(coll_t::meta(), 0);
+ t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+ ret = store->queue_transaction(ch, std::move(t));
+ if (ret) {
+ derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
+ << "queue_transaction returned " << cpp_strerror(ret) << dendl;
+ goto umount_store;
+ }
+ ch->flush();
+ }
+
+ ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
+ if (ret) {
+ derr << "OSD::mkfs: failed to write fsid file: error "
+ << cpp_strerror(ret) << dendl;
+ goto umount_store;
+ }
+
+umount_store:
+ if (ch) {
+ ch.reset();
+ }
+ store->umount();
+free_store:
+ delete store;
+ return ret;
+}
+
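+// Write the plain-text store metadata: magic, whoami, ceph_fsid, an optional
+// osd_key or osdspec_affinity, the creating version and timestamp, and
+// finally the "ready" marker.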
+int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
+{
+ char val[80];
+ int r;
+
+ snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
+ r = store->write_meta("magic", val);
+ if (r < 0)
+ return r;
+
+ snprintf(val, sizeof(val), "%d", whoami);
+ r = store->write_meta("whoami", val);
+ if (r < 0)
+ return r;
+
+ cluster_fsid.print(val);
+ r = store->write_meta("ceph_fsid", val);
+ if (r < 0)
+ return r;
+
+ string key = cct->_conf.get_val<string>("key");
+ if (key.size()) {
+ r = store->write_meta("osd_key", key);
+ if (r < 0)
+ return r;
+ } else {
+ string keyfile = cct->_conf.get_val<string>("keyfile");
+ if (!keyfile.empty()) {
+ bufferlist keybl;
+ string err;
+ r = keybl.read_file(keyfile.c_str(), &err);
+ if (r < 0) {
+ derr << __func__ << " failed to read keyfile " << keyfile << ": "
+ << err << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ r = store->write_meta("osd_key", keybl.to_str());
+ if (r < 0)
+ return r;
+ }
+ }
+ if (!osdspec_affinity.empty()) {
+ r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
+ if (r < 0)
+ return r;
+ }
+
+ r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
+ if (r < 0)
+ return r;
+
+ ostringstream created_at;
+ utime_t now = ceph_clock_now();
+ now.gmtime(created_at);
+ r = store->write_meta("created_at", created_at.str());
+ if (r < 0)
+ return r;
+
+ r = store->write_meta("ready", "ready");
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
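+// Read the store's text metadata (magic, whoami, ceph_fsid, fsid,
+// require_osd_release). A missing "fsid" key yields a zeroed osd_fsid,
+// "require_osd_release" is optional, and an unparseable uuid returns -EINVAL.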
+int OSD::peek_meta(ObjectStore *store,
+ std::string *magic,
+ uuid_d *cluster_fsid,
+ uuid_d *osd_fsid,
+ int *whoami,
+ ceph_release_t *require_osd_release)
+{
+ string val;
+
+ int r = store->read_meta("magic", &val);
+ if (r < 0)
+ return r;
+ *magic = val;
+
+ r = store->read_meta("whoami", &val);
+ if (r < 0)
+ return r;
+ *whoami = atoi(val.c_str());
+
+ r = store->read_meta("ceph_fsid", &val);
+ if (r < 0)
+ return r;
+ r = cluster_fsid->parse(val.c_str());
+ if (!r)
+ return -EINVAL;
+
+ r = store->read_meta("fsid", &val);
+ if (r < 0) {
+ *osd_fsid = uuid_d();
+ } else {
+ r = osd_fsid->parse(val.c_str());
+ if (!r)
+ return -EINVAL;
+ }
+
+ r = store->read_meta("require_osd_release", &val);
+ if (r >= 0) {
+ *require_osd_release = ceph_release_from_name(val);
+ }
+
+ return 0;
+}
+
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
+
+// cons/des
+
+OSD::OSD(CephContext *cct_, ObjectStore *store_,
+         int id,
+         Messenger *internal_messenger,
+         Messenger *external_messenger,
+         Messenger *hb_client_front,
+         Messenger *hb_client_back,
+         Messenger *hb_front_serverm,
+         Messenger *hb_back_serverm,
+         Messenger *osdc_messenger,
+         MonClient *mc,
+         const std::string &dev, const std::string &jdev,
+         ceph::async::io_context_pool& poolctx) :
+ Dispatcher(cct_),
+ tick_timer(cct, osd_lock),
+ tick_timer_without_osd_lock(cct, tick_timer_lock),
+ gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
+ cluster_messenger(internal_messenger),
+ client_messenger(external_messenger),
+ objecter_messenger(osdc_messenger),
+ monc(mc),
+ mgrc(cct_, client_messenger, &mc->monmap),
+ logger(create_logger()),
+ recoverystate_perf(create_recoverystate_perf()),
+ store(store_),
+ log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
+ clog(log_client.create_channel()),
+ whoami(id),
+ dev_path(dev), journal_path(jdev),
+ store_is_rotational(store->is_rotational()),
+ trace_endpoint("0.0.0.0", 0, "osd"),
+ asok_hook(NULL),
+ m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
+ "osd_pg_epoch_max_lag_factor")),
+ osd_compat(get_osd_compat_set()),
+ osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
+ get_num_op_threads()),
+ heartbeat_stop(false),
+ heartbeat_need_update(true),
+ hb_front_client_messenger(hb_client_front),
+ hb_back_client_messenger(hb_client_back),
+ hb_front_server_messenger(hb_front_serverm),
+ hb_back_server_messenger(hb_back_serverm),
+ daily_loadavg(0.0),
+ heartbeat_thread(this),
+ heartbeat_dispatcher(this),
+ op_tracker(cct, cct->_conf->osd_enable_op_tracker,
+ cct->_conf->osd_num_op_tracker_shard),
+ test_ops_hook(NULL),
+ op_shardedwq(
+ this,
+ ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
+ ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
+ &osd_op_tp),
+ last_pg_create_epoch(0),
+ boot_finisher(cct),
+ up_thru_wanted(0),
+ requested_full_first(0),
+ requested_full_last(0),
+ service(this, poolctx)
+{
+
+ if (!gss_ktfile_client.empty()) {
+ // Assert we can export environment variable
+ /*
+ The default client keytab is used, if it is present and readable,
+ to automatically obtain initial credentials for GSSAPI client
+ applications. The principal name of the first entry in the client
+ keytab is used by default when obtaining initial credentials.
+ 1. The KRB5_CLIENT_KTNAME environment variable.
+ 2. The default_client_keytab_name profile variable in [libdefaults].
+ 3. The hardcoded default, DEFCKTNAME.
+ */
+ const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
+ gss_ktfile_client.c_str(), 1));
+ ceph_assert(set_result == 0);
+ }
+
+ monc->set_messenger(client_messenger);
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
+ cct->_conf->osd_op_history_slow_op_threshold);
+ ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "osd." << whoami;
+ trace_endpoint.copy_name(ss.str());
+#endif
+
+ // initialize shards
+ num_shards = get_num_op_shards();
+ for (uint32_t i = 0; i < num_shards; i++) {
+ OSDShard *one_shard = new OSDShard(
+ i,
+ cct,
+ this);
+ shards.push_back(one_shard);
+ }
+}
+
+OSD::~OSD()
+{
+ while (!shards.empty()) {
+ delete shards.back();
+ shards.pop_back();
+ }
+ cct->get_perfcounters_collection()->remove(recoverystate_perf);
+ cct->get_perfcounters_collection()->remove(logger);
+ delete recoverystate_perf;
+ delete logger;
+ delete store;
+}
+
+double OSD::get_tick_interval() const
+{
+ // vary +/- 5% to avoid scrub scheduling livelocks
+ constexpr auto delta = 0.05;
+ return (OSD_TICK_INTERVAL *
+ ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
+}
+
+void OSD::handle_signal(int signum)
+{
+ ceph_assert(signum == SIGINT || signum == SIGTERM);
+ derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
+ shutdown();
+}
+
+int OSD::pre_init()
+{
+ std::lock_guard lock(osd_lock);
+ if (is_stopping())
+ return 0;
+
+ if (store->test_mount_in_use()) {
+ derr << "OSD::pre_init: object store '" << dev_path << "' is "
+ << "currently in use. (Is ceph-osd already running?)" << dendl;
+ return -EBUSY;
+ }
+
+ cct->_conf.add_observer(this);
+ return 0;
+}
+
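+// Best-effort NUMA pinning: look up the numa node of the object store and of
+// the public/cluster network interfaces; if all three agree and
+// osd_numa_auto_affinity is enabled, pin all threads to that node's CPUs.
+// An explicit osd_numa_node setting overrides the auto-detection. Failures
+// only log and leave affinity untouched (the function always returns 0).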
+int OSD::set_numa_affinity()
+{
+ // storage numa node
+ int store_node = -1;
+ store->get_numa_node(&store_node, nullptr, nullptr);
+ if (store_node >= 0) {
+ dout(1) << __func__ << " storage numa node " << store_node << dendl;
+ }
+
+ // check network numa node(s)
+ int front_node = -1, back_node = -1;
+ string front_iface = pick_iface(
+ cct,
+ client_messenger->get_myaddrs().front().get_sockaddr_storage());
+ string back_iface = pick_iface(
+ cct,
+ cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
+ int r = get_iface_numa_node(front_iface, &front_node);
+ if (r >= 0 && front_node >= 0) {
+ dout(1) << __func__ << " public network " << front_iface << " numa node "
+ << front_node << dendl;
+ r = get_iface_numa_node(back_iface, &back_node);
+ if (r >= 0 && back_node >= 0) {
+ dout(1) << __func__ << " cluster network " << back_iface << " numa node "
+ << back_node << dendl;
+ if (front_node == back_node &&
+ front_node == store_node) {
+        dout(1) << __func__ << " objectstore and network numa nodes all match" << dendl;
+ if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
+ numa_node = front_node;
+ }
+ } else if (front_node != back_node) {
+ dout(1) << __func__ << " public and cluster network numa nodes do not match"
+ << dendl;
+ } else {
+ dout(1) << __func__ << " objectstore and network numa nodes do not match"
+ << dendl;
+ }
+ } else if (back_node == -2) {
+ dout(1) << __func__ << " cluster network " << back_iface
+ << " ports numa nodes do not match" << dendl;
+ } else {
+ derr << __func__ << " unable to identify cluster interface '" << back_iface
+ << "' numa node: " << cpp_strerror(r) << dendl;
+ }
+ } else if (front_node == -2) {
+ dout(1) << __func__ << " public network " << front_iface
+ << " ports numa nodes do not match" << dendl;
+ } else {
+ derr << __func__ << " unable to identify public interface '" << front_iface
+ << "' numa node: " << cpp_strerror(r) << dendl;
+ }
+ if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
+ // this takes precedence over the automagic logic above
+ numa_node = node;
+ }
+ if (numa_node >= 0) {
+ int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ dout(1) << __func__ << " unable to determine numa node " << numa_node
+ << " CPUs" << dendl;
+ numa_node = -1;
+ } else {
+ dout(1) << __func__ << " setting numa affinity to node " << numa_node
+ << " cpus "
+ << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
+ << dendl;
+ r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
+ << dendl;
+ numa_node = -1;
+ }
+ }
+ } else {
+ dout(1) << __func__ << " not setting numa affinity" << dendl;
+ }
+ return 0;
+}
+
+// asok
+
+class OSDSocketHook : public AdminSocketHook {
+ OSD *osd;
+public:
+ explicit OSDSocketHook(OSD *o) : osd(o) {}
+ int call(std::string_view prefix, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override {
+ ceph_abort("should use async hook");
+ }
+ void call_async(
+ std::string_view prefix,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ const bufferlist& inbl,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
+ try {
+ osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ bufferlist empty;
+ on_finish(-EINVAL, e.what(), empty);
+ }
+ }
+};
+
+std::set<int64_t> OSD::get_mapped_pools()
+{
+ std::set<int64_t> pools;
+ std::vector<spg_t> pgids;
+ _get_pgids(&pgids);
+ for (const auto &pgid : pgids) {
+ pools.insert(pgid.pool());
+ }
+ return pools;
+}
+
+void OSD::asok_command(
+ std::string_view prefix, const cmdmap_t& cmdmap,
+ Formatter *f,
+ const bufferlist& inbl,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ int ret = 0;
+ stringstream ss; // stderr error message stream
+ bufferlist outbl; // if empty at end, we'll dump formatter as output
+
+ // --- PG commands are routed here to PG::do_command ---
+ if (prefix == "pg" ||
+ prefix == "query" ||
+ prefix == "mark_unfound_lost" ||
+ prefix == "list_unfound" ||
+ prefix == "scrub" ||
+ prefix == "deep_scrub"
+ ) {
+ string pgidstr;
+ pg_t pgid;
+ if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
+ ss << "no pgid specified";
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!pgid.parse(pgidstr.c_str())) {
+ ss << "couldn't parse pgid '" << pgidstr << "'";
+ ret = -EINVAL;
+ goto out;
+ }
+ spg_t pcand;
+ PGRef pg;
+ if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
+ (pg = _lookup_lock_pg(pcand))) {
+ if (pg->is_primary()) {
+ cmdmap_t new_cmdmap = cmdmap;
+ try {
+ pg->do_command(prefix, new_cmdmap, inbl, on_finish);
+ pg->unlock();
+ return; // the pg handler calls on_finish directly
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ pg->unlock();
+ ss << e.what();
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ ss << "not primary for pgid " << pgid;
+ // do not reply; they will get newer maps and realize they
+ // need to resend.
+ pg->unlock();
+ ret = -EAGAIN;
+ goto out;
+ }
+ } else {
+ ss << "i don't have pgid " << pgid;
+ ret = -ENOENT;
+ }
+ }
+
+ // --- OSD commands follow ---
+
+ else if (prefix == "status") {
+ lock_guard l(osd_lock);
+ f->open_object_section("status");
+ f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
+ f->dump_stream("osd_fsid") << superblock.osd_fsid;
+ f->dump_unsigned("whoami", superblock.whoami);
+ f->dump_string("state", get_state_name(get_state()));
+ f->dump_unsigned("oldest_map", superblock.oldest_map);
+ f->dump_unsigned("newest_map", superblock.newest_map);
+ f->dump_unsigned("num_pgs", num_pgs);
+ f->close_section();
+ } else if (prefix == "flush_journal") {
+ store->flush_journal();
+ } else if (prefix == "dump_ops_in_flight" ||
+ prefix == "ops" ||
+ prefix == "dump_blocked_ops" ||
+ prefix == "dump_historic_ops" ||
+ prefix == "dump_historic_ops_by_duration" ||
+ prefix == "dump_historic_slow_ops") {
+
+    const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
+even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
+will start to track new ops received afterwards.";
+
+ set<string> filters;
+ vector<string> filter_str;
+ if (cmd_getval(cmdmap, "filterstr", filter_str)) {
+ copy(filter_str.begin(), filter_str.end(),
+ inserter(filters, filters.end()));
+ }
+
+ if (prefix == "dump_ops_in_flight" ||
+ prefix == "ops") {
+ if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_blocked_ops") {
+ if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_ops") {
+ if (!op_tracker.dump_historic_ops(f, false, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_ops_by_duration") {
+ if (!op_tracker.dump_historic_ops(f, true, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (prefix == "dump_historic_slow_ops") {
+ if (!op_tracker.dump_historic_slow_ops(f, filters)) {
+ ss << error_str;
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ } else if (prefix == "dump_op_pq_state") {
+ f->open_object_section("pq");
+ op_shardedwq.dump(f);
+ f->close_section();
+ } else if (prefix == "dump_blocklist") {
+ list<pair<entity_addr_t,utime_t> > bl;
+ list<pair<entity_addr_t,utime_t> > rbl;
+ OSDMapRef curmap = service.get_osdmap();
+ curmap->get_blocklist(&bl, &rbl);
+
+ f->open_array_section("blocklist");
+ for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
+ it != bl.end(); ++it) {
+ f->open_object_section("entry");
+ f->open_object_section("entity_addr_t");
+ it->first.dump(f);
+ f->close_section(); //entity_addr_t
+ it->second.localtime(f->dump_stream("expire_time"));
+ f->close_section(); //entry
+ }
+ f->close_section(); //blocklist
+ f->open_array_section("range_blocklist");
+ for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
+ it != rbl.end(); ++it) {
+ f->open_object_section("entry");
+ f->open_object_section("entity_addr_t");
+ it->first.dump(f);
+ f->close_section(); //entity_addr_t
+ it->second.localtime(f->dump_stream("expire_time"));
+ f->close_section(); //entry
+ }
+    f->close_section(); //range_blocklist
+ } else if (prefix == "dump_watchers") {
+ list<obj_watch_item_t> watchers;
+ // scan pg's
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ list<obj_watch_item_t> pg_watchers;
+ pg->get_watchers(&pg_watchers);
+ watchers.splice(watchers.end(), pg_watchers);
+ }
+
+ f->open_array_section("watchers");
+ for (list<obj_watch_item_t>::iterator it = watchers.begin();
+ it != watchers.end(); ++it) {
+
+ f->open_object_section("watch");
+
+ f->dump_string("namespace", it->obj.nspace);
+ f->dump_string("object", it->obj.oid.name);
+
+ f->open_object_section("entity_name");
+ it->wi.name.dump(f);
+ f->close_section(); //entity_name_t
+
+ f->dump_unsigned("cookie", it->wi.cookie);
+ f->dump_unsigned("timeout", it->wi.timeout_seconds);
+
+ f->open_object_section("entity_addr_t");
+ it->wi.addr.dump(f);
+ f->close_section(); //entity_addr_t
+
+ f->close_section(); //watch
+ }
+
+ f->close_section(); //watchers
+ } else if (prefix == "dump_recovery_reservations") {
+ f->open_object_section("reservations");
+ f->open_object_section("local_reservations");
+ service.local_reserver.dump(f);
+ f->close_section();
+ f->open_object_section("remote_reservations");
+ service.remote_reserver.dump(f);
+ f->close_section();
+ f->close_section();
+ } else if (prefix == "dump_scrub_reservations") {
+ f->open_object_section("scrub_reservations");
+ service.dump_scrub_reservations(f);
+ f->close_section();
+ } else if (prefix == "get_latest_osdmap") {
+ get_latest_osdmap();
+ } else if (prefix == "set_heap_property") {
+ string property;
+ int64_t value = 0;
+ string error;
+ bool success = false;
+ if (!cmd_getval(cmdmap, "property", property)) {
+ error = "unable to get property";
+ success = false;
+ } else if (!cmd_getval(cmdmap, "value", value)) {
+ error = "unable to get value";
+ success = false;
+ } else if (value < 0) {
+ error = "negative value not allowed";
+ success = false;
+ } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
+ error = "invalid property";
+ success = false;
+ } else {
+ success = true;
+ }
+ f->open_object_section("result");
+ f->dump_string("error", error);
+ f->dump_bool("success", success);
+ f->close_section();
+ } else if (prefix == "get_heap_property") {
+ string property;
+ size_t value = 0;
+ string error;
+ bool success = false;
+ if (!cmd_getval(cmdmap, "property", property)) {
+ error = "unable to get property";
+ success = false;
+ } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
+ error = "invalid property";
+ success = false;
+ } else {
+ success = true;
+ }
+ f->open_object_section("result");
+ f->dump_string("error", error);
+ f->dump_bool("success", success);
+ f->dump_int("value", value);
+ f->close_section();
+ } else if (prefix == "dump_objectstore_kv_stats") {
+ store->get_db_statistics(f);
+ } else if (prefix == "dump_scrubs") {
+ service.dumps_scrub(f);
+ } else if (prefix == "calc_objectstore_db_histogram") {
+ store->generate_db_histogram(f);
+ } else if (prefix == "flush_store_cache") {
+ store->flush_cache(&ss);
+ } else if (prefix == "dump_pgstate_history") {
+ f->open_object_section("pgstate_history");
+ f->open_array_section("pgs");
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ f->open_object_section("pg");
+ f->dump_stream("pg") << pg->pg_id;
+ f->dump_string("currently", pg->get_current_state());
+ pg->dump_pgstate_history(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ } else if (prefix == "compact") {
+ dout(1) << "triggering manual compaction" << dendl;
+ auto start = ceph::coarse_mono_clock::now();
+ store->compact();
+ auto end = ceph::coarse_mono_clock::now();
+ double duration = std::chrono::duration<double>(end-start).count();
+ dout(1) << "finished manual compaction in "
+ << duration
+ << " seconds" << dendl;
+ f->open_object_section("compact_result");
+ f->dump_float("elapsed_time", duration);
+ f->close_section();
+ } else if (prefix == "get_mapped_pools") {
+ f->open_array_section("mapped_pools");
+ set<int64_t> poollist = get_mapped_pools();
+ for (auto pool : poollist) {
+ f->dump_int("pool_id", pool);
+ }
+ f->close_section();
+ } else if (prefix == "smart") {
+ string devid;
+ cmd_getval(cmdmap, "devid", devid);
+ ostringstream out;
+ probe_smart(devid, out);
+ outbl.append(out.str());
+ } else if (prefix == "list_devices") {
+ set<string> devnames;
+ store->get_devices(&devnames);
+ f->open_array_section("list_devices");
+ for (auto dev : devnames) {
+ if (dev.find("dm-") == 0) {
+ continue;
+ }
+ string err;
+ f->open_object_section("device");
+ f->dump_string("device", "/dev/" + dev);
+ f->dump_string("device_id", get_device_id(dev, &err));
+ f->close_section();
+ }
+ f->close_section();
+ } else if (prefix == "send_beacon") {
+ lock_guard l(osd_lock);
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ }
+
+ else if (prefix == "cluster_log") {
+ vector<string> msg;
+ cmd_getval(cmdmap, "message", msg);
+ if (msg.empty()) {
+ ret = -EINVAL;
+ ss << "ignoring empty log message";
+ goto out;
+ }
+ string message = msg.front();
+ for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
+ message += " " + *a;
+ string lvl;
+ cmd_getval(cmdmap, "level", lvl);
+ clog_type level = string_to_clog_type(lvl);
+ if (level < 0) {
+ ret = -EINVAL;
+ ss << "unknown level '" << lvl << "'";
+ goto out;
+ }
+ clog->do_log(level, message);
+ }
+
+ else if (prefix == "bench") {
+ int64_t count;
+ int64_t bsize;
+ int64_t osize, onum;
+ // default count 1G, size 4MB
+ cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
+ cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
+ cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
+ cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
+ double elapsed = 0.0;
+
+ ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+ if (ret != 0) {
+ goto out;
+ }
+
+ double rate = count / elapsed;
+ double iops = rate / bsize;
+ f->open_object_section("osd_bench_results");
+ f->dump_int("bytes_written", count);
+ f->dump_int("blocksize", bsize);
+ f->dump_float("elapsed_sec", elapsed);
+ f->dump_float("bytes_per_sec", rate);
+ f->dump_float("iops", iops);
+ f->close_section();
+ }
+
+ else if (prefix == "flush_pg_stats") {
+ mgrc.send_pgstats();
+ f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
+ }
+
+ else if (prefix == "heap") {
+ std::stringstream outss;
+ ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
+ outbl.append(outss);
+ }
+
+ else if (prefix == "debug dump_missing") {
+ f->open_array_section("pgs");
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ string s = stringify(pg->pg_id);
+ f->open_array_section(s.c_str());
+ pg->lock();
+ pg->dump_missing(f);
+ pg->unlock();
+ f->close_section();
+ }
+ f->close_section();
+ }
+
+ else if (prefix == "debug kick_recovery_wq") {
+ int64_t delay;
+ cmd_getval(cmdmap, "delay", delay);
+ ostringstream oss;
+ oss << delay;
+ ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
+ if (ret != 0) {
+ ss << "kick_recovery_wq: error setting "
+ << "osd_recovery_delay_start to '" << delay << "': error "
+ << ret;
+ goto out;
+ }
+ cct->_conf.apply_changes(nullptr);
+ ss << "kicking recovery queue. set osd_recovery_delay_start "
+ << "to " << cct->_conf->osd_recovery_delay_start;
+ }
+
+ else if (prefix == "cpu_profiler") {
+ ostringstream ds;
+ string arg;
+ cmd_getval(cmdmap, "arg", arg);
+ vector<string> argvec;
+ get_str_vec(arg, argvec);
+ cpu_profiler_handle_command(argvec, ds);
+ outbl.append(ds.str());
+ }
+
+ else if (prefix == "dump_pg_recovery_stats") {
+ lock_guard l(osd_lock);
+ pg_recovery_stats.dump_formatted(f);
+ }
+
+ else if (prefix == "reset_pg_recovery_stats") {
+ lock_guard l(osd_lock);
+ pg_recovery_stats.reset();
+ }
+
+ else if (prefix == "perf histogram dump") {
+ std::string logger;
+ std::string counter;
+ cmd_getval(cmdmap, "logger", logger);
+ cmd_getval(cmdmap, "counter", counter);
+ cct->get_perfcounters_collection()->dump_formatted_histograms(
+ f, false, logger, counter);
+ }
+
+ else if (prefix == "cache drop") {
+ lock_guard l(osd_lock);
+ dout(20) << "clearing all caches" << dendl;
+ // Clear the objectstore's cache - onode and buffer for Bluestore,
+ // system's pagecache for Filestore
+ ret = store->flush_cache(&ss);
+ if (ret < 0) {
+ ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
+ goto out;
+ }
+ // Clear the objectcontext cache (per PG)
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg: pgs) {
+ pg->clear_cache();
+ }
+ }
+
+ else if (prefix == "cache status") {
+ lock_guard l(osd_lock);
+ int obj_ctx_count = 0;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg: pgs) {
+ obj_ctx_count += pg->get_cache_obj_count();
+ }
+ f->open_object_section("cache_status");
+ f->dump_int("object_ctx", obj_ctx_count);
+ store->dump_cache_stats(f);
+ f->close_section();
+ }
+
+ else if (prefix == "scrub_purged_snaps") {
+ lock_guard l(osd_lock);
+ scrub_purged_snaps();
+ }
+
+ else if (prefix == "dump_osd_network") {
+ lock_guard l(osd_lock);
+ int64_t value = 0;
+ if (!(cmd_getval(cmdmap, "value", value))) {
+ // Convert milliseconds to microseconds
+ value = static_cast<double>(g_conf().get_val<double>(
+ "mon_warn_on_slow_ping_time")) * 1000;
+ if (value == 0) {
+ double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+ value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+ value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+ }
+ } else {
+ // Convert user input to microseconds
+ value *= 1000;
+ }
+ if (value < 0) value = 0;
+
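+    // One row per peer/interface for the dump below: 'pingtime' is the worst
+    // of the 1/5/15-minute averages and (with 'to' and 'back') drives the
+    // sort order; only rows at or above the threshold are kept.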
+ struct osd_ping_time_t {
+ uint32_t pingtime;
+ int to;
+ bool back;
+ std::array<uint32_t,3> times;
+ std::array<uint32_t,3> min;
+ std::array<uint32_t,3> max;
+ uint32_t last;
+ uint32_t last_update;
+
+ bool operator<(const osd_ping_time_t& rhs) const {
+ if (pingtime < rhs.pingtime)
+ return true;
+ if (pingtime > rhs.pingtime)
+ return false;
+ if (to < rhs.to)
+ return true;
+ if (to > rhs.to)
+ return false;
+ return back;
+ }
+ };
+
+ set<osd_ping_time_t> sorted;
+ // Get pingtimes under lock and not on the stack
+ map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
+ service.get_hb_pingtime(pingtimes);
+ for (auto j : *pingtimes) {
+ if (j.second.last_update == 0)
+ continue;
+ osd_ping_time_t item;
+ item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+ item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
+ if (item.pingtime >= value) {
+ item.to = j.first;
+ item.times[0] = j.second.back_pingtime[0];
+ item.times[1] = j.second.back_pingtime[1];
+ item.times[2] = j.second.back_pingtime[2];
+ item.min[0] = j.second.back_min[0];
+ item.min[1] = j.second.back_min[1];
+ item.min[2] = j.second.back_min[2];
+ item.max[0] = j.second.back_max[0];
+ item.max[1] = j.second.back_max[1];
+ item.max[2] = j.second.back_max[2];
+ item.last = j.second.back_last;
+ item.back = true;
+ item.last_update = j.second.last_update;
+ sorted.emplace(item);
+ }
+ if (j.second.front_last == 0)
+ continue;
+ item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+ item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
+ if (item.pingtime >= value) {
+ item.to = j.first;
+ item.times[0] = j.second.front_pingtime[0];
+ item.times[1] = j.second.front_pingtime[1];
+ item.times[2] = j.second.front_pingtime[2];
+ item.min[0] = j.second.front_min[0];
+ item.min[1] = j.second.front_min[1];
+ item.min[2] = j.second.front_min[2];
+ item.max[0] = j.second.front_max[0];
+ item.max[1] = j.second.front_max[1];
+ item.max[2] = j.second.front_max[2];
+ item.last = j.second.front_last;
+ item.last_update = j.second.last_update;
+ item.back = false;
+ sorted.emplace(item);
+ }
+ }
+ delete pingtimes;
+ //
+ // Network ping times (1min 5min 15min)
+ f->open_object_section("network_ping_times");
+ f->dump_int("threshold", value / 1000);
+ f->open_array_section("entries");
+ for (auto &sitem : boost::adaptors::reverse(sorted)) {
+ ceph_assert(sitem.pingtime >= value);
+ f->open_object_section("entry");
+
+ const time_t lu(sitem.last_update);
+ char buffer[26];
+ string lustr(ctime_r(&lu, buffer));
+ lustr.pop_back(); // Remove trailing \n
+ auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ f->dump_string("last update", lustr);
+ f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
+ f->dump_int("from osd", whoami);
+ f->dump_int("to osd", sitem.to);
+ f->dump_string("interface", (sitem.back ? "back" : "front"));
+ f->open_object_section("average");
+ f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
+ f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
+ f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
+ f->close_section(); // average
+ f->open_object_section("min");
+      f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
+      f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
+      f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
+ f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
+ f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
+ f->close_section(); // max
+ f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
+ f->close_section(); // entry
+ }
+ f->close_section(); // entries
+ f->close_section(); // network_ping_times
+ } else {
+ ceph_abort_msg("broken asok registration");
+ }
+
+ out:
+ on_finish(ret, ss.str(), outbl);
+}
+
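+// Backend throughput micro-benchmark used by the 'bench' asok command above.
+// Caps 'bsize' at osd_bench_max_block_size and bounds 'count' by the
+// configured small-size IOPS / large-size throughput limits, optionally
+// pre-creates 'onum' objects of 'osize' bytes, then writes 'count' bytes in
+// 'bsize' chunks to the meta collection and reports the elapsed time.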
+int OSD::run_osd_bench_test(
+ int64_t count,
+ int64_t bsize,
+ int64_t osize,
+ int64_t onum,
+ double *elapsed,
+ ostream &ss)
+{
+ int ret = 0;
+ uint32_t duration = cct->_conf->osd_bench_duration;
+
+ if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
+ // let us limit the block size because the next checks rely on it
+ // having a sane value. If we allow any block size to be set things
+ // can still go sideways.
+ ss << "block 'size' values are capped at "
+ << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
+ << " a higher value, please adjust 'osd_bench_max_block_size'";
+ ret = -EINVAL;
+ return ret;
+ } else if (bsize < (int64_t) (1 << 20)) {
+ // entering the realm of small block sizes.
+ // limit the count to a sane value, assuming a configurable amount of
+ // IOPS and duration, so that the OSD doesn't get hung up on this,
+ // preventing timeouts from going off
+ int64_t max_count =
+ bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
+ if (count > max_count) {
+ ss << "'count' values greater than " << max_count
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
+ << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
+ << " for " << duration << " seconds,"
+ << " can cause ill effects on osd. "
+ << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
+ << " value if you wish to use a higher 'count'.";
+ ret = -EINVAL;
+ return ret;
+ }
+ } else {
+ // 1MB block sizes are big enough so that we get more stuff done.
+ // However, to avoid the osd from getting hung on this and having
+ // timers being triggered, we are going to limit the count assuming
+ // a configurable throughput and duration.
+ // NOTE: max_count is the total amount of bytes that we believe we
+ // will be able to write during 'duration' for the given
+ // throughput. The block size hardly impacts this unless it's
+ // way too big. Given we already check how big the block size
+ // is, it's safe to assume everything will check out.
+ int64_t max_count =
+ cct->_conf->osd_bench_large_size_max_throughput * duration;
+ if (count > max_count) {
+ ss << "'count' values greater than " << max_count
+ << " for a block size of " << byte_u_t(bsize) << ", assuming "
+ << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
+ << " for " << duration << " seconds,"
+ << " can cause ill effects on osd. "
+ << " Please adjust 'osd_bench_large_size_max_throughput'"
+ << " with a higher value if you wish to use a higher 'count'.";
+ ret = -EINVAL;
+ return ret;
+ }
+ }
+
+ if (osize && bsize > osize) {
+ bsize = osize;
+ }
+
+ dout(1) << " bench count " << count
+ << " bsize " << byte_u_t(bsize) << dendl;
+
+ ObjectStore::Transaction cleanupt;
+
+ if (osize && onum) {
+ bufferlist bl;
+ bufferptr bp(osize);
+ bp.zero();
+ bl.push_back(std::move(bp));
+ bl.rebuild_page_aligned();
+ for (int i=0; i<onum; ++i) {
+ char nm[30];
+ snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
+ object_t oid(nm);
+ hobject_t soid(sobject_t(oid, 0));
+ ObjectStore::Transaction t;
+ t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
+ store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ cleanupt.remove(coll_t(), ghobject_t(soid));
+ }
+ }
+
+ bufferlist bl;
+ bufferptr bp(bsize);
+ bp.zero();
+ bl.push_back(std::move(bp));
+ bl.rebuild_page_aligned();
+
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+
+ utime_t start = ceph_clock_now();
+ for (int64_t pos = 0; pos < count; pos += bsize) {
+ char nm[30];
+ unsigned offset = 0;
+ if (onum && osize) {
+ snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
+ offset = rand() % (osize / bsize) * bsize;
+ } else {
+ snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+ }
+ object_t oid(nm);
+ hobject_t soid(sobject_t(oid, 0));
+ ObjectStore::Transaction t;
+ t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+ store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ if (!onum || !osize) {
+ cleanupt.remove(coll_t::meta(), ghobject_t(soid));
+ }
+ }
+
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+ utime_t end = ceph_clock_now();
+ *elapsed = end - start;
+
+ // clean up
+ store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
+ {
+ C_SaferCond waiter;
+ if (!service.meta_ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+ }
+
+ return ret;
+}
+
+class TestOpsSocketHook : public AdminSocketHook {
+ OSDService *service;
+ ObjectStore *store;
+public:
+ TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& errss,
+ bufferlist& out) override {
+ int r = 0;
+ stringstream outss;
+ try {
+ test_ops(service, store, command, cmdmap, outss);
+ out.append(outss);
+ } catch (const TOPNSPC::common::bad_cmd_get& e) {
+ errss << e.what();
+ r = -EINVAL;
+ }
+ return r;
+ }
+ void test_ops(OSDService *service, ObjectStore *store,
+ std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
+
+};
+
+class OSD::C_Tick : public Context {
+ OSD *osd;
+ public:
+ explicit C_Tick(OSD *o) : osd(o) {}
+ void finish(int r) override {
+ osd->tick();
+ }
+};
+
+class OSD::C_Tick_WithoutOSDLock : public Context {
+ OSD *osd;
+ public:
+ explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
+ void finish(int r) override {
+ osd->tick_without_osd_lock();
+ }
+};
+
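+// When built with libfuse, mount or unmount a FuseStore view of the object
+// store at <osd_data>/fuse according to osd_objectstore_fuse (or force an
+// unmount when 'stop' is true). A no-op without HAVE_LIBFUSE.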
+int OSD::enable_disable_fuse(bool stop)
+{
+#ifdef HAVE_LIBFUSE
+ int r;
+ string mntpath = cct->_conf->osd_data + "/fuse";
+ if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
+ dout(1) << __func__ << " disabling" << dendl;
+ fuse_store->stop();
+ delete fuse_store;
+ fuse_store = NULL;
+ r = ::rmdir(mntpath.c_str());
+ if (r < 0) {
+ r = -errno;
+ derr << __func__ << " failed to rmdir " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ return 0;
+ }
+ if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
+ dout(1) << __func__ << " enabling" << dendl;
+ r = ::mkdir(mntpath.c_str(), 0700);
+ if (r < 0)
+ r = -errno;
+ if (r < 0 && r != -EEXIST) {
+ derr << __func__ << " unable to create " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ fuse_store = new FuseStore(store, mntpath);
+ r = fuse_store->start();
+ if (r < 0) {
+ derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
+ delete fuse_store;
+ fuse_store = NULL;
+ return r;
+ }
+ }
+#endif // HAVE_LIBFUSE
+ return 0;
+}
+
+size_t OSD::get_num_cache_shards()
+{
+ return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
+}
+
+int OSD::get_num_op_shards()
+{
+ if (cct->_conf->osd_op_num_shards)
+ return cct->_conf->osd_op_num_shards;
+ if (store_is_rotational)
+ return cct->_conf->osd_op_num_shards_hdd;
+ else
+ return cct->_conf->osd_op_num_shards_ssd;
+}
+
+int OSD::get_num_op_threads()
+{
+ if (cct->_conf->osd_op_num_threads_per_shard)
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
+ if (store_is_rotational)
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
+ else
+ return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
+}
+
+float OSD::get_osd_recovery_sleep()
+{
+ if (cct->_conf->osd_recovery_sleep)
+ return cct->_conf->osd_recovery_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf->osd_recovery_sleep_ssd;
+ else if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
+ else
+ return cct->_conf->osd_recovery_sleep_hdd;
+}
+
+float OSD::get_osd_delete_sleep()
+{
+ float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
+ if (osd_delete_sleep > 0)
+ return osd_delete_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
+ if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
+ return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
+}
+
+int OSD::get_recovery_max_active()
+{
+ if (cct->_conf->osd_recovery_max_active)
+ return cct->_conf->osd_recovery_max_active;
+ if (store_is_rotational)
+ return cct->_conf->osd_recovery_max_active_hdd;
+ else
+ return cct->_conf->osd_recovery_max_active_ssd;
+}
+
+float OSD::get_osd_snap_trim_sleep()
+{
+ float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
+ if (osd_snap_trim_sleep > 0)
+ return osd_snap_trim_sleep;
+ if (!store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
+ if (store_is_rotational && !journal_is_rotational)
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
+ return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
+}
+
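+// Bring the OSD up to the point where it can start booting: mount the object
+// store, read and (if needed) upgrade the superblock, load the current osdmap
+// and existing PGs, wire up the messengers and mon/mgr clients, start the op
+// thread pool, heartbeat thread and tick timers, authenticate with the
+// monitors, and finally kick off start_boot(). On error the store is
+// unmounted and released.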
+int OSD::init()
+{
+ OSDMapRef osdmap;
+ CompatSet initial, diff;
+ std::lock_guard lock(osd_lock);
+ if (is_stopping())
+ return 0;
+
+ tick_timer.init();
+ tick_timer_without_osd_lock.init();
+ service.recovery_request_timer.init();
+ service.sleep_timer.init();
+
+ boot_finisher.start();
+
+ {
+ string val;
+ store->read_meta("require_osd_release", &val);
+ last_require_osd_release = ceph_release_from_name(val);
+ }
+
+ // mount.
+ dout(2) << "init " << dev_path
+ << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
+ << dendl;
+ dout(2) << "journal " << journal_path << dendl;
+ ceph_assert(store); // call pre_init() first!
+
+ store->set_cache_shards(get_num_cache_shards());
+
+ int r = store->mount();
+ if (r < 0) {
+ derr << "OSD:init: unable to mount object store" << dendl;
+ return r;
+ }
+ journal_is_rotational = store->is_journal_rotational();
+ dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
+ << dendl;
+
+ enable_disable_fuse(false);
+
+ dout(2) << "boot" << dendl;
+
+ service.meta_ch = store->open_collection(coll_t::meta());
+
+ // initialize the daily loadavg with current 15min loadavg
+ double loadavgs[3];
+ if (getloadavg(loadavgs, 3) == 3) {
+ daily_loadavg = loadavgs[2];
+ } else {
+ derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+ daily_loadavg = 1.0;
+ }
+
+ int rotating_auth_attempts = 0;
+ auto rotating_auth_timeout =
+ g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
+
+ // sanity check long object name handling
+ {
+ hobject_t l;
+ l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
+ l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
+ l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
+ r = store->validate_hobject_key(l);
+ if (r < 0) {
+ derr << "backend (" << store->get_type() << ") is unable to support max "
+ << "object name[space] len" << dendl;
+ derr << " osd max object name len = "
+ << cct->_conf->osd_max_object_name_len << dendl;
+ derr << " osd max object namespace len = "
+ << cct->_conf->osd_max_object_namespace_len << dendl;
+ derr << cpp_strerror(r) << dendl;
+ if (cct->_conf->osd_check_max_object_name_len_on_startup) {
+ goto out;
+ }
+ derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
+ << dendl;
+ } else {
+ dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
+ }
+ }
+
+ // read superblock
+ r = read_superblock();
+ if (r < 0) {
+ derr << "OSD::init() : unable to read osd superblock" << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (osd_compat.compare(superblock.compat_features) < 0) {
+ derr << "The disk uses features unsupported by the executable." << dendl;
+ derr << " ondisk features " << superblock.compat_features << dendl;
+ derr << " daemon features " << osd_compat << dendl;
+
+ if (osd_compat.writeable(superblock.compat_features)) {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "it is still writeable, though. Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ else {
+ CompatSet diff = osd_compat.unsupported(superblock.compat_features);
+ derr << "Cannot write to disk! Missing features: " << diff << dendl;
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+
+ assert_warn(whoami == superblock.whoami);
+ if (whoami != superblock.whoami) {
+ derr << "OSD::init: superblock says osd"
+ << superblock.whoami << " but I am osd." << whoami << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+
+ startup_time = ceph::mono_clock::now();
+
+ // load up "current" osdmap
+ assert_warn(!get_osdmap());
+ if (get_osdmap()) {
+ derr << "OSD::init: unable to read current osdmap" << dendl;
+ r = -EINVAL;
+ goto out;
+ }
+ osdmap = get_map(superblock.current_epoch);
+ set_osdmap(osdmap);
+
+ // make sure we don't have legacy pgs deleting
+ {
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ ceph_assert(r >= 0);
+ for (auto c : ls) {
+ spg_t pgid;
+ if (c.is_pg(&pgid) &&
+ !osdmap->have_pg_pool(pgid.pool())) {
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool());
+ if (!store->exists(service.meta_ch, oid)) {
+ derr << __func__ << " missing pg_pool_t for deleted pool "
+ << pgid.pool() << " for pg " << pgid
+ << "; please downgrade to luminous and allow "
+ << "pg deletion to complete before upgrading" << dendl;
+ ceph_abort();
+ }
+ }
+ }
+ }
+
+ initial = get_osd_initial_compat_set();
+ diff = superblock.compat_features.unsupported(initial);
+ if (superblock.compat_features.merge(initial)) {
+ // Are we adding SNAPMAPPER2?
+ if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
+ dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
+ << dendl;
+ auto ch = service.meta_ch;
+ auto hoid = make_snapmapper_oid();
+ unsigned max = cct->_conf->osd_target_transaction_size;
+ r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
+ if (r < 0)
+ goto out;
+ }
+ // We need to persist the new compat_set before we
+ // do anything else
+ dout(5) << "Upgrading superblock adding: " << diff << dendl;
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+
+ // make sure snap mapper object exists
+ if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
+ dout(10) << "init creating/touching snapmapper object" << dendl;
+ ObjectStore::Transaction t;
+ t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+ if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
+ dout(10) << "init creating/touching purged_snaps object" << dendl;
+ ObjectStore::Transaction t;
+ t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
+ r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r < 0)
+ goto out;
+ }
+
+ if (cct->_conf->osd_open_classes_on_start) {
+ int r = ClassHandler::get_instance().open_all_classes();
+ if (r)
+ dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
+ }
+
+ check_osdmap_features();
+
+ {
+ epoch_t bind_epoch = osdmap->get_epoch();
+ service.set_epochs(NULL, NULL, &bind_epoch);
+ }
+
+ clear_temp_objects();
+
+ // initialize osdmap references in sharded wq
+ for (auto& shard : shards) {
+ std::lock_guard l(shard->osdmap_lock);
+ shard->shard_osdmap = osdmap;
+ }
+
+ // load up pgs (as they previously existed)
+ load_pgs();
+
+ dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
+
+ if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
+ dout(2) << "compacting object store's omap" << dendl;
+ store->compact();
+ }
+
+ // prime osd stats
+ {
+ struct store_statfs_t stbuf;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&stbuf, &alerts);
+ ceph_assert(r == 0);
+ service.set_statfs(stbuf, alerts);
+ }
+
+ // client_messenger's auth_client will be set up by monc->init() later.
+ for (auto m : { cluster_messenger,
+ objecter_messenger,
+ hb_front_client_messenger,
+ hb_back_client_messenger,
+ hb_front_server_messenger,
+ hb_back_server_messenger } ) {
+ m->set_auth_client(monc);
+ }
+ for (auto m : { client_messenger,
+ cluster_messenger,
+ hb_front_server_messenger,
+ hb_back_server_messenger }) {
+ m->set_auth_server(monc);
+ }
+ monc->set_handle_authentication_dispatcher(this);
+
+ monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
+ | CEPH_ENTITY_TYPE_MGR);
+ r = monc->init();
+ if (r < 0)
+ goto out;
+
+ mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
+ mgrc.set_perf_metric_query_cb(
+ [this](const ConfigPayload &config_payload) {
+ set_perf_queries(config_payload);
+ },
+ [this] {
+ return get_perf_reports();
+ });
+ mgrc.init();
+
+ // tell monc about log_client so it will know about mon session resets
+ monc->set_log_client(&log_client);
+ update_log_config();
+
+ // i'm ready!
+ client_messenger->add_dispatcher_tail(&mgrc);
+ client_messenger->add_dispatcher_tail(this);
+ cluster_messenger->add_dispatcher_head(this);
+
+ hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+ hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
+
+ objecter_messenger->add_dispatcher_head(service.objecter.get());
+
+ service.init();
+ service.publish_map(osdmap);
+ service.publish_superblock(superblock);
+ service.max_oldest_map = superblock.oldest_map;
+
+ for (auto& shard : shards) {
+ // put PGs in a temporary set because we may modify pg_slots
+ // unordered_map below.
+ set<PGRef> pgs;
+ for (auto& i : shard->pg_slots) {
+ PGRef pg = i.second->pg;
+ if (!pg) {
+ continue;
+ }
+ pgs.insert(pg);
+ }
+ for (auto pg : pgs) {
+ std::scoped_lock l{*pg};
+ set<pair<spg_t,epoch_t>> new_children;
+ set<pair<spg_t,epoch_t>> merge_pgs;
+ service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
+ &new_children, &merge_pgs);
+ if (!new_children.empty()) {
+ for (auto shard : shards) {
+ shard->prime_splits(osdmap, &new_children);
+ }
+ assert(new_children.empty());
+ }
+ if (!merge_pgs.empty()) {
+ for (auto shard : shards) {
+ shard->prime_merges(osdmap, &merge_pgs);
+ }
+ assert(merge_pgs.empty());
+ }
+ }
+ }
+
+ osd_op_tp.start();
+
+ // start the heartbeat
+ heartbeat_thread.create("osd_srv_heartbt");
+
+ // tick
+ tick_timer.add_event_after(get_tick_interval(),
+ new C_Tick(this));
+ {
+ std::lock_guard l(tick_timer_lock);
+ tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
+ new C_Tick_WithoutOSDLock(this));
+ }
+
+ osd_lock.unlock();
+
+ r = monc->authenticate();
+ if (r < 0) {
+ derr << __func__ << " authentication failed: " << cpp_strerror(r)
+ << dendl;
+ exit(1);
+ }
+
+ while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
+ derr << "unable to obtain rotating service keys; retrying" << dendl;
+ ++rotating_auth_attempts;
+ if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
+ derr << __func__ << " wait_auth_rotating timed out" << dendl;
+ exit(1);
+ }
+ }
+
+ r = update_crush_device_class();
+ if (r < 0) {
+ derr << __func__ << " unable to update_crush_device_class: "
+ << cpp_strerror(r) << dendl;
+ exit(1);
+ }
+
+ r = update_crush_location();
+ if (r < 0) {
+ derr << __func__ << " unable to update_crush_location: "
+ << cpp_strerror(r) << dendl;
+ exit(1);
+ }
+
+ osd_lock.lock();
+ if (is_stopping())
+ return 0;
+
+ // start objecter *after* we have authenticated, so that we don't ignore
+ // the OSDMaps it requests.
+ service.final_init();
+
+ check_config();
+
+ dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+ consume_map();
+
+ dout(0) << "done with init, starting boot process" << dendl;
+
+ // subscribe to any pg creations
+ monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
+
+ // MgrClient needs this (it doesn't have MonClient reference itself)
+ monc->sub_want("mgrmap", 0, 0);
+
+ // we don't need to ask for an osdmap here; objecter will
+ //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
+
+ monc->renew_subs();
+
+ start_boot();
+
+ // Override a few options if mclock scheduler is enabled.
+ maybe_override_max_osd_capacity_for_qos();
+ maybe_override_options_for_qos();
+
+ return 0;
+
+out:
+ enable_disable_fuse(true);
+ store->umount();
+ delete store;
+ store = NULL;
+ return r;
+}
+
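+// Register the OSD's admin-socket commands: the OSDSocketHook commands
+// handled by asok_command() above, plus the TestOpsSocketHook debug commands
+// (setomapval, rmomapkey, truncobj, injectdataerr, ...).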
+void OSD::final_init()
+{
+ AdminSocket *admin_socket = cct->get_admin_socket();
+ asok_hook = new OSDSocketHook(this);
+ int r = admin_socket->register_command("status", asok_hook,
+ "high-level status of OSD");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("flush_journal",
+ asok_hook,
+ "flush the journal to permanent store");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_ops_in_flight " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_blocked_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show the blocked ops currently in flight");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show recent ops");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_slow_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show slowest recent ops");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_historic_ops_by_duration " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
+ "show slowest recent ops, sorted by duration");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_op_pq_state",
+ asok_hook,
+ "dump op priority queue state");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_blocklist",
+ asok_hook,
+ "dump blocklisted clients and times");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_watchers",
+ asok_hook,
+ "show clients which have active watches,"
+ " and on which objects");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_recovery_reservations",
+ asok_hook,
+ "show recovery reservations");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_scrub_reservations",
+ asok_hook,
+ "show scrub reservations");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("get_latest_osdmap",
+ asok_hook,
+ "force osd to update the latest map from "
+ "the mon");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("set_heap_property " \
+ "name=property,type=CephString " \
+ "name=value,type=CephInt",
+ asok_hook,
+ "update malloc extension heap property");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("get_heap_property " \
+ "name=property,type=CephString",
+ asok_hook,
+ "get malloc extension heap property");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("dump_objectstore_kv_stats",
+ asok_hook,
+ "print statistics of kvdb which used by bluestore");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("dump_scrubs",
+ asok_hook,
+ "print scheduled scrubs");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("calc_objectstore_db_histogram",
+ asok_hook,
+ "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("flush_store_cache",
+ asok_hook,
+ "Flush bluestore internal cache");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command("dump_pgstate_history",
+ asok_hook,
+ "show recent state history");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("compact",
+ asok_hook,
+ "Commpact object store's omap."
+ " WARNING: Compaction probably slows your requests");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("get_mapped_pools",
+ asok_hook,
+ "dump pools whose PG(s) are mapped to this OSD.");
+
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
+ asok_hook,
+ "probe OSD devices for SMART data.");
+
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command("list_devices",
+ asok_hook,
+ "list OSD devices.");
+ r = admin_socket->register_command("send_beacon",
+ asok_hook,
+ "send OSD beacon to mon immediately");
+
+ r = admin_socket->register_command(
+ "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
+ "Dump osd heartbeat network ping times");
+ ceph_assert(r == 0);
+
+ test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
+ // Note: pools are CephString instead of CephPoolname because
+ // these commands traditionally support both pool names and numbers
+ r = admin_socket->register_command(
+ "setomapval " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=key,type=CephString "\
+ "name=val,type=CephString",
+ test_ops_hook,
+ "set omap key");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "rmomapkey " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=key,type=CephString",
+ test_ops_hook,
+ "remove omap key");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "setomapheader " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=header,type=CephString",
+ test_ops_hook,
+ "set omap header");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "getomap " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname",
+ test_ops_hook,
+ "output entire object map");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "truncobj " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=len,type=CephInt",
+ test_ops_hook,
+ "truncate object to length");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "injectdataerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+ test_ops_hook,
+ "inject data error to an object");
+ ceph_assert(r == 0);
+
+ r = admin_socket->register_command(
+ "injectmdataerr " \
+ "name=pool,type=CephString " \
+ "name=objname,type=CephObjectname " \
+ "name=shardid,type=CephInt,req=false,range=0|255",
+ test_ops_hook,
+ "inject metadata error to an object");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "set_recovery_delay " \
+ "name=utime,type=CephInt,req=false",
+ test_ops_hook,
+ "Delay osd recovery by specified seconds");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "injectfull " \
+ "name=type,type=CephString,req=false " \
+ "name=count,type=CephInt,req=false ",
+ test_ops_hook,
+ "Inject a full disk (optional count times)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "bench " \
+ "name=count,type=CephInt,req=false " \
+ "name=size,type=CephInt,req=false " \
+ "name=object_size,type=CephInt,req=false " \
+ "name=object_num,type=CephInt,req=false ",
+ asok_hook,
+ "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
+ "(default count=1G default size=4MB). Results in log.");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cluster_log " \
+ "name=level,type=CephChoices,strings=error,warning,info,debug " \
+ "name=message,type=CephString,n=N",
+ asok_hook,
+ "log a message to the cluster log");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "flush_pg_stats",
+ asok_hook,
+ "flush pg stats");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "heap " \
+ "name=heapcmd,type=CephChoices,strings=" \
+ "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
+ "name=value,type=CephString,req=false",
+ asok_hook,
+ "show heap usage info (available only if compiled with tcmalloc)");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "debug dump_missing " \
+ "name=filename,type=CephFilepath",
+ asok_hook,
+ "dump missing objects to a named file");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "debug kick_recovery_wq " \
+ "name=delay,type=CephInt,range=0",
+ asok_hook,
+ "set osd_recovery_delay_start to <val>");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cpu_profiler " \
+ "name=arg,type=CephChoices,strings=status|flush",
+ asok_hook,
+ "run cpu profiling on daemon");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "dump_pg_recovery_stats",
+ asok_hook,
+ "dump pg recovery statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "reset_pg_recovery_stats",
+ asok_hook,
+ "reset pg recovery statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cache drop",
+ asok_hook,
+ "Drop all OSD caches");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "cache status",
+ asok_hook,
+ "Get OSD caches statistics");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "scrub_purged_snaps",
+ asok_hook,
+ "Scrub purged_snaps vs snapmapper index");
+ ceph_assert(r == 0);
+
+ // -- pg commands --
+ // old form: ceph pg <pgid> command ...
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=query",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
+ "name=mulcmd,type=CephChoices,strings=revert|delete",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=list_unfound " \
+ "name=offset,type=CephString,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=scrub " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "pg " \
+ "name=pgid,type=CephPgid " \
+ "name=cmd,type=CephChoices,strings=deep_scrub " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "");
+ ceph_assert(r == 0);
+ // new form: tell <pgid> <cmd> for both cli and rest
+ r = admin_socket->register_command(
+ "query",
+ asok_hook,
+ "show details of a specific pg");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "mark_unfound_lost " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=mulcmd,type=CephChoices,strings=revert|delete",
+ asok_hook,
+ "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "list_unfound " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=offset,type=CephString,req=false",
+ asok_hook,
+ "list unfound objects on this pg, perhaps starting at an offset given in JSON");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "scrub " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "Trigger a scheduled scrub ");
+ ceph_assert(r == 0);
+ r = admin_socket->register_command(
+ "deep_scrub " \
+ "name=pgid,type=CephPgid,req=false " \
+ "name=time,type=CephInt,req=false",
+ asok_hook,
+ "Trigger a scheduled deep scrub ");
+ ceph_assert(r == 0);
+}
+
+PerfCounters* OSD::create_logger()
+{
+ PerfCounters* logger = build_osd_logger(cct);
+ cct->get_perfcounters_collection()->add(logger);
+ return logger;
+}
+
+PerfCounters* OSD::create_recoverystate_perf()
+{
+ PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
+ cct->get_perfcounters_collection()->add(recoverystate_perf);
+ return recoverystate_perf;
+}
+
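+// Orderly shutdown: optionally fast-exit (osd_fast_shutdown), then stop the
+// MgrClient, drain the op queue, shut down all PGs, unregister the admin
+// socket hooks, stop the heartbeat thread and worker threads, record a clean
+// unmount in the superblock, and finally unmount the object store and shut
+// down the messengers.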
+int OSD::shutdown()
+{
+ if (cct->_conf->osd_fast_shutdown) {
+ derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
+ if (cct->_conf->osd_fast_shutdown_notify_mon)
+ service.prepare_to_stop();
+ cct->_log->flush();
+ _exit(0);
+ }
+
+ if (!service.prepare_to_stop())
+ return 0; // already shutting down
+ osd_lock.lock();
+ if (is_stopping()) {
+ osd_lock.unlock();
+ return 0;
+ }
+ dout(0) << "shutdown" << dendl;
+
+ set_state(STATE_STOPPING);
+
+ // Debugging
+ if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
+ cct->_conf.set_val("debug_osd", "100");
+ cct->_conf.set_val("debug_journal", "100");
+ cct->_conf.set_val("debug_filestore", "100");
+ cct->_conf.set_val("debug_bluestore", "100");
+ cct->_conf.set_val("debug_ms", "100");
+ cct->_conf.apply_changes(nullptr);
+ }
+
+ // stop MgrClient earlier as it's more like an internal consumer of OSD
+ mgrc.shutdown();
+
+ service.start_shutdown();
+
+ // stop sending work to pgs. this just prevents any new work in _process
+ // from racing with on_shutdown and potentially entering the pg after.
+ op_shardedwq.drain();
+
+ // Shutdown PGs
+ {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto pg : pgs) {
+ pg->shutdown();
+ }
+ }
+
+ // drain op queue again (in case PGs requeued something)
+ op_shardedwq.drain();
+ {
+ finished.clear(); // zap waiters (bleh, this is messy)
+ waiting_for_osdmap.clear();
+ }
+
+ // unregister commands
+ cct->get_admin_socket()->unregister_commands(asok_hook);
+ delete asok_hook;
+ asok_hook = NULL;
+
+ cct->get_admin_socket()->unregister_commands(test_ops_hook);
+ delete test_ops_hook;
+ test_ops_hook = NULL;
+
+ osd_lock.unlock();
+
+ {
+ std::lock_guard l{heartbeat_lock};
+ heartbeat_stop = true;
+ heartbeat_cond.notify_all();
+ heartbeat_peers.clear();
+ }
+ heartbeat_thread.join();
+
+ hb_back_server_messenger->mark_down_all();
+ hb_front_server_messenger->mark_down_all();
+ hb_front_client_messenger->mark_down_all();
+ hb_back_client_messenger->mark_down_all();
+
+ osd_op_tp.drain();
+ osd_op_tp.stop();
+ dout(10) << "op sharded tp stopped" << dendl;
+
+ dout(10) << "stopping agent" << dendl;
+ service.agent_stop();
+
+ boot_finisher.wait_for_empty();
+
+ osd_lock.lock();
+
+ boot_finisher.stop();
+ reset_heartbeat_peers(true);
+
+ tick_timer.shutdown();
+
+ {
+ std::lock_guard l(tick_timer_lock);
+ tick_timer_without_osd_lock.shutdown();
+ }
+
+ // note unmount epoch
+ dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
+ superblock.mounted = service.get_boot_epoch();
+ superblock.clean_thru = get_osdmap_epoch();
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ int r = store->queue_transaction(service.meta_ch, std::move(t));
+ if (r) {
+ derr << "OSD::shutdown: error writing superblock: "
+ << cpp_strerror(r) << dendl;
+ }
+
+
+ service.shutdown_reserver();
+
+ // Remove PGs
+#ifdef PG_DEBUG_REFS
+ service.dump_live_pgids();
+#endif
+ while (true) {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs, true);
+ if (pgs.empty()) {
+ break;
+ }
+ for (auto& pg : pgs) {
+ if (pg->is_deleted()) {
+ continue;
+ }
+ dout(20) << " kicking pg " << pg << dendl;
+ pg->lock();
+ if (pg->get_num_ref() != 1) {
+ derr << "pgid " << pg->get_pgid() << " has ref count of "
+ << pg->get_num_ref() << dendl;
+#ifdef PG_DEBUG_REFS
+ pg->dump_live_ids();
+#endif
+ if (cct->_conf->osd_shutdown_pgref_assert) {
+ ceph_abort();
+ }
+ }
+ pg->ch.reset();
+ pg->unlock();
+ }
+ }
+#ifdef PG_DEBUG_REFS
+ service.dump_live_pgids();
+#endif
+
+ osd_lock.unlock();
+ cct->_conf.remove_observer(this);
+ osd_lock.lock();
+
+ service.meta_ch.reset();
+
+ dout(10) << "syncing store" << dendl;
+ enable_disable_fuse(true);
+
+ if (cct->_conf->osd_journal_flush_on_shutdown) {
+ dout(10) << "flushing journal" << dendl;
+ store->flush_journal();
+ }
+
+ monc->shutdown();
+ osd_lock.unlock();
+ {
+ std::unique_lock l{map_lock};
+ set_osdmap(OSDMapRef());
+ }
+ for (auto s : shards) {
+ std::lock_guard l(s->osdmap_lock);
+ s->shard_osdmap = OSDMapRef();
+ }
+ service.shutdown();
+
+ std::lock_guard lock(osd_lock);
+ store->umount();
+ delete store;
+ store = nullptr;
+ dout(10) << "Store synced" << dendl;
+
+ op_tracker.on_shutdown();
+
+ ClassHandler::get_instance().shutdown();
+ client_messenger->shutdown();
+ cluster_messenger->shutdown();
+ hb_front_client_messenger->shutdown();
+ hb_back_client_messenger->shutdown();
+ objecter_messenger->shutdown();
+ hb_front_server_messenger->shutdown();
+ hb_back_server_messenger->shutdown();
+
+ return r;
+}
+
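+// Send a mon command; if it fails with -ENOENT, try once to create this OSD
+// id via "osd create" and then retry the original command.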
+int OSD::mon_cmd_maybe_osd_create(string &cmd)
+{
+ bool created = false;
+ while (true) {
+ dout(10) << __func__ << " cmd: " << cmd << dendl;
+ vector<string> vcmd{cmd};
+ bufferlist inbl;
+ C_SaferCond w;
+ string outs;
+ monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
+ int r = w.wait();
+ if (r < 0) {
+ if (r == -ENOENT && !created) {
+ string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
+ + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
+ vector<string> vnewcmd{newcmd};
+ bufferlist inbl;
+ C_SaferCond w;
+ string outs;
+ monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
+ int r = w.wait();
+ if (r < 0) {
+ derr << __func__ << " fail: osd does not exist and created failed: "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+ created = true;
+ continue;
+ }
+ derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ break;
+ }
+
+ return 0;
+}
+
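+// Push this OSD's crush location and weight to the monitors on startup
+// (controlled by osd_crush_update_on_start). The weight defaults to the
+// store size in TiB unless osd_crush_initial_weight is set.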
+int OSD::update_crush_location()
+{
+ if (!cct->_conf->osd_crush_update_on_start) {
+ dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
+ return 0;
+ }
+
+ char weight[32];
+ if (cct->_conf->osd_crush_initial_weight >= 0) {
+ snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
+ } else {
+ struct store_statfs_t st;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&st, &alerts);
+ if (r < 0) {
+ derr << "statfs: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ snprintf(weight, sizeof(weight), "%.4lf",
+ std::max(.00001,
+ double(st.total) /
+ double(1ull << 40 /* TB */)));
+ }
+
+ dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
+
+ string cmd =
+ string("{\"prefix\": \"osd crush create-or-move\", ") +
+ string("\"id\": ") + stringify(whoami) + ", " +
+ string("\"weight\":") + weight + ", " +
+ string("\"args\": [") + stringify(cct->crush_location) + "]}";
+ return mon_cmd_maybe_osd_create(cmd);
+}
+
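+// Set this OSD's crush device class from the locally stored value (or the
+// store's default) when osd_class_update_on_start is enabled. -EBUSY means
+// the class is already bound and is treated as success.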
+int OSD::update_crush_device_class()
+{
+ if (!cct->_conf->osd_class_update_on_start) {
+ dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
+ return 0;
+ }
+
+ string device_class;
+ int r = store->read_meta("crush_device_class", &device_class);
+ if (r < 0 || device_class.empty()) {
+ device_class = store->get_default_device_class();
+ }
+
+ if (device_class.empty()) {
+ dout(20) << __func__ << " no device class stored locally" << dendl;
+ return 0;
+ }
+
+ string cmd =
+ string("{\"prefix\": \"osd crush set-device-class\", ") +
+ string("\"class\": \"") + device_class + string("\", ") +
+ string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
+
+ r = mon_cmd_maybe_osd_create(cmd);
+ if (r == -EBUSY) {
+ // good, already bound to a device-class
+ return 0;
+ } else {
+ return r;
+ }
+}
+
+void OSD::write_superblock(ObjectStore::Transaction& t)
+{
+ dout(10) << "write_superblock " << superblock << dendl;
+
+ //hack: at minimum it's using the baseline feature set
+ if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+
+ bufferlist bl;
+ encode(superblock, bl);
+ t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
+}
+
+int OSD::read_superblock()
+{
+ bufferlist bl;
+ int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
+ if (r < 0)
+ return r;
+
+ auto p = bl.cbegin();
+ decode(superblock, p);
+
+ dout(10) << "read_superblock " << superblock << dendl;
+
+ return 0;
+}
+
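+// Scan every PG collection and remove leftover temp objects (including old
+// temps that Hammer stored with pool -1), batching the removals into
+// transactions of at most osd_target_transaction_size ops.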
+void OSD::clear_temp_objects()
+{
+ dout(10) << __func__ << dendl;
+ vector<coll_t> ls;
+ store->list_collections(ls);
+ for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ spg_t pgid;
+ if (!p->is_pg(&pgid))
+ continue;
+
+ // list temp objects
+ dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
+
+ vector<ghobject_t> temps;
+ ghobject_t next;
+ while (1) {
+ vector<ghobject_t> objects;
+ auto ch = store->open_collection(*p);
+ ceph_assert(ch);
+ store->collection_list(ch, next, ghobject_t::get_max(),
+ store->get_ideal_list_max(),
+ &objects, &next);
+ if (objects.empty())
+ break;
+ vector<ghobject_t>::iterator q;
+ for (q = objects.begin(); q != objects.end(); ++q) {
+ // Hammer set pool for temps to -1, so check for clean-up
+ if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
+ temps.push_back(*q);
+ } else {
+ break;
+ }
+ }
+ // If we saw a non-temp object and hit the break above we can
+ // break out of the while loop too.
+ if (q != objects.end())
+ break;
+ }
+ if (!temps.empty()) {
+ ObjectStore::Transaction t;
+ int removed = 0;
+ for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
+ dout(20) << " removing " << *p << " object " << *q << dendl;
+ t.remove(*p, *q);
+ if (++removed > cct->_conf->osd_target_transaction_size) {
+ store->queue_transaction(service.meta_ch, std::move(t));
+ t = ObjectStore::Transaction();
+ removed = 0;
+ }
+ }
+ if (removed) {
+ store->queue_transaction(service.meta_ch, std::move(t));
+ }
+ }
+ }
+}
+
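+// Remove every object in a PG collection (cleaning up its snap mapping as we
+// go), then remove the collection itself and wait for the commit to flush.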
+void OSD::recursive_remove_collection(CephContext* cct,
+ ObjectStore *store, spg_t pgid,
+ coll_t tmp)
+{
+ OSDriver driver(
+ store,
+ coll_t(),
+ make_snapmapper_oid());
+
+ ObjectStore::CollectionHandle ch = store->open_collection(tmp);
+ ObjectStore::Transaction t;
+ SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
+
+ ghobject_t next;
+ int max = cct->_conf->osd_target_transaction_size;
+ vector<ghobject_t> objects;
+ objects.reserve(max);
+ while (true) {
+ objects.clear();
+ store->collection_list(ch, next, ghobject_t::get_max(),
+ max, &objects, &next);
+ generic_dout(10) << __func__ << " " << objects << dendl;
+ if (objects.empty())
+ break;
+ for (auto& p: objects) {
+ OSDriver::OSTransaction _t(driver.get_transaction(&t));
+ int r = mapper.remove_oid(p.hobj, &_t);
+ if (r != 0 && r != -ENOENT)
+ ceph_abort();
+ t.remove(tmp, p);
+ }
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ t = ObjectStore::Transaction();
+ }
+ t.remove_collection(tmp);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+
+ C_SaferCond waiter;
+ if (!ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+}
+
+
+// ======================================================
+// PG's
+
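+// Construct a PG object for pgid using the pool info from createmap, falling
+// back to the final pool-info tombstone on disk if the pool has been deleted.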
+PG* OSD::_make_pg(
+ OSDMapRef createmap,
+ spg_t pgid)
+{
+ dout(10) << __func__ << " " << pgid << dendl;
+ pg_pool_t pi;
+ map<string,string> ec_profile;
+ string name;
+ if (createmap->have_pg_pool(pgid.pool())) {
+ pi = *createmap->get_pg_pool(pgid.pool());
+ name = createmap->get_pool_name(pgid.pool());
+ if (pi.is_erasure()) {
+ ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
+ }
+ } else {
+ // pool was deleted; grab final pg_pool_t off disk.
+ ghobject_t oid = make_final_pool_info_oid(pgid.pool());
+ bufferlist bl;
+ int r = store->read(service.meta_ch, oid, 0, 0, bl);
+ if (r < 0) {
+ derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
+ << dendl;
+ return nullptr;
+ }
+ ceph_assert(r >= 0);
+ auto p = bl.cbegin();
+ decode(pi, p);
+ decode(name, p);
+ if (p.end()) { // dev release v13.0.2 did not include ec_profile
+ derr << __func__ << " missing ec_profile from pool " << pgid.pool()
+ << " tombstone" << dendl;
+ return nullptr;
+ }
+ decode(ec_profile, p);
+ }
+ PGPool pool(createmap, pgid.pool(), pi, name);
+ PG *pg;
+ if (pi.type == pg_pool_t::TYPE_REPLICATED ||
+ pi.type == pg_pool_t::TYPE_ERASURE)
+ pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
+ else
+ ceph_abort();
+ return pg;
+}
+
+void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
+{
+ v->clear();
+ v->reserve(get_num_pgs());
+ for (auto& s : shards) {
+ std::lock_guard l(s->shard_lock);
+ for (auto& j : s->pg_slots) {
+ if (j.second->pg &&
+ !j.second->pg->is_deleted()) {
+ v->push_back(j.second->pg);
+ if (clear_too) {
+ s->_detach_pg(j.second.get());
+ }
+ }
+ }
+ }
+}
+
+void OSD::_get_pgids(vector<spg_t> *v)
+{
+ v->clear();
+ v->reserve(get_num_pgs());
+ for (auto& s : shards) {
+ std::lock_guard l(s->shard_lock);
+ for (auto& j : s->pg_slots) {
+ if (j.second->pg &&
+ !j.second->pg->is_deleted()) {
+ v->push_back(j.first);
+ }
+ }
+ }
+}
+
+void OSD::register_pg(PGRef pg)
+{
+ spg_t pgid = pg->get_pgid();
+ uint32_t shard_index = pgid.hash_to_shard(num_shards);
+ auto sdata = shards[shard_index];
+ std::lock_guard l(sdata->shard_lock);
+ auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
+ ceph_assert(r.second);
+ auto *slot = r.first->second.get();
+ dout(20) << __func__ << " " << pgid << " " << pg << dendl;
+ sdata->_attach_pg(slot, pg.get());
+}
+
+bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
+{
+ auto sdata = pg->osd_shard;
+ ceph_assert(sdata);
+ {
+ std::lock_guard l(sdata->shard_lock);
+ auto p = sdata->pg_slots.find(pg->pg_id);
+ if (p == sdata->pg_slots.end() ||
+ !p->second->pg) {
+ dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
+ return false;
+ }
+ if (p->second->waiting_for_merge_epoch) {
+ dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
+ return false;
+ }
+ dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
+ sdata->_detach_pg(p->second.get());
+ }
+
+ for (auto shard : shards) {
+ shard->unprime_split_children(pg->pg_id, old_pg_num);
+ }
+
+ // update pg count now since we might not get an osdmap any time soon.
+ if (pg->is_primary())
+ service.logger->dec(l_osd_pg_primary);
+ else if (pg->is_nonprimary())
+ service.logger->dec(l_osd_pg_replica); // misnomer
+ else
+ service.logger->dec(l_osd_pg_stray);
+
+ return true;
+}
+
+PGRef OSD::_lookup_pg(spg_t pgid)
+{
+ uint32_t shard_index = pgid.hash_to_shard(num_shards);
+ auto sdata = shards[shard_index];
+ std::lock_guard l(sdata->shard_lock);
+ auto p = sdata->pg_slots.find(pgid);
+ if (p == sdata->pg_slots.end()) {
+ return nullptr;
+ }
+ return p->second->pg;
+}
+
+PGRef OSD::_lookup_lock_pg(spg_t pgid)
+{
+ PGRef pg = _lookup_pg(pgid);
+ if (!pg) {
+ return nullptr;
+ }
+ pg->lock();
+ if (!pg->is_deleted()) {
+ return pg;
+ }
+ pg->unlock();
+ return nullptr;
+}
+
+PGRef OSD::lookup_lock_pg(spg_t pgid)
+{
+ return _lookup_lock_pg(pgid);
+}
+
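+// On startup, scan the store's collections: clean up temp and flagged-for-
+// removal PGs, instantiate the remaining PGs with the appropriate OSDMap,
+// read their on-disk state, and register them with the shards.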
+void OSD::load_pgs()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(0) << "load_pgs" << dendl;
+
+ {
+ auto pghist = make_pg_num_history_oid();
+ bufferlist bl;
+ int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
+ if (r >= 0 && bl.length() > 0) {
+ auto p = bl.cbegin();
+ decode(pg_num_history, p);
+ }
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
+
+ vector<coll_t> ls;
+ int r = store->list_collections(ls);
+ if (r < 0) {
+ derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
+ }
+
+ int num = 0;
+ for (vector<coll_t>::iterator it = ls.begin();
+ it != ls.end();
+ ++it) {
+ spg_t pgid;
+ if (it->is_temp(&pgid) ||
+ (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+ dout(10) << "load_pgs " << *it
+ << " removing, legacy or flagged for removal pg" << dendl;
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+
+ if (!it->is_pg(&pgid)) {
+ dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
+ continue;
+ }
+
+ dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
+ epoch_t map_epoch = 0;
+ int r = PG::peek_map_epoch(store, pgid, &map_epoch);
+ if (r < 0) {
+ derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
+ << dendl;
+ continue;
+ }
+
+ PGRef pg;
+ if (map_epoch > 0) {
+ OSDMapRef pgosdmap = service.try_get_map(map_epoch);
+ if (!pgosdmap) {
+ if (!get_osdmap()->have_pg_pool(pgid.pool())) {
+ derr << __func__ << ": could not find map for epoch " << map_epoch
+ << " on pg " << pgid << ", but the pool is not present in the "
+ << "current map, so this is probably a result of bug 10617. "
+ << "Skipping the pg for now, you can use ceph-objectstore-tool "
+ << "to clean it up later." << dendl;
+ continue;
+ } else {
+ derr << __func__ << ": have pgid " << pgid << " at epoch "
+ << map_epoch << ", but missing map. Crashing."
+ << dendl;
+ ceph_abort_msg("Missing map in load_pgs");
+ }
+ }
+ pg = _make_pg(pgosdmap, pgid);
+ } else {
+ pg = _make_pg(get_osdmap(), pgid);
+ }
+ if (!pg) {
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+
+ // there can be no waiters here, so we don't call _wake_pg_slot
+
+ pg->lock();
+ pg->ch = store->open_collection(pg->coll);
+
+ // read pg state, log
+ pg->read_state(store);
+
+ if (pg->dne()) {
+ dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
+ pg->ch = nullptr;
+ pg->unlock();
+ recursive_remove_collection(cct, store, pgid, *it);
+ continue;
+ }
+ {
+ uint32_t shard_index = pgid.hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
+ }
+
+ pg->reg_next_scrub();
+
+ dout(10) << __func__ << " loaded " << *pg << dendl;
+ pg->unlock();
+
+ register_pg(pg);
+ ++num;
+ }
+ dout(0) << __func__ << " opened " << num << " pgs" << dendl;
+}
+
+
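+// Create a new PG from a PGCreateInfo (mon- or peer-initiated): create its
+// collection, initialize it on disk and in memory, and kick off peering.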
+PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
+ const PGCreateInfo *info)
+{
+ spg_t pgid = info->pgid;
+
+ if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
+ dout(10) << __func__ << " hit max pg, dropping" << dendl;
+ return nullptr;
+ }
+
+ PeeringCtx rctx = create_context();
+
+ OSDMapRef startmap = get_map(info->epoch);
+
+ if (info->by_mon) {
+ int64_t pool_id = pgid.pgid.pool();
+ const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
+ if (!pool) {
+ dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
+ return nullptr;
+ }
+ if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
+ !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
+ // this ensures we do not process old creating messages after the
+ // pool's initial pgs have been created (and pgs are subsequently
+ // allowed to split or merge).
+ dout(20) << __func__ << " dropping " << pgid
+ << "create, pool does not have CREATING flag set" << dendl;
+ return nullptr;
+ }
+ }
+
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ startmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
+ if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
+ store->get_type() != "bluestore") {
+ clog->warn() << "pg " << pgid
+ << " is at risk of silent data corruption: "
+ << "the pool allows ec overwrites but is not stored in "
+ << "bluestore, so deep scrubbing will not detect bitrot";
+ }
+ create_pg_collection(
+ rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
+ init_pg_ondisk(rctx.transaction, pgid, pp);
+
+ int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
+
+ PGRef pg = _make_pg(startmap, pgid);
+ pg->ch = store->create_new_collection(pg->coll);
+
+ {
+ uint32_t shard_index = pgid.hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
+ }
+
+ pg->lock(true);
+
+ // we are holding the shard lock
+ ceph_assert(!pg->is_deleted());
+
+ pg->init(
+ role,
+ up,
+ up_primary,
+ acting,
+ acting_primary,
+ info->history,
+ info->past_intervals,
+ false,
+ rctx.transaction);
+
+ pg->init_collection_pool_opts();
+
+ if (pg->is_primary()) {
+ std::lock_guard locker{m_perf_queries_lock};
+ pg->set_dynamic_perf_stats_queries(m_perf_queries);
+ }
+
+ pg->handle_initialize(rctx);
+ pg->handle_activate_map(rctx);
+
+ dispatch_context(rctx, pg.get(), osdmap, nullptr);
+
+ dout(10) << __func__ << " new pg " << *pg << dendl;
+ return pg;
+}
+
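+// Return true (and queue the create for later) if instantiating this PG would
+// exceed mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio.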
+bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
+ spg_t pgid,
+ bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ if (num_pgs < max_pgs_per_osd) {
+ return false;
+ }
+
+ std::lock_guard l(pending_creates_lock);
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
+ pending_creates_from_osd.emplace(pgid, is_primary);
+ }
+ dout(1) << __func__ << " withhold creation of pg " << pgid
+ << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
+ return true;
+}
+
+// To re-trigger peering we have to twiddle the pg mapping a little bit;
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
+// to the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
+
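+// Once the PG count drops back below the hard limit, resume withheld PG
+// creations by re-soliciting creates from the mon and nudging peering for
+// osd-initiated creates via pg_temp.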
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ bool have_pending_creates = false;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ if (max_pgs_per_osd <= num_pgs) {
+ // this could happen if admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - num_pgs;
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon > 0) {
+ dout(20) << __func__ << " pending_creates_from_mon "
+ << pending_creates_from_mon << dendl;
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ dout(20) << __func__ << " pg " << pg->first << dendl;
+ vector<int> acting;
+ get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
+ service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
+ pg = pending_creates_from_osd.erase(pg);
+ do_sub_pg_creates = true;
+ spare_pgs--;
+ }
+ have_pending_creates = (pending_creates_from_mon > 0 ||
+ !pending_creates_from_osd.empty());
+ }
+
+ bool do_renew_subs = false;
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ do_renew_subs = true;
+ }
+ }
+ version_t start = get_osdmap_epoch() + 1;
+ if (have_pending_creates) {
+ // don't miss any new osdmap that deletes PGs
+ if (monc->sub_want("osdmap", start, 0)) {
+ dout(4) << __func__ << ": resolicit osdmap from mon since "
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ } else if (do_sub_pg_creates) {
+ // no need to subscribe to the osdmap continuously anymore
+ // once the pg_temp and/or mon_subscribe(pg_creates) is sent
+ if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
+ dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ }
+
+ if (do_renew_subs) {
+ monc->renew_subs();
+ }
+
+ service.send_pg_temp();
+}
+
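+// Reconstruct a newly created PG's history and past intervals by walking the
+// osdmaps from its creation epoch up to the current epoch.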
+void OSD::build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi)
+{
+ dout(10) << __func__ << " " << pgid << " created " << created << dendl;
+ *h = pg_history_t(created, created_stamp);
+
+ OSDMapRef lastmap = service.get_map(created);
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ lastmap->pg_to_up_acting_osds(
+ pgid.pgid, &up, &up_primary, &acting, &acting_primary);
+
+ ostringstream debug;
+ for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
+ OSDMapRef osdmap = service.get_map(e);
+ int new_up_primary, new_acting_primary;
+ vector<int> new_up, new_acting;
+ osdmap->pg_to_up_acting_osds(
+ pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
+
+ // this is a bit imprecise, but sufficient?
+ struct min_size_predicate_t : public IsPGRecoverablePredicate {
+ const pg_pool_t *pi;
+ bool operator()(const set<pg_shard_t> &have) const {
+ return have.size() >= pi->min_size;
+ }
+ explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
+ } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
+
+ bool new_interval = PastIntervals::check_new_interval(
+ acting_primary,
+ new_acting_primary,
+ acting, new_acting,
+ up_primary,
+ new_up_primary,
+ up, new_up,
+ h->same_interval_since,
+ h->last_epoch_clean,
+ osdmap.get(),
+ lastmap.get(),
+ pgid.pgid,
+ min_size_predicate,
+ pi,
+ &debug);
+ if (new_interval) {
+ h->same_interval_since = e;
+ if (up != new_up) {
+ h->same_up_since = e;
+ }
+ if (acting_primary != new_acting_primary) {
+ h->same_primary_since = e;
+ }
+ if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+ osdmap->get_pg_num(pgid.pgid.pool()),
+ nullptr)) {
+ h->last_epoch_split = e;
+ }
+ up = new_up;
+ acting = new_acting;
+ up_primary = new_up_primary;
+ acting_primary = new_acting_primary;
+ }
+ lastmap = osdmap;
+ }
+ dout(20) << __func__ << " " << debug.str() << dendl;
+ dout(10) << __func__ << " " << *h << " " << *pi
+ << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
+ pi->get_bounds()) << ")"
+ << dendl;
+}
+
+void OSD::_add_heartbeat_peer(int p)
+{
+ if (p == whoami)
+ return;
+ HeartbeatInfo *hi;
+
+ map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
+ if (i == heartbeat_peers.end()) {
+ pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
+ if (!cons.first)
+ return;
+ assert(cons.second);
+
+ hi = &heartbeat_peers[p];
+ hi->peer = p;
+
+ auto stamps = service.get_hb_stamps(p);
+
+ auto sb = ceph::make_ref<Session>(cct, cons.first.get());
+ sb->peer = p;
+ sb->stamps = stamps;
+ hi->hb_interval_start = ceph_clock_now();
+ hi->con_back = cons.first.get();
+ hi->con_back->set_priv(sb);
+
+ auto sf = ceph::make_ref<Session>(cct, cons.second.get());
+ sf->peer = p;
+ sf->stamps = stamps;
+ hi->con_front = cons.second.get();
+ hi->con_front->set_priv(sf);
+
+ dout(10) << "_add_heartbeat_peer: new peer osd." << p
+ << " " << hi->con_back->get_peer_addr()
+ << " " << hi->con_front->get_peer_addr()
+ << dendl;
+ } else {
+ hi = &i->second;
+ }
+ hi->epoch = get_osdmap_epoch();
+}
+
+void OSD::_remove_heartbeat_peer(int n)
+{
+ map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
+ ceph_assert(q != heartbeat_peers.end());
+ dout(20) << " removing heartbeat peer osd." << n
+ << " " << q->second.con_back->get_peer_addr()
+ << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
+ << dendl;
+ q->second.clear_mark_down();
+ heartbeat_peers.erase(q);
+}
+
+void OSD::need_heartbeat_peer_update()
+{
+ if (is_stopping())
+ return;
+ dout(20) << "need_heartbeat_peer_update" << dendl;
+ heartbeat_set_peers_need_update();
+}
+
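+// Rebuild the heartbeat peer set when flagged (or when the sample has gone
+// stale): peers of our PGs, the neighboring up OSDs, and enough random up
+// OSDs from distinct failure-domain subtrees, then trim down peers and
+// excess extras while honoring osd_heartbeat_min_peers.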
+void OSD::maybe_update_heartbeat_peers()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ if (is_waiting_for_healthy() || is_active()) {
+ utime_t now = ceph_clock_now();
+ if (last_heartbeat_resample == utime_t()) {
+ last_heartbeat_resample = now;
+ heartbeat_set_peers_need_update();
+ } else if (!heartbeat_peers_need_update()) {
+ utime_t dur = now - last_heartbeat_resample;
+ if (dur > cct->_conf->osd_heartbeat_grace) {
+ dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
+ heartbeat_set_peers_need_update();
+ last_heartbeat_resample = now;
+ // automatically clean up any stale heartbeat peers
+ // if we are unhealthy, then clean all
+ reset_heartbeat_peers(is_waiting_for_healthy());
+ }
+ }
+ }
+
+ if (!heartbeat_peers_need_update())
+ return;
+ heartbeat_clear_peers_need_update();
+
+ std::lock_guard l(heartbeat_lock);
+
+ dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
+
+
+ // build heartbeat from set
+ if (is_active()) {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ pg->with_heartbeat_peers([&](int peer) {
+ if (get_osdmap()->is_up(peer)) {
+ _add_heartbeat_peer(peer);
+ }
+ });
+ }
+ }
+
+ // include next and previous up osds to ensure we have a fully-connected set
+ set<int> want, extras;
+ const int next = get_osdmap()->get_next_up_osd_after(whoami);
+ if (next >= 0)
+ want.insert(next);
+ int prev = get_osdmap()->get_previous_up_osd_before(whoami);
+ if (prev >= 0 && prev != next)
+ want.insert(prev);
+
+ // make sure we have at least **min_down** osds coming from different
+ // subtrees at the configured level (e.g., hosts) for fast failure detection.
+ auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
+ auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
+ auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
+ get_osdmap()->get_random_up_osds_by_subtree(
+ whoami, subtree, limit, want, &want);
+
+ for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
+ dout(10) << " adding neighbor peer osd." << *p << dendl;
+ extras.insert(*p);
+ _add_heartbeat_peer(*p);
+ }
+
+ // remove down peers; enumerate extras
+ map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ while (p != heartbeat_peers.end()) {
+ if (!get_osdmap()->is_up(p->first)) {
+ int o = p->first;
+ ++p;
+ _remove_heartbeat_peer(o);
+ continue;
+ }
+ if (p->second.epoch < get_osdmap_epoch()) {
+ extras.insert(p->first);
+ }
+ ++p;
+ }
+
+ // too few?
+ for (int n = next; n >= 0; ) {
+ if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
+ break;
+ if (!extras.count(n) && !want.count(n) && n != whoami) {
+ dout(10) << " adding random peer osd." << n << dendl;
+ extras.insert(n);
+ _add_heartbeat_peer(n);
+ }
+ n = get_osdmap()->get_next_up_osd_after(n);
+ if (n == next)
+ break; // came full circle; stop
+ }
+
+ // too many?
+ for (set<int>::iterator p = extras.begin();
+ (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
+ ++p) {
+ if (want.count(*p))
+ continue;
+ _remove_heartbeat_peer(*p);
+ }
+
+ dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
+
+ // clean up stale failure pending
+ for (auto it = failure_pending.begin(); it != failure_pending.end();) {
+ if (heartbeat_peers.count(it->first) == 0) {
+ send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
+ failure_pending.erase(it++);
+ } else {
+ it++;
+ }
+ }
+}
+
+void OSD::reset_heartbeat_peers(bool all)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(10) << "reset_heartbeat_peers" << dendl;
+ utime_t stale = ceph_clock_now();
+ stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ std::lock_guard l(heartbeat_lock);
+ for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
+ auto& [peer, hi] = *it;
+ if (all || hi.is_stale(stale)) {
+ hi.clear_mark_down();
+ // stop sending failure_report to mon too
+ failure_queue.erase(peer);
+ failure_pending.erase(peer);
+ it = heartbeat_peers.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
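+// Handle an incoming MOSDPing: reply to PINGs (optionally dropping some for
+// debugging), account PING_REPLYs against the pending ping history and the
+// per-peer ping-time statistics, and resubscribe to osdmaps on YOU_DIED.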
+void OSD::handle_osd_ping(MOSDPing *m)
+{
+ if (superblock.cluster_fsid != m->fsid) {
+ dout(20) << "handle_osd_ping from " << m->get_source_inst()
+ << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
+ << dendl;
+ m->put();
+ return;
+ }
+
+ int from = m->get_source().num();
+
+ heartbeat_lock.lock();
+ if (is_stopping()) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+
+ utime_t now = ceph_clock_now();
+ auto mnow = service.get_mnow();
+ ConnectionRef con(m->get_connection());
+ OSDMapRef curmap = service.get_osdmap();
+ if (!curmap) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+
+ auto sref = con->get_priv();
+ Session *s = static_cast<Session*>(sref.get());
+ if (!s) {
+ heartbeat_lock.unlock();
+ m->put();
+ return;
+ }
+ if (!s->stamps) {
+ s->peer = from;
+ s->stamps = service.get_hb_stamps(from);
+ }
+
+ switch (m->op) {
+
+ case MOSDPing::PING:
+ {
+ if (cct->_conf->osd_debug_drop_ping_probability > 0) {
+ auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
+ if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
+ if (heartbeat_drop->second == 0) {
+ debug_heartbeat_drops_remaining.erase(heartbeat_drop);
+ } else {
+ --heartbeat_drop->second;
+ dout(5) << "Dropping heartbeat from " << from
+ << ", " << heartbeat_drop->second
+ << " remaining to drop" << dendl;
+ break;
+ }
+ } else if (cct->_conf->osd_debug_drop_ping_probability >
+ ((((double)(rand()%100))/100.0))) {
+ heartbeat_drop =
+ debug_heartbeat_drops_remaining.insert(std::make_pair(from,
+ cct->_conf->osd_debug_drop_ping_duration)).first;
+ dout(5) << "Dropping heartbeat from " << from
+ << ", " << heartbeat_drop->second
+ << " remaining to drop" << dendl;
+ break;
+ }
+ }
+
+ ceph::signedspan sender_delta_ub{};
+ s->stamps->got_ping(
+ m->up_from,
+ mnow,
+ m->mono_send_stamp,
+ m->delta_ub,
+ &sender_delta_ub);
+ dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
+
+ if (!cct->get_heartbeat_map()->is_healthy()) {
+ dout(10) << "internal heartbeat not healthy, dropping ping request"
+ << dendl;
+ break;
+ }
+
+ Message *r = new MOSDPing(monc->get_fsid(),
+ curmap->get_epoch(),
+ MOSDPing::PING_REPLY,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ sender_delta_ub);
+ con->send_message(r);
+
+ if (curmap->is_up(from)) {
+ if (is_active()) {
+ ConnectionRef cluster_con = service.get_con_osd_cluster(
+ from, curmap->get_epoch());
+ if (cluster_con) {
+ service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
+ }
+ }
+ } else if (!curmap->exists(from) ||
+ curmap->get_down_at(from) > m->map_epoch) {
+ // tell them they have died
+ Message *r = new MOSDPing(monc->get_fsid(),
+ curmap->get_epoch(),
+ MOSDPing::YOU_DIED,
+ m->ping_stamp,
+ m->mono_ping_stamp,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size);
+ con->send_message(r);
+ }
+ }
+ break;
+
+ case MOSDPing::PING_REPLY:
+ {
+ map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
+ if (i != heartbeat_peers.end()) {
+ auto acked = i->second.ping_history.find(m->ping_stamp);
+ if (acked != i->second.ping_history.end()) {
+ int &unacknowledged = acked->second.second;
+ if (con == i->second.con_back) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_tx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " -> " << now
+ << " last_rx_front " << i->second.last_rx_front
+ << dendl;
+ i->second.last_rx_back = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ // if there is no front con, set both stamps.
+ if (i->second.con_front == NULL) {
+ i->second.last_rx_front = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ }
+ } else if (con == i->second.con_front) {
+ dout(25) << "handle_osd_ping got reply from osd." << from
+ << " first_tx " << i->second.first_tx
+ << " last_tx " << i->second.last_tx
+ << " last_rx_back " << i->second.last_rx_back
+ << " last_rx_front " << i->second.last_rx_front
+ << " -> " << now
+ << dendl;
+ i->second.last_rx_front = now;
+ ceph_assert(unacknowledged > 0);
+ --unacknowledged;
+ }
+
+ if (unacknowledged == 0) {
+ // succeeded in getting all replies
+ dout(25) << "handle_osd_ping got all replies from osd." << from
+ << " , erase pending ping(sent at " << m->ping_stamp << ")"
+ << " and older pending ping(s)"
+ << dendl;
+
+#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
+ ++i->second.hb_average_count;
+ uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
+ i->second.hb_total_back += back_pingtime;
+ if (back_pingtime < i->second.hb_min_back)
+ i->second.hb_min_back = back_pingtime;
+ if (back_pingtime > i->second.hb_max_back)
+ i->second.hb_max_back = back_pingtime;
+ uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
+ i->second.hb_total_front += front_pingtime;
+ if (front_pingtime < i->second.hb_min_front)
+ i->second.hb_min_front = front_pingtime;
+ if (front_pingtime > i->second.hb_max_front)
+ i->second.hb_max_front = front_pingtime;
+
+ ceph_assert(i->second.hb_interval_start != utime_t());
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
+ int64_t hb_avg_time_period = 60;
+ if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
+ hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
+ }
+ if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
+ uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
+ uint32_t back_min = i->second.hb_min_back;
+ uint32_t back_max = i->second.hb_max_back;
+ uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
+ uint32_t front_min = i->second.hb_min_front;
+ uint32_t front_max = i->second.hb_max_front;
+
+ // Reset for new interval
+ i->second.hb_average_count = 0;
+ i->second.hb_interval_start = now;
+ i->second.hb_total_back = i->second.hb_max_back = 0;
+ i->second.hb_min_back = UINT_MAX;
+ i->second.hb_total_front = i->second.hb_max_front = 0;
+ i->second.hb_min_front = UINT_MAX;
+
+ // Record per-OSD interface ping times.
+ // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval.
+ if (i->second.hb_back_pingtime.size() == 0) {
+ ceph_assert(i->second.hb_front_pingtime.size() == 0);
+ for (unsigned k = 0 ; k < hb_vector_size; ++k) {
+ i->second.hb_back_pingtime.push_back(back_avg);
+ i->second.hb_back_min.push_back(back_min);
+ i->second.hb_back_max.push_back(back_max);
+ i->second.hb_front_pingtime.push_back(front_avg);
+ i->second.hb_front_min.push_back(front_min);
+ i->second.hb_front_max.push_back(front_max);
+ ++i->second.hb_index;
+ }
+ } else {
+ int index = i->second.hb_index & (hb_vector_size - 1);
+ i->second.hb_back_pingtime[index] = back_avg;
+ i->second.hb_back_min[index] = back_min;
+ i->second.hb_back_max[index] = back_max;
+ i->second.hb_front_pingtime[index] = front_avg;
+ i->second.hb_front_min[index] = front_min;
+ i->second.hb_front_max[index] = front_max;
+ ++i->second.hb_index;
+ }
+
+ {
+ std::lock_guard l(service.stat_lock);
+ service.osd_stat.hb_pingtime[from].last_update = now.sec();
+ service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
+
+ uint32_t total = 0;
+ uint32_t min = UINT_MAX;
+ uint32_t max = 0;
+ uint32_t count = 0;
+ uint32_t which = 0;
+ uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ int index = (i->second.hb_index + k) % size;
+ total += i->second.hb_back_pingtime[index];
+ if (i->second.hb_back_min[index] < min)
+ min = i->second.hb_back_min[index];
+ if (i->second.hb_back_max[index] > max)
+ max = i->second.hb_back_max[index];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
+ service.osd_stat.hb_pingtime[from].back_min[which] = min;
+ service.osd_stat.hb_pingtime[from].back_max[which] = max;
+ which++;
+ if (count == 15)
+ break;
+ }
+ }
+
+ if (i->second.con_front != NULL) {
+ service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
+
+ total = 0;
+ min = UINT_MAX;
+ max = 0;
+ count = 0;
+ which = 0;
+ for (int32_t k = size - 1 ; k >= 0; --k) {
+ ++count;
+ int index = (i->second.hb_index + k) % size;
+ total += i->second.hb_front_pingtime[index];
+ if (i->second.hb_front_min[index] < min)
+ min = i->second.hb_front_min[index];
+ if (i->second.hb_front_max[index] > max)
+ max = i->second.hb_front_max[index];
+ if (count == 1 || count == 5 || count == 15) {
+ service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
+ service.osd_stat.hb_pingtime[from].front_min[which] = min;
+ service.osd_stat.hb_pingtime[from].front_max[which] = max;
+ which++;
+ if (count == 15)
+ break;
+ }
+ }
+ }
+ }
+ } else {
+ std::lock_guard l(service.stat_lock);
+ service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
+ if (i->second.con_front != NULL)
+ service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
+ }
+ i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
+ }
+
+ if (i->second.is_healthy(now)) {
+ // Cancel false reports
+ auto failure_queue_entry = failure_queue.find(from);
+ if (failure_queue_entry != failure_queue.end()) {
+ dout(10) << "handle_osd_ping canceling queued "
+ << "failure report for osd." << from << dendl;
+ failure_queue.erase(failure_queue_entry);
+ }
+
+ auto failure_pending_entry = failure_pending.find(from);
+ if (failure_pending_entry != failure_pending.end()) {
+ dout(10) << "handle_osd_ping canceling in-flight "
+ << "failure report for osd." << from << dendl;
+ send_still_alive(curmap->get_epoch(),
+ from,
+ failure_pending_entry->second.second);
+ failure_pending.erase(failure_pending_entry);
+ }
+ }
+ } else {
+ // old replies, deprecated by newly sent pings.
+ dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
+ << ") is found, treat as covered by newly sent pings "
+ << "and ignore"
+ << dendl;
+ }
+ }
+
+ if (m->map_epoch &&
+ curmap->is_up(from)) {
+ if (is_active()) {
+ ConnectionRef cluster_con = service.get_con_osd_cluster(
+ from, curmap->get_epoch());
+ if (cluster_con) {
+ service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
+ }
+ }
+ }
+
+ s->stamps->got_ping_reply(
+ mnow,
+ m->mono_send_stamp,
+ m->delta_ub);
+ dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
+ }
+ break;
+
+ case MOSDPing::YOU_DIED:
+ dout(10) << "handle_osd_ping " << m->get_source_inst()
+ << " says i am down in " << m->map_epoch << dendl;
+ osdmap_subscribe(curmap->get_epoch()+1, false);
+ break;
+ }
+
+ heartbeat_lock.unlock();
+ m->put();
+}
+
+void OSD::heartbeat_entry()
+{
+ std::unique_lock l(heartbeat_lock);
+ if (is_stopping())
+ return;
+ while (!heartbeat_stop) {
+ heartbeat();
+
+ double wait;
+ if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
+ wait = (float)cct->_conf->osd_heartbeat_interval;
+ } else {
+ wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
+ }
+ auto w = ceph::make_timespan(wait);
+ dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
+ heartbeat_cond.wait_for(l, w);
+ if (is_stopping())
+ return;
+ dout(30) << "heartbeat_entry woke up" << dendl;
+ }
+}
+
+void OSD::heartbeat_check()
+{
+ ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
+ utime_t now = ceph_clock_now();
+
+ // check for incoming heartbeats (move me elsewhere?)
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p) {
+
+ if (p->second.first_tx == utime_t()) {
+ dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
+ << " yet, skipping" << dendl;
+ continue;
+ }
+
+ dout(25) << "heartbeat_check osd." << p->first
+ << " first_tx " << p->second.first_tx
+ << " last_tx " << p->second.last_tx
+ << " last_rx_back " << p->second.last_rx_back
+ << " last_rx_front " << p->second.last_rx_front
+ << dendl;
+ if (p->second.is_unhealthy(now)) {
+ utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
+ if (p->second.last_rx_back == utime_t() ||
+ p->second.last_rx_front == utime_t()) {
+ derr << "heartbeat_check: no reply from "
+ << p->second.con_front->get_peer_addr().get_sockaddr()
+ << " osd." << p->first
+ << " ever on either front or back, first ping sent "
+ << p->second.first_tx
+ << " (oldest deadline " << oldest_deadline << ")"
+ << dendl;
+ // fail
+ failure_queue[p->first] = p->second.first_tx;
+ } else {
+ derr << "heartbeat_check: no reply from "
+ << p->second.con_front->get_peer_addr().get_sockaddr()
+ << " osd." << p->first << " since back " << p->second.last_rx_back
+ << " front " << p->second.last_rx_front
+ << " (oldest deadline " << oldest_deadline << ")"
+ << dendl;
+ // fail
+ failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
+ }
+ }
+ }
+}
+
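+// Send a PING to every heartbeat peer on its back (and, if present, front)
+// connection, record the send and its deadline in ping_history, and refresh
+// the load average and fullness statistics.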
+void OSD::heartbeat()
+{
+ ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
+ dout(30) << "heartbeat" << dendl;
+
+ // get CPU load avg
+ double loadavgs[1];
+ int hb_interval = cct->_conf->osd_heartbeat_interval;
+ int n_samples = 86400;
+ if (hb_interval > 1) {
+ n_samples /= hb_interval;
+ if (n_samples < 1)
+ n_samples = 1;
+ }
+
+ if (getloadavg(loadavgs, 1) == 1) {
+ logger->set(l_osd_loadavg, 100 * loadavgs[0]);
+ daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
+ dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
+ }
+
+ dout(30) << "heartbeat checking stats" << dendl;
+
+ // refresh peer list and osd stats
+ vector<int> hb_peers;
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p)
+ hb_peers.push_back(p->first);
+
+ auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
+ dout(5) << __func__ << " " << new_stat << dendl;
+ ceph_assert(new_stat.statfs.total);
+
+ float pratio;
+ float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
+
+ service.check_full_status(ratio, pratio);
+
+ utime_t now = ceph_clock_now();
+ auto mnow = service.get_mnow();
+ utime_t deadline = now;
+ deadline += cct->_conf->osd_heartbeat_grace;
+
+ // send heartbeats
+ for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
+ i != heartbeat_peers.end();
+ ++i) {
+ int peer = i->first;
+ Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
+ if (!s) {
+ dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
+ continue;
+ }
+ dout(30) << "heartbeat sending ping to osd." << peer << dendl;
+
+ i->second.last_tx = now;
+ if (i->second.first_tx == utime_t())
+ i->second.first_tx = now;
+ i->second.ping_history[now] = make_pair(deadline,
+ HeartbeatInfo::HEARTBEAT_MAX_CONN);
+ if (i->second.hb_interval_start == utime_t())
+ i->second.hb_interval_start = now;
+
+ std::optional<ceph::signedspan> delta_ub;
+ s->stamps->sent_ping(&delta_ub);
+
+ i->second.con_back->send_message(
+ new MOSDPing(monc->get_fsid(),
+ service.get_osdmap_epoch(),
+ MOSDPing::PING,
+ now,
+ mnow,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ delta_ub));
+
+ if (i->second.con_front)
+ i->second.con_front->send_message(
+ new MOSDPing(monc->get_fsid(),
+ service.get_osdmap_epoch(),
+ MOSDPing::PING,
+ now,
+ mnow,
+ mnow,
+ service.get_up_epoch(),
+ cct->_conf->osd_heartbeat_min_size,
+ delta_ub));
+ }
+
+ logger->set(l_osd_hb_to, heartbeat_peers.size());
+
+ // hmm.. am i all alone?
+ dout(30) << "heartbeat lonely?" << dendl;
+ if (heartbeat_peers.empty()) {
+ if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
+ last_mon_heartbeat = now;
+ dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+ }
+
+ dout(30) << "heartbeat done" << dendl;
+}
+
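+// Called when a heartbeat connection resets: reopen connections to the peer
+// at its recorded epoch, or drop the peer if the osdmap has moved on.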
+bool OSD::heartbeat_reset(Connection *con)
+{
+ std::lock_guard l(heartbeat_lock);
+ auto s = con->get_priv();
+ dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
+ con->set_priv(nullptr);
+ if (s) {
+ if (is_stopping()) {
+ return true;
+ }
+ auto session = static_cast<Session*>(s.get());
+ auto p = heartbeat_peers.find(session->peer);
+ if (p != heartbeat_peers.end() &&
+ (p->second.con_back == con ||
+ p->second.con_front == con)) {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", reopening" << dendl;
+ p->second.clear_mark_down(con);
+ pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
+ if (newcon.first) {
+ p->second.con_back = newcon.first.get();
+ p->second.con_back->set_priv(s);
+ if (newcon.second) {
+ p->second.con_front = newcon.second.get();
+ p->second.con_front->set_priv(s);
+ }
+ p->second.ping_history.clear();
+ } else {
+ dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
+ << ", raced with osdmap update, closing out peer" << dendl;
+ heartbeat_peers.erase(p);
+ }
+ } else {
+ dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
+ }
+ }
+ return true;
+}
+
+
+
+// =========================================
+
+void OSD::tick()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(10) << "tick" << dendl;
+
+ utime_t now = ceph_clock_now();
+ // throw out any obsolete markdown log
+ utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
+ while (!osd_markdown_log.empty() &&
+ osd_markdown_log.front() + grace < now)
+ osd_markdown_log.pop_front();
+
+ if (is_active() || is_waiting_for_healthy()) {
+ maybe_update_heartbeat_peers();
+ }
+
+ if (is_waiting_for_healthy()) {
+ start_boot();
+ }
+
+ if (is_waiting_for_healthy() || is_booting()) {
+ std::lock_guard l(heartbeat_lock);
+ if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
+ last_mon_heartbeat = now;
+ dout(1) << __func__ << " checking mon for new map" << dendl;
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+ }
+
+ do_waiters();
+
+ // scrub purged_snaps every deep scrub interval
+ {
+ const utime_t last = superblock.last_purged_snaps_scrub;
+ utime_t next = last;
+ next += cct->_conf->osd_scrub_min_interval;
+ std::mt19937 rng;
+ // use a seed that is stable for each scrub interval, but varies
+ // by OSD to avoid any herds.
+ rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
+ double r = (rng() % 1024) / 1024.0;
+ next +=
+ cct->_conf->osd_scrub_min_interval *
+ cct->_conf->osd_scrub_interval_randomize_ratio * r;
+ if (next < ceph_clock_now()) {
+ dout(20) << __func__ << " last_purged_snaps_scrub " << last
+ << " next " << next << " ... now" << dendl;
+ scrub_purged_snaps();
+ } else {
+ dout(20) << __func__ << " last_purged_snaps_scrub " << last
+ << " next " << next << dendl;
+ }
+ }
+
+ tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
+}
+
+void OSD::tick_without_osd_lock()
+{
+ ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
+ dout(10) << "tick_without_osd_lock" << dendl;
+
+ logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
+ logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
+ logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
+
+ // refresh osd stats
+ struct store_statfs_t stbuf;
+ osd_alert_list_t alerts;
+ int r = store->statfs(&stbuf, &alerts);
+ ceph_assert(r == 0);
+ service.set_statfs(stbuf, alerts);
+
+ // osd_lock is not being held, which means the OSD state
+ // might change when doing the monitor report
+ if (is_active() || is_waiting_for_healthy()) {
+ {
+ std::lock_guard l{heartbeat_lock};
+ heartbeat_check();
+ }
+ map_lock.lock_shared();
+ std::lock_guard l(mon_report_lock);
+
+ // mon report?
+ utime_t now = ceph_clock_now();
+ if (service.need_fullness_update() ||
+ now - last_mon_report > cct->_conf->osd_mon_report_interval) {
+ last_mon_report = now;
+ send_full_update();
+ send_failures();
+ }
+ map_lock.unlock_shared();
+
+ epoch_t max_waiting_epoch = 0;
+ for (auto s : shards) {
+ max_waiting_epoch = std::max(max_waiting_epoch,
+ s->get_max_waiting_epoch());
+ }
+ if (max_waiting_epoch > get_osdmap()->get_epoch()) {
+ dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
+ << ", requesting new map" << dendl;
+ osdmap_subscribe(superblock.newest_map + 1, false);
+ }
+ }
+
+ if (is_active()) {
+ if (!scrub_random_backoff()) {
+ sched_scrub();
+ }
+ service.promote_throttle_recalibrate();
+ resume_creating_pg();
+ bool need_send_beacon = false;
+ const auto now = ceph::coarse_mono_clock::now();
+ {
+ // borrow lec lock to protect last_sent_beacon from changing
+ std::lock_guard l{min_last_epoch_clean_lock};
+ const auto elapsed = now - last_sent_beacon;
+ if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
+ cct->_conf->osd_beacon_report_interval) {
+ need_send_beacon = true;
+ }
+ }
+ if (need_send_beacon) {
+ send_beacon(now);
+ }
+ }
+
+ mgrc.update_daemon_health(get_health_metrics());
+ service.kick_recovery_queue();
+ tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
+ new C_Tick_WithoutOSDLock(this));
+}
+
+// Usage:
+// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
+// rmomapkey <pool-id> [namespace/]<obj-name> <key>
+// setomapheader <pool-id> [namespace/]<obj-name> <header>
+// getomap <pool> [namespace/]<obj-name>
+// truncobj <pool-id> [namespace/]<obj-name> <newlen>
+// injectmdataerr [namespace/]<obj-name> [shardid]
+// injectdataerr [namespace/]<obj-name> [shardid]
+//
+// set_recovery_delay [utime]
+void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
+ std::string_view command,
+ const cmdmap_t& cmdmap, ostream &ss)
+{
+ // Test support
+ // Support changing the omap on a single osd by using the Admin Socket to
+ // directly request the osd make a change.
+ if (command == "setomapval" || command == "rmomapkey" ||
+ command == "setomapheader" || command == "getomap" ||
+ command == "truncobj" || command == "injectmdataerr" ||
+ command == "injectdataerr"
+ ) {
+ pg_t rawpg;
+ int64_t pool;
+ OSDMapRef curmap = service->get_osdmap();
+ int r = -1;
+
+ string poolstr;
+
+ cmd_getval(cmdmap, "pool", poolstr);
+ pool = curmap->lookup_pg_pool_name(poolstr);
+ // If we can't find it by name, then maybe an id was specified
+ if (pool < 0 && isdigit(poolstr[0]))
+ pool = atoll(poolstr.c_str());
+ if (pool < 0) {
+ ss << "Invalid pool '" << poolstr << "''";
+ return;
+ }
+
+ string objname, nspace;
+ cmd_getval(cmdmap, "objname", objname);
+ std::size_t found = objname.find_first_of('/');
+ if (found != string::npos) {
+ nspace = objname.substr(0, found);
+ objname = objname.substr(found+1);
+ }
+ object_locator_t oloc(pool, nspace);
+ r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
+
+ if (r < 0) {
+ ss << "Invalid namespace/objname";
+ return;
+ }
+
+ int64_t shardid;
+ cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
+ hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
+ ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
+ spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
+ if (curmap->pg_is_ec(rawpg)) {
+ if ((command != "injectdataerr") && (command != "injectmdataerr")) {
+ ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
+ return;
+ }
+ }
+
+ ObjectStore::Transaction t;
+
+ if (command == "setomapval") {
+ map<string, bufferlist> newattrs;
+ bufferlist val;
+ string key, valstr;
+ cmd_getval(cmdmap, "key", key);
+ cmd_getval(cmdmap, "val", valstr);
+
+ val.append(valstr);
+ newattrs[key] = val;
+ t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "rmomapkey") {
+ string key;
+ cmd_getval(cmdmap, "key", key);
+
+ t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "setomapheader") {
+ bufferlist newheader;
+ string headerstr;
+
+ cmd_getval(cmdmap, "header", headerstr);
+ newheader.append(headerstr);
+ t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "getomap") {
+ // Debug: output entire omap
+ bufferlist hdrbl;
+ map<string, bufferlist> keyvals;
+ auto ch = store->open_collection(coll_t(pgid));
+ if (!ch) {
+ ss << "unable to open collection for " << pgid;
+ r = -ENOENT;
+ } else {
+ r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
+ if (r >= 0) {
+ ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
+ for (map<string, bufferlist>::iterator it = keyvals.begin();
+ it != keyvals.end(); ++it)
+ ss << " key=" << (*it).first << " val="
+ << string((*it).second.c_str(), (*it).second.length());
+ } else {
+ ss << "error=" << r;
+ }
+ }
+ } else if (command == "truncobj") {
+ int64_t trunclen;
+ cmd_getval(cmdmap, "len", trunclen);
+ t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
+ r = store->queue_transaction(service->meta_ch, std::move(t));
+ if (r < 0)
+ ss << "error=" << r;
+ else
+ ss << "ok";
+ } else if (command == "injectdataerr") {
+ store->inject_data_error(gobj);
+ ss << "ok";
+ } else if (command == "injectmdataerr") {
+ store->inject_mdata_error(gobj);
+ ss << "ok";
+ }
+ return;
+ }
+ if (command == "set_recovery_delay") {
+ int64_t delay;
+ cmd_getval(cmdmap, "utime", delay, (int64_t)0);
+ ostringstream oss;
+ oss << delay;
+ int r = service->cct->_conf.set_val("osd_recovery_delay_start",
+ oss.str().c_str());
+ if (r != 0) {
+ ss << "set_recovery_delay: error setting "
+ << "osd_recovery_delay_start to '" << delay << "': error "
+ << r;
+ return;
+ }
+ service->cct->_conf.apply_changes(nullptr);
+ ss << "set_recovery_delay: set osd_recovery_delay_start "
+ << "to " << service->cct->_conf->osd_recovery_delay_start;
+ return;
+ }
+ if (command == "injectfull") {
+ int64_t count;
+ string type;
+ OSDService::s_names state;
+ cmd_getval(cmdmap, "type", type, string("full"));
+ cmd_getval(cmdmap, "count", count, (int64_t)-1);
+ if (type == "none" || count == 0) {
+ type = "none";
+ count = 0;
+ }
+ state = service->get_full_state(type);
+ if (state == OSDService::s_names::INVALID) {
+ ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
+ return;
+ }
+ service->set_injectfull(state, count);
+ return;
+ }
+ ss << "Internal error - command=" << command;
+}
+
+// =========================================
+
+void OSD::ms_handle_connect(Connection *con)
+{
+ dout(10) << __func__ << " con " << con << dendl;
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
+ std::lock_guard l(osd_lock);
+ if (is_stopping())
+ return;
+ dout(10) << __func__ << " on mon" << dendl;
+
+ if (is_preboot()) {
+ start_boot();
+ } else if (is_booting()) {
+ _send_boot(); // resend boot message
+ } else {
+ map_lock.lock_shared();
+ std::lock_guard l2(mon_report_lock);
+
+ utime_t now = ceph_clock_now();
+ last_mon_report = now;
+
+ // resend everything, it's a new session
+ send_full_update();
+ send_alive();
+ service.requeue_pg_temp();
+ service.clear_sent_ready_to_merge();
+ service.send_pg_temp();
+ service.send_ready_to_merge();
+ service.send_pg_created();
+ requeue_failures();
+ send_failures();
+
+ map_lock.unlock_shared();
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ }
+
+ // full map requests may happen while active or pre-boot
+ if (requested_full_first) {
+ rerequest_full_maps();
+ }
+ }
+}
+
+void OSD::ms_handle_fast_connect(Connection *con)
+{
+ if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
+ con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+ dout(10) << " new session (outgoing) " << s << " con=" << s->con
+ << " addr=" << s->con->get_peer_addr() << dendl;
+ // we don't connect to clients
+ ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
+ s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
+ }
+ }
+}
+
+void OSD::ms_handle_fast_accept(Connection *con)
+{
+ if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
+ con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+ dout(10) << "new session (incoming)" << s << " con=" << con
+ << " addr=" << con->get_peer_addr()
+ << " must have raced with connect" << dendl;
+ ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
+ s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
+ }
+ }
+}
+
+bool OSD::ms_handle_reset(Connection *con)
+{
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
+ if (!session)
+ return false;
+ session->wstate.reset(con);
+ session->con->set_priv(nullptr);
+ session->con.reset(); // break con <-> session ref cycle
+ // note that we break session->con *before* the session_handle_reset
+ // cleanup below. this avoids a race between us and
+ // PG::add_backoff, Session::check_backoff, etc.
+ session_handle_reset(session);
+ return true;
+}
+
+bool OSD::ms_handle_refused(Connection *con)
+{
+ if (!cct->_conf->osd_fast_fail_on_connection_refused)
+ return false;
+
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
+ if (!session)
+ return false;
+ int type = con->get_peer_type();
+ // handle only OSD failures here
+ if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
+ OSDMapRef osdmap = get_osdmap();
+ if (osdmap) {
+ int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
+ if (id >= 0 && osdmap->is_up(id)) {
+ // We are side-stepping the mon heartbeat grace logic here, because we know
+ // the peer is not going to respawn on its own. +1 so we won't hit any
+ // boundary case.
+ monc->send_mon_message(
+ new MOSDFailure(
+ monc->get_fsid(),
+ id,
+ osdmap->get_addrs(id),
+ cct->_conf->osd_heartbeat_grace + 1,
+ osdmap->get_epoch(),
+ MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
+ ));
+ }
+ }
+ }
+ return true;
+}
+
+struct CB_OSD_GetVersion {
+ OSD *osd;
+ explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
+ void operator ()(boost::system::error_code ec, version_t newest,
+ version_t oldest) {
+ if (!ec)
+ osd->_got_mon_epochs(oldest, newest);
+ }
+};
+
+void OSD::start_boot()
+{
+ if (!_is_healthy()) {
+ // if we are not healthy, do not mark ourselves up (yet)
+ dout(1) << "not healthy; waiting to boot" << dendl;
+ if (!is_waiting_for_healthy())
+ start_waiting_for_healthy();
+ // send pings sooner rather than later
+ heartbeat_kick();
+ return;
+ }
+ dout(1) << __func__ << dendl;
+ set_state(STATE_PREBOOT);
+ dout(10) << "start_boot - have maps " << superblock.oldest_map
+ << ".." << superblock.newest_map << dendl;
+ monc->get_version("osdmap", CB_OSD_GetVersion(this));
+}
+
+void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
+{
+ std::lock_guard l(osd_lock);
+ if (is_preboot()) {
+ _preboot(oldest, newest);
+ }
+}
+
+void OSD::_preboot(epoch_t oldest, epoch_t newest)
+{
+ ceph_assert(is_preboot());
+ dout(10) << __func__ << " _preboot mon has osdmaps "
+ << oldest << ".." << newest << dendl;
+
+ // ensure our local fullness awareness is accurate
+ {
+ std::lock_guard l(heartbeat_lock);
+ heartbeat();
+ }
+
+ const auto& monmap = monc->monmap;
+ const auto osdmap = get_osdmap();
+ // if our map is within recent history, try to add ourselves to the osdmap.
+ if (osdmap->get_epoch() == 0) {
+ derr << "waiting for initial osdmap" << dendl;
+ } else if (osdmap->is_destroyed(whoami)) {
+ derr << "osdmap says I am destroyed" << dendl;
+ // provide a small margin so we don't livelock while checking whether we
+ // have been un-destroyed.
+ if (osdmap->get_epoch() > newest - 1) {
+ exit(0);
+ }
+ } else if (osdmap->is_noup(whoami)) {
+ derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
+ } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
+ << dendl;
+ } else if (service.need_fullness_update()) {
+ derr << "osdmap fullness state needs update" << dendl;
+ send_full_update();
+ } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
+ superblock.purged_snaps_last < superblock.current_epoch) {
+ dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
+ << " < newest_map " << superblock.current_epoch << dendl;
+ _get_purged_snaps();
+ } else if (osdmap->get_epoch() >= oldest - 1 &&
+ osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
+
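+ // our map is recent enough that the mon can catch us up with incremental
+ // maps (the gap to the newest epoch fits within osd_map_message_max), so
+ // proceed toward booting.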
+ // wait for pgs to fully catch up in a different thread, since
+ // this thread might be required for splitting and merging PGs to
+ // make progress.
+ boot_finisher.queue(
+ new LambdaContext(
+ [this](int r) {
+ std::unique_lock l(osd_lock);
+ if (is_preboot()) {
+ dout(10) << __func__ << " waiting for peering work to drain"
+ << dendl;
+ l.unlock();
+ for (auto shard : shards) {
+ shard->wait_min_pg_epoch(get_osdmap_epoch());
+ }
+ l.lock();
+ }
+ if (is_preboot()) {
+ _send_boot();
+ }
+ }));
+ return;
+ }
+
+ // get all the latest maps
+ if (osdmap->get_epoch() + 1 >= oldest)
+ osdmap_subscribe(osdmap->get_epoch() + 1, false);
+ else
+ osdmap_subscribe(oldest - 1, true);
+}
+
+void OSD::_get_purged_snaps()
+{
+ // NOTE: this is a naive, stateless implementation. it may send multiple
+ // overlapping requests to the mon, which will be somewhat inefficient, but
+ // it should be reliable.
+ dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
+ << ", newest_map " << superblock.current_epoch << dendl;
+ MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
+ superblock.purged_snaps_last + 1,
+ superblock.current_epoch + 1);
+ monc->send_mon_message(m);
+}
+
+void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ ObjectStore::Transaction t;
+ if (!is_preboot() ||
+ m->last < superblock.purged_snaps_last) {
+ goto out;
+ }
+ SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
+ make_purged_snaps_oid(), &t,
+ m->purged_snaps);
+ superblock.purged_snaps_last = m->last;
+ write_superblock(t);
+ store->queue_transaction(
+ service.meta_ch,
+ std::move(t));
+ service.publish_superblock(superblock);
+ if (m->last < superblock.current_epoch) {
+ _get_purged_snaps();
+ } else {
+ start_boot();
+ }
+out:
+ m->put();
+}
+
+void OSD::send_full_update()
+{
+ if (!service.need_fullness_update())
+ return;
+ unsigned state = 0;
+ if (service.is_full()) {
+ state = CEPH_OSD_FULL;
+ } else if (service.is_backfillfull()) {
+ state = CEPH_OSD_BACKFILLFULL;
+ } else if (service.is_nearfull()) {
+ state = CEPH_OSD_NEARFULL;
+ }
+ set<string> s;
+ OSDMap::calc_state_set(state, s);
+ dout(10) << __func__ << " want state " << s << dendl;
+ monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
+}
+
+void OSD::start_waiting_for_healthy()
+{
+ dout(1) << "start_waiting_for_healthy" << dendl;
+ set_state(STATE_WAITING_FOR_HEALTHY);
+ last_heartbeat_resample = utime_t();
+
+ // subscribe to osdmap updates, in case our peers really are known to be dead
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+}
+
+bool OSD::_is_healthy()
+{
+ if (!cct->get_heartbeat_map()->is_healthy()) {
+ dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
+ return false;
+ }
+
+ if (is_waiting_for_healthy()) {
+ utime_t now = ceph_clock_now();
+ if (osd_markdown_log.empty()) {
+ dout(5) << __func__ << " force returning true since last markdown"
+ << " was " << cct->_conf->osd_max_markdown_period
+ << "s ago" << dendl;
+ return true;
+ }
+ std::lock_guard l(heartbeat_lock);
+ int num = 0, up = 0;
+ for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
+ p != heartbeat_peers.end();
+ ++p) {
+ if (p->second.is_healthy(now))
+ ++up;
+ ++num;
+ }
+ if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
+ dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
+ << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void OSD::_send_boot()
+{
+ dout(10) << "_send_boot" << dendl;
+ Connection *local_connection =
+ cluster_messenger->get_loopback_connection().get();
+ entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
+ entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
+ entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
+ entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
+
+ dout(20) << " initial client_addrs " << client_addrs
+ << ", cluster_addrs " << cluster_addrs
+ << ", hb_back_addrs " << hb_back_addrs
+ << ", hb_front_addrs " << hb_front_addrs
+ << dendl;
+ if (cluster_messenger->set_addr_unknowns(client_addrs)) {
+ dout(10) << " assuming cluster_addrs match client_addrs "
+ << client_addrs << dendl;
+ cluster_addrs = cluster_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ local_connection = hb_back_server_messenger->get_loopback_connection().get();
+ if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
+ dout(10) << " assuming hb_back_addrs match cluster_addrs "
+ << cluster_addrs << dendl;
+ hb_back_addrs = hb_back_server_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ local_connection = hb_front_server_messenger->get_loopback_connection().get();
+ if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
+ dout(10) << " assuming hb_front_addrs match client_addrs "
+ << client_addrs << dendl;
+ hb_front_addrs = hb_front_server_messenger->get_myaddrs();
+ }
+ if (auto session = local_connection->get_priv(); !session) {
+ hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
+ }
+
+ // we now know what our front and back addrs will be, and we are
+ // about to tell the mon what our metadata (including numa bindings)
+ // are, so now is a good time!
+ set_numa_affinity();
+
+ MOSDBoot *mboot = new MOSDBoot(
+ superblock, get_osdmap_epoch(), service.get_boot_epoch(),
+ hb_back_addrs, hb_front_addrs, cluster_addrs,
+ CEPH_FEATURES_ALL);
+ dout(10) << " final client_addrs " << client_addrs
+ << ", cluster_addrs " << cluster_addrs
+ << ", hb_back_addrs " << hb_back_addrs
+ << ", hb_front_addrs " << hb_front_addrs
+ << dendl;
+ _collect_metadata(&mboot->metadata);
+ monc->send_mon_message(mboot);
+ set_state(STATE_BOOTING);
+}
+
+void OSD::_collect_metadata(map<string,string> *pm)
+{
+ // config info
+ (*pm)["osd_data"] = dev_path;
+ if (store->get_type() == "filestore") {
+ // not applicable for bluestore
+ (*pm)["osd_journal"] = journal_path;
+ }
+ (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
+ (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
+ (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
+ (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
+
+ // backend
+ (*pm)["osd_objectstore"] = store->get_type();
+ (*pm)["rotational"] = store_is_rotational ? "1" : "0";
+ (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
+ (*pm)["default_device_class"] = store->get_default_device_class();
+ string osdspec_affinity;
+ int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
+ if (r < 0 || osdspec_affinity.empty()) {
+ osdspec_affinity = "";
+ }
+ (*pm)["osdspec_affinity"] = osdspec_affinity;
+ string ceph_version_when_created;
+ r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
+ if (r < 0 || ceph_version_when_created.empty()) {
+ ceph_version_when_created = "";
+ }
+ (*pm)["ceph_version_when_created"] = ceph_version_when_created;
+ string created_at;
+ r = store->read_meta("created_at", &created_at);
+ if (r < 0 || created_at.empty()) {
+ created_at = "";
+ }
+ (*pm)["created_at"] = created_at;
+ store->collect_metadata(pm);
+
+ collect_sys_info(pm, cct);
+
+ (*pm)["front_iface"] = pick_iface(
+ cct,
+ client_messenger->get_myaddrs().front().get_sockaddr_storage());
+ (*pm)["back_iface"] = pick_iface(
+ cct,
+ cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
+
+ // network numa
+ {
+ int node = -1;
+ set<int> nodes;
+ set<string> unknown;
+ for (auto nm : { "front_iface", "back_iface" }) {
+ if (!(*pm)[nm].size()) {
+ unknown.insert(nm);
+ continue;
+ }
+ int n = -1;
+ int r = get_iface_numa_node((*pm)[nm], &n);
+ if (r < 0) {
+ unknown.insert((*pm)[nm]);
+ continue;
+ }
+ nodes.insert(n);
+ if (node < 0) {
+ node = n;
+ }
+ }
+ if (unknown.size()) {
+ (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
+ }
+ if (!nodes.empty()) {
+ (*pm)["network_numa_nodes"] = stringify(nodes);
+ }
+ if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
+ (*pm)["network_numa_node"] = stringify(node);
+ }
+ }
+
+ if (numa_node >= 0) {
+ (*pm)["numa_node"] = stringify(numa_node);
+ (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
+ &numa_cpu_set);
+ }
+
+ set<string> devnames;
+ store->get_devices(&devnames);
+ map<string,string> errs;
+ get_device_metadata(devnames, pm, &errs);
+ for (auto& i : errs) {
+ dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
+ }
+ dout(10) << __func__ << " " << *pm << dendl;
+}
+
+void OSD::queue_want_up_thru(epoch_t want)
+{
+ std::shared_lock map_locker{map_lock};
+ epoch_t cur = get_osdmap()->get_up_thru(whoami);
+ std::lock_guard report_locker(mon_report_lock);
+ if (want > up_thru_wanted) {
+ dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
+ << ", currently " << cur
+ << dendl;
+ up_thru_wanted = want;
+ send_alive();
+ } else {
+ dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
+ << ", currently " << cur
+ << dendl;
+ }
+}
+
+void OSD::send_alive()
+{
+ ceph_assert(ceph_mutex_is_locked(mon_report_lock));
+ const auto osdmap = get_osdmap();
+ if (!osdmap->exists(whoami))
+ return;
+ epoch_t up_thru = osdmap->get_up_thru(whoami);
+ dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
+ if (up_thru_wanted > up_thru) {
+ dout(10) << "send_alive want " << up_thru_wanted << dendl;
+ monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
+ }
+}
+
+void OSD::request_full_map(epoch_t first, epoch_t last)
+{
+ dout(10) << __func__ << " " << first << ".." << last
+ << ", previously requested "
+ << requested_full_first << ".." << requested_full_last << dendl;
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ ceph_assert(first > 0 && last > 0);
+ ceph_assert(first <= last);
+ ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
+ if (requested_full_first == 0) {
+ // first request
+ requested_full_first = first;
+ requested_full_last = last;
+ } else if (last <= requested_full_last) {
+ // dup
+ return;
+ } else {
+ // additional request
+ first = requested_full_last + 1;
+ requested_full_last = last;
+ }
+ MMonGetOSDMap *req = new MMonGetOSDMap;
+ req->request_full(first, last);
+ monc->send_mon_message(req);
+}
+
+void OSD::got_full_map(epoch_t e)
+{
+ ceph_assert(requested_full_first <= requested_full_last);
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ if (requested_full_first == 0) {
+ dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
+ return;
+ }
+ if (e < requested_full_first) {
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last
+ << ", ignoring" << dendl;
+ return;
+ }
+ if (e >= requested_full_last) {
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last << ", resetting" << dendl;
+ requested_full_first = requested_full_last = 0;
+ return;
+ }
+
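+ // maps are handled in epoch order here, so we only need to advance the
+ // start of the outstanding requested range past this epoch.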
+ requested_full_first = e + 1;
+
+ dout(10) << __func__ << " " << e << ", requested " << requested_full_first
+ << ".." << requested_full_last
+ << ", still need more" << dendl;
+}
+
+void OSD::requeue_failures()
+{
+ std::lock_guard l(heartbeat_lock);
+ unsigned old_queue = failure_queue.size();
+ unsigned old_pending = failure_pending.size();
+ for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
+ failure_queue[p->first] = p->second.first;
+ failure_pending.erase(p++);
+ }
+ dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
+ << failure_queue.size() << dendl;
+}
+
+void OSD::send_failures()
+{
+ ceph_assert(ceph_mutex_is_locked(map_lock));
+ ceph_assert(ceph_mutex_is_locked(mon_report_lock));
+ std::lock_guard l(heartbeat_lock);
+ utime_t now = ceph_clock_now();
+ const auto osdmap = get_osdmap();
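+ // report each queued failure to the mon at most once, remembering it in
+ // failure_pending so it can later be cancelled (send_still_alive) or
+ // requeued (requeue_failures) if needed.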
+ while (!failure_queue.empty()) {
+ int osd = failure_queue.begin()->first;
+ if (!failure_pending.count(osd)) {
+ int failed_for = (int)(double)(now - failure_queue.begin()->second);
+ monc->send_mon_message(
+ new MOSDFailure(
+ monc->get_fsid(),
+ osd,
+ osdmap->get_addrs(osd),
+ failed_for,
+ osdmap->get_epoch()));
+ failure_pending[osd] = make_pair(failure_queue.begin()->second,
+ osdmap->get_addrs(osd));
+ }
+ failure_queue.erase(osd);
+ }
+}
+
+void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
+{
+ MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
+ MOSDFailure::FLAG_ALIVE);
+ monc->send_mon_message(m);
+}
+
+void OSD::cancel_pending_failures()
+{
+ std::lock_guard l(heartbeat_lock);
+ auto it = failure_pending.begin();
+ while (it != failure_pending.end()) {
+ dout(10) << __func__ << " canceling in-flight failure report for osd."
+ << it->first << dendl;
+ send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
+ failure_pending.erase(it++);
+ }
+}
+
+void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
+{
+ const auto& monmap = monc->monmap;
+ // this can be called right after we connect to the mon, before the monmap
+ // has been initialized; only send the beacon once the monmap is known and
+ // the mons support the luminous features.
+ if (monmap.epoch > 0 &&
+ monmap.get_required_features().contains_all(
+ ceph::features::mon::FEATURE_LUMINOUS)) {
+ dout(20) << __func__ << " sending" << dendl;
+ MOSDBeacon* beacon = nullptr;
+ {
+ std::lock_guard l{min_last_epoch_clean_lock};
+ beacon = new MOSDBeacon(get_osdmap_epoch(),
+ min_last_epoch_clean,
+ superblock.last_purged_snaps_scrub,
+ cct->_conf->osd_beacon_report_interval);
+ beacon->pgs = min_last_epoch_clean_pgs;
+ last_sent_beacon = now;
+ }
+ monc->send_mon_message(beacon);
+ } else {
+ dout(20) << __func__ << " not sending" << dendl;
+ }
+}
+
+void OSD::handle_command(MCommand *m)
+{
+ ConnectionRef con = m->get_connection();
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ if (!session) {
+ con->send_message(new MCommandReply(m, -EACCES));
+ m->put();
+ return;
+ }
+ if (!session->caps.allow_all()) {
+ con->send_message(new MCommandReply(m, -EACCES));
+ m->put();
+ return;
+ }
+ cct->get_admin_socket()->queue_tell_command(m);
+ m->put();
+}
+
+namespace {
+ class unlock_guard {
+ ceph::mutex& m;
+ public:
+ explicit unlock_guard(ceph::mutex& mutex)
+ : m(mutex)
+ {
+ m.unlock();
+ }
+ unlock_guard(unlock_guard&) = delete;
+ ~unlock_guard() {
+ m.lock();
+ }
+ };
+}
+
+void OSD::scrub_purged_snaps()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ SnapMapper::Scrubber s(cct, store, service.meta_ch,
+ make_snapmapper_oid(),
+ make_purged_snaps_oid());
+ clog->debug() << "purged_snaps scrub starts";
+ osd_lock.unlock();
+ s.run();
+ if (s.stray.size()) {
+ clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
+ } else {
+ clog->debug() << "purged_snaps scrub ok";
+ }
+ set<pair<spg_t,snapid_t>> queued;
+ for (auto& [pool, snap, hash, shard] : s.stray) {
+ const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
+ if (!pi) {
+ dout(20) << __func__ << " pool " << pool << " dne" << dendl;
+ continue;
+ }
+ pg_t pgid(pi->raw_hash_to_pg(hash), pool);
+ spg_t spgid(pgid, shard);
+ pair<spg_t,snapid_t> p(spgid, snap);
+ if (queued.count(p)) {
+ dout(20) << __func__ << " pg " << spgid << " snap " << snap
+ << " already queued" << dendl;
+ continue;
+ }
+ PGRef pg = lookup_lock_pg(spgid);
+ if (!pg) {
+ dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
+ continue;
+ }
+ queued.insert(p);
+ dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
+ << snap << dendl;
+ pg->queue_snap_retrim(snap);
+ pg->unlock();
+ }
+ osd_lock.lock();
+ if (is_stopping()) {
+ return;
+ }
+ dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
+ ObjectStore::Transaction t;
+ superblock.last_purged_snaps_scrub = ceph_clock_now();
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ if (is_active()) {
+ send_beacon(ceph::coarse_mono_clock::now());
+ }
+ dout(10) << __func__ << " done" << dendl;
+}
+
+void OSD::probe_smart(const string& only_devid, ostream& ss)
+{
+ set<string> devnames;
+ store->get_devices(&devnames);
+ uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
+ "osd_smart_report_timeout");
+
+ // == typedef std::map<std::string, mValue> mObject;
+ json_spirit::mObject json_map;
+
+ for (auto dev : devnames) {
+ // smartctl works only on physical devices; filter out any logical device
+ if (dev.find("dm-") == 0) {
+ continue;
+ }
+
+ string err;
+ string devid = get_device_id(dev, &err);
+ if (devid.size() == 0) {
+ dout(10) << __func__ << " no unique id for dev " << dev << " ("
+ << err << "), skipping" << dendl;
+ continue;
+ }
+ if (only_devid.size() && devid != only_devid) {
+ continue;
+ }
+
+ json_spirit::mValue smart_json;
+ if (block_device_get_metrics(dev, smart_timeout,
+ &smart_json)) {
+ dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
+ continue;
+ }
+ json_map[devid] = smart_json;
+ }
+ json_spirit::write(json_map, ss, json_spirit::pretty_print);
+}
+
+bool OSD::heartbeat_dispatch(Message *m)
+{
+ dout(30) << "heartbeat_dispatch " << m << dendl;
+ switch (m->get_type()) {
+
+ case CEPH_MSG_PING:
+ dout(10) << "ping from " << m->get_source_inst() << dendl;
+ m->put();
+ break;
+
+ case MSG_OSD_PING:
+ handle_osd_ping(static_cast<MOSDPing*>(m));
+ break;
+
+ default:
+ dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
+ m->put();
+ }
+
+ return true;
+}
+
+bool OSD::ms_dispatch(Message *m)
+{
+ dout(20) << "OSD::ms_dispatch: " << *m << dendl;
+ if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
+ service.got_stop_ack();
+ m->put();
+ return true;
+ }
+
+ // lock!
+
+ osd_lock.lock();
+ if (is_stopping()) {
+ osd_lock.unlock();
+ m->put();
+ return true;
+ }
+
+ do_waiters();
+ _dispatch(m);
+
+ osd_lock.unlock();
+
+ return true;
+}
+
+void OSDService::maybe_share_map(
+ Connection *con,
+ const OSDMapRef& osdmap,
+ epoch_t peer_epoch_lb)
+{
+ // NOTE: we assume the caller holds something that keeps the Connection
+ // itself pinned (e.g., an OpRequest's MessageRef).
+ auto session = ceph::ref_cast<Session>(con->get_priv());
+ if (!session) {
+ return;
+ }
+
+ // assume the peer has the newer of the op's sent_epoch and what
+ // we think we sent them.
+ session->sent_epoch_lock.lock();
+ if (peer_epoch_lb > session->last_sent_epoch) {
+ dout(10) << __func__ << " con " << con
+ << " " << con->get_peer_addr()
+ << " map epoch " << session->last_sent_epoch
+ << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
+ session->last_sent_epoch = peer_epoch_lb;
+ }
+ epoch_t last_sent_epoch = session->last_sent_epoch;
+ session->sent_epoch_lock.unlock();
+
+ if (osdmap->get_epoch() <= last_sent_epoch) {
+ return;
+ }
+
+ send_incremental_map(last_sent_epoch, con, osdmap);
+ last_sent_epoch = osdmap->get_epoch();
+
+ session->sent_epoch_lock.lock();
+ if (session->last_sent_epoch < last_sent_epoch) {
+ dout(10) << __func__ << " con " << con
+ << " " << con->get_peer_addr()
+ << " map epoch " << session->last_sent_epoch
+ << " -> " << last_sent_epoch << " (shared)" << dendl;
+ session->last_sent_epoch = last_sent_epoch;
+ }
+ session->sent_epoch_lock.unlock();
+}
+
+void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
+{
+ ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
+
+ auto i = session->waiting_on_map.begin();
+ while (i != session->waiting_on_map.end()) {
+ OpRequestRef op = &(*i);
+ ceph_assert(ms_can_fast_dispatch(op->get_req()));
+ auto m = op->get_req<MOSDFastDispatchOp>();
+ if (m->get_min_epoch() > osdmap->get_epoch()) {
+ break;
+ }
+ session->waiting_on_map.erase(i++);
+ op->put();
+
+ spg_t pgid;
+ if (m->get_type() == CEPH_MSG_OSD_OP) {
+ pg_t actual_pgid = osdmap->raw_pg_to_pg(
+ static_cast<const MOSDOp*>(m)->get_pg());
+ if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
+ continue;
+ }
+ } else {
+ pgid = m->get_spg();
+ }
+ enqueue_op(pgid, std::move(op), m->get_map_epoch());
+ }
+
+ if (session->waiting_on_map.empty()) {
+ clear_session_waiting_on_map(session);
+ } else {
+ register_session_waiting_on_map(session);
+ }
+}
+
+void OSD::ms_fast_dispatch(Message *m)
+{
+
+#ifdef HAVE_JAEGER
+ jaeger_tracing::init_tracer("osd-services-reinit");
+ dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
+ auto dispatch_span = jaeger_tracing::new_span(__func__);
+#endif
+ FUNCTRACE(cct);
+ if (service.is_stopping()) {
+ m->put();
+ return;
+ }
+
+ // peering event?
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ dout(10) << "ping from " << m->get_source() << dendl;
+ m->put();
+ return;
+ case MSG_OSD_FORCE_RECOVERY:
+ handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
+ return;
+ case MSG_OSD_SCRUB2:
+ handle_fast_scrub(static_cast<MOSDScrub2*>(m));
+ return;
+
+ case MSG_OSD_PG_CREATE2:
+ return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
+ case MSG_OSD_PG_QUERY:
+ return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
+ case MSG_OSD_PG_NOTIFY:
+ return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
+ case MSG_OSD_PG_INFO:
+ return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
+ case MSG_OSD_PG_REMOVE:
+ return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
+
+ // these are single-pg messages that handle themselves
+ case MSG_OSD_PG_LOG:
+ case MSG_OSD_PG_TRIM:
+ case MSG_OSD_PG_NOTIFY2:
+ case MSG_OSD_PG_QUERY2:
+ case MSG_OSD_PG_INFO2:
+ case MSG_OSD_BACKFILL_RESERVE:
+ case MSG_OSD_RECOVERY_RESERVE:
+ case MSG_OSD_PG_LEASE:
+ case MSG_OSD_PG_LEASE_ACK:
+ {
+ MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
+ if (require_osd_peer(pm)) {
+ enqueue_peering_evt(
+ pm->get_spg(),
+ PGPeeringEventRef(pm->get_event()));
+ }
+ pm->put();
+ return;
+ }
+ }
+
+ OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = op->get_reqid();
+#endif
+ tracepoint(osd, ms_fast_dispatch, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+#ifdef HAVE_JAEGER
+ op->set_osd_parent_span(dispatch_span);
+ if (op->osd_parent_span) {
+ auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
+ op->set_osd_parent_span(op_req_span);
+ }
+#endif
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
+
+ // note sender epoch, min req's epoch
+ op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
+ op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
+ ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
+
+ service.maybe_inject_dispatch_delay();
+
+ if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
+ m->get_type() != CEPH_MSG_OSD_OP) {
+ // queue it directly
+ enqueue_op(
+ static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
+ std::move(op),
+ static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
+ } else {
+ // legacy client, and this is an MOSDOp (the *only* fast dispatch
+ // message that doesn't have an explicit spg_t); we need to map it to
+ // an spg_t while preserving delivery order.
+ auto priv = m->get_connection()->get_priv();
+ if (auto session = static_cast<Session*>(priv.get()); session) {
+ std::lock_guard l{session->session_dispatch_lock};
+ op->get();
+ session->waiting_on_map.push_back(*op);
+ OSDMapRef nextmap = service.get_nextmap_reserved();
+ dispatch_session_waiting(session, nextmap);
+ service.release_map(nextmap);
+ }
+ }
+ OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
+}
+
+int OSD::ms_handle_authentication(Connection *con)
+{
+ int ret = 0;
+ auto s = ceph::ref_cast<Session>(con->get_priv());
+ if (!s) {
+ s = ceph::make_ref<Session>(cct, con);
+ con->set_priv(s);
+ s->entity_name = con->get_peer_entity_name();
+ dout(10) << __func__ << " new session " << s << " con " << s->con
+ << " entity " << s->entity_name
+ << " addr " << con->get_peer_addrs() << dendl;
+ } else {
+ dout(10) << __func__ << " existing session " << s << " con " << s->con
+ << " entity " << s->entity_name
+ << " addr " << con->get_peer_addrs() << dendl;
+ }
+
+ AuthCapsInfo &caps_info = con->get_peer_caps_info();
+ if (caps_info.allow_all) {
+ s->caps.set_allow_all();
+ } else if (caps_info.caps.length() > 0) {
+ bufferlist::const_iterator p = caps_info.caps.cbegin();
+ string str;
+ try {
+ decode(str, p);
+ }
+ catch (ceph::buffer::error& e) {
+ dout(10) << __func__ << " session " << s << " " << s->entity_name
+ << " failed to decode caps string" << dendl;
+ ret = -EACCES;
+ }
+ if (!ret) {
+ bool success = s->caps.parse(str);
+ if (success) {
+ dout(10) << __func__ << " session " << s
+ << " " << s->entity_name
+ << " has caps " << s->caps << " '" << str << "'" << dendl;
+ ret = 1;
+ } else {
+ dout(10) << __func__ << " session " << s << " " << s->entity_name
+ << " failed to parse caps '" << str << "'" << dendl;
+ ret = -EACCES;
+ }
+ }
+ }
+ return ret;
+}
+
+void OSD::do_waiters()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ dout(10) << "do_waiters -- start" << dendl;
+ while (!finished.empty()) {
+ OpRequestRef next = finished.front();
+ finished.pop_front();
+ dispatch_op(next);
+ }
+ dout(10) << "do_waiters -- finish" << dendl;
+}
+
+void OSD::dispatch_op(OpRequestRef op)
+{
+ switch (op->get_req()->get_type()) {
+
+ case MSG_OSD_PG_CREATE:
+ handle_pg_create(op);
+ break;
+ }
+}
+
+void OSD::_dispatch(Message *m)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ dout(20) << "_dispatch " << m << " " << *m << dendl;
+
+ switch (m->get_type()) {
+ // -- don't need OSDMap --
+
+ // map and replication
+ case CEPH_MSG_OSD_MAP:
+ handle_osd_map(static_cast<MOSDMap*>(m));
+ break;
+ case MSG_MON_GET_PURGED_SNAPS_REPLY:
+ handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
+ break;
+
+ // osd
+ case MSG_OSD_SCRUB:
+ handle_scrub(static_cast<MOSDScrub*>(m));
+ break;
+
+ case MSG_COMMAND:
+ handle_command(static_cast<MCommand*>(m));
+ return;
+
+ // -- need OSDMap --
+
+ case MSG_OSD_PG_CREATE:
+ {
+ OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
+ if (m->trace)
+ op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
+ // no map? starting up?
+ if (!get_osdmap()) {
+ dout(7) << "no OSDMap, not booted" << dendl;
+ logger->inc(l_osd_waiting_for_map);
+ waiting_for_osdmap.push_back(op);
+ op->mark_delayed("no osdmap");
+ break;
+ }
+
+ // need OSDMap
+ dispatch_op(op);
+ }
+ }
+}
+
+// remove me post-nautilus
+void OSD::handle_scrub(MOSDScrub *m)
+{
+ dout(10) << "handle_scrub " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
+ << dendl;
+ m->put();
+ return;
+ }
+
+ vector<spg_t> spgs;
+ _get_pgids(&spgs);
+
+ if (!m->scrub_pgs.empty()) {
+ vector<spg_t> v;
+ for (auto pgid : m->scrub_pgs) {
+ spg_t pcand;
+ if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
+ std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
+ v.push_back(pcand);
+ }
+ }
+ spgs.swap(v);
+ }
+
+ for (auto pgid : spgs) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestScrub(m->deep, m->repair))));
+ }
+
+ m->put();
+}
+
+void OSD::handle_fast_scrub(MOSDScrub2 *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
+ << dendl;
+ m->put();
+ return;
+ }
+ for (auto pgid : m->scrub_pgs) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ PeeringState::RequestScrub(m->deep, m->repair))));
+ }
+ m->put();
+}
+
+bool OSD::scrub_random_backoff()
+{
+ bool coin_flip = (rand() / (double)RAND_MAX >=
+ cct->_conf->osd_scrub_backoff_ratio);
+ if (!coin_flip) {
+ dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
+ return true;
+ }
+ return false;
+}
+
+OSDService::ScrubJob::ScrubJob(CephContext* cct,
+ const spg_t& pg, const utime_t& timestamp,
+ double pool_scrub_min_interval,
+ double pool_scrub_max_interval, bool must)
+ : cct(cct),
+ pgid(pg),
+ sched_time(timestamp),
+ deadline(timestamp)
+{
+ // if not explicitly requested, postpone the scrub with a random delay
+ if (!must) {
+ double scrub_min_interval = pool_scrub_min_interval > 0 ?
+ pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
+ double scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
+
+ sched_time += scrub_min_interval;
+ double r = rand() / (double)RAND_MAX;
+ sched_time +=
+ scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
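+ // a max interval of zero means the scrub has no deadline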
+ if (scrub_max_interval == 0) {
+ deadline = utime_t();
+ } else {
+ deadline += scrub_max_interval;
+ }
+
+ }
+}
+
+ bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
+ if (sched_time < rhs.sched_time)
+ return true;
+ if (sched_time > rhs.sched_time)
+ return false;
+ return pgid < rhs.pgid;
+}
+
+void OSDService::dumps_scrub(ceph::Formatter *f)
+{
+ ceph_assert(f != nullptr);
+ std::lock_guard l(sched_scrub_lock);
+
+ f->open_array_section("scrubs");
+ for (const auto &i: sched_scrub_pg) {
+ f->open_object_section("scrub");
+ f->dump_stream("pgid") << i.pgid;
+ f->dump_stream("sched_time") << i.sched_time;
+ f->dump_stream("deadline") << i.deadline;
+ f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+double OSD::scrub_sleep_time(bool must_scrub)
+{
+ if (must_scrub) {
+ return cct->_conf->osd_scrub_sleep;
+ }
+ utime_t now = ceph_clock_now();
+ if (scrub_time_permit(now)) {
+ return cct->_conf->osd_scrub_sleep;
+ }
+ double normal_sleep = cct->_conf->osd_scrub_sleep;
+ double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
+ return std::max(extended_sleep, normal_sleep);
+}
+
+bool OSD::scrub_time_permit(utime_t now)
+{
+ struct tm bdt;
+ time_t tt = now.sec();
+ localtime_r(&tt, &bdt);
+
+ bool day_permit = false;
+ if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ } else {
+ if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
+ day_permit = true;
+ }
+ }
+
+ if (!day_permit) {
+ dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
+ << " - " << cct->_conf->osd_scrub_end_week_day
+ << " now " << bdt.tm_wday << " = no" << dendl;
+ return false;
+ }
+
+ bool time_permit = false;
+ if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
+ if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
+ time_permit = true;
+ }
+ } else {
+ if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
+ time_permit = true;
+ }
+ }
+ if (time_permit) {
+ dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
+ << " - " << cct->_conf->osd_scrub_end_hour
+ << " now " << bdt.tm_hour << " = yes" << dendl;
+ } else {
+ dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
+ << " - " << cct->_conf->osd_scrub_end_hour
+ << " now " << bdt.tm_hour << " = no" << dendl;
+ }
+ return time_permit;
+}
+
+bool OSD::scrub_load_below_threshold()
+{
+ double loadavgs[3];
+ if (getloadavg(loadavgs, 3) != 3) {
+ dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
+ return false;
+ }
+
+ // allow scrub if below configured threshold
+ long cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
+ if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
+ dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
+ << " < max " << cct->_conf->osd_scrub_load_threshold
+ << " = yes" << dendl;
+ return true;
+ }
+
+ // allow scrub if below daily avg and currently decreasing
+ if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
+ dout(20) << __func__ << " loadavg " << loadavgs[0]
+ << " < daily_loadavg " << daily_loadavg
+ << " and < 15m avg " << loadavgs[2]
+ << " = yes" << dendl;
+ return true;
+ }
+
+ dout(20) << __func__ << " loadavg " << loadavgs[0]
+ << " >= max " << cct->_conf->osd_scrub_load_threshold
+ << " and ( >= daily_loadavg " << daily_loadavg
+ << " or >= 15m avg " << loadavgs[2]
+ << ") = no" << dendl;
+ return false;
+}
+
+void OSD::sched_scrub()
+{
+ dout(20) << __func__ << " sched_scrub starts" << dendl;
+
+ // if not permitted, fail fast
+ if (!service.can_inc_scrubs()) {
+ dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
+ return;
+ }
+ bool allow_requested_repair_only = false;
+ if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
+ if (!cct->_conf->osd_repair_during_recovery) {
+ dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
+ return;
+ }
+ dout(10) << __func__
+ << " will only schedule explicitly requested repair due to active recovery"
+ << dendl;
+ allow_requested_repair_only = true;
+ }
+
+ utime_t now = ceph_clock_now();
+ bool time_permit = scrub_time_permit(now);
+ bool load_is_low = scrub_load_below_threshold();
+ dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
+
+ OSDService::ScrubJob scrub_job;
+ if (service.first_scrub_stamp(&scrub_job)) {
+ do {
+ dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
+
+ if (scrub_job.sched_time > now) {
+ // save ourselves some effort
+ dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
+ << " > " << now << dendl;
+ break;
+ }
+
+ if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
+ dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
+ << (!time_permit ? "time not permitted" : "high load") << dendl;
+ continue;
+ }
+
+ PGRef pg = _lookup_lock_pg(scrub_job.pgid);
+ if (!pg) {
+ dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
+ continue;
+ }
+
+ // This has already started, so go on to the next scrub job
+ if (pg->is_scrub_queued_or_active()) {
+ pg->unlock();
+ dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
+ continue;
+ }
+ // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
+ if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
+ pg->unlock();
+ dout(10) << __func__ << " skip " << scrub_job.pgid
+ << " because repairing is not explicitly requested on it"
+ << dendl;
+ continue;
+ }
+
+ // If it is reserving, let it resolve before going to the next scrub job
+ if (pg->m_scrubber->is_reserving()) {
+ pg->unlock();
+ dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
+ break;
+ }
+ dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
+ << (pg->get_must_scrub() ? ", explicitly requested" :
+ (load_is_low ? ", load_is_low" : ", deadline < now"))
+ << dendl;
+ if (pg->sched_scrub()) {
+ pg->unlock();
+ dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
+ break;
+ }
+ pg->unlock();
+ } while (service.next_scrub_stamp(scrub_job, &scrub_job));
+ }
+ dout(20) << "sched_scrub done" << dendl;
+}
+
+void OSD::resched_all_scrubs()
+{
+ dout(10) << __func__ << ": start" << dendl;
+ const vector<spg_t> pgs = [this] {
+ vector<spg_t> pgs;
+ OSDService::ScrubJob job;
+ if (service.first_scrub_stamp(&job)) {
+ do {
+ pgs.push_back(job.pgid);
+ } while (service.next_scrub_stamp(job, &job));
+ }
+ return pgs;
+ }();
+ for (auto& pgid : pgs) {
+ dout(20) << __func__ << ": examine " << pgid << dendl;
+ PGRef pg = _lookup_lock_pg(pgid);
+ if (!pg)
+ continue;
+ if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
+ dout(15) << __func__ << ": reschedule " << pgid << dendl;
+ pg->on_info_history_change();
+ }
+ pg->unlock();
+ }
+ dout(10) << __func__ << ": done" << dendl;
+}
+
+MPGStats* OSD::collect_pg_stats()
+{
+ // This implementation unconditionally sends every is_primary PG's
+ // stats every time we're called. This has equivalent cost to the
+ // previous implementation's worst case where all PGs are busy and
+ // their stats are always enqueued for sending.
+ std::shared_lock l{map_lock};
+
+ osd_stat_t cur_stat = service.get_osd_stat();
+ cur_stat.os_perf_stat = store->get_cur_stats();
+
+ auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
+ m->osd_stat = cur_stat;
+
+ std::lock_guard lec{min_last_epoch_clean_lock};
+ min_last_epoch_clean = get_osdmap_epoch();
+ min_last_epoch_clean_pgs.clear();
+
+ std::set<int64_t> pool_set;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ auto pool = pg->pg_id.pgid.pool();
+ pool_set.emplace((int64_t)pool);
+ if (!pg->is_primary()) {
+ continue;
+ }
+ pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
+ m->pg_stat[pg->pg_id.pgid] = s;
+ min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
+ min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
+ });
+ }
+ store_statfs_t st;
+ bool per_pool_stats = true;
+ bool per_pool_omap_stats = false;
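+ // per-pool statfs is not supported by all object stores; fall back to
+ // whole-store stats if the backend returns -ENOTSUP.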
+ for (auto p : pool_set) {
+ int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
+ if (r == -ENOTSUP) {
+ per_pool_stats = false;
+ break;
+ } else {
+ assert(r >= 0);
+ m->pool_stat[p] = st;
+ }
+ }
+
+ // indicate whether we are reporting per-pool stats
+ m->osd_stat.num_osds = 1;
+ m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
+ m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
+
+ return m;
+}
+
+vector<DaemonHealthMetric> OSD::get_health_metrics()
+{
+ vector<DaemonHealthMetric> metrics;
+ {
+ utime_t oldest_secs;
+ const utime_t now = ceph_clock_now();
+ auto too_old = now;
+ too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
+ int slow = 0;
+ TrackedOpRef oldest_op;
+ OSDMapRef osdmap = get_osdmap();
+ // map of slow op counts by slow op event type for an aggregated logging to
+ // the cluster log.
+ map<uint8_t, int> slow_op_types;
+ // map of slow op counts by pool for reporting a pool name with highest
+ // slow ops.
+ map<uint64_t, int> slow_op_pools;
+ bool log_aggregated_slow_op =
+ cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
+ auto count_slow_ops = [&](TrackedOp& op) {
+ if (op.get_initiated() < too_old) {
+ stringstream ss;
+ ss << "slow request " << op.get_desc()
+ << " initiated "
+ << op.get_initiated()
+ << " currently "
+ << op.state_string();
+ lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+ if (log_aggregated_slow_op) {
+ if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
+ uint8_t op_type = req->state_flag();
+ auto m = req->get_req<MOSDFastDispatchOp>();
+ uint64_t poolid = m->get_spg().pgid.m_pool;
+ slow_op_types[op_type]++;
+ if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
+ slow_op_pools[poolid]++;
+ }
+ }
+ } else {
+ clog->warn() << ss.str();
+ }
+ slow++;
+ if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
+ oldest_op = &op;
+ }
+ return true;
+ } else {
+ return false;
+ }
+ };
+ if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
+ if (slow) {
+ derr << __func__ << " reporting " << slow << " slow ops, oldest is "
+ << oldest_op->get_desc() << dendl;
+ if (log_aggregated_slow_op &&
+ slow_op_types.size() > 0) {
+ stringstream ss;
+ ss << slow << " slow requests (by type [ ";
+ for (const auto& [op_type, count] : slow_op_types) {
+ ss << "'" << OpRequest::get_state_string(op_type)
+ << "' : " << count
+ << " ";
+ }
+ auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
+ [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
+ return p1.second < p2.second;
+ });
+ // guard against an empty slow_op_pools map (no slow op fell in a valid pool)
+ if (!slow_op_pools.empty() &&
+ osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
+ string pool_name = osdmap->get_pool_name(slow_pool_it->first);
+ ss << "] most affected pool [ '"
+ << pool_name
+ << "' : "
+ << slow_pool_it->second
+ << " ])";
+ } else {
+ ss << "])";
+ }
+ lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
+ clog->warn() << ss.str();
+ }
+ }
+ metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
+ } else {
+ // no news is not good news.
+ metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
+ }
+ }
+ {
+ std::lock_guard l(pending_creates_lock);
+ auto n_primaries = pending_creates_from_mon;
+ for (const auto& create : pending_creates_from_osd) {
+ if (create.second) {
+ n_primaries++;
+ }
+ }
+ metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
+ }
+ return metrics;
+}
+
+// =====================================================
+// MAP
+
+void OSD::wait_for_new_map(OpRequestRef op)
+{
+ // ask?
+ if (waiting_for_osdmap.empty()) {
+ osdmap_subscribe(get_osdmap_epoch() + 1, false);
+ }
+
+ logger->inc(l_osd_waiting_for_map);
+ waiting_for_osdmap.push_back(op);
+ op->mark_delayed("wait for new map");
+}
+
+
+/** update_map
+ * assimilate new OSDMap(s). scan pgs, etc.
+ */
+
+void OSD::note_down_osd(int peer)
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
+
+ std::lock_guard l{heartbeat_lock};
+ failure_queue.erase(peer);
+ failure_pending.erase(peer);
+ map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
+ if (p != heartbeat_peers.end()) {
+ p->second.clear_mark_down();
+ heartbeat_peers.erase(p);
+ }
+}
+
+void OSD::note_up_osd(int peer)
+{
+ heartbeat_set_peers_need_update();
+}
+
+struct C_OnMapCommit : public Context {
+ OSD *osd;
+ epoch_t first, last;
+ MOSDMap *msg;
+ C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
+ : osd(o), first(f), last(l), msg(m) {}
+ void finish(int r) override {
+ osd->_committed_osd_maps(first, last, msg);
+ msg->put();
+ }
+};
+
+void OSD::osdmap_subscribe(version_t epoch, bool force_request)
+{
+ std::lock_guard l(osdmap_subscribe_lock);
+ if (latest_subscribed_epoch >= epoch && !force_request)
+ return;
+
+ latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
+
+ if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
+ force_request) {
+ monc->renew_subs();
+ }
+}
+
+void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
+{
+ epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
+ if (min <= superblock.oldest_map)
+ return;
+
+ int num = 0;
+ ObjectStore::Transaction t;
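+ // remove stored full and incremental maps older than 'min', flushing the
+ // transaction whenever the batch reaches both osd_target_transaction_size
+ // and the number of maps just received, so a single transaction stays bounded.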
+ for (epoch_t e = superblock.oldest_map; e < min; ++e) {
+ dout(20) << " removing old osdmap epoch " << e << dendl;
+ t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
+ t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
+ superblock.oldest_map = e + 1;
+ num++;
+ if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
+ service.publish_superblock(superblock);
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ num = 0;
+ if (!skip_maps) {
+ // skip_maps leaves us with a range of old maps if we fail to remove all
+ // of them before moving superblock.oldest_map forward to the first map
+ // in the incoming MOSDMap msg, so in that case we should keep removing
+ // them, even though it could mean a huge series of delete transactions
+ // all at once.
+ break;
+ }
+ }
+ }
+ if (num > 0) {
+ service.publish_superblock(superblock);
+ write_superblock(t);
+ int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ }
+ // we should not remove the cached maps
+ ceph_assert(min <= service.map_cache.cached_key_lower_bound());
+}
+
+void OSD::handle_osd_map(MOSDMap *m)
+{
+ // wait for pgs to catch up
+ {
+ // we extend the map cache pins to accommodate pgs slow to consume maps
+ // for some period, until we hit the max_lag_factor bound, at which point
+ // we block here to stop ingesting more maps than they are able to keep
+ // up with.
+ epoch_t max_lag = cct->_conf->osd_map_cache_size *
+ m_osd_pg_epoch_max_lag_factor;
+ ceph_assert(max_lag > 0);
+ epoch_t osd_min = 0;
+ for (auto shard : shards) {
+ epoch_t min = shard->get_min_pg_epoch();
+ if (osd_min == 0 || min < osd_min) {
+ osd_min = min;
+ }
+ }
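+ // osd_min is the oldest map epoch any shard's PGs are still working on; if
+ // it trails the OSD's current epoch by more than max_lag, block (releasing
+ // osd_lock) until the shards catch up.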
+ epoch_t osdmap_epoch = get_osdmap_epoch();
+ if (osd_min > 0 &&
+ osdmap_epoch > max_lag &&
+ osdmap_epoch - max_lag > osd_min) {
+ epoch_t need = osdmap_epoch - max_lag;
+ dout(10) << __func__ << " waiting for pgs to catch up (need " << need
+ << " max_lag " << max_lag << ")" << dendl;
+ for (auto shard : shards) {
+ epoch_t min = shard->get_min_pg_epoch();
+ if (need > min) {
+ dout(10) << __func__ << " waiting for pgs to consume " << need
+ << " (shard " << shard->shard_id << " min " << min
+ << ", map cache is " << cct->_conf->osd_map_cache_size
+ << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
+ << ")" << dendl;
+ unlock_guard unlock{osd_lock};
+ shard->wait_min_pg_epoch(need);
+ }
+ }
+ }
+ }
+
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ map<epoch_t,OSDMapRef> added_maps;
+ map<epoch_t,bufferlist> added_maps_bl;
+ if (m->fsid != monc->get_fsid()) {
+ dout(0) << "handle_osd_map fsid " << m->fsid << " != "
+ << monc->get_fsid() << dendl;
+ m->put();
+ return;
+ }
+ if (is_initializing()) {
+ dout(0) << "ignoring osdmap until we have initialized" << dendl;
+ m->put();
+ return;
+ }
+
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (session && !(session->entity_name.is_mon() ||
+ session->entity_name.is_osd())) {
+ //not enough perms!
+ dout(10) << "got osd map from Session " << session
+ << " which we can't take maps from (not a mon or osd)" << dendl;
+ m->put();
+ return;
+ }
+
+ // share with the objecter
+ if (!is_preboot())
+ service.objecter->handle_osd_map(m);
+
+ epoch_t first = m->get_first();
+ epoch_t last = m->get_last();
+ dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
+ << superblock.newest_map
+ << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
+ << dendl;
+
+ logger->inc(l_osd_map);
+ logger->inc(l_osd_mape, last - first + 1);
+ if (first <= superblock.newest_map)
+ logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
+ if (service.max_oldest_map < m->oldest_map) {
+ service.max_oldest_map = m->oldest_map;
+ ceph_assert(service.max_oldest_map >= superblock.oldest_map);
+ }
+
+ // make sure there is something new, here, before we bother flushing
+ // the queues and such
+ if (last <= superblock.newest_map) {
+ dout(10) << " no new maps here, dropping" << dendl;
+ m->put();
+ return;
+ }
+
+ // missing some?
+ bool skip_maps = false;
+ if (first > superblock.newest_map + 1) {
+ dout(10) << "handle_osd_map message skips epochs "
+ << superblock.newest_map + 1 << ".." << (first-1) << dendl;
+ if (m->oldest_map <= superblock.newest_map + 1) {
+ osdmap_subscribe(superblock.newest_map + 1, false);
+ m->put();
+ return;
+ }
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->oldest_map < first) {
+ osdmap_subscribe(m->oldest_map - 1, true);
+ m->put();
+ return;
+ }
+ skip_maps = true;
+ }
+
+ ObjectStore::Transaction t;
+ uint64_t txn_size = 0;
+
+ map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
+
+ // store new maps: queue for disk and put in the osdmap cache
+ epoch_t start = std::max(superblock.newest_map + 1, first);
+ for (epoch_t e = start; e <= last; e++) {
+ if (txn_size >= t.get_num_bytes()) {
+ derr << __func__ << " transaction size overflowed" << dendl;
+ ceph_assert(txn_size < t.get_num_bytes());
+ }
+ txn_size = t.get_num_bytes();
+ map<epoch_t,bufferlist>::iterator p;
+ p = m->maps.find(e);
+ if (p != m->maps.end()) {
+ dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
+ OSDMap *o = new OSDMap;
+ bufferlist& bl = p->second;
+
+ o->decode(bl);
+
+ purged_snaps[e] = o->get_new_purged_snaps();
+
+ ghobject_t fulloid = get_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
+ added_maps[e] = add_map(o);
+ added_maps_bl[e] = bl;
+ got_full_map(e);
+ continue;
+ }
+
+ p = m->incremental_maps.find(e);
+ if (p != m->incremental_maps.end()) {
+ dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
+ bufferlist& bl = p->second;
+ ghobject_t oid = get_inc_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), oid, 0, bl.length(), bl);
+
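+ // reconstruct the full map for this epoch: decode the previous full map
+ // (from disk or from the maps added above), apply the incremental, then
+ // re-encode and verify the CRC from the incremental before persisting it.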
+ OSDMap *o = new OSDMap;
+ if (e > 1) {
+ bufferlist obl;
+ bool got = get_map_bl(e - 1, obl);
+ if (!got) {
+ auto p = added_maps_bl.find(e - 1);
+ ceph_assert(p != added_maps_bl.end());
+ obl = p->second;
+ }
+ o->decode(obl);
+ }
+
+ OSDMap::Incremental inc;
+ auto p = bl.cbegin();
+ inc.decode(p);
+
+ if (o->apply_incremental(inc) < 0) {
+ derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
+ ceph_abort_msg("bad fsid");
+ }
+
+ bufferlist fbl;
+ o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
+
+ bool injected_failure = false;
+ if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
+ (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
+ derr << __func__ << " injecting map crc failure" << dendl;
+ injected_failure = true;
+ }
+
+ if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
+ dout(2) << "got incremental " << e
+ << " but failed to encode full with correct crc; requesting"
+ << dendl;
+ clog->warn() << "failed to encode map e" << e << " with expected crc";
+ dout(20) << "my encoded map was:\n";
+ fbl.hexdump(*_dout);
+ *_dout << dendl;
+ delete o;
+ request_full_map(e, last);
+ last = e - 1;
+
+ // don't continue committing if we failed to encode the first inc map
+ if (last < start) {
+ dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
+ m->put();
+ return;
+ }
+ break;
+ }
+ got_full_map(e);
+ purged_snaps[e] = o->get_new_purged_snaps();
+
+ ghobject_t fulloid = get_osdmap_pobject_name(e);
+ t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
+ added_maps[e] = add_map(o);
+ added_maps_bl[e] = fbl;
+ continue;
+ }
+
+ ceph_abort_msg("MOSDMap lied about what maps it had?");
+ }
+
+ // even if this map isn't from a mon, we may have satisfied our subscription
+ monc->sub_got("osdmap", last);
+
+ if (!m->maps.empty() && requested_full_first) {
+ dout(10) << __func__ << " still missing full maps " << requested_full_first
+ << ".." << requested_full_last << dendl;
+ rerequest_full_maps();
+ }
+
+ if (superblock.oldest_map) {
+ // make sure we at least keep pace with incoming maps
+ trim_maps(m->oldest_map, last - first + 1, skip_maps);
+ pg_num_history.prune(superblock.oldest_map);
+ }
+
+ if (!superblock.oldest_map || skip_maps)
+ superblock.oldest_map = first;
+ superblock.newest_map = last;
+ superblock.current_epoch = last;
+
+ // note in the superblock that we were clean thru the prior epoch
+ epoch_t boot_epoch = service.get_boot_epoch();
+ if (boot_epoch && boot_epoch >= superblock.mounted) {
+ superblock.mounted = boot_epoch;
+ superblock.clean_thru = last;
+ }
+
+ // check for pg_num changes and deleted pools
+ OSDMapRef lastmap;
+ for (auto& i : added_maps) {
+ if (!lastmap) {
+ if (!(lastmap = service.try_get_map(i.first - 1))) {
+ dout(10) << __func__ << " can't get previous map " << i.first - 1
+ << " probably first start of this osd" << dendl;
+ continue;
+ }
+ }
+ ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
+ for (auto& j : lastmap->get_pools()) {
+ if (!i.second->have_pg_pool(j.first)) {
+ pg_num_history.log_pool_delete(i.first, j.first);
+ dout(10) << __func__ << " recording final pg_pool_t for pool "
+ << j.first << dendl;
+ // this information is needed by _make_pg() if we have to restart before
+ // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
+ ghobject_t obj = make_final_pool_info_oid(j.first);
+ bufferlist bl;
+ encode(j.second, bl, CEPH_FEATURES_ALL);
+ string name = lastmap->get_pool_name(j.first);
+ encode(name, bl);
+ map<string,string> profile;
+ if (lastmap->get_pg_pool(j.first)->is_erasure()) {
+ profile = lastmap->get_erasure_code_profile(
+ lastmap->get_pg_pool(j.first)->erasure_code_profile);
+ }
+ encode(profile, bl);
+ t.write(coll_t::meta(), obj, 0, bl.length(), bl);
+ } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
+ new_pg_num != j.second.get_pg_num()) {
+ dout(10) << __func__ << " recording pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
+ }
+ }
+ for (auto& j : i.second->get_pools()) {
+ if (!lastmap->have_pg_pool(j.first)) {
+ dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
+ << j.second.get_pg_num() << dendl;
+ pg_num_history.log_pg_num_change(i.first, j.first,
+ j.second.get_pg_num());
+ }
+ }
+ lastmap = i.second;
+ }
+ pg_num_history.epoch = last;
+ {
+ bufferlist bl;
+ ::encode(pg_num_history, bl);
+ t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
+ dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
+ }
+
+ // record new purged_snaps
+ if (superblock.purged_snaps_last == start - 1) {
+ SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
+ make_purged_snaps_oid(), &t,
+ purged_snaps);
+ superblock.purged_snaps_last = last;
+ } else {
+ dout(10) << __func__ << " superblock purged_snaps_last is "
+ << superblock.purged_snaps_last
+ << ", not recording new purged_snaps" << dendl;
+ }
+
+ // superblock and commit
+ write_superblock(t);
+ t.register_on_commit(new C_OnMapCommit(this, start, last, m));
+ store->queue_transaction(
+ service.meta_ch,
+ std::move(t));
+ service.publish_superblock(superblock);
+}
+
+void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
+{
+ dout(10) << __func__ << " " << first << ".." << last << dendl;
+ if (is_stopping()) {
+ dout(10) << __func__ << " bailing, we are shutting down" << dendl;
+ return;
+ }
+ std::lock_guard l(osd_lock);
+ if (is_stopping()) {
+ dout(10) << __func__ << " bailing, we are shutting down" << dendl;
+ return;
+ }
+ map_lock.lock();
+
+ ceph_assert(first <= last);
+
+ bool do_shutdown = false;
+ bool do_restart = false;
+ bool network_error = false;
+ OSDMapRef osdmap = get_osdmap();
+
+ // advance through the new maps
+ for (epoch_t cur = first; cur <= last; cur++) {
+ dout(10) << " advance to epoch " << cur
+ << " (<= last " << last
+ << " <= newest_map " << superblock.newest_map
+ << ")" << dendl;
+
+ OSDMapRef newmap = get_map(cur);
+ ceph_assert(newmap); // we just cached it above!
+
+ // start blocklisting messages sent to peers that go down.
+ service.pre_publish_map(newmap);
+
+ // kill connections to newly down osds
+ bool waited_for_reservations = false;
+ set<int> old;
+ osdmap = get_osdmap();
+ osdmap->get_all_osds(old);
+ for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
+ if (*p != whoami &&
+ osdmap->is_up(*p) && // in old map
+ newmap->is_down(*p)) { // but not the new one
+ if (!waited_for_reservations) {
+ service.await_reserved_maps();
+ waited_for_reservations = true;
+ }
+ note_down_osd(*p);
+ } else if (*p != whoami &&
+ osdmap->is_down(*p) &&
+ newmap->is_up(*p)) {
+ note_up_osd(*p);
+ }
+ }
+
+ if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
+ dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
+ << dendl;
+ if (is_booting()) {
+ // this captures the case where we sent the boot message while
+ // NOUP was being set on the mon and our boot request was
+ // dropped, and then later it is cleared. it imperfectly
+ // handles the case where our original boot message was not
+ // dropped and we restart even though we might have booted, but
+ // that is harmless (boot will just take slightly longer).
+ do_restart = true;
+ }
+ }
+
+ osdmap = std::move(newmap);
+ set_osdmap(osdmap);
+ epoch_t up_epoch;
+ epoch_t boot_epoch;
+ service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
+ if (!up_epoch &&
+ osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
+ up_epoch = osdmap->get_epoch();
+ dout(10) << "up_epoch is " << up_epoch << dendl;
+ if (!boot_epoch) {
+ boot_epoch = osdmap->get_epoch();
+ dout(10) << "boot_epoch is " << boot_epoch << dendl;
+ }
+ service.set_epochs(&boot_epoch, &up_epoch, NULL);
+ }
+ }
+
+ epoch_t _bind_epoch = service.get_bind_epoch();
+ if (osdmap->is_up(whoami) &&
+ osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs()) &&
+ _bind_epoch < osdmap->get_up_from(whoami)) {
+
+ if (is_booting()) {
+ dout(1) << "state: booting -> active" << dendl;
+ set_state(STATE_ACTIVE);
+ do_restart = false;
+
+ // set incarnation so that osd_reqid_t's we generate for our
+ // objecter requests are unique across restarts.
+ service.objecter->set_client_incarnation(osdmap->get_epoch());
+ cancel_pending_failures();
+ }
+ }
+
+ if (osdmap->get_epoch() > 0 &&
+ is_active()) {
+ if (!osdmap->exists(whoami)) {
+ derr << "map says i do not exist. shutting down." << dendl;
+ do_shutdown = true; // don't call shutdown() while we have
+ // everything paused
+ } else if (osdmap->is_stop(whoami)) {
+ derr << "map says i am stopped by admin. shutting down." << dendl;
+ do_shutdown = true;
+ } else if (!osdmap->is_up(whoami) ||
+ !osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs()) ||
+ !osdmap->get_cluster_addrs(whoami).legacy_equals(
+ cluster_messenger->get_myaddrs()) ||
+ !osdmap->get_hb_back_addrs(whoami).legacy_equals(
+ hb_back_server_messenger->get_myaddrs()) ||
+ !osdmap->get_hb_front_addrs(whoami).legacy_equals(
+ hb_front_server_messenger->get_myaddrs())) {
+ if (!osdmap->is_up(whoami)) {
+ if (service.is_preparing_to_stop() || service.is_stopping()) {
+ service.got_stop_ack();
+ } else {
+ clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
+ "but it is still running";
+ clog->debug() << "map e" << osdmap->get_epoch()
+ << " wrongly marked me down at e"
+ << osdmap->get_down_at(whoami);
+ }
+ if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
+ // note that this is best-effort...
+ monc->send_mon_message(
+ new MOSDMarkMeDead(
+ monc->get_fsid(),
+ whoami,
+ osdmap->get_epoch()));
+ }
+ } else if (!osdmap->get_addrs(whoami).legacy_equals(
+ client_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong client addr (" << osdmap->get_addrs(whoami)
+ << " != my " << client_messenger->get_myaddrs() << ")";
+ } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
+ cluster_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong cluster addr ("
+ << osdmap->get_cluster_addrs(whoami)
+ << " != my " << cluster_messenger->get_myaddrs() << ")";
+ } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
+ hb_back_server_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong heartbeat back addr ("
+ << osdmap->get_hb_back_addrs(whoami)
+ << " != my " << hb_back_server_messenger->get_myaddrs()
+ << ")";
+ } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
+ hb_front_server_messenger->get_myaddrs())) {
+ clog->error() << "map e" << osdmap->get_epoch()
+ << " had wrong heartbeat front addr ("
+ << osdmap->get_hb_front_addrs(whoami)
+ << " != my " << hb_front_server_messenger->get_myaddrs()
+ << ")";
+ }
+
+ if (!service.is_stopping()) {
+ epoch_t up_epoch = 0;
+ epoch_t bind_epoch = osdmap->get_epoch();
+ service.set_epochs(NULL,&up_epoch, &bind_epoch);
+ do_restart = true;
+
+ //add markdown log
+ utime_t now = ceph_clock_now();
+ utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
+ osd_markdown_log.push_back(now);
+ if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
+ derr << __func__ << " marked down "
+ << osd_markdown_log.size()
+ << " > osd_max_markdown_count "
+ << cct->_conf->osd_max_markdown_count
+ << " in last " << grace << " seconds, shutting down"
+ << dendl;
+ do_restart = false;
+ do_shutdown = true;
+ }
+
+ start_waiting_for_healthy();
+
+ set<int> avoid_ports;
+#if defined(__FreeBSD__)
+ // prevent FreeBSD from grabbing the client_messenger port during
+ // rebinding; otherwise the cluster_messenger could end up connecting
+ // to the same port.
+ client_messenger->get_myaddrs().get_ports(&avoid_ports);
+#endif
+ cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
+
+ int r = cluster_messenger->rebind(avoid_ports);
+ if (r != 0) {
+ do_shutdown = true; // FIXME: do_restart?
+ network_error = true;
+ derr << __func__ << " marked down:"
+ << " rebind cluster_messenger failed" << dendl;
+ }
+
+ hb_back_server_messenger->mark_down_all();
+ hb_front_server_messenger->mark_down_all();
+ hb_front_client_messenger->mark_down_all();
+ hb_back_client_messenger->mark_down_all();
+
+ reset_heartbeat_peers(true);
+ }
+ }
+ } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
+ derr << "map says i am stopped by admin. shutting down." << dendl;
+ do_shutdown = true;
+ }
+
+ map_lock.unlock();
+
+ check_osdmap_features();
+
+ // yay!
+ consume_map();
+
+ if (is_active() || is_waiting_for_healthy())
+ maybe_update_heartbeat_peers();
+
+ if (is_active()) {
+ activate_map();
+ }
+
+ if (do_shutdown) {
+ if (network_error) {
+ cancel_pending_failures();
+ }
+ // trigger shutdown in a different thread
+ dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
+ queue_async_signal(SIGINT);
+ }
+ else if (m->newest_map && m->newest_map > last) {
+ dout(10) << " msg say newest map is " << m->newest_map
+ << ", requesting more" << dendl;
+ osdmap_subscribe(osdmap->get_epoch()+1, false);
+ }
+ else if (is_preboot()) {
+ if (m->get_source().is_mon())
+ _preboot(m->oldest_map, m->newest_map);
+ else
+ start_boot();
+ }
+ else if (do_restart)
+ start_boot();
+
+}
+
+void OSD::check_osdmap_features()
+{
+ // adjust required feature bits?
+
+ // we have to be a bit careful here, because we are accessing the
+ // Policy structures without taking any lock. in particular, only
+ // modify integer values that can safely be read by a racing CPU.
+ // since we are only accessing existing Policy structures at their
+ // current memory location, and setting or clearing bits in integer
+ // fields, and we are the only writer, this is not a problem.
+
+ const auto osdmap = get_osdmap();
+ {
+ Messenger::Policy p = client_messenger->get_default_policy();
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << ", adjusting msgr requires for clients" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ client_messenger->set_default_policy(p);
+ }
+ }
+ {
+ Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << " was " << p.features_required
+ << ", adjusting msgr requires for mons" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ client_messenger->set_policy(entity_name_t::TYPE_MON, p);
+ }
+ }
+ {
+ Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
+ uint64_t mask;
+ uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
+
+ if ((p.features_required & mask) != features) {
+ dout(0) << "crush map has features " << features
+ << ", adjusting msgr requires for osds" << dendl;
+ p.features_required = (p.features_required & ~mask) | features;
+ cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
+ }
+
+ if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
+ dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
+ superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
+ ObjectStore::Transaction t;
+ write_superblock(t);
+ int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
+ ceph_assert(err == 0);
+ }
+ }
+
+ if (osdmap->require_osd_release < ceph_release_t::nautilus) {
+ hb_front_server_messenger->set_require_authorizer(false);
+ hb_back_server_messenger->set_require_authorizer(false);
+ } else {
+ hb_front_server_messenger->set_require_authorizer(true);
+ hb_back_server_messenger->set_require_authorizer(true);
+ }
+
+ if (osdmap->require_osd_release != last_require_osd_release) {
+ dout(1) << __func__ << " require_osd_release " << last_require_osd_release
+ << " -> " << to_string(osdmap->require_osd_release) << dendl;
+ store->write_meta("require_osd_release",
+ stringify((int)osdmap->require_osd_release));
+ last_require_osd_release = osdmap->require_osd_release;
+ }
+}
+
+struct C_FinishSplits : public Context {
+ OSD *osd;
+ set<PGRef> pgs;
+ C_FinishSplits(OSD *osd, const set<PGRef> &in)
+ : osd(osd), pgs(in) {}
+ void finish(int r) override {
+ osd->_finish_splits(pgs);
+ }
+};
+
+void OSD::_finish_splits(set<PGRef>& pgs)
+{
+ dout(10) << __func__ << " " << pgs << dendl;
+ if (is_stopping())
+ return;
+ for (set<PGRef>::iterator i = pgs.begin();
+ i != pgs.end();
+ ++i) {
+ PG *pg = i->get();
+
+ PeeringCtx rctx = create_context();
+ pg->lock();
+ dout(10) << __func__ << " " << *pg << dendl;
+ epoch_t e = pg->get_osdmap_epoch();
+ pg->handle_initialize(rctx);
+ pg->queue_null(e, e);
+ dispatch_context(rctx, pg, service.get_osdmap());
+ pg->unlock();
+
+ unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
+ shards[shard_index]->register_and_wake_split_child(pg);
+ }
+}
+
+bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
+ unsigned need)
+{
+ std::lock_guard l(merge_lock);
+ auto& p = merge_waiters[nextmap->get_epoch()][target];
+ p[src->pg_id] = src;
+ dout(10) << __func__ << " added merge_waiter " << src->pg_id
+ << " for " << target << ", have " << p.size() << "/" << need
+ << dendl;
+ return p.size() == need;
+}
+
+bool OSD::advance_pg(
+ epoch_t osd_epoch,
+ PG *pg,
+ ThreadPool::TPHandle &handle,
+ PeeringCtx &rctx)
+{
+ if (osd_epoch <= pg->get_osdmap_epoch()) {
+ return true;
+ }
+ ceph_assert(pg->is_locked());
+ OSDMapRef lastmap = pg->get_osdmap();
+ set<PGRef> new_pgs; // any split children
+ bool ret = true;
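+ // walk the PG forward one map epoch at a time up to osd_epoch, handling
+ // splits and merges along the way; returns false if this PG was consumed
+ // as a merge source or the merge target must still wait for its sources.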
+
+ unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
+ lastmap->get_pg_num(pg->pg_id.pool()) : 0;
+ for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
+ next_epoch <= osd_epoch;
+ ++next_epoch) {
+ OSDMapRef nextmap = service.try_get_map(next_epoch);
+ if (!nextmap) {
+ dout(20) << __func__ << " missing map " << next_epoch << dendl;
+ continue;
+ }
+
+ unsigned new_pg_num =
+ (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
+ nextmap->get_pg_num(pg->pg_id.pool()) : 0;
+ if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
+ // check for merge
+ if (nextmap->have_pg_pool(pg->pg_id.pool())) {
+ spg_t parent;
+ if (pg->pg_id.is_merge_source(
+ old_pg_num,
+ new_pg_num,
+ &parent)) {
+ // we are merge source
+ PGRef spg = pg; // carry a ref
+ dout(1) << __func__ << " " << pg->pg_id
+ << " is merge source, target is " << parent
+ << dendl;
+ pg->write_if_dirty(rctx);
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this,
+ new_pgs));
+ new_pgs.clear();
+ }
+ dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
+ pg->ch->flush();
+ // release backoffs explicitly, since the on_shutdown path
+ // aggressively tears down backoff state.
+ if (pg->is_primary()) {
+ pg->release_pg_backoffs();
+ }
+ pg->on_shutdown();
+ OSDShard *sdata = pg->osd_shard;
+ {
+ std::lock_guard l(sdata->shard_lock);
+ if (pg->pg_slot) {
+ sdata->_detach_pg(pg->pg_slot);
+ // update pg count now since we might not get an osdmap
+ // any time soon.
+ if (pg->is_primary())
+ logger->dec(l_osd_pg_primary);
+ else if (pg->is_nonprimary())
+ logger->dec(l_osd_pg_replica); // misnomer
+ else
+ logger->dec(l_osd_pg_stray);
+ }
+ }
+ pg->unlock();
+
+ set<spg_t> children;
+ parent.is_split(new_pg_num, old_pg_num, &children);
+ if (add_merge_waiter(nextmap, parent, pg, children.size())) {
+ enqueue_peering_evt(
+ parent,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ nextmap->get_epoch(),
+ nextmap->get_epoch(),
+ NullEvt())));
+ }
+ ret = false;
+ goto out;
+ } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
+ // we are merge target
+ set<spg_t> children;
+ pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
+ dout(20) << __func__ << " " << pg->pg_id
+ << " is merge target, sources are " << children
+ << dendl;
+ map<spg_t,PGRef> sources;
+ {
+ std::lock_guard l(merge_lock);
+ auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
+ unsigned need = children.size();
+ dout(20) << __func__ << " have " << s.size() << "/"
+ << need << dendl;
+ if (s.size() == need) {
+ sources.swap(s);
+ merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
+ if (merge_waiters[nextmap->get_epoch()].empty()) {
+ merge_waiters.erase(nextmap->get_epoch());
+ }
+ }
+ }
+ if (!sources.empty()) {
+ unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
+ unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
+ dout(1) << __func__ << " merging " << pg->pg_id << dendl;
+ pg->merge_from(
+ sources, rctx, split_bits,
+ nextmap->get_pg_pool(
+ pg->pg_id.pool())->last_pg_merge_meta);
+ pg->pg_slot->waiting_for_merge_epoch = 0;
+ } else {
+ dout(20) << __func__ << " not ready to merge yet" << dendl;
+ pg->write_if_dirty(rctx);
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this,
+ new_pgs));
+ new_pgs.clear();
+ }
+ dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
+ pg->unlock();
+ // kick source(s) to get them ready
+ for (auto& i : children) {
+ dout(20) << __func__ << " kicking source " << i << dendl;
+ enqueue_peering_evt(
+ i,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ nextmap->get_epoch(),
+ nextmap->get_epoch(),
+ NullEvt())));
+ }
+ ret = false;
+ goto out;
+ }
+ }
+ }
+ }
+
+ vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ nextmap->pg_to_up_acting_osds(
+ pg->pg_id.pgid,
+ &newup, &up_primary,
+ &newacting, &acting_primary);
+ pg->handle_advance_map(
+ nextmap, lastmap, newup, up_primary,
+ newacting, acting_primary, rctx);
+
+ auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
+ auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
+ if (oldpool != lastmap->get_pools().end()
+ && newpool != nextmap->get_pools().end()) {
+ dout(20) << __func__
+ << " new pool opts " << newpool->second.opts
+ << " old pool opts " << oldpool->second.opts
+ << dendl;
+
+ double old_min_interval = 0, new_min_interval = 0;
+ oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
+ newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
+
+ double old_max_interval = 0, new_max_interval = 0;
+ oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
+ newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
+
+ // Assume that if an interval changes from set to unset or vice versa, the
+ // actual config is different. Keep it simple even if it is possible to
+ // call resched_all_scrub() unnecessarily.
+ if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
+ pg->on_info_history_change();
+ }
+ }
+
+ if (new_pg_num && old_pg_num != new_pg_num) {
+ // check for split
+ set<spg_t> children;
+ if (pg->pg_id.is_split(
+ old_pg_num,
+ new_pg_num,
+ &children)) {
+ split_pgs(
+ pg, children, &new_pgs, lastmap, nextmap,
+ rctx);
+ }
+ }
+
+ lastmap = nextmap;
+ old_pg_num = new_pg_num;
+ handle.reset_tp_timeout();
+ }
+ pg->handle_activate_map(rctx);
+
+ ret = true;
+ out:
+ if (!new_pgs.empty()) {
+ rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
+ }
+ return ret;
+}
+
+void OSD::consume_map()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ auto osdmap = get_osdmap();
+ dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
+
+ /** make sure the cluster is speaking in SORTBITWISE, because we don't
+ * speak the older sorting version any more. Be careful not to force
+ * a shutdown if we are merely processing old maps, though.
+ */
+ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
+ derr << __func__ << " SORTBITWISE flag is not set" << dendl;
+ ceph_abort();
+ }
+
+ service.pre_publish_map(osdmap);
+ service.await_reserved_maps();
+ service.publish_map(osdmap);
+
+ // prime splits and merges
+ set<pair<spg_t,epoch_t>> newly_split; // splits, and when
+ set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
+ for (auto& shard : shards) {
+ shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
+ }
+ if (!newly_split.empty()) {
+ for (auto& shard : shards) {
+ shard->prime_splits(osdmap, &newly_split);
+ }
+ ceph_assert(newly_split.empty());
+ }
+
+ // prune sent_ready_to_merge
+ service.prune_sent_ready_to_merge(osdmap);
+
+ // FIXME, maybe: We could race against an incoming peering message
+ // that instantiates a merge PG after identify_merges() below and
+ // never set up its peer to complete the merge. An OSD restart
+ // would clear it up. This is a hard race to resolve,
+ // extraordinarily rare (we only merge PGs that are stable and
+ // clean, so it'd have to be an imported PG to an OSD with a
+ // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
+ // replace all of this with seastar-based code soon anyway.
+ if (!merge_pgs.empty()) {
+ // mark the pgs we already have, or create new and empty merge
+ // participants for those we are missing. do this all under the
+ // shard lock so we don't have to worry about racing pg creates
+ // via _process.
+ for (auto& shard : shards) {
+ shard->prime_merges(osdmap, &merge_pgs);
+ }
+ ceph_assert(merge_pgs.empty());
+ }
+
+ service.prune_pg_created();
+
+ unsigned pushes_to_free = 0;
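+ // each shard consumes the new map; recovery pushes reserved for queued work
+ // that is now stale are accumulated in pushes_to_free and released below.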
+ for (auto& shard : shards) {
+ shard->consume_map(osdmap, &pushes_to_free);
+ }
+
+ vector<spg_t> pgids;
+ _get_pgids(&pgids);
+
+ // count (FIXME, probably during seastar rewrite)
+ int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ // FIXME (probably during seastar rewrite): this is lockless and
+ // racy, but we don't want to take pg lock here.
+ if (pg->is_primary())
+ num_pg_primary++;
+ else if (pg->is_nonprimary())
+ num_pg_replica++; // misnomer
+ else
+ num_pg_stray++;
+ }
+
+ {
+ // FIXME (as part of seastar rewrite): move to OSDShard
+ std::lock_guard l(pending_creates_lock);
+ for (auto pg = pending_creates_from_osd.begin();
+ pg != pending_creates_from_osd.end();) {
+ if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
+ dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
+ << "discarding pending_create_from_osd" << dendl;
+ pg = pending_creates_from_osd.erase(pg);
+ } else {
+ ++pg;
+ }
+ }
+ }
+
+ service.maybe_inject_dispatch_delay();
+
+ dispatch_sessions_waiting_on_map();
+
+ service.maybe_inject_dispatch_delay();
+
+ service.release_reserved_pushes(pushes_to_free);
+
+ // queue null events to push maps down to individual PGs
+ for (auto pgid : pgids) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ osdmap->get_epoch(),
+ osdmap->get_epoch(),
+ NullEvt())));
+ }
+ logger->set(l_osd_pg, pgids.size());
+ logger->set(l_osd_pg_primary, num_pg_primary);
+ logger->set(l_osd_pg_replica, num_pg_replica);
+ logger->set(l_osd_pg_stray, num_pg_stray);
+}
+
+void OSD::activate_map()
+{
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ auto osdmap = get_osdmap();
+
+ dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
+
+ // norecover?
+ if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
+ if (!service.recovery_is_paused()) {
+ dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
+ service.pause_recovery();
+ }
+ } else {
+ if (service.recovery_is_paused()) {
+ dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
+ service.unpause_recovery();
+ }
+ }
+
+ service.activate_map();
+
+ // process waiters
+ take_waiters(waiting_for_osdmap);
+}
+
+bool OSD::require_mon_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_mon()) {
+ dout(0) << "require_mon_peer received from non-mon "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_mon_or_mgr_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_mon() &&
+ !m->get_connection()->peer_is_mgr()) {
+ dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_osd_peer(const Message *m)
+{
+ if (!m->get_connection()->peer_is_osd()) {
+ dout(0) << "require_osd_peer received from non-osd "
+ << m->get_connection()->get_peer_addr()
+ << " " << *m << dendl;
+ return false;
+ }
+ return true;
+}
+
+bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
+{
+ epoch_t up_epoch = service.get_up_epoch();
+ if (epoch < up_epoch) {
+ dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+ return false;
+ }
+
+ if (!is_active()) {
+ dout(7) << "still in boot state, dropping message " << *m << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
+ bool is_fast_dispatch)
+{
+ int from = m->get_source().num();
+
+ if (map->is_down(from) ||
+ (map->get_cluster_addrs(from) != m->get_source_addrs())) {
+ dout(5) << "from dead osd." << from << ", marking down, "
+ << " msg was " << m->get_source_inst().addr
+ << " expected "
+ << (map->is_up(from) ?
+ map->get_cluster_addrs(from) : entity_addrvec_t())
+ << dendl;
+ ConnectionRef con = m->get_connection();
+ con->mark_down();
+ if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
+ if (!is_fast_dispatch)
+ s->session_dispatch_lock.lock();
+ clear_session_waiting_on_map(s);
+ con->set_priv(nullptr); // break ref <-> session cycle, if any
+ s->con.reset();
+ if (!is_fast_dispatch)
+ s->session_dispatch_lock.unlock();
+ }
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * require that we have same (or newer) map, and that
+ * the source is the pg primary.
+ */
+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
+ bool is_fast_dispatch)
+{
+ const Message *m = op->get_req();
+ const auto osdmap = get_osdmap();
+ dout(15) << "require_same_or_newer_map " << epoch
+ << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
+
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+
+ // do they have a newer map?
+ if (epoch > osdmap->get_epoch()) {
+ dout(7) << "waiting for newer map epoch " << epoch
+ << " > my " << osdmap->get_epoch() << " with " << m << dendl;
+ wait_for_new_map(op);
+ return false;
+ }
+
+ if (!require_self_aliveness(op->get_req(), epoch)) {
+ return false;
+ }
+
+ // ok, our map is same or newer.. do they still exist?
+ if (m->get_connection()->get_messenger() == cluster_messenger &&
+ !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
+ return false;
+ }
+
+ return true;
+}
+
+
+
+
+
+// ----------------------------------------
+// pg creation
+
+void OSD::split_pgs(
+ PG *parent,
+ const set<spg_t> &childpgids, set<PGRef> *out_pgs,
+ OSDMapRef curmap,
+ OSDMapRef nextmap,
+ PeeringCtx &rctx)
+{
+ unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
+ parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
+
+ vector<object_stat_sum_t> updated_stats;
+ parent->start_split_stats(childpgids, &updated_stats);
+
+ vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
+ for (set<spg_t>::const_iterator i = childpgids.begin();
+ i != childpgids.end();
+ ++i, ++stat_iter) {
+ ceph_assert(stat_iter != updated_stats.end());
+ dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
+ PG* child = _make_pg(nextmap, *i);
+ child->lock(true);
+ out_pgs->insert(child);
+ child->ch = store->create_new_collection(child->coll);
+
+ {
+ uint32_t shard_index = i->hash_to_shard(shards.size());
+ assert(NULL != shards[shard_index]);
+ store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
+ }
+
+ unsigned split_bits = i->get_split_bits(pg_num);
+ dout(10) << " pg_num is " << pg_num
+ << ", m_seed " << i->ps()
+ << ", split_bits is " << split_bits << dendl;
+ parent->split_colls(
+ *i,
+ split_bits,
+ i->ps(),
+ &child->get_pool().info,
+ rctx.transaction);
+ parent->split_into(
+ i->pgid,
+ child,
+ split_bits);
+
+ child->init_collection_pool_opts();
+
+ child->finish_split_stats(*stat_iter, rctx.transaction);
+ child->unlock();
+ }
+ ceph_assert(stat_iter != updated_stats.end());
+ parent->finish_split_stats(*stat_iter, rctx.transaction);
+}
+
+/*
+ * holding osd_lock
+ */
+void OSD::handle_pg_create(OpRequestRef op)
+{
+ // NOTE: this can be removed in P release (mimic is the last version to
+ // send MOSDPGCreate messages).
+
+ auto m = op->get_req<MOSDPGCreate>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
+
+ dout(10) << "handle_pg_create " << *m << dendl;
+
+ if (!require_mon_peer(op->get_req())) {
+ return;
+ }
+
+ if (!require_same_or_newer_map(op, m->epoch, false))
+ return;
+
+ op->mark_started();
+
+ const auto osdmap = get_osdmap();
+ map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
+ for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
+ p != m->mkpg.end();
+ ++p, ++ci) {
+ ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
+ epoch_t created = p->second.created;
+ if (p->second.split_bits) // Skip split pgs
+ continue;
+ pg_t on = p->first;
+
+ if (!osdmap->have_pg_pool(on.pool())) {
+ dout(20) << "ignoring pg on deleted pool " << on << dendl;
+ continue;
+ }
+
+ dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
+
+ spg_t pgid;
+ bool mapped = osdmap->get_primary_shard(on, &pgid);
+ ceph_assert(mapped);
+
+ // is it still ours?
+ vector<int> up, acting;
+ int up_primary = -1;
+ int acting_primary = -1;
+ osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
+ int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
+
+ if (acting_primary != whoami) {
+ dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
+ << "), my role=" << role << ", skipping" << dendl;
+ continue;
+ }
+
+
+ PastIntervals pi;
+ pg_history_t history;
+ build_initial_pg_history(pgid, created, ci->second, &history, &pi);
+
+ // The mon won't resend unless the primary changed, so we ignore
+ // same_interval_since. We'll pass this history with the current
+ // epoch as the event.
+ if (history.same_primary_since > m->epoch) {
+ dout(10) << __func__ << ": got obsolete pg create on pgid "
+ << pgid << " from epoch " << m->epoch
+ << ", primary changed in " << history.same_primary_since
+ << dendl;
+ continue;
+ }
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ osdmap->get_epoch(),
+ osdmap->get_epoch(),
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ osdmap->get_epoch(),
+ history,
+ pi,
+ true)
+ )));
+ }
+
+ {
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
+
+ maybe_update_heartbeat_peers();
+}
+
+
+// ----------------------------------------
+// peering and recovery
+
+PeeringCtx OSD::create_context()
+{
+ return PeeringCtx(get_osdmap()->require_osd_release);
+}
+
+void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
+ ThreadPool::TPHandle *handle)
+{
+ if (!service.get_osdmap()->is_up(whoami)) {
+ dout(20) << __func__ << " not up in osdmap" << dendl;
+ } else if (!is_active()) {
+ dout(20) << __func__ << " not active" << dendl;
+ } else {
+ for (auto& [osd, ls] : ctx.message_map) {
+ if (!curmap->is_up(osd)) {
+ dout(20) << __func__ << " skipping down osd." << osd << dendl;
+ continue;
+ }
+ ConnectionRef con = service.get_con_osd_cluster(
+ osd, curmap->get_epoch());
+ if (!con) {
+ dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
+ << dendl;
+ continue;
+ }
+ service.maybe_share_map(con.get(), curmap);
+ for (auto m : ls) {
+ con->send_message2(m);
+ }
+ ls.clear();
+ }
+ }
+ if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
+ int tr = store->queue_transaction(
+ pg->ch,
+ std::move(ctx.transaction), TrackedOpRef(),
+ handle);
+ ceph_assert(tr == 0);
+ }
+}
+
+void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_mon_peer(m)) {
+ m->put();
+ return;
+ }
+ for (auto& p : m->pgs) {
+ spg_t pgid = p.first;
+ epoch_t created = p.second.first;
+ utime_t created_stamp = p.second.second;
+ auto q = m->pg_extra.find(pgid);
+ if (q == m->pg_extra.end()) {
+ dout(20) << __func__ << " " << pgid << " e" << created
+ << "@" << created_stamp
+ << " (no history or past_intervals)" << dendl;
+ // pre-octopus ... no pg history. this can be removed in Q release.
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ created,
+ pg_history_t(created, created_stamp),
+ PastIntervals(),
+ true)
+ )));
+ } else {
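+ // the sender provided history and past_intervals; reject the create if the
+ // past_intervals extend beyond the message epoch, since that cannot be
+ // consistent with this epoch.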
+ dout(20) << __func__ << " " << pgid << " e" << created
+ << "@" << created_stamp
+ << " history " << q->second.first
+ << " pi " << q->second.second << dendl;
+ if (!q->second.second.empty() &&
+ m->epoch < q->second.second.get_bounds().second) {
+ clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
+ << " and unmatched past_intervals " << q->second.second
+ << " (history " << q->second.first << ")";
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->epoch,
+ m->epoch,
+ NullEvt(),
+ true,
+ new PGCreateInfo(
+ pgid,
+ m->epoch,
+ q->second.first,
+ q->second.second,
+ true)
+ )));
+ }
+ }
+ }
+
+ {
+ std::lock_guard l(pending_creates_lock);
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
+
+ m->put();
+}
+
+void OSD::handle_fast_pg_query(MOSDPGQuery *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->pg_list) {
+ enqueue_peering_evt(
+ p.first,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.second.epoch_sent, p.second.epoch_sent,
+ MQuery(
+ p.first,
+ pg_shard_t(from, p.second.from),
+ p.second,
+ p.second.epoch_sent),
+ false))
+ );
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->get_pg_list()) {
+ spg_t pgid(p.info.pgid.pgid, p.to);
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.epoch_sent,
+ p.query_epoch,
+ MNotifyRec(
+ pgid, pg_shard_t(from, p.from),
+ p,
+ m->get_connection()->get_features()),
+ true,
+ new PGCreateInfo(
+ pgid,
+ p.query_epoch,
+ p.info.history,
+ p.past_intervals,
+ false)
+ )));
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_info(MOSDPGInfo* m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ int from = m->get_source().num();
+ for (auto& p : m->pg_list) {
+ enqueue_peering_evt(
+ spg_t(p.info.pgid.pgid, p.to),
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ p.epoch_sent, p.query_epoch,
+ MInfoRec(
+ pg_shard_t(from, p.from),
+ p.info,
+ p.epoch_sent)))
+ );
+ }
+ m->put();
+}
+
+void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
+{
+ dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
+ if (!require_osd_peer(m)) {
+ m->put();
+ return;
+ }
+ for (auto& pgid : m->pg_list) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ m->get_epoch(), m->get_epoch(),
+ PeeringState::DeleteStart())));
+ }
+ m->put();
+}
+
+void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
+{
+ dout(10) << __func__ << " " << *m << dendl;
+ if (!require_mon_or_mgr_peer(m)) {
+ m->put();
+ return;
+ }
+ epoch_t epoch = get_osdmap_epoch();
+ for (auto pgid : m->forced_pgs) {
+ if (m->options & OFR_BACKFILL) {
+ if (m->options & OFR_CANCEL) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::UnsetForceBackfill())));
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::SetForceBackfill())));
+ }
+ } else if (m->options & OFR_RECOVERY) {
+ if (m->options & OFR_CANCEL) {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::UnsetForceRecovery())));
+ } else {
+ enqueue_peering_evt(
+ pgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch, epoch,
+ PeeringState::SetForceRecovery())));
+ }
+ }
+ }
+ m->put();
+}
+
+void OSD::handle_pg_query_nopg(const MQuery& q)
+{
+ spg_t pgid = q.pgid;
+ dout(10) << __func__ << " " << pgid << dendl;
+
+ OSDMapRef osdmap = get_osdmap();
+ if (!osdmap->have_pg_pool(pgid.pool()))
+ return;
+
+ dout(10) << " pg " << pgid << " dne" << dendl;
+ pg_info_t empty(spg_t(pgid.pgid, q.query.to));
+ ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
+ if (con) {
+ Message *m;
+ if (q.query.type == pg_query_t::LOG ||
+ q.query.type == pg_query_t::FULLLOG) {
+ m = new MOSDPGLog(
+ q.query.from, q.query.to,
+ osdmap->get_epoch(), empty,
+ q.query.epoch_sent);
+ } else {
+ vector<pg_notify_t> ls;
+ ls.push_back(
+ pg_notify_t(
+ q.query.from, q.query.to,
+ q.query.epoch_sent,
+ osdmap->get_epoch(),
+ empty,
+ PastIntervals()));
+ m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
+ }
+ service.maybe_share_map(con.get(), osdmap);
+ con->send_message(m);
+ }
+}
+
+void OSDService::queue_check_readable(spg_t spgid,
+ epoch_t lpr,
+ ceph::signedspan delay)
+{
+ if (delay == ceph::signedspan::zero()) {
+ osd->enqueue_peering_evt(
+ spgid,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ lpr, lpr,
+ PeeringState::CheckReadable())));
+ } else {
+ mono_timer.add_event(
+ delay,
+ [this, spgid, lpr]() {
+ queue_check_readable(spgid, lpr);
+ });
+ }
+}
+
+
+// =========================================================
+// RECOVERY
+
+void OSDService::_maybe_queue_recovery() {
+ ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
+ uint64_t available_pushes;
+ while (!awaiting_throttle.empty() &&
+ _recover_now(&available_pushes)) {
+ uint64_t to_start = std::min(
+ available_pushes,
+ cct->_conf->osd_recovery_max_single_start);
+ _queue_for_recovery(awaiting_throttle.front(), to_start);
+ awaiting_throttle.pop_front();
+ dout(10) << __func__ << " starting " << to_start
+ << ", recovery_ops_reserved " << recovery_ops_reserved
+ << " -> " << (recovery_ops_reserved + to_start) << dendl;
+ recovery_ops_reserved += to_start;
+ }
+}
+
+bool OSDService::_recover_now(uint64_t *available_pushes)
+{
+ if (available_pushes)
+ *available_pushes = 0;
+
+ if (ceph_clock_now() < defer_recovery_until) {
+ dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
+ return false;
+ }
+
+ if (recovery_paused) {
+ dout(15) << __func__ << " paused" << dendl;
+ return false;
+ }
+
+ uint64_t max = osd->get_recovery_max_active();
+ if (max <= recovery_ops_active + recovery_ops_reserved) {
+ dout(15) << __func__ << " active " << recovery_ops_active
+ << " + reserved " << recovery_ops_reserved
+ << " >= max " << max << dendl;
+ return false;
+ }
+
+ if (available_pushes)
+ *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
+
+ return true;
+}
+
+unsigned OSDService::get_target_pg_log_entries() const
+{
+ auto num_pgs = osd->get_num_pgs();
+ auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
+ if (num_pgs > 0 && target > 0) {
+ // target an even spread of our budgeted log entries across all
+ // PGs. note that while we only get to control the entry count
+ // for primary PGs, we'll normally be responsible for a mix of
+ // primary and replica PGs (for the same pool(s) even), so this
+ // will work out.
+ return std::max<unsigned>(
+ std::min<unsigned>(target / num_pgs,
+ cct->_conf->osd_max_pg_log_entries),
+ cct->_conf->osd_min_pg_log_entries);
+ } else {
+ // fall back to a per-pg value.
+ return cct->_conf->osd_min_pg_log_entries;
+ }
+}
+
+void OSD::do_recovery(
+ PG *pg, epoch_t queued, uint64_t reserved_pushes,
+ ThreadPool::TPHandle &handle)
+{
+ uint64_t started = 0;
+
+ /*
+ * When the value of osd_recovery_sleep is set greater than zero, recovery
+ * ops are scheduled after osd_recovery_sleep amount of time from the previous
+ * recovery event's schedule time. This is done by adding a
+ * recovery_requeue_callback event, which re-queues the recovery op using
+ * queue_recovery_after_sleep.
+ */
+ float recovery_sleep = get_osd_recovery_sleep();
+ {
+ std::lock_guard l(service.sleep_lock);
+ if (recovery_sleep > 0 && service.recovery_needs_sleep) {
+ PGRef pgref(pg);
+ auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
+ dout(20) << "do_recovery wake up at "
+ << ceph_clock_now()
+ << ", re-queuing recovery" << dendl;
+ std::lock_guard l(service.sleep_lock);
+ service.recovery_needs_sleep = false;
+ service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
+ });
+
+ // This condition holds for the first recovery op, or when the previous
+ // recovery op was scheduled in the past; in either case the schedule time
+ // is reset so the next recovery op runs osd_recovery_sleep from now.
+
+ if (auto now = ceph::real_clock::now();
+ service.recovery_schedule_time < now) {
+ service.recovery_schedule_time = now;
+ }
+ service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
+ service.sleep_timer.add_event_at(service.recovery_schedule_time,
+ recovery_requeue_callback);
+ dout(20) << "Recovery event scheduled at "
+ << service.recovery_schedule_time << dendl;
+ return;
+ }
+ }
+
+ {
+ {
+ std::lock_guard l(service.sleep_lock);
+ service.recovery_needs_sleep = true;
+ }
+
+ if (pg->pg_has_reset_since(queued)) {
+ goto out;
+ }
+
+ dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
+#endif
+
+ bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
+ dout(10) << "do_recovery started " << started << "/" << reserved_pushes
+ << " on " << *pg << dendl;
+
+ if (do_unfound) {
+ PeeringCtx rctx = create_context();
+ rctx.handle = &handle;
+ pg->find_unfound(queued, rctx);
+ dispatch_context(rctx, pg, pg->get_osdmap());
+ }
+ }
+
+ out:
+ ceph_assert(started <= reserved_pushes);
+ service.release_reserved_pushes(reserved_pushes);
+}
+
+void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << "start_recovery_op " << *pg << " " << soid
+ << " (" << recovery_ops_active << "/"
+ << osd->get_recovery_max_active() << " rops)"
+ << dendl;
+ recovery_ops_active++;
+
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
+ ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
+ recovery_oids[pg->pg_id].insert(soid);
+#endif
+}
+
+void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << "finish_recovery_op " << *pg << " " << soid
+ << " dequeue=" << dequeue
+ << " (" << recovery_ops_active << "/"
+ << osd->get_recovery_max_active() << " rops)"
+ << dendl;
+
+ // adjust count
+ ceph_assert(recovery_ops_active > 0);
+ recovery_ops_active--;
+
+#ifdef DEBUG_RECOVERY_OIDS
+ dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
+ ceph_assert(recovery_oids[pg->pg_id].count(soid));
+ recovery_oids[pg->pg_id].erase(soid);
+#endif
+
+ _maybe_queue_recovery();
+}
+
+bool OSDService::is_recovery_active()
+{
+ if (cct->_conf->osd_debug_pretend_recovery_active) {
+ return true;
+ }
+ return local_reserver.has_reservation() || remote_reserver.has_reservation();
+}
+
+void OSDService::release_reserved_pushes(uint64_t pushes)
+{
+ std::lock_guard l(recovery_lock);
+ dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
+ << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
+ << dendl;
+ ceph_assert(recovery_ops_reserved >= pushes);
+ recovery_ops_reserved -= pushes;
+ _maybe_queue_recovery();
+}
+
+// =========================================================
+// OPS
+
+bool OSD::op_is_discardable(const MOSDOp *op)
+{
+ // drop the client request if the client is no longer connected and
+ // cannot receive the reply anyway.
+ if (!op->get_connection()->is_connected()) {
+ return true;
+ }
+ return false;
+}
+
+void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
+{
+ const utime_t stamp = op->get_req()->get_recv_stamp();
+ const utime_t latency = ceph_clock_now() - stamp;
+ const unsigned priority = op->get_req()->get_priority();
+ const int cost = op->get_req()->get_cost();
+ const uint64_t owner = op->get_req()->get_source().num();
+ const int type = op->get_req()->get_type();
+
+ dout(15) << "enqueue_op " << op << " prio " << priority
+ << " type " << type
+ << " cost " << cost
+ << " latency " << latency
+ << " epoch " << epoch
+ << " " << *(op->get_req()) << dendl;
+ op->osd_trace.event("enqueue op");
+ op->osd_trace.keyval("priority", priority);
+ op->osd_trace.keyval("cost", cost);
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ enqueue_span->Log({
+ {"priority", priority},
+ {"cost", cost},
+ {"epoch", epoch},
+ {"owner", owner},
+ {"type", type}
+ });
+ }
+#endif
+ op->mark_queued_for_pg();
+ logger->tinc(l_osd_op_before_queue_op_lat, latency);
+ if (type == MSG_OSD_PG_PUSH ||
+ type == MSG_OSD_PG_PUSH_REPLY) {
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
+ cost, priority, stamp, owner, epoch));
+ } else {
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
+ cost, priority, stamp, owner, epoch));
+ }
+}
+
+void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
+{
+ dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
+ op_shardedwq.queue(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
+ 10,
+ cct->_conf->osd_peering_op_priority,
+ utime_t(),
+ 0,
+ evt->get_epoch_sent()));
+}
+
+/*
+ * NOTE: dequeue called in worker thread, with pg lock
+ */
+void OSD::dequeue_op(
+ PGRef pg, OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ const Message *m = op->get_req();
+
+ FUNCTRACE(cct);
+ OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
+
+ utime_t now = ceph_clock_now();
+ op->set_dequeued_time(now);
+
+ utime_t latency = now - m->get_recv_stamp();
+ dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
+ << " cost " << m->get_cost()
+ << " latency " << latency
+ << " " << *m
+ << " pg " << *pg << dendl;
+
+ logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
+
+ service.maybe_share_map(m->get_connection().get(),
+ pg->get_osdmap(),
+ op->sent_epoch);
+
+ if (pg->is_deleting())
+ return;
+
+ op->mark_reached_pg();
+ op->osd_trace.event("dequeue_op");
+
+ pg->do_request(op, handle);
+
+ // finish
+ dout(10) << "dequeue_op " << op << " finish" << dendl;
+ OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
+}
+
+
+void OSD::dequeue_peering_evt(
+ OSDShard *sdata,
+ PG *pg,
+ PGPeeringEventRef evt,
+ ThreadPool::TPHandle& handle)
+{
+ PeeringCtx rctx = create_context();
+ auto curmap = sdata->get_osdmap();
+ bool need_up_thru = false;
+ epoch_t same_interval_since = 0;
+ if (!pg) {
+ if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
+ handle_pg_query_nopg(*q);
+ } else {
+ derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
+ ceph_abort();
+ }
+ } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
+ pg->do_peering_event(evt, rctx);
+ if (pg->is_deleted()) {
+ pg->unlock();
+ return;
+ }
+ dispatch_context(rctx, pg, curmap, &handle);
+ need_up_thru = pg->get_need_up_thru();
+ same_interval_since = pg->get_same_interval_since();
+ pg->unlock();
+ }
+
+ if (need_up_thru) {
+ queue_want_up_thru(same_interval_since);
+ }
+
+ service.send_pg_temp();
+}
+
+void OSD::dequeue_delete(
+ OSDShard *sdata,
+ PG *pg,
+ epoch_t e,
+ ThreadPool::TPHandle& handle)
+{
+ dequeue_peering_evt(
+ sdata,
+ pg,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ e, e,
+ PeeringState::DeleteSome())),
+ handle);
+}
+
+
+
+// --------------------------------
+
+const char** OSD::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_max_backfills",
+ "osd_min_recovery_priority",
+ "osd_max_trimming_pgs",
+ "osd_op_complaint_time",
+ "osd_op_log_threshold",
+ "osd_op_history_size",
+ "osd_op_history_duration",
+ "osd_op_history_slow_op_size",
+ "osd_op_history_slow_op_threshold",
+ "osd_enable_op_tracker",
+ "osd_map_cache_size",
+ "osd_pg_epoch_max_lag_factor",
+ "osd_pg_epoch_persisted_max_stale",
+ "osd_recovery_sleep",
+ "osd_recovery_sleep_hdd",
+ "osd_recovery_sleep_ssd",
+ "osd_recovery_sleep_hybrid",
+ "osd_delete_sleep",
+ "osd_delete_sleep_hdd",
+ "osd_delete_sleep_ssd",
+ "osd_delete_sleep_hybrid",
+ "osd_snap_trim_sleep",
+ "osd_snap_trim_sleep_hdd",
+ "osd_snap_trim_sleep_ssd",
+ "osd_snap_trim_sleep_hybrid"
+ "osd_scrub_sleep",
+ "osd_recovery_max_active",
+ "osd_recovery_max_active_hdd",
+ "osd_recovery_max_active_ssd",
+ // clog & admin clog
+ "clog_to_monitors",
+ "clog_to_syslog",
+ "clog_to_syslog_facility",
+ "clog_to_syslog_level",
+ "osd_objectstore_fuse",
+ "clog_to_graylog",
+ "clog_to_graylog_host",
+ "clog_to_graylog_port",
+ "host",
+ "fsid",
+ "osd_recovery_delay_start",
+ "osd_client_message_size_cap",
+ "osd_client_message_cap",
+ "osd_heartbeat_min_size",
+ "osd_heartbeat_interval",
+ "osd_object_clean_region_max_num_intervals",
+ "osd_scrub_min_interval",
+ "osd_scrub_max_interval",
+ NULL
+ };
+ return KEYS;
+}
+
+void OSD::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ std::lock_guard l{osd_lock};
+
+ if (changed.count("osd_max_backfills") ||
+ changed.count("osd_delete_sleep") ||
+ changed.count("osd_delete_sleep_hdd") ||
+ changed.count("osd_delete_sleep_ssd") ||
+ changed.count("osd_delete_sleep_hybrid") ||
+ changed.count("osd_snap_trim_sleep") ||
+ changed.count("osd_snap_trim_sleep_hdd") ||
+ changed.count("osd_snap_trim_sleep_ssd") ||
+ changed.count("osd_snap_trim_sleep_hybrid") ||
+ changed.count("osd_scrub_sleep") ||
+ changed.count("osd_recovery_sleep") ||
+ changed.count("osd_recovery_sleep_hdd") ||
+ changed.count("osd_recovery_sleep_ssd") ||
+ changed.count("osd_recovery_sleep_hybrid") ||
+ changed.count("osd_recovery_max_active") ||
+ changed.count("osd_recovery_max_active_hdd") ||
+ changed.count("osd_recovery_max_active_ssd")) {
+ if (!maybe_override_options_for_qos() &&
+ changed.count("osd_max_backfills")) {
+ // Scheduler is not "mclock". Fallback to earlier behavior
+ service.local_reserver.set_max(cct->_conf->osd_max_backfills);
+ service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
+ }
+ }
+ if (changed.count("osd_min_recovery_priority")) {
+ service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
+ service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
+ }
+ if (changed.count("osd_max_trimming_pgs")) {
+ service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
+ }
+ if (changed.count("osd_op_complaint_time") ||
+ changed.count("osd_op_log_threshold")) {
+ op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+ cct->_conf->osd_op_log_threshold);
+ }
+ if (changed.count("osd_op_history_size") ||
+ changed.count("osd_op_history_duration")) {
+ op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+ cct->_conf->osd_op_history_duration);
+ }
+ if (changed.count("osd_op_history_slow_op_size") ||
+ changed.count("osd_op_history_slow_op_threshold")) {
+ op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
+ cct->_conf->osd_op_history_slow_op_threshold);
+ }
+ if (changed.count("osd_enable_op_tracker")) {
+ op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
+ }
+ if (changed.count("osd_map_cache_size")) {
+ service.map_cache.set_size(cct->_conf->osd_map_cache_size);
+ service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
+ service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
+ }
+ if (changed.count("clog_to_monitors") ||
+ changed.count("clog_to_syslog") ||
+ changed.count("clog_to_syslog_level") ||
+ changed.count("clog_to_syslog_facility") ||
+ changed.count("clog_to_graylog") ||
+ changed.count("clog_to_graylog_host") ||
+ changed.count("clog_to_graylog_port") ||
+ changed.count("host") ||
+ changed.count("fsid")) {
+ update_log_config();
+ }
+ if (changed.count("osd_pg_epoch_max_lag_factor")) {
+ m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
+ "osd_pg_epoch_max_lag_factor");
+ }
+
+#ifdef HAVE_LIBFUSE
+ if (changed.count("osd_objectstore_fuse")) {
+ if (store) {
+ enable_disable_fuse(false);
+ }
+ }
+#endif
+
+ if (changed.count("osd_recovery_delay_start")) {
+ service.defer_recovery(cct->_conf->osd_recovery_delay_start);
+ service.kick_recovery_queue();
+ }
+
+ if (changed.count("osd_client_message_cap")) {
+ uint64_t newval = cct->_conf->osd_client_message_cap;
+ Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
+ if (pol.throttler_messages && newval > 0) {
+ pol.throttler_messages->reset_max(newval);
+ }
+ }
+ if (changed.count("osd_client_message_size_cap")) {
+ uint64_t newval = cct->_conf->osd_client_message_size_cap;
+ Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
+ if (pol.throttler_bytes && newval > 0) {
+ pol.throttler_bytes->reset_max(newval);
+ }
+ }
+ if (changed.count("osd_object_clean_region_max_num_intervals")) {
+ ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
+ }
+
+ if (changed.count("osd_scrub_min_interval") ||
+ changed.count("osd_scrub_max_interval")) {
+ resched_all_scrubs();
+ dout(0) << __func__ << ": scrub interval change" << dendl;
+ }
+ check_config();
+ if (changed.count("osd_asio_thread_count")) {
+ service.poolctx.stop();
+ service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
+ }
+}
+
+void OSD::maybe_override_max_osd_capacity_for_qos()
+{
+ // If the scheduler enabled is mclock, override the default
+ // osd capacity with the value obtained from running the
+ // osd bench test. This is later used to setup mclock.
+ if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
+ (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false)) {
+ std::string max_capacity_iops_config;
+ bool force_run_benchmark =
+ cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
+
+ if (store_is_rotational) {
+ max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
+ } else {
+ max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
+ }
+
+ if (!force_run_benchmark) {
+ double default_iops = 0.0;
+
+ // Get the current osd iops capacity
+ double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
+
+ // Get the default max iops capacity
+ auto val = cct->_conf.get_val_default(max_capacity_iops_config);
+ if (!val.has_value()) {
+ derr << __func__ << " Unable to determine default value of "
+ << max_capacity_iops_config << dendl;
+ // Cannot determine default iops. Force a run of the OSD benchmark.
+ force_run_benchmark = true;
+ } else {
+ // Default iops
+ default_iops = std::stod(val.value());
+ }
+
+ // Determine if we really need to run the osd benchmark
+ if (!force_run_benchmark && (default_iops != cur_iops)) {
+ dout(1) << __func__ << std::fixed << std::setprecision(2)
+ << " default_iops: " << default_iops
+ << " cur_iops: " << cur_iops
+ << ". Skip OSD benchmark test." << dendl;
+ return;
+ }
+ }
+
+ // Run osd bench: write 100 4MiB objects with blocksize 4KiB
+ int64_t count = 12288000; // Count of bytes to write
+ int64_t bsize = 4096; // Block size
+ int64_t osize = 4194304; // Object size
+ int64_t onum = 100; // Count of objects to write
+ double elapsed = 0.0; // Time taken to complete the test
+ double iops = 0.0;
+ stringstream ss;
+ int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+ if (ret != 0) {
+ derr << __func__
+ << " osd bench err: " << ret
+ << " osd bench errstr: " << ss.str()
+ << dendl;
+ return;
+ }
+
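+ // Illustrative arithmetic (hypothetical elapsed time): if the bench
+ // wrote count = 12288000 bytes in elapsed = 2.0 seconds, then
+ // rate = 12288000 / 2.0 = 6144000 bytes/sec (~5.86 MiB/sec) and
+ // iops = 6144000 / 4096 = 1500, which is what gets persisted below.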
+ double rate = count / elapsed;
+ iops = rate / bsize;
+ dout(1) << __func__
+ << " osd bench result -"
+ << std::fixed << std::setprecision(3)
+ << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
+ << " iops: " << iops
+ << " elapsed_sec: " << elapsed
+ << dendl;
+
+ // Persist iops to the MON store
+ ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
+ if (ret < 0) {
+ // Fallback to setting the config within the in-memory "values" map.
+ cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
+ }
+
+ // Override the max osd capacity for all shards
+ for (auto& shard : shards) {
+ shard->update_scheduler_config();
+ }
+ }
+}
+
+bool OSD::maybe_override_options_for_qos()
+{
+ // If the scheduler enabled is mclock, override the recovery, backfill
+ // and sleep options so that mclock can meet the QoS goals.
+ if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
+ dout(1) << __func__
+ << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
+
+ // Set high value for recovery max active
+ uint32_t rec_max_active = 1000;
+ cct->_conf.set_val(
+ "osd_recovery_max_active", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
+ cct->_conf.set_val(
+ "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
+
+ // Set high value for osd_max_backfill
+ uint32_t max_backfills = 1000;
+ cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
+ service.local_reserver.set_max(max_backfills);
+ service.remote_reserver.set_max(max_backfills);
+
+ // Disable recovery sleep
+ cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
+
+ // Disable delete sleep
+ cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
+
+ // Disable snap trim sleep
+ cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
+ cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
+
+ // Disable scrub sleep
+ cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
+ return true;
+ }
+ return false;
+}
+
+int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
+{
+ std::string cmd =
+ "{"
+ "\"prefix\": \"config set\", "
+ "\"who\": \"osd." + std::to_string(whoami) + "\", "
+ "\"name\": \"" + key + "\", "
+ "\"value\": \"" + val + "\""
+ "}";
+
+ vector<std::string> vcmd{cmd};
+ bufferlist inbl;
+ std::string outs;
+ C_SaferCond cond;
+ monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
+ int r = cond.wait();
+ if (r < 0) {
+ derr << __func__ << " Failed to set config key " << key
+ << " err: " << cpp_strerror(r)
+ << " errstr: " << outs << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+void OSD::update_log_config()
+{
+ map<string,string> log_to_monitors;
+ map<string,string> log_to_syslog;
+ map<string,string> log_channel;
+ map<string,string> log_prio;
+ map<string,string> log_to_graylog;
+ map<string,string> log_to_graylog_host;
+ map<string,string> log_to_graylog_port;
+ uuid_d fsid;
+ string host;
+
+ if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
+ log_channel, log_prio, log_to_graylog,
+ log_to_graylog_host, log_to_graylog_port,
+ fsid, host) == 0)
+ clog->update_config(log_to_monitors, log_to_syslog,
+ log_channel, log_prio, log_to_graylog,
+ log_to_graylog_host, log_to_graylog_port,
+ fsid, host);
+ derr << "log_to_monitors " << log_to_monitors << dendl;
+}
+
+void OSD::check_config()
+{
+ // some sanity checks
+ if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
+ clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
+ << " is not > osd_pg_epoch_persisted_max_stale ("
+ << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
+ }
+ if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
+ clog->warn() << "osd_object_clean_region_max_num_intervals ("
+ << cct->_conf->osd_object_clean_region_max_num_intervals
+ << ") is < 0";
+ }
+}
+
+// --------------------------------
+
+void OSD::get_latest_osdmap()
+{
+ dout(10) << __func__ << " -- start" << dendl;
+
+ boost::system::error_code ec;
+ service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
+
+ dout(10) << __func__ << " -- finish" << dendl;
+}
+
+// --------------------------------
+
+void OSD::set_perf_queries(const ConfigPayload &config_payload) {
+ const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
+ const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
+ dout(10) << "setting " << queries.size() << " queries" << dendl;
+
+ std::list<OSDPerfMetricQuery> supported_queries;
+ for (auto &it : queries) {
+ auto &query = it.first;
+ if (!query.key_descriptor.empty()) {
+ supported_queries.push_back(query);
+ }
+ }
+ if (supported_queries.size() < queries.size()) {
+ dout(1) << queries.size() - supported_queries.size()
+ << " unsupported queries" << dendl;
+ }
+ {
+ std::lock_guard locker{m_perf_queries_lock};
+ m_perf_queries = supported_queries;
+ m_perf_limits = queries;
+ }
+ std::vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto& pg : pgs) {
+ std::scoped_lock l{*pg};
+ pg->set_dynamic_perf_stats_queries(supported_queries);
+ }
+}
+
+MetricPayload OSD::get_perf_reports() {
+ OSDMetricPayload payload;
+ std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
+
+ std::vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ DynamicPerfStats dps;
+ for (auto& pg : pgs) {
+ // m_perf_queries can be modified only in set_perf_queries by mgr client
+ // request, and it is protected by the mgr client's lock, which is held
+ // when set_perf_queries/get_perf_reports are called, so we do not need
+ // to hold m_perf_queries_lock here.
+ DynamicPerfStats pg_dps(m_perf_queries);
+ pg->lock();
+ pg->get_dynamic_perf_stats(&pg_dps);
+ pg->unlock();
+ dps.merge(pg_dps);
+ }
+ dps.add_to_reports(m_perf_limits, &reports);
+ dout(20) << "reports for " << reports.size() << " queries" << dendl;
+
+ return payload;
+}
+
+// =============================================================
+
+#undef dout_context
+#define dout_context cct
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
+
+void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
+{
+ dout(10) << pg->pg_id << " " << pg << dendl;
+ slot->pg = pg;
+ pg->osd_shard = this;
+ pg->pg_slot = slot;
+ osd->inc_num_pgs();
+
+ slot->epoch = pg->get_osdmap_epoch();
+ pg_slots_by_epoch.insert(*slot);
+}
+
+void OSDShard::_detach_pg(OSDShardPGSlot *slot)
+{
+ dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
+ slot->pg->osd_shard = nullptr;
+ slot->pg->pg_slot = nullptr;
+ slot->pg = nullptr;
+ osd->dec_num_pgs();
+
+ pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
+ slot->epoch = 0;
+ if (waiting_for_min_pg_epoch) {
+ min_pg_epoch_cond.notify_all();
+ }
+}
+
+void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
+{
+ std::lock_guard l(shard_lock);
+ dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
+ << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
+ pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
+ dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
+ slot->epoch = e;
+ pg_slots_by_epoch.insert(*slot);
+ dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
+ << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
+ if (waiting_for_min_pg_epoch) {
+ min_pg_epoch_cond.notify_all();
+ }
+}
+
+epoch_t OSDShard::get_min_pg_epoch()
+{
+ std::lock_guard l(shard_lock);
+ auto p = pg_slots_by_epoch.begin();
+ if (p == pg_slots_by_epoch.end()) {
+ return 0;
+ }
+ return p->epoch;
+}
+
+void OSDShard::wait_min_pg_epoch(epoch_t need)
+{
+ std::unique_lock l{shard_lock};
+ ++waiting_for_min_pg_epoch;
+ min_pg_epoch_cond.wait(l, [need, this] {
+ if (pg_slots_by_epoch.empty()) {
+ return true;
+ } else if (pg_slots_by_epoch.begin()->epoch >= need) {
+ return true;
+ } else {
+ dout(10) << need << " waiting on "
+ << pg_slots_by_epoch.begin()->epoch << dendl;
+ return false;
+ }
+ });
+ --waiting_for_min_pg_epoch;
+}
+
+epoch_t OSDShard::get_max_waiting_epoch()
+{
+ std::lock_guard l(shard_lock);
+ epoch_t r = 0;
+ for (auto& i : pg_slots) {
+ if (!i.second->waiting_peering.empty()) {
+ r = std::max(r, i.second->waiting_peering.rbegin()->first);
+ }
+ }
+ return r;
+}
+
+void OSDShard::consume_map(
+ const OSDMapRef& new_osdmap,
+ unsigned *pushes_to_free)
+{
+ std::lock_guard l(shard_lock);
+ OSDMapRef old_osdmap;
+ {
+ std::lock_guard l(osdmap_lock);
+ old_osdmap = std::move(shard_osdmap);
+ shard_osdmap = new_osdmap;
+ }
+ dout(10) << new_osdmap->get_epoch()
+ << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
+ << dendl;
+ bool queued = false;
+
+ // check slots
+ auto p = pg_slots.begin();
+ while (p != pg_slots.end()) {
+ OSDShardPGSlot *slot = p->second.get();
+ const spg_t& pgid = p->first;
+ dout(20) << __func__ << " " << pgid << dendl;
+ if (!slot->waiting_for_split.empty()) {
+ dout(20) << __func__ << " " << pgid
+ << " waiting for split " << slot->waiting_for_split << dendl;
+ ++p;
+ continue;
+ }
+ if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << pgid
+ << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
+ << dendl;
+ ++p;
+ continue;
+ }
+ if (!slot->waiting_peering.empty()) {
+ epoch_t first = slot->waiting_peering.begin()->first;
+ if (first <= new_osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << pgid
+ << " pending_peering first epoch " << first
+ << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
+ _wake_pg_slot(pgid, slot);
+ queued = true;
+ }
+ ++p;
+ continue;
+ }
+ if (!slot->waiting.empty()) {
+ if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
+ dout(20) << __func__ << " " << pgid << " maps to us, keeping"
+ << dendl;
+ ++p;
+ continue;
+ }
+ while (!slot->waiting.empty() &&
+ slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
+ auto& qi = slot->waiting.front();
+ dout(20) << __func__ << " " << pgid
+ << " waiting item " << qi
+ << " epoch " << qi.get_map_epoch()
+ << " <= " << new_osdmap->get_epoch()
+ << ", "
+ << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
+ "misdirected")
+ << ", dropping" << dendl;
+ *pushes_to_free += qi.get_reserved_pushes();
+ slot->waiting.pop_front();
+ }
+ }
+ if (slot->waiting.empty() &&
+ slot->num_running == 0 &&
+ slot->waiting_for_split.empty() &&
+ !slot->pg) {
+ dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
+ p = pg_slots.erase(p);
+ continue;
+ }
+
+ ++p;
+ }
+ if (queued) {
+ std::lock_guard l{sdata_wait_lock};
+ sdata_cond.notify_one();
+ }
+}
+
+void OSDShard::_wake_pg_slot(
+ spg_t pgid,
+ OSDShardPGSlot *slot)
+{
+ dout(20) << __func__ << " " << pgid
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering << dendl;
+ for (auto i = slot->to_process.rbegin();
+ i != slot->to_process.rend();
+ ++i) {
+ scheduler->enqueue_front(std::move(*i));
+ }
+ slot->to_process.clear();
+ for (auto i = slot->waiting.rbegin();
+ i != slot->waiting.rend();
+ ++i) {
+ scheduler->enqueue_front(std::move(*i));
+ }
+ slot->waiting.clear();
+ for (auto i = slot->waiting_peering.rbegin();
+ i != slot->waiting_peering.rend();
+ ++i) {
+ // this is overkill; we requeue everything, even if some of these
+ // items are waiting for maps we don't have yet. FIXME, maybe,
+ // someday, if we decide this inefficiency matters
+ for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
+ scheduler->enqueue_front(std::move(*j));
+ }
+ }
+ slot->waiting_peering.clear();
+ ++slot->requeue_seq;
+}
+
+void OSDShard::identify_splits_and_merges(
+ const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *split_pgs,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ std::lock_guard l(shard_lock);
+ if (shard_osdmap) {
+ for (auto& i : pg_slots) {
+ const spg_t& pgid = i.first;
+ auto *slot = i.second.get();
+ if (slot->pg) {
+ osd->service.identify_splits_and_merges(
+ shard_osdmap, as_of_osdmap, pgid,
+ split_pgs, merge_pgs);
+ } else if (!slot->waiting_for_split.empty()) {
+ osd->service.identify_splits_and_merges(
+ shard_osdmap, as_of_osdmap, pgid,
+ split_pgs, nullptr);
+ } else {
+ dout(20) << __func__ << " slot " << pgid
+ << " has no pg and waiting_for_split " << dendl;
+ }
+ }
+ }
+}
+
+void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *pgids)
+{
+ std::lock_guard l(shard_lock);
+ _prime_splits(pgids);
+ if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
+ set<pair<spg_t,epoch_t>> newer_children;
+ for (auto i : *pgids) {
+ osd->service.identify_splits_and_merges(
+ as_of_osdmap, shard_osdmap, i.first,
+ &newer_children, nullptr);
+ }
+ newer_children.insert(pgids->begin(), pgids->end());
+ dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
+ << shard_osdmap->get_epoch() << ", new children " << newer_children
+ << dendl;
+ _prime_splits(&newer_children);
+ // note: we don't care what is left over here for other shards.
+ // if this shard is ahead of us and one isn't, e.g., one thread is
+ // calling into prime_splits via _process (due to a newly created
+ // pg) and this shard has a newer map due to a racing consume_map,
+ // then any grandchildren left here will be identified (or were
+ // identified) when the slower shard's osdmap is advanced.
+ // _prime_splits() will tolerate the case where the pgid is
+ // already primed.
+ }
+}
+
+void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
+{
+ dout(10) << *pgids << dendl;
+ auto p = pgids->begin();
+ while (p != pgids->end()) {
+ unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
+ if (shard_index == shard_id) {
+ auto r = pg_slots.emplace(p->first, nullptr);
+ if (r.second) {
+ dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
+ r.first->second = make_unique<OSDShardPGSlot>();
+ r.first->second->waiting_for_split.insert(p->second);
+ } else {
+ auto q = r.first;
+ ceph_assert(q != pg_slots.end());
+ dout(10) << "priming (existing) slot " << p->first << " e" << p->second
+ << dendl;
+ q->second->waiting_for_split.insert(p->second);
+ }
+ p = pgids->erase(p);
+ } else {
+ ++p;
+ }
+ }
+}
+
+void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
+ set<pair<spg_t,epoch_t>> *merge_pgs)
+{
+ std::lock_guard l(shard_lock);
+ dout(20) << __func__ << " checking shard " << shard_id
+ << " for remaining merge pgs " << merge_pgs << dendl;
+ auto p = merge_pgs->begin();
+ while (p != merge_pgs->end()) {
+ spg_t pgid = p->first;
+ epoch_t epoch = p->second;
+ unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
+ if (shard_index != shard_id) {
+ ++p;
+ continue;
+ }
+ OSDShardPGSlot *slot;
+ auto r = pg_slots.emplace(pgid, nullptr);
+ if (r.second) {
+ r.first->second = make_unique<OSDShardPGSlot>();
+ }
+ slot = r.first->second.get();
+ if (slot->pg) {
+ // already have pg
+ dout(20) << __func__ << " have merge participant pg " << pgid
+ << " " << slot->pg << dendl;
+ } else if (!slot->waiting_for_split.empty() &&
+ *slot->waiting_for_split.begin() < epoch) {
+ dout(20) << __func__ << " pending split on merge participant pg " << pgid
+ << " " << slot->waiting_for_split << dendl;
+ } else {
+ dout(20) << __func__ << " creating empty merge participant " << pgid
+ << " for merge in " << epoch << dendl;
+ // leave history zeroed; PG::merge_from() will fill it in.
+ pg_history_t history;
+ PGCreateInfo cinfo(pgid, epoch - 1,
+ history, PastIntervals(), false);
+ PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
+ _attach_pg(r.first->second.get(), pg.get());
+ _wake_pg_slot(pgid, slot);
+ pg->unlock();
+ }
+ // mark slot for merge
+ dout(20) << __func__ << " marking merge participant " << pgid << dendl;
+ slot->waiting_for_merge_epoch = epoch;
+ p = merge_pgs->erase(p);
+ }
+}
+
+void OSDShard::register_and_wake_split_child(PG *pg)
+{
+ epoch_t epoch;
+ {
+ std::lock_guard l(shard_lock);
+ dout(10) << pg->pg_id << " " << pg << dendl;
+ auto p = pg_slots.find(pg->pg_id);
+ ceph_assert(p != pg_slots.end());
+ auto *slot = p->second.get();
+ dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
+ << dendl;
+ ceph_assert(!slot->pg);
+ ceph_assert(!slot->waiting_for_split.empty());
+ _attach_pg(slot, pg);
+
+ epoch = pg->get_osdmap_epoch();
+ ceph_assert(slot->waiting_for_split.count(epoch));
+ slot->waiting_for_split.erase(epoch);
+ if (slot->waiting_for_split.empty()) {
+ _wake_pg_slot(pg->pg_id, slot);
+ } else {
+ dout(10) << __func__ << " still waiting for split on "
+ << slot->waiting_for_split << dendl;
+ }
+ }
+
+ // kick child to ensure it pulls up to the latest osdmap
+ osd->enqueue_peering_evt(
+ pg->pg_id,
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ epoch,
+ epoch,
+ NullEvt())));
+
+ std::lock_guard l{sdata_wait_lock};
+ sdata_cond.notify_one();
+}
+
+void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
+{
+ std::lock_guard l(shard_lock);
+ vector<spg_t> to_delete;
+ for (auto& i : pg_slots) {
+ if (i.first != parent &&
+ i.first.get_ancestor(old_pg_num) == parent) {
+ dout(10) << __func__ << " parent " << parent << " clearing " << i.first
+ << dendl;
+ _wake_pg_slot(i.first, i.second.get());
+ to_delete.push_back(i.first);
+ }
+ }
+ for (auto pgid : to_delete) {
+ pg_slots.erase(pgid);
+ }
+}
+
+void OSDShard::update_scheduler_config()
+{
+ std::lock_guard l(shard_lock);
+ scheduler->update_configuration();
+}
+
+OSDShard::OSDShard(
+ int id,
+ CephContext *cct,
+ OSD *osd)
+ : shard_id(id),
+ cct(cct),
+ osd(osd),
+ shard_name(string("OSDShard.") + stringify(id)),
+ sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
+ sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
+ osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
+ shard_lock_name(shard_name + "::shard_lock"),
+ shard_lock{make_mutex(shard_lock_name)},
+ scheduler(ceph::osd::scheduler::make_scheduler(
+ cct, osd->num_shards, osd->store->is_rotational())),
+ context_queue(sdata_wait_lock, sdata_cond)
+{
+ dout(0) << "using op scheduler " << *scheduler << dendl;
+}
+
+
+// =============================================================
+
+#undef dout_context
+#define dout_context osd->cct
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
+
+void OSD::ShardedOpWQ::_add_slot_waiter(
+ spg_t pgid,
+ OSDShardPGSlot *slot,
+ OpSchedulerItem&& qi)
+{
+ if (qi.is_peering()) {
+ dout(20) << __func__ << " " << pgid
+ << " peering, item epoch is "
+ << qi.get_map_epoch()
+ << ", will wait on " << qi << dendl;
+ slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
+ } else {
+ dout(20) << __func__ << " " << pgid
+ << " item epoch is "
+ << qi.get_map_epoch()
+ << ", will wait on " << qi << dendl;
+ slot->waiting.push_back(std::move(qi));
+ }
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
+
+void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
+{
+ uint32_t shard_index = thread_index % osd->num_shards;
+ auto& sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+
+ // If every thread of a shard handled oncommits, they could complete
+ // out of order. So only the thread with the smallest thread_index
+ // (thread_index < num_shards) for each shard runs the oncommit
+ // callbacks.
+ bool is_smallest_thread_index = thread_index < osd->num_shards;
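+ // Illustrative example (hypothetical sizing): with num_shards = 8 and
+ // two worker threads per shard (thread_index 0..15), only threads 0..7
+ // satisfy thread_index < num_shards and therefore drain the per-shard
+ // context_queue below.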
+
+ // peek at spg_t
+ sdata->shard_lock.lock();
+ if (sdata->scheduler->empty() &&
+ (!is_smallest_thread_index || sdata->context_queue.empty())) {
+ std::unique_lock wait_lock{sdata->sdata_wait_lock};
+ if (is_smallest_thread_index && !sdata->context_queue.empty()) {
+ // we raced with a context_queue addition, don't wait
+ wait_lock.unlock();
+ } else if (!sdata->stop_waiting) {
+ dout(20) << __func__ << " empty q, waiting" << dendl;
+ osd->cct->get_heartbeat_map()->clear_timeout(hb);
+ sdata->shard_lock.unlock();
+ sdata->sdata_cond.wait(wait_lock);
+ wait_lock.unlock();
+ sdata->shard_lock.lock();
+ if (sdata->scheduler->empty() &&
+ !(is_smallest_thread_index && !sdata->context_queue.empty())) {
+ sdata->shard_lock.unlock();
+ return;
+ }
+ // found a work item; reapply default wq timeouts
+ osd->cct->get_heartbeat_map()->reset_timeout(hb,
+ timeout_interval, suicide_interval);
+ } else {
+ dout(20) << __func__ << " need return immediately" << dendl;
+ wait_lock.unlock();
+ sdata->shard_lock.unlock();
+ return;
+ }
+ }
+
+ list<Context *> oncommits;
+ if (is_smallest_thread_index) {
+ sdata->context_queue.move_to(oncommits);
+ }
+
+ WorkItem work_item;
+ while (!std::get_if<OpSchedulerItem>(&work_item)) {
+ if (sdata->scheduler->empty()) {
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+
+ work_item = sdata->scheduler->dequeue();
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+
+ // If the work item is scheduled in the future, wait until
+ // the time returned in the dequeue response before retrying.
+ if (auto when_ready = std::get_if<double>(&work_item)) {
+ if (is_smallest_thread_index) {
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ std::unique_lock wait_lock{sdata->sdata_wait_lock};
+ auto future_time = ceph::real_clock::from_double(*when_ready);
+ dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
+ // Disable heartbeat timeout until we find a non-future work item to process.
+ osd->cct->get_heartbeat_map()->clear_timeout(hb);
+ sdata->shard_lock.unlock();
+ ++sdata->waiting_threads;
+ sdata->sdata_cond.wait_until(wait_lock, future_time);
+ --sdata->waiting_threads;
+ wait_lock.unlock();
+ sdata->shard_lock.lock();
+ // Reapply default wq timeouts
+ osd->cct->get_heartbeat_map()->reset_timeout(hb,
+ timeout_interval, suicide_interval);
+ }
+ } // while
+
+ // Access the stored item
+ auto item = std::move(std::get<OpSchedulerItem>(work_item));
+ if (osd->is_stopping()) {
+ sdata->shard_lock.unlock();
+ for (auto c : oncommits) {
+ dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
+ delete c;
+ }
+ return; // OSD shutdown, discard.
+ }
+
+ const auto token = item.get_ordering_token();
+ auto r = sdata->pg_slots.emplace(token, nullptr);
+ if (r.second) {
+ r.first->second = make_unique<OSDShardPGSlot>();
+ }
+ OSDShardPGSlot *slot = r.first->second.get();
+ dout(20) << __func__ << " " << token
+ << (r.second ? " (new)" : "")
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering
+ << dendl;
+ slot->to_process.push_back(std::move(item));
+ dout(20) << __func__ << " " << slot->to_process.back()
+ << " queued" << dendl;
+
+ retry_pg:
+ PGRef pg = slot->pg;
+
+ // lock pg (if we have it)
+ if (pg) {
+ // note the requeue seq now...
+ uint64_t requeue_seq = slot->requeue_seq;
+ ++slot->num_running;
+
+ sdata->shard_lock.unlock();
+ osd->service.maybe_inject_dispatch_delay();
+ pg->lock();
+ osd->service.maybe_inject_dispatch_delay();
+ sdata->shard_lock.lock();
+
+ auto q = sdata->pg_slots.find(token);
+ if (q == sdata->pg_slots.end()) {
+ // this can happen if we race with pg removal.
+ dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ slot = q->second.get();
+ --slot->num_running;
+
+ if (slot->to_process.empty()) {
+ // raced with _wake_pg_slot or consume_map
+ dout(20) << __func__ << " " << token
+ << " nothing queued" << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (requeue_seq != slot->requeue_seq) {
+ dout(20) << __func__ << " " << token
+ << " requeue_seq " << slot->requeue_seq << " > our "
+ << requeue_seq << ", we raced with _wake_pg_slot"
+ << dendl;
+ pg->unlock();
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (slot->pg != pg) {
+ // this can happen if we race with pg removal.
+ dout(20) << __func__ << " slot " << token << " no longer attached to "
+ << pg << dendl;
+ pg->unlock();
+ goto retry_pg;
+ }
+ }
+
+ dout(20) << __func__ << " " << token
+ << " to_process " << slot->to_process
+ << " waiting " << slot->waiting
+ << " waiting_peering " << slot->waiting_peering << dendl;
+
+ ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
+ suicide_interval);
+
+ // take next item
+ auto qi = std::move(slot->to_process.front());
+ slot->to_process.pop_front();
+ dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
+ set<pair<spg_t,epoch_t>> new_children;
+ OSDMapRef osdmap;
+
+ while (!pg) {
+ // should this pg shard exist on this osd in this (or a later) epoch?
+ osdmap = sdata->shard_osdmap;
+ const PGCreateInfo *create_info = qi.creates_pg();
+ if (!slot->waiting_for_split.empty()) {
+ dout(20) << __func__ << " " << token
+ << " splitting " << slot->waiting_for_split << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
+ dout(20) << __func__ << " " << token
+ << " map " << qi.get_map_epoch() << " > "
+ << osdmap->get_epoch() << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else if (qi.is_peering()) {
+ if (!qi.peering_requires_pg()) {
+ // for pg-less events, we run them under the ordering lock, since
+ // we don't have the pg lock to keep them ordered.
+ qi.run(osd, sdata, pg, tp_handle);
+ } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
+ if (create_info) {
+ if (create_info->by_mon &&
+ osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
+ dout(20) << __func__ << " " << token
+ << " no pg, no longer primary, ignoring mon create on "
+ << qi << dendl;
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, should create on " << qi << dendl;
+ pg = osd->handle_pg_create_info(osdmap, create_info);
+ if (pg) {
+ // we created the pg! drop out and continue "normally"!
+ sdata->_attach_pg(slot, pg.get());
+ sdata->_wake_pg_slot(token, slot);
+
+ // identify split children between create epoch and shard epoch.
+ osd->service.identify_splits_and_merges(
+ pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
+ sdata->_prime_splits(&new_children);
+ // distribute remaining split children to other shards below!
+ break;
+ }
+ dout(20) << __func__ << " ignored create on " << qi << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, peering, !create, discarding " << qi << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
+ << ", discarding " << qi
+ << dendl;
+ }
+ } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
+ dout(20) << __func__ << " " << token
+ << " no pg, should exist e" << osdmap->get_epoch()
+ << ", will wait on " << qi << dendl;
+ _add_slot_waiter(token, slot, std::move(qi));
+ } else {
+ dout(20) << __func__ << " " << token
+ << " no pg, shouldn't exist e" << osdmap->get_epoch()
+ << ", dropping " << qi << dendl;
+ // share map with client?
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
+ sdata->shard_osdmap,
+ (*_op)->sent_epoch);
+ }
+ unsigned pushes_to_free = qi.get_reserved_pushes();
+ if (pushes_to_free > 0) {
+ sdata->shard_lock.unlock();
+ osd->service.release_reserved_pushes(pushes_to_free);
+ handle_oncommits(oncommits);
+ return;
+ }
+ }
+ sdata->shard_lock.unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ if (qi.is_peering()) {
+ OSDMapRef osdmap = sdata->shard_osdmap;
+ if (qi.get_map_epoch() > osdmap->get_epoch()) {
+ _add_slot_waiter(token, slot, std::move(qi));
+ sdata->shard_lock.unlock();
+ pg->unlock();
+ handle_oncommits(oncommits);
+ return;
+ }
+ }
+ sdata->shard_lock.unlock();
+
+ if (!new_children.empty()) {
+ for (auto shard : osd->shards) {
+ shard->prime_splits(osdmap, &new_children);
+ }
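+ // Each shard's prime_splits() erases the children that hash to that
+ // shard, so after every shard has been visited the set should be
+ // empty, which the assert below checks.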
+ ceph_assert(new_children.empty());
+ }
+
+ // osd_opwq_process marks the point at which an operation has been dequeued
+ // and will begin to be handled by a worker thread.
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid;
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ reqid = (*_op)->get_reqid();
+ }
+#endif
+ tracepoint(osd, opwq_process_start, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
+ Formatter *f = Formatter::create("json");
+ f->open_object_section("q");
+ dump(f);
+ f->close_section();
+ f->flush(*_dout);
+ delete f;
+ *_dout << dendl;
+
+ qi.run(osd, sdata, pg, tp_handle);
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid;
+ if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
+ reqid = (*_op)->get_reqid();
+ }
+#endif
+ tracepoint(osd, opwq_process_finish, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ handle_oncommits(oncommits);
+}
+
+void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
+ uint32_t shard_index =
+ item.get_ordering_token().hash_to_shard(osd->shards.size());
+
+ dout(20) << __func__ << " " << item << dendl;
+
+ OSDShard* sdata = osd->shards[shard_index];
+ assert (NULL != sdata);
+
+ bool empty = true;
+ {
+ std::lock_guard l{sdata->shard_lock};
+ empty = sdata->scheduler->empty();
+ sdata->scheduler->enqueue(std::move(item));
+ }
+
+ {
+ std::lock_guard l{sdata->sdata_wait_lock};
+ if (empty) {
+ sdata->sdata_cond.notify_all();
+ } else if (sdata->waiting_threads) {
+ sdata->sdata_cond.notify_one();
+ }
+ }
+}
+
+void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
+{
+ auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
+ auto& sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+ sdata->shard_lock.lock();
+ auto p = sdata->pg_slots.find(item.get_ordering_token());
+ if (p != sdata->pg_slots.end() &&
+ !p->second->to_process.empty()) {
+ // we may be racing with _process, which has dequeued a new item
+ // from scheduler, put it on to_process, and is now busy taking the
+ // pg lock. ensure this old requeued item is ordered before any
+ // such newer item in to_process.
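+ // Illustrative example (hypothetical items): if to_process holds [B]
+ // (a newer item racing in via _process) and an older item A is being
+ // requeued here, A is pushed to the front giving [A, B]; B is then
+ // popped from the back and it is B that goes to the scheduler's front,
+ // so A is still processed before B for this slot.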
+ p->second->to_process.push_front(std::move(item));
+ item = std::move(p->second->to_process.back());
+ p->second->to_process.pop_back();
+ dout(20) << __func__
+ << " " << p->second->to_process.front()
+ << " shuffled w/ " << item << dendl;
+ } else {
+ dout(20) << __func__ << " " << item << dendl;
+ }
+ sdata->scheduler->enqueue_front(std::move(item));
+ sdata->shard_lock.unlock();
+ std::lock_guard l{sdata->sdata_wait_lock};
+ sdata->sdata_cond.notify_one();
+}
+
+namespace ceph::osd_cmds {
+
+int heap(CephContext& cct,
+ const cmdmap_t& cmdmap,
+ std::ostream& outos,
+ std::ostream& erros)
+{
+ if (!ceph_using_tcmalloc()) {
+ erros << "could not issue heap profiler command -- not using tcmalloc!";
+ return -EOPNOTSUPP;
+ }
+
+ string cmd;
+ if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
+ erros << "unable to get value for command \"" << cmd << "\"";
+ return -EINVAL;
+ }
+
+ std::vector<std::string> cmd_vec;
+ get_str_vec(cmd, cmd_vec);
+
+ string val;
+ if (cmd_getval(cmdmap, "value", val)) {
+ cmd_vec.push_back(val);
+ }
+
+ ceph_heap_profiler_handle_command(cmd_vec, outos);
+
+ return 0;
+}
+
+} // namespace ceph::osd_cmds
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
new file mode 100644
index 000000000..efbcb40f7
--- /dev/null
+++ b/src/osd/OSD.h
@@ -0,0 +1,2152 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_H
+#define CEPH_OSD_H
+
+#include "PG.h"
+
+#include "msg/Dispatcher.h"
+
+#include "common/async/context_pool.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/AsyncReserver.h"
+#include "common/ceph_context.h"
+#include "common/config_cacher.h"
+#include "common/zipkin_trace.h"
+#include "common/ceph_timer.h"
+
+#include "mgr/MgrClient.h"
+
+#include "os/ObjectStore.h"
+
+#include "include/CompatSet.h"
+#include "include/common_fwd.h"
+
+#include "OpRequest.h"
+#include "Session.h"
+
+#include "osd/scheduler/OpScheduler.h"
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "include/unordered_map.h"
+
+#include "common/shared_cache.hpp"
+#include "common/simple_cache.hpp"
+#include "messages/MOSDOp.h"
+#include "common/EventTrace.h"
+#include "osd/osd_perf_counters.h"
+#include "common/Finisher.h"
+
+#define CEPH_OSD_PROTOCOL 10 /* cluster internal */
+
+/*
+
+ lock ordering for pg map
+
+ PG::lock
+ ShardData::lock
+ OSD::pg_map_lock
+
+ */
+
+class Messenger;
+class Message;
+class MonClient;
+class ObjectStore;
+class FuseStore;
+class OSDMap;
+class MLog;
+class Objecter;
+class KeyStore;
+
+class Watch;
+class PrimaryLogPG;
+
+class TestOpsSocketHook;
+struct C_FinishSplits;
+struct C_OpenPGs;
+class LogChannel;
+
+class MOSDPGCreate2;
+class MOSDPGQuery;
+class MOSDPGNotify;
+class MOSDPGInfo;
+class MOSDPGRemove;
+class MOSDForceRecovery;
+class MMonGetPurgedSnapsReply;
+
+class OSD;
+
+class OSDService {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+public:
+ OSD *osd;
+ CephContext *cct;
+ ObjectStore::CollectionHandle meta_ch;
+ const int whoami;
+ ObjectStore *&store;
+ LogClient &log_client;
+ LogChannelRef clog;
+ PGRecoveryStats &pg_recovery_stats;
+private:
+ Messenger *&cluster_messenger;
+ Messenger *&client_messenger;
+public:
+ PerfCounters *&logger;
+ PerfCounters *&recoverystate_perf;
+ MonClient *&monc;
+
+ md_config_cacher_t<Option::size_t> osd_max_object_size;
+ md_config_cacher_t<bool> osd_skip_data_digest;
+
+ void enqueue_back(OpSchedulerItem&& qi);
+ void enqueue_front(OpSchedulerItem&& qi);
+
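+ // Illustrative note on the debug knob used below (hypothetical value):
+ // with osd_debug_inject_dispatch_delay_probability = 0.1, roughly 10%
+ // of maybe_inject_dispatch_delay() calls sleep for
+ // osd_debug_inject_dispatch_delay_duration seconds.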
+ void maybe_inject_dispatch_delay() {
+ if (g_conf()->osd_debug_inject_dispatch_delay_probability > 0) {
+ if (rand() % 10000 <
+ g_conf()->osd_debug_inject_dispatch_delay_probability * 10000) {
+ utime_t t;
+ t.set_from_double(g_conf()->osd_debug_inject_dispatch_delay_duration);
+ t.sleep();
+ }
+ }
+ }
+
+ ceph::signedspan get_mnow();
+
+private:
+ // -- superblock --
+ ceph::mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
+ OSDSuperblock superblock;
+
+public:
+ OSDSuperblock get_superblock() {
+ std::lock_guard l(publish_lock);
+ return superblock;
+ }
+ void publish_superblock(const OSDSuperblock &block) {
+ std::lock_guard l(publish_lock);
+ superblock = block;
+ }
+
+ int get_nodeid() const { return whoami; }
+
+ std::atomic<epoch_t> max_oldest_map;
+private:
+ OSDMapRef osdmap;
+
+public:
+ OSDMapRef get_osdmap() {
+ std::lock_guard l(publish_lock);
+ return osdmap;
+ }
+ epoch_t get_osdmap_epoch() {
+ std::lock_guard l(publish_lock);
+ return osdmap ? osdmap->get_epoch() : 0;
+ }
+ void publish_map(OSDMapRef map) {
+ std::lock_guard l(publish_lock);
+ osdmap = map;
+ }
+
+ /*
+ * osdmap - current published map
+ * next_osdmap - pre-published map that is about to be published.
+ *
+ * We use the next_osdmap to send messages and initiate connections,
+ * but only if the target is the same instance as the one in the map
+ * epoch the current user is working from (i.e., the result is
+ * equivalent to what is in next_osdmap).
+ *
+ * This allows the helpers to start ignoring osds that are about to
+ * go down, and let OSD::handle_osd_map()/note_down_osd() mark them
+ * down, without worrying about reopening connections from threads
+ * working from old maps.
+ */
+private:
+ OSDMapRef next_osdmap;
+ ceph::condition_variable pre_publish_cond;
+ int pre_publish_waiter = 0;
+
+public:
+ void pre_publish_map(OSDMapRef map) {
+ std::lock_guard l(pre_publish_lock);
+ next_osdmap = std::move(map);
+ }
+
+ void activate_map();
+ /// map epochs reserved below
+ std::map<epoch_t, unsigned> map_reservations;
+
+ /// gets ref to next_osdmap and registers the epoch as reserved
+ OSDMapRef get_nextmap_reserved() {
+ std::lock_guard l(pre_publish_lock);
+ epoch_t e = next_osdmap->get_epoch();
+ std::map<epoch_t, unsigned>::iterator i =
+ map_reservations.insert(std::make_pair(e, 0)).first;
+ i->second++;
+ return next_osdmap;
+ }
+ /// releases reservation on map
+ void release_map(OSDMapRef osdmap) {
+ std::lock_guard l(pre_publish_lock);
+ std::map<epoch_t, unsigned>::iterator i =
+ map_reservations.find(osdmap->get_epoch());
+ ceph_assert(i != map_reservations.end());
+ ceph_assert(i->second > 0);
+ if (--(i->second) == 0) {
+ map_reservations.erase(i);
+ }
+ if (pre_publish_waiter) {
+ pre_publish_cond.notify_all();
+ }
+ }
+ /// blocks until there are no reserved maps prior to next_osdmap
+ void await_reserved_maps() {
+ std::unique_lock l{pre_publish_lock};
+ ceph_assert(next_osdmap);
+ pre_publish_waiter++;
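+ // map_reservations is keyed by epoch and kept ordered, so its first
+ // entry is the oldest reserved epoch; the wait below completes once
+ // no reservation older than next_osdmap's epoch remains.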
+ pre_publish_cond.wait(l, [this] {
+ auto i = map_reservations.cbegin();
+ return (i == map_reservations.cend() ||
+ i->first >= next_osdmap->get_epoch());
+ });
+ pre_publish_waiter--;
+ }
+ OSDMapRef get_next_osdmap() {
+ std::lock_guard l(pre_publish_lock);
+ return next_osdmap;
+ }
+
+ void maybe_share_map(Connection *con,
+ const OSDMapRef& osdmap,
+ epoch_t peer_epoch_lb=0);
+
+ void send_map(class MOSDMap *m, Connection *con);
+ void send_incremental_map(epoch_t since, Connection *con,
+ const OSDMapRef& osdmap);
+ MOSDMap *build_incremental_map_msg(epoch_t from, epoch_t to,
+ OSDSuperblock& superblock);
+
+ ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch);
+ std::pair<ConnectionRef,ConnectionRef> get_con_osd_hb(int peer, epoch_t from_epoch); // (back, front)
+ void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch);
+ void send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch);
+ void send_message_osd_cluster(MessageRef m, Connection *con) {
+ con->send_message2(std::move(m));
+ }
+ void send_message_osd_cluster(Message *m, const ConnectionRef& con) {
+ con->send_message(m);
+ }
+ void send_message_osd_client(Message *m, const ConnectionRef& con) {
+ con->send_message(m);
+ }
+ entity_name_t get_cluster_msgr_name() const;
+
+private:
+ // -- scrub scheduling --
+ ceph::mutex sched_scrub_lock = ceph::make_mutex("OSDService::sched_scrub_lock");
+ int scrubs_local;
+ int scrubs_remote;
+
+public:
+ struct ScrubJob {
+ CephContext* cct;
+ /// pg to be scrubbed
+ spg_t pgid;
+ /// the time scheduled for the scrub; the scrub may still be delayed if
+ /// system load is too high or the time falls outside the allowed scrub hours
+ utime_t sched_time;
+ /// the hard upper bound of scrub time
+ utime_t deadline;
+ ScrubJob() : cct(nullptr) {}
+ explicit ScrubJob(CephContext* cct, const spg_t& pg,
+ const utime_t& timestamp,
+ double pool_scrub_min_interval = 0,
+ double pool_scrub_max_interval = 0, bool must = true);
+ /// order the jobs by sched_time
+ bool operator<(const ScrubJob& rhs) const;
+ };
+ std::set<ScrubJob> sched_scrub_pg;
+
+ /// @returns the scrub_reg_stamp used for unregistering the scrub job
+ utime_t reg_pg_scrub(spg_t pgid,
+ utime_t t,
+ double pool_scrub_min_interval,
+ double pool_scrub_max_interval,
+ bool must) {
+ ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval,
+ must);
+ std::lock_guard l(OSDService::sched_scrub_lock);
+ sched_scrub_pg.insert(scrub_job);
+ return scrub_job.sched_time;
+ }
+
+ void unreg_pg_scrub(spg_t pgid, utime_t t) {
+ std::lock_guard l(sched_scrub_lock);
+ size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t));
+ ceph_assert(removed);
+ }
+
+ bool first_scrub_stamp(ScrubJob *out) {
+ std::lock_guard l(sched_scrub_lock);
+ if (sched_scrub_pg.empty())
+ return false;
+ std::set<ScrubJob>::iterator iter = sched_scrub_pg.begin();
+ *out = *iter;
+ return true;
+ }
+ bool next_scrub_stamp(const ScrubJob& next,
+ ScrubJob *out) {
+ std::lock_guard l(sched_scrub_lock);
+ if (sched_scrub_pg.empty())
+ return false;
+ std::set<ScrubJob>::const_iterator iter = sched_scrub_pg.upper_bound(next);
+ if (iter == sched_scrub_pg.cend())
+ return false;
+ *out = *iter;
+ return true;
+ }
+
+ void dumps_scrub(ceph::Formatter* f);
+
+ bool can_inc_scrubs();
+ bool inc_scrubs_local();
+ void dec_scrubs_local();
+ bool inc_scrubs_remote();
+ void dec_scrubs_remote();
+ void dump_scrub_reservations(ceph::Formatter *f);
+
+ void reply_op_error(OpRequestRef op, int err);
+ void reply_op_error(OpRequestRef op, int err, eversion_t v, version_t uv,
+ std::vector<pg_log_op_return_item_t> op_returns);
+ void handle_misdirected_op(PG *pg, OpRequestRef op);
+
+
+private:
+ // -- agent shared state --
+ ceph::mutex agent_lock = ceph::make_mutex("OSDService::agent_lock");
+ ceph::condition_variable agent_cond;
+ std::map<uint64_t, std::set<PGRef> > agent_queue;
+ std::set<PGRef>::iterator agent_queue_pos;
+ bool agent_valid_iterator;
+ int agent_ops;
+ int flush_mode_high_count; // once any pg is in FLUSH_MODE_HIGH, flush objects at high speed
+ std::set<hobject_t> agent_oids;
+ bool agent_active;
+ struct AgentThread : public Thread {
+ OSDService *osd;
+ explicit AgentThread(OSDService *o) : osd(o) {}
+ void *entry() override {
+ osd->agent_entry();
+ return NULL;
+ }
+ } agent_thread;
+ bool agent_stop_flag;
+ ceph::mutex agent_timer_lock = ceph::make_mutex("OSDService::agent_timer_lock");
+ SafeTimer agent_timer;
+
+public:
+ void agent_entry();
+ void agent_stop();
+
+ void _enqueue(PG *pg, uint64_t priority) {
+ if (!agent_queue.empty() &&
+ agent_queue.rbegin()->first < priority)
+ agent_valid_iterator = false; // inserting higher-priority queue
+ std::set<PGRef>& nq = agent_queue[priority];
+ if (nq.empty())
+ agent_cond.notify_all();
+ nq.insert(pg);
+ }
+
+ void _dequeue(PG *pg, uint64_t old_priority) {
+ std::set<PGRef>& oq = agent_queue[old_priority];
+ std::set<PGRef>::iterator p = oq.find(pg);
+ ceph_assert(p != oq.end());
+ if (p == agent_queue_pos)
+ ++agent_queue_pos;
+ oq.erase(p);
+ if (oq.empty()) {
+ if (agent_queue.rbegin()->first == old_priority)
+ agent_valid_iterator = false;
+ agent_queue.erase(old_priority);
+ }
+ }
+
+ /// enable agent for a pg
+ void agent_enable_pg(PG *pg, uint64_t priority) {
+ std::lock_guard l(agent_lock);
+ _enqueue(pg, priority);
+ }
+
+ /// adjust priority for an enabled pg
+ void agent_adjust_pg(PG *pg, uint64_t old_priority, uint64_t new_priority) {
+ std::lock_guard l(agent_lock);
+ ceph_assert(new_priority != old_priority);
+ _enqueue(pg, new_priority);
+ _dequeue(pg, old_priority);
+ }
+
+ /// disable agent for a pg
+ void agent_disable_pg(PG *pg, uint64_t old_priority) {
+ std::lock_guard l(agent_lock);
+ _dequeue(pg, old_priority);
+ }
+
+ /// note start of an async (evict) op
+ void agent_start_evict_op() {
+ std::lock_guard l(agent_lock);
+ ++agent_ops;
+ }
+
+ /// note finish or cancellation of an async (evict) op
+ void agent_finish_evict_op() {
+ std::lock_guard l(agent_lock);
+ ceph_assert(agent_ops > 0);
+ --agent_ops;
+ agent_cond.notify_all();
+ }
+
+ /// note start of an async (flush) op
+ void agent_start_op(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ ++agent_ops;
+ ceph_assert(agent_oids.count(oid) == 0);
+ agent_oids.insert(oid);
+ }
+
+ /// note finish or cancellation of an async (flush) op
+ void agent_finish_op(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ ceph_assert(agent_ops > 0);
+ --agent_ops;
+ ceph_assert(agent_oids.count(oid) == 1);
+ agent_oids.erase(oid);
+ agent_cond.notify_all();
+ }
+
+ /// check if we are operating on an object
+ bool agent_is_active_oid(const hobject_t& oid) {
+ std::lock_guard l(agent_lock);
+ return agent_oids.count(oid);
+ }
+
+ /// get count of active agent ops
+ int agent_get_num_ops() {
+ std::lock_guard l(agent_lock);
+ return agent_ops;
+ }
+
+ void agent_inc_high_count() {
+ std::lock_guard l(agent_lock);
+ flush_mode_high_count ++;
+ }
+
+ void agent_dec_high_count() {
+ std::lock_guard l(agent_lock);
+ flush_mode_high_count --;
+ }
+
+private:
+ /// throttle promotion attempts
+ std::atomic<unsigned int> promote_probability_millis{1000}; ///< probability in thousandths; a single word, so it can be read without a lock
+ PromoteCounter promote_counter;
+ utime_t last_recalibrate;
+ unsigned long promote_max_objects, promote_max_bytes;
+
+public:
+ bool promote_throttle() {
+ // NOTE: lockless! we rely on the probability being a single word.
+ promote_counter.attempt();
+ if ((unsigned)rand() % 1000 > promote_probability_millis)
+ return true; // yes throttle (no promote)
+ if (promote_max_objects &&
+ promote_counter.objects > promote_max_objects)
+ return true; // yes throttle
+ if (promote_max_bytes &&
+ promote_counter.bytes > promote_max_bytes)
+ return true; // yes throttle
+ return false; // no throttle (promote)
+ }
+ void promote_finish(uint64_t bytes) {
+ promote_counter.finish(bytes);
+ }
+ void promote_throttle_recalibrate();
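+
+  // Illustrative sketch only: the expected call pattern around the throttle.
+  // The promote_fn parameter is a hypothetical stand-in for whatever tiering
+  // code performs the actual promotion and returns the number of bytes moved.
+  template <typename PromoteFn>
+  bool maybe_promote(PromoteFn&& promote_fn) {
+    if (promote_throttle())
+      return false;                    // throttled: skip this promotion
+    uint64_t bytes = promote_fn();     // do the promote
+    promote_finish(bytes);             // feeds promote_throttle_recalibrate()
+    return true;
+  }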
+ unsigned get_num_shards() const {
+ return m_objecter_finishers;
+ }
+ Finisher* get_objecter_finisher(int shard) {
+ return objecter_finishers[shard].get();
+ }
+
+ // -- Objecter, for tiering reads/writes from/to other OSDs --
+ ceph::async::io_context_pool& poolctx;
+ std::unique_ptr<Objecter> objecter;
+ int m_objecter_finishers;
+ std::vector<std::unique_ptr<Finisher>> objecter_finishers;
+
+ // -- Watch --
+ ceph::mutex watch_lock = ceph::make_mutex("OSDService::watch_lock");
+ SafeTimer watch_timer;
+ uint64_t next_notif_id;
+ uint64_t get_next_id(epoch_t cur_epoch) {
+ std::lock_guard l(watch_lock);
+ return (((uint64_t)cur_epoch) << 32) | ((uint64_t)(next_notif_id++));
+ }
+
+ // -- Recovery/Backfill Request Scheduling --
+ ceph::mutex recovery_request_lock = ceph::make_mutex("OSDService::recovery_request_lock");
+ SafeTimer recovery_request_timer;
+
+ // For async recovery sleep
+ bool recovery_needs_sleep = true;
+ ceph::real_clock::time_point recovery_schedule_time;
+
+ // For recovery & scrub & snap
+ ceph::mutex sleep_lock = ceph::make_mutex("OSDService::sleep_lock");
+ SafeTimer sleep_timer;
+
+ // -- tids --
+ // for ops I issue
+ std::atomic<unsigned int> last_tid{0};
+ ceph_tid_t get_tid() {
+ return (ceph_tid_t)last_tid++;
+ }
+
+ // -- backfill_reservation --
+ Finisher reserver_finisher;
+ AsyncReserver<spg_t, Finisher> local_reserver;
+ AsyncReserver<spg_t, Finisher> remote_reserver;
+
+ // -- pg merge --
+ ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock");
+ std::map<pg_t,eversion_t> ready_to_merge_source; // pg -> version
+ std::map<pg_t,std::tuple<eversion_t,epoch_t,epoch_t>> ready_to_merge_target; // pg -> (version,les,lec)
+ std::set<pg_t> not_ready_to_merge_source;
+ std::map<pg_t,pg_t> not_ready_to_merge_target;
+ std::set<pg_t> sent_ready_to_merge_source;
+
+ void set_ready_to_merge_source(PG *pg,
+ eversion_t version);
+ void set_ready_to_merge_target(PG *pg,
+ eversion_t version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean);
+ void set_not_ready_to_merge_source(pg_t source);
+ void set_not_ready_to_merge_target(pg_t target, pg_t source);
+ void clear_ready_to_merge(PG *pg);
+ void send_ready_to_merge();
+ void _send_ready_to_merge();
+ void clear_sent_ready_to_merge();
+ void prune_sent_ready_to_merge(const OSDMapRef& osdmap);
+
+ // -- pg_temp --
+private:
+ ceph::mutex pg_temp_lock = ceph::make_mutex("OSDService::pg_temp_lock");
+ struct pg_temp_t {
+ std::vector<int> acting;
+ bool forced = false;
+ };
+ std::map<pg_t, pg_temp_t> pg_temp_wanted;
+ std::map<pg_t, pg_temp_t> pg_temp_pending;
+ void _sent_pg_temp();
+ friend std::ostream& operator<<(std::ostream&, const pg_temp_t&);
+public:
+ void queue_want_pg_temp(pg_t pgid, const std::vector<int>& want,
+ bool forced = false);
+ void remove_want_pg_temp(pg_t pgid);
+ void requeue_pg_temp();
+ void send_pg_temp();
+
+ ceph::mutex pg_created_lock = ceph::make_mutex("OSDService::pg_created_lock");
+ std::set<pg_t> pg_created;
+ void send_pg_created(pg_t pgid);
+ void prune_pg_created();
+ void send_pg_created();
+
+ AsyncReserver<spg_t, Finisher> snap_reserver;
+ void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+ void queue_for_snap_trim(PG *pg);
+ void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// queue the message (-> event) that all replicas have reserved scrub resources for us
+ void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// queue the message (-> event) that some replicas denied our scrub resources request
+ void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability
+ /// of the primary map being created by the backend.
+ void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all pending updates were applied
+ void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that the selected chunk (objects range) is available for scrubbing
+ void queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// The chunk selected is blocked by user operations, and cannot be scrubbed now
+ void queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// The block-range that was locked and prevented the scrubbing - is freed
+ void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all write OPs are done
+ void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that the local (Primary's) scrub map is ready
+ void queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas
+ void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that all chunks were handled
+ /// Note: always with high priority, as it must be acted upon before the
+ /// next scrub request arrives from the Primary (and the primary is free
+ /// to send the request once the replica's map is received).
+ void queue_scrub_is_finished(PG* pg);
+
+ /// Signals that there are more chunks to handle
+ void queue_scrub_next_chunk(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ /// Signals that we have finished comparing the maps for this chunk
+ /// Note: required, as in Crimson this operation is 'futurized'.
+ void queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ void queue_for_rep_scrub(PG* pg,
+ Scrub::scrub_prio_t with_high_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ /// Signals a change in the number of in-flight recovery writes
+ void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
+
+ /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to
+ /// trigger a re-check of the availability of the scrub map prepared by the
+ /// backend.
+ void queue_for_rep_scrub_resched(PG* pg,
+ Scrub::scrub_prio_t with_high_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ void queue_for_pg_delete(spg_t pgid, epoch_t e);
+ bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
+
+private:
+ // -- pg recovery and associated throttling --
+ ceph::mutex recovery_lock = ceph::make_mutex("OSDService::recovery_lock");
+ std::list<std::pair<epoch_t, PGRef> > awaiting_throttle;
+
+ /// queue a scrub-related message for a PG
+ template <class MSG_TYPE>
+ void queue_scrub_event_msg(PG* pg,
+ Scrub::scrub_prio_t with_priority,
+ unsigned int qu_priority,
+ Scrub::act_token_t act_token);
+
+ /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
+ /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
+ template <class MSG_TYPE>
+ void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority);
+
+ utime_t defer_recovery_until;
+ uint64_t recovery_ops_active;
+ uint64_t recovery_ops_reserved;
+ bool recovery_paused;
+#ifdef DEBUG_RECOVERY_OIDS
+ std::map<spg_t, std::set<hobject_t> > recovery_oids;
+#endif
+ bool _recover_now(uint64_t *available_pushes);
+ void _maybe_queue_recovery();
+ void _queue_for_recovery(
+ std::pair<epoch_t, PGRef> p, uint64_t reserved_pushes);
+public:
+ void start_recovery_op(PG *pg, const hobject_t& soid);
+ void finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue);
+ bool is_recovery_active();
+ void release_reserved_pushes(uint64_t pushes);
+ void defer_recovery(float defer_for) {
+ defer_recovery_until = ceph_clock_now();
+ defer_recovery_until += defer_for;
+ }
+ void pause_recovery() {
+ std::lock_guard l(recovery_lock);
+ recovery_paused = true;
+ }
+ bool recovery_is_paused() {
+ std::lock_guard l(recovery_lock);
+ return recovery_paused;
+ }
+ void unpause_recovery() {
+ std::lock_guard l(recovery_lock);
+ recovery_paused = false;
+ _maybe_queue_recovery();
+ }
+ void kick_recovery_queue() {
+ std::lock_guard l(recovery_lock);
+ _maybe_queue_recovery();
+ }
+ void clear_queued_recovery(PG *pg) {
+ std::lock_guard l(recovery_lock);
+ awaiting_throttle.remove_if(
+ [pg](decltype(awaiting_throttle)::const_reference awaiting ) {
+ return awaiting.second.get() == pg;
+ });
+ }
+
+ unsigned get_target_pg_log_entries() const;
+
+ // delayed pg activation
+ void queue_for_recovery(PG *pg) {
+ std::lock_guard l(recovery_lock);
+
+ if (pg->is_forced_recovery_or_backfill()) {
+ awaiting_throttle.push_front(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+ } else {
+ awaiting_throttle.push_back(std::make_pair(pg->get_osdmap()->get_epoch(), pg));
+ }
+ _maybe_queue_recovery();
+ }
+ void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) {
+ std::lock_guard l(recovery_lock);
+ _queue_for_recovery(std::make_pair(queued, pg), reserved_pushes);
+ }
+
+ void queue_check_readable(spg_t spgid,
+ epoch_t lpr,
+ ceph::signedspan delay = ceph::signedspan::zero());
+
+ // osd map cache (past osd maps)
+ ceph::mutex map_cache_lock = ceph::make_mutex("OSDService::map_cache_lock");
+ SharedLRU<epoch_t, const OSDMap> map_cache;
+ SimpleLRU<epoch_t, ceph::buffer::list> map_bl_cache;
+ SimpleLRU<epoch_t, ceph::buffer::list> map_bl_inc_cache;
+
+ OSDMapRef try_get_map(epoch_t e);
+ OSDMapRef get_map(epoch_t e) {
+ OSDMapRef ret(try_get_map(e));
+ ceph_assert(ret);
+ return ret;
+ }
+ OSDMapRef add_map(OSDMap *o) {
+ std::lock_guard l(map_cache_lock);
+ return _add_map(o);
+ }
+ OSDMapRef _add_map(OSDMap *o);
+
+ void _add_map_bl(epoch_t e, ceph::buffer::list& bl);
+ bool get_map_bl(epoch_t e, ceph::buffer::list& bl) {
+ std::lock_guard l(map_cache_lock);
+ return _get_map_bl(e, bl);
+ }
+ bool _get_map_bl(epoch_t e, ceph::buffer::list& bl);
+
+ void _add_map_inc_bl(epoch_t e, ceph::buffer::list& bl);
+ bool get_inc_map_bl(epoch_t e, ceph::buffer::list& bl);
+
+ /// identify split child pgids over a osdmap interval
+ void identify_splits_and_merges(
+ OSDMapRef old_map,
+ OSDMapRef new_map,
+ spg_t pgid,
+ std::set<std::pair<spg_t,epoch_t>> *new_children,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+
+ void need_heartbeat_peer_update();
+
+ void init();
+ void final_init();
+ void start_shutdown();
+ void shutdown_reserver();
+ void shutdown();
+
+ // -- stats --
+ ceph::mutex stat_lock = ceph::make_mutex("OSDService::stat_lock");
+ osd_stat_t osd_stat;
+ uint32_t seq = 0;
+
+ void set_statfs(const struct store_statfs_t &stbuf,
+ osd_alert_list_t& alerts);
+ osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs);
+ void inc_osd_stat_repaired(void);
+ float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
+ osd_stat_t get_osd_stat() {
+ std::lock_guard l(stat_lock);
+ ++seq;
+ osd_stat.up_from = up_epoch;
+ osd_stat.seq = ((uint64_t)osd_stat.up_from << 32) + seq;
+ return osd_stat;
+ }
+ uint64_t get_osd_stat_seq() {
+ std::lock_guard l(stat_lock);
+ return osd_stat.seq;
+ }
+ void get_hb_pingtime(std::map<int, osd_stat_t::Interfaces> *pp)
+ {
+ std::lock_guard l(stat_lock);
+ *pp = osd_stat.hb_pingtime;
+ return;
+ }
+
+ // -- OSD Full Status --
+private:
+ friend TestOpsSocketHook;
+ mutable ceph::mutex full_status_lock = ceph::make_mutex("OSDService::full_status_lock");
+ enum s_names { INVALID = -1, NONE, NEARFULL, BACKFILLFULL, FULL, FAILSAFE } cur_state; // ascending
+ const char *get_full_state_name(s_names s) const {
+ switch (s) {
+ case NONE: return "none";
+ case NEARFULL: return "nearfull";
+ case BACKFILLFULL: return "backfillfull";
+ case FULL: return "full";
+ case FAILSAFE: return "failsafe";
+ default: return "???";
+ }
+ }
+ s_names get_full_state(std::string type) const {
+ if (type == "none")
+ return NONE;
+ else if (type == "failsafe")
+ return FAILSAFE;
+ else if (type == "full")
+ return FULL;
+ else if (type == "backfillfull")
+ return BACKFILLFULL;
+ else if (type == "nearfull")
+ return NEARFULL;
+ else
+ return INVALID;
+ }
+ double cur_ratio, physical_ratio; ///< current utilization
+ mutable int64_t injectfull = 0;
+ s_names injectfull_state = NONE;
+ float get_failsafe_full_ratio();
+ bool _check_inject_full(DoutPrefixProvider *dpp, s_names type) const;
+ bool _check_full(DoutPrefixProvider *dpp, s_names type) const;
+public:
+ void check_full_status(float ratio, float pratio);
+ s_names recalc_full_state(float ratio, float pratio, std::string &inject);
+ bool _tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t);
+ bool check_failsafe_full(DoutPrefixProvider *dpp) const;
+ bool check_full(DoutPrefixProvider *dpp) const;
+ bool tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t);
+ bool check_backfill_full(DoutPrefixProvider *dpp) const;
+ bool check_nearfull(DoutPrefixProvider *dpp) const;
+ bool is_failsafe_full() const;
+ bool is_full() const;
+ bool is_backfillfull() const;
+ bool is_nearfull() const;
+ bool need_fullness_update(); ///< osdmap state needs update
+ void set_injectfull(s_names type, int64_t count);
+
+
+ // -- epochs --
+private:
+ // protects access to boot_epoch, up_epoch, bind_epoch
+ mutable ceph::mutex epoch_lock = ceph::make_mutex("OSDService::epoch_lock");
+ epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started)
+ epoch_t up_epoch; // _most_recent_ epoch we were marked up
+ epoch_t bind_epoch; // epoch we last did a bind to new ip:ports
+public:
+ /**
+ * Retrieve the boot_, up_, and bind_ epochs the OSD has set. The params
+ * can be NULL if you don't care about them.
+ */
+ void retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
+ epoch_t *_bind_epoch) const;
+ /**
+ * Set the boot, up, and bind epochs. Any NULL params will not be set.
+ */
+ void set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
+ const epoch_t *_bind_epoch);
+ epoch_t get_boot_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(&ret, NULL, NULL);
+ return ret;
+ }
+ epoch_t get_up_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(NULL, &ret, NULL);
+ return ret;
+ }
+ epoch_t get_bind_epoch() const {
+ epoch_t ret;
+ retrieve_epochs(NULL, NULL, &ret);
+ return ret;
+ }
+
+ void request_osdmap_update(epoch_t e);
+
+ // -- heartbeats --
+ ceph::mutex hb_stamp_lock = ceph::make_mutex("OSDServce::hb_stamp_lock");
+
+ /// osd -> heartbeat stamps
+ std::vector<HeartbeatStampsRef> hb_stamps;
+
+ /// get or create a ref for a peer's HeartbeatStamps
+ HeartbeatStampsRef get_hb_stamps(unsigned osd);
+
+
+ // Timer for readable leases
+ ceph::timer<ceph::mono_clock> mono_timer = ceph::timer<ceph::mono_clock>{ceph::construct_suspended};
+
+ void queue_renew_lease(epoch_t epoch, spg_t spgid);
+
+ // -- stopping --
+ ceph::mutex is_stopping_lock = ceph::make_mutex("OSDService::is_stopping_lock");
+ ceph::condition_variable is_stopping_cond;
+ enum {
+ NOT_STOPPING,
+ PREPARING_TO_STOP,
+ STOPPING };
+ std::atomic<int> state{NOT_STOPPING};
+ int get_state() const {
+ return state;
+ }
+ void set_state(int s) {
+ state = s;
+ }
+ bool is_stopping() const {
+ return state == STOPPING;
+ }
+ bool is_preparing_to_stop() const {
+ return state == PREPARING_TO_STOP;
+ }
+ bool prepare_to_stop();
+ void got_stop_ack();
+
+
+#ifdef PG_DEBUG_REFS
+ ceph::mutex pgid_lock = ceph::make_mutex("OSDService::pgid_lock");
+ std::map<spg_t, int> pgid_tracker;
+ std::map<spg_t, PG*> live_pgs;
+ void add_pgid(spg_t pgid, PG *pg);
+ void remove_pgid(spg_t pgid, PG *pg);
+ void dump_live_pgids();
+#endif
+
+ explicit OSDService(OSD *osd, ceph::async::io_context_pool& poolctx);
+ ~OSDService() = default;
+};
+
+/*
+
+ Each PG slot includes queues for events that are being processed and/or
+ waiting for a PG to be materialized in the slot.
+
+ These are the constraints:
+
+ - client ops must remain ordered by client, regardless of map epoch
+ - peering messages/events from peers must remain ordered by peer
+ - peering messages and client ops need not be ordered relative to each other
+
+ - some peering events can create a pg (e.g., notify)
+ - the query peering event can proceed when a PG doesn't exist
+
+ Implementation notes:
+
+ - everybody waits for split. If the OSD has the parent PG it will instantiate
+ the PGSlot early and mark it waiting_for_split. Everything will wait until
+ the parent is able to commit the split operation and the child PG's are
+ materialized in the child slots.
+
+ - every event has an epoch property and will wait for the OSDShard to catch
+ up to that epoch. For example, if we get a peering event from a future
+ epoch, the event will wait in the slot until the local OSD has caught up.
+ (We should be judicious in specifying the required epoch [by, e.g., setting
+ it to the same_interval_since epoch] so that we don't wait for epochs that
+ don't affect the given PG.)
+
+ - we maintain two separate wait lists, *waiting* and *waiting_peering*. The
+ OpSchedulerItem has an is_peering() bool to determine which we use. Waiting
+ peering events are queued up by epoch required.
+
+ - when we wake a PG slot (e.g., we finished split, or got a newer osdmap, or
+ materialized the PG), we wake *all* waiting items. (This could be optimized,
+ probably, but we don't bother.) We always requeue peering items ahead of
+ client ops.
+
+ - some peering events are marked !peering_requires_pg (PGQuery). If we do
+ not have a PG, these are processed immediately (under the shard lock).
+
+ - if we do not have a PG present, we check whether the slot maps to the
+ current host. If so, we either queue the item and wait for the PG to
+ materialize, or (if the event is a pg-creating event like PGNotify) we
+ materialize the PG.
+
+ - when we advance the osdmap on the OSDShard, we scan pg slots and
+ discard any slots with no pg (and not waiting_for_split) that no
+ longer map to the current host.
+
+ */
+
+struct OSDShardPGSlot {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+ PGRef pg; ///< pg reference
+ std::deque<OpSchedulerItem> to_process; ///< order items for this slot
+ int num_running = 0; ///< _process threads doing pg lookup/lock
+
+ std::deque<OpSchedulerItem> waiting; ///< waiting for pg (or map + pg)
+
+ /// waiting for map (peering evt)
+ std::map<epoch_t,std::deque<OpSchedulerItem>> waiting_peering;
+
+ /// incremented by wake_pg_waiters; indicates racing _process threads
+ /// should bail out (their op has been requeued)
+ uint64_t requeue_seq = 0;
+
+ /// waiting for split child to materialize in these epoch(s)
+ std::set<epoch_t> waiting_for_split;
+
+ epoch_t epoch = 0;
+ boost::intrusive::set_member_hook<> pg_epoch_item;
+
+ /// waiting for a merge (source or target) by this epoch
+ epoch_t waiting_for_merge_epoch = 0;
+};
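+
+/*
+ * Illustrative sketch only of the wake behaviour described in the block
+ * comment above: *all* waiting items are requeued, and peering items end up
+ * ahead of client ops while relative order within each list is preserved.
+ * example_wake_slot() is a hypothetical helper; for simplicity it requeues
+ * into the slot's own to_process queue, whereas the real requeue path
+ * (OSDShard::_wake_pg_slot()) hands items back to the shard's scheduler.
+ */
+inline void example_wake_slot(OSDShardPGSlot& slot)
+{
+  std::deque<OSDShardPGSlot::OpSchedulerItem> requeued;
+  // peering events first (all of them, even ones waiting for future maps),
+  // walked in order of the epoch they were waiting for
+  for (auto& i : slot.waiting_peering) {
+    for (auto& item : i.second)
+      requeued.push_back(std::move(item));
+  }
+  slot.waiting_peering.clear();
+  // then client ops, still in arrival order
+  for (auto& item : slot.waiting)
+    requeued.push_back(std::move(item));
+  slot.waiting.clear();
+  // put everything at the front of the processing queue, preserving order
+  while (!requeued.empty()) {
+    slot.to_process.push_front(std::move(requeued.back()));
+    requeued.pop_back();
+  }
+  ++slot.requeue_seq;  // racing _process threads will notice and bail out
+}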
+
+struct OSDShard {
+ const unsigned shard_id;
+ CephContext *cct;
+ OSD *osd;
+
+ std::string shard_name;
+
+ std::string sdata_wait_lock_name;
+ ceph::mutex sdata_wait_lock;
+ ceph::condition_variable sdata_cond;
+ int waiting_threads = 0;
+
+ ceph::mutex osdmap_lock; ///< protect shard_osdmap updates vs users w/o shard_lock
+ OSDMapRef shard_osdmap;
+
+ OSDMapRef get_osdmap() {
+ std::lock_guard l(osdmap_lock);
+ return shard_osdmap;
+ }
+
+ std::string shard_lock_name;
+ ceph::mutex shard_lock; ///< protects remaining members below
+
+ /// map of slots for each spg_t. maintains ordering of items dequeued
+ /// from scheduler while _process thread drops shard lock to acquire the
+ /// pg lock. stale slots are removed by consume_map.
+ std::unordered_map<spg_t,std::unique_ptr<OSDShardPGSlot>> pg_slots;
+
+ struct pg_slot_compare_by_epoch {
+ bool operator()(const OSDShardPGSlot& l, const OSDShardPGSlot& r) const {
+ return l.epoch < r.epoch;
+ }
+ };
+
+ /// maintain an ordering of pg slots by pg epoch
+ boost::intrusive::multiset<
+ OSDShardPGSlot,
+ boost::intrusive::member_hook<
+ OSDShardPGSlot,
+ boost::intrusive::set_member_hook<>,
+ &OSDShardPGSlot::pg_epoch_item>,
+ boost::intrusive::compare<pg_slot_compare_by_epoch>> pg_slots_by_epoch;
+ int waiting_for_min_pg_epoch = 0;
+ ceph::condition_variable min_pg_epoch_cond;
+
+ /// priority queue
+ ceph::osd::scheduler::OpSchedulerRef scheduler;
+
+ bool stop_waiting = false;
+
+ ContextQueue context_queue;
+
+ void _attach_pg(OSDShardPGSlot *slot, PG *pg);
+ void _detach_pg(OSDShardPGSlot *slot);
+
+ void update_pg_epoch(OSDShardPGSlot *slot, epoch_t epoch);
+ epoch_t get_min_pg_epoch();
+ void wait_min_pg_epoch(epoch_t need);
+
+ /// return newest epoch we are waiting for
+ epoch_t get_max_waiting_epoch();
+
+ /// push osdmap into shard
+ void consume_map(
+ const OSDMapRef& osdmap,
+ unsigned *pushes_to_free);
+
+ void _wake_pg_slot(spg_t pgid, OSDShardPGSlot *slot);
+
+ void identify_splits_and_merges(
+ const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *split_children,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+ void _prime_splits(std::set<std::pair<spg_t,epoch_t>> *pgids);
+ void prime_splits(const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *pgids);
+ void prime_merges(const OSDMapRef& as_of_osdmap,
+ std::set<std::pair<spg_t,epoch_t>> *merge_pgs);
+ void register_and_wake_split_child(PG *pg);
+ void unprime_split_children(spg_t parent, unsigned old_pg_num);
+ void update_scheduler_config();
+
+ OSDShard(
+ int id,
+ CephContext *cct,
+ OSD *osd);
+};
+
+class OSD : public Dispatcher,
+ public md_config_obs_t {
+ using OpSchedulerItem = ceph::osd::scheduler::OpSchedulerItem;
+
+ /** OSD **/
+ // global lock
+ ceph::mutex osd_lock = ceph::make_mutex("OSD::osd_lock");
+ SafeTimer tick_timer; // safe timer (osd_lock)
+
+ // Tick timer for things that do not need osd_lock
+ ceph::mutex tick_timer_lock = ceph::make_mutex("OSD::tick_timer_lock");
+ SafeTimer tick_timer_without_osd_lock;
+ std::string gss_ktfile_client{};
+
+public:
+ // config observer bits
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+ void update_log_config();
+ void check_config();
+
+protected:
+
+ const double OSD_TICK_INTERVAL = { 1.0 };
+ double get_tick_interval() const;
+
+ Messenger *cluster_messenger;
+ Messenger *client_messenger;
+ Messenger *objecter_messenger;
+ MonClient *monc; // check the "monc helpers" list before accessing directly
+ MgrClient mgrc;
+ PerfCounters *logger;
+ PerfCounters *recoverystate_perf;
+ ObjectStore *store;
+#ifdef HAVE_LIBFUSE
+ FuseStore *fuse_store = nullptr;
+#endif
+ LogClient log_client;
+ LogChannelRef clog;
+
+ int whoami;
+ std::string dev_path, journal_path;
+
+ ceph_release_t last_require_osd_release{ceph_release_t::unknown};
+
+ int numa_node = -1;
+ size_t numa_cpu_set_size = 0;
+ cpu_set_t numa_cpu_set;
+
+ bool store_is_rotational = true;
+ bool journal_is_rotational = true;
+
+ ZTracer::Endpoint trace_endpoint;
+ PerfCounters* create_logger();
+ PerfCounters* create_recoverystate_perf();
+ void tick();
+ void tick_without_osd_lock();
+ void _dispatch(Message *m);
+ void dispatch_op(OpRequestRef op);
+
+ void check_osdmap_features();
+
+ // asok
+ friend class OSDSocketHook;
+ class OSDSocketHook *asok_hook;
+ void asok_command(
+ std::string_view prefix,
+ const cmdmap_t& cmdmap,
+ ceph::Formatter *f,
+ const ceph::buffer::list& inbl,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish);
+
+public:
+ int get_nodeid() { return whoami; }
+
+ static ghobject_t get_osdmap_pobject_name(epoch_t epoch) {
+ char foo[20];
+ snprintf(foo, sizeof(foo), "osdmap.%d", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
+ }
+ static ghobject_t get_inc_osdmap_pobject_name(epoch_t epoch) {
+ char foo[22];
+ snprintf(foo, sizeof(foo), "inc_osdmap.%d", epoch);
+ return ghobject_t(hobject_t(sobject_t(object_t(foo), 0)));
+ }
+
+ static ghobject_t make_snapmapper_oid() {
+ return ghobject_t(hobject_t(
+ sobject_t(
+ object_t("snapmapper"),
+ 0)));
+ }
+ static ghobject_t make_purged_snaps_oid() {
+ return ghobject_t(hobject_t(
+ sobject_t(
+ object_t("purged_snaps"),
+ 0)));
+ }
+
+ static ghobject_t make_pg_log_oid(spg_t pg) {
+ std::stringstream ss;
+ ss << "pglog_" << pg;
+ std::string s;
+ getline(ss, s);
+ return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
+ }
+
+ static ghobject_t make_pg_biginfo_oid(spg_t pg) {
+ std::stringstream ss;
+ ss << "pginfo_" << pg;
+ std::string s;
+ getline(ss, s);
+ return ghobject_t(hobject_t(sobject_t(object_t(s.c_str()), 0)));
+ }
+ static ghobject_t make_infos_oid() {
+ hobject_t oid(sobject_t("infos", CEPH_NOSNAP));
+ return ghobject_t(oid);
+ }
+
+ static ghobject_t make_final_pool_info_oid(int64_t pool) {
+ return ghobject_t(
+ hobject_t(
+ sobject_t(
+ object_t(std::string("final_pool_") + stringify(pool)),
+ CEPH_NOSNAP)));
+ }
+
+ static ghobject_t make_pg_num_history_oid() {
+ return ghobject_t(hobject_t(sobject_t("pg_num_history", CEPH_NOSNAP)));
+ }
+
+ static void recursive_remove_collection(CephContext* cct,
+ ObjectStore *store,
+ spg_t pgid,
+ coll_t tmp);
+
+ /**
+ * get_osd_initial_compat_set()
+ *
+ * Get the initial feature set for this OSD. Features
+ * here are automatically upgraded.
+ *
+ * Return value: Initial osd CompatSet
+ */
+ static CompatSet get_osd_initial_compat_set();
+
+ /**
+ * get_osd_compat_set()
+ *
+ * Get all features supported by this OSD
+ *
+ * Return value: CompatSet of all supported features
+ */
+ static CompatSet get_osd_compat_set();
+
+
+private:
+ class C_Tick;
+ class C_Tick_WithoutOSDLock;
+
+ // -- config settings --
+ float m_osd_pg_epoch_max_lag_factor;
+
+ // -- superblock --
+ OSDSuperblock superblock;
+
+ void write_superblock();
+ void write_superblock(ObjectStore::Transaction& t);
+ int read_superblock();
+
+ void clear_temp_objects();
+
+ CompatSet osd_compat;
+
+ // -- state --
+public:
+ typedef enum {
+ STATE_INITIALIZING = 1,
+ STATE_PREBOOT,
+ STATE_BOOTING,
+ STATE_ACTIVE,
+ STATE_STOPPING,
+ STATE_WAITING_FOR_HEALTHY
+ } osd_state_t;
+
+ static const char *get_state_name(int s) {
+ switch (s) {
+ case STATE_INITIALIZING: return "initializing";
+ case STATE_PREBOOT: return "preboot";
+ case STATE_BOOTING: return "booting";
+ case STATE_ACTIVE: return "active";
+ case STATE_STOPPING: return "stopping";
+ case STATE_WAITING_FOR_HEALTHY: return "waiting_for_healthy";
+ default: return "???";
+ }
+ }
+
+private:
+ std::atomic<int> state{STATE_INITIALIZING};
+
+public:
+ int get_state() const {
+ return state;
+ }
+ void set_state(int s) {
+ state = s;
+ }
+ bool is_initializing() const {
+ return state == STATE_INITIALIZING;
+ }
+ bool is_preboot() const {
+ return state == STATE_PREBOOT;
+ }
+ bool is_booting() const {
+ return state == STATE_BOOTING;
+ }
+ bool is_active() const {
+ return state == STATE_ACTIVE;
+ }
+ bool is_stopping() const {
+ return state == STATE_STOPPING;
+ }
+ bool is_waiting_for_healthy() const {
+ return state == STATE_WAITING_FOR_HEALTHY;
+ }
+
+private:
+
+ ShardedThreadPool osd_op_tp;
+
+ void get_latest_osdmap();
+
+ // -- sessions --
+private:
+ void dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap);
+
+ ceph::mutex session_waiting_lock = ceph::make_mutex("OSD::session_waiting_lock");
+ std::set<ceph::ref_t<Session>> session_waiting_for_map;
+
+ /// Caller assumes refs for included Sessions
+ void get_sessions_waiting_for_map(std::set<ceph::ref_t<Session>> *out) {
+ std::lock_guard l(session_waiting_lock);
+ out->swap(session_waiting_for_map);
+ }
+ void register_session_waiting_on_map(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session_waiting_lock);
+ session_waiting_for_map.insert(session);
+ }
+ void clear_session_waiting_on_map(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session_waiting_lock);
+ session_waiting_for_map.erase(session);
+ }
+ void dispatch_sessions_waiting_on_map() {
+ std::set<ceph::ref_t<Session>> sessions_to_check;
+ get_sessions_waiting_for_map(&sessions_to_check);
+ for (auto i = sessions_to_check.begin();
+ i != sessions_to_check.end();
+ sessions_to_check.erase(i++)) {
+ std::lock_guard l{(*i)->session_dispatch_lock};
+ dispatch_session_waiting(*i, get_osdmap());
+ }
+ }
+ void session_handle_reset(const ceph::ref_t<Session>& session) {
+ std::lock_guard l(session->session_dispatch_lock);
+ clear_session_waiting_on_map(session);
+
+ session->clear_backoffs();
+
+ /* Messages have connection refs, we need to clear the
+ * connection->session->message->connection
+ * cycles which result.
+ * Bug #12338
+ */
+ session->waiting_on_map.clear_and_dispose(TrackedOp::Putter());
+ }
+
+private:
+ /**
+ * @defgroup monc helpers
+ * @{
+ * Right now we only have the one
+ */
+
+ /**
+ * Ask the Monitors for a sequence of OSDMaps.
+ *
+ * @param epoch The epoch to start with when replying
+ * @param force_request True if this request forces a new subscription to
+ * the monitors; false if an outstanding request that encompasses it is
+ * sufficient.
+ */
+ void osdmap_subscribe(version_t epoch, bool force_request);
+ /** @} monc helpers */
+
+ ceph::mutex osdmap_subscribe_lock = ceph::make_mutex("OSD::osdmap_subscribe_lock");
+ epoch_t latest_subscribed_epoch{0};
+
+ // -- heartbeat --
+ /// information about a heartbeat peer
+ struct HeartbeatInfo {
+ int peer; ///< peer
+ ConnectionRef con_front; ///< peer connection (front)
+ ConnectionRef con_back; ///< peer connection (back)
+ utime_t first_tx; ///< time we sent our first ping request
+ utime_t last_tx; ///< last time we sent a ping request
+ utime_t last_rx_front; ///< last time we got a ping reply on the front side
+ utime_t last_rx_back; ///< last time we got a ping reply on the back side
+ epoch_t epoch; ///< most recent epoch we wanted this peer
+ /// number of connections over which we send and receive heartbeat pings/replies
+ static constexpr int HEARTBEAT_MAX_CONN = 2;
+ /// history of inflight pings, keyed by the timestamp at which we sent them
+ /// send time -> deadline -> remaining replies
+ std::map<utime_t, std::pair<utime_t, int>> ping_history;
+
+ utime_t hb_interval_start;
+ uint32_t hb_average_count = 0;
+ uint32_t hb_index = 0;
+
+ uint32_t hb_total_back = 0;
+ uint32_t hb_min_back = UINT_MAX;
+ uint32_t hb_max_back = 0;
+ std::vector<uint32_t> hb_back_pingtime;
+ std::vector<uint32_t> hb_back_min;
+ std::vector<uint32_t> hb_back_max;
+
+ uint32_t hb_total_front = 0;
+ uint32_t hb_min_front = UINT_MAX;
+ uint32_t hb_max_front = 0;
+ std::vector<uint32_t> hb_front_pingtime;
+ std::vector<uint32_t> hb_front_min;
+ std::vector<uint32_t> hb_front_max;
+
+ bool is_stale(utime_t stale) const {
+ if (ping_history.empty()) {
+ return false;
+ }
+ utime_t oldest_deadline = ping_history.begin()->second.first;
+ return oldest_deadline <= stale;
+ }
+
+ bool is_unhealthy(utime_t now) const {
+ if (ping_history.empty()) {
+ /// either we haven't sent a ping yet or we have received all replies;
+ /// either way we are safe and healthy for now
+ return false;
+ }
+
+ utime_t oldest_deadline = ping_history.begin()->second.first;
+ return now > oldest_deadline;
+ }
+
+ bool is_healthy(utime_t now) const {
+ if (last_rx_front == utime_t() || last_rx_back == utime_t()) {
+ // do not declare ourselves healthy until we have received the first
+ // replies on both the front and back connections
+ return false;
+ }
+ return !is_unhealthy(now);
+ }
+
+ void clear_mark_down(Connection *except = nullptr) {
+ if (con_back && con_back != except) {
+ con_back->mark_down();
+ con_back->clear_priv();
+ con_back.reset(nullptr);
+ }
+ if (con_front && con_front != except) {
+ con_front->mark_down();
+ con_front->clear_priv();
+ con_front.reset(nullptr);
+ }
+ }
+ };
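+
+  // Illustrative sketch only of the ping_history bookkeeping that is_stale()
+  // and is_unhealthy() rely on; the real updates happen in heartbeat() and
+  // handle_osd_ping().  Both example_* helpers are hypothetical.
+  static void example_note_ping_sent(HeartbeatInfo& hi, utime_t now,
+                                     utime_t deadline) {
+    // one entry per outstanding ping: its deadline plus the number of replies
+    // still expected (one per connection: front + back)
+    hi.ping_history[now] = std::make_pair(deadline,
+                                          HeartbeatInfo::HEARTBEAT_MAX_CONN);
+    hi.last_tx = now;
+  }
+  static void example_note_ping_reply(HeartbeatInfo& hi, utime_t sent_stamp) {
+    auto i = hi.ping_history.find(sent_stamp);
+    if (i == hi.ping_history.end())
+      return;                        // reply for a ping we already retired
+    if (--i->second.second == 0)     // last expected reply for this ping
+      hi.ping_history.erase(i);      // is_stale()/is_unhealthy() ignore it now
+  }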
+
+ ceph::mutex heartbeat_lock = ceph::make_mutex("OSD::heartbeat_lock");
+ std::map<int, int> debug_heartbeat_drops_remaining;
+ ceph::condition_variable heartbeat_cond;
+ bool heartbeat_stop;
+ std::atomic<bool> heartbeat_need_update;
+ std::map<int,HeartbeatInfo> heartbeat_peers; ///< map of osd id to HeartbeatInfo
+ utime_t last_mon_heartbeat;
+ Messenger *hb_front_client_messenger;
+ Messenger *hb_back_client_messenger;
+ Messenger *hb_front_server_messenger;
+ Messenger *hb_back_server_messenger;
+ utime_t last_heartbeat_resample; ///< last time we chose random peers in waiting-for-healthy state
+ double daily_loadavg;
+ ceph::mono_time startup_time;
+
+ // Track ping response times using a vector as a circular buffer
+ // MUST BE A POWER OF 2
+ const uint32_t hb_vector_size = 16;
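+
+  // Illustrative sketch only of why the size must be a power of two: the
+  // circular index can then be computed with a bit mask instead of a modulo.
+  // example_hb_ring_index() is a hypothetical helper.
+  uint32_t example_hb_ring_index(uint32_t sample_count) const {
+    return sample_count & (hb_vector_size - 1);  // same as % hb_vector_size
+  }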
+
+ void _add_heartbeat_peer(int p);
+ void _remove_heartbeat_peer(int p);
+ bool heartbeat_reset(Connection *con);
+ void maybe_update_heartbeat_peers();
+ void reset_heartbeat_peers(bool all);
+ bool heartbeat_peers_need_update() {
+ return heartbeat_need_update.load();
+ }
+ void heartbeat_set_peers_need_update() {
+ heartbeat_need_update.store(true);
+ }
+ void heartbeat_clear_peers_need_update() {
+ heartbeat_need_update.store(false);
+ }
+ void heartbeat();
+ void heartbeat_check();
+ void heartbeat_entry();
+ void need_heartbeat_peer_update();
+
+ void heartbeat_kick() {
+ std::lock_guard l(heartbeat_lock);
+ heartbeat_cond.notify_all();
+ }
+
+ struct T_Heartbeat : public Thread {
+ OSD *osd;
+ explicit T_Heartbeat(OSD *o) : osd(o) {}
+ void *entry() override {
+ osd->heartbeat_entry();
+ return 0;
+ }
+ } heartbeat_thread;
+
+public:
+ bool heartbeat_dispatch(Message *m);
+
+ struct HeartbeatDispatcher : public Dispatcher {
+ OSD *osd;
+ explicit HeartbeatDispatcher(OSD *o) : Dispatcher(o->cct), osd(o) {}
+
+ bool ms_can_fast_dispatch_any() const override { return true; }
+ bool ms_can_fast_dispatch(const Message *m) const override {
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ case MSG_OSD_PING:
+ return true;
+ default:
+ return false;
+ }
+ }
+ void ms_fast_dispatch(Message *m) override {
+ osd->heartbeat_dispatch(m);
+ }
+ bool ms_dispatch(Message *m) override {
+ return osd->heartbeat_dispatch(m);
+ }
+ bool ms_handle_reset(Connection *con) override {
+ return osd->heartbeat_reset(con);
+ }
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override {
+ return osd->ms_handle_refused(con);
+ }
+ int ms_handle_authentication(Connection *con) override {
+ return true;
+ }
+ } heartbeat_dispatcher;
+
+private:
+ // -- waiters --
+ std::list<OpRequestRef> finished;
+
+ void take_waiters(std::list<OpRequestRef>& ls) {
+ ceph_assert(ceph_mutex_is_locked(osd_lock));
+ finished.splice(finished.end(), ls);
+ }
+ void do_waiters();
+
+ // -- op tracking --
+ OpTracker op_tracker;
+ void test_ops(std::string command, std::string args, std::ostream& ss);
+ friend class TestOpsSocketHook;
+ TestOpsSocketHook *test_ops_hook;
+ friend struct C_FinishSplits;
+ friend struct C_OpenPGs;
+
+protected:
+
+ /*
+ * The ordered op delivery chain is:
+ *
+ * fast dispatch -> scheduler back
+ * scheduler front <-> to_process back
+ * to_process front -> RunVis(item)
+ * <- queue_front()
+ *
+ * The scheduler is per-shard, and to_process is per pg_slot. Items can be
+ * pushed back up into to_process and/or scheduler while order is preserved.
+ *
+ * Multiple worker threads can operate on each shard.
+ *
+ * Under normal circumstances, num_running == to_process.size(). There are
+ * two times when that is not true: (1) when waiting_for_pg == true and
+ * to_process is accumulating requests that are waiting for the pg to be
+ * instantiated; in that case they will all get requeued together by
+ * wake_pg_waiters, and (2) just after wake_pg_waiters has run, cleared
+ * waiting_for_pg, and already requeued the items.
+ */
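+
+  /*
+   * Illustrative sketch only of the bottom of the chain above: an item is
+   * taken from the front of to_process, and if it cannot run yet it is
+   * pushed back to the *front* (the "<- queue_front()" arrow) so per-pg
+   * ordering is preserved.  example_drain_slot() and its try_run parameter
+   * are hypothetical; the real dispatch lives in ShardedOpWQ::_process().
+   */
+  template <typename TryRun>
+  static void example_drain_slot(OSDShardPGSlot& slot, TryRun&& try_run) {
+    while (!slot.to_process.empty()) {
+      OpSchedulerItem item = std::move(slot.to_process.front());
+      slot.to_process.pop_front();
+      // try_run() is expected to return false, without consuming the item,
+      // when the item cannot run yet (e.g. the pg is not available)
+      if (!try_run(item)) {
+        slot.to_process.push_front(std::move(item));  // keep it first in line
+        break;
+      }
+    }
+  }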
+ friend class ceph::osd::scheduler::PGOpItem;
+ friend class ceph::osd::scheduler::PGPeeringItem;
+ friend class ceph::osd::scheduler::PGRecovery;
+ friend class ceph::osd::scheduler::PGRecoveryMsg;
+ friend class ceph::osd::scheduler::PGDelete;
+
+ class ShardedOpWQ
+ : public ShardedThreadPool::ShardedWQ<OpSchedulerItem>
+ {
+ OSD *osd;
+
+ public:
+ ShardedOpWQ(OSD *o,
+ ceph::timespan ti,
+ ceph::timespan si,
+ ShardedThreadPool* tp)
+ : ShardedThreadPool::ShardedWQ<OpSchedulerItem>(ti, si, tp),
+ osd(o) {
+ }
+
+ void _add_slot_waiter(
+ spg_t token,
+ OSDShardPGSlot *slot,
+ OpSchedulerItem&& qi);
+
+ /// try to do some work
+ void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb) override;
+
+ /// enqueue a new item
+ void _enqueue(OpSchedulerItem&& item) override;
+
+ /// requeue an old item (at the front of the line)
+ void _enqueue_front(OpSchedulerItem&& item) override;
+
+ void return_waiting_threads() override {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ OSDShard* sdata = osd->shards[i];
+ assert (NULL != sdata);
+ std::scoped_lock l{sdata->sdata_wait_lock};
+ sdata->stop_waiting = true;
+ sdata->sdata_cond.notify_all();
+ }
+ }
+
+ void stop_return_waiting_threads() override {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ OSDShard* sdata = osd->shards[i];
+ assert (NULL != sdata);
+ std::scoped_lock l{sdata->sdata_wait_lock};
+ sdata->stop_waiting = false;
+ }
+ }
+
+ void dump(ceph::Formatter *f) {
+ for(uint32_t i = 0; i < osd->num_shards; i++) {
+ auto &&sdata = osd->shards[i];
+
+ char queue_name[32] = {0};
+ snprintf(queue_name, sizeof(queue_name), "%s%" PRIu32, "OSD:ShardedOpWQ:", i);
+ ceph_assert(NULL != sdata);
+
+ std::scoped_lock l{sdata->shard_lock};
+ f->open_object_section(queue_name);
+ sdata->scheduler->dump(*f);
+ f->close_section();
+ }
+ }
+
+ bool is_shard_empty(uint32_t thread_index) override {
+ uint32_t shard_index = thread_index % osd->num_shards;
+ auto &&sdata = osd->shards[shard_index];
+ ceph_assert(sdata);
+ std::lock_guard l(sdata->shard_lock);
+ if (thread_index < osd->num_shards) {
+ return sdata->scheduler->empty() && sdata->context_queue.empty();
+ } else {
+ return sdata->scheduler->empty();
+ }
+ }
+
+ void handle_oncommits(std::list<Context*>& oncommits) {
+ for (auto p : oncommits) {
+ p->complete(0);
+ }
+ }
+ } op_shardedwq;
+
+
+ void enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch);
+ void dequeue_op(
+ PGRef pg, OpRequestRef op,
+ ThreadPool::TPHandle &handle);
+
+ void enqueue_peering_evt(
+ spg_t pgid,
+ PGPeeringEventRef ref);
+ void dequeue_peering_evt(
+ OSDShard *sdata,
+ PG *pg,
+ PGPeeringEventRef ref,
+ ThreadPool::TPHandle& handle);
+
+ void dequeue_delete(
+ OSDShard *sdata,
+ PG *pg,
+ epoch_t epoch,
+ ThreadPool::TPHandle& handle);
+
+ friend class PG;
+ friend struct OSDShard;
+ friend class PrimaryLogPG;
+ friend class PgScrubber;
+
+
+ protected:
+
+ // -- osd map --
+ // TODO: switch to std::atomic<OSDMapRef> when C++20 is available.
+ OSDMapRef _osdmap;
+ void set_osdmap(OSDMapRef osdmap) {
+ std::atomic_store(&_osdmap, osdmap);
+ }
+ OSDMapRef get_osdmap() const {
+ return std::atomic_load(&_osdmap);
+ }
+ epoch_t get_osdmap_epoch() const {
+ // XXX: performance?
+ auto osdmap = get_osdmap();
+ return osdmap ? osdmap->get_epoch() : 0;
+ }
+
+ pool_pg_num_history_t pg_num_history;
+
+ ceph::shared_mutex map_lock = ceph::make_shared_mutex("OSD::map_lock");
+ std::list<OpRequestRef> waiting_for_osdmap;
+ std::deque<utime_t> osd_markdown_log;
+
+ friend struct send_map_on_destruct;
+
+ void wait_for_new_map(OpRequestRef op);
+ void handle_osd_map(class MOSDMap *m);
+ void _committed_osd_maps(epoch_t first, epoch_t last, class MOSDMap *m);
+ void trim_maps(epoch_t oldest, int nreceived, bool skip_maps);
+ void note_down_osd(int osd);
+ void note_up_osd(int osd);
+ friend struct C_OnMapCommit;
+
+ bool advance_pg(
+ epoch_t advance_to,
+ PG *pg,
+ ThreadPool::TPHandle &handle,
+ PeeringCtx &rctx);
+ void consume_map();
+ void activate_map();
+
+ // osd map cache (past osd maps)
+ OSDMapRef get_map(epoch_t e) {
+ return service.get_map(e);
+ }
+ OSDMapRef add_map(OSDMap *o) {
+ return service.add_map(o);
+ }
+ bool get_map_bl(epoch_t e, ceph::buffer::list& bl) {
+ return service.get_map_bl(e, bl);
+ }
+
+public:
+ // -- shards --
+ std::vector<OSDShard*> shards;
+ uint32_t num_shards = 0;
+
+ void inc_num_pgs() {
+ ++num_pgs;
+ }
+ void dec_num_pgs() {
+ --num_pgs;
+ }
+ int get_num_pgs() const {
+ return num_pgs;
+ }
+
+protected:
+ ceph::mutex merge_lock = ceph::make_mutex("OSD::merge_lock");
+ /// merge epoch -> target pgid -> source pgid -> pg
+ std::map<epoch_t,std::map<spg_t,std::map<spg_t,PGRef>>> merge_waiters;
+
+ bool add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef source,
+ unsigned need);
+
+ // -- placement groups --
+ std::atomic<size_t> num_pgs = {0};
+
+ std::mutex pending_creates_lock;
+ using create_from_osd_t = std::pair<spg_t, bool /* is primary*/>;
+ std::set<create_from_osd_t> pending_creates_from_osd;
+ unsigned pending_creates_from_mon = 0;
+
+ PGRecoveryStats pg_recovery_stats;
+
+ PGRef _lookup_pg(spg_t pgid);
+ PGRef _lookup_lock_pg(spg_t pgid);
+ void register_pg(PGRef pg);
+ bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
+
+ void _get_pgs(std::vector<PGRef> *v, bool clear_too=false);
+ void _get_pgids(std::vector<spg_t> *v);
+
+public:
+ PGRef lookup_lock_pg(spg_t pgid);
+
+ std::set<int64_t> get_mapped_pools();
+
+protected:
+ PG* _make_pg(OSDMapRef createmap, spg_t pgid);
+
+ bool maybe_wait_for_max_pg(const OSDMapRef& osdmap,
+ spg_t pgid, bool is_mon_create);
+ void resume_creating_pg();
+
+ void load_pgs();
+
+ /// build initial pg history and intervals on create
+ void build_initial_pg_history(
+ spg_t pgid,
+ epoch_t created,
+ utime_t created_stamp,
+ pg_history_t *h,
+ PastIntervals *pi);
+
+ epoch_t last_pg_create_epoch;
+
+ void handle_pg_create(OpRequestRef op);
+
+ void split_pgs(
+ PG *parent,
+ const std::set<spg_t> &childpgids, std::set<PGRef> *out_pgs,
+ OSDMapRef curmap,
+ OSDMapRef nextmap,
+ PeeringCtx &rctx);
+ void _finish_splits(std::set<PGRef>& pgs);
+
+ // == monitor interaction ==
+ ceph::mutex mon_report_lock = ceph::make_mutex("OSD::mon_report_lock");
+ utime_t last_mon_report;
+ Finisher boot_finisher;
+
+ // -- boot --
+ void start_boot();
+ void _got_mon_epochs(epoch_t oldest, epoch_t newest);
+ void _preboot(epoch_t oldest, epoch_t newest);
+ void _send_boot();
+ void _collect_metadata(std::map<std::string,std::string> *pmeta);
+ void _get_purged_snaps();
+ void handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *r);
+
+ void start_waiting_for_healthy();
+ bool _is_healthy();
+
+ void send_full_update();
+
+ friend struct CB_OSD_GetVersion;
+
+ // -- alive --
+ epoch_t up_thru_wanted;
+
+ void queue_want_up_thru(epoch_t want);
+ void send_alive();
+
+ // -- full map requests --
+ epoch_t requested_full_first, requested_full_last;
+
+ void request_full_map(epoch_t first, epoch_t last);
+ void rerequest_full_maps() {
+ epoch_t first = requested_full_first;
+ epoch_t last = requested_full_last;
+ requested_full_first = 0;
+ requested_full_last = 0;
+ request_full_map(first, last);
+ }
+ void got_full_map(epoch_t e);
+
+ // -- failures --
+ std::map<int,utime_t> failure_queue;
+ std::map<int,std::pair<utime_t,entity_addrvec_t> > failure_pending;
+
+ void requeue_failures();
+ void send_failures();
+ void send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs);
+ void cancel_pending_failures();
+
+ ceph::coarse_mono_clock::time_point last_sent_beacon;
+ ceph::mutex min_last_epoch_clean_lock = ceph::make_mutex("OSD::min_last_epoch_clean_lock");
+ epoch_t min_last_epoch_clean = 0;
+ // which pgs were scanned for min_lec
+ std::vector<pg_t> min_last_epoch_clean_pgs;
+ void send_beacon(const ceph::coarse_mono_clock::time_point& now);
+
+ ceph_tid_t get_tid() {
+ return service.get_tid();
+ }
+
+ double scrub_sleep_time(bool must_scrub);
+
+ // -- generic pg peering --
+ PeeringCtx create_context();
+ void dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
+ ThreadPool::TPHandle *handle = NULL);
+
+ bool require_mon_peer(const Message *m);
+ bool require_mon_or_mgr_peer(const Message *m);
+ bool require_osd_peer(const Message *m);
+ /***
+ * Verifies that we were alive in the given epoch, and that
+ * still are.
+ */
+ bool require_self_aliveness(const Message *m, epoch_t alive_since);
+ /**
+ * Verifies that the OSD who sent the given op has the same
+ * address as in the given map.
+ * @pre op was sent by an OSD using the cluster messenger
+ */
+ bool require_same_peer_instance(const Message *m, const OSDMapRef& map,
+ bool is_fast_dispatch);
+
+ bool require_same_or_newer_map(OpRequestRef& op, epoch_t e,
+ bool is_fast_dispatch);
+
+ void handle_fast_pg_create(MOSDPGCreate2 *m);
+ void handle_fast_pg_query(MOSDPGQuery *m);
+ void handle_pg_query_nopg(const MQuery& q);
+ void handle_fast_pg_notify(MOSDPGNotify *m);
+ void handle_pg_notify_nopg(const MNotifyRec& q);
+ void handle_fast_pg_info(MOSDPGInfo *m);
+ void handle_fast_pg_remove(MOSDPGRemove *m);
+
+public:
+ // used by OSDShard
+ PGRef handle_pg_create_info(const OSDMapRef& osdmap, const PGCreateInfo *info);
+protected:
+
+ void handle_fast_force_recovery(MOSDForceRecovery *m);
+
+ // -- commands --
+ void handle_command(class MCommand *m);
+
+
+ // -- pg recovery --
+ void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved,
+ ThreadPool::TPHandle &handle);
+
+
+ // -- scrubbing --
+ void sched_scrub();
+ void resched_all_scrubs();
+ bool scrub_random_backoff();
+ bool scrub_load_below_threshold();
+ bool scrub_time_permit(utime_t now);
+
+ // -- status reporting --
+ MPGStats *collect_pg_stats();
+ std::vector<DaemonHealthMetric> get_health_metrics();
+
+
+private:
+ bool ms_can_fast_dispatch_any() const override { return true; }
+ bool ms_can_fast_dispatch(const Message *m) const override {
+ switch (m->get_type()) {
+ case CEPH_MSG_PING:
+ case CEPH_MSG_OSD_OP:
+ case CEPH_MSG_OSD_BACKOFF:
+ case MSG_OSD_SCRUB2:
+ case MSG_OSD_FORCE_RECOVERY:
+ case MSG_MON_COMMAND:
+ case MSG_OSD_PG_CREATE2:
+ case MSG_OSD_PG_QUERY:
+ case MSG_OSD_PG_QUERY2:
+ case MSG_OSD_PG_INFO:
+ case MSG_OSD_PG_INFO2:
+ case MSG_OSD_PG_NOTIFY:
+ case MSG_OSD_PG_NOTIFY2:
+ case MSG_OSD_PG_LOG:
+ case MSG_OSD_PG_TRIM:
+ case MSG_OSD_PG_REMOVE:
+ case MSG_OSD_BACKFILL_RESERVE:
+ case MSG_OSD_RECOVERY_RESERVE:
+ case MSG_OSD_REPOP:
+ case MSG_OSD_REPOPREPLY:
+ case MSG_OSD_PG_PUSH:
+ case MSG_OSD_PG_PULL:
+ case MSG_OSD_PG_PUSH_REPLY:
+ case MSG_OSD_PG_SCAN:
+ case MSG_OSD_PG_BACKFILL:
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ case MSG_OSD_EC_WRITE:
+ case MSG_OSD_EC_WRITE_REPLY:
+ case MSG_OSD_EC_READ:
+ case MSG_OSD_EC_READ_REPLY:
+ case MSG_OSD_SCRUB_RESERVE:
+ case MSG_OSD_REP_SCRUB:
+ case MSG_OSD_REP_SCRUBMAP:
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ case MSG_OSD_PG_LEASE:
+ case MSG_OSD_PG_LEASE_ACK:
+ return true;
+ default:
+ return false;
+ }
+ }
+ void ms_fast_dispatch(Message *m) override;
+ bool ms_dispatch(Message *m) override;
+ void ms_handle_connect(Connection *con) override;
+ void ms_handle_fast_connect(Connection *con) override;
+ void ms_handle_fast_accept(Connection *con) override;
+ int ms_handle_authentication(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override {}
+ bool ms_handle_refused(Connection *con) override;
+
+ public:
+ /* internal and external can point to the same messenger; they will still
+ * be cleaned up properly */
+ OSD(CephContext *cct_,
+ ObjectStore *store_,
+ int id,
+ Messenger *internal,
+ Messenger *external,
+ Messenger *hb_front_client,
+ Messenger *hb_back_client,
+ Messenger *hb_front_server,
+ Messenger *hb_back_server,
+ Messenger *osdc_messenger,
+ MonClient *mc, const std::string &dev, const std::string &jdev,
+ ceph::async::io_context_pool& poolctx);
+ ~OSD() override;
+
+ // static bits
+ static int mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, std::string osdspec_affinity);
+
+ /* remove any non-user xattrs from a map of them */
+ void filter_xattrs(std::map<std::string, ceph::buffer::ptr>& attrs) {
+ for (std::map<std::string, ceph::buffer::ptr>::iterator iter = attrs.begin();
+ iter != attrs.end();
+ ) {
+ if (('_' != iter->first.at(0)) || (iter->first.size() == 1))
+ attrs.erase(iter++);
+ else ++iter;
+ }
+ }
+
+private:
+ int mon_cmd_maybe_osd_create(std::string &cmd);
+ int update_crush_device_class();
+ int update_crush_location();
+
+ static int write_meta(CephContext *cct,
+ ObjectStore *store,
+ uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, std::string& osdspec_affinity);
+
+ void handle_scrub(class MOSDScrub *m);
+ void handle_fast_scrub(class MOSDScrub2 *m);
+ void handle_osd_ping(class MOSDPing *m);
+
+ size_t get_num_cache_shards();
+ int get_num_op_shards();
+ int get_num_op_threads();
+
+ float get_osd_recovery_sleep();
+ float get_osd_delete_sleep();
+ float get_osd_snap_trim_sleep();
+
+ int get_recovery_max_active();
+ void maybe_override_max_osd_capacity_for_qos();
+ bool maybe_override_options_for_qos();
+ int run_osd_bench_test(int64_t count,
+ int64_t bsize,
+ int64_t osize,
+ int64_t onum,
+ double *elapsed,
+ std::ostream& ss);
+ int mon_cmd_set_config(const std::string &key, const std::string &val);
+
+ void scrub_purged_snaps();
+ void probe_smart(const std::string& devid, std::ostream& ss);
+
+public:
+ static int peek_meta(ObjectStore *store,
+ std::string *magic,
+ uuid_d *cluster_fsid,
+ uuid_d *osd_fsid,
+ int *whoami,
+ ceph_release_t *min_osd_release);
+
+
+ // startup/shutdown
+ int pre_init();
+ int init();
+ void final_init();
+
+ int enable_disable_fuse(bool stop);
+ int set_numa_affinity();
+
+ void suicide(int exitcode);
+ int shutdown();
+
+ void handle_signal(int signum);
+
+ /// check if we can throw out op from a disconnected client
+ static bool op_is_discardable(const MOSDOp *m);
+
+public:
+ OSDService service;
+ friend class OSDService;
+
+private:
+ void set_perf_queries(const ConfigPayload &config_payload);
+ MetricPayload get_perf_reports();
+
+ ceph::mutex m_perf_queries_lock = ceph::make_mutex("OSD::m_perf_queries_lock");
+ std::list<OSDPerfMetricQuery> m_perf_queries;
+ std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> m_perf_limits;
+};
+
+
+// compatibility of the executable
+extern const CompatSet::Feature ceph_osd_feature_compat[];
+extern const CompatSet::Feature ceph_osd_feature_ro_compat[];
+extern const CompatSet::Feature ceph_osd_feature_incompat[];
+
+#endif // CEPH_OSD_H
diff --git a/src/osd/OSDCap.cc b/src/osd/OSDCap.cc
new file mode 100644
index 000000000..e7bf05827
--- /dev/null
+++ b/src/osd/OSDCap.cc
@@ -0,0 +1,532 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/config/warning_disable.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_operator.hpp>
+#include <boost/spirit/include/phoenix.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "OSDCap.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "include/ipaddr.h"
+
+using std::ostream;
+using std::string;
+using std::vector;
+
+ostream& operator<<(ostream& out, const osd_rwxa_t& p)
+{
+ if (p == OSD_CAP_ANY)
+ return out << "*";
+
+ if (p & OSD_CAP_R)
+ out << "r";
+ if (p & OSD_CAP_W)
+ out << "w";
+ if ((p & OSD_CAP_X) == OSD_CAP_X) {
+ out << "x";
+ } else {
+ if (p & OSD_CAP_CLS_R)
+ out << " class-read";
+ if (p & OSD_CAP_CLS_W)
+ out << " class-write";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapSpec& s)
+{
+ if (s.allow)
+ return out << s.allow;
+ if (s.class_name.length()) {
+ out << "class '" << s.class_name << "'";
+ if (!s.method_name.empty()) {
+ out << " '" << s.method_name << "'";
+ }
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns)
+{
+ if (!pns.pool_name.empty()) {
+ out << "pool " << pns.pool_name << " ";
+ }
+ if (pns.nspace) {
+ out << "namespace ";
+ if (pns.nspace->empty()) {
+ out << "\"\"";
+ } else {
+ out << *pns.nspace;
+ }
+ out << " ";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream &out, const OSDCapPoolTag &pt)
+{
+ out << "app " << pt.application << " key " << pt.key << " val " << pt.value
+ << " ";
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapMatch& m)
+{
+ if (!m.pool_namespace.pool_name.empty() || m.pool_namespace.nspace) {
+ out << m.pool_namespace;
+ }
+
+ if (!m.pool_tag.application.empty()) {
+ out << m.pool_tag;
+ }
+
+ if (m.object_prefix.length()) {
+ out << "object_prefix " << m.object_prefix << " ";
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const OSDCapProfile& m)
+{
+ out << "profile " << m.name;
+ out << m.pool_namespace;
+ return out;
+}
+
+bool OSDCapPoolNamespace::is_match(const std::string& pn,
+ const std::string& ns) const
+{
+ if (!pool_name.empty()) {
+ if (pool_name != pn) {
+ return false;
+ }
+ }
+ if (nspace) {
+ if (!nspace->empty() && nspace->back() == '*' &&
+ boost::starts_with(ns, nspace->substr(0, nspace->length() - 1))) {
+ return true;
+ }
+
+ if (*nspace != ns) {
+ return false;
+ }
+ }
+ return true;
+}
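+
+/*
+ * Example of the wildcard handling above (illustrative only): a namespace
+ * ending in '*' matches any namespace with the given prefix, while any other
+ * namespace string must match exactly, and a non-empty pool name must match
+ * the pool exactly.  The values below are hypothetical.
+ *
+ *   OSDCapPoolNamespace pns;
+ *   pns.pool_name = "rbd";
+ *   pns.nspace = std::string("team1*");
+ *   pns.is_match("rbd", "team1-images");  // true  (namespace prefix match)
+ *   pns.is_match("rbd", "team2");         // false (namespace mismatch)
+ *   pns.is_match("ssd", "team1-images");  // false (pool mismatch)
+ */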
+
+bool OSDCapPoolNamespace::is_match_all() const
+{
+ if (!pool_name.empty())
+ return false;
+ if (nspace)
+ return false;
+ return true;
+}
+
+bool OSDCapPoolTag::is_match(const app_map_t& app_map) const
+{
+ if (application.empty()) {
+ return true;
+ }
+ auto kv_map = app_map.find(application);
+ if (kv_map == app_map.end()) {
+ return false;
+ }
+ if (!key.compare("*") && !value.compare("*")) {
+ return true;
+ }
+ if (!key.compare("*")) {
+ for (auto it : kv_map->second) {
+ if (it.second == value) {
+ return true;
+ }
+ }
+ return false;
+ }
+ auto kv_val = kv_map->second.find(key);
+ if (kv_val == kv_map->second.end()) {
+ return false;
+ }
+ if (!value.compare("*")) {
+ return true;
+ }
+ return kv_val->second == value;
+}
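+
+// Illustrative example of the tag matching above (editor's note, the values
+// are made up): a grant carrying tag application "cephfs", key "data",
+// value "cephfs_a" matches any pool whose application metadata contains
+// {"cephfs": {"data": "cephfs_a", ...}}.  A key or value of "*" wildcards
+// that side of the comparison, and an empty application matches every pool.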
+
+bool OSDCapPoolTag::is_match_all() const {
+ return application.empty();
+}
+
+bool OSDCapMatch::is_match(const string& pn, const string& ns,
+ const OSDCapPoolTag::app_map_t& app_map,
+ const string& object) const
+{
+ if (!pool_namespace.is_match(pn, ns)) {
+ return false;
+ } else if (!pool_tag.is_match(app_map)) {
+ return false;
+ }
+
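+  // the object name must start with object_prefix; e.g. a prefix of
+  // "rbd_data." matches "rbd_data.1234" but not "foo.rbd_data.1234"
+  // (illustrative names only)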
+ if (object_prefix.length()) {
+ if (object.find(object_prefix) != 0)
+ return false;
+ }
+ return true;
+}
+
+bool OSDCapMatch::is_match_all() const
+{
+  if (!pool_namespace.is_match_all()) {
+ return false;
+ } else if (!pool_tag.is_match_all()) {
+ return false;
+ }
+
+ if (object_prefix.length()) {
+ return false;
+ }
+ return true;
+}
+
+ostream& operator<<(ostream& out, const OSDCapGrant& g)
+{
+ out << "grant(";
+ if (g.profile.is_valid()) {
+ out << g.profile << " [";
+ for (auto it = g.profile_grants.cbegin();
+ it != g.profile_grants.cend(); ++it) {
+ if (it != g.profile_grants.cbegin()) {
+ out << ",";
+ }
+ out << *it;
+ }
+ out << "]";
+ } else {
+ out << g.match << g.spec;
+ }
+ if (g.network.size()) {
+ out << " network " << g.network;
+ }
+ out << ")";
+ return out;
+}
+
+void OSDCapGrant::set_network(const string& n)
+{
+ network = n;
+ network_valid = ::parse_network(n.c_str(), &network_parsed, &network_prefix);
+}
+
+bool OSDCapGrant::allow_all() const
+{
+ if (profile.is_valid()) {
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [](const OSDCapGrant& grant) {
+ return grant.allow_all();
+ });
+ }
+
+ return (match.is_match_all() && spec.allow_all());
+}
+
+bool OSDCapGrant::is_capable(
+ const string& pool_name,
+ const string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const string& object,
+ bool op_may_read,
+ bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr,
+ std::vector<bool>* class_allowed) const
+{
+ osd_rwxa_t allow = 0;
+
+ if (network.size() &&
+ (!network_valid ||
+ !network_contains(network_parsed,
+ network_prefix,
+ addr))) {
+ return false;
+ }
+
+ if (profile.is_valid()) {
+ return std::any_of(profile_grants.cbegin(), profile_grants.cend(),
+ [&](const OSDCapGrant& grant) {
+ return grant.is_capable(pool_name, ns,
+ application_metadata,
+ object, op_may_read,
+ op_may_write, classes, addr,
+ class_allowed);
+ });
+ } else {
+ if (match.is_match(pool_name, ns, application_metadata, object)) {
+ allow = allow | spec.allow;
+ if ((op_may_read && !(allow & OSD_CAP_R)) ||
+ (op_may_write && !(allow & OSD_CAP_W))) {
+ return false;
+ }
+ if (!classes.empty()) {
+ // check 'allow *'
+ if (spec.allow_all()) {
+ return true;
+ }
+
+ // compare this grant to each class in the operation
+ for (size_t i = 0; i < classes.size(); ++i) {
+ // check 'allow class foo [method_name]'
+ if (!spec.class_name.empty() &&
+ classes[i].class_name == spec.class_name &&
+ (spec.method_name.empty() ||
+ classes[i].method_name == spec.method_name)) {
+ (*class_allowed)[i] = true;
+ continue;
+ }
+ // check 'allow x | class-{rw}': must be on allow list
+ if (!classes[i].allowed) {
+ continue;
+ }
+ if ((classes[i].read && !(allow & OSD_CAP_CLS_R)) ||
+ (classes[i].write && !(allow & OSD_CAP_CLS_W))) {
+ continue;
+ }
+ (*class_allowed)[i] = true;
+ }
+ if (!std::all_of(class_allowed->cbegin(), class_allowed->cend(),
+ [](bool v) { return v; })) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+void OSDCapGrant::expand_profile()
+{
+ if (profile.name == "read-only") {
+ // grants READ-ONLY caps to the OSD
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R)));
+ return;
+ }
+ if (profile.name == "read-write") {
+ // grants READ-WRITE caps to the OSD
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R | OSD_CAP_W)));
+ }
+
+ if (profile.name == "rbd") {
+ // RBD read-write grant
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_info"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R)));
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_children"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(string(), "rbd_mirroring"),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace.pool_name),
+ OSDCapSpec("rbd", "metadata_list"));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+ OSD_CAP_W |
+ OSD_CAP_X)));
+ }
+ if (profile.name == "rbd-read-only") {
+ // RBD read-only grant
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace),
+ OSDCapSpec(osd_rwxa_t(OSD_CAP_R |
+ OSD_CAP_CLS_R)));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace,
+ "rbd_header."),
+ OSDCapSpec("rbd", "child_attach"));
+ profile_grants.emplace_back(OSDCapMatch(profile.pool_namespace,
+ "rbd_header."),
+ OSDCapSpec("rbd", "child_detach"));
+ }
+}
+
+bool OSDCap::allow_all() const
+{
+ for (auto &grant : grants) {
+ if (grant.allow_all()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void OSDCap::set_allow_all()
+{
+ grants.clear();
+ grants.push_back(OSDCapGrant(OSDCapMatch(), OSDCapSpec(OSD_CAP_ANY)));
+}
+
+bool OSDCap::is_capable(const string& pool_name, const string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const string& object,
+ bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr) const
+{
+ std::vector<bool> class_allowed(classes.size(), false);
+ for (auto &grant : grants) {
+ if (grant.is_capable(pool_name, ns, application_metadata,
+ object, op_may_read, op_may_write, classes, addr,
+ &class_allowed)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+// grammar
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+template <typename Iterator>
+struct OSDCapParser : qi::grammar<Iterator, OSDCap()>
+{
+ OSDCapParser() : OSDCapParser::base_type(osdcap)
+ {
+ using qi::char_;
+ using qi::int_;
+ using qi::lexeme;
+ using qi::alnum;
+ using qi::_val;
+ using qi::_1;
+ using qi::_2;
+ using qi::_3;
+ using qi::eps;
+ using qi::lit;
+
+ quoted_string %=
+ lexeme['"' >> +(char_ - '"') >> '"'] |
+ lexeme['\'' >> +(char_ - '\'') >> '\''];
+ equoted_string %=
+ lexeme['"' >> *(char_ - '"') >> '"'] |
+ lexeme['\'' >> *(char_ - '\'') >> '\''];
+ unquoted_word %= +char_("a-zA-Z0-9_./-");
+ str %= quoted_string | unquoted_word;
+ estr %= equoted_string | unquoted_word;
+ network_str %= +char_("/.:a-fA-F0-9][");
+
+ spaces = +ascii::space;
+
+ wildcard = (lit('*') | lit("all")) [_val = "*"];
+
+ pool_name %= -(spaces >> lit("pool") >> (lit('=') | spaces) >> str);
+ nspace %= (spaces >> lit("namespace")
+ >> (lit('=') | spaces)
+ >> estr >> -char_('*'));
+
+ // match := [pool[=]<poolname> [namespace[=]<namespace>]] [object_prefix <prefix>]
+ object_prefix %= -(spaces >> lit("object_prefix") >> spaces >> str);
+ pooltag %= (spaces >> lit("tag")
+ >> spaces >> str // application
+ >> spaces >> (wildcard | str) // key
+ >> -spaces >> lit('=') >> -spaces >> (wildcard | str)); // value
+
+ match = (
+ pooltag [_val = phoenix::construct<OSDCapMatch>(_1)] |
+ (nspace >> pooltag) [_val = phoenix::construct<OSDCapMatch>(_1, _2)] |
+ (pool_name >> nspace >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2, _3)] |
+ (pool_name >> object_prefix) [_val = phoenix::construct<OSDCapMatch>(_1, _2)]
+ );
+
+ // rwxa := * | [r][w][x] [class-read] [class-write]
+ rwxa =
+ (spaces >> wildcard[_val = OSD_CAP_ANY]) |
+ ( eps[_val = 0] >>
+ (
+ spaces >>
+ ( lit('r')[_val |= OSD_CAP_R] ||
+ lit('w')[_val |= OSD_CAP_W] ||
+ lit('x')[_val |= OSD_CAP_X] )) ||
+ ( (spaces >> lit("class-read")[_val |= OSD_CAP_CLS_R]) ||
+ (spaces >> lit("class-write")[_val |= OSD_CAP_CLS_W]) ));
+
+ // capspec := * | rwx | class <name> [<method name>]
+ class_name %= (spaces >> lit("class") >> spaces >> str);
+ method_name %= -(spaces >> str);
+ capspec = (
+ (rwxa) [_val = phoenix::construct<OSDCapSpec>(_1)] |
+ (class_name >> method_name) [_val = phoenix::construct<OSDCapSpec>(_1, _2)]);
+
+ // profile := profile <name> [pool[=]<pool> [namespace[=]<namespace>]]
+ profile_name %= (lit("profile") >> (lit('=') | spaces) >> str);
+ profile = (
+ (profile_name >> pool_name >> nspace) [_val = phoenix::construct<OSDCapProfile>(_1, _2, _3)] |
+ (profile_name >> pool_name) [_val = phoenix::construct<OSDCapProfile>(_1, _2)]);
+
+ // grant := allow match capspec
+ grant = (*ascii::blank >>
+ ((lit("allow") >> capspec >> match >>
+ -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_2, _1, _3)] |
+ (lit("allow") >> match >> capspec >>
+ -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_1, _2, _3)] |
+ (profile >> -(spaces >> lit("network") >> spaces >> network_str))
+ [_val = phoenix::construct<OSDCapGrant>(_1, _2)]
+ ) >> *ascii::blank);
+ // osdcap := grant [grant ...]
+ grants %= (grant % (lit(';') | lit(',')));
+ osdcap = grants [_val = phoenix::construct<OSDCap>(_1)];
+ }
+ qi::rule<Iterator> spaces;
+ qi::rule<Iterator, unsigned()> rwxa;
+ qi::rule<Iterator, string()> quoted_string, equoted_string;
+ qi::rule<Iterator, string()> unquoted_word;
+ qi::rule<Iterator, string()> str, estr, network_str;
+ qi::rule<Iterator, string()> wildcard;
+ qi::rule<Iterator, string()> class_name;
+ qi::rule<Iterator, string()> method_name;
+ qi::rule<Iterator, OSDCapSpec()> capspec;
+ qi::rule<Iterator, string()> pool_name;
+ qi::rule<Iterator, string()> nspace;
+ qi::rule<Iterator, string()> object_prefix;
+ qi::rule<Iterator, OSDCapPoolTag()> pooltag;
+ qi::rule<Iterator, OSDCapMatch()> match;
+ qi::rule<Iterator, string()> profile_name;
+ qi::rule<Iterator, OSDCapProfile()> profile;
+ qi::rule<Iterator, OSDCapGrant()> grant;
+ qi::rule<Iterator, std::vector<OSDCapGrant>()> grants;
+ qi::rule<Iterator, OSDCap()> osdcap;
+};
+
+bool OSDCap::parse(const string& str, ostream *err)
+{
+ OSDCapParser<string::const_iterator> g;
+ string::const_iterator iter = str.begin();
+ string::const_iterator end = str.end();
+
+  bool r = qi::parse(iter, end, g, *this);
+ if (r && iter == end)
+ return true;
+
+ // Make sure no grants are kept after parsing failed!
+ grants.clear();
+
+ if (err)
+ *err << "osd capability parse failed, stopped at '" << std::string(iter, end)
+ << "' of '" << str << "'";
+
+ return false;
+}
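+
+/*
+ * Illustrative only (editor's sketch, not part of the upstream file): a few
+ * capability strings the grammar above is intended to accept, driven through
+ * OSDCap::parse().  The pool, namespace, profile and network values are made
+ * up for the example.
+ *
+ *   OSDCap cap;
+ *   bool a = cap.parse("allow rwx pool=foo namespace bar");            // true
+ *   bool b = cap.parse("allow class-read object_prefix rbd_children"); // true
+ *   bool c = cap.parse("profile rbd pool=vms, profile rbd-read-only pool=images"); // true
+ *   bool d = cap.parse("allow r network 10.0.0.0/8");                  // true
+ */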
diff --git a/src/osd/OSDCap.h b/src/osd/OSDCap.h
new file mode 100644
index 000000000..394b1a726
--- /dev/null
+++ b/src/osd/OSDCap.h
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ * OSDCaps: Hold the capabilities associated with a single authenticated
+ * user key. These are specified by text strings of the form
+ * "allow r" (which allows reading anything on the OSD)
+ * "allow rwx pool foo" (which allows full access to listed pools)
+ * "allow *" (which allows full access to EVERYTHING)
+ *
+ * The full grammar is documented in the parser in OSDCap.cc.
+ *
+ * The OSD assumes that anyone with * caps is an admin and has full
+ * message permissions. This means that only the monitor and the OSDs
+ * should get *
+ */
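+
+/*
+ * Illustrative usage sketch (editor's note, not part of the original header;
+ * the pool, namespace and object names below are made up):
+ *
+ *   OSDCap cap;
+ *   if (cap.parse("allow rw pool=foo namespace bar")) {
+ *     bool ok = cap.is_capable("foo", "bar", {}, "rbd_data.1234",
+ *                              true, true, {}, entity_addr_t());
+ *     // ok == true: the grant covers pool "foo"/namespace "bar" for both
+ *     // reads and writes, and carries no network restriction.
+ *   }
+ */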
+
+#ifndef CEPH_OSDCAP_H
+#define CEPH_OSDCAP_H
+
+#include <ostream>
+using std::ostream;
+
+#include "include/types.h"
+#include "OpRequest.h"
+
+#include <list>
+#include <vector>
+#include <boost/optional.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+
+static const __u8 OSD_CAP_R = (1 << 1); // read
+static const __u8 OSD_CAP_W = (1 << 2); // write
+static const __u8 OSD_CAP_CLS_R = (1 << 3); // class read
+static const __u8 OSD_CAP_CLS_W = (1 << 4); // class write
+static const __u8 OSD_CAP_X = (OSD_CAP_CLS_R | OSD_CAP_CLS_W); // execute
+static const __u8 OSD_CAP_ANY = 0xff; // *
+
+struct osd_rwxa_t {
+ __u8 val;
+
+ // cppcheck-suppress noExplicitConstructor
+ osd_rwxa_t(__u8 v = 0) : val(v) {}
+ osd_rwxa_t& operator=(__u8 v) {
+ val = v;
+ return *this;
+ }
+ operator __u8() const {
+ return val;
+ }
+};
+
+ostream& operator<<(ostream& out, const osd_rwxa_t& p);
+
+struct OSDCapSpec {
+ osd_rwxa_t allow;
+ std::string class_name;
+ std::string method_name;
+
+ OSDCapSpec() : allow(0) {}
+ explicit OSDCapSpec(osd_rwxa_t v) : allow(v) {}
+ OSDCapSpec(std::string class_name, std::string method_name)
+ : allow(0), class_name(std::move(class_name)),
+ method_name(std::move(method_name)) {}
+
+ bool allow_all() const {
+ return allow == OSD_CAP_ANY;
+ }
+};
+
+ostream& operator<<(ostream& out, const OSDCapSpec& s);
+
+struct OSDCapPoolNamespace {
+ std::string pool_name;
+ boost::optional<std::string> nspace = boost::none;
+
+ OSDCapPoolNamespace() {
+ }
+ OSDCapPoolNamespace(const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : pool_name(pool_name), nspace(nspace) {
+ }
+
+ bool is_match(const std::string& pn, const std::string& ns) const;
+ bool is_match_all() const;
+};
+
+ostream& operator<<(ostream& out, const OSDCapPoolNamespace& pns);
+
+struct OSDCapPoolTag {
+ typedef std::map<std::string, std::map<std::string, std::string> > app_map_t;
+ std::string application;
+ std::string key;
+ std::string value;
+
+ OSDCapPoolTag () {}
+ OSDCapPoolTag(const std::string& application, const std::string& key,
+ const std::string& value) :
+ application(application), key(key), value(value) {}
+
+ bool is_match(const app_map_t& app_map) const;
+ bool is_match_all() const;
+};
+// adapt for parsing with boost::spirit::qi in OSDCapParser
+BOOST_FUSION_ADAPT_STRUCT(OSDCapPoolTag,
+ (std::string, application)
+ (std::string, key)
+ (std::string, value))
+
+ostream& operator<<(ostream& out, const OSDCapPoolTag& pt);
+
+struct OSDCapMatch {
+ typedef std::map<std::string, std::map<std::string, std::string> > app_map_t;
+ OSDCapPoolNamespace pool_namespace;
+ OSDCapPoolTag pool_tag;
+ std::string object_prefix;
+
+ OSDCapMatch() {}
+ explicit OSDCapMatch(const OSDCapPoolTag& pt) : pool_tag(pt) {}
+ explicit OSDCapMatch(const OSDCapPoolNamespace& pns) : pool_namespace(pns) {}
+ OSDCapMatch(const OSDCapPoolNamespace& pns, const std::string& pre)
+ : pool_namespace(pns), object_prefix(pre) {}
+ OSDCapMatch(const std::string& pl, const std::string& pre)
+ : pool_namespace(pl), object_prefix(pre) {}
+ OSDCapMatch(const std::string& pl, const std::string& ns,
+ const std::string& pre)
+ : pool_namespace(pl, ns), object_prefix(pre) {}
+ OSDCapMatch(const std::string& dummy, const std::string& app,
+ const std::string& key, const std::string& val)
+ : pool_tag(app, key, val) {}
+ OSDCapMatch(const std::string& ns, const OSDCapPoolTag& pt)
+ : pool_namespace("", ns), pool_tag(pt) {}
+
+ /**
+ * check if given request parameters match our constraints
+ *
+ * @param pool_name pool name
+ * @param nspace_name namespace name
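+   * @param app_map application metadata (tags) of the pool, keyed by application name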
+ * @param object object name
+ * @return true if we match, false otherwise
+ */
+ bool is_match(const std::string& pool_name, const std::string& nspace_name,
+ const app_map_t& app_map,
+ const std::string& object) const;
+ bool is_match_all() const;
+};
+
+ostream& operator<<(ostream& out, const OSDCapMatch& m);
+
+
+struct OSDCapProfile {
+ std::string name;
+ OSDCapPoolNamespace pool_namespace;
+
+ OSDCapProfile() {
+ }
+ OSDCapProfile(const std::string& name,
+ const std::string& pool_name,
+ const boost::optional<std::string>& nspace = boost::none)
+ : name(name), pool_namespace(pool_name, nspace) {
+ }
+
+ inline bool is_valid() const {
+ return !name.empty();
+ }
+};
+
+ostream& operator<<(ostream& out, const OSDCapProfile& m);
+
+struct OSDCapGrant {
+ OSDCapMatch match;
+ OSDCapSpec spec;
+ OSDCapProfile profile;
+ std::string network;
+ entity_addr_t network_parsed;
+ unsigned network_prefix = 0;
+ bool network_valid = true;
+
+ // explicit grants that a profile grant expands to; populated as
+ // needed by expand_profile() and cached here.
+ std::list<OSDCapGrant> profile_grants;
+
+ OSDCapGrant() {}
+ OSDCapGrant(const OSDCapMatch& m, const OSDCapSpec& s,
+ boost::optional<std::string> n = {})
+ : match(m), spec(s) {
+ if (n) {
+ set_network(*n);
+ }
+ }
+ explicit OSDCapGrant(const OSDCapProfile& profile,
+ boost::optional<std::string> n = {})
+ : profile(profile) {
+ if (n) {
+ set_network(*n);
+ }
+ expand_profile();
+ }
+
+ void set_network(const std::string& n);
+
+ bool allow_all() const;
+ bool is_capable(const std::string& pool_name, const std::string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const std::string& object, bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr,
+ std::vector<bool>* class_allowed) const;
+
+ void expand_profile();
+};
+
+ostream& operator<<(ostream& out, const OSDCapGrant& g);
+
+
+struct OSDCap {
+ std::vector<OSDCapGrant> grants;
+
+ OSDCap() {}
+ explicit OSDCap(std::vector<OSDCapGrant> g) : grants(std::move(g)) {}
+
+ bool allow_all() const;
+ void set_allow_all();
+ bool parse(const std::string& str, ostream *err=NULL);
+
+ /**
+ * check if we are capable of something
+ *
+ * This method actually checks a description of a particular operation against
+   * what the capability has specified. Currently that covers rwx and class
+   * permissions, matched against pool, namespace, pool tags, and object name prefix.
+ *
+ * @param pool_name name of the pool we are accessing
+ * @param ns name of the namespace we are accessing
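+   * @param application_metadata application metadata (tags) of the pool being accessed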
+ * @param object name of the object we are accessing
+ * @param op_may_read whether the operation may need to read
+ * @param op_may_write whether the operation may need to write
+ * @param classes (class-name, rd, wr, allowed-flag) tuples
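+   * @param addr client address, checked against any per-grant network restriction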
+ * @return true if the operation is allowed, false otherwise
+ */
+ bool is_capable(const std::string& pool_name, const std::string& ns,
+ const OSDCapPoolTag::app_map_t& application_metadata,
+ const std::string& object, bool op_may_read, bool op_may_write,
+ const std::vector<OpInfo::ClassInfo>& classes,
+ const entity_addr_t& addr) const;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const OSDCap& cap)
+{
+ return out << "osdcap" << cap.grants;
+}
+
+#endif
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
new file mode 100644
index 000000000..6e5caf53a
--- /dev/null
+++ b/src/osd/OSDMap.cc
@@ -0,0 +1,6412 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <optional>
+#include <random>
+
+#include <boost/algorithm/string.hpp>
+
+#include "OSDMap.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include "include/ceph_features.h"
+#include "include/common_fwd.h"
+#include "include/str_map.h"
+
+#include "common/code_environment.h"
+#include "mon/health_check.h"
+
+#include "crush/CrushTreeDumper.h"
+#include "common/Clock.h"
+#include "mon/PGMap.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::multimap;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unordered_map;
+using std::vector;
+
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+#define dout_subsys ceph_subsys_osd
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap, osdmap, osdmap);
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMap::Incremental, osdmap_inc, osdmap);
+
+
+// ----------------------------------
+// osd_info_t
+
+void osd_info_t::dump(Formatter *f) const
+{
+ f->dump_int("last_clean_begin", last_clean_begin);
+ f->dump_int("last_clean_end", last_clean_end);
+ f->dump_int("up_from", up_from);
+ f->dump_int("up_thru", up_thru);
+ f->dump_int("down_at", down_at);
+ f->dump_int("lost_at", lost_at);
+}
+
+void osd_info_t::encode(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u8 struct_v = 1;
+ encode(struct_v, bl);
+ encode(last_clean_begin, bl);
+ encode(last_clean_end, bl);
+ encode(up_from, bl);
+ encode(up_thru, bl);
+ encode(down_at, bl);
+ encode(lost_at, bl);
+}
+
+void osd_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ __u8 struct_v;
+ decode(struct_v, bl);
+ decode(last_clean_begin, bl);
+ decode(last_clean_end, bl);
+ decode(up_from, bl);
+ decode(up_thru, bl);
+ decode(down_at, bl);
+ decode(lost_at, bl);
+}
+
+void osd_info_t::generate_test_instances(list<osd_info_t*>& o)
+{
+ o.push_back(new osd_info_t);
+ o.push_back(new osd_info_t);
+ o.back()->last_clean_begin = 1;
+ o.back()->last_clean_end = 2;
+ o.back()->up_from = 30;
+ o.back()->up_thru = 40;
+ o.back()->down_at = 5;
+ o.back()->lost_at = 6;
+}
+
+ostream& operator<<(ostream& out, const osd_info_t& info)
+{
+ out << "up_from " << info.up_from
+ << " up_thru " << info.up_thru
+ << " down_at " << info.down_at
+ << " last_clean_interval [" << info.last_clean_begin << "," << info.last_clean_end << ")";
+ if (info.lost_at)
+ out << " lost_at " << info.lost_at;
+ return out;
+}
+
+// ----------------------------------
+// osd_xinfo_t
+
+void osd_xinfo_t::dump(Formatter *f) const
+{
+ f->dump_stream("down_stamp") << down_stamp;
+ f->dump_float("laggy_probability", laggy_probability);
+ f->dump_int("laggy_interval", laggy_interval);
+ f->dump_int("features", features);
+ f->dump_unsigned("old_weight", old_weight);
+ f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
+ f->dump_int("dead_epoch", dead_epoch);
+}
+
+void osd_xinfo_t::encode(ceph::buffer::list& bl, uint64_t enc_features) const
+{
+ uint8_t v = 4;
+ if (!HAVE_FEATURE(enc_features, SERVER_OCTOPUS)) {
+ v = 3;
+ }
+ ENCODE_START(v, 1, bl);
+ encode(down_stamp, bl);
+ __u32 lp = laggy_probability * float(0xfffffffful);
+ encode(lp, bl);
+ encode(laggy_interval, bl);
+ encode(features, bl);
+ encode(old_weight, bl);
+ if (v >= 4) {
+ encode(last_purged_snaps_scrub, bl);
+ encode(dead_epoch, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void osd_xinfo_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(4, bl);
+ decode(down_stamp, bl);
+ __u32 lp;
+ decode(lp, bl);
+ laggy_probability = (float)lp / (float)0xffffffff;
+ decode(laggy_interval, bl);
+ if (struct_v >= 2)
+ decode(features, bl);
+ else
+ features = 0;
+ if (struct_v >= 3)
+ decode(old_weight, bl);
+ else
+ old_weight = 0;
+ if (struct_v >= 4) {
+ decode(last_purged_snaps_scrub, bl);
+ decode(dead_epoch, bl);
+ } else {
+ dead_epoch = 0;
+ }
+ DECODE_FINISH(bl);
+}
+
+void osd_xinfo_t::generate_test_instances(list<osd_xinfo_t*>& o)
+{
+ o.push_back(new osd_xinfo_t);
+ o.push_back(new osd_xinfo_t);
+ o.back()->down_stamp = utime_t(2, 3);
+ o.back()->laggy_probability = .123;
+ o.back()->laggy_interval = 123456;
+ o.back()->old_weight = 0x7fff;
+}
+
+ostream& operator<<(ostream& out, const osd_xinfo_t& xi)
+{
+ return out << "down_stamp " << xi.down_stamp
+ << " laggy_probability " << xi.laggy_probability
+ << " laggy_interval " << xi.laggy_interval
+ << " old_weight " << xi.old_weight
+ << " last_purged_snaps_scrub " << xi.last_purged_snaps_scrub
+ << " dead_epoch " << xi.dead_epoch;
+}
+
+// ----------------------------------
+// OSDMap::Incremental
+
+int OSDMap::Incremental::get_net_marked_out(const OSDMap *previous) const
+{
+ int n = 0;
+ for (auto &weight : new_weight) {
+ if (weight.second == CEPH_OSD_OUT && !previous->is_out(weight.first))
+ n++; // marked out
+ else if (weight.second != CEPH_OSD_OUT && previous->is_out(weight.first))
+ n--; // marked in
+ }
+ return n;
+}
+
+int OSDMap::Incremental::get_net_marked_down(const OSDMap *previous) const
+{
+ int n = 0;
+  for (auto &state : new_state) {
+ if (state.second & CEPH_OSD_UP) {
+ if (previous->is_up(state.first))
+ n++; // marked down
+ else
+ n--; // marked up
+ }
+ }
+ return n;
+}
+
+int OSDMap::Incremental::identify_osd(uuid_d u) const
+{
+ for (auto &uuid : new_uuid)
+ if (uuid.second == u)
+ return uuid.first;
+ return -1;
+}
+
+int OSDMap::Incremental::propagate_base_properties_to_tiers(CephContext *cct,
+ const OSDMap& osdmap)
+{
+ ceph_assert(epoch == osdmap.get_epoch() + 1);
+
+ for (auto &new_pool : new_pools) {
+ if (!new_pool.second.tiers.empty()) {
+ pg_pool_t& base = new_pool.second;
+
+ auto new_rem_it = new_removed_snaps.find(new_pool.first);
+
+ for (const auto &tier_pool : base.tiers) {
+ const auto &r = new_pools.find(tier_pool);
+ pg_pool_t *tier = 0;
+ if (r == new_pools.end()) {
+ const pg_pool_t *orig = osdmap.get_pg_pool(tier_pool);
+ if (!orig) {
+ lderr(cct) << __func__ << " no pool " << tier_pool << dendl;
+ return -EIO;
+ }
+ tier = get_new_pool(tier_pool, orig);
+ } else {
+ tier = &r->second;
+ }
+ if (tier->tier_of != new_pool.first) {
+ lderr(cct) << __func__ << " " << r->first << " tier_of != " << new_pool.first << dendl;
+ return -EIO;
+ }
+
+ ldout(cct, 10) << __func__ << " from " << new_pool.first << " to "
+ << tier_pool << dendl;
+ tier->snap_seq = base.snap_seq;
+ tier->snap_epoch = base.snap_epoch;
+ tier->snaps = base.snaps;
+ tier->removed_snaps = base.removed_snaps;
+ tier->flags |= base.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS|
+ pg_pool_t::FLAG_POOL_SNAPS);
+
+ if (new_rem_it != new_removed_snaps.end()) {
+ new_removed_snaps[tier_pool] = new_rem_it->second;
+ }
+
+ tier->application_metadata = base.application_metadata;
+ }
+ }
+ }
+ return 0;
+}
+
+// ----------------------------------
+// OSDMap
+
+bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
+{
+ if (id >= 0)
+ return is_down(id);
+
+ if (down_cache &&
+ down_cache->count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_is_down(child, down_cache)) {
+ return false;
+ }
+ }
+ if (down_cache) {
+ down_cache->insert(id);
+ }
+ return true;
+}
+
+bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
+{
+ // use a stack-local down_cache if we didn't get one from the
+ // caller. then at least this particular call will avoid duplicated
+ // work.
+ set<int> local_down_cache;
+ if (!down_cache) {
+ down_cache = &local_down_cache;
+ }
+
+ int current = id;
+ while (true) {
+ int type;
+ if (current >= 0) {
+ type = 0;
+ } else {
+ type = crush->get_bucket_type(current);
+ }
+ ceph_assert(type >= 0);
+
+ if (!subtree_is_down(current, down_cache)) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = false" << dendl;
+ return false;
+ }
+
+ // is this a big enough subtree to be marked as down?
+ if (type >= subtree_type) {
+ ldout(cct, 30) << "containing_subtree_is_down(" << id << ") = true ... " << type << " >= " << subtree_type << dendl;
+ return true;
+ }
+
+ int r = crush->get_immediate_parent_id(current, &current);
+ if (r < 0) {
+ return false;
+ }
+ }
+}
+
+bool OSDMap::subtree_type_is_down(
+ CephContext *cct,
+ int id,
+ int subtree_type,
+ set<int> *down_in_osds,
+ set<int> *up_in_osds,
+ set<int> *subtree_up,
+ unordered_map<int, set<int> > *subtree_type_down) const
+{
+ if (id >= 0) {
+ bool is_down_ret = is_down(id);
+ if (!is_out(id)) {
+ if (is_down_ret) {
+ down_in_osds->insert(id);
+ } else {
+ up_in_osds->insert(id);
+ }
+ }
+ return is_down_ret;
+ }
+
+ if (subtree_type_down &&
+ (*subtree_type_down)[subtree_type].count(id)) {
+ return true;
+ }
+
+ list<int> children;
+ crush->get_children(id, &children);
+ for (const auto &child : children) {
+ if (!subtree_type_is_down(
+ cct, child, crush->get_bucket_type(child),
+ down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
+ subtree_up->insert(id);
+ return false;
+ }
+ }
+ if (subtree_type_down) {
+ (*subtree_type_down)[subtree_type].insert(id);
+ }
+ return true;
+}
+
+void OSDMap::Incremental::encode_client_old(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u16 v = 5;
+ encode(v, bl);
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ int32_t new_t = new_pool_max;
+ encode(new_t, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ // for encode(new_pools, bl);
+ __u32 n = new_pools.size();
+ encode(n, bl);
+ for (const auto &new_pool : new_pools) {
+ n = new_pool.first;
+ encode(n, bl);
+ encode(new_pool.second, bl, 0);
+ }
+ // for encode(new_pool_names, bl);
+ n = new_pool_names.size();
+ encode(n, bl);
+
+ for (const auto &new_pool_name : new_pool_names) {
+ n = new_pool_name.first;
+ encode(n, bl);
+ encode(new_pool_name.second, bl);
+ }
+ // for encode(old_pools, bl);
+ n = old_pools.size();
+ encode(n, bl);
+ for (auto &old_pool : old_pools) {
+ n = old_pool;
+ encode(n, bl);
+ }
+ encode(new_up_client, bl, 0);
+ {
+ // legacy is map<int32_t,uint8_t>
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+      // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ // for encode(new_pg_temp, bl);
+ n = new_pg_temp.size();
+ encode(n, bl);
+
+ for (const auto &pg_temp : new_pg_temp) {
+ old_pg_t opg = pg_temp.first.get_old_pg();
+ encode(opg, bl);
+ encode(pg_temp.second, bl);
+ }
+}
+
+void OSDMap::Incremental::encode_classic(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGID64) == 0) {
+ encode_client_old(bl);
+ return;
+ }
+
+ // base
+ __u16 v = 6;
+ encode(v, bl);
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ encode(new_pool_max, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ encode(new_pools, bl, features);
+ encode(new_pool_names, bl);
+ encode(old_pools, bl);
+ encode(new_up_client, bl, features);
+ {
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+      // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ encode(new_pg_temp, bl);
+
+ // extended
+ __u16 ev = 10;
+ encode(ev, bl);
+ encode(new_hb_back_up, bl, features);
+ encode(new_up_thru, bl);
+ encode(new_last_clean_interval, bl);
+ encode(new_lost, bl);
+ encode(new_blocklist, bl, features);
+ encode(old_blocklist, bl, features);
+ encode(new_up_cluster, bl, features);
+ encode(cluster_snapshot, bl);
+ encode(new_uuid, bl);
+ encode(new_xinfo, bl, features);
+ encode(new_hb_front_up, bl, features);
+}
+
+template<class T>
+static void encode_addrvec_map_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
+{
+ uint32_t n = m.size();
+ encode(n, bl);
+ for (auto& i : m) {
+ encode(i.first, bl);
+ encode(i.second.legacy_addr(), bl, f);
+ }
+}
+
+template<class T>
+static void encode_addrvec_pvec_as_addr(const T& m, ceph::buffer::list& bl, uint64_t f)
+{
+ uint32_t n = m.size();
+ encode(n, bl);
+ for (auto& i : m) {
+ if (i) {
+ encode(i->legacy_addr(), bl, f);
+ } else {
+ encode(entity_addr_t(), bl, f);
+ }
+ }
+}
+
+/* for a description of osdmap incremental versions, and when they were
+ * introduced, please refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::Incremental::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
+ encode_classic(bl, features);
+ return;
+ }
+
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ ceph_assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ size_t crc_offset;
+ std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
+
+ // meta-encoding: how we include client-used and osd-specific data
+ ENCODE_START(8, 7, bl);
+
+ {
+ uint8_t v = 8;
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 3;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 5;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 6;
+ }
+ ENCODE_START(v, 1, bl); // client-usable data
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(modified, bl);
+ encode(new_pool_max, bl);
+ encode(new_flags, bl);
+ encode(fullmap, bl);
+ encode(crush, bl);
+
+ encode(new_max_osd, bl);
+ encode(new_pools, bl, features);
+ encode(new_pool_names, bl);
+ encode(old_pools, bl);
+ if (v >= 7) {
+ encode(new_up_client, bl, features);
+ } else {
+ encode_addrvec_map_as_addr(new_up_client, bl, features);
+ }
+ if (v >= 5) {
+ encode(new_state, bl);
+ } else {
+ map<int32_t, uint8_t> os;
+ for (auto p : new_state) {
+        // new_state may only include some new flags (e.g., CEPH_OSD_NOOUT)
+ // that an old client could not understand.
+ // skip those!
+ uint8_t s = p.second;
+ if (p.second != 0 && s == 0)
+ continue;
+ os[p.first] = s;
+ }
+ uint32_t n = os.size();
+ encode(n, bl);
+ for (auto p : os) {
+ encode(p.first, bl);
+ encode(p.second, bl);
+ }
+ }
+ encode(new_weight, bl);
+ encode(new_pg_temp, bl);
+ encode(new_primary_temp, bl);
+ encode(new_primary_affinity, bl);
+ encode(new_erasure_code_profiles, bl);
+ encode(old_erasure_code_profiles, bl);
+ if (v >= 4) {
+ encode(new_pg_upmap, bl);
+ encode(old_pg_upmap, bl);
+ encode(new_pg_upmap_items, bl);
+ encode(old_pg_upmap_items, bl);
+ }
+ if (v >= 6) {
+ encode(new_removed_snaps, bl);
+ encode(new_purged_snaps, bl);
+ }
+ if (v >= 8) {
+ encode(new_last_up_change, bl);
+ encode(new_last_in_change, bl);
+ }
+ ENCODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ uint8_t target_v = 9; // if bumping this, be aware of range_blocklist 11
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ target_v = 2;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ target_v = 6;
+ }
+ if (change_stretch_mode) {
+ target_v = std::max((uint8_t)10, target_v);
+ }
+ if (!new_range_blocklist.empty() ||
+ !old_range_blocklist.empty()) {
+ target_v = std::max((uint8_t)11, target_v);
+ }
+ ENCODE_START(target_v, 1, bl); // extended, osd-only data
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_hb_back_up, bl, features);
+ } else {
+ encode(new_hb_back_up, bl, features);
+ }
+ encode(new_up_thru, bl);
+ encode(new_last_clean_interval, bl);
+ encode(new_lost, bl);
+ encode(new_blocklist, bl, features);
+ encode(old_blocklist, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_up_cluster, bl, features);
+ } else {
+ encode(new_up_cluster, bl, features);
+ }
+ encode(cluster_snapshot, bl);
+ encode(new_uuid, bl);
+ encode(new_xinfo, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_map_as_addr(new_hb_front_up, bl, features);
+ } else {
+ encode(new_hb_front_up, bl, features);
+ }
+ encode(features, bl); // NOTE: features arg, not the member
+ if (target_v >= 3) {
+ encode(new_nearfull_ratio, bl);
+ encode(new_full_ratio, bl);
+ encode(new_backfillfull_ratio, bl);
+ }
+ // 5 was string-based new_require_min_compat_client
+ if (target_v >= 6) {
+ encode(new_require_min_compat_client, bl);
+ encode(new_require_osd_release, bl);
+ }
+ if (target_v >= 8) {
+ encode(new_crush_node_flags, bl);
+ }
+ if (target_v >= 9) {
+ encode(new_device_class_flags, bl);
+ }
+ if (target_v >= 10) {
+ encode(change_stretch_mode, bl);
+ encode(new_stretch_bucket_count, bl);
+ encode(new_degraded_stretch_mode, bl);
+ encode(new_recovering_stretch_mode, bl);
+ encode(new_stretch_mode_bucket, bl);
+ encode(stretch_mode_enabled, bl);
+ }
+ if (target_v >= 11) {
+ encode(new_range_blocklist, bl, features);
+ encode(old_range_blocklist, bl, features);
+ }
+ ENCODE_FINISH(bl); // osd-only data
+ }
+
+ crc_offset = bl.length();
+ crc_filler = bl.append_hole(sizeof(uint32_t));
+ tail_offset = bl.length();
+
+ encode(full_crc, bl);
+
+ ENCODE_FINISH(bl); // meta-encoding wrapper
+
+ // fill in crc
+ ceph::buffer::list front;
+ front.substr_of(bl, start_offset, crc_offset - start_offset);
+ inc_crc = front.crc32c(-1);
+ ceph::buffer::list tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ inc_crc = tail.crc32c(inc_crc);
+ ceph_le32 crc_le;
+ crc_le = inc_crc;
+ crc_filler->copy_in(4u, (char*)&crc_le);
+ have_crc = true;
+}
+
+void OSDMap::Incremental::decode_classic(ceph::buffer::list::const_iterator &p)
+{
+ using ceph::decode;
+ __u32 n, t;
+ // base
+ __u16 v;
+ decode(v, p);
+ decode(fsid, p);
+ decode(epoch, p);
+ decode(modified, p);
+ if (v == 4 || v == 5) {
+ decode(n, p);
+ new_pool_max = n;
+ } else if (v >= 6)
+ decode(new_pool_max, p);
+ decode(new_flags, p);
+ decode(fullmap, p);
+ decode(crush, p);
+
+ decode(new_max_osd, p);
+ if (v < 6) {
+ new_pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(new_pools[t], p);
+ }
+ } else {
+ decode(new_pools, p);
+ }
+ if (v == 5) {
+ new_pool_names.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(new_pool_names[t], p);
+ }
+ } else if (v >= 6) {
+ decode(new_pool_names, p);
+ }
+ if (v < 6) {
+ old_pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ old_pools.insert(t);
+ }
+ } else {
+ decode(old_pools, p);
+ }
+ decode(new_up_client, p);
+ {
+ map<int32_t,uint8_t> ns;
+ decode(ns, p);
+ for (auto q : ns) {
+ new_state[q.first] = q.second;
+ }
+ }
+ decode(new_weight, p);
+
+ if (v < 6) {
+ new_pg_temp.clear();
+ decode(n, p);
+ while (n--) {
+ old_pg_t opg;
+ ceph::decode_raw(opg, p);
+ decode(new_pg_temp[pg_t(opg)], p);
+ }
+ } else {
+ decode(new_pg_temp, p);
+ }
+
+ // decode short map, too.
+ if (v == 5 && p.end())
+ return;
+
+ // extended
+ __u16 ev = 0;
+ if (v >= 5)
+ decode(ev, p);
+ decode(new_hb_back_up, p);
+ if (v < 5)
+ decode(new_pool_names, p);
+ decode(new_up_thru, p);
+ decode(new_last_clean_interval, p);
+ decode(new_lost, p);
+ decode(new_blocklist, p);
+ decode(old_blocklist, p);
+ if (ev >= 6)
+ decode(new_up_cluster, p);
+ if (ev >= 7)
+ decode(cluster_snapshot, p);
+ if (ev >= 8)
+ decode(new_uuid, p);
+ if (ev >= 9)
+ decode(new_xinfo, p);
+ if (ev >= 10)
+ decode(new_hb_front_up, p);
+}
+
+/* for a description of osdmap incremental versions, and when they were
+ * introduced, please refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::Incremental::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ /**
+ * Older encodings of the Incremental had a single struct_v which
+ * covered the whole encoding, and was prior to our modern
+ * stuff which includes a compatv and a size. So if we see
+ * a struct_v < 7, we must rewind to the beginning and use our
+ * classic decoder.
+ */
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ ceph::buffer::list crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
+ if (struct_v < 7) {
+ bl.seek(start_offset);
+ decode_classic(bl);
+ encode_features = 0;
+ if (struct_v >= 6)
+ encode_features = CEPH_FEATURE_PGID64;
+ else
+ encode_features = 0;
+ return;
+ }
+ {
+ DECODE_START(8, bl); // client-usable data
+ decode(fsid, bl);
+ decode(epoch, bl);
+ decode(modified, bl);
+ decode(new_pool_max, bl);
+ decode(new_flags, bl);
+ decode(fullmap, bl);
+ decode(crush, bl);
+
+ decode(new_max_osd, bl);
+ decode(new_pools, bl);
+ decode(new_pool_names, bl);
+ decode(old_pools, bl);
+ decode(new_up_client, bl);
+ if (struct_v >= 5) {
+ decode(new_state, bl);
+ } else {
+ map<int32_t,uint8_t> ns;
+ decode(ns, bl);
+ for (auto q : ns) {
+ new_state[q.first] = q.second;
+ }
+ }
+ decode(new_weight, bl);
+ decode(new_pg_temp, bl);
+ decode(new_primary_temp, bl);
+ if (struct_v >= 2)
+ decode(new_primary_affinity, bl);
+ else
+ new_primary_affinity.clear();
+ if (struct_v >= 3) {
+ decode(new_erasure_code_profiles, bl);
+ decode(old_erasure_code_profiles, bl);
+ } else {
+ new_erasure_code_profiles.clear();
+ old_erasure_code_profiles.clear();
+ }
+ if (struct_v >= 4) {
+ decode(new_pg_upmap, bl);
+ decode(old_pg_upmap, bl);
+ decode(new_pg_upmap_items, bl);
+ decode(old_pg_upmap_items, bl);
+ }
+ if (struct_v >= 6) {
+ decode(new_removed_snaps, bl);
+ decode(new_purged_snaps, bl);
+ }
+ if (struct_v >= 8) {
+ decode(new_last_up_change, bl);
+ decode(new_last_in_change, bl);
+ }
+ DECODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ DECODE_START(10, bl); // extended, osd-only data
+ decode(new_hb_back_up, bl);
+ decode(new_up_thru, bl);
+ decode(new_last_clean_interval, bl);
+ decode(new_lost, bl);
+ decode(new_blocklist, bl);
+ decode(old_blocklist, bl);
+ decode(new_up_cluster, bl);
+ decode(cluster_snapshot, bl);
+ decode(new_uuid, bl);
+ decode(new_xinfo, bl);
+ decode(new_hb_front_up, bl);
+ if (struct_v >= 2)
+ decode(encode_features, bl);
+ else
+ encode_features = CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDMAP_ENC;
+ if (struct_v >= 3) {
+ decode(new_nearfull_ratio, bl);
+ decode(new_full_ratio, bl);
+ } else {
+ new_nearfull_ratio = -1;
+ new_full_ratio = -1;
+ }
+ if (struct_v >= 4) {
+ decode(new_backfillfull_ratio, bl);
+ } else {
+ new_backfillfull_ratio = -1;
+ }
+ if (struct_v == 5) {
+ string r;
+ decode(r, bl);
+ if (r.length()) {
+ new_require_min_compat_client = ceph_release_from_name(r);
+ }
+ }
+ if (struct_v >= 6) {
+ decode(new_require_min_compat_client, bl);
+ decode(new_require_osd_release, bl);
+ } else {
+ if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
+ // only for compat with post-kraken pre-luminous test clusters
+ new_require_osd_release = ceph_release_t::luminous;
+ new_flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_KRAKEN)) {
+ new_require_osd_release = ceph_release_t::kraken;
+ } else if (new_flags >= 0 && (new_flags & CEPH_OSDMAP_REQUIRE_JEWEL)) {
+ new_require_osd_release = ceph_release_t::jewel;
+ } else {
+ new_require_osd_release = ceph_release_t::unknown;
+ }
+ }
+ if (struct_v >= 8) {
+ decode(new_crush_node_flags, bl);
+ }
+ if (struct_v >= 9) {
+ decode(new_device_class_flags, bl);
+ }
+ if (struct_v >= 10) {
+ decode(change_stretch_mode, bl);
+ decode(new_stretch_bucket_count, bl);
+ decode(new_degraded_stretch_mode, bl);
+ decode(new_recovering_stretch_mode, bl);
+ decode(new_stretch_mode_bucket, bl);
+ decode(stretch_mode_enabled, bl);
+ }
+ if (struct_v >= 11) {
+ decode(new_range_blocklist, bl);
+ decode(old_range_blocklist, bl);
+ }
+ DECODE_FINISH(bl); // osd-only data
+ }
+
+ if (struct_v >= 8) {
+ have_crc = true;
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ decode(inc_crc, bl);
+ tail_offset = bl.get_off();
+ decode(full_crc, bl);
+ } else {
+ have_crc = false;
+ full_crc = 0;
+ inc_crc = 0;
+ }
+
+ DECODE_FINISH(bl); // wrapper
+
+ if (have_crc) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (inc_crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << inc_crc;
+ string s = ss.str();
+ throw ceph::buffer::malformed_input(s.c_str());
+ }
+ }
+}
+
+void OSDMap::Incremental::dump(Formatter *f) const
+{
+ f->dump_int("epoch", epoch);
+ f->dump_stream("fsid") << fsid;
+ f->dump_stream("modified") << modified;
+ f->dump_stream("new_last_up_change") << new_last_up_change;
+ f->dump_stream("new_last_in_change") << new_last_in_change;
+ f->dump_int("new_pool_max", new_pool_max);
+ f->dump_int("new_flags", new_flags);
+ f->dump_float("new_full_ratio", new_full_ratio);
+ f->dump_float("new_nearfull_ratio", new_nearfull_ratio);
+ f->dump_float("new_backfillfull_ratio", new_backfillfull_ratio);
+ f->dump_int("new_require_min_compat_client", to_integer<int>(new_require_min_compat_client));
+ f->dump_int("new_require_osd_release", to_integer<int>(new_require_osd_release));
+
+ if (fullmap.length()) {
+ f->open_object_section("full_map");
+ OSDMap full;
+ ceph::buffer::list fbl = fullmap; // kludge around constness.
+ auto p = fbl.cbegin();
+ full.decode(p);
+ full.dump(f);
+ f->close_section();
+ }
+ if (crush.length()) {
+ f->open_object_section("crush");
+ CrushWrapper c;
+ ceph::buffer::list tbl = crush; // kludge around constness.
+ auto p = tbl.cbegin();
+ c.decode(p);
+ c.dump(f);
+ f->close_section();
+ }
+
+ f->dump_int("new_max_osd", new_max_osd);
+
+ f->open_array_section("new_pools");
+
+ for (const auto &new_pool : new_pools) {
+ f->open_object_section("pool");
+ f->dump_int("pool", new_pool.first);
+ new_pool.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_pool_names");
+
+ for (const auto &new_pool_name : new_pool_names) {
+ f->open_object_section("pool_name");
+ f->dump_int("pool", new_pool_name.first);
+ f->dump_string("name", new_pool_name.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pools");
+
+ for (const auto &old_pool : old_pools)
+ f->dump_int("pool", old_pool);
+ f->close_section();
+
+ f->open_array_section("new_up_osds");
+
+ for (const auto &upclient : new_up_client) {
+ f->open_object_section("osd");
+ f->dump_int("osd", upclient.first);
+ f->dump_stream("public_addr") << upclient.second.legacy_addr();
+ f->dump_object("public_addrs", upclient.second);
+ if (auto p = new_up_cluster.find(upclient.first);
+ p != new_up_cluster.end()) {
+ f->dump_stream("cluster_addr") << p->second.legacy_addr();
+ f->dump_object("cluster_addrs", p->second);
+ }
+ if (auto p = new_hb_back_up.find(upclient.first);
+ p != new_hb_back_up.end()) {
+ f->dump_object("heartbeat_back_addrs", p->second);
+ }
+ if (auto p = new_hb_front_up.find(upclient.first);
+ p != new_hb_front_up.end()) {
+ f->dump_object("heartbeat_front_addrs", p->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_weight");
+
+ for (const auto &weight : new_weight) {
+ f->open_object_section("osd");
+ f->dump_int("osd", weight.first);
+ f->dump_int("weight", weight.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("osd_state_xor");
+ for (const auto &ns : new_state) {
+ f->open_object_section("osd");
+ f->dump_int("osd", ns.first);
+ set<string> st;
+ calc_state_set(new_state.find(ns.first)->second, st);
+ f->open_array_section("state_xor");
+ for (auto &state : st)
+ f->dump_string("state", state);
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_pg_temp");
+
+ for (const auto &pg_temp : new_pg_temp) {
+ f->open_object_section("pg");
+ f->dump_stream("pgid") << pg_temp.first;
+ f->open_array_section("osds");
+
+ for (const auto &osd : pg_temp.second)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("primary_temp");
+
+ for (const auto &primary_temp : new_primary_temp) {
+ f->dump_stream("pgid") << primary_temp.first;
+ f->dump_int("osd", primary_temp.second);
+ }
+ f->close_section(); // primary_temp
+
+ f->open_array_section("new_pg_upmap");
+ for (auto& i : new_pg_upmap) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << i.first;
+ f->open_array_section("osds");
+ for (auto osd : i.second) {
+ f->dump_int("osd", osd);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pg_upmap");
+ for (auto& i : old_pg_upmap) {
+ f->dump_stream("pgid") << i;
+ }
+ f->close_section();
+
+ f->open_array_section("new_pg_upmap_items");
+ for (auto& i : new_pg_upmap_items) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << i.first;
+ f->open_array_section("mappings");
+ for (auto& p : i.second) {
+ f->open_object_section("mapping");
+ f->dump_int("from", p.first);
+ f->dump_int("to", p.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("old_pg_upmap_items");
+ for (auto& i : old_pg_upmap_items) {
+ f->dump_stream("pgid") << i;
+ }
+ f->close_section();
+
+ f->open_array_section("new_up_thru");
+
+ for (const auto &up_thru : new_up_thru) {
+ f->open_object_section("osd");
+ f->dump_int("osd", up_thru.first);
+ f->dump_int("up_thru", up_thru.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_lost");
+
+ for (const auto &lost : new_lost) {
+ f->open_object_section("osd");
+ f->dump_int("osd", lost.first);
+ f->dump_int("epoch_lost", lost.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_last_clean_interval");
+
+ for (const auto &last_clean_interval : new_last_clean_interval) {
+ f->open_object_section("osd");
+ f->dump_int("osd", last_clean_interval.first);
+ f->dump_int("first", last_clean_interval.second.first);
+ f->dump_int("last", last_clean_interval.second.second);
+ f->close_section();
+ }
+ f->close_section();
+
+ f->open_array_section("new_blocklist");
+ for (const auto &blist : new_blocklist) {
+ stringstream ss;
+ ss << blist.first;
+ f->dump_stream(ss.str().c_str()) << blist.second;
+ }
+ f->close_section();
+ f->open_array_section("old_blocklist");
+ for (const auto &blist : old_blocklist)
+ f->dump_stream("addr") << blist;
+ f->close_section();
+ f->open_array_section("new_range_blocklist");
+ for (const auto &blist : new_range_blocklist) {
+ stringstream ss;
+ ss << blist.first;
+ f->dump_stream(ss.str().c_str()) << blist.second;
+ }
+ f->close_section();
+ f->open_array_section("old_range_blocklist");
+ for (const auto &blist : old_range_blocklist)
+ f->dump_stream("addr") << blist;
+ f->close_section();
+
+ f->open_array_section("new_xinfo");
+ for (const auto &xinfo : new_xinfo) {
+ f->open_object_section("xinfo");
+ f->dump_int("osd", xinfo.first);
+ xinfo.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ if (cluster_snapshot.size())
+ f->dump_string("cluster_snapshot", cluster_snapshot);
+
+ f->open_array_section("new_uuid");
+ for (const auto &uuid : new_uuid) {
+ f->open_object_section("osd");
+ f->dump_int("osd", uuid.first);
+ f->dump_stream("uuid") << uuid.second;
+ f->close_section();
+ }
+ f->close_section();
+
+ OSDMap::dump_erasure_code_profiles(new_erasure_code_profiles, f);
+ f->open_array_section("old_erasure_code_profiles");
+ for (const auto &erasure_code_profile : old_erasure_code_profiles) {
+ f->dump_string("old", erasure_code_profile);
+ }
+ f->close_section();
+
+ f->open_array_section("new_removed_snaps");
+ for (auto& p : new_removed_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_purged_snaps");
+ for (auto& p : new_purged_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+  f->close_section();
+  f->open_array_section("new_crush_node_flags");
+ for (auto& i : new_crush_node_flags) {
+ f->open_object_section("node");
+ f->dump_int("id", i.first);
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_device_class_flags");
+ for (auto& i : new_device_class_flags) {
+ f->open_object_section("device_class");
+ f->dump_int("id", i.first);
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("stretch_mode");
+ {
+ f->dump_bool("change_stretch_mode", change_stretch_mode);
+ f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
+ f->dump_unsigned("new_stretch_bucket_count", new_stretch_bucket_count);
+ f->dump_unsigned("new_degraded_stretch_mode", new_degraded_stretch_mode);
+ f->dump_unsigned("new_recovering_stretch_mode", new_recovering_stretch_mode);
+ f->dump_int("new_stretch_mode_bucket", new_stretch_mode_bucket);
+ }
+  f->close_section();
+}
+
+void OSDMap::Incremental::generate_test_instances(list<Incremental*>& o)
+{
+ o.push_back(new Incremental);
+}
+
+// ----------------------------------
+// OSDMap
+
+void OSDMap::set_epoch(epoch_t e)
+{
+ epoch = e;
+ for (auto &pool : pools)
+ pool.second.last_change = e;
+}
+
+OSDMap::range_bits::range_bits() : ipv6(false) {
+ memset(&bits, 0, sizeof(bits));
+}
+
+OSDMap::range_bits::range_bits(const entity_addr_t& addr) : ipv6(false) {
+ memset(&bits, 0, sizeof(bits));
+ parse(addr);
+}
+
+void OSDMap::range_bits::get_ipv6_bytes(unsigned const char *addr,
+ uint64_t *upper, uint64_t *lower)
+{
+ *upper = ((uint64_t)(ntohl(*(uint32_t*)(addr)))) << 32 |
+ ((uint64_t)(ntohl(*(uint32_t*)(&addr[4]))));
+ *lower = ((uint64_t)(ntohl(*(uint32_t*)(&addr[8])))) << 32 |
+ ((uint64_t)(ntohl(*(uint32_t*)(&addr[12]))));
+}
+
+void OSDMap::range_bits::parse(const entity_addr_t& addr) {
+ // parse it into meaningful data
+ if (addr.is_ipv6()) {
+ get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr,
+ &bits.ipv6.upper_64_bits, &bits.ipv6.lower_64_bits);
+ int32_t lower_shift = std::min(128-
+ static_cast<int32_t>(addr.get_nonce()), 64);
+ int32_t upper_shift = std::max(64- //(128-b.first.get_nonce())-64
+ static_cast<int32_t>(addr.get_nonce()), 0);
+
+ auto get_mask = [](int32_t shift) -> uint64_t {
+ if (shift >= 0 && shift < 64) {
+ return UINT64_MAX << shift;
+ }
+ return 0;
+ };
+
+ bits.ipv6.lower_mask = get_mask(lower_shift);
+ bits.ipv6.upper_mask = get_mask(upper_shift);
+ ipv6 = true;
+ } else if (addr.is_ipv4()) {
+ bits.ipv4.ip_32_bits = ntohl(addr.in4_addr().sin_addr.s_addr);
+ if (addr.get_nonce() > 0) {
+ bits.ipv4.mask = UINT32_MAX << (32-addr.get_nonce());
+ } else {
+ bits.ipv4.mask = 0;
+ }
+ } else {
+ // uh...
+ }
+}
+
+bool OSDMap::range_bits::matches(const entity_addr_t& addr) const {
+ if (addr.is_ipv4() && !ipv6) {
+ return ((ntohl(addr.in4_addr().sin_addr.s_addr) & bits.ipv4.mask) ==
+ (bits.ipv4.ip_32_bits & bits.ipv4.mask));
+ } else if (addr.is_ipv6() && ipv6) {
+ uint64_t upper_64, lower_64;
+ get_ipv6_bytes(addr.in6_addr().sin6_addr.s6_addr, &upper_64, &lower_64);
+ return (((upper_64 & bits.ipv6.upper_mask) ==
+ (bits.ipv6.upper_64_bits & bits.ipv6.upper_mask)) &&
+ ((lower_64 & bits.ipv6.lower_mask) ==
+ (bits.ipv6.lower_64_bits & bits.ipv6.lower_mask)));
+ }
+ return false;
+}
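+
+// Worked example (editor's note, addresses made up): for a range entry
+// 10.1.2.0 whose nonce holds the prefix length 24, parse() computes
+// mask = 0xffffff00, so matches() accepts 10.1.2.77 because
+// (10.1.2.77 & mask) == (10.1.2.0 & mask), and rejects 10.1.3.1.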
+
+bool OSDMap::is_blocklisted(const entity_addr_t& orig, CephContext *cct) const
+{
+ if (cct) ldout(cct, 25) << "is_blocklisted: " << orig << dendl;
+ if (blocklist.empty() && range_blocklist.empty()) {
+ if (cct) ldout(cct, 30) << "not blocklisted: " << orig << dendl;
+ return false;
+ }
+
+ // all blocklist entries are type ANY for nautilus+
+ // FIXME: avoid this copy!
+ entity_addr_t a = orig;
+ if (require_osd_release < ceph_release_t::nautilus) {
+ a.set_type(entity_addr_t::TYPE_LEGACY);
+ } else {
+ a.set_type(entity_addr_t::TYPE_ANY);
+ }
+
+ // this specific instance?
+ if (blocklist.count(a)) {
+ if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
+ return true;
+ }
+
+ // is entire ip blocklisted?
+ if (a.is_ip()) {
+ a.set_port(0);
+ a.set_nonce(0);
+ if (blocklist.count(a)) {
+ if (cct) ldout(cct, 20) << "blocklist contains " << a << dendl;
+ return true;
+ }
+ }
+
+ // is it in a blocklisted range?
+ for (const auto& i : calculated_ranges) {
+ bool blocked = i.second.matches(a);
+ if (blocked) {
+ if (cct) ldout(cct, 20) << "range_blocklist contains " << a << dendl;
+ return true;
+ }
+ }
+
+ if (cct) ldout(cct, 25) << "not blocklisted: " << orig << dendl;
+ return false;
+}
+
+bool OSDMap::is_blocklisted(const entity_addrvec_t& av, CephContext *cct) const
+{
+ if (blocklist.empty() && range_blocklist.empty())
+ return false;
+
+ for (auto& a : av.v) {
+ if (is_blocklisted(a, cct)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void OSDMap::get_blocklist(list<pair<entity_addr_t,utime_t> > *bl,
+ std::list<std::pair<entity_addr_t,utime_t> > *rl) const
+{
+ std::copy(blocklist.begin(), blocklist.end(), std::back_inserter(*bl));
+ std::copy(range_blocklist.begin(), range_blocklist.end(),
+ std::back_inserter(*rl));
+}
+
+void OSDMap::get_blocklist(std::set<entity_addr_t> *bl,
+ std::set<entity_addr_t> *rl) const
+{
+ for (const auto &i : blocklist) {
+ bl->insert(i.first);
+ }
+ for (const auto &i : range_blocklist) {
+ rl->insert(i.first);
+ }
+}
+
+void OSDMap::set_max_osd(int m)
+{
+ max_osd = m;
+ osd_state.resize(max_osd, 0);
+ osd_weight.resize(max_osd, CEPH_OSD_OUT);
+ osd_info.resize(max_osd);
+ osd_xinfo.resize(max_osd);
+ osd_addrs->client_addrs.resize(max_osd);
+ osd_addrs->cluster_addrs.resize(max_osd);
+ osd_addrs->hb_back_addrs.resize(max_osd);
+ osd_addrs->hb_front_addrs.resize(max_osd);
+ osd_uuid->resize(max_osd);
+ if (osd_primary_affinity)
+ osd_primary_affinity->resize(max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+
+ calc_num_osds();
+}
+
+int OSDMap::calc_num_osds()
+{
+ num_osd = 0;
+ num_up_osd = 0;
+ num_in_osd = 0;
+ for (int i=0; i<max_osd; i++) {
+ if (osd_state[i] & CEPH_OSD_EXISTS) {
+ ++num_osd;
+ if (osd_state[i] & CEPH_OSD_UP) {
+ ++num_up_osd;
+ }
+ if (get_weight(i) != CEPH_OSD_OUT) {
+ ++num_in_osd;
+ }
+ }
+ }
+ return num_osd;
+}
+
+void OSDMap::get_full_pools(CephContext *cct,
+ set<int64_t> *full,
+ set<int64_t> *backfillfull,
+ set<int64_t> *nearfull) const
+{
+ ceph_assert(full);
+ ceph_assert(backfillfull);
+ ceph_assert(nearfull);
+ full->clear();
+ backfillfull->clear();
+ nearfull->clear();
+
+ vector<int> full_osds;
+ vector<int> backfillfull_osds;
+ vector<int> nearfull_osds;
+ for (int i = 0; i < max_osd; ++i) {
+ if (exists(i) && is_up(i) && is_in(i)) {
+ if (osd_state[i] & CEPH_OSD_FULL)
+ full_osds.push_back(i);
+ else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
+ backfillfull_osds.push_back(i);
+ else if (osd_state[i] & CEPH_OSD_NEARFULL)
+ nearfull_osds.push_back(i);
+ }
+ }
+
+ for (auto i: full_osds) {
+ get_pool_ids_by_osd(cct, i, full);
+ }
+ for (auto i: backfillfull_osds) {
+ get_pool_ids_by_osd(cct, i, backfillfull);
+ }
+ for (auto i: nearfull_osds) {
+ get_pool_ids_by_osd(cct, i, nearfull);
+ }
+}
+
+void OSDMap::get_full_osd_counts(set<int> *full, set<int> *backfill,
+ set<int> *nearfull) const
+{
+ full->clear();
+ backfill->clear();
+ nearfull->clear();
+ for (int i = 0; i < max_osd; ++i) {
+ if (exists(i) && is_up(i) && is_in(i)) {
+ if (osd_state[i] & CEPH_OSD_FULL)
+ full->emplace(i);
+ else if (osd_state[i] & CEPH_OSD_BACKFILLFULL)
+ backfill->emplace(i);
+ else if (osd_state[i] & CEPH_OSD_NEARFULL)
+ nearfull->emplace(i);
+ }
+ }
+}
+
+void OSDMap::get_all_osds(set<int32_t>& ls) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i))
+ ls.insert(i);
+}
+
+void OSDMap::get_up_osds(set<int32_t>& ls) const
+{
+ for (int i = 0; i < max_osd; i++) {
+ if (is_up(i))
+ ls.insert(i);
+ }
+}
+
+void OSDMap::get_out_existing_osds(set<int32_t>& ls) const
+{
+ for (int i = 0; i < max_osd; i++) {
+ if (exists(i) && get_weight(i) == CEPH_OSD_OUT)
+ ls.insert(i);
+ }
+}
+
+void OSDMap::get_flag_set(set<string> *flagset) const
+{
+ for (unsigned i = 0; i < sizeof(flags) * 8; ++i) {
+ if (flags & (1<<i)) {
+ flagset->insert(get_flag_string(flags & (1<<i)));
+ }
+ }
+}
+
+void OSDMap::calc_state_set(int state, set<string>& st)
+{
+ unsigned t = state;
+ for (unsigned s = 1; t; s <<= 1) {
+ if (t & s) {
+ t &= ~s;
+ st.insert(ceph_osd_state_name(s));
+ }
+ }
+}
+
+void OSDMap::adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const
+{
+ float max = 0;
+ for (const auto &weight : weights) {
+ if (weight.second > max)
+ max = weight.second;
+ }
+
+ for (const auto &weight : weights) {
+ inc.new_weight[weight.first] = (unsigned)((weight.second / max) * CEPH_OSD_IN);
+ }
+}
+
+int OSDMap::identify_osd(const entity_addr_t& addr) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).contains(addr) ||
+ get_cluster_addrs(i).contains(addr)))
+ return i;
+ return -1;
+}
+
+int OSDMap::identify_osd(const uuid_d& u) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && get_uuid(i) == u)
+ return i;
+ return -1;
+}
+
+int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).contains(addr) ||
+ get_cluster_addrs(i).contains(addr) ||
+ get_hb_back_addrs(i).contains(addr) ||
+ get_hb_front_addrs(i).contains(addr)))
+ return i;
+ return -1;
+}
+
+int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
+{
+ for (int i=0; i<max_osd; i++)
+ if (exists(i) && (get_addrs(i).is_same_host(ip) ||
+ get_cluster_addrs(i).is_same_host(ip)))
+ return i;
+ return -1;
+}
+
+
+uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
+{
+ uint64_t features = 0; // things we actually have
+ uint64_t mask = 0; // things we could have
+
+ if (crush->has_nondefault_tunables())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES;
+ if (crush->has_nondefault_tunables2())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES2;
+ if (crush->has_nondefault_tunables3())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ if (crush->has_v4_buckets())
+ features |= CEPH_FEATURE_CRUSH_V4;
+ if (crush->has_nondefault_tunables5())
+ features |= CEPH_FEATURE_CRUSH_TUNABLES5;
+ if (crush->has_incompat_choose_args()) {
+ features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS;
+ }
+ mask |= CEPH_FEATURES_CRUSH;
+
+ if (!pg_upmap.empty() || !pg_upmap_items.empty())
+ features |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+ mask |= CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
+
+ for (auto &pool: pools) {
+ if (pool.second.has_flag(pg_pool_t::FLAG_HASHPSPOOL)) {
+ features |= CEPH_FEATURE_OSDHASHPSPOOL;
+ }
+ if (!pool.second.tiers.empty() ||
+ pool.second.is_tier()) {
+ features |= CEPH_FEATURE_OSD_CACHEPOOL;
+ }
+ int ruleid = crush->find_rule(pool.second.get_crush_rule(),
+ pool.second.get_type(),
+ pool.second.get_size());
+ if (ruleid >= 0) {
+ if (crush->is_v2_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_V2;
+ if (crush->is_v3_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+ if (crush->is_v5_rule(ruleid))
+ features |= CEPH_FEATURE_CRUSH_TUNABLES5;
+ }
+ }
+ mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
+
+ if (osd_primary_affinity) {
+ for (int i = 0; i < max_osd; ++i) {
+ if ((*osd_primary_affinity)[i] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ features |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
+ break;
+ }
+ }
+ }
+ mask |= CEPH_FEATURE_OSD_PRIMARY_AFFINITY;
+
+ if (entity_type == CEPH_ENTITY_TYPE_OSD) {
+ const uint64_t jewel_features = CEPH_FEATURE_SERVER_JEWEL;
+ if (require_osd_release >= ceph_release_t::jewel) {
+ features |= jewel_features;
+ }
+ mask |= jewel_features;
+
+ const uint64_t kraken_features = CEPH_FEATUREMASK_SERVER_KRAKEN
+ | CEPH_FEATURE_MSG_ADDR2;
+ if (require_osd_release >= ceph_release_t::kraken) {
+ features |= kraken_features;
+ }
+ mask |= kraken_features;
+
+ if (stretch_mode_enabled) {
+ features |= CEPH_FEATUREMASK_STRETCH_MODE;
+ mask |= CEPH_FEATUREMASK_STRETCH_MODE;
+ }
+ }
+
+ if (require_min_compat_client >= ceph_release_t::nautilus) {
+ // if min_compat_client is >= nautilus, require v2 cephx signatures
+ // from everyone
+ features |= CEPH_FEATUREMASK_CEPHX_V2;
+ } else if (require_osd_release >= ceph_release_t::nautilus &&
+ entity_type == CEPH_ENTITY_TYPE_OSD) {
+ // if osds are >= nautilus, at least require the signatures from them
+ features |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
+ mask |= CEPH_FEATUREMASK_CEPHX_V2;
+
+ if (pmask)
+ *pmask = mask;
+ return features;
+}
+
+ceph_release_t OSDMap::get_min_compat_client() const
+{
+ uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr);
+
+ if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43
+ HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28
+ return ceph_release_t::luminous; // v12.2.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES5)) { // v10.0.0-612-g043a737
+ return ceph_release_t::jewel; // v10.2.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_V4)) { // v0.91-678-g325fc56
+ return ceph_release_t::hammer; // v0.94.0
+ }
+ if (HAVE_FEATURE(f, OSD_PRIMARY_AFFINITY) || // v0.76-553-gf825624
+ HAVE_FEATURE(f, CRUSH_TUNABLES3) || // v0.76-395-ge20a55d
+ HAVE_FEATURE(f, OSD_CACHEPOOL)) { // v0.67-401-gb91c1c5
+ return ceph_release_t::firefly; // v0.80.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES2) || // v0.54-684-g0cc47ff
+ HAVE_FEATURE(f, OSDHASHPSPOOL)) { // v0.57-398-g8cc2b0f
+ return ceph_release_t::dumpling; // v0.67.0
+ }
+ if (HAVE_FEATURE(f, CRUSH_TUNABLES)) { // v0.48argonaut-206-g6f381af
+ return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
+ }
+ return ceph_release_t::argonaut; // v0.48argonaut-206-g6f381af
+}
+
+ceph_release_t OSDMap::get_require_min_compat_client() const
+{
+ return require_min_compat_client;
+}
+
+void OSDMap::_calc_up_osd_features()
+{
+ bool first = true;
+ cached_up_osd_features = 0;
+ for (int osd = 0; osd < max_osd; ++osd) {
+ if (!is_up(osd))
+ continue;
+ const osd_xinfo_t &xi = get_xinfo(osd);
+ if (xi.features == 0)
+ continue; // bogus xinfo, maybe #20751 or similar, skipping
+ if (first) {
+ cached_up_osd_features = xi.features;
+ first = false;
+ } else {
+ cached_up_osd_features &= xi.features;
+ }
+ }
+}
+
+uint64_t OSDMap::get_up_osd_features() const
+{
+ return cached_up_osd_features;
+}
+
+void OSDMap::dedup(const OSDMap *o, OSDMap *n)
+{
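+ // where the new map's data is identical to the old map's, point the new
+ // map at the old (shared) structures so consecutive maps share memory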
+ using ceph::encode;
+ if (o->epoch == n->epoch)
+ return;
+
+ int diff = 0;
+
+ // do addrs match?
+ if (o->max_osd != n->max_osd)
+ diff++;
+ for (int i = 0; i < o->max_osd && i < n->max_osd; i++) {
+ if ( n->osd_addrs->client_addrs[i] && o->osd_addrs->client_addrs[i] &&
+ *n->osd_addrs->client_addrs[i] == *o->osd_addrs->client_addrs[i])
+ n->osd_addrs->client_addrs[i] = o->osd_addrs->client_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->cluster_addrs[i] && o->osd_addrs->cluster_addrs[i] &&
+ *n->osd_addrs->cluster_addrs[i] == *o->osd_addrs->cluster_addrs[i])
+ n->osd_addrs->cluster_addrs[i] = o->osd_addrs->cluster_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_back_addrs[i] && o->osd_addrs->hb_back_addrs[i] &&
+ *n->osd_addrs->hb_back_addrs[i] == *o->osd_addrs->hb_back_addrs[i])
+ n->osd_addrs->hb_back_addrs[i] = o->osd_addrs->hb_back_addrs[i];
+ else
+ diff++;
+ if ( n->osd_addrs->hb_front_addrs[i] && o->osd_addrs->hb_front_addrs[i] &&
+ *n->osd_addrs->hb_front_addrs[i] == *o->osd_addrs->hb_front_addrs[i])
+ n->osd_addrs->hb_front_addrs[i] = o->osd_addrs->hb_front_addrs[i];
+ else
+ diff++;
+ }
+ if (diff == 0) {
+ // zoinks, no differences at all!
+ n->osd_addrs = o->osd_addrs;
+ }
+
+ // does crush match?
+ ceph::buffer::list oc, nc;
+ encode(*o->crush, oc, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ encode(*n->crush, nc, CEPH_FEATURES_SUPPORTED_DEFAULT);
+ if (oc.contents_equal(nc)) {
+ n->crush = o->crush;
+ }
+
+ // does pg_temp match?
+ if (*o->pg_temp == *n->pg_temp)
+ n->pg_temp = o->pg_temp;
+
+ // does primary_temp match?
+ if (o->primary_temp->size() == n->primary_temp->size()) {
+ if (*o->primary_temp == *n->primary_temp)
+ n->primary_temp = o->primary_temp;
+ }
+
+ // do uuids match?
+ if (o->osd_uuid->size() == n->osd_uuid->size() &&
+ *o->osd_uuid == *n->osd_uuid)
+ n->osd_uuid = o->osd_uuid;
+}
+
+void OSDMap::clean_temps(CephContext *cct,
+ const OSDMap& oldmap,
+ const OSDMap& nextmap,
+ Incremental *pending_inc)
+{
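+ // drop pg_temp/primary_temp entries that no longer apply: pool or pg gone,
+ // all osds down, or the mapping is redundant/oversized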
+ ldout(cct, 10) << __func__ << dendl;
+
+ for (auto pg : *nextmap.pg_temp) {
+ // if pool does not exist, remove any existing pg_temps associated with
+ // it. we don't care about pg_temps on the pending_inc either; if there
+ // are new_pg_temp entries on the pending, clear those out as well.
+ if (!nextmap.have_pg_pool(pg.first.pool())) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " for nonexistent pool " << pg.first.pool() << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ if (!nextmap.pg_exists(pg.first)) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " for nonexistent pg " << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ // all osds down?
+ unsigned num_up = 0;
+ for (auto o : pg.second) {
+ if (!nextmap.is_down(o)) {
+ ++num_up;
+ break;
+ }
+ }
+ if (num_up == 0) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first
+ << " with all down osds" << pg.second << dendl;
+ pending_inc->new_pg_temp[pg.first].clear();
+ continue;
+ }
+ // redundant pg_temp?
+ vector<int> raw_up;
+ int primary;
+ nextmap.pg_to_raw_up(pg.first, &raw_up, &primary);
+ bool remove = false;
+ if (raw_up == pg.second) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
+ << pg.second << " that matches raw_up mapping" << dendl;
+ remove = true;
+ }
+ // oversized pg_temp?
+ if (pg.second.size() > nextmap.get_pg_pool(pg.first.pool())->get_size()) {
+ ldout(cct, 10) << __func__ << " removing pg_temp " << pg.first << " "
+ << pg.second << " exceeds pool size" << dendl;
+ remove = true;
+ }
+ if (remove) {
+ if (oldmap.pg_temp->count(pg.first))
+ pending_inc->new_pg_temp[pg.first].clear();
+ else
+ pending_inc->new_pg_temp.erase(pg.first);
+ }
+ }
+
+ for (auto &pg : *nextmap.primary_temp) {
+ // primary down?
+ if (nextmap.is_down(pg.second)) {
+ ldout(cct, 10) << __func__ << " removing primary_temp " << pg.first
+ << " to down " << pg.second << dendl;
+ pending_inc->new_primary_temp[pg.first] = -1;
+ continue;
+ }
+ // redundant primary_temp?
+ vector<int> real_up, templess_up;
+ int real_primary, templess_primary;
+ pg_t pgid = pg.first;
+ nextmap.pg_to_acting_osds(pgid, &real_up, &real_primary);
+ nextmap.pg_to_raw_up(pgid, &templess_up, &templess_primary);
+ if (real_primary == templess_primary) {
+ ldout(cct, 10) << __func__ << " removing primary_temp "
+ << pgid << " -> " << real_primary
+ << " (unnecessary/redundant)" << dendl;
+ if (oldmap.primary_temp->count(pgid))
+ pending_inc->new_primary_temp[pgid] = -1;
+ else
+ pending_inc->new_primary_temp.erase(pgid);
+ }
+ }
+}
+
+void OSDMap::get_upmap_pgs(vector<pg_t> *upmap_pgs) const
+{
+ upmap_pgs->reserve(pg_upmap.size() + pg_upmap_items.size());
+ for (auto& p : pg_upmap)
+ upmap_pgs->push_back(p.first);
+ for (auto& p : pg_upmap_items)
+ upmap_pgs->push_back(p.first);
+}
+
+bool OSDMap::check_pg_upmaps(
+ CephContext *cct,
+ const vector<pg_t>& to_check,
+ vector<pg_t> *to_cancel,
+ map<pg_t, mempool::osdmap::vector<pair<int,int>>> *to_remap) const
+{
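+ // for each pg in to_check, collect upmap entries that must be cancelled
+ // outright in to_cancel and entries that can be partially kept (rewritten)
+ // in to_remap; returns true if anything needs changing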
+ bool any_change = false;
+ map<int, map<int, float>> rule_weight_map;
+ for (auto& pg : to_check) {
+ const pg_pool_t *pi = get_pg_pool(pg.pool());
+ if (!pi || pg.ps() >= pi->get_pg_num_pending()) {
+ ldout(cct, 0) << __func__ << " pg " << pg << " is gone or merge source"
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ if (pi->is_pending_merge(pg, nullptr)) {
+ ldout(cct, 0) << __func__ << " pg " << pg << " is pending merge"
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ vector<int> raw, up;
+ pg_to_raw_upmap(pg, &raw, &up);
+ auto crush_rule = get_pg_pool_crush_rule(pg);
+ auto r = crush->verify_upmap(cct,
+ crush_rule,
+ get_pg_pool_size(pg),
+ up);
+ if (r < 0) {
+ ldout(cct, 0) << __func__ << " verify_upmap of pg " << pg
+ << " returning " << r
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ // below we check whether crush-topology changes have invalidated this upmap
+ map<int, float> weight_map;
+ auto it = rule_weight_map.find(crush_rule);
+ if (it == rule_weight_map.end()) {
+ auto r = crush->get_rule_weight_osd_map(crush_rule, &weight_map);
+ if (r < 0) {
+ lderr(cct) << __func__ << " unable to get crush weight_map for "
+ << "crush_rule " << crush_rule
+ << dendl;
+ continue;
+ }
+ rule_weight_map[crush_rule] = weight_map;
+ } else {
+ weight_map = it->second;
+ }
+ ldout(cct, 10) << __func__ << " pg " << pg
+ << " weight_map " << weight_map
+ << dendl;
+ for (auto osd : up) {
+ auto it = weight_map.find(osd);
+ if (it == weight_map.end()) {
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd << " is gone or has "
+ << "been moved out of the specific crush-tree"
+ << dendl;
+ to_cancel->push_back(pg);
+ break;
+ }
+ auto adjusted_weight = get_weightf(it->first) * it->second;
+ if (adjusted_weight == 0) {
+ ldout(cct, 10) << __func__ << " pg " << pg << ": osd " << osd
+ << " is out/crush-out"
+ << dendl;
+ to_cancel->push_back(pg);
+ break;
+ }
+ }
+ if (!to_cancel->empty() && to_cancel->back() == pg)
+ continue;
+ // okay, upmap is valid
+ // continue to check if it is still necessary
+ auto i = pg_upmap.find(pg);
+ if (i != pg_upmap.end()) {
+ if (i->second == raw) {
+ ldout(cct, 10) << "removing redundant pg_upmap " << i->first << " "
+ << i->second << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ if ((int)i->second.size() != get_pg_pool_size(pg)) {
+ ldout(cct, 10) << "removing pg_upmap " << i->first << " "
+ << i->second << " != pool size " << get_pg_pool_size(pg)
+ << dendl;
+ to_cancel->push_back(pg);
+ continue;
+ }
+ }
+ auto j = pg_upmap_items.find(pg);
+ if (j != pg_upmap_items.end()) {
+ mempool::osdmap::vector<pair<int,int>> newmap;
+ for (auto& p : j->second) {
+ if (std::find(raw.begin(), raw.end(), p.first) == raw.end()) {
+ // cancel mapping if source osd does not exist anymore
+ continue;
+ }
+ if (p.second != CRUSH_ITEM_NONE && p.second < max_osd &&
+ p.second >= 0 && osd_weight[p.second] == 0) {
+ // cancel mapping if target osd is out
+ continue;
+ }
+ newmap.push_back(p);
+ }
+ if (newmap.empty()) {
+ ldout(cct, 10) << " removing no-op pg_upmap_items "
+ << j->first << " " << j->second
+ << dendl;
+ to_cancel->push_back(pg);
+ } else if (newmap != j->second) {
+ ldout(cct, 10) << " simplifying partially no-op pg_upmap_items "
+ << j->first << " " << j->second
+ << " -> " << newmap
+ << dendl;
+ to_remap->insert({pg, newmap});
+ any_change = true;
+ }
+ }
+ }
+ any_change = any_change || !to_cancel->empty();
+ return any_change;
+}
+
+void OSDMap::clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc,
+ const vector<pg_t>& to_cancel,
+ const map<pg_t, mempool::osdmap::vector<pair<int,int>>>& to_remap) const
+{
+ for (auto &pg: to_cancel) {
+ auto i = pending_inc->new_pg_upmap.find(pg);
+ if (i != pending_inc->new_pg_upmap.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap entry "
+ << i->first << "->" << i->second
+ << dendl;
+ pending_inc->new_pg_upmap.erase(i);
+ }
+ auto j = pg_upmap.find(pg);
+ if (j != pg_upmap.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+ << j->first << "->" << j->second
+ << dendl;
+ pending_inc->old_pg_upmap.insert(pg);
+ }
+ auto p = pending_inc->new_pg_upmap_items.find(pg);
+ if (p != pending_inc->new_pg_upmap_items.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid pending "
+ << "pg_upmap_items entry "
+ << p->first << "->" << p->second
+ << dendl;
+ pending_inc->new_pg_upmap_items.erase(p);
+ }
+ auto q = pg_upmap_items.find(pg);
+ if (q != pg_upmap_items.end()) {
+ ldout(cct, 10) << __func__ << " cancel invalid "
+ << "pg_upmap_items entry "
+ << q->first << "->" << q->second
+ << dendl;
+ pending_inc->old_pg_upmap_items.insert(pg);
+ }
+ }
+ for (auto& i : to_remap)
+ pending_inc->new_pg_upmap_items[i.first] = i.second;
+}
+
+bool OSDMap::clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc) const
+{
+ ldout(cct, 10) << __func__ << dendl;
+ vector<pg_t> to_check;
+ vector<pg_t> to_cancel;
+ map<pg_t, mempool::osdmap::vector<pair<int,int>>> to_remap;
+
+ get_upmap_pgs(&to_check);
+ auto any_change = check_pg_upmaps(cct, to_check, &to_cancel, &to_remap);
+ clean_pg_upmaps(cct, pending_inc, to_cancel, to_remap);
+ return any_change;
+}
+
+int OSDMap::apply_incremental(const Incremental &inc)
+{
+ new_blocklist_entries = false;
+ if (inc.epoch == 1)
+ fsid = inc.fsid;
+ else if (inc.fsid != fsid)
+ return -EINVAL;
+
+ ceph_assert(inc.epoch == epoch+1);
+
+ epoch++;
+ modified = inc.modified;
+
+ // full map?
+ if (inc.fullmap.length()) {
+ ceph::buffer::list bl(inc.fullmap);
+ decode(bl);
+ return 0;
+ }
+
+ // nope, incremental.
+ if (inc.new_flags >= 0) {
+ flags = inc.new_flags;
+ // the below is just to cover a newly-upgraded luminous mon
+ // cluster that has to set require_jewel_osds or
+ // require_kraken_osds before the osds can be upgraded to
+ // luminous.
+ if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
+ if (require_osd_release < ceph_release_t::kraken) {
+ require_osd_release = ceph_release_t::kraken;
+ }
+ } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
+ if (require_osd_release < ceph_release_t::jewel) {
+ require_osd_release = ceph_release_t::jewel;
+ }
+ }
+ }
+
+ if (inc.new_max_osd >= 0)
+ set_max_osd(inc.new_max_osd);
+
+ if (inc.new_pool_max != -1)
+ pool_max = inc.new_pool_max;
+
+ for (const auto &pool : inc.new_pools) {
+ pools[pool.first] = pool.second;
+ pools[pool.first].last_change = epoch;
+ }
+
+ new_removed_snaps = inc.new_removed_snaps;
+ new_purged_snaps = inc.new_purged_snaps;
+ for (auto p = new_removed_snaps.begin();
+ p != new_removed_snaps.end();
+ ++p) {
+ removed_snaps_queue[p->first].union_of(p->second);
+ }
+ for (auto p = new_purged_snaps.begin();
+ p != new_purged_snaps.end();
+ ++p) {
+ auto q = removed_snaps_queue.find(p->first);
+ ceph_assert(q != removed_snaps_queue.end());
+ q->second.subtract(p->second);
+ if (q->second.empty()) {
+ removed_snaps_queue.erase(q);
+ }
+ }
+
+ if (inc.new_last_up_change != utime_t()) {
+ last_up_change = inc.new_last_up_change;
+ }
+ if (inc.new_last_in_change != utime_t()) {
+ last_in_change = inc.new_last_in_change;
+ }
+
+ for (const auto &pname : inc.new_pool_names) {
+ auto pool_name_entry = pool_name.find(pname.first);
+ if (pool_name_entry != pool_name.end()) {
+ name_pool.erase(pool_name_entry->second);
+ pool_name_entry->second = pname.second;
+ } else {
+ pool_name[pname.first] = pname.second;
+ }
+ name_pool[pname.second] = pname.first;
+ }
+
+ for (const auto &pool : inc.old_pools) {
+ pools.erase(pool);
+ name_pool.erase(pool_name[pool]);
+ pool_name.erase(pool);
+ }
+
+ for (const auto &weight : inc.new_weight) {
+ set_weight(weight.first, weight.second);
+
+ // if we are marking in, clear the AUTOOUT and NEW bits, and clear
+ // xinfo old_weight.
+ if (weight.second) {
+ osd_state[weight.first] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW);
+ osd_xinfo[weight.first].old_weight = 0;
+ }
+ }
+
+ for (const auto &primary_affinity : inc.new_primary_affinity) {
+ set_primary_affinity(primary_affinity.first, primary_affinity.second);
+ }
+
+ // erasure_code_profiles
+ for (const auto &profile : inc.old_erasure_code_profiles)
+ erasure_code_profiles.erase(profile);
+
+ for (const auto &profile : inc.new_erasure_code_profiles) {
+ set_erasure_code_profile(profile.first, profile.second);
+ }
+
+ // up/down
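+ // new_state carries state bits to XOR onto the current osd_state; an
+ // empty (zero) value is treated as toggling CEPH_OSD_UP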
+ for (const auto &state : inc.new_state) {
+ const auto osd = state.first;
+ int s = state.second ? state.second : CEPH_OSD_UP;
+ if ((osd_state[osd] & CEPH_OSD_UP) &&
+ (s & CEPH_OSD_UP)) {
+ osd_info[osd].down_at = epoch;
+ osd_xinfo[osd].down_stamp = modified;
+ }
+ if ((osd_state[osd] & CEPH_OSD_EXISTS) &&
+ (s & CEPH_OSD_EXISTS)) {
+ // osd is destroyed; clear out anything interesting.
+ (*osd_uuid)[osd] = uuid_d();
+ osd_info[osd] = osd_info_t();
+ osd_xinfo[osd] = osd_xinfo_t();
+ set_primary_affinity(osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
+ osd_addrs->client_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->cluster_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->hb_front_addrs[osd].reset(new entity_addrvec_t());
+ osd_addrs->hb_back_addrs[osd].reset(new entity_addrvec_t());
+ osd_state[osd] = 0;
+ } else {
+ osd_state[osd] ^= s;
+ }
+ }
+
+ for (const auto &client : inc.new_up_client) {
+ osd_state[client.first] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
+ osd_state[client.first] &= ~CEPH_OSD_STOP; // if any
+ osd_addrs->client_addrs[client.first].reset(
+ new entity_addrvec_t(client.second));
+ osd_addrs->hb_back_addrs[client.first].reset(
+ new entity_addrvec_t(inc.new_hb_back_up.find(client.first)->second));
+ osd_addrs->hb_front_addrs[client.first].reset(
+ new entity_addrvec_t(inc.new_hb_front_up.find(client.first)->second));
+
+ osd_info[client.first].up_from = epoch;
+ }
+
+ for (const auto &cluster : inc.new_up_cluster)
+ osd_addrs->cluster_addrs[cluster.first].reset(
+ new entity_addrvec_t(cluster.second));
+
+ // info
+ for (const auto &thru : inc.new_up_thru)
+ osd_info[thru.first].up_thru = thru.second;
+
+ for (const auto &interval : inc.new_last_clean_interval) {
+ osd_info[interval.first].last_clean_begin = interval.second.first;
+ osd_info[interval.first].last_clean_end = interval.second.second;
+ }
+
+ for (const auto &lost : inc.new_lost)
+ osd_info[lost.first].lost_at = lost.second;
+
+ // xinfo
+ for (const auto &xinfo : inc.new_xinfo)
+ osd_xinfo[xinfo.first] = xinfo.second;
+
+ // uuid
+ for (const auto &uuid : inc.new_uuid)
+ (*osd_uuid)[uuid.first] = uuid.second;
+
+ // pg rebuild
+ for (const auto &pg : inc.new_pg_temp) {
+ if (pg.second.empty())
+ pg_temp->erase(pg.first);
+ else
+ pg_temp->set(pg.first, pg.second);
+ }
+ if (!inc.new_pg_temp.empty()) {
+ // make sure pg_temp is efficiently stored
+ pg_temp->rebuild();
+ }
+
+ for (const auto &pg : inc.new_primary_temp) {
+ if (pg.second == -1)
+ primary_temp->erase(pg.first);
+ else
+ (*primary_temp)[pg.first] = pg.second;
+ }
+
+ for (auto& p : inc.new_pg_upmap) {
+ pg_upmap[p.first] = p.second;
+ }
+ for (auto& pg : inc.old_pg_upmap) {
+ pg_upmap.erase(pg);
+ }
+ for (auto& p : inc.new_pg_upmap_items) {
+ pg_upmap_items[p.first] = p.second;
+ }
+ for (auto& pg : inc.old_pg_upmap_items) {
+ pg_upmap_items.erase(pg);
+ }
+
+ // blocklist
+ if (!inc.new_blocklist.empty()) {
+ blocklist.insert(inc.new_blocklist.begin(),inc.new_blocklist.end());
+ new_blocklist_entries = true;
+ }
+ for (const auto &addr : inc.old_blocklist)
+ blocklist.erase(addr);
+
+ for (const auto& addr_p : inc.new_range_blocklist) {
+ range_blocklist.insert(addr_p);
+ calculated_ranges.emplace(addr_p.first, addr_p.first);
+ new_blocklist_entries = true;
+ }
+ for (const auto &addr : inc.old_range_blocklist) {
+ calculated_ranges.erase(addr);
+ range_blocklist.erase(addr);
+ }
+
+ for (auto& i : inc.new_crush_node_flags) {
+ if (i.second) {
+ crush_node_flags[i.first] = i.second;
+ } else {
+ crush_node_flags.erase(i.first);
+ }
+ }
+
+ for (auto& i : inc.new_device_class_flags) {
+ if (i.second) {
+ device_class_flags[i.first] = i.second;
+ } else {
+ device_class_flags.erase(i.first);
+ }
+ }
+
+ // cluster snapshot?
+ if (inc.cluster_snapshot.length()) {
+ cluster_snapshot = inc.cluster_snapshot;
+ cluster_snapshot_epoch = inc.epoch;
+ } else {
+ cluster_snapshot.clear();
+ cluster_snapshot_epoch = 0;
+ }
+
+ if (inc.new_nearfull_ratio >= 0) {
+ nearfull_ratio = inc.new_nearfull_ratio;
+ }
+ if (inc.new_backfillfull_ratio >= 0) {
+ backfillfull_ratio = inc.new_backfillfull_ratio;
+ }
+ if (inc.new_full_ratio >= 0) {
+ full_ratio = inc.new_full_ratio;
+ }
+ if (inc.new_require_min_compat_client > ceph_release_t::unknown) {
+ require_min_compat_client = inc.new_require_min_compat_client;
+ }
+ if (inc.new_require_osd_release >= ceph_release_t::unknown) {
+ require_osd_release = inc.new_require_osd_release;
+ if (require_osd_release >= ceph_release_t::luminous) {
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ }
+ }
+
+ if (inc.new_require_osd_release >= ceph_release_t::unknown) {
+ require_osd_release = inc.new_require_osd_release;
+ if (require_osd_release >= ceph_release_t::nautilus) {
+ flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
+ }
+ }
+ // do new crush map last (after up/down stuff)
+ if (inc.crush.length()) {
+ ceph::buffer::list bl(inc.crush);
+ auto blp = bl.cbegin();
+ crush.reset(new CrushWrapper);
+ crush->decode(blp);
+ if (require_osd_release >= ceph_release_t::luminous) {
+ // only increment if this is a luminous-encoded osdmap, lest
+ // the mon's crush_version diverge from what the osds or others
+ // are decoding and applying on their end. if we won't encode
+ // it in the canonical version, don't change it.
+ ++crush_version;
+ }
+ for (auto it = device_class_flags.begin();
+ it != device_class_flags.end();) {
+ const char* class_name = crush->get_class_name(it->first);
+ if (!class_name) // device class is gone
+ it = device_class_flags.erase(it);
+ else
+ it++;
+ }
+ }
+
+ if (inc.change_stretch_mode) {
+ stretch_mode_enabled = inc.stretch_mode_enabled;
+ stretch_bucket_count = inc.new_stretch_bucket_count;
+ degraded_stretch_mode = inc.new_degraded_stretch_mode;
+ recovering_stretch_mode = inc.new_recovering_stretch_mode;
+ stretch_mode_bucket = inc.new_stretch_mode_bucket;
+ }
+
+ calc_num_osds();
+ _calc_up_osd_features();
+ return 0;
+}
+
+// mapping
+int OSDMap::map_to_pg(
+ int64_t poolid,
+ const string& name,
+ const string& key,
+ const string& nspace,
+ pg_t *pg) const
+{
+ // calculate ps (placement seed)
+ const pg_pool_t *pool = get_pg_pool(poolid);
+ if (!pool)
+ return -ENOENT;
+ ps_t ps;
+ if (!key.empty())
+ ps = pool->hash_key(key, nspace);
+ else
+ ps = pool->hash_key(name, nspace);
+ *pg = pg_t(ps, poolid);
+ return 0;
+}
+
+int OSDMap::object_locator_to_pg(
+ const object_t& oid, const object_locator_t& loc, pg_t &pg) const
+{
+ if (loc.hash >= 0) {
+ if (!get_pg_pool(loc.get_pool())) {
+ return -ENOENT;
+ }
+ pg = pg_t(loc.hash, loc.get_pool());
+ return 0;
+ }
+ return map_to_pg(loc.get_pool(), oid.name, loc.key, loc.nspace, &pg);
+}
+
+ceph_object_layout OSDMap::make_object_layout(
+ object_t oid, int pg_pool, string nspace) const
+{
+ object_locator_t loc(pg_pool, nspace);
+
+ ceph_object_layout ol;
+ pg_t pgid = object_locator_to_pg(oid, loc);
+ ol.ol_pgid = pgid.get_old_pg().v;
+ ol.ol_stripe_unit = 0;
+ return ol;
+}
+
+void OSDMap::_remove_nonexistent_osds(const pg_pool_t& pool,
+ vector<int>& osds) const
+{
+ if (pool.can_shift_osds()) {
+ unsigned removed = 0;
+ for (unsigned i = 0; i < osds.size(); i++) {
+ if (!exists(osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed) {
+ osds[i - removed] = osds[i];
+ }
+ }
+ if (removed)
+ osds.resize(osds.size() - removed);
+ } else {
+ for (auto& osd : osds) {
+ if (!exists(osd))
+ osd = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+void OSDMap::_pg_to_raw_osds(
+ const pg_pool_t& pool, pg_t pg,
+ vector<int> *osds,
+ ps_t *ppps) const
+{
+ // map to osds[]
+ ps_t pps = pool.raw_pg_to_pps(pg); // placement ps
+ unsigned size = pool.get_size();
+
+ // what crush rule?
+ int ruleno = crush->find_rule(pool.get_crush_rule(), pool.get_type(), size);
+ if (ruleno >= 0)
+ crush->do_rule(ruleno, pps, *osds, size, osd_weight, pg.pool());
+
+ _remove_nonexistent_osds(pool, *osds);
+
+ if (ppps)
+ *ppps = pps;
+}
+
+int OSDMap::_pick_primary(const vector<int>& osds) const
+{
+ for (auto osd : osds) {
+ if (osd != CRUSH_ITEM_NONE) {
+ return osd;
+ }
+ }
+ return -1;
+}
+
+void OSDMap::_apply_upmap(const pg_pool_t& pi, pg_t raw_pg, vector<int> *raw) const
+{
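+ // pg_upmap replaces the whole raw mapping for a pg; pg_upmap_items
+ // rewrites individual from->to osd pairs within it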
+ pg_t pg = pi.raw_pg_to_pg(raw_pg);
+ auto p = pg_upmap.find(pg);
+ if (p != pg_upmap.end()) {
+ // make sure targets aren't marked out
+ for (auto osd : p->second) {
+ if (osd != CRUSH_ITEM_NONE && osd < max_osd && osd >= 0 &&
+ osd_weight[osd] == 0) {
+ // reject/ignore the explicit mapping
+ return;
+ }
+ }
+ *raw = vector<int>(p->second.begin(), p->second.end());
+ // continue to check and apply pg_upmap_items if any
+ }
+
+ auto q = pg_upmap_items.find(pg);
+ if (q != pg_upmap_items.end()) {
+ // NOTE: this approach does not allow a bidirectional swap,
+ // e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+ for (auto& r : q->second) {
+ // make sure the replacement value doesn't already appear
+ bool exists = false;
+ ssize_t pos = -1;
+ for (unsigned i = 0; i < raw->size(); ++i) {
+ int osd = (*raw)[i];
+ if (osd == r.second) {
+ exists = true;
+ break;
+ }
+ // ignore mapping if target is marked out (or invalid osd id)
+ if (osd == r.first &&
+ pos < 0 &&
+ !(r.second != CRUSH_ITEM_NONE && r.second < max_osd &&
+ r.second >= 0 && osd_weight[r.second] == 0)) {
+ pos = i;
+ }
+ }
+ if (!exists && pos >= 0) {
+ (*raw)[pos] = r.second;
+ }
+ }
+ }
+}
+
+// pg -> (up osd list)
+void OSDMap::_raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
+ vector<int> *up) const
+{
+ if (pool.can_shift_osds()) {
+ // shift left
+ up->clear();
+ up->reserve(raw.size());
+ for (unsigned i=0; i<raw.size(); i++) {
+ if (!exists(raw[i]) || is_down(raw[i]))
+ continue;
+ up->push_back(raw[i]);
+ }
+ } else {
+ // set down/dne devices to NONE
+ up->resize(raw.size());
+ for (int i = raw.size() - 1; i >= 0; --i) {
+ if (!exists(raw[i]) || is_down(raw[i])) {
+ (*up)[i] = CRUSH_ITEM_NONE;
+ } else {
+ (*up)[i] = raw[i];
+ }
+ }
+ }
+}
+
+void OSDMap::_apply_primary_affinity(ps_t seed,
+ const pg_pool_t& pool,
+ vector<int> *osds,
+ int *primary) const
+{
+ // do we have any non-default primary_affinity values for these osds?
+ if (!osd_primary_affinity)
+ return;
+
+ bool any = false;
+ for (const auto osd : *osds) {
+ if (osd != CRUSH_ITEM_NONE &&
+ (*osd_primary_affinity)[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ any = true;
+ break;
+ }
+ }
+ if (!any)
+ return;
+
+ // pick the primary. feed both the seed (for the pg) and the osd
+ // into the hash/rng so that a proportional fraction of an osd's pgs
+ // get rejected as primary.
+ int pos = -1;
+ for (unsigned i = 0; i < osds->size(); ++i) {
+ int o = (*osds)[i];
+ if (o == CRUSH_ITEM_NONE)
+ continue;
+ unsigned a = (*osd_primary_affinity)[o];
+ if (a < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ seed, o) >> 16) >= a) {
+ // we chose not to use this primary. note it anyway as a
+ // fallback in case we don't pick anyone else, but keep looking.
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = (*osds)[pos];
+
+ if (pool.can_shift_osds() && pos > 0) {
+ // move the new primary to the front.
+ for (int i = pos; i > 0; --i) {
+ (*osds)[i] = (*osds)[i-1];
+ }
+ (*osds)[0] = *primary;
+ }
+}
+
+void OSDMap::_get_temp_osds(const pg_pool_t& pool, pg_t pg,
+ vector<int> *temp_pg, int *temp_primary) const
+{
+ pg = pool.raw_pg_to_pg(pg);
+ const auto p = pg_temp->find(pg);
+ temp_pg->clear();
+ if (p != pg_temp->end()) {
+ for (unsigned i=0; i<p->second.size(); i++) {
+ if (!exists(p->second[i]) || is_down(p->second[i])) {
+ if (pool.can_shift_osds()) {
+ continue;
+ } else {
+ temp_pg->push_back(CRUSH_ITEM_NONE);
+ }
+ } else {
+ temp_pg->push_back(p->second[i]);
+ }
+ }
+ }
+ const auto &pp = primary_temp->find(pg);
+ *temp_primary = -1;
+ if (pp != primary_temp->end()) {
+ *temp_primary = pp->second;
+ } else if (!temp_pg->empty()) { // apply pg_temp's primary
+ for (unsigned i = 0; i < temp_pg->size(); ++i) {
+ if ((*temp_pg)[i] != CRUSH_ITEM_NONE) {
+ *temp_primary = (*temp_pg)[i];
+ break;
+ }
+ }
+ }
+}
+
+void OSDMap::pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ *primary = -1;
+ raw->clear();
+ return;
+ }
+ _pg_to_raw_osds(*pool, pg, raw, NULL);
+ *primary = _pick_primary(*raw);
+}
+
+void OSDMap::pg_to_raw_upmap(pg_t pg, vector<int>*raw,
+ vector<int> *raw_upmap) const
+{
+ auto pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ raw_upmap->clear();
+ return;
+ }
+ _pg_to_raw_osds(*pool, pg, raw, NULL);
+ *raw_upmap = *raw;
+ _apply_upmap(*pool, pg, raw_upmap);
+}
+
+void OSDMap::pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool) {
+ *primary = -1;
+ up->clear();
+ return;
+ }
+ vector<int> raw;
+ ps_t pps;
+ _pg_to_raw_osds(*pool, pg, &raw, &pps);
+ _apply_upmap(*pool, pg, &raw);
+ _raw_to_up_osds(*pool, raw, up);
+ *primary = _pick_primary(raw);
+ _apply_primary_affinity(pps, *pool, up, primary);
+}
+
+void OSDMap::_pg_to_up_acting_osds(
+ const pg_t& pg, vector<int> *up, int *up_primary,
+ vector<int> *acting, int *acting_primary,
+ bool raw_pg_to_pg) const
+{
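+ // "up" is derived from crush (after upmap and primary-affinity
+ // adjustments); "acting" starts from any pg_temp/primary_temp overrides
+ // and falls back to the up set when there are none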
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool ||
+ (!raw_pg_to_pg && pg.ps() >= pool->get_pg_num())) {
+ if (up)
+ up->clear();
+ if (up_primary)
+ *up_primary = -1;
+ if (acting)
+ acting->clear();
+ if (acting_primary)
+ *acting_primary = -1;
+ return;
+ }
+ vector<int> raw;
+ vector<int> _up;
+ vector<int> _acting;
+ int _up_primary;
+ int _acting_primary;
+ ps_t pps;
+ _get_temp_osds(*pool, pg, &_acting, &_acting_primary);
+ if (_acting.empty() || up || up_primary) {
+ _pg_to_raw_osds(*pool, pg, &raw, &pps);
+ _apply_upmap(*pool, pg, &raw);
+ _raw_to_up_osds(*pool, raw, &_up);
+ _up_primary = _pick_primary(_up);
+ _apply_primary_affinity(pps, *pool, &_up, &_up_primary);
+ if (_acting.empty()) {
+ _acting = _up;
+ if (_acting_primary == -1) {
+ _acting_primary = _up_primary;
+ }
+ }
+
+ if (up)
+ up->swap(_up);
+ if (up_primary)
+ *up_primary = _up_primary;
+ }
+
+ if (acting)
+ acting->swap(_acting);
+ if (acting_primary)
+ *acting_primary = _acting_primary;
+}
+
+int OSDMap::calc_pg_role_broken(int osd, const vector<int>& acting, int nrep)
+{
+ // This implementation is broken for EC PGs since the osd may appear
+ // multiple times in the acting set. See
+ // https://tracker.ceph.com/issues/43213
+ if (!nrep)
+ nrep = acting.size();
+ for (int i=0; i<nrep; i++)
+ if (acting[i] == osd)
+ return i;
+ return -1;
+}
+
+int OSDMap::calc_pg_role(pg_shard_t who, const vector<int>& acting)
+{
+ int nrep = acting.size();
+ if (who.shard == shard_id_t::NO_SHARD) {
+ for (int i=0; i<nrep; i++) {
+ if (acting[i] == who.osd) {
+ return i;
+ }
+ }
+ } else {
+ if (who.shard < nrep && acting[who.shard] == who.osd) {
+ return who.shard;
+ }
+ }
+ return -1;
+}
+
+bool OSDMap::primary_changed_broken(
+ int oldprimary,
+ const vector<int> &oldacting,
+ int newprimary,
+ const vector<int> &newacting)
+{
+ if (oldacting.empty() && newacting.empty())
+ return false; // both still empty
+ if (oldacting.empty() ^ newacting.empty())
+ return true; // was empty, now not, or vice versa
+ if (oldprimary != newprimary)
+ return true; // primary changed
+ if (calc_pg_role_broken(oldprimary, oldacting) !=
+ calc_pg_role_broken(newprimary, newacting))
+ return true;
+ return false; // same primary (tho replicas may have changed)
+}
+
+uint64_t OSDMap::get_encoding_features() const
+{
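+ // strip feature bits that require_osd_release does not yet guarantee,
+ // presumably so the canonical encoding stays decodable by the oldest
+ // release allowed in the cluster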
+ uint64_t f = SIGNIFICANT_FEATURES;
+ if (require_osd_release < ceph_release_t::octopus) {
+ f &= ~CEPH_FEATURE_SERVER_OCTOPUS;
+ }
+ if (require_osd_release < ceph_release_t::nautilus) {
+ f &= ~CEPH_FEATURE_SERVER_NAUTILUS;
+ }
+ if (require_osd_release < ceph_release_t::mimic) {
+ f &= ~CEPH_FEATURE_SERVER_MIMIC;
+ }
+ if (require_osd_release < ceph_release_t::luminous) {
+ f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+ }
+ if (require_osd_release < ceph_release_t::kraken) {
+ f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
+ CEPH_FEATURE_MSG_ADDR2);
+ }
+ if (require_osd_release < ceph_release_t::jewel) {
+ f &= ~(CEPH_FEATURE_SERVER_JEWEL |
+ CEPH_FEATURE_NEW_OSDOP_ENCODING |
+ CEPH_FEATURE_CRUSH_TUNABLES5);
+ }
+ return f;
+}
+
+// serialize, unserialize
+void OSDMap::encode_client_old(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ __u16 v = 5;
+ encode(v, bl);
+
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ // for encode(pools, bl);
+ __u32 n = pools.size();
+ encode(n, bl);
+
+ for (const auto &pool : pools) {
+ n = pool.first;
+ encode(n, bl);
+ encode(pool.second, bl, 0);
+ }
+ // for encode(pool_name, bl);
+ n = pool_name.size();
+ encode(n, bl);
+ for (const auto &pname : pool_name) {
+ n = pname.first;
+ encode(n, bl);
+ encode(pname.second, bl);
+ }
+ // for encode(pool_max, bl);
+ n = pool_max;
+ encode(n, bl);
+
+ encode(flags, bl);
+
+ encode(max_osd, bl);
+ {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ encode(osd_addrs->client_addrs, bl, 0);
+
+ // for encode(pg_temp, bl);
+ n = pg_temp->size();
+ encode(n, bl);
+ for (const auto& pg : *pg_temp) {
+ old_pg_t opg = pg.first.get_old_pg();
+ encode(opg, bl);
+ encode(pg.second, bl);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, 0 /* legacy (no) features */);
+ encode(cbl, bl);
+}
+
+void OSDMap::encode_classic(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGID64) == 0) {
+ encode_client_old(bl);
+ return;
+ }
+
+ __u16 v = 6;
+ encode(v, bl);
+
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ encode(pools, bl, features);
+ encode(pool_name, bl);
+ encode(pool_max, bl);
+
+ encode(flags, bl);
+
+ encode(max_osd, bl);
+ {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ encode(osd_addrs->client_addrs, bl, features);
+
+ encode(*pg_temp, bl);
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, 0 /* legacy (no) features */);
+ encode(cbl, bl);
+
+ // extended
+ __u16 ev = 10;
+ encode(ev, bl);
+ encode(osd_addrs->hb_back_addrs, bl, features);
+ encode(osd_info, bl);
+ encode(blocklist, bl, features);
+ encode(osd_addrs->cluster_addrs, bl, features);
+ encode(cluster_snapshot_epoch, bl);
+ encode(cluster_snapshot, bl);
+ encode(*osd_uuid, bl);
+ encode(osd_xinfo, bl, features);
+ encode(osd_addrs->hb_front_addrs, bl, features);
+}
+
+/* for a description of osdmap versions, and when they were introduced, please
+ * refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDMAP_ENC) == 0) {
+ encode_classic(bl, features);
+ return;
+ }
+
+ // only a select set of callers should *ever* be encoding new
+ // OSDMaps. others should be passing around the canonical encoded
+ // buffers from on high. select out those callers by passing in an
+ // "impossible" feature bit.
+ ceph_assert(features & CEPH_FEATURE_RESERVED);
+ features &= ~CEPH_FEATURE_RESERVED;
+
+ size_t start_offset = bl.length();
+ size_t tail_offset;
+ size_t crc_offset;
+ std::optional<ceph::buffer::list::contiguous_filler> crc_filler;
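+ // a 4-byte hole is left for the crc; it is computed over the bytes on
+ // either side of the hole and patched in at the end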
+
+ // meta-encoding: how we include client-used and osd-specific data
+ ENCODE_START(8, 7, bl);
+
+ {
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ uint8_t v = 9;
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 3;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 6;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 7;
+ }
+ ENCODE_START(v, 1, bl); // client-usable data
+ // base
+ encode(fsid, bl);
+ encode(epoch, bl);
+ encode(created, bl);
+ encode(modified, bl);
+
+ encode(pools, bl, features);
+ encode(pool_name, bl);
+ encode(pool_max, bl);
+
+ if (v < 4) {
+ decltype(flags) f = flags;
+ if (require_osd_release >= ceph_release_t::luminous)
+ f |= CEPH_OSDMAP_REQUIRE_LUMINOUS | CEPH_OSDMAP_RECOVERY_DELETES;
+ else if (require_osd_release == ceph_release_t::kraken)
+ f |= CEPH_OSDMAP_REQUIRE_KRAKEN;
+ else if (require_osd_release == ceph_release_t::jewel)
+ f |= CEPH_OSDMAP_REQUIRE_JEWEL;
+ encode(f, bl);
+ } else {
+ encode(flags, bl);
+ }
+
+ encode(max_osd, bl);
+ if (v >= 5) {
+ encode(osd_state, bl);
+ } else {
+ uint32_t n = osd_state.size();
+ encode(n, bl);
+ for (auto s : osd_state) {
+ encode((uint8_t)s, bl);
+ }
+ }
+ encode(osd_weight, bl);
+ if (v >= 8) {
+ encode(osd_addrs->client_addrs, bl, features);
+ } else {
+ encode_addrvec_pvec_as_addr(osd_addrs->client_addrs, bl, features);
+ }
+
+ encode(*pg_temp, bl);
+ encode(*primary_temp, bl);
+ if (osd_primary_affinity) {
+ encode(*osd_primary_affinity, bl);
+ } else {
+ vector<__u32> v;
+ encode(v, bl);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ crush->encode(cbl, features);
+ encode(cbl, bl);
+ encode(erasure_code_profiles, bl);
+
+ if (v >= 4) {
+ encode(pg_upmap, bl);
+ encode(pg_upmap_items, bl);
+ } else {
+ ceph_assert(pg_upmap.empty());
+ ceph_assert(pg_upmap_items.empty());
+ }
+ if (v >= 6) {
+ encode(crush_version, bl);
+ }
+ if (v >= 7) {
+ encode(new_removed_snaps, bl);
+ encode(new_purged_snaps, bl);
+ }
+ if (v >= 9) {
+ encode(last_up_change, bl);
+ encode(last_in_change, bl);
+ }
+ ENCODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ uint8_t target_v = 9; // when bumping this, be aware of range blocklist
+ if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ target_v = 1;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ target_v = 5;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ target_v = 6;
+ }
+ if (stretch_mode_enabled) {
+ target_v = std::max((uint8_t)10, target_v);
+ }
+ if (!range_blocklist.empty()) {
+ target_v = std::max((uint8_t)11, target_v);
+ }
+ ENCODE_START(target_v, 1, bl); // extended, osd-only data
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->hb_back_addrs, bl, features);
+ } else {
+ encode(osd_addrs->hb_back_addrs, bl, features);
+ }
+ encode(osd_info, bl);
+ {
+ // put this in a sorted map<> so that we encode in a
+ // deterministic order.
+ map<entity_addr_t,utime_t> blocklist_map;
+ for (const auto &addr : blocklist)
+ blocklist_map.insert(make_pair(addr.first, addr.second));
+ encode(blocklist_map, bl, features);
+ }
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->cluster_addrs, bl, features);
+ } else {
+ encode(osd_addrs->cluster_addrs, bl, features);
+ }
+ encode(cluster_snapshot_epoch, bl);
+ encode(cluster_snapshot, bl);
+ encode(*osd_uuid, bl);
+ encode(osd_xinfo, bl, features);
+ if (target_v < 7) {
+ encode_addrvec_pvec_as_addr(osd_addrs->hb_front_addrs, bl, features);
+ } else {
+ encode(osd_addrs->hb_front_addrs, bl, features);
+ }
+ if (target_v >= 2) {
+ encode(nearfull_ratio, bl);
+ encode(full_ratio, bl);
+ encode(backfillfull_ratio, bl);
+ }
+ // 4 was string-based new_require_min_compat_client
+ if (target_v >= 5) {
+ encode(require_min_compat_client, bl);
+ encode(require_osd_release, bl);
+ }
+ if (target_v >= 6) {
+ encode(removed_snaps_queue, bl);
+ }
+ if (target_v >= 8) {
+ encode(crush_node_flags, bl);
+ }
+ if (target_v >= 9) {
+ encode(device_class_flags, bl);
+ }
+ if (target_v >= 10) {
+ encode(stretch_mode_enabled, bl);
+ encode(stretch_bucket_count, bl);
+ encode(degraded_stretch_mode, bl);
+ encode(recovering_stretch_mode, bl);
+ encode(stretch_mode_bucket, bl);
+ }
+ if (target_v >= 11) {
+ ::encode(range_blocklist, bl, features);
+ }
+ ENCODE_FINISH(bl); // osd-only data
+ }
+
+ crc_offset = bl.length();
+ crc_filler = bl.append_hole(sizeof(uint32_t));
+ tail_offset = bl.length();
+
+ ENCODE_FINISH(bl); // meta-encoding wrapper
+
+ // fill in crc
+ ceph::buffer::list front;
+ front.substr_of(bl, start_offset, crc_offset - start_offset);
+ crc = front.crc32c(-1);
+ if (tail_offset < bl.length()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl, tail_offset, bl.length() - tail_offset);
+ crc = tail.crc32c(crc);
+ }
+ ceph_le32 crc_le;
+ crc_le = crc;
+ crc_filler->copy_in(4, (char*)&crc_le);
+ crc_defined = true;
+}
+
+/* for a description of osdmap versions, and when they were introduced, please
+ * refer to
+ * doc/dev/osd_internals/osdmap_versions.txt
+ */
+void OSDMap::decode(ceph::buffer::list& bl)
+{
+ auto p = bl.cbegin();
+ decode(p);
+}
+
+void OSDMap::decode_classic(ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ __u32 n, t;
+ __u16 v;
+ decode(v, p);
+
+ // base
+ decode(fsid, p);
+ decode(epoch, p);
+ decode(created, p);
+ decode(modified, p);
+
+ if (v < 6) {
+ if (v < 4) {
+ int32_t max_pools = 0;
+ decode(max_pools, p);
+ pool_max = max_pools;
+ }
+ pools.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(pools[t], p);
+ }
+ if (v == 4) {
+ decode(n, p);
+ pool_max = n;
+ } else if (v == 5) {
+ pool_name.clear();
+ decode(n, p);
+ while (n--) {
+ decode(t, p);
+ decode(pool_name[t], p);
+ }
+ decode(n, p);
+ pool_max = n;
+ }
+ } else {
+ decode(pools, p);
+ decode(pool_name, p);
+ decode(pool_max, p);
+ }
+ // kludge around some old bug that zeroed out pool_max (#2307)
+ if (pools.size() && pool_max < pools.rbegin()->first) {
+ pool_max = pools.rbegin()->first;
+ }
+
+ decode(flags, p);
+
+ decode(max_osd, p);
+ {
+ vector<uint8_t> os;
+ decode(os, p);
+ osd_state.resize(os.size());
+ for (unsigned i = 0; i < os.size(); ++i) {
+ osd_state[i] = os[i];
+ }
+ }
+ decode(osd_weight, p);
+ decode(osd_addrs->client_addrs, p);
+ if (v <= 5) {
+ pg_temp->clear();
+ decode(n, p);
+ while (n--) {
+ old_pg_t opg;
+ ceph::decode_raw(opg, p);
+ mempool::osdmap::vector<int32_t> v;
+ decode(v, p);
+ pg_temp->set(pg_t(opg), v);
+ }
+ } else {
+ decode(*pg_temp, p);
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ decode(cbl, p);
+ auto cblp = cbl.cbegin();
+ crush->decode(cblp);
+
+ // extended
+ __u16 ev = 0;
+ if (v >= 5)
+ decode(ev, p);
+ decode(osd_addrs->hb_back_addrs, p);
+ decode(osd_info, p);
+ if (v < 5)
+ decode(pool_name, p);
+
+ decode(blocklist, p);
+ if (ev >= 6)
+ decode(osd_addrs->cluster_addrs, p);
+ else
+ osd_addrs->cluster_addrs.resize(osd_addrs->client_addrs.size());
+
+ if (ev >= 7) {
+ decode(cluster_snapshot_epoch, p);
+ decode(cluster_snapshot, p);
+ }
+
+ if (ev >= 8) {
+ decode(*osd_uuid, p);
+ } else {
+ osd_uuid->resize(max_osd);
+ }
+ if (ev >= 9)
+ decode(osd_xinfo, p);
+ else
+ osd_xinfo.resize(max_osd);
+
+ if (ev >= 10)
+ decode(osd_addrs->hb_front_addrs, p);
+ else
+ osd_addrs->hb_front_addrs.resize(osd_addrs->hb_back_addrs.size());
+
+ osd_primary_affinity.reset();
+
+ post_decode();
+}
+
+void OSDMap::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ /**
+ * Older encodings of the OSDMap had a single struct_v which
+ * covered the whole encoding, and was prior to our modern
+ * stuff which includes a compatv and a size. So if we see
+ * a struct_v < 7, we must rewind to the beginning and use our
+ * classic decoder.
+ */
+ size_t start_offset = bl.get_off();
+ size_t tail_offset = 0;
+ ceph::buffer::list crc_front, crc_tail;
+
+ DECODE_START_LEGACY_COMPAT_LEN(8, 7, 7, bl); // wrapper
+ if (struct_v < 7) {
+ bl.seek(start_offset);
+ decode_classic(bl);
+ return;
+ }
+ /**
+ * Since we made it past that hurdle, we can use our normal paths.
+ */
+ {
+ DECODE_START(9, bl); // client-usable data
+ // base
+ decode(fsid, bl);
+ decode(epoch, bl);
+ decode(created, bl);
+ decode(modified, bl);
+
+ decode(pools, bl);
+ decode(pool_name, bl);
+ decode(pool_max, bl);
+
+ decode(flags, bl);
+
+ decode(max_osd, bl);
+ if (struct_v >= 5) {
+ decode(osd_state, bl);
+ } else {
+ vector<uint8_t> os;
+ decode(os, bl);
+ osd_state.resize(os.size());
+ for (unsigned i = 0; i < os.size(); ++i) {
+ osd_state[i] = os[i];
+ }
+ }
+ decode(osd_weight, bl);
+ decode(osd_addrs->client_addrs, bl);
+
+ decode(*pg_temp, bl);
+ decode(*primary_temp, bl);
+ // dates back to firefly. version increased from 2 to 3 still in firefly.
+ // do we really still need to keep this around? even for old clients?
+ if (struct_v >= 2) {
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>);
+ decode(*osd_primary_affinity, bl);
+ if (osd_primary_affinity->empty())
+ osd_primary_affinity.reset();
+ } else {
+ osd_primary_affinity.reset();
+ }
+
+ // crush
+ ceph::buffer::list cbl;
+ decode(cbl, bl);
+ auto cblp = cbl.cbegin();
+ crush->decode(cblp);
+ // added in firefly; version increased in luminous, so it affects
+ // giant, hammer, infernalis, jewel, and kraken. probably should be left
+ // alone until we require clients to be all luminous?
+ if (struct_v >= 3) {
+ decode(erasure_code_profiles, bl);
+ } else {
+ erasure_code_profiles.clear();
+ }
+ // version increased from 3 to 4 still in luminous, so same as above
+ // applies.
+ if (struct_v >= 4) {
+ decode(pg_upmap, bl);
+ decode(pg_upmap_items, bl);
+ } else {
+ pg_upmap.clear();
+ pg_upmap_items.clear();
+ }
+ // again, version increased from 5 to 6 still in luminous, so above
+ // applies.
+ if (struct_v >= 6) {
+ decode(crush_version, bl);
+ }
+ // version increase from 6 to 7 in mimic
+ if (struct_v >= 7) {
+ decode(new_removed_snaps, bl);
+ decode(new_purged_snaps, bl);
+ }
+ // version increase from 7 to 8, 8 to 9, in nautilus.
+ if (struct_v >= 9) {
+ decode(last_up_change, bl);
+ decode(last_in_change, bl);
+ }
+ DECODE_FINISH(bl); // client-usable data
+ }
+
+ {
+ DECODE_START(10, bl); // extended, osd-only data
+ decode(osd_addrs->hb_back_addrs, bl);
+ decode(osd_info, bl);
+ decode(blocklist, bl);
+ decode(osd_addrs->cluster_addrs, bl);
+ decode(cluster_snapshot_epoch, bl);
+ decode(cluster_snapshot, bl);
+ decode(*osd_uuid, bl);
+ decode(osd_xinfo, bl);
+ decode(osd_addrs->hb_front_addrs, bl);
+ //
+ if (struct_v >= 2) {
+ decode(nearfull_ratio, bl);
+ decode(full_ratio, bl);
+ } else {
+ nearfull_ratio = 0;
+ full_ratio = 0;
+ }
+ if (struct_v >= 3) {
+ decode(backfillfull_ratio, bl);
+ } else {
+ backfillfull_ratio = 0;
+ }
+ if (struct_v == 4) {
+ string r;
+ decode(r, bl);
+ if (r.length())
+ require_min_compat_client = ceph_release_from_name(r.c_str());
+ }
+ if (struct_v >= 5) {
+ decode(require_min_compat_client, bl);
+ decode(require_osd_release, bl);
+ if (require_osd_release >= ceph_release_t::nautilus) {
+ flags |= CEPH_OSDMAP_PGLOG_HARDLIMIT;
+ }
+ if (require_osd_release >= ceph_release_t::luminous) {
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ }
+ } else {
+ if (flags & CEPH_OSDMAP_REQUIRE_LUMINOUS) {
+ // only for compat with post-kraken pre-luminous test clusters
+ require_osd_release = ceph_release_t::luminous;
+ flags &= ~(CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS);
+ flags |= CEPH_OSDMAP_RECOVERY_DELETES;
+ } else if (flags & CEPH_OSDMAP_REQUIRE_KRAKEN) {
+ require_osd_release = ceph_release_t::kraken;
+ } else if (flags & CEPH_OSDMAP_REQUIRE_JEWEL) {
+ require_osd_release = ceph_release_t::jewel;
+ } else {
+ require_osd_release = ceph_release_t::unknown;
+ }
+ }
+ if (struct_v >= 6) {
+ decode(removed_snaps_queue, bl);
+ }
+ if (struct_v >= 8) {
+ decode(crush_node_flags, bl);
+ } else {
+ crush_node_flags.clear();
+ }
+ if (struct_v >= 9) {
+ decode(device_class_flags, bl);
+ } else {
+ device_class_flags.clear();
+ }
+ if (struct_v >= 10) {
+ decode(stretch_mode_enabled, bl);
+ decode(stretch_bucket_count, bl);
+ decode(degraded_stretch_mode, bl);
+ decode(recovering_stretch_mode, bl);
+ decode(stretch_mode_bucket, bl);
+ } else {
+ stretch_mode_enabled = false;
+ stretch_bucket_count = 0;
+ degraded_stretch_mode = 0;
+ recovering_stretch_mode = 0;
+ stretch_mode_bucket = 0;
+ }
+ if (struct_v >= 11) {
+ decode(range_blocklist, bl);
+ calculated_ranges.clear();
+ for (const auto& i : range_blocklist) {
+ calculated_ranges.emplace(i.first, i.first);
+ }
+ }
+ DECODE_FINISH(bl); // osd-only data
+ }
+
+ if (struct_v >= 8) {
+ crc_front.substr_of(bl.get_bl(), start_offset, bl.get_off() - start_offset);
+ decode(crc, bl);
+ tail_offset = bl.get_off();
+ crc_defined = true;
+ } else {
+ crc_defined = false;
+ crc = 0;
+ }
+
+ DECODE_FINISH(bl); // wrapper
+
+ if (tail_offset) {
+ // verify crc
+ uint32_t actual = crc_front.crc32c(-1);
+ if (tail_offset < bl.get_off()) {
+ ceph::buffer::list tail;
+ tail.substr_of(bl.get_bl(), tail_offset, bl.get_off() - tail_offset);
+ actual = tail.crc32c(actual);
+ }
+ if (crc != actual) {
+ ostringstream ss;
+ ss << "bad crc, actual " << actual << " != expected " << crc;
+ string s = ss.str();
+ throw ceph::buffer::malformed_input(s.c_str());
+ }
+ }
+
+ post_decode();
+}
+
+void OSDMap::post_decode()
+{
+ // index pool names
+ name_pool.clear();
+ for (const auto &pname : pool_name) {
+ name_pool[pname.second] = pname.first;
+ }
+
+ calc_num_osds();
+ _calc_up_osd_features();
+}
+
+void OSDMap::dump_erasure_code_profiles(
+ const mempool::osdmap::map<string,map<string,string>>& profiles,
+ Formatter *f)
+{
+ f->open_object_section("erasure_code_profiles");
+ for (const auto &profile : profiles) {
+ f->open_object_section(profile.first.c_str());
+ for (const auto &profm : profile.second) {
+ f->dump_string(profm.first.c_str(), profm.second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void OSDMap::dump_osds(Formatter *f) const
+{
+ f->open_array_section("osds");
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ dump_osd(i, f);
+ }
+ }
+ f->close_section();
+}
+
+void OSDMap::dump_osd(int id, Formatter *f) const
+{
+ ceph_assert(f != nullptr);
+ if (!exists(id)) {
+ return;
+ }
+
+ f->open_object_section("osd_info");
+ f->dump_int("osd", id);
+ f->dump_stream("uuid") << get_uuid(id);
+ f->dump_int("up", is_up(id));
+ f->dump_int("in", is_in(id));
+ f->dump_float("weight", get_weightf(id));
+ f->dump_float("primary_affinity", get_primary_affinityf(id));
+ get_info(id).dump(f);
+ f->dump_object("public_addrs", get_addrs(id));
+ f->dump_object("cluster_addrs", get_cluster_addrs(id));
+ f->dump_object("heartbeat_back_addrs", get_hb_back_addrs(id));
+ f->dump_object("heartbeat_front_addrs", get_hb_front_addrs(id));
+ // compat
+ f->dump_stream("public_addr") << get_addrs(id).get_legacy_str();
+ f->dump_stream("cluster_addr") << get_cluster_addrs(id).get_legacy_str();
+ f->dump_stream("heartbeat_back_addr")
+ << get_hb_back_addrs(id).get_legacy_str();
+ f->dump_stream("heartbeat_front_addr")
+ << get_hb_front_addrs(id).get_legacy_str();
+
+ set<string> st;
+ get_state(id, st);
+ f->open_array_section("state");
+ for (const auto &state : st)
+ f->dump_string("state", state);
+ f->close_section();
+
+ f->close_section();
+}
+
+void OSDMap::dump(Formatter *f) const
+{
+ f->dump_int("epoch", get_epoch());
+ f->dump_stream("fsid") << get_fsid();
+ f->dump_stream("created") << get_created();
+ f->dump_stream("modified") << get_modified();
+ f->dump_stream("last_up_change") << last_up_change;
+ f->dump_stream("last_in_change") << last_in_change;
+ f->dump_string("flags", get_flag_string());
+ f->dump_unsigned("flags_num", flags);
+ f->open_array_section("flags_set");
+ set<string> flagset;
+ get_flag_set(&flagset);
+ for (auto p : flagset) {
+ f->dump_string("flag", p);
+ }
+ f->close_section();
+ f->dump_unsigned("crush_version", get_crush_version());
+ f->dump_float("full_ratio", full_ratio);
+ f->dump_float("backfillfull_ratio", backfillfull_ratio);
+ f->dump_float("nearfull_ratio", nearfull_ratio);
+ f->dump_string("cluster_snapshot", get_cluster_snapshot());
+ f->dump_int("pool_max", get_pool_max());
+ f->dump_int("max_osd", get_max_osd());
+ f->dump_string("require_min_compat_client",
+ to_string(require_min_compat_client));
+ f->dump_string("min_compat_client",
+ to_string(get_min_compat_client()));
+ f->dump_string("require_osd_release",
+ to_string(require_osd_release));
+
+ f->open_array_section("pools");
+ for (const auto &pool : pools) {
+ std::string name("<unknown>");
+ const auto &pni = pool_name.find(pool.first);
+ if (pni != pool_name.end())
+ name = pni->second;
+ f->open_object_section("pool");
+ f->dump_int("pool", pool.first);
+ f->dump_string("pool_name", name);
+ pool.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+
+ dump_osds(f);
+
+ f->open_array_section("osd_xinfo");
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ f->open_object_section("xinfo");
+ f->dump_int("osd", i);
+ osd_xinfo[i].dump(f);
+ f->close_section();
+ }
+ }
+ f->close_section();
+
+ f->open_array_section("pg_upmap");
+ for (auto& p : pg_upmap) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << p.first;
+ f->open_array_section("osds");
+ for (auto q : p.second) {
+ f->dump_int("osd", q);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("pg_upmap_items");
+ for (auto& p : pg_upmap_items) {
+ f->open_object_section("mapping");
+ f->dump_stream("pgid") << p.first;
+ f->open_array_section("mappings");
+ for (auto& q : p.second) {
+ f->open_object_section("mapping");
+ f->dump_int("from", q.first);
+ f->dump_int("to", q.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("pg_temp");
+ pg_temp->dump(f);
+ f->close_section();
+
+ f->open_array_section("primary_temp");
+ for (const auto &pg : *primary_temp) {
+ f->dump_stream("pgid") << pg.first;
+ f->dump_int("osd", pg.second);
+ }
+ f->close_section(); // primary_temp
+
+ f->open_object_section("blocklist");
+ for (const auto &addr : blocklist) {
+ stringstream ss;
+ ss << addr.first;
+ f->dump_stream(ss.str().c_str()) << addr.second;
+ }
+ f->close_section();
+ f->open_object_section("range_blocklist");
+ for (const auto &addr : range_blocklist) {
+ stringstream ss;
+ ss << addr.first;
+ f->dump_stream(ss.str().c_str()) << addr.second;
+ }
+ f->close_section();
+
+ dump_erasure_code_profiles(erasure_code_profiles, f);
+
+ f->open_array_section("removed_snaps_queue");
+ for (auto& p : removed_snaps_queue) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_removed_snaps");
+ for (auto& p : new_removed_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("new_purged_snaps");
+ for (auto& p : new_purged_snaps) {
+ f->open_object_section("pool");
+ f->dump_int("pool", p.first);
+ f->open_array_section("snaps");
+ for (auto q = p.second.begin(); q != p.second.end(); ++q) {
+ f->open_object_section("interval");
+ f->dump_unsigned("begin", q.get_start());
+ f->dump_unsigned("length", q.get_len());
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("crush_node_flags");
+ for (auto& i : crush_node_flags) {
+ string s = crush->item_exists(i.first) ? crush->get_item_name(i.first)
+ : stringify(i.first);
+ f->open_array_section(s.c_str());
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("device_class_flags");
+ for (auto& i : device_class_flags) {
+ const char* class_name = crush->get_class_name(i.first);
+ string s = class_name ? class_name : stringify(i.first);
+ f->open_array_section(s.c_str());
+ set<string> st;
+ calc_state_set(i.second, st);
+ for (auto& j : st) {
+ f->dump_string("flag", j);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("stretch_mode");
+ {
+ f->dump_bool("stretch_mode_enabled", stretch_mode_enabled);
+ f->dump_unsigned("stretch_bucket_count", stretch_bucket_count);
+ f->dump_unsigned("degraded_stretch_mode", degraded_stretch_mode);
+ f->dump_unsigned("recovering_stretch_mode", recovering_stretch_mode);
+ f->dump_int("stretch_mode_bucket", stretch_mode_bucket);
+ }
+ f->close_section();
+}
+
+void OSDMap::generate_test_instances(list<OSDMap*>& o)
+{
+ o.push_back(new OSDMap);
+
+ CephContext *cct = new CephContext(CODE_ENVIRONMENT_UTILITY);
+ o.push_back(new OSDMap);
+ uuid_d fsid;
+ o.back()->build_simple(cct, 1, fsid, 16);
+ o.back()->created = o.back()->modified = utime_t(1, 2); // fix timestamp
+ o.back()->blocklist[entity_addr_t()] = utime_t(5, 6);
+ cct->put();
+}
+
+string OSDMap::get_flag_string(unsigned f)
+{
+ string s;
+ if (f & CEPH_OSDMAP_PAUSERD)
+ s += ",pauserd";
+ if (f & CEPH_OSDMAP_PAUSEWR)
+ s += ",pausewr";
+ if (f & CEPH_OSDMAP_PAUSEREC)
+ s += ",pauserec";
+ if (f & CEPH_OSDMAP_NOUP)
+ s += ",noup";
+ if (f & CEPH_OSDMAP_NODOWN)
+ s += ",nodown";
+ if (f & CEPH_OSDMAP_NOOUT)
+ s += ",noout";
+ if (f & CEPH_OSDMAP_NOIN)
+ s += ",noin";
+ if (f & CEPH_OSDMAP_NOBACKFILL)
+ s += ",nobackfill";
+ if (f & CEPH_OSDMAP_NOREBALANCE)
+ s += ",norebalance";
+ if (f & CEPH_OSDMAP_NORECOVER)
+ s += ",norecover";
+ if (f & CEPH_OSDMAP_NOSCRUB)
+ s += ",noscrub";
+ if (f & CEPH_OSDMAP_NODEEP_SCRUB)
+ s += ",nodeep-scrub";
+ if (f & CEPH_OSDMAP_NOTIERAGENT)
+ s += ",notieragent";
+ if (f & CEPH_OSDMAP_NOSNAPTRIM)
+ s += ",nosnaptrim";
+ if (f & CEPH_OSDMAP_SORTBITWISE)
+ s += ",sortbitwise";
+ if (f & CEPH_OSDMAP_REQUIRE_JEWEL)
+ s += ",require_jewel_osds";
+ if (f & CEPH_OSDMAP_REQUIRE_KRAKEN)
+ s += ",require_kraken_osds";
+ if (f & CEPH_OSDMAP_REQUIRE_LUMINOUS)
+ s += ",require_luminous_osds";
+ if (f & CEPH_OSDMAP_RECOVERY_DELETES)
+ s += ",recovery_deletes";
+ if (f & CEPH_OSDMAP_PURGED_SNAPDIRS)
+ s += ",purged_snapdirs";
+ if (f & CEPH_OSDMAP_PGLOG_HARDLIMIT)
+ s += ",pglog_hardlimit";
+ if (s.length())
+ s.erase(0, 1);
+ return s;
+}
+
+string OSDMap::get_flag_string() const
+{
+ return get_flag_string(flags);
+}
+
+void OSDMap::print_pools(ostream& out) const
+{
+ for (const auto &pool : pools) {
+ std::string name("<unknown>");
+ const auto &pni = pool_name.find(pool.first);
+ if (pni != pool_name.end())
+ name = pni->second;
+ out << "pool " << pool.first
+ << " '" << name
+ << "' " << pool.second << "\n";
+
+ for (const auto &snap : pool.second.snaps)
+ out << "\tsnap " << snap.second.snapid << " '" << snap.second.name << "' " << snap.second.stamp << "\n";
+
+ if (!pool.second.removed_snaps.empty())
+ out << "\tremoved_snaps " << pool.second.removed_snaps << "\n";
+ auto p = removed_snaps_queue.find(pool.first);
+ if (p != removed_snaps_queue.end()) {
+ out << "\tremoved_snaps_queue " << p->second << "\n";
+ }
+ }
+ out << std::endl;
+}
+
+void OSDMap::print_osds(ostream& out) const
+{
+ for (int i=0; i<get_max_osd(); i++) {
+ if (exists(i)) {
+ print_osd(i, out);
+ }
+ }
+}
+
+void OSDMap::print_osd(int id, ostream& out) const
+{
+ if (!exists(id)) {
+ return;
+ }
+
+ out << "osd." << id;
+ out << (is_up(id) ? " up ":" down");
+ out << (is_in(id) ? " in ":" out");
+ out << " weight " << get_weightf(id);
+ if (get_primary_affinity(id) != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ out << " primary_affinity " << get_primary_affinityf(id);
+ }
+ const osd_info_t& info(get_info(id));
+ out << " " << info;
+ out << " " << get_addrs(id) << " " << get_cluster_addrs(id);
+ set<string> st;
+ get_state(id, st);
+ out << " " << st;
+ if (!get_uuid(id).is_zero()) {
+ out << " " << get_uuid(id);
+ }
+ out << "\n";
+}
+
+void OSDMap::print(ostream& out) const
+{
+ out << "epoch " << get_epoch() << "\n"
+ << "fsid " << get_fsid() << "\n"
+ << "created " << get_created() << "\n"
+ << "modified " << get_modified() << "\n";
+
+ out << "flags " << get_flag_string() << "\n";
+ out << "crush_version " << get_crush_version() << "\n";
+ out << "full_ratio " << full_ratio << "\n";
+ out << "backfillfull_ratio " << backfillfull_ratio << "\n";
+ out << "nearfull_ratio " << nearfull_ratio << "\n";
+ if (require_min_compat_client != ceph_release_t::unknown) {
+ out << "require_min_compat_client "
+ << require_min_compat_client << "\n";
+ }
+ out << "min_compat_client " << get_min_compat_client()
+ << "\n";
+ if (require_osd_release > ceph_release_t::unknown) {
+ out << "require_osd_release " << require_osd_release
+ << "\n";
+ }
+ out << "stretch_mode_enabled " << (stretch_mode_enabled ? "true" : "false") << "\n";
+ if (stretch_mode_enabled) {
+ out << "stretch_bucket_count " << stretch_bucket_count << "\n";
+ out << "degraded_stretch_mode " << degraded_stretch_mode << "\n";
+ out << "recovering_stretch_mode " << recovering_stretch_mode << "\n";
+ out << "stretch_mode_bucket " << stretch_mode_bucket << "\n";
+ }
+ if (get_cluster_snapshot().length())
+ out << "cluster_snapshot " << get_cluster_snapshot() << "\n";
+ out << "\n";
+
+ print_pools(out);
+
+ out << "max_osd " << get_max_osd() << "\n";
+ print_osds(out);
+ out << std::endl;
+
+ for (auto& p : pg_upmap) {
+ out << "pg_upmap " << p.first << " " << p.second << "\n";
+ }
+ for (auto& p : pg_upmap_items) {
+ out << "pg_upmap_items " << p.first << " " << p.second << "\n";
+ }
+
+ for (const auto& pg : *pg_temp)
+ out << "pg_temp " << pg.first << " " << pg.second << "\n";
+
+ for (const auto& pg : *primary_temp)
+ out << "primary_temp " << pg.first << " " << pg.second << "\n";
+
+ for (const auto &addr : blocklist)
+ out << "blocklist " << addr.first << " expires " << addr.second << "\n";
+ for (const auto &addr : range_blocklist)
+ out << "range blocklist " << addr.first << " expires " << addr.second << "\n";
+}
+
+class OSDTreePlainDumper : public CrushTreeDumper::Dumper<TextTable> {
+public:
+ typedef CrushTreeDumper::Dumper<TextTable> Parent;
+
+ OSDTreePlainDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ unsigned f)
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
+
+ bool should_dump_leaf(int i) const override {
+ if (!filter) {
+ return true; // normal case
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
+ }
+ return false;
+ }
+
+ bool should_dump_empty_bucket() const override {
+ return !filter;
+ }
+
+ void init_table(TextTable *tbl) {
+ tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+ tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("PRI-AFF", TextTable::LEFT, TextTable::RIGHT);
+ }
+ void dump(TextTable *tbl, string& bucket) {
+ init_table(tbl);
+
+ if (!bucket.empty()) {
+ set_root(bucket);
+ Parent::dump(tbl);
+ } else {
+ Parent::dump(tbl);
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i)) {
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), tbl);
+ }
+ }
+ }
+ }
+
+protected:
+ void dump_item(const CrushTreeDumper::Item &qi, TextTable *tbl) override {
+ const char *c = crush->get_item_class(qi.id);
+ if (!c)
+ c = "";
+ *tbl << qi.id
+ << c
+ << weightf_t(qi.weight);
+
+ ostringstream name;
+ for (int k = 0; k < qi.depth; k++)
+ name << " ";
+ if (qi.is_bucket()) {
+ name << crush->get_type_name(crush->get_bucket_type(qi.id)) << " "
+ << crush->get_item_name(qi.id);
+ } else {
+ name << "osd." << qi.id;
+ }
+ *tbl << name.str();
+
+ if (!qi.is_bucket()) {
+ if (!osdmap->exists(qi.id)) {
+ *tbl << "DNE"
+ << 0;
+ } else {
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
+ *tbl << s
+ << weightf_t(osdmap->get_weightf(qi.id))
+ << weightf_t(osdmap->get_primary_affinityf(qi.id));
+ }
+ }
+ *tbl << TextTable::endrow;
+ }
+
+private:
+ const OSDMap *osdmap;
+ const unsigned filter;
+};
+
+class OSDTreeFormattingDumper : public CrushTreeDumper::FormattingDumper {
+public:
+ typedef CrushTreeDumper::FormattingDumper Parent;
+
+ OSDTreeFormattingDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ unsigned f)
+ : Parent(crush, osdmap_->get_pool_names()), osdmap(osdmap_), filter(f) { }
+
+ bool should_dump_leaf(int i) const override {
+ if (!filter) {
+ return true; // normal case
+ }
+ if (((filter & OSDMap::DUMP_UP) && osdmap->is_up(i)) ||
+ ((filter & OSDMap::DUMP_DOWN) && osdmap->is_down(i)) ||
+ ((filter & OSDMap::DUMP_IN) && osdmap->is_in(i)) ||
+ ((filter & OSDMap::DUMP_OUT) && osdmap->is_out(i)) ||
+ ((filter & OSDMap::DUMP_DESTROYED) && osdmap->is_destroyed(i))) {
+ return true;
+ }
+ return false;
+ }
+
+ bool should_dump_empty_bucket() const override {
+ return !filter;
+ }
+
+ void dump(Formatter *f, string& bucket) {
+ if (!bucket.empty()) {
+ set_root(bucket);
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+ } else {
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+ f->open_array_section("stray");
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !is_touched(i) && should_dump_leaf(i))
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
+ }
+ f->close_section();
+ }
+ }
+
+protected:
+ void dump_item_fields(const CrushTreeDumper::Item &qi, Formatter *f) override {
+ Parent::dump_item_fields(qi, f);
+ if (!qi.is_bucket())
+ {
+ string s;
+ if (osdmap->is_up(qi.id)) {
+ s = "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ s = "destroyed";
+ } else {
+ s = "down";
+ }
+ f->dump_unsigned("exists", (int)osdmap->exists(qi.id));
+ f->dump_string("status", s);
+ f->dump_float("reweight", osdmap->get_weightf(qi.id));
+ f->dump_float("primary_affinity", osdmap->get_primary_affinityf(qi.id));
+ }
+ }
+
+private:
+ const OSDMap *osdmap;
+ const unsigned filter;
+};
+
+void OSDMap::print_tree(Formatter *f, ostream *out, unsigned filter, string bucket) const
+{
+ if (f) {
+ OSDTreeFormattingDumper(crush.get(), this, filter).dump(f, bucket);
+ } else {
+ ceph_assert(out);
+ TextTable tbl;
+ OSDTreePlainDumper(crush.get(), this, filter).dump(&tbl, bucket);
+ *out << tbl;
+ }
+}
+
+void OSDMap::print_summary(Formatter *f, ostream& out,
+ const string& prefix, bool extra) const
+{
+ if (f) {
+ f->dump_int("epoch", get_epoch());
+ f->dump_int("num_osds", get_num_osds());
+ f->dump_int("num_up_osds", get_num_up_osds());
+ f->dump_int("osd_up_since", last_up_change.to_msec() / 1000);
+ f->dump_int("num_in_osds", get_num_in_osds());
+ f->dump_int("osd_in_since", last_in_change.to_msec() / 1000);
+ f->dump_unsigned("num_remapped_pgs", get_num_pg_temp());
+ } else {
+ utime_t now = ceph_clock_now();
+ out << get_num_osds() << " osds: "
+ << get_num_up_osds() << " up";
+ if (last_up_change != utime_t()) {
+ out << " (since " << utimespan_str(now - last_up_change) << ")";
+ }
+ out << ", " << get_num_in_osds() << " in";
+ if (last_in_change != utime_t()) {
+ out << " (since " << utimespan_str(now - last_in_change) << ")";
+ }
+ if (extra)
+ out << "; epoch: e" << get_epoch();
+ if (get_num_pg_temp())
+ out << "; " << get_num_pg_temp() << " remapped pgs";
+ out << "\n";
+ uint64_t important_flags = flags & ~CEPH_OSDMAP_SEMIHIDDEN_FLAGS;
+ if (important_flags)
+ out << prefix << "flags " << get_flag_string(important_flags) << "\n";
+ }
+}
+
+void OSDMap::print_oneline_summary(ostream& out) const
+{
+ out << "e" << get_epoch() << ": "
+ << get_num_osds() << " total, "
+ << get_num_up_osds() << " up, "
+ << get_num_in_osds() << " in";
+}
+
+bool OSDMap::crush_rule_in_use(int rule_id) const
+{
+ for (const auto &pool : pools) {
+ if (pool.second.crush_rule == rule_id)
+ return true;
+ }
+ return false;
+}
+
+int OSDMap::validate_crush_rules(CrushWrapper *newcrush,
+ ostream *ss) const
+{
+ for (auto& i : pools) {
+ auto& pool = i.second;
+ int ruleno = pool.get_crush_rule();
+ if (!newcrush->rule_exists(ruleno)) {
+ *ss << "pool " << i.first << " references crush_rule " << ruleno
+ << " but it is not present";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_ruleset(ruleno) != ruleno) {
+ *ss << "rule " << ruleno << " mask ruleset does not match rule id";
+ return -EINVAL;
+ }
+ if (newcrush->get_rule_mask_type(ruleno) != (int)pool.get_type()) {
+ *ss << "pool " << i.first << " type does not match rule " << ruleno;
+ return -EINVAL;
+ }
+ int poolsize = pool.get_size();
+ if (poolsize < newcrush->get_rule_mask_min_size(ruleno) ||
+ poolsize > newcrush->get_rule_mask_max_size(ruleno)) {
+ *ss << "pool " << i.first << " size " << poolsize << " does not"
+ << " fall within rule " << ruleno
+ << " min_size " << newcrush->get_rule_mask_min_size(ruleno)
+ << " and max_size " << newcrush->get_rule_mask_max_size(ruleno);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+int OSDMap::build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int nosd, int pg_bits, int pgp_bits,
+ bool default_pool)
+{
+ ldout(cct, 10) << "build_simple on " << nosd
+ << " osds" << dendl;
+ epoch = e;
+ set_fsid(fsid);
+ created = modified = ceph_clock_now();
+
+ if (nosd >= 0) {
+ set_max_osd(nosd);
+ } else {
+ // count osds
+ int maxosd = 0;
+ const auto& conf = cct->_conf;
+ vector<string> sections;
+ conf.get_all_sections(sections);
+
+ for (auto &section : sections) {
+ if (section.find("osd.") != 0)
+ continue;
+
+ const char *begin = section.c_str() + 4;
+ char *end = (char*)begin;
+ int o = strtol(begin, &end, 10);
+ if (*end != '\0')
+ continue;
+
+ if (o > cct->_conf->mon_max_osd) {
+ lderr(cct) << "[osd." << o << "] in config has id > mon_max_osd " << cct->_conf->mon_max_osd << dendl;
+ return -ERANGE;
+ }
+
+ if (o > maxosd)
+ maxosd = o;
+ }
+
+ set_max_osd(maxosd + 1);
+ }
+
+
+ stringstream ss;
+ int r;
+ if (nosd >= 0)
+ r = build_simple_crush_map(cct, *crush, nosd, &ss);
+ else
+ r = build_simple_crush_map_from_conf(cct, *crush, &ss);
+ ceph_assert(r == 0);
+
+ int poolbase = get_max_osd() ? get_max_osd() : 1;
+
+ const int default_replicated_rule = crush->get_osd_pool_default_crush_replicated_ruleset(cct);
+ ceph_assert(default_replicated_rule >= 0);
+
+ if (default_pool) {
+ // pgp_num <= pg_num
+ if (pgp_bits > pg_bits)
+ pgp_bits = pg_bits;
+
+ vector<string> pool_names;
+ pool_names.push_back("rbd");
+ for (auto &plname : pool_names) {
+ int64_t pool = ++pool_max;
+ pools[pool].type = pg_pool_t::TYPE_REPLICATED;
+ pools[pool].flags = cct->_conf->osd_pool_default_flags;
+ if (cct->_conf->osd_pool_default_flag_hashpspool)
+ pools[pool].set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+ if (cct->_conf->osd_pool_default_flag_nodelete)
+ pools[pool].set_flag(pg_pool_t::FLAG_NODELETE);
+ if (cct->_conf->osd_pool_default_flag_nopgchange)
+ pools[pool].set_flag(pg_pool_t::FLAG_NOPGCHANGE);
+ if (cct->_conf->osd_pool_default_flag_nosizechange)
+ pools[pool].set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
+ if (cct->_conf->osd_pool_default_flag_bulk)
+ pools[pool].set_flag(pg_pool_t::FLAG_BULK);
+ pools[pool].size = cct->_conf.get_val<uint64_t>("osd_pool_default_size");
+ pools[pool].min_size = cct->_conf.get_osd_pool_default_min_size(
+ pools[pool].size);
+ pools[pool].crush_rule = default_replicated_rule;
+ pools[pool].object_hash = CEPH_STR_HASH_RJENKINS;
+ pools[pool].set_pg_num(poolbase << pg_bits);
+ pools[pool].set_pgp_num(poolbase << pgp_bits);
+ pools[pool].set_pg_num_target(poolbase << pg_bits);
+ pools[pool].set_pgp_num_target(poolbase << pgp_bits);
+ pools[pool].last_change = epoch;
+ pools[pool].application_metadata.insert(
+ {pg_pool_t::APPLICATION_NAME_RBD, {}});
+ if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
+ cct->_conf.get_val<string>("osd_pool_default_pg_autoscale_mode"));
+ m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ pools[pool].pg_autoscale_mode = m;
+ } else {
+ pools[pool].pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
+ }
+ pool_name[pool] = plname;
+ name_pool[plname] = pool;
+ }
+ }
+
+ map<string,string> profile_map;
+ r = get_erasure_code_profile_default(cct, profile_map, &ss);
+ if (r < 0) {
+ lderr(cct) << ss.str() << dendl;
+ return r;
+ }
+ set_erasure_code_profile("default", profile_map);
+ return 0;
+}
+
+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
+ map<string,string> &profile_map,
+ ostream *ss)
+{
+ int r = get_json_str_map(cct->_conf.get_val<string>("osd_pool_default_erasure_code_profile"),
+ *ss,
+ &profile_map);
+ return r;
+}
+
+int OSDMap::_build_crush_types(CrushWrapper& crush)
+{
+ crush.set_type_name(0, "osd");
+ crush.set_type_name(1, "host");
+ crush.set_type_name(2, "chassis");
+ crush.set_type_name(3, "rack");
+ crush.set_type_name(4, "row");
+ crush.set_type_name(5, "pdu");
+ crush.set_type_name(6, "pod");
+ crush.set_type_name(7, "room");
+ crush.set_type_name(8, "datacenter");
+ crush.set_type_name(9, "zone");
+ crush.set_type_name(10, "region");
+ crush.set_type_name(11, "root");
+ return 11;
+}
+
+int OSDMap::build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
+ int nosd, ostream *ss)
+{
+ crush.create();
+
+ // root
+ int root_type = _build_crush_types(crush);
+ int rootid;
+ int r = crush.add_bucket(0, 0, CRUSH_HASH_DEFAULT,
+ root_type, 0, NULL, NULL, &rootid);
+ ceph_assert(r == 0);
+ crush.set_item_name(rootid, "default");
+
+ map<string,string> loc{
+ {"host", "localhost"},
+ {"rack", "localrack"},
+ {"root", "default"}
+ };
+ for (int o=0; o<nosd; o++) {
+ ldout(cct, 10) << " adding osd." << o << " at " << loc << dendl;
+ char name[32];
+ snprintf(name, sizeof(name), "osd.%d", o);
+ crush.insert_item(cct, o, 1.0, name, loc);
+ }
+
+ build_simple_crush_rules(cct, crush, "default", ss);
+
+ crush.finalize();
+
+ return 0;
+}
+
+int OSDMap::build_simple_crush_map_from_conf(CephContext *cct,
+ CrushWrapper& crush,
+ ostream *ss)
+{
+ const auto& conf = cct->_conf;
+
+ crush.create();
+
+ // root
+ int root_type = _build_crush_types(crush);
+ int rootid;
+ int r = crush.add_bucket(0, 0,
+ CRUSH_HASH_DEFAULT,
+ root_type, 0, NULL, NULL, &rootid);
+ ceph_assert(r == 0);
+ crush.set_item_name(rootid, "default");
+
+ // add osds
+ vector<string> sections;
+ conf.get_all_sections(sections);
+
+ for (auto &section : sections) {
+ if (section.find("osd.") != 0)
+ continue;
+
+ const char *begin = section.c_str() + 4;
+ char *end = (char*)begin;
+ int o = strtol(begin, &end, 10);
+ if (*end != '\0')
+ continue;
+
+ string host, rack, row, room, dc, pool;
+ vector<string> sectiontmp;
+ sectiontmp.push_back("osd");
+ sectiontmp.push_back(section);
+ conf.get_val_from_conf_file(sectiontmp, "host", host, false);
+ conf.get_val_from_conf_file(sectiontmp, "rack", rack, false);
+ conf.get_val_from_conf_file(sectiontmp, "row", row, false);
+ conf.get_val_from_conf_file(sectiontmp, "room", room, false);
+ conf.get_val_from_conf_file(sectiontmp, "datacenter", dc, false);
+ conf.get_val_from_conf_file(sectiontmp, "root", pool, false);
+
+ if (host.length() == 0)
+ host = "unknownhost";
+ if (rack.length() == 0)
+ rack = "unknownrack";
+
+ map<string,string> loc;
+ loc["host"] = host;
+ loc["rack"] = rack;
+ if (row.size())
+ loc["row"] = row;
+ if (room.size())
+ loc["room"] = room;
+ if (dc.size())
+ loc["datacenter"] = dc;
+ loc["root"] = "default";
+
+ ldout(cct, 5) << " adding osd." << o << " at " << loc << dendl;
+ crush.insert_item(cct, o, 1.0, section, loc);
+ }
+
+ build_simple_crush_rules(cct, crush, "default", ss);
+
+ crush.finalize();
+
+ return 0;
+}
+
+
+int OSDMap::build_simple_crush_rules(
+ CephContext *cct,
+ CrushWrapper& crush,
+ const string& root,
+ ostream *ss)
+{
+ int crush_rule = crush.get_osd_pool_default_crush_replicated_ruleset(cct);
+ string failure_domain =
+ crush.get_type_name(cct->_conf->osd_crush_chooseleaf_type);
+
+ int r;
+ r = crush.add_simple_rule_at(
+ "replicated_rule", root, failure_domain, "",
+ "firstn", pg_pool_t::TYPE_REPLICATED,
+ crush_rule, ss);
+ if (r < 0)
+ return r;
+ // do not add an erasure rule by default or else we will implicitly
+ // require the crush_v2 feature of clients
+ return 0;
+}
+
+int OSDMap::summarize_mapping_stats(
+ OSDMap *newmap,
+ const set<int64_t> *pools,
+ std::string *out,
+ Formatter *f) const
+{
+ set<int64_t> ls;
+ if (pools) {
+ ls = *pools;
+ } else {
+ for (auto &p : get_pools())
+ ls.insert(p.first);
+ }
+
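+ // tally, per osd, how many PG replicas map to it under this map (and
+ // under newmap, if given), and how many replicas move between the two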
+ unsigned total_pg = 0;
+ unsigned moved_pg = 0;
+ vector<unsigned> base_by_osd(get_max_osd(), 0);
+ vector<unsigned> new_by_osd(get_max_osd(), 0);
+ for (int64_t pool_id : ls) {
+ const pg_pool_t *pi = get_pg_pool(pool_id);
+ vector<int> up, up2;
+ int up_primary;
+ for (unsigned ps = 0; ps < pi->get_pg_num(); ++ps) {
+ pg_t pgid(ps, pool_id);
+ total_pg += pi->get_size();
+ pg_to_up_acting_osds(pgid, &up, &up_primary, nullptr, nullptr);
+ for (int osd : up) {
+ if (osd >= 0 && osd < get_max_osd())
+ ++base_by_osd[osd];
+ }
+ if (newmap) {
+ newmap->pg_to_up_acting_osds(pgid, &up2, &up_primary, nullptr, nullptr);
+ for (int osd : up2) {
+ if (osd >= 0 && osd < get_max_osd())
+ ++new_by_osd[osd];
+ }
+ if (pi->type == pg_pool_t::TYPE_ERASURE) {
+ for (unsigned i=0; i<up.size(); ++i) {
+ if (up[i] != up2[i]) {
+ ++moved_pg;
+ }
+ }
+ } else if (pi->type == pg_pool_t::TYPE_REPLICATED) {
+ for (int osd : up) {
+ if (std::find(up2.begin(), up2.end(), osd) == up2.end()) {
+ ++moved_pg;
+ }
+ }
+ } else {
+ ceph_abort_msg("unhandled pool type");
+ }
+ }
+ }
+ }
+
+ unsigned num_up_in = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd))
+ ++num_up_in;
+ }
+ if (!num_up_in) {
+ return -EINVAL;
+ }
+
+ float avg_pg = (float)total_pg / (float)num_up_in;
+ float base_stddev = 0, new_stddev = 0;
+ int min = -1, max = -1;
+ unsigned min_base_pg = 0, max_base_pg = 0;
+ unsigned min_new_pg = 0, max_new_pg = 0;
+ for (int osd = 0; osd < get_max_osd(); ++osd) {
+ if (is_up(osd) && is_in(osd)) {
+ float base_diff = (float)base_by_osd[osd] - avg_pg;
+ base_stddev += base_diff * base_diff;
+ float new_diff = (float)new_by_osd[osd] - avg_pg;
+ new_stddev += new_diff * new_diff;
+ if (min < 0 || base_by_osd[osd] < min_base_pg) {
+ min = osd;
+ min_base_pg = base_by_osd[osd];
+ min_new_pg = new_by_osd[osd];
+ }
+ if (max < 0 || base_by_osd[osd] > max_base_pg) {
+ max = osd;
+ max_base_pg = base_by_osd[osd];
+ max_new_pg = new_by_osd[osd];
+ }
+ }
+ }
+ base_stddev = sqrt(base_stddev / num_up_in);
+ new_stddev = sqrt(new_stddev / num_up_in);
+
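+ // expected baseline spread if each PG replica landed on a uniformly
+ // random up+in osd: sqrt(mean * (1 - 1/num_up_in))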
+ float edev = sqrt(avg_pg * (1.0 - (1.0 / (double)num_up_in)));
+
+ ostringstream ss;
+ if (f)
+ f->open_object_section("utilization");
+ if (newmap) {
+ if (f) {
+ f->dump_unsigned("moved_pgs", moved_pg);
+ f->dump_unsigned("total_pgs", total_pg);
+ } else {
+ float percent = 0;
+ if (total_pg)
+ percent = (float)moved_pg * 100.0 / (float)total_pg;
+ ss << "moved " << moved_pg << " / " << total_pg
+ << " (" << percent << "%)\n";
+ }
+ }
+ if (f) {
+ f->dump_float("avg_pgs", avg_pg);
+ f->dump_float("std_dev", base_stddev);
+ f->dump_float("expected_baseline_std_dev", edev);
+ if (newmap)
+ f->dump_float("new_std_dev", new_stddev);
+ } else {
+ ss << "avg " << avg_pg << "\n";
+ ss << "stddev " << base_stddev;
+ if (newmap)
+ ss << " -> " << new_stddev;
+ ss << " (expected baseline " << edev << ")\n";
+ }
+ if (min >= 0) {
+ if (f) {
+ f->dump_unsigned("min_osd", min);
+ f->dump_unsigned("min_osd_pgs", min_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_min_osd_pgs", min_new_pg);
+ } else {
+ ss << "min osd." << min << " with " << min_base_pg;
+ if (newmap)
+ ss << " -> " << min_new_pg;
+ ss << " pgs (" << (float)min_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)min_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (max >= 0) {
+ if (f) {
+ f->dump_unsigned("max_osd", max);
+ f->dump_unsigned("max_osd_pgs", max_base_pg);
+ if (newmap)
+ f->dump_unsigned("new_max_osd_pgs", max_new_pg);
+ } else {
+ ss << "max osd." << max << " with " << max_base_pg;
+ if (newmap)
+ ss << " -> " << max_new_pg;
+ ss << " pgs (" << (float)max_base_pg / avg_pg;
+ if (newmap)
+ ss << " -> " << (float)max_new_pg / avg_pg;
+ ss << " * mean)\n";
+ }
+ }
+ if (f)
+ f->close_section();
+ if (out)
+ *out = ss.str();
+ return 0;
+}
+
+bool OSDMap::try_pg_upmap(
+ CephContext *cct,
+ pg_t pg, ///< pg to potentially remap
+ const set<int>& overfull, ///< osds we'd want to evacuate
+ const vector<int>& underfull, ///< osds to move to, in order of preference
+ const vector<int>& more_underfull, ///< more osds only slightly underfull
+ vector<int> *orig,
+ vector<int> *out) ///< resulting alternative mapping
+{
+ const pg_pool_t *pool = get_pg_pool(pg.pool());
+ if (!pool)
+ return false;
+ int rule = crush->find_rule(pool->get_crush_rule(), pool->get_type(),
+ pool->get_size());
+ if (rule < 0)
+ return false;
+
+ // make sure there is something there to remap
+ bool any = false;
+ for (auto osd : *orig) {
+ if (overfull.count(osd)) {
+ any = true;
+ break;
+ }
+ }
+ if (!any) {
+ return false;
+ }
+
+ int r = crush->try_remap_rule(
+ cct,
+ rule,
+ pool->get_size(),
+ overfull, underfull,
+ more_underfull,
+ *orig,
+ out);
+ if (r < 0)
+ return false;
+ if (*out == *orig)
+ return false;
+ return true;
+}
+
+int OSDMap::calc_pg_upmaps(
+ CephContext *cct,
+ uint32_t max_deviation,
+ int max,
+ const set<int64_t>& only_pools,
+ OSDMap::Incremental *pending_inc)
+{
+ ldout(cct, 10) << __func__ << " pools " << only_pools << dendl;
+ OSDMap tmp;
+ // Can't be less than 1 pg
+ if (max_deviation < 1)
+ max_deviation = 1;
+ tmp.deepish_copy_from(*this);
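+ // work on a private copy; accepted upmap changes are applied to it (as
+ // well as to pending_inc) so that later iterations take them into account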
+ int num_changed = 0;
+ map<int,set<pg_t>> pgs_by_osd;
+ int total_pgs = 0;
+ float osd_weight_total = 0;
+ map<int,float> osd_weight;
+ for (auto& i : pools) {
+ if (!only_pools.empty() && !only_pools.count(i.first))
+ continue;
+ for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) {
+ pg_t pg(ps, i.first);
+ vector<int> up;
+ tmp.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr);
+ ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl;
+ for (auto osd : up) {
+ if (osd != CRUSH_ITEM_NONE)
+ pgs_by_osd[osd].insert(pg);
+ }
+ }
+ total_pgs += i.second.get_size() * i.second.get_pg_num();
+
+ map<int,float> pmap;
+ int ruleno = tmp.crush->find_rule(i.second.get_crush_rule(),
+ i.second.get_type(),
+ i.second.get_size());
+ tmp.crush->get_rule_weight_osd_map(ruleno, &pmap);
+ ldout(cct,20) << __func__ << " pool " << i.first
+ << " ruleno " << ruleno
+ << " weight-map " << pmap
+ << dendl;
+ for (auto p : pmap) {
+ auto adjusted_weight = tmp.get_weightf(p.first) * p.second;
+ if (adjusted_weight == 0) {
+ continue;
+ }
+ osd_weight[p.first] += adjusted_weight;
+ osd_weight_total += adjusted_weight;
+ }
+ }
+ for (auto& i : osd_weight) {
+ int pgs = 0;
+ auto p = pgs_by_osd.find(i.first);
+ if (p != pgs_by_osd.end())
+ pgs = p->second.size();
+ else
+ pgs_by_osd.emplace(i.first, set<pg_t>());
+ ldout(cct, 20) << " osd." << i.first << " weight " << i.second
+ << " pgs " << pgs << dendl;
+ }
+ if (osd_weight_total == 0) {
+ lderr(cct) << __func__ << " abort due to osd_weight_total == 0" << dendl;
+ return 0;
+ }
+ float pgs_per_weight = total_pgs / osd_weight_total;
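+ // an osd's target PG count is its weight (crush rule weight x reweight)
+ // multiplied by pgs_per_weight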
+ ldout(cct, 10) << " osd_weight_total " << osd_weight_total << dendl;
+ ldout(cct, 10) << " pgs_per_weight " << pgs_per_weight << dendl;
+
+ if (max <= 0) {
+ lderr(cct) << __func__ << " abort due to max <= 0" << dendl;
+ return 0;
+ }
+ float stddev = 0;
+ map<int,float> osd_deviation; // osd, deviation(pgs)
+ multimap<float,int> deviation_osd; // deviation(pgs), osd
+ float cur_max_deviation = 0;
+ for (auto& i : pgs_by_osd) {
+ // make sure osd is still there (belongs to this crush-tree)
+ ceph_assert(osd_weight.count(i.first));
+ float target = osd_weight[i.first] * pgs_per_weight;
+ float deviation = (float)i.second.size() - target;
+ ldout(cct, 20) << " osd." << i.first
+ << "\tpgs " << i.second.size()
+ << "\ttarget " << target
+ << "\tdeviation " << deviation
+ << dendl;
+ osd_deviation[i.first] = deviation;
+ deviation_osd.insert(make_pair(deviation, i.first));
+ stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
+ }
+ ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
+ ldout(cct, 10) << __func__ << " distribution is almost perfect"
+ << dendl;
+ return 0;
+ }
+ bool skip_overfull = false;
+ auto aggressive =
+ cct->_conf.get_val<bool>("osd_calc_pg_upmaps_aggressively");
+ auto local_fallback_retries =
+ cct->_conf.get_val<uint64_t>("osd_calc_pg_upmaps_local_fallback_retries");
+ while (max--) {
+ ldout(cct, 30) << "Top of loop #" << max+1 << dendl;
+ // build overfull and underfull
+ set<int> overfull;
+ set<int> more_overfull;
+ bool using_more_overfull = false;
+ vector<int> underfull;
+ vector<int> more_underfull;
+ for (auto i = deviation_osd.rbegin(); i != deviation_osd.rend(); i++) {
+ ldout(cct, 30) << " check " << i->first << " <= " << max_deviation << dendl;
+ if (i->first <= 0)
+ break;
+ if (i->first > max_deviation) {
+ ldout(cct, 30) << " add overfull osd." << i->second << dendl;
+ overfull.insert(i->second);
+ } else {
+ more_overfull.insert(i->second);
+ }
+ }
+
+ for (auto i = deviation_osd.begin(); i != deviation_osd.end(); i++) {
+ ldout(cct, 30) << " check " << i->first << " >= " << -(int)max_deviation << dendl;
+ if (i->first >= 0)
+ break;
+ if (i->first < -(int)max_deviation) {
+ ldout(cct, 30) << " add underfull osd." << i->second << dendl;
+ underfull.push_back(i->second);
+ } else {
+ more_underfull.push_back(i->second);
+ }
+ }
+ if (underfull.empty() && overfull.empty()) {
+ ldout(cct, 20) << __func__ << " failed to build overfull and underfull" << dendl;
+ break;
+ }
+ if (overfull.empty() && !underfull.empty()) {
+ ldout(cct, 20) << __func__ << " Using more_overfull since we still have underfull" << dendl;
+ overfull = more_overfull;
+ using_more_overfull = true;
+ }
+
+ ldout(cct, 10) << " overfull " << overfull
+ << " underfull " << underfull
+ << dendl;
+ set<pg_t> to_skip;
+ uint64_t local_fallback_retried = 0;
+
+ retry:
+
+ set<pg_t> to_unmap;
+ map<pg_t, mempool::osdmap::vector<pair<int32_t,int32_t>>> to_upmap;
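+ // to_unmap: existing pg_upmap_items entries to cancel entirely
+ // to_upmap: pg_upmap_items entries to add or shrink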
+ auto temp_pgs_by_osd = pgs_by_osd;
+ // always start with fullest, break if we find any changes to make
+ for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
+ if (skip_overfull && !underfull.empty()) {
+ ldout(cct, 10) << " skipping overfull " << dendl;
+ break; // fall through to check underfull
+ }
+ int osd = p->second;
+ float deviation = p->first;
+ if (deviation < 0) {
+ ldout(cct, 10) << " hitting underfull osds now"
+ << " when trying to remap overfull osds"
+ << dendl;
+ break;
+ }
+ float target = osd_weight[osd] * pgs_per_weight;
+ ldout(cct, 10) << " Overfull search osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << dendl;
+ ceph_assert(target > 0);
+ if (!using_more_overfull && deviation <= max_deviation) {
+ ldout(cct, 10) << " osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << " < max deviation " << max_deviation
+ << dendl;
+ break;
+ }
+
+ vector<pg_t> pgs;
+ pgs.reserve(pgs_by_osd[osd].size());
+ for (auto& pg : pgs_by_osd[osd]) {
+ if (to_skip.count(pg))
+ continue;
+ pgs.push_back(pg);
+ }
+ if (aggressive) {
+ // shuffle PG list so they all get equal (in)attention
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(pgs.begin(), pgs.end(), rng);
+ }
+ // look for remaps we can un-remap
+ for (auto pg : pgs) {
+ auto p = tmp.pg_upmap_items.find(pg);
+ if (p == tmp.pg_upmap_items.end())
+ continue;
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ for (auto q : p->second) {
+ if (q.second == osd) {
+ ldout(cct, 10) << " will try dropping existing"
+ << " remapping pair "
+ << q.first << " -> " << q.second
+ << " which remapped " << pg
+ << " into overfull osd." << osd
+ << dendl;
+ temp_pgs_by_osd[q.second].erase(pg);
+ temp_pgs_by_osd[q.first].insert(pg);
+ } else {
+ new_upmap_items.push_back(q);
+ }
+ }
+ if (new_upmap_items.empty()) {
+ // drop whole item
+ ldout(cct, 10) << " existing pg_upmap_items " << p->second
+ << " remapped " << pg << " into overfull osd." << osd
+ << ", will try cancelling it entirely"
+ << dendl;
+ to_unmap.insert(pg);
+ goto test_change;
+ } else if (new_upmap_items.size() != p->second.size()) {
+ // drop single remapping pair, updating
+ ceph_assert(new_upmap_items.size() < p->second.size());
+ ldout(cct, 10) << " existing pg_upmap_items " << p->second
+ << " remapped " << pg << " into overfull osd." << osd
+ << ", new_pg_upmap_items now " << new_upmap_items
+ << dendl;
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+
+ // try upmap
+ for (auto pg : pgs) {
+ auto temp_it = tmp.pg_upmap.find(pg);
+ if (temp_it != tmp.pg_upmap.end()) {
+ // leave pg_upmap alone
+ // it must have been specified by the admin, since the balancer
+ // does not support pg_upmap yet
+ ldout(cct, 10) << " " << pg << " already has pg_upmap "
+ << temp_it->second << ", skipping"
+ << dendl;
+ continue;
+ }
+ auto pg_pool_size = tmp.get_pg_pool_size(pg);
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ set<int> existing;
+ auto it = tmp.pg_upmap_items.find(pg);
+ if (it != tmp.pg_upmap_items.end() &&
+ it->second.size() >= (size_t)pg_pool_size) {
+ ldout(cct, 10) << " " << pg << " already has full-size pg_upmap_items "
+ << it->second << ", skipping"
+ << dendl;
+ continue;
+ } else if (it != tmp.pg_upmap_items.end()) {
+ ldout(cct, 10) << " " << pg << " already has pg_upmap_items "
+ << it->second
+ << dendl;
+ new_upmap_items = it->second;
+ // build existing too (for dedup)
+ for (auto i : it->second) {
+ existing.insert(i.first);
+ existing.insert(i.second);
+ }
+ // fall through
+ // to see if we can append more remapping pairs
+ }
+ ldout(cct, 10) << " trying " << pg << dendl;
+ vector<int> raw, orig, out;
+ tmp.pg_to_raw_upmap(pg, &raw, &orig); // including existing upmaps too
+ if (!try_pg_upmap(cct, pg, overfull, underfull, more_underfull, &orig, &out)) {
+ continue;
+ }
+ ldout(cct, 10) << " " << pg << " " << orig << " -> " << out << dendl;
+ if (orig.size() != out.size()) {
+ continue;
+ }
+ ceph_assert(orig != out);
+ int pos = -1;
+ float max_dev = 0;
+ for (unsigned i = 0; i < out.size(); ++i) {
+ if (orig[i] == out[i])
+ continue; // skip invalid remappings
+ if (existing.count(orig[i]) || existing.count(out[i]))
+ continue; // we want new remappings only!
+ if (osd_deviation[orig[i]] > max_dev) {
+ max_dev = osd_deviation[orig[i]];
+ pos = i;
+ ldout(cct, 30) << "Max osd." << orig[i] << " pos " << i << " dev " << osd_deviation[orig[i]] << dendl;
+ }
+ }
+ if (pos != -1) {
+ int i = pos;
+ ldout(cct, 10) << " will try adding new remapping pair "
+ << orig[i] << " -> " << out[i] << " for " << pg
+ << (orig[i] != osd ? " NOT selected osd" : "")
+ << dendl;
+ existing.insert(orig[i]);
+ existing.insert(out[i]);
+ temp_pgs_by_osd[orig[i]].erase(pg);
+ temp_pgs_by_osd[out[i]].insert(pg);
+ ceph_assert(new_upmap_items.size() < (size_t)pg_pool_size);
+ new_upmap_items.push_back(make_pair(orig[i], out[i]));
+ // append new remapping pairs slowly
+ // This way we can make sure that each tiny change
+ // keeps the distribution of PGs converging toward
+ // the perfect state.
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+ }
+
+ ceph_assert(!(to_unmap.size() || to_upmap.size()));
+ ldout(cct, 10) << " failed to find any changes for overfull osds"
+ << dendl;
+ for (auto& p : deviation_osd) {
+ if (std::find(underfull.begin(), underfull.end(), p.second) ==
+ underfull.end())
+ break;
+ int osd = p.second;
+ float deviation = p.first;
+ float target = osd_weight[osd] * pgs_per_weight;
+ ceph_assert(target > 0);
+ if (fabsf(deviation) < max_deviation) {
+ // respect max_deviation too
+ ldout(cct, 10) << " osd." << osd
+ << " target " << target
+ << " deviation " << deviation
+ << " -> absolute " << fabsf(deviation)
+ << " < max " << max_deviation
+ << dendl;
+ break;
+ }
+ // look for remaps we can un-remap
+ vector<pair<pg_t,
+ mempool::osdmap::vector<pair<int32_t,int32_t>>>> candidates;
+ candidates.reserve(tmp.pg_upmap_items.size());
+ for (auto& i : tmp.pg_upmap_items) {
+ if (to_skip.count(i.first))
+ continue;
+ if (!only_pools.empty() && !only_pools.count(i.first.pool()))
+ continue;
+ candidates.push_back(make_pair(i.first, i.second));
+ }
+ if (aggressive) {
+ // shuffle candidates so they all get equal (in)attention
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(candidates.begin(), candidates.end(), rng);
+ }
+ for (auto& i : candidates) {
+ auto pg = i.first;
+ mempool::osdmap::vector<pair<int32_t,int32_t>> new_upmap_items;
+ for (auto& j : i.second) {
+ if (j.first == osd) {
+ ldout(cct, 10) << " will try dropping existing"
+ << " remapping pair "
+ << j.first << " -> " << j.second
+ << " which remapped " << pg
+ << " out from underfull osd." << osd
+ << dendl;
+ temp_pgs_by_osd[j.second].erase(pg);
+ temp_pgs_by_osd[j.first].insert(pg);
+ } else {
+ new_upmap_items.push_back(j);
+ }
+ }
+ if (new_upmap_items.empty()) {
+ // drop whole item
+ ldout(cct, 10) << " existing pg_upmap_items " << i.second
+ << " remapped " << pg
+ << " out from underfull osd." << osd
+ << ", will try cancelling it entirely"
+ << dendl;
+ to_unmap.insert(pg);
+ goto test_change;
+ } else if (new_upmap_items.size() != i.second.size()) {
+ // drop single remapping pair, updating
+ ceph_assert(new_upmap_items.size() < i.second.size());
+ ldout(cct, 10) << " existing pg_upmap_items " << i.second
+ << " remapped " << pg
+ << " out from underfull osd." << osd
+ << ", new_pg_upmap_items now " << new_upmap_items
+ << dendl;
+ to_upmap[pg] = new_upmap_items;
+ goto test_change;
+ }
+ }
+ }
+
+ ceph_assert(!(to_unmap.size() || to_upmap.size()));
+ ldout(cct, 10) << " failed to find any changes for underfull osds"
+ << dendl;
+ if (!aggressive) {
+ ldout(cct, 10) << " break due to aggressive mode not enabled" << dendl;
+ break;
+ } else if (!skip_overfull) {
+ // safe to quit because at this point we know
+ // we have already checked both overfull and underfull osds.
+ ldout(cct, 10) << " break due to not being able to find any"
+ << " further optimizations"
+ << dendl;
+ break;
+ }
+ // restart with fullest and do exhaustive searching
+ skip_overfull = false;
+ continue;
+
+ test_change:
+
+ // test change, apply if change is good
+ ceph_assert(to_unmap.size() || to_upmap.size());
+ float new_stddev = 0;
+ map<int,float> temp_osd_deviation;
+ multimap<float,int> temp_deviation_osd;
+ float cur_max_deviation = 0;
+ for (auto& i : temp_pgs_by_osd) {
+ // make sure osd is still there (belongs to this crush-tree)
+ ceph_assert(osd_weight.count(i.first));
+ float target = osd_weight[i.first] * pgs_per_weight;
+ float deviation = (float)i.second.size() - target;
+ ldout(cct, 20) << " osd." << i.first
+ << "\tpgs " << i.second.size()
+ << "\ttarget " << target
+ << "\tdeviation " << deviation
+ << dendl;
+ temp_osd_deviation[i.first] = deviation;
+ temp_deviation_osd.insert(make_pair(deviation, i.first));
+ new_stddev += deviation * deviation;
+ if (fabsf(deviation) > cur_max_deviation)
+ cur_max_deviation = fabsf(deviation);
+ }
+ ldout(cct, 10) << " stddev " << stddev << " -> " << new_stddev << dendl;
+ if (new_stddev >= stddev) {
+ if (!aggressive) {
+ ldout(cct, 10) << " break because stddev is not decreasing"
+ << " and aggressive mode is not enabled"
+ << dendl;
+ break;
+ }
+ local_fallback_retried++;
+ if (local_fallback_retried >= local_fallback_retries) {
+ // does not make progress
+ // flip *skip_overfull* so both overfull and underfull
+ // get equal (in)attention
+ skip_overfull = !skip_overfull;
+ ldout(cct, 10) << " hit local_fallback_retries "
+ << local_fallback_retries
+ << dendl;
+ continue;
+ }
+ for (auto& i : to_unmap)
+ to_skip.insert(i);
+ for (auto& i : to_upmap)
+ to_skip.insert(i.first);
+ ldout(cct, 20) << " local_fallback_retried " << local_fallback_retried
+ << " to_skip " << to_skip
+ << dendl;
+ goto retry;
+ }
+
+ // ready to go
+ ceph_assert(new_stddev < stddev);
+ stddev = new_stddev;
+ pgs_by_osd = temp_pgs_by_osd;
+ osd_deviation = temp_osd_deviation;
+ deviation_osd = temp_deviation_osd;
+ for (auto& i : to_unmap) {
+ ldout(cct, 10) << " unmap pg " << i << dendl;
+ ceph_assert(tmp.pg_upmap_items.count(i));
+ tmp.pg_upmap_items.erase(i);
+ pending_inc->old_pg_upmap_items.insert(i);
+ ++num_changed;
+ }
+ for (auto& i : to_upmap) {
+ ldout(cct, 10) << " upmap pg " << i.first
+ << " new pg_upmap_items " << i.second
+ << dendl;
+ tmp.pg_upmap_items[i.first] = i.second;
+ pending_inc->new_pg_upmap_items[i.first] = i.second;
+ ++num_changed;
+ }
+ ldout(cct, 20) << " stdev " << stddev << " max_deviation " << cur_max_deviation << dendl;
+ if (cur_max_deviation <= max_deviation) {
+ ldout(cct, 10) << __func__ << " Optimization plan is almost perfect"
+ << dendl;
+ break;
+ }
+ }
+ ldout(cct, 10) << " num_changed = " << num_changed << dendl;
+ return num_changed;
+}
+
+int OSDMap::get_osds_by_bucket_name(const string &name, set<int> *osds) const
+{
+ return crush->get_leaves(name, osds);
+}
+
+// get pools whose crush rules might reference the given osd
+void OSDMap::get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ set<int64_t> *pool_ids) const
+{
+ ceph_assert(pool_ids);
+ set<int> raw_rules;
+ int r = crush->get_rules_by_osd(osd, &raw_rules);
+ if (r < 0) {
+ lderr(cct) << __func__ << " get_rules_by_osd failed: " << cpp_strerror(r)
+ << dendl;
+ ceph_assert(r >= 0);
+ }
+ set<int> rules;
+ for (auto &i: raw_rules) {
+ // exclude any dead rule
+ if (crush_rule_in_use(i)) {
+ rules.insert(i);
+ }
+ }
+ for (auto &r: rules) {
+ get_pool_ids_by_rule(r, pool_ids);
+ }
+}
+
+template <typename F>
+class OSDUtilizationDumper : public CrushTreeDumper::Dumper<F> {
+public:
+ typedef CrushTreeDumper::Dumper<F> Parent;
+
+ OSDUtilizationDumper(const CrushWrapper *crush, const OSDMap *osdmap_,
+ const PGMap& pgmap_, bool tree_,
+ const string& filter) :
+ Parent(crush, osdmap_->get_pool_names()),
+ osdmap(osdmap_),
+ pgmap(pgmap_),
+ tree(tree_),
+ min_var(-1),
+ max_var(-1),
+ stddev(0),
+ sum(0) {
+ if (osdmap->crush->name_exists(filter)) {
+ // filter by crush node
+ auto item_id = osdmap->crush->get_item_id(filter);
+ allowed.insert(item_id);
+ osdmap->crush->get_all_children(item_id, &allowed);
+ } else if (osdmap->crush->class_exists(filter)) {
+ // filter by device class
+ class_id = osdmap->crush->get_class_id(filter);
+ } else if (auto pool_id = osdmap->lookup_pg_pool_name(filter);
+ pool_id >= 0) {
+ // filter by pool
+ auto crush_rule = osdmap->get_pool_crush_rule(pool_id);
+ set<int> roots;
+ osdmap->crush->find_takes_by_rule(crush_rule, &roots);
+ allowed = roots;
+ for (auto r : roots)
+ osdmap->crush->get_all_children(r, &allowed);
+ }
+ average_util = average_utilization();
+ }
+
+protected:
+
+ bool should_dump(int id) const {
+ if (!allowed.empty() && !allowed.count(id)) // filter by name
+ return false;
+ if (id >= 0 && class_id >= 0) {
+ auto item_class_id = osdmap->crush->get_item_class_id(id);
+ if (item_class_id < 0 || // not bound to a class yet
+ item_class_id != class_id) // or already bound to a different class
+ return false;
+ }
+ return true;
+ }
+
+ set<int> get_dumped_osds() {
+ if (allowed.empty() && class_id < 0) {
+ // old way, all
+ return {};
+ }
+ return dumped_osds;
+ }
+
+ void dump_stray(F *f) {
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (osdmap->exists(i) && !this->is_touched(i))
+ dump_item(CrushTreeDumper::Item(i, 0, 0, 0), f);
+ }
+ }
+
+ void dump_item(const CrushTreeDumper::Item &qi, F *f) override {
+ if (!tree && (qi.is_bucket() || dumped_osds.count(qi.id)))
+ return;
+ if (!should_dump(qi.id))
+ return;
+
+ if (!qi.is_bucket())
+ dumped_osds.insert(qi.id);
+ float reweight = qi.is_bucket() ? -1 : osdmap->get_weightf(qi.id);
+ int64_t kb = 0, kb_used = 0, kb_used_data = 0, kb_used_omap = 0,
+ kb_used_meta = 0, kb_avail = 0;
+ double util = 0;
+ if (get_bucket_utilization(qi.id, &kb, &kb_used, &kb_used_data,
+ &kb_used_omap, &kb_used_meta, &kb_avail))
+ if (kb_used && kb)
+ util = 100.0 * (double)kb_used / (double)kb;
+
+ double var = 1.0;
+ if (average_util)
+ var = util / average_util;
+
+ size_t num_pgs = qi.is_bucket() ? 0 : pgmap.get_num_pg_by_osd(qi.id);
+
+ dump_item(qi, reweight, kb, kb_used,
+ kb_used_data, kb_used_omap, kb_used_meta,
+ kb_avail, util, var, num_pgs, f);
+
+ if (!qi.is_bucket() && reweight > 0) {
+ if (min_var < 0 || var < min_var)
+ min_var = var;
+ if (max_var < 0 || var > max_var)
+ max_var = var;
+
+ double dev = util - average_util;
+ dev *= dev;
+ stddev += reweight * dev;
+ sum += reweight;
+ }
+ }
+
+ virtual void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ F *f) = 0;
+
+ double dev() {
+ return sum > 0 ? sqrt(stddev / sum) : 0;
+ }
+
+ double average_utilization() {
+ int64_t kb = 0, kb_used = 0;
+ for (int i = 0; i < osdmap->get_max_osd(); i++) {
+ if (!osdmap->exists(i) ||
+ osdmap->get_weight(i) == 0 ||
+ !should_dump(i))
+ continue;
+ int64_t kb_i, kb_used_i, kb_used_data_i, kb_used_omap_i, kb_used_meta_i,
+ kb_avail_i;
+ if (get_osd_utilization(i, &kb_i, &kb_used_i, &kb_used_data_i,
+ &kb_used_omap_i, &kb_used_meta_i, &kb_avail_i)) {
+ kb += kb_i;
+ kb_used += kb_used_i;
+ }
+ }
+ return kb > 0 ? 100.0 * (double)kb_used / (double)kb : 0;
+ }
+
+ bool get_osd_utilization(int id, int64_t* kb, int64_t* kb_used,
+ int64_t* kb_used_data,
+ int64_t* kb_used_omap,
+ int64_t* kb_used_meta,
+ int64_t* kb_avail) const {
+ const osd_stat_t *p = pgmap.get_osd_stat(id);
+ if (!p) return false;
+ *kb = p->statfs.kb();
+ *kb_used = p->statfs.kb_used_raw();
+ *kb_used_data = p->statfs.kb_used_data();
+ *kb_used_omap = p->statfs.kb_used_omap();
+ *kb_used_meta = p->statfs.kb_used_internal_metadata();
+ *kb_avail = p->statfs.kb_avail();
+
+ return true;
+ }
+
+ bool get_bucket_utilization(int id, int64_t* kb, int64_t* kb_used,
+ int64_t* kb_used_data,
+ int64_t* kb_used_omap,
+ int64_t* kb_used_meta,
+ int64_t* kb_avail) const {
+ if (id >= 0) {
+ if (osdmap->is_out(id) || !should_dump(id)) {
+ *kb = 0;
+ *kb_used = 0;
+ *kb_used_data = 0;
+ *kb_used_omap = 0;
+ *kb_used_meta = 0;
+ *kb_avail = 0;
+ return true;
+ }
+ return get_osd_utilization(id, kb, kb_used, kb_used_data,
+ kb_used_omap, kb_used_meta, kb_avail);
+ }
+
+ *kb = 0;
+ *kb_used = 0;
+ *kb_used_data = 0;
+ *kb_used_omap = 0;
+ *kb_used_meta = 0;
+ *kb_avail = 0;
+
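+ // a negative id is a crush bucket: sum the utilization of its children
+ // recursively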
+ for (int k = osdmap->crush->get_bucket_size(id) - 1; k >= 0; k--) {
+ int item = osdmap->crush->get_bucket_item(id, k);
+ int64_t kb_i = 0, kb_used_i = 0, kb_used_data_i = 0,
+ kb_used_omap_i = 0, kb_used_meta_i = 0, kb_avail_i = 0;
+ if (!get_bucket_utilization(item, &kb_i, &kb_used_i,
+ &kb_used_data_i, &kb_used_omap_i,
+ &kb_used_meta_i, &kb_avail_i))
+ return false;
+ *kb += kb_i;
+ *kb_used += kb_used_i;
+ *kb_used_data += kb_used_data_i;
+ *kb_used_omap += kb_used_omap_i;
+ *kb_used_meta += kb_used_meta_i;
+ *kb_avail += kb_avail_i;
+ }
+ return true;
+ }
+
+protected:
+ const OSDMap *osdmap;
+ const PGMap& pgmap;
+ bool tree;
+ double average_util;
+ double min_var;
+ double max_var;
+ double stddev;
+ double sum;
+ int class_id = -1;
+ set<int> allowed;
+ set<int> dumped_osds;
+};
+
+
+class OSDUtilizationPlainDumper : public OSDUtilizationDumper<TextTable> {
+public:
+ typedef OSDUtilizationDumper<TextTable> Parent;
+
+ OSDUtilizationPlainDumper(const CrushWrapper *crush, const OSDMap *osdmap,
+ const PGMap& pgmap, bool tree,
+ const string& filter) :
+ Parent(crush, osdmap, pgmap, tree, filter) {}
+
+ void dump(TextTable *tbl) {
+ tbl->define_column("ID", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("CLASS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("WEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("REWEIGHT", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("RAW USE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("DATA", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("OMAP", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("META", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("%USE", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("VAR", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("PGS", TextTable::LEFT, TextTable::RIGHT);
+ tbl->define_column("STATUS", TextTable::LEFT, TextTable::RIGHT);
+ if (tree)
+ tbl->define_column("TYPE NAME", TextTable::LEFT, TextTable::LEFT);
+
+ Parent::dump(tbl);
+
+ dump_stray(tbl);
+
+ auto sum = pgmap.get_osd_sum(get_dumped_osds());
+ *tbl << ""
+ << ""
+ << "" << "TOTAL"
+ << byte_u_t(sum.statfs.total)
+ << byte_u_t(sum.statfs.get_used_raw())
+ << byte_u_t(sum.statfs.allocated)
+ << byte_u_t(sum.statfs.omap_allocated)
+ << byte_u_t(sum.statfs.internal_metadata)
+ << byte_u_t(sum.statfs.available)
+ << lowprecision_t(average_util)
+ << ""
+ << TextTable::endrow;
+ }
+
+protected:
+ struct lowprecision_t {
+ float v;
+ explicit lowprecision_t(float _v) : v(_v) {}
+ };
+ friend std::ostream &operator<<(ostream& out, const lowprecision_t& v);
+
+ using OSDUtilizationDumper<TextTable>::dump_item;
+ void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ TextTable *tbl) override {
+ const char *c = crush->get_item_class(qi.id);
+ if (!c)
+ c = "";
+ *tbl << qi.id
+ << c
+ << weightf_t(qi.weight)
+ << weightf_t(reweight)
+ << byte_u_t(kb << 10)
+ << byte_u_t(kb_used << 10)
+ << byte_u_t(kb_used_data << 10)
+ << byte_u_t(kb_used_omap << 10)
+ << byte_u_t(kb_used_meta << 10)
+ << byte_u_t(kb_avail << 10)
+ << lowprecision_t(util)
+ << lowprecision_t(var);
+
+ if (qi.is_bucket()) {
+ *tbl << "-";
+ *tbl << "";
+ } else {
+ *tbl << num_pgs;
+ if (osdmap->is_up(qi.id)) {
+ *tbl << "up";
+ } else if (osdmap->is_destroyed(qi.id)) {
+ *tbl << "destroyed";
+ } else {
+ *tbl << "down";
+ }
+ }
+
+ if (tree) {
+ ostringstream name;
+ for (int k = 0; k < qi.depth; k++)
+ name << " ";
+ if (qi.is_bucket()) {
+ int type = crush->get_bucket_type(qi.id);
+ name << crush->get_type_name(type) << " "
+ << crush->get_item_name(qi.id);
+ } else {
+ name << "osd." << qi.id;
+ }
+ *tbl << name.str();
+ }
+
+ *tbl << TextTable::endrow;
+ }
+
+public:
+ string summary() {
+ ostringstream out;
+ out << "MIN/MAX VAR: " << lowprecision_t(min_var)
+ << "/" << lowprecision_t(max_var) << " "
+ << "STDDEV: " << lowprecision_t(dev());
+ return out.str();
+ }
+};
+
+ostream& operator<<(ostream& out,
+ const OSDUtilizationPlainDumper::lowprecision_t& v)
+{
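+  // print "-" for negative values, "0" for anything below 0.001,
+  // and two decimal places otherwise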
+ if (v.v < -0.01) {
+ return out << "-";
+ } else if (v.v < 0.001) {
+ return out << "0";
+ } else {
+ std::streamsize p = out.precision();
+ return out << std::fixed << std::setprecision(2) << v.v << std::setprecision(p);
+ }
+}
+
+class OSDUtilizationFormatDumper : public OSDUtilizationDumper<Formatter> {
+public:
+ typedef OSDUtilizationDumper<Formatter> Parent;
+
+ OSDUtilizationFormatDumper(const CrushWrapper *crush, const OSDMap *osdmap,
+ const PGMap& pgmap, bool tree,
+ const string& filter) :
+ Parent(crush, osdmap, pgmap, tree, filter) {}
+
+ void dump(Formatter *f) {
+ f->open_array_section("nodes");
+ Parent::dump(f);
+ f->close_section();
+
+ f->open_array_section("stray");
+ dump_stray(f);
+ f->close_section();
+ }
+
+protected:
+ using OSDUtilizationDumper<Formatter>::dump_item;
+ void dump_item(const CrushTreeDumper::Item &qi,
+ float &reweight,
+ int64_t kb,
+ int64_t kb_used,
+ int64_t kb_used_data,
+ int64_t kb_used_omap,
+ int64_t kb_used_meta,
+ int64_t kb_avail,
+ double& util,
+ double& var,
+ const size_t num_pgs,
+ Formatter *f) override {
+ f->open_object_section("item");
+ CrushTreeDumper::dump_item_fields(crush, weight_set_names, qi, f);
+ f->dump_float("reweight", reweight);
+ f->dump_int("kb", kb);
+ f->dump_int("kb_used", kb_used);
+ f->dump_int("kb_used_data", kb_used_data);
+ f->dump_int("kb_used_omap", kb_used_omap);
+ f->dump_int("kb_used_meta", kb_used_meta);
+ f->dump_int("kb_avail", kb_avail);
+ f->dump_float("utilization", util);
+ f->dump_float("var", var);
+ f->dump_unsigned("pgs", num_pgs);
+ if (!qi.is_bucket()) {
+ if (osdmap->is_up(qi.id)) {
+ f->dump_string("status", "up");
+ } else if (osdmap->is_destroyed(qi.id)) {
+ f->dump_string("status", "destroyed");
+ } else {
+ f->dump_string("status", "down");
+ }
+ }
+ CrushTreeDumper::dump_bucket_children(crush, qi, f);
+ f->close_section();
+ }
+
+public:
+ void summary(Formatter *f) {
+ f->open_object_section("summary");
+ auto sum = pgmap.get_osd_sum(get_dumped_osds());
+ auto& s = sum.statfs;
+
+ f->dump_int("total_kb", s.kb());
+ f->dump_int("total_kb_used", s.kb_used_raw());
+ f->dump_int("total_kb_used_data", s.kb_used_data());
+ f->dump_int("total_kb_used_omap", s.kb_used_omap());
+ f->dump_int("total_kb_used_meta", s.kb_used_internal_metadata());
+ f->dump_int("total_kb_avail", s.kb_avail());
+ f->dump_float("average_utilization", average_util);
+ f->dump_float("min_var", min_var);
+ f->dump_float("max_var", max_var);
+ f->dump_float("dev", dev());
+ f->close_section();
+ }
+};
+
+void print_osd_utilization(const OSDMap& osdmap,
+ const PGMap& pgmap,
+ ostream& out,
+ Formatter *f,
+ bool tree,
+ const string& filter)
+{
+ const CrushWrapper *crush = osdmap.crush.get();
+ if (f) {
+ f->open_object_section("df");
+ OSDUtilizationFormatDumper d(crush, &osdmap, pgmap, tree, filter);
+ d.dump(f);
+ d.summary(f);
+ f->close_section();
+ f->flush(out);
+ } else {
+ OSDUtilizationPlainDumper d(crush, &osdmap, pgmap, tree, filter);
+ TextTable tbl;
+ d.dump(&tbl);
+ out << tbl << d.summary() << "\n";
+ }
+}
+
+void OSDMap::check_health(CephContext *cct,
+ health_check_map_t *checks) const
+{
+ int num_osds = get_num_osds();
+
+ // OSD_DOWN
+ // OSD_$subtree_DOWN
+ // OSD_ORPHAN
+ if (num_osds >= 0) {
+ int num_in_osds = 0;
+ int num_down_in_osds = 0;
+ set<int> osds;
+ set<int> down_in_osds;
+ set<int> up_in_osds;
+ set<int> subtree_up;
+ unordered_map<int, set<int> > subtree_type_down;
+ unordered_map<int, int> num_osds_subtree;
+ int max_type = crush->get_max_type_id();
+
+ for (int i = 0; i < get_max_osd(); i++) {
+ if (!exists(i)) {
+ if (crush->item_exists(i)) {
+ osds.insert(i);
+ }
+ continue;
+ }
+ if (is_out(i) || (osd_state[i] & CEPH_OSD_NEW))
+ continue;
+ ++num_in_osds;
+ if (down_in_osds.count(i) || up_in_osds.count(i))
+ continue;
+ if (!is_up(i)) {
+ down_in_osds.insert(i);
+ int parent_id = 0;
+ int current = i;
+ for (int type = 0; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ int r = crush->get_immediate_parent_id(current, &parent_id);
+ if (r == -ENOENT)
+ break;
+ // break early if this parent is already marked as up
+ if (subtree_up.count(parent_id))
+ break;
+ type = crush->get_bucket_type(parent_id);
+ if (!subtree_type_is_down(
+ cct, parent_id, type,
+ &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
+ break;
+ current = parent_id;
+ }
+ }
+ }
+
+ // calculate the number of down osds in each down subtree and
+ // store it in num_osds_subtree
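+    // (types are visited from lowest id to highest, so a bucket's child
+    // buckets, which normally have lower type ids, already have their
+    // subtotals recorded in num_osds_subtree when the bucket is reached)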
+ for (int type = 1; type <= max_type; type++) {
+ if (!crush->get_type_name(type))
+ continue;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ list<int> children;
+ int num = 0;
+ int num_children = crush->get_children(*j, &children);
+ if (num_children == 0)
+ continue;
+ for (auto l = children.begin(); l != children.end(); ++l) {
+ if (*l >= 0) {
+ ++num;
+ } else if (num_osds_subtree[*l] > 0) {
+ num = num + num_osds_subtree[*l];
+ }
+ }
+ num_osds_subtree[*j] = num;
+ }
+ }
+ num_down_in_osds = down_in_osds.size();
+ ceph_assert(num_down_in_osds <= num_in_osds);
+ if (num_down_in_osds > 0) {
+ // summary of down subtree types and osds
+ for (int type = max_type; type > 0; type--) {
+ if (!crush->get_type_name(type))
+ continue;
+ if (subtree_type_down[type].size() > 0) {
+ ostringstream ss;
+ ss << subtree_type_down[type].size() << " "
+ << crush->get_type_name(type);
+ if (subtree_type_down[type].size() > 1) {
+ ss << "s";
+ }
+ int sum_down_osds = 0;
+ for (auto j = subtree_type_down[type].begin();
+ j != subtree_type_down[type].end();
+ ++j) {
+ sum_down_osds = sum_down_osds + num_osds_subtree[*j];
+ }
+ ss << " (" << sum_down_osds << " osds) down";
+ string err = string("OSD_") +
+ string(crush->get_type_name(type)) + "_DOWN";
+ boost::to_upper(err);
+ auto& d = checks->add(err, HEALTH_WARN, ss.str(),
+ subtree_type_down[type].size());
+ for (auto j = subtree_type_down[type].rbegin();
+ j != subtree_type_down[type].rend();
+ ++j) {
+ ostringstream ss;
+ ss << crush->get_type_name(type);
+ ss << " ";
+ ss << crush->get_item_name(*j);
+ // at the top level, do not print location
+ if (type != max_type) {
+ ss << " (";
+ ss << crush->get_full_location_ordered_string(*j);
+ ss << ")";
+ }
+ int num = num_osds_subtree[*j];
+ ss << " (" << num << " osds)";
+ ss << " is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+ ostringstream ss;
+ ss << down_in_osds.size() << " osds down";
+ auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
+ down_in_osds.size());
+ for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
+ ostringstream ss;
+ ss << "osd." << *it << " (";
+ ss << crush->get_full_location_ordered_string(*it);
+ ss << ") is down";
+ d.detail.push_back(ss.str());
+ }
+ }
+
+ if (!osds.empty()) {
+ ostringstream ss;
+ ss << osds.size() << " osds exist in the crush map but not in the osdmap";
+ auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
+ osds.size());
+ for (auto osd : osds) {
+ ostringstream ss;
+ ss << "osd." << osd << " exists in crush map but not in osdmap";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ std::list<std::string> scrub_messages;
+ bool noscrub = false, nodeepscrub = false;
+ for (const auto &p : pools) {
+ if (p.second.flags & pg_pool_t::FLAG_NOSCRUB) {
+ ostringstream ss;
+ ss << "Pool " << get_pool_name(p.first) << " has noscrub flag";
+ scrub_messages.push_back(ss.str());
+ noscrub = true;
+ }
+ if (p.second.flags & pg_pool_t::FLAG_NODEEP_SCRUB) {
+ ostringstream ss;
+ ss << "Pool " << get_pool_name(p.first) << " has nodeep-scrub flag";
+ scrub_messages.push_back(ss.str());
+ nodeepscrub = true;
+ }
+ }
+ if (noscrub || nodeepscrub) {
+ string out = "";
+ out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
+ out += nodeepscrub ? "nodeep-scrub" : "";
+ auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
+ "Some pool(s) have the " + out + " flag(s) set", 0);
+ d.detail.splice(d.detail.end(), scrub_messages);
+ }
+
+ // OSD_OUT_OF_ORDER_FULL
+ {
+    // An osd could be configured with a different failsafe ratio,
+    // but for now assume it is the same here.
+ float fsr = cct->_conf->osd_failsafe_full_ratio;
+ if (fsr > 1.0) fsr /= 100;
+ float fr = get_full_ratio();
+ float br = get_backfillfull_ratio();
+ float nr = get_nearfull_ratio();
+
+ list<string> detail;
+ // These checks correspond to how OSDService::check_full_status() in an OSD
+ // handles the improper setting of these values.
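+    // The expected ordering is nearfull <= backfillfull <= full <=
+    // osd_failsafe_full_ratio; e.g. the usual defaults 0.85, 0.90, 0.95
+    // and 0.97 satisfy it. Any violation is reported via
+    // OSD_OUT_OF_ORDER_FULL below.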
+ if (br < nr) {
+ ostringstream ss;
+ ss << "backfillfull_ratio (" << br
+ << ") < nearfull_ratio (" << nr << "), increased";
+ detail.push_back(ss.str());
+ br = nr;
+ }
+ if (fr < br) {
+ ostringstream ss;
+ ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br
+ << "), increased";
+ detail.push_back(ss.str());
+ fr = br;
+ }
+ if (fsr < fr) {
+ ostringstream ss;
+ ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr
+ << "), increased";
+ detail.push_back(ss.str());
+ }
+ if (!detail.empty()) {
+ auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
+ "full ratio(s) out of order", 0);
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_FULL
+ // OSD_NEARFULL
+ // OSD_BACKFILLFULL
+ // OSD_FAILSAFE_FULL
+ {
+ set<int> full, backfillfull, nearfull;
+ get_full_osd_counts(&full, &backfillfull, &nearfull);
+ if (full.size()) {
+ ostringstream ss;
+ ss << full.size() << " full osd(s)";
+ auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
+ for (auto& i: full) {
+ ostringstream ss;
+ ss << "osd." << i << " is full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (backfillfull.size()) {
+ ostringstream ss;
+ ss << backfillfull.size() << " backfillfull osd(s)";
+ auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull.size());
+ for (auto& i: backfillfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is backfill full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ if (nearfull.size()) {
+ ostringstream ss;
+ ss << nearfull.size() << " nearfull osd(s)";
+ auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
+ for (auto& i: nearfull) {
+ ostringstream ss;
+ ss << "osd." << i << " is near full";
+ d.detail.push_back(ss.str());
+ }
+ }
+ }
+
+ // OSDMAP_FLAGS
+ {
+ // warn about flags
+ uint64_t warn_flags =
+ CEPH_OSDMAP_PAUSERD |
+ CEPH_OSDMAP_PAUSEWR |
+ CEPH_OSDMAP_PAUSEREC |
+ CEPH_OSDMAP_NOUP |
+ CEPH_OSDMAP_NODOWN |
+ CEPH_OSDMAP_NOIN |
+ CEPH_OSDMAP_NOOUT |
+ CEPH_OSDMAP_NOBACKFILL |
+ CEPH_OSDMAP_NORECOVER |
+ CEPH_OSDMAP_NOSCRUB |
+ CEPH_OSDMAP_NODEEP_SCRUB |
+ CEPH_OSDMAP_NOTIERAGENT |
+ CEPH_OSDMAP_NOSNAPTRIM |
+ CEPH_OSDMAP_NOREBALANCE;
+ if (test_flag(warn_flags)) {
+ ostringstream ss;
+ string s = get_flag_string(get_flags() & warn_flags);
+ ss << s << " flag(s) set";
+ checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
+ s.size() /* kludgey but sufficient */);
+ }
+ }
+
+ // OSD_FLAGS
+ {
+ list<string> detail;
+ const unsigned flags =
+ CEPH_OSD_NOUP |
+ CEPH_OSD_NOIN |
+ CEPH_OSD_NODOWN |
+ CEPH_OSD_NOOUT;
+ for (int i = 0; i < max_osd; ++i) {
+ if (osd_state[i] & flags) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(osd_state[i] & flags, states);
+ ss << "osd." << i << " has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ for (auto& i : crush_node_flags) {
+ if (i.second && crush->item_exists(i.first)) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(i.second, states);
+ int t = i.first >= 0 ? 0 : crush->get_bucket_type(i.first);
+ const char *tn = crush->get_type_name(t);
+ ss << (tn ? tn : "node") << " "
+ << crush->get_item_name(i.first) << " has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ for (auto& i : device_class_flags) {
+ const char* class_name = crush->get_class_name(i.first);
+ if (i.second && class_name) {
+ ostringstream ss;
+ set<string> states;
+ OSDMap::calc_state_set(i.second, states);
+ ss << "device class '" << class_name << "' has flags " << states;
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
+ auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // OLD_CRUSH_TUNABLES
+ if (cct->_conf->mon_warn_on_legacy_crush_tunables) {
+ string min = crush->get_min_required_version();
+ if (min < cct->_conf->mon_crush_min_required_version) {
+ ostringstream ss;
+ ss << "crush map has legacy tunables (require " << min
+ << ", min is " << cct->_conf->mon_crush_min_required_version << ")";
+ auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back("see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // OLD_CRUSH_STRAW_CALC_VERSION
+ if (cct->_conf->mon_warn_on_crush_straw_calc_version_zero) {
+ if (crush->get_straw_calc_version() == 0) {
+ ostringstream ss;
+ ss << "crush map has straw_calc_version=0";
+ auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back(
+ "see http://docs.ceph.com/en/latest/rados/operations/crush-map/#tunables");
+ }
+ }
+
+ // CACHE_POOL_NO_HIT_SET
+ if (cct->_conf->mon_warn_on_cache_pools_without_hit_sets) {
+ list<string> detail;
+ for (auto p = pools.cbegin(); p != pools.cend(); ++p) {
+ const pg_pool_t& info = p->second;
+ if (info.cache_mode_requires_hit_set() &&
+ info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(p->first)
+ << "' with cache_mode " << info.get_cache_mode_name()
+ << " needs hit_set_type to be set but it is not";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " cache pools are missing hit_sets";
+ auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
+ detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // OSD_NO_SORTBITWISE
+ if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ ostringstream ss;
+ ss << "'sortbitwise' flag is not set";
+ checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
+ }
+
+ // OSD_UPGRADE_FINISHED
+ if (auto require_release = pending_require_osd_release()) {
+ ostringstream ss;
+ ss << "all OSDs are running " << *require_release << " or later but"
+ << " require_osd_release < " << *require_release;
+ auto& d = checks->add("OSD_UPGRADE_FINISHED", HEALTH_WARN, ss.str(), 0);
+ d.detail.push_back(ss.str());
+ }
+
+ // POOL_NEARFULL/BACKFILLFULL/FULL
+ {
+ list<string> full_detail, backfillfull_detail, nearfull_detail;
+ for (auto it : get_pools()) {
+ const pg_pool_t &pool = it.second;
+ const string& pool_name = get_pool_name(it.first);
+ if (pool.has_flag(pg_pool_t::FLAG_FULL)) {
+ stringstream ss;
+ if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
+ // may run out of space too,
+ // but we want EQUOTA taking precedence
+ ss << "pool '" << pool_name << "' is full (running out of quota)";
+ } else {
+ ss << "pool '" << pool_name << "' is full (no space)";
+ }
+ full_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is backfillfull";
+ backfillfull_detail.push_back(ss.str());
+ } else if (pool.has_flag(pg_pool_t::FLAG_NEARFULL)) {
+ stringstream ss;
+ ss << "pool '" << pool_name << "' is nearfull";
+ nearfull_detail.push_back(ss.str());
+ }
+ }
+ if (!full_detail.empty()) {
+ ostringstream ss;
+ ss << full_detail.size() << " pool(s) full";
+ auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
+ d.detail.swap(full_detail);
+ }
+ if (!backfillfull_detail.empty()) {
+ ostringstream ss;
+ ss << backfillfull_detail.size() << " pool(s) backfillfull";
+ auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
+ backfillfull_detail.size());
+ d.detail.swap(backfillfull_detail);
+ }
+ if (!nearfull_detail.empty()) {
+ ostringstream ss;
+ ss << nearfull_detail.size() << " pool(s) nearfull";
+ auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
+ nearfull_detail.size());
+ d.detail.swap(nearfull_detail);
+ }
+ }
+
+ // POOL_PG_NUM_NOT_POWER_OF_TWO
+ if (cct->_conf.get_val<bool>("mon_warn_on_pool_pg_num_not_power_of_two")) {
+ list<string> detail;
+ for (auto it : get_pools()) {
+ if (!isp2(it.second.get_pg_num_target())) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(it.first)
+ << "' pg_num " << it.second.get_pg_num_target()
+ << " is not a power of two";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) have non-power-of-two pg_num";
+ auto& d = checks->add("POOL_PG_NUM_NOT_POWER_OF_TWO", HEALTH_WARN,
+ ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // POOL_NO_REDUNDANCY
+ if (cct->_conf.get_val<bool>("mon_warn_on_pool_no_redundancy"))
+ {
+ list<string> detail;
+ for (auto it : get_pools()) {
+ if (it.second.get_size() == 1) {
+ ostringstream ss;
+ ss << "pool '" << get_pool_name(it.first)
+ << "' has no replicas configured";
+ detail.push_back(ss.str());
+ }
+ }
+ if (!detail.empty()) {
+ ostringstream ss;
+ ss << detail.size() << " pool(s) have no replicas configured";
+ auto& d = checks->add("POOL_NO_REDUNDANCY", HEALTH_WARN,
+ ss.str(), detail.size());
+ d.detail.swap(detail);
+ }
+ }
+
+ // DEGRADED STRETCH MODE
+ if (cct->_conf.get_val<bool>("mon_warn_on_degraded_stretch_mode")) {
+ if (recovering_stretch_mode) {
+ stringstream ss;
+ ss << "We are recovering stretch mode buckets, only requiring "
+ << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
+ checks->add("RECOVERING_STRETCH_MODE", HEALTH_WARN,
+ ss.str(), 0);
+ } else if (degraded_stretch_mode) {
+ stringstream ss;
+ ss << "We are missing stretch mode buckets, only requiring "
+ << degraded_stretch_mode << " of " << stretch_bucket_count << " buckets to peer" ;
+ checks->add("DEGRADED_STRETCH_MODE", HEALTH_WARN,
+ ss.str(), 0);
+ }
+ }
+}
+
+int OSDMap::parse_osd_id_list(const vector<string>& ls, set<int> *out,
+ ostream *ss) const
+{
+ out->clear();
+ for (auto i = ls.begin(); i != ls.end(); ++i) {
+ if (i == ls.begin() &&
+ (*i == "any" || *i == "all" || *i == "*")) {
+ get_all_osds(*out);
+ break;
+ }
+ long osd = TOPNSPC::common::parse_osd_id(i->c_str(), ss);
+ if (osd < 0) {
+ *ss << "invalid osd id '" << *i << "'";
+ return -EINVAL;
+ }
+ out->insert(osd);
+ }
+ return 0;
+}
+
+void OSDMap::get_random_up_osds_by_subtree(int n, // whoami
+ string &subtree,
+ int limit, // how many
+ set<int> skip,
+ set<int> *want) const {
+ if (limit <= 0)
+ return;
+ int subtree_type = crush->get_type_id(subtree);
+ if (subtree_type < 1)
+ return;
+ vector<int> subtrees;
+ crush->get_subtree_of_type(subtree_type, &subtrees);
+ std::random_device rd;
+ std::default_random_engine rng{rd()};
+ std::shuffle(subtrees.begin(), subtrees.end(), rng);
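+  // walk the shuffled subtrees: skip the caller's own subtree, then take
+  // at most one up osd (not in 'skip') from each until 'limit' is reached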
+ for (auto s : subtrees) {
+ if (limit <= 0)
+ break;
+ if (crush->subtree_contains(s, n))
+ continue;
+ vector<int> osds;
+ crush->get_children_of_type(s, 0, &osds);
+ if (osds.empty())
+ continue;
+ vector<int> up_osds;
+ for (auto o : osds) {
+ if (is_up(o) && !skip.count(o))
+ up_osds.push_back(o);
+ }
+ if (up_osds.empty())
+ continue;
+ auto it = up_osds.begin();
+ std::advance(it, (n % up_osds.size()));
+ want->insert(*it);
+ --limit;
+ }
+}
+
+float OSDMap::pool_raw_used_rate(int64_t poolid) const
+{
+ const pg_pool_t *pool = get_pg_pool(poolid);
+ assert(pool != nullptr);
+
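+  // raw-space multiplier for this pool: a replicated pool of size 3 uses
+  // 3x the logical bytes, while an erasure-coded pool uses (k+m)/k, e.g.
+  // a hypothetical k=4, m=2 profile yields 1.5x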
+ switch (pool->get_type()) {
+ case pg_pool_t::TYPE_REPLICATED:
+ return pool->get_size();
+ case pg_pool_t::TYPE_ERASURE:
+ {
+ auto& ecp =
+ get_erasure_code_profile(pool->erasure_code_profile);
+ auto pm = ecp.find("m");
+ auto pk = ecp.find("k");
+ if (pm != ecp.end() && pk != ecp.end()) {
+ int k = atoi(pk->second.c_str());
+ int m = atoi(pm->second.c_str());
+ int mk = m + k;
+ ceph_assert(mk != 0);
+ ceph_assert(k != 0);
+ return (float)mk / k;
+ } else {
+ return 0.0;
+ }
+ }
+ break;
+ default:
+ ceph_abort_msg("unrecognized pool type");
+ }
+}
+
+unsigned OSDMap::get_osd_crush_node_flags(int osd) const
+{
+ unsigned flags = 0;
+ if (!crush_node_flags.empty()) {
+ // the map will contain type -> name
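+    // e.g. {host: myhost, rack: myrack, root: default}; flags set on any
+    // of these ancestor crush nodes apply to this osd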
+ std::map<std::string,std::string> ploc = crush->get_full_location(osd);
+ for (auto& i : ploc) {
+ int id = crush->get_item_id(i.second);
+ auto p = crush_node_flags.find(id);
+ if (p != crush_node_flags.end()) {
+ flags |= p->second;
+ }
+ }
+ }
+ return flags;
+}
+
+unsigned OSDMap::get_crush_node_flags(int id) const
+{
+ unsigned flags = 0;
+ auto it = crush_node_flags.find(id);
+ if (it != crush_node_flags.end())
+ flags = it->second;
+ return flags;
+}
+
+unsigned OSDMap::get_device_class_flags(int id) const
+{
+ unsigned flags = 0;
+ auto it = device_class_flags.find(id);
+ if (it != device_class_flags.end())
+ flags = it->second;
+ return flags;
+}
+
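+// If every up osd already advertises a release newer than the current
+// require_osd_release, return that release name so the caller (see the
+// OSD_UPGRADE_FINISHED check above) can prompt the admin to raise it.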
+std::optional<std::string> OSDMap::pending_require_osd_release() const
+{
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_PACIFIC) &&
+ require_osd_release < ceph_release_t::pacific) {
+ return "pacific";
+ }
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_OCTOPUS) &&
+ require_osd_release < ceph_release_t::octopus) {
+ return "octopus";
+ }
+ if (HAVE_FEATURE(get_up_osd_features(), SERVER_NAUTILUS) &&
+ require_osd_release < ceph_release_t::nautilus) {
+ return "nautilus";
+ }
+
+ return std::nullopt;
+}
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
new file mode 100644
index 000000000..83ab75e0d
--- /dev/null
+++ b/src/osd/OSDMap.h
@@ -0,0 +1,1600 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_OSDMAP_H
+#define CEPH_OSDMAP_H
+
+/*
+ * describe properties of the OSD cluster.
+ * disks, disk groups, total # osds,
+ *
+ */
+#include <vector>
+#include <list>
+#include <set>
+#include <map>
+#include <memory>
+
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#include "include/btree_map.h"
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "common/ceph_releases.h"
+#include "osd_types.h"
+
+//#include "include/ceph_features.h"
+#include "crush/CrushWrapper.h"
+
+// forward declaration
+class CrushWrapper;
+class health_check_map_t;
+
+/*
+ * we track up to two intervals during which the osd was alive and
+ * healthy. the most recent is [up_from,up_thru), where up_thru is
+ * the last epoch the osd is known to have _started_. i.e., a lower
+ * bound on the actual osd death. down_at (if it is > up_from) is an
+ * upper bound on the actual osd death.
+ *
+ * the second is the last_clean interval [begin,end). here, 'end' is
+ * the last epoch known to have been either _finished_, or during
+ * which the osd cleanly shut down. when
+ * possible, we push this forward to the epoch the osd was eventually
+ * marked down.
+ *
+ * the lost_at is used to allow build_prior to proceed without waiting
+ * for an osd to recover. In certain cases, progress may be blocked
+ * because an osd is down that may contain updates (i.e., a pg may have
+ * gone rw during an interval). If the osd can't be brought online, we
+ * can force things to proceed knowing that we _might_ be losing some
+ * acked writes. If the osd comes back to life later, that's fine too,
+ * but those writes will still be lost (the divergent objects will be
+ * thrown out).
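+ *
+ * a purely illustrative example: an osd with up_from=10, up_thru=15 and
+ * down_at=20 is known to have been alive at epoch 15 and was marked
+ * down at epoch 20, so its actual death lies somewhere in [15,20].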
+ */
+struct osd_info_t {
+ epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown
+ epoch_t last_clean_end;
+ epoch_t up_from; // epoch osd marked up
+ epoch_t up_thru; // lower bound on actual osd death (if > up_from)
+ epoch_t down_at; // upper bound on actual osd death (if > up_from)
+ epoch_t lost_at; // last epoch we decided data was "lost"
+
+ osd_info_t() : last_clean_begin(0), last_clean_end(0),
+ up_from(0), up_thru(0), down_at(0), lost_at(0) {}
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<osd_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(osd_info_t)
+
+std::ostream& operator<<(std::ostream& out, const osd_info_t& info);
+
+struct osd_xinfo_t {
+ utime_t down_stamp; ///< timestamp when we were last marked down
+ float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
+ __u32 laggy_interval; ///< average interval between being marked laggy and recovering
+ uint64_t features; ///< features supported by this osd we should know about
+ __u32 old_weight; ///< weight prior to being auto marked out
+ utime_t last_purged_snaps_scrub; ///< last scrub of purged_snaps
+ epoch_t dead_epoch = 0; ///< last epoch we were confirmed dead (not just down)
+
+ osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
+ features(0), old_weight(0) {}
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<osd_xinfo_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(osd_xinfo_t)
+
+std::ostream& operator<<(std::ostream& out, const osd_xinfo_t& xi);
+
+
+struct PGTempMap {
+#if 1
+ ceph::buffer::list data;
+ typedef btree::btree_map<pg_t,ceph_le32*> map_t;
+ map_t map;
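+  // 'data' holds every encoded mapping back to back; 'map' indexes it
+  // with raw pointers into that buffer, each pointing at a ceph_le32
+  // count followed by that many ceph_le32 osd ids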
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ uint32_t n = map.size();
+ encode(n, bl);
+ for (auto &p : map) {
+ encode(p.first, bl);
+ bl.append((char*)p.second, (*p.second + 1) * sizeof(ceph_le32));
+ }
+ }
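+  // wire format written above: a u32 entry count, then for each entry
+  // the pg_t followed by its length-prefixed array of ceph_le32 osd ids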
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ data.clear();
+ map.clear();
+ uint32_t n;
+ decode(n, p);
+ if (!n)
+ return;
+ auto pstart = p;
+ size_t start_off = pstart.get_off();
+ std::vector<std::pair<pg_t,size_t>> offsets;
+ offsets.resize(n);
+ for (unsigned i=0; i<n; ++i) {
+ pg_t pgid;
+ decode(pgid, p);
+ offsets[i].first = pgid;
+ offsets[i].second = p.get_off() - start_off;
+ uint32_t vn;
+ decode(vn, p);
+ p += vn * sizeof(int32_t);
+ }
+ size_t len = p.get_off() - start_off;
+ pstart.copy(len, data);
+ if (data.get_num_buffers() > 1) {
+ data.rebuild();
+ }
+ //map.reserve(n);
+ char *start = data.c_str();
+ for (auto i : offsets) {
+ map.insert(map.end(), std::make_pair(i.first, (ceph_le32*)(start + i.second)));
+ }
+ }
+ void rebuild() {
+ ceph::buffer::list bl;
+ encode(bl);
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.map.size() == r.map.size() &&
+ l.data.contents_equal(r.data);
+ }
+
+ class iterator {
+ map_t::const_iterator it;
+ map_t::const_iterator end;
+ std::pair<pg_t,std::vector<int32_t>> current;
+ void init_current() {
+ if (it != end) {
+ current.first = it->first;
+ ceph_assert(it->second);
+ current.second.resize(*it->second);
+ ceph_le32 *p = it->second + 1;
+ for (uint32_t n = 0; n < *it->second; ++n, ++p) {
+ current.second[n] = *p;
+ }
+ }
+ }
+ public:
+ iterator(map_t::const_iterator p,
+ map_t::const_iterator e)
+ : it(p), end(e) {
+ init_current();
+ }
+
+ const std::pair<pg_t,std::vector<int32_t>>& operator*() const {
+ return current;
+ }
+ const std::pair<pg_t,std::vector<int32_t>>* operator->() const {
+ return &current;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ if (it != end)
+ init_current();
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ if (it != end)
+ init_current();
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(map.begin(), map.end());
+ }
+ iterator end() const {
+ return iterator(map.end(), map.end());
+ }
+ iterator find(pg_t pgid) const {
+ return iterator(map.find(pgid), map.end());
+ }
+ size_t size() const {
+ return map.size();
+ }
+ size_t count(pg_t pgid) const {
+ return map.count(pgid);
+ }
+ void erase(pg_t pgid) {
+ map.erase(pgid);
+ }
+ void clear() {
+ map.clear();
+ data.clear();
+ }
+ void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+ using ceph::encode;
+ size_t need = sizeof(ceph_le32) * (1 + v.size());
+ if (need < data.get_append_buffer_unused_tail_length()) {
+ ceph::buffer::ptr z(data.get_append_buffer_unused_tail_length());
+ z.zero();
+ data.append(z.c_str(), z.length());
+ }
+ encode(v, data);
+ map[pgid] = (ceph_le32*)(data.back().end_c_str()) - (1 + v.size());
+ }
+ mempool::osdmap::vector<int32_t> get(pg_t pgid) {
+ mempool::osdmap::vector<int32_t> v;
+ ceph_le32 *p = map[pgid];
+ size_t n = *p++;
+ v.resize(n);
+ for (size_t i = 0; i < n; ++i, ++p) {
+ v[i] = *p;
+ }
+ return v;
+ }
+#else
+ // trivial implementation
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > pg_temp;
+
+ void encode(ceph::buffer::list& bl) const {
+ encode(pg_temp, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ decode(pg_temp, p);
+ }
+ friend bool operator==(const PGTempMap& l, const PGTempMap& r) {
+ return
+ l.pg_temp.size() == r.pg_temp.size() &&
+ l.pg_temp == r.pg_temp;
+ }
+
+ class iterator {
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >::const_iterator it;
+ public:
+ iterator(mempool::osdmap::map<pg_t,
+ mempool::osdmap::vector<int32_t> >::const_iterator p)
+ : it(p) {}
+
+ std::pair<pg_t,const mempool::osdmap::vector<int32_t>&> operator*() const {
+ return *it;
+ }
+ const std::pair<const pg_t,mempool::osdmap::vector<int32_t>>* operator->() const {
+ return &*it;
+ }
+ friend bool operator==(const iterator& l, const iterator& r) {
+ return l.it == r.it;
+ }
+ friend bool operator!=(const iterator& l, const iterator& r) {
+ return l.it != r.it;
+ }
+ iterator& operator++() {
+ ++it;
+ return *this;
+ }
+ iterator operator++(int) {
+ iterator r = *this;
+ ++it;
+ return r;
+ }
+ };
+ iterator begin() const {
+ return iterator(pg_temp.cbegin());
+ }
+ iterator end() const {
+ return iterator(pg_temp.cend());
+ }
+ iterator find(pg_t pgid) const {
+ return iterator(pg_temp.find(pgid));
+ }
+ size_t size() const {
+ return pg_temp.size();
+ }
+ size_t count(pg_t pgid) const {
+ return pg_temp.count(pgid);
+ }
+ void erase(pg_t pgid) {
+ pg_temp.erase(pgid);
+ }
+ void clear() {
+ pg_temp.clear();
+ }
+ void set(pg_t pgid, const mempool::osdmap::vector<int32_t>& v) {
+ pg_temp[pgid] = v;
+ }
+ const mempool::osdmap::vector<int32_t>& get(pg_t pgid) {
+ return pg_temp.at(pgid);
+ }
+#endif
+ void dump(ceph::Formatter *f) const {
+ for (const auto &pg : *this) {
+ f->open_object_section("osds");
+ f->dump_stream("pgid") << pg.first;
+ f->open_array_section("osds");
+ for (const auto osd : pg.second)
+ f->dump_int("osd", osd);
+ f->close_section();
+ f->close_section();
+ }
+ }
+};
+WRITE_CLASS_ENCODER(PGTempMap)
+
+/** OSDMap
+ */
+class OSDMap {
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ class Incremental {
+ public:
+ MEMPOOL_CLASS_HELPERS();
+
+ /// feature bits we were encoded with. the subsequent OSDMap
+ /// encoding should match.
+ uint64_t encode_features;
+ uuid_d fsid;
+ epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
+ utime_t modified;
+    int64_t new_pool_max; // incremented by the OSDMonitor on each pool create
+ int32_t new_flags;
+ ceph_release_t new_require_osd_release{0xff};
+ uint32_t new_stretch_bucket_count{0};
+ uint32_t new_degraded_stretch_mode{0};
+ uint32_t new_recovering_stretch_mode{0};
+ int32_t new_stretch_mode_bucket{0};
+ bool stretch_mode_enabled{false};
+ bool change_stretch_mode{false};
+
+ // full (rare)
+ ceph::buffer::list fullmap; // in lieu of below.
+ ceph::buffer::list crush;
+
+ // incremental
+ int32_t new_max_osd;
+ mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
+ mempool::osdmap::map<int64_t,std::string> new_pool_names;
+ mempool::osdmap::set<int64_t> old_pools;
+ mempool::osdmap::map<std::string,std::map<std::string,std::string> > new_erasure_code_profiles;
+ mempool::osdmap::vector<std::string> old_erasure_code_profiles;
+ mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_client;
+ mempool::osdmap::map<int32_t,entity_addrvec_t> new_up_cluster;
+ mempool::osdmap::map<int32_t,uint32_t> new_state; // XORed onto previous state.
+ mempool::osdmap::map<int32_t,uint32_t> new_weight;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp; // [] to remove
+ mempool::osdmap::map<pg_t, int32_t> new_primary_temp; // [-1] to remove
+ mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
+ mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
+ mempool::osdmap::map<int32_t,std::pair<epoch_t,epoch_t> > new_last_clean_interval;
+ mempool::osdmap::map<int32_t,epoch_t> new_lost;
+ mempool::osdmap::map<int32_t,uuid_d> new_uuid;
+ mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;
+
+ mempool::osdmap::map<entity_addr_t,utime_t> new_blocklist;
+ mempool::osdmap::vector<entity_addr_t> old_blocklist;
+ mempool::osdmap::map<entity_addr_t,utime_t> new_range_blocklist;
+ mempool::osdmap::vector<entity_addr_t> old_range_blocklist;
+ mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_back_up;
+ mempool::osdmap::map<int32_t, entity_addrvec_t> new_hb_front_up;
+
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> new_pg_upmap_items;
+ mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
+
+ mempool::osdmap::map<int32_t,uint32_t> new_crush_node_flags;
+ mempool::osdmap::map<int32_t,uint32_t> new_device_class_flags;
+
+ std::string cluster_snapshot;
+
+ float new_nearfull_ratio = -1;
+ float new_backfillfull_ratio = -1;
+ float new_full_ratio = -1;
+
+ ceph_release_t new_require_min_compat_client{0xff};
+
+ utime_t new_last_up_change, new_last_in_change;
+
+ mutable bool have_crc; ///< crc values are defined
+ uint32_t full_crc; ///< crc of the resulting OSDMap
+ mutable uint32_t inc_crc; ///< crc of this incremental
+
+ int get_net_marked_out(const OSDMap *previous) const;
+ int get_net_marked_down(const OSDMap *previous) const;
+ int identify_osd(uuid_d u) const;
+
+ void encode_client_old(ceph::buffer::list& bl) const;
+ void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
+ void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
+ void decode_classic(ceph::buffer::list::const_iterator &p);
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<Incremental*>& o);
+
+ explicit Incremental(epoch_t e=0) :
+ encode_features(0),
+ epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
+ have_crc(false), full_crc(0), inc_crc(0) {
+ }
+ explicit Incremental(ceph::buffer::list &bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ explicit Incremental(ceph::buffer::list::const_iterator &p) {
+ decode(p);
+ }
+
+ pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
+ if (new_pools.count(pool) == 0)
+ new_pools[pool] = *orig;
+ return &new_pools[pool];
+ }
+ bool has_erasure_code_profile(const std::string &name) const {
+ auto i = new_erasure_code_profiles.find(name);
+ return i != new_erasure_code_profiles.end();
+ }
+ void set_erasure_code_profile(const std::string &name,
+ const std::map<std::string,std::string>& profile) {
+ new_erasure_code_profiles[name] = profile;
+ }
+ mempool::osdmap::map<std::string,std::map<std::string,std::string>> get_erasure_code_profiles() const {
+ return new_erasure_code_profiles;
+ }
+
+    /// propagate updated pools' (snap and other) metadata to any of their tiers
+ int propagate_base_properties_to_tiers(CephContext *cct, const OSDMap &base);
+
+ /// filter out osds with any pending state changing
+ size_t get_pending_state_osds(std::vector<int> *osds) {
+ ceph_assert(osds);
+ osds->clear();
+
+ for (auto &p : new_state) {
+ osds->push_back(p.first);
+ }
+
+ return osds->size();
+ }
+
+ bool pending_osd_has_state(int osd, unsigned state) {
+ return new_state.count(osd) && (new_state[osd] & state) != 0;
+ }
+
+ bool pending_osd_state_set(int osd, unsigned state) {
+ if (pending_osd_has_state(osd, state))
+ return false;
+ new_state[osd] |= state;
+ return true;
+ }
+
+ // cancel the specified pending osd state if there is any
+    // return true on success, false otherwise.
+ bool pending_osd_state_clear(int osd, unsigned state) {
+ if (!pending_osd_has_state(osd, state)) {
+ // never has been set or already has been cancelled.
+ return false;
+ }
+
+ new_state[osd] &= ~state;
+ if (!new_state[osd]) {
+ // all flags cleared
+ new_state.erase(osd);
+ }
+ return true;
+ }
+
+ bool in_new_removed_snaps(int64_t pool, snapid_t snap) const {
+ auto p = new_removed_snaps.find(pool);
+ if (p == new_removed_snaps.end()) {
+ return false;
+ }
+ return p->second.contains(snap);
+ }
+ };
+
+private:
+ uuid_d fsid;
+ epoch_t epoch; // what epoch of the osd cluster descriptor is this
+ utime_t created, modified; // epoch start time
+ int32_t pool_max; // the largest pool num, ever
+
+ uint32_t flags;
+
+ int num_osd; // not saved; see calc_num_osds
+ int num_up_osd; // not saved; see calc_num_osds
+ int num_in_osd; // not saved; see calc_num_osds
+
+ int32_t max_osd;
+ std::vector<uint32_t> osd_state;
+
+ mempool::osdmap::map<int32_t,uint32_t> crush_node_flags; // crush node -> CEPH_OSD_* flags
+ mempool::osdmap::map<int32_t,uint32_t> device_class_flags; // device class -> CEPH_OSD_* flags
+
+ utime_t last_up_change, last_in_change;
+
+ // These features affect OSDMap[::Incremental] encoding, or the
+ // encoding of some type embedded therein (CrushWrapper, something
+ // from osd_types, etc.).
+ static constexpr uint64_t SIGNIFICANT_FEATURES =
+ CEPH_FEATUREMASK_PGID64 |
+ CEPH_FEATUREMASK_PGPOOL3 |
+ CEPH_FEATUREMASK_OSDENC |
+ CEPH_FEATUREMASK_OSDMAP_ENC |
+ CEPH_FEATUREMASK_OSD_POOLRESEND |
+ CEPH_FEATUREMASK_NEW_OSDOP_ENCODING |
+ CEPH_FEATUREMASK_MSG_ADDR2 |
+ CEPH_FEATUREMASK_CRUSH_TUNABLES5 |
+ CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS |
+ CEPH_FEATUREMASK_SERVER_LUMINOUS |
+ CEPH_FEATUREMASK_SERVER_MIMIC |
+ CEPH_FEATUREMASK_SERVER_NAUTILUS |
+ CEPH_FEATUREMASK_SERVER_OCTOPUS;
+
+ struct addrs_s {
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > client_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > cluster_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_back_addrs;
+ mempool::osdmap::vector<std::shared_ptr<entity_addrvec_t> > hb_front_addrs;
+ };
+ std::shared_ptr<addrs_s> osd_addrs;
+
+ entity_addrvec_t _blank_addrvec;
+
+ mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out"
+ mempool::osdmap::vector<osd_info_t> osd_info;
+ std::shared_ptr<PGTempMap> pg_temp; // temp pg mapping (e.g. while we rebuild)
+ std::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp; // temp primary mapping (e.g. while we rebuild)
+ std::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline
+
+ // remap (post-CRUSH, pre-up)
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
+ mempool::osdmap::map<pg_t,mempool::osdmap::vector<std::pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set
+
+ mempool::osdmap::map<int64_t,pg_pool_t> pools;
+ mempool::osdmap::map<int64_t,std::string> pool_name;
+ mempool::osdmap::map<std::string, std::map<std::string,std::string>> erasure_code_profiles;
+ mempool::osdmap::map<std::string,int64_t, std::less<>> name_pool;
+
+ std::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
+ mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;
+
+ class range_bits {
+ struct ip6 {
+ uint64_t upper_64_bits, lower_64_bits;
+ uint64_t upper_mask, lower_mask;
+ };
+ struct ip4 {
+ uint32_t ip_32_bits;
+ uint32_t mask;
+ };
+ union {
+ ip6 ipv6;
+ ip4 ipv4;
+ } bits;
+ bool ipv6;
+ static void get_ipv6_bytes(unsigned const char *addr,
+ uint64_t *upper, uint64_t *lower);
+ public:
+ range_bits();
+ range_bits(const entity_addr_t& addr);
+ void parse(const entity_addr_t& addr);
+ bool matches(const entity_addr_t& addr) const;
+ };
+ mempool::osdmap::unordered_map<entity_addr_t,utime_t> blocklist;
+ mempool::osdmap::map<entity_addr_t,utime_t> range_blocklist;
+ mempool::osdmap::map<entity_addr_t,range_bits> calculated_ranges;
+
+ /// queue of snaps to remove
+ mempool::osdmap::map<int64_t, snap_interval_set_t> removed_snaps_queue;
+
+ /// removed_snaps additions this epoch
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_removed_snaps;
+
+ /// removed_snaps removals this epoch
+ mempool::osdmap::map<int64_t, snap_interval_set_t> new_purged_snaps;
+
+ epoch_t cluster_snapshot_epoch;
+ std::string cluster_snapshot;
+ bool new_blocklist_entries;
+
+ float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;
+
+ /// min compat client we want to support
+ ceph_release_t require_min_compat_client{ceph_release_t::unknown};
+
+public:
+ /// require osds to run at least this release
+ ceph_release_t require_osd_release{ceph_release_t::unknown};
+
+private:
+ mutable uint64_t cached_up_osd_features;
+
+ mutable bool crc_defined;
+ mutable uint32_t crc;
+
+ void _calc_up_osd_features();
+
+ public:
+ bool have_crc() const { return crc_defined; }
+ uint32_t get_crc() const { return crc; }
+
+ std::shared_ptr<CrushWrapper> crush; // hierarchical map
+ bool stretch_mode_enabled; // we are in stretch mode, requiring multiple sites
+ uint32_t stretch_bucket_count; // number of sites we expect to be in
+ uint32_t degraded_stretch_mode; // 0 if not degraded; else count of up sites
+ uint32_t recovering_stretch_mode; // 0 if not recovering; else 1
+ int32_t stretch_mode_bucket; // the bucket type we're stretched across
+private:
+ uint32_t crush_version = 1;
+
+ friend class OSDMonitor;
+
+ public:
+ OSDMap() : epoch(0),
+ pool_max(0),
+ flags(0),
+ num_osd(0), num_up_osd(0), num_in_osd(0),
+ max_osd(0),
+ osd_addrs(std::make_shared<addrs_s>()),
+ pg_temp(std::make_shared<PGTempMap>()),
+ primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
+ osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
+ cluster_snapshot_epoch(0),
+ new_blocklist_entries(false),
+ cached_up_osd_features(0),
+ crc_defined(false), crc(0),
+ crush(std::make_shared<CrushWrapper>()),
+ stretch_mode_enabled(false), stretch_bucket_count(0),
+ degraded_stretch_mode(0), recovering_stretch_mode(0), stretch_mode_bucket(0) {
+ }
+
+private:
+ OSDMap(const OSDMap& other) = default;
+ OSDMap& operator=(const OSDMap& other) = default;
+public:
+
+ /// return feature mask subset that is relevant to OSDMap encoding
+ static uint64_t get_significant_features(uint64_t features) {
+ return SIGNIFICANT_FEATURES & features;
+ }
+
+ uint64_t get_encoding_features() const;
+
+ void deepish_copy_from(const OSDMap& o) {
+ *this = o;
+ primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
+ pg_temp.reset(new PGTempMap(*o.pg_temp));
+ osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));
+
+ if (o.osd_primary_affinity)
+ osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));
+
+ // NOTE: this still references shared entity_addrvec_t's.
+ osd_addrs.reset(new addrs_s(*o.osd_addrs));
+
+ // NOTE: we do not copy crush. note that apply_incremental will
+ // allocate a new CrushWrapper, though.
+ }
+
+ // map info
+ const uuid_d& get_fsid() const { return fsid; }
+ void set_fsid(uuid_d& f) { fsid = f; }
+
+ epoch_t get_epoch() const { return epoch; }
+ void inc_epoch() { epoch++; }
+
+ void set_epoch(epoch_t e);
+
+ uint32_t get_crush_version() const {
+ return crush_version;
+ }
+
+ /* stamps etc */
+ const utime_t& get_created() const { return created; }
+ const utime_t& get_modified() const { return modified; }
+
+ bool is_blocklisted(const entity_addr_t& a, CephContext *cct=nullptr) const;
+ bool is_blocklisted(const entity_addrvec_t& a, CephContext *cct=nullptr) const;
+ void get_blocklist(std::list<std::pair<entity_addr_t,utime_t > > *bl,
+ std::list<std::pair<entity_addr_t,utime_t> > *rl) const;
+ void get_blocklist(std::set<entity_addr_t> *bl,
+ std::set<entity_addr_t> *rl) const;
+
+ std::string get_cluster_snapshot() const {
+ if (cluster_snapshot_epoch == epoch)
+ return cluster_snapshot;
+ return std::string();
+ }
+
+ float get_full_ratio() const {
+ return full_ratio;
+ }
+ float get_backfillfull_ratio() const {
+ return backfillfull_ratio;
+ }
+ float get_nearfull_ratio() const {
+ return nearfull_ratio;
+ }
+ void get_full_pools(CephContext *cct,
+ std::set<int64_t> *full,
+ std::set<int64_t> *backfillfull,
+ std::set<int64_t> *nearfull) const;
+ void get_full_osd_counts(std::set<int> *full, std::set<int> *backfill,
+ std::set<int> *nearfull) const;
+
+
+ /***** cluster state *****/
+ /* osds */
+ int get_max_osd() const { return max_osd; }
+ void set_max_osd(int m);
+
+ unsigned get_num_osds() const {
+ return num_osd;
+ }
+ unsigned get_num_up_osds() const {
+ return num_up_osd;
+ }
+ unsigned get_num_in_osds() const {
+ return num_in_osd;
+ }
+ /// recalculate cached values for get_num{,_up,_in}_osds
+ int calc_num_osds();
+
+ void get_all_osds(std::set<int32_t>& ls) const;
+ void get_up_osds(std::set<int32_t>& ls) const;
+ void get_out_existing_osds(std::set<int32_t>& ls) const;
+ unsigned get_num_pg_temp() const {
+ return pg_temp->size();
+ }
+
+ int get_flags() const { return flags; }
+ bool test_flag(int f) const { return flags & f; }
+ void set_flag(int f) { flags |= f; }
+ void clear_flag(int f) { flags &= ~f; }
+
+ void get_flag_set(std::set<std::string> *flagset) const;
+
+ static void calc_state_set(int state, std::set<std::string>& st);
+
+ int get_state(int o) const {
+ ceph_assert(o < max_osd);
+ return osd_state[o];
+ }
+ int get_state(int o, std::set<std::string>& st) const {
+ ceph_assert(o < max_osd);
+ unsigned t = osd_state[o];
+ calc_state_set(t, st);
+ return osd_state[o];
+ }
+ void set_state(int o, unsigned s) {
+ ceph_assert(o < max_osd);
+ osd_state[o] = s;
+ }
+ void set_weight(int o, unsigned w) {
+ ceph_assert(o < max_osd);
+ osd_weight[o] = w;
+ if (w)
+ osd_state[o] |= CEPH_OSD_EXISTS;
+ }
+ unsigned get_weight(int o) const {
+ ceph_assert(o < max_osd);
+ return osd_weight[o];
+ }
+ float get_weightf(int o) const {
+ return (float)get_weight(o) / (float)CEPH_OSD_IN;
+ }
+ void adjust_osd_weights(const std::map<int,double>& weights, Incremental& inc) const;
+
+ void set_primary_affinity(int o, int w) {
+ ceph_assert(o < max_osd);
+ if (!osd_primary_affinity)
+ osd_primary_affinity.reset(
+ new mempool::osdmap::vector<__u32>(
+ max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
+ (*osd_primary_affinity)[o] = w;
+ }
+ unsigned get_primary_affinity(int o) const {
+ ceph_assert(o < max_osd);
+ if (!osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ return (*osd_primary_affinity)[o];
+ }
+ float get_primary_affinityf(int o) const {
+ return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
+ }
+
+ bool has_erasure_code_profile(const std::string &name) const {
+ auto i = erasure_code_profiles.find(name);
+ return i != erasure_code_profiles.end();
+ }
+ int get_erasure_code_profile_default(CephContext *cct,
+ std::map<std::string,std::string> &profile_map,
+ std::ostream *ss);
+ void set_erasure_code_profile(const std::string &name,
+ const std::map<std::string,std::string>& profile) {
+ erasure_code_profiles[name] = profile;
+ }
+ const std::map<std::string,std::string> &get_erasure_code_profile(
+ const std::string &name) const {
+ static std::map<std::string,std::string> empty;
+ auto i = erasure_code_profiles.find(name);
+ if (i == erasure_code_profiles.end())
+ return empty;
+ else
+ return i->second;
+ }
+ const mempool::osdmap::map<std::string,std::map<std::string,std::string>> &get_erasure_code_profiles() const {
+ return erasure_code_profiles;
+ }
+
+ bool exists(int osd) const {
+ //assert(osd >= 0);
+ return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
+ }
+
+ bool is_destroyed(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED);
+ }
+
+ bool is_up(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
+ }
+
+ bool has_been_up_since(int osd, epoch_t epoch) const {
+ return is_up(osd) && get_up_from(osd) <= epoch;
+ }
+
+ bool is_down(int osd) const {
+ return !is_up(osd);
+ }
+
+ bool is_stop(int osd) const {
+ return exists(osd) && is_down(osd) &&
+ (osd_state[osd] & CEPH_OSD_STOP);
+ }
+
+ bool is_out(int osd) const {
+ return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
+ }
+
+ bool is_in(int osd) const {
+ return !is_out(osd);
+ }
+
+ bool is_dead(int osd) const {
+ if (!exists(osd)) {
+ return false; // unclear if they know they are removed from map
+ }
+ return get_xinfo(osd).dead_epoch > get_info(osd).up_from;
+ }
+
+ unsigned get_osd_crush_node_flags(int osd) const;
+ unsigned get_crush_node_flags(int id) const;
+ unsigned get_device_class_flags(int id) const;
+
+ bool is_noup_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP);
+ }
+
+ bool is_nodown_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN);
+ }
+
+ bool is_noin_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN);
+ }
+
+ bool is_noout_by_osd(int osd) const {
+ return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT);
+ }
+
+ bool is_noup(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOUP)) // global?
+ return true;
+ if (is_noup_by_osd(osd)) // by osd?
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOUP) // by crush-node?
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOUP) // by device-class?
+ return true;
+ return false;
+ }
+
+ bool is_nodown(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NODOWN))
+ return true;
+ if (is_nodown_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NODOWN)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NODOWN)
+ return true;
+ return false;
+ }
+
+ bool is_noin(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOIN))
+ return true;
+ if (is_noin_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOIN)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOIN)
+ return true;
+ return false;
+ }
+
+ bool is_noout(int osd) const {
+ if (test_flag(CEPH_OSDMAP_NOOUT))
+ return true;
+ if (is_noout_by_osd(osd))
+ return true;
+ if (get_osd_crush_node_flags(osd) & CEPH_OSD_NOOUT)
+ return true;
+ if (auto class_id = crush->get_item_class_id(osd); class_id >= 0 &&
+ get_device_class_flags(class_id) & CEPH_OSD_NOOUT)
+ return true;
+ return false;
+ }
+
+ /**
+ * check if an entire crush subtree is down
+ */
+ bool subtree_is_down(int id, std::set<int> *down_cache) const;
+ bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, std::set<int> *down_cache) const;
+
+ bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, std::set<int> *down_in_osds, std::set<int> *up_in_osds,
+ std::set<int> *subtree_up, std::unordered_map<int, std::set<int> > *subtree_type_down) const;
+
+ int identify_osd(const entity_addr_t& addr) const;
+ int identify_osd(const uuid_d& u) const;
+ int identify_osd_on_all_channels(const entity_addr_t& addr) const;
+
+ bool have_addr(const entity_addr_t& addr) const {
+ return identify_osd(addr) >= 0;
+ }
+ int find_osd_on_ip(const entity_addr_t& ip) const;
+
+ const entity_addrvec_t& get_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->client_addrs[osd] ?
+ *osd_addrs->client_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t& get_most_recent_addrs(int osd) const {
+ return get_addrs(osd);
+ }
+ const entity_addrvec_t &get_cluster_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->cluster_addrs[osd] ?
+ *osd_addrs->cluster_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t &get_hb_back_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->hb_back_addrs[osd] ?
+ *osd_addrs->hb_back_addrs[osd] : _blank_addrvec;
+ }
+ const entity_addrvec_t &get_hb_front_addrs(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_addrs->hb_front_addrs[osd] ?
+ *osd_addrs->hb_front_addrs[osd] : _blank_addrvec;
+ }
+
+ const uuid_d& get_uuid(int osd) const {
+ ceph_assert(exists(osd));
+ return (*osd_uuid)[osd];
+ }
+
+ const epoch_t& get_up_from(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].up_from;
+ }
+ const epoch_t& get_up_thru(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].up_thru;
+ }
+ const epoch_t& get_down_at(int osd) const {
+ ceph_assert(exists(osd));
+ return osd_info[osd].down_at;
+ }
+ const osd_info_t& get_info(int osd) const {
+ ceph_assert(osd < max_osd);
+ return osd_info[osd];
+ }
+
+ const osd_xinfo_t& get_xinfo(int osd) const {
+ ceph_assert(osd < max_osd);
+ return osd_xinfo[osd];
+ }
+
+ int get_next_up_osd_after(int n) const {
+ if (get_max_osd() == 0)
+ return -1;
+ for (int i = n + 1; i != n; ++i) {
+ if (i >= get_max_osd())
+ i = 0;
+ if (i == n)
+ break;
+ if (is_up(i))
+ return i;
+ }
+ return -1;
+ }
+
+ int get_previous_up_osd_before(int n) const {
+ if (get_max_osd() == 0)
+ return -1;
+ for (int i = n - 1; i != n; --i) {
+ if (i < 0)
+ i = get_max_osd() - 1;
+ if (i == n)
+ break;
+ if (is_up(i))
+ return i;
+ }
+ return -1;
+ }
+
+
+ void get_random_up_osds_by_subtree(int n, // whoami
+ std::string &subtree,
+ int limit, // how many
+ std::set<int> skip,
+ std::set<int> *want) const;
+
+ /**
+ * get feature bits required by the current structure
+ *
+ * @param entity_type [in] what entity type we are asking about
+   * @param mask [out] set of all possible map-related features we could set
+ * @return feature bits used by this map
+ */
+ uint64_t get_features(int entity_type, uint64_t *mask) const;
+
+ /**
+ * get oldest *client* version (firefly, hammer, etc.) that can connect given
+ * the feature bits required (according to get_features()).
+ */
+ ceph_release_t get_min_compat_client() const;
+
+ /**
+ * gets the required minimum *client* version that can connect to the cluster.
+ */
+ ceph_release_t get_require_min_compat_client() const;
+
+ /**
+ * get intersection of features supported by up osds
+ */
+ uint64_t get_up_osd_features() const;
+
+ void get_upmap_pgs(std::vector<pg_t> *upmap_pgs) const;
+ bool check_pg_upmaps(
+ CephContext *cct,
+ const std::vector<pg_t>& to_check,
+ std::vector<pg_t> *to_cancel,
+ std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>> *to_remap) const;
+ void clean_pg_upmaps(
+ CephContext *cct,
+ Incremental *pending_inc,
+ const std::vector<pg_t>& to_cancel,
+ const std::map<pg_t, mempool::osdmap::vector<std::pair<int,int>>>& to_remap) const;
+ bool clean_pg_upmaps(CephContext *cct, Incremental *pending_inc) const;
+
+ int apply_incremental(const Incremental &inc);
+
+ /// let newmap re-use (reference) matching addrs already present in oldmap
+ static void dedup(const OSDMap *oldmap, OSDMap *newmap);
+
+ static void clean_temps(CephContext *cct,
+ const OSDMap& oldmap,
+ const OSDMap& nextmap,
+ Incremental *pending_inc);
+
+ // serialize, unserialize
+private:
+ void encode_client_old(ceph::buffer::list& bl) const;
+ void encode_classic(ceph::buffer::list& bl, uint64_t features) const;
+ void decode_classic(ceph::buffer::list::const_iterator& p);
+ void post_decode();
+public:
+ void encode(ceph::buffer::list& bl, uint64_t features=CEPH_FEATURES_ALL) const;
+ void decode(ceph::buffer::list& bl);
+ void decode(ceph::buffer::list::const_iterator& bl);
+
+
+ /**** mapping facilities ****/
+ int map_to_pg(
+ int64_t pool,
+ const std::string& name,
+ const std::string& key,
+ const std::string& nspace,
+ pg_t *pg) const;
+ int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
+ pg_t &pg) const;
+ pg_t object_locator_to_pg(const object_t& oid,
+ const object_locator_t& loc) const {
+ pg_t pg;
+ int ret = object_locator_to_pg(oid, loc, pg);
+ ceph_assert(ret == 0);
+ return pg;
+ }
+
+
+ static object_locator_t file_to_object_locator(const file_layout_t& layout) {
+ return object_locator_t(layout.pool_id, layout.pool_ns);
+ }
+
+ ceph_object_layout file_to_object_layout(object_t oid,
+ file_layout_t& layout) const {
+ return make_object_layout(oid, layout.pool_id, layout.pool_ns);
+ }
+
+ ceph_object_layout make_object_layout(object_t oid, int pg_pool,
+ std::string nspace) const;
+
+ int get_pg_num(int pg_pool) const
+ {
+ const pg_pool_t *pool = get_pg_pool(pg_pool);
+ ceph_assert(NULL != pool);
+ return pool->get_pg_num();
+ }
+
+ bool pg_exists(pg_t pgid) const {
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ return p && pgid.ps() < p->get_pg_num();
+ }
+
+ int get_pg_pool_min_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_min_size();
+ }
+
+ int get_pg_pool_size(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_size();
+ }
+
+ int get_pg_pool_crush_rule(pg_t pgid) const {
+ if (!pg_exists(pgid)) {
+ return -ENOENT;
+ }
+ const pg_pool_t *p = get_pg_pool(pgid.pool());
+ ceph_assert(p);
+ return p->get_crush_rule();
+ }
+
+private:
+  /// pg -> (raw osd list)
+ void _pg_to_raw_osds(
+ const pg_pool_t& pool, pg_t pg,
+ std::vector<int> *osds,
+ ps_t *ppps) const;
+ int _pick_primary(const std::vector<int>& osds) const;
+ void _remove_nonexistent_osds(const pg_pool_t& pool, std::vector<int>& osds) const;
+
+ void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
+ std::vector<int> *osds, int *primary) const;
+
+ /// apply pg_upmap[_items] mappings
+ void _apply_upmap(const pg_pool_t& pi, pg_t pg, std::vector<int> *raw) const;
+
+  /// pg -> (up osd list)
+ void _raw_to_up_osds(const pg_pool_t& pool, const std::vector<int>& raw,
+ std::vector<int> *up) const;
+
+
+ /**
+ * Get the pg and primary temp, if they are specified.
+ * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
+ * @param temp_primary [out] Will be the value in primary_temp, or a value derived
+ * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
+ */
+ void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
+ std::vector<int> *temp_pg, int *temp_primary) const;
+
+ /**
+ * map to up and acting. Fills in whatever fields are non-NULL.
+ */
+ void _pg_to_up_acting_osds(const pg_t& pg, std::vector<int> *up, int *up_primary,
+ std::vector<int> *acting, int *acting_primary,
+ bool raw_pg_to_pg = true) const;
+
+public:
+ /***
+ * This is suitable only for looking at raw CRUSH outputs. It skips
+ * applying the temp and up checks and should not be used
+ * by anybody for data mapping purposes.
+ * raw and primary must be non-NULL
+ */
+ void pg_to_raw_osds(pg_t pg, std::vector<int> *raw, int *primary) const;
+ void pg_to_raw_upmap(pg_t pg, std::vector<int> *raw,
+ std::vector<int> *raw_upmap) const;
+  /// map a pg to its acting set
+ void pg_to_acting_osds(const pg_t& pg, std::vector<int> *acting,
+ int *acting_primary) const {
+ _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
+ }
+ void pg_to_acting_osds(pg_t pg, std::vector<int>& acting) const {
+ return pg_to_acting_osds(pg, &acting, NULL);
+ }
+ /**
+ * This does not apply temp overrides and should not be used
+ * by anybody for data mapping purposes. Specify both pointers.
+ */
+ void pg_to_raw_up(pg_t pg, std::vector<int> *up, int *primary) const;
+ /**
+ * map a pg to its acting set as well as its up set. You must use
+ * the acting set for data mapping purposes, but some users will
+ * also find the up set useful for things like deciding what to
+ * set as pg_temp.
+ * Each of these pointers must be non-NULL.
+ */
+ void pg_to_up_acting_osds(pg_t pg, std::vector<int> *up, int *up_primary,
+ std::vector<int> *acting, int *acting_primary) const {
+ _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
+ }
+ void pg_to_up_acting_osds(pg_t pg, std::vector<int>& up, std::vector<int>& acting) const {
+ int up_primary, acting_primary;
+ pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
+ }
+ bool pg_is_ec(pg_t pg) const {
+ auto i = pools.find(pg.pool());
+ ceph_assert(i != pools.end());
+ return i->second.is_erasure();
+ }
+ bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
+ auto i = get_pools().find(pgid.pool());
+ if (i == get_pools().end()) {
+ return false;
+ }
+ if (!i->second.is_erasure()) {
+ *out = spg_t(pgid);
+ return true;
+ }
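+    // for EC pools, the primary's shard id is its position in the acting set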
+ int primary;
+ std::vector<int> acting;
+ pg_to_acting_osds(pgid, &acting, &primary);
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == primary) {
+ *out = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ return false;
+ }
+ bool get_primary_shard(const pg_t& pgid, int *primary, spg_t *out) const {
+ auto i = get_pools().find(pgid.pool());
+ if (i == get_pools().end()) {
+ return false;
+ }
+ std::vector<int> acting;
+ pg_to_acting_osds(pgid, &acting, primary);
+ if (i->second.is_erasure()) {
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == *primary) {
+ *out = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ } else {
+ *out = spg_t(pgid);
+ return true;
+ }
+ return false;
+ }
+
+ bool in_removed_snaps_queue(int64_t pool, snapid_t snap) const {
+ auto p = removed_snaps_queue.find(pool);
+ if (p == removed_snaps_queue.end()) {
+ return false;
+ }
+ return p->second.contains(snap);
+ }
+
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_removed_snaps_queue() const {
+ return removed_snaps_queue;
+ }
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_new_removed_snaps() const {
+ return new_removed_snaps;
+ }
+ const mempool::osdmap::map<int64_t,snap_interval_set_t>&
+ get_new_purged_snaps() const {
+ return new_purged_snaps;
+ }
+
+ int64_t lookup_pg_pool_name(std::string_view name) const {
+ auto p = name_pool.find(name);
+ if (p == name_pool.end())
+ return -ENOENT;
+ return p->second;
+ }
+
+ int64_t get_pool_max() const {
+ return pool_max;
+ }
+ const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
+ return pools;
+ }
+ mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
+ return pools;
+ }
+ void get_pool_ids_by_rule(int rule_id, std::set<int64_t> *pool_ids) const {
+ ceph_assert(pool_ids);
+ for (auto &p: pools) {
+ if (p.second.get_crush_rule() == rule_id) {
+ pool_ids->insert(p.first);
+ }
+ }
+ }
+ void get_pool_ids_by_osd(CephContext *cct,
+ int osd,
+ std::set<int64_t> *pool_ids) const;
+ const std::string& get_pool_name(int64_t p) const {
+ auto i = pool_name.find(p);
+ ceph_assert(i != pool_name.end());
+ return i->second;
+ }
+ const mempool::osdmap::map<int64_t,std::string>& get_pool_names() const {
+ return pool_name;
+ }
+ bool have_pg_pool(int64_t p) const {
+ return pools.count(p);
+ }
+ const pg_pool_t* get_pg_pool(int64_t p) const {
+ auto i = pools.find(p);
+ if (i != pools.end())
+ return &i->second;
+ return NULL;
+ }
+ unsigned get_pg_size(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.get_size();
+ }
+ int get_pg_type(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.get_type();
+ }
+ int get_pool_crush_rule(int64_t pool_id) const {
+ auto pool = get_pg_pool(pool_id);
+ if (!pool)
+ return -ENOENT;
+ return pool->get_crush_rule();
+ }
+
+
+ pg_t raw_pg_to_pg(pg_t pg) const {
+ auto p = pools.find(pg.pool());
+ ceph_assert(p != pools.end());
+ return p->second.raw_pg_to_pg(pg);
+ }
+
+ // pg -> acting primary osd
+ int get_pg_acting_primary(pg_t pg) const {
+ int primary = -1;
+ _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
+ return primary;
+ }
+
+ /*
+ * check whether an spg_t maps to a particular osd
+ */
+ bool is_up_acting_osd_shard(spg_t pg, int osd) const {
+ std::vector<int> up, acting;
+ _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
+ if (calc_pg_role(pg_shard_t(osd, pg.shard), acting) >= 0 ||
+ calc_pg_role(pg_shard_t(osd, pg.shard), up) >= 0) {
+ return true;
+ }
+ return false;
+ }
+
+
+ static int calc_pg_role_broken(int osd, const std::vector<int>& acting, int nrep=0);
+ static int calc_pg_role(pg_shard_t who, const std::vector<int>& acting);
+ static bool primary_changed_broken(
+ int oldprimary,
+ const std::vector<int> &oldacting,
+ int newprimary,
+ const std::vector<int> &newacting);
+
+ /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
+ int get_pg_acting_role(spg_t pg, int osd) const {
+ std::vector<int> group;
+ pg_to_acting_osds(pg.pgid, group);
+ return calc_pg_role(pg_shard_t(osd, pg.shard), group);
+ }
+
+ bool try_pg_upmap(
+ CephContext *cct,
+ pg_t pg, ///< pg to potentially remap
+ const std::set<int>& overfull, ///< osds we'd want to evacuate
+ const std::vector<int>& underfull, ///< osds to move to, in order of preference
+ const std::vector<int>& more_underfull, ///< less full osds to move to, in order of preference
+ std::vector<int> *orig,
+ std::vector<int> *out); ///< resulting alternative mapping
+
+ int calc_pg_upmaps(
+ CephContext *cct,
+ uint32_t max_deviation, ///< max deviation from target (value >= 1)
+ int max_iterations, ///< max iterations to run
+    const std::set<int64_t>& pools,        ///< [optional] restrict to these pools
+ Incremental *pending_inc
+ );
+
+ int get_osds_by_bucket_name(const std::string &name, std::set<int> *osds) const;
+
+ bool have_pg_upmaps(pg_t pg) const {
+ return pg_upmap.count(pg) ||
+ pg_upmap_items.count(pg);
+ }
+
+ bool check_full(const std::set<pg_shard_t> &missing_on) const {
+ for (auto shard : missing_on) {
+ if (get_state(shard.osd) & CEPH_OSD_FULL)
+ return true;
+ }
+ return false;
+ }
+
+ /*
+ * handy helpers to build simple maps...
+ */
+ /**
+ * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
+ * it will be initialized with the specified number of OSDs in a
+ * single host. If **num_osd** is < 0 the layout of the OSD map will
+ * be built by reading the content of the configuration file.
+ *
+ * @param cct [in] core ceph context
+ * @param e [in] initial epoch
+ * @param fsid [in] id of the cluster
+ * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
+ * @return **0** on success, negative errno on error.
+ */
+private:
+ int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits,
+ bool default_pool);
+public:
+ int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd) {
+ return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false);
+ }
+ int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid,
+ int num_osd, int pg_bits, int pgp_bits) {
+ return build_simple_optioned(cct, e, fsid, num_osd,
+ pg_bits, pgp_bits, true);
+ }
+ static int _build_crush_types(CrushWrapper& crush);
+ static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
+ int num_osd, std::ostream *ss);
+ static int build_simple_crush_map_from_conf(CephContext *cct,
+ CrushWrapper& crush,
+ std::ostream *ss);
+ static int build_simple_crush_rules(
+ CephContext *cct, CrushWrapper& crush,
+ const std::string& root,
+ std::ostream *ss);
+
+ bool crush_rule_in_use(int rule_id) const;
+
+ int validate_crush_rules(CrushWrapper *crush, std::ostream *ss) const;
+
+ void clear_temp() {
+ pg_temp->clear();
+ primary_temp->clear();
+ }
+
+private:
+ void print_osd_line(int cur, std::ostream *out, ceph::Formatter *f) const;
+public:
+ void print(std::ostream& out) const;
+ void print_osd(int id, std::ostream& out) const;
+ void print_osds(std::ostream& out) const;
+ void print_pools(std::ostream& out) const;
+ void print_summary(ceph::Formatter *f, std::ostream& out,
+ const std::string& prefix, bool extra=false) const;
+ void print_oneline_summary(std::ostream& out) const;
+
+ enum {
+ DUMP_IN = 1, // only 'in' osds
+ DUMP_OUT = 2, // only 'out' osds
+ DUMP_UP = 4, // only 'up' osds
+ DUMP_DOWN = 8, // only 'down' osds
+ DUMP_DESTROYED = 16, // only 'destroyed' osds
+ };
+ void print_tree(ceph::Formatter *f, std::ostream *out,
+ unsigned dump_flags=0, std::string bucket="") const;
+
+ int summarize_mapping_stats(
+ OSDMap *newmap,
+ const std::set<int64_t> *pools,
+ std::string *out,
+ ceph::Formatter *f) const;
+
+ std::string get_flag_string() const;
+ static std::string get_flag_string(unsigned flags);
+ static void dump_erasure_code_profiles(
+ const mempool::osdmap::map<std::string,std::map<std::string,std::string> > &profiles,
+ ceph::Formatter *f);
+ void dump(ceph::Formatter *f) const;
+ void dump_osd(int id, ceph::Formatter *f) const;
+ void dump_osds(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<OSDMap*>& o);
+ bool check_new_blocklist_entries() const { return new_blocklist_entries; }
+
+ void check_health(CephContext *cct, health_check_map_t *checks) const;
+
+ int parse_osd_id_list(const std::vector<std::string>& ls,
+ std::set<int> *out,
+ std::ostream *ss) const;
+
+ float pool_raw_used_rate(int64_t poolid) const;
+ std::optional<std::string> pending_require_osd_release() const;
+
+};
+WRITE_CLASS_ENCODER_FEATURES(OSDMap)
+WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)
+
+#ifdef WITH_SEASTAR
+using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
+#else
+using OSDMapRef = std::shared_ptr<const OSDMap>;
+#endif
+
+
+inline std::ostream& operator<<(std::ostream& out, const OSDMap& m) {
+ m.print_oneline_summary(out);
+ return out;
+}
+
+class PGMap;
+
+void print_osd_utilization(const OSDMap& osdmap,
+ const PGMap& pgmap,
+ std::ostream& out,
+ ceph::Formatter *f,
+ bool tree,
+ const std::string& filter);
+
+#endif
diff --git a/src/osd/OSDMapMapping.cc b/src/osd/OSDMapMapping.cc
new file mode 100644
index 000000000..9cd1fbf58
--- /dev/null
+++ b/src/osd/OSDMapMapping.cc
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "OSDMapMapping.h"
+#include "OSDMap.h"
+
+#define dout_subsys ceph_subsys_mon
+
+#include "common/debug.h"
+
+using std::vector;
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(OSDMapMapping, osdmapmapping,
+ osdmap_mapping);
+
+// ensure that we have a PoolMapping for each pool and that
+// the dimensions (pg_num and size) match up.
+void OSDMapMapping::_init_mappings(const OSDMap& osdmap)
+{
+ num_pgs = 0;
+ auto q = pools.begin();
+ for (auto& p : osdmap.get_pools()) {
+ num_pgs += p.second.get_pg_num();
+ // drop unneeded pools
+ while (q != pools.end() && q->first < p.first) {
+ q = pools.erase(q);
+ }
+ if (q != pools.end() && q->first == p.first) {
+ if (q->second.pg_num != p.second.get_pg_num() ||
+ q->second.size != p.second.get_size()) {
+ // pg_num changed
+ q = pools.erase(q);
+ } else {
+ // keep it
+ ++q;
+ continue;
+ }
+ }
+ pools.emplace(p.first, PoolMapping(p.second.get_size(),
+ p.second.get_pg_num(),
+ p.second.is_erasure()));
+ }
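+  // drop any trailing pools that no longer exist in the osdmap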
+ pools.erase(q, pools.end());
+ ceph_assert(pools.size() == osdmap.get_pools().size());
+}
+
+void OSDMapMapping::update(const OSDMap& osdmap)
+{
+ _start(osdmap);
+ for (auto& p : osdmap.get_pools()) {
+ _update_range(osdmap, p.first, 0, p.second.get_pg_num());
+ }
+ _finish(osdmap);
+ //_dump(); // for debugging
+}
+
+void OSDMapMapping::update(const OSDMap& osdmap, pg_t pgid)
+{
+ _update_range(osdmap, pgid.pool(), pgid.ps(), pgid.ps() + 1);
+}
+
+void OSDMapMapping::_build_rmap(const OSDMap& osdmap)
+{
+ acting_rmap.resize(osdmap.get_max_osd());
+ //up_rmap.resize(osdmap.get_max_osd());
+ for (auto& v : acting_rmap) {
+ v.resize(0);
+ }
+ //for (auto& v : up_rmap) {
+ // v.resize(0);
+ //}
+ for (auto& p : pools) {
+ pg_t pgid(0, p.first);
+ for (unsigned ps = 0; ps < p.second.pg_num; ++ps) {
+ pgid.set_ps(ps);
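+      // row layout (see PoolMapping::row_size): [0]=acting_primary, [1]=up_primary,
+      // [2]=acting count, [3]=up count, [4..)=acting osds, [4+size..)=up osds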
+ int32_t *row = &p.second.table[p.second.row_size() * ps];
+ for (int i = 0; i < row[2]; ++i) {
+ if (row[4 + i] != CRUSH_ITEM_NONE) {
+ acting_rmap[row[4 + i]].push_back(pgid);
+ }
+ }
+ //for (int i = 0; i < row[3]; ++i) {
+ //up_rmap[row[4 + p.second.size + i]].push_back(pgid);
+ //}
+ }
+ }
+}
+
+void OSDMapMapping::_finish(const OSDMap& osdmap)
+{
+ _build_rmap(osdmap);
+ epoch = osdmap.get_epoch();
+}
+
+void OSDMapMapping::_dump()
+{
+ for (auto& p : pools) {
+ std::cout << "pool " << p.first << std::endl;
+ for (unsigned i = 0; i < p.second.table.size(); ++i) {
+ std::cout << " " << p.second.table[i];
+ if (i % p.second.row_size() == p.second.row_size() - 1)
+ std::cout << std::endl;
+ }
+ }
+}
+
+void OSDMapMapping::_update_range(
+ const OSDMap& osdmap,
+ int64_t pool,
+ unsigned pg_begin,
+ unsigned pg_end)
+{
+ auto i = pools.find(pool);
+ ceph_assert(i != pools.end());
+ ceph_assert(pg_begin <= pg_end);
+ ceph_assert(pg_end <= i->second.pg_num);
+ for (unsigned ps = pg_begin; ps < pg_end; ++ps) {
+ std::vector<int> up, acting;
+ int up_primary, acting_primary;
+ osdmap.pg_to_up_acting_osds(
+ pg_t(ps, pool),
+ &up, &up_primary, &acting, &acting_primary);
+ i->second.set(ps, std::move(up), up_primary,
+ std::move(acting), acting_primary);
+ }
+}
+
+// ---------------------------
+
+void ParallelPGMapper::Job::finish_one()
+{
+ Context *fin = nullptr;
+ {
+ std::lock_guard l(lock);
+ if (--shards == 0) {
+ if (!aborted) {
+ finish = ceph_clock_now();
+ complete();
+ }
+ cond.notify_all();
+ fin = onfinish;
+ onfinish = nullptr;
+ }
+ }
+ if (fin) {
+ fin->complete(0);
+ }
+}
+
+void ParallelPGMapper::WQ::_process(Item *i, ThreadPool::TPHandle &h)
+{
+ ldout(m->cct, 20) << __func__ << " " << i->job << " pool " << i->pool
+ << " [" << i->begin << "," << i->end << ")"
+ << " pgs " << i->pgs
+ << dendl;
+ if (!i->pgs.empty())
+ i->job->process(i->pgs);
+ else
+ i->job->process(i->pool, i->begin, i->end);
+ i->job->finish_one();
+ delete i;
+}
+
+void ParallelPGMapper::queue(
+ Job *job,
+ unsigned pgs_per_item,
+ const vector<pg_t>& input_pgs)
+{
+ bool any = false;
+ if (!input_pgs.empty()) {
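+    // explicit pg list: split it into Items of at most pgs_per_item pgs each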
+ unsigned i = 0;
+ vector<pg_t> item_pgs;
+ item_pgs.reserve(pgs_per_item);
+ for (auto& pg : input_pgs) {
+ if (i < pgs_per_item) {
+ ++i;
+ item_pgs.push_back(pg);
+ }
+ if (i >= pgs_per_item) {
+ job->start_one();
+ wq.queue(new Item(job, item_pgs));
+ i = 0;
+ item_pgs.clear();
+ any = true;
+ }
+ }
+ if (!item_pgs.empty()) {
+ job->start_one();
+ wq.queue(new Item(job, item_pgs));
+ any = true;
+ }
+ ceph_assert(any);
+ return;
+ }
+ // no input pgs, load all from map
+ for (auto& p : job->osdmap->get_pools()) {
+ for (unsigned ps = 0; ps < p.second.get_pg_num(); ps += pgs_per_item) {
+ unsigned ps_end = std::min(ps + pgs_per_item, p.second.get_pg_num());
+ job->start_one();
+ wq.queue(new Item(job, p.first, ps, ps_end));
+ ldout(cct, 20) << __func__ << " " << job << " " << p.first << " [" << ps
+ << "," << ps_end << ")" << dendl;
+ any = true;
+ }
+ }
+ ceph_assert(any);
+}
diff --git a/src/osd/OSDMapMapping.h b/src/osd/OSDMapMapping.h
new file mode 100644
index 000000000..3274d02e4
--- /dev/null
+++ b/src/osd/OSDMapMapping.h
@@ -0,0 +1,352 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+
+#ifndef CEPH_OSDMAPMAPPING_H
+#define CEPH_OSDMAPMAPPING_H
+
+#include <vector>
+#include <map>
+
+#include "osd/osd_types.h"
+#include "common/WorkQueue.h"
+#include "common/Cond.h"
+
+class OSDMap;
+
+/// work queue to perform work on batches of pgids on multiple CPUs
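+/// (usage sketch: derive from Job, queue() it with a batch size, then wait(),
+/// wait_for(), or set_finish_event() to learn when all shards have completed)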
+class ParallelPGMapper {
+public:
+ struct Job {
+ utime_t start, finish;
+ unsigned shards = 0;
+ const OSDMap *osdmap;
+ bool aborted = false;
+ Context *onfinish = nullptr;
+
+ ceph::mutex lock = ceph::make_mutex("ParallelPGMapper::Job::lock");
+ ceph::condition_variable cond;
+
+ Job(const OSDMap *om) : start(ceph_clock_now()), osdmap(om) {}
+ virtual ~Job() {
+ ceph_assert(shards == 0);
+ }
+
+ // child must implement either form of process
+ virtual void process(const std::vector<pg_t>& pgs) = 0;
+ virtual void process(int64_t poolid, unsigned ps_begin, unsigned ps_end) = 0;
+ virtual void complete() = 0;
+
+ void set_finish_event(Context *fin) {
+ lock.lock();
+ if (shards == 0) {
+ // already done.
+ lock.unlock();
+ fin->complete(0);
+ } else {
+ // set finisher
+ onfinish = fin;
+ lock.unlock();
+ }
+ }
+ bool is_done() {
+ std::lock_guard l(lock);
+ return shards == 0;
+ }
+ utime_t get_duration() {
+ return finish - start;
+ }
+ void wait() {
+ std::unique_lock l(lock);
+ cond.wait(l, [this] { return shards == 0; });
+ }
+ bool wait_for(double duration) {
+ utime_t until = start;
+ until += duration;
+ std::unique_lock l(lock);
+ while (shards > 0) {
+ if (ceph_clock_now() >= until) {
+ return false;
+ }
+ cond.wait(l);
+ }
+ return true;
+ }
+ void abort() {
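+      // mark the job aborted, wait for in-flight shards to drain, and fail
+      // any registered finish event with -ECANCELED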
+ Context *fin = nullptr;
+ {
+ std::unique_lock l(lock);
+ aborted = true;
+ fin = onfinish;
+ onfinish = nullptr;
+ cond.wait(l, [this] { return shards == 0; });
+ }
+ if (fin) {
+ fin->complete(-ECANCELED);
+ }
+ }
+
+ void start_one() {
+ std::lock_guard l(lock);
+ ++shards;
+ }
+ void finish_one();
+ };
+
+protected:
+ CephContext *cct;
+
+ struct Item {
+ Job *job;
+    int64_t pool = 0;
+    unsigned begin = 0, end = 0;  // defaulted: only meaningful for range items
+ std::vector<pg_t> pgs;
+
+ Item(Job *j, std::vector<pg_t> pgs) : job(j), pgs(pgs) {}
+ Item(Job *j, int64_t p, unsigned b, unsigned e)
+ : job(j),
+ pool(p),
+ begin(b),
+ end(e) {}
+ };
+ std::deque<Item*> q;
+
+ struct WQ : public ThreadPool::WorkQueue<Item> {
+ ParallelPGMapper *m;
+
+ WQ(ParallelPGMapper *m_, ThreadPool *tp)
+ : ThreadPool::WorkQueue<Item>(
+ "ParallelPGMapper::WQ",
+ ceph::make_timespan(m_->cct->_conf->threadpool_default_timeout),
+ ceph::timespan::zero(),
+ tp),
+ m(m_) {}
+
+ bool _enqueue(Item *i) override {
+ m->q.push_back(i);
+ return true;
+ }
+ void _dequeue(Item *i) override {
+ ceph_abort();
+ }
+ Item *_dequeue() override {
+ while (!m->q.empty()) {
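+      // items belonging to aborted jobs are dropped here rather than processed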
+ Item *i = m->q.front();
+ m->q.pop_front();
+ if (i->job->aborted) {
+ i->job->finish_one();
+ delete i;
+ } else {
+ return i;
+ }
+ }
+ return nullptr;
+ }
+
+ void _process(Item *i, ThreadPool::TPHandle &h) override;
+
+ void _clear() override {
+ ceph_assert(_empty());
+ }
+
+ bool _empty() override {
+ return m->q.empty();
+ }
+ } wq;
+
+public:
+ ParallelPGMapper(CephContext *cct, ThreadPool *tp)
+ : cct(cct),
+ wq(this, tp) {}
+
+ void queue(
+ Job *job,
+ unsigned pgs_per_item,
+ const std::vector<pg_t>& input_pgs);
+
+ void drain() {
+ wq.drain();
+ }
+};
+
+
+/// a precalculated mapping of every PG for a given OSDMap
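+/// (typical use, roughly: call update(osdmap), or start_update() with a
+/// ParallelPGMapper for large maps, then get()/get_osd_acting_pgs() to read
+/// the cached mappings)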
+class OSDMapMapping {
+public:
+ MEMPOOL_CLASS_HELPERS();
+private:
+
+ struct PoolMapping {
+ MEMPOOL_CLASS_HELPERS();
+
+ unsigned size = 0;
+ unsigned pg_num = 0;
+ bool erasure = false;
+ mempool::osdmap_mapping::vector<int32_t> table;
+
+ size_t row_size() const {
+ return
+ 1 + // acting_primary
+ 1 + // up_primary
+ 1 + // num acting
+ 1 + // num up
+ size + // acting
+ size; // up
+ }
+
+ PoolMapping(int s, int p, bool e)
+ : size(s),
+ pg_num(p),
+ erasure(e),
+ table(pg_num * row_size()) {
+ }
+
+ void get(size_t ps,
+ std::vector<int> *up,
+ int *up_primary,
+ std::vector<int> *acting,
+ int *acting_primary) const {
+ const int32_t *row = &table[row_size() * ps];
+ if (acting_primary) {
+ *acting_primary = row[0];
+ }
+ if (up_primary) {
+ *up_primary = row[1];
+ }
+ if (acting) {
+ acting->resize(row[2]);
+ for (int i = 0; i < row[2]; ++i) {
+ (*acting)[i] = row[4 + i];
+ }
+ }
+ if (up) {
+ up->resize(row[3]);
+ for (int i = 0; i < row[3]; ++i) {
+ (*up)[i] = row[4 + size + i];
+ }
+ }
+ }
+
+ void set(size_t ps,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary) {
+ int32_t *row = &table[row_size() * ps];
+ row[0] = acting_primary;
+ row[1] = up_primary;
+ // these should always be <= the pool size, but just in case, avoid
+ // blowing out the array. Note that our mapping is not completely
+ // accurate in this case--this is just to avoid crashing.
+ row[2] = std::min<int32_t>(acting.size(), size);
+ row[3] = std::min<int32_t>(up.size(), size);
+ for (int i = 0; i < row[2]; ++i) {
+ row[4 + i] = acting[i];
+ }
+ for (int i = 0; i < row[3]; ++i) {
+ row[4 + size + i] = up[i];
+ }
+ }
+ };
+
+ mempool::osdmap_mapping::map<int64_t,PoolMapping> pools;
+ mempool::osdmap_mapping::vector<
+ mempool::osdmap_mapping::vector<pg_t>> acting_rmap; // osd -> pg
+ //unused: mempool::osdmap_mapping::vector<std::vector<pg_t>> up_rmap; // osd -> pg
+ epoch_t epoch = 0;
+ uint64_t num_pgs = 0;
+
+ void _init_mappings(const OSDMap& osdmap);
+ void _update_range(
+ const OSDMap& map,
+ int64_t pool,
+ unsigned pg_begin, unsigned pg_end);
+
+ void _build_rmap(const OSDMap& osdmap);
+
+ void _start(const OSDMap& osdmap) {
+ _init_mappings(osdmap);
+ }
+ void _finish(const OSDMap& osdmap);
+
+ void _dump();
+
+ friend class ParallelPGMapper;
+
+ struct MappingJob : public ParallelPGMapper::Job {
+ OSDMapMapping *mapping;
+ MappingJob(const OSDMap *osdmap, OSDMapMapping *m)
+ : Job(osdmap), mapping(m) {
+ mapping->_start(*osdmap);
+ }
+ void process(const std::vector<pg_t>& pgs) override {}
+ void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
+ mapping->_update_range(*osdmap, pool, ps_begin, ps_end);
+ }
+ void complete() override {
+ mapping->_finish(*osdmap);
+ }
+ };
+
+public:
+ void get(pg_t pgid,
+ std::vector<int> *up,
+ int *up_primary,
+ std::vector<int> *acting,
+ int *acting_primary) const {
+ auto p = pools.find(pgid.pool());
+ ceph_assert(p != pools.end());
+ ceph_assert(pgid.ps() < p->second.pg_num);
+ p->second.get(pgid.ps(), up, up_primary, acting, acting_primary);
+ }
+
+ bool get_primary_and_shard(pg_t pgid,
+ int *acting_primary,
+ spg_t *spgid) {
+ auto p = pools.find(pgid.pool());
+ ceph_assert(p != pools.end());
+ ceph_assert(pgid.ps() < p->second.pg_num);
+ std::vector<int> acting;
+ p->second.get(pgid.ps(), nullptr, nullptr, &acting, acting_primary);
+ if (p->second.erasure) {
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == *acting_primary) {
+ *spgid = spg_t(pgid, shard_id_t(i));
+ return true;
+ }
+ }
+ return false;
+ } else {
+ *spgid = spg_t(pgid);
+ return true;
+ }
+ }
+
+ const mempool::osdmap_mapping::vector<pg_t>& get_osd_acting_pgs(unsigned osd) {
+ ceph_assert(osd < acting_rmap.size());
+ return acting_rmap[osd];
+ }
+
+ void update(const OSDMap& map);
+ void update(const OSDMap& map, pg_t pgid);
+
+ std::unique_ptr<MappingJob> start_update(
+ const OSDMap& map,
+ ParallelPGMapper& mapper,
+ unsigned pgs_per_item) {
+ std::unique_ptr<MappingJob> job(new MappingJob(&map, this));
+ mapper.queue(job.get(), pgs_per_item, {});
+ return job;
+ }
+
+ epoch_t get_epoch() const {
+ return epoch;
+ }
+
+ uint64_t get_num_pgs() const {
+ return num_pgs;
+ }
+};
+
+
+#endif
diff --git a/src/osd/ObjectVersioner.h b/src/osd/ObjectVersioner.h
new file mode 100644
index 000000000..f7d756330
--- /dev/null
+++ b/src/osd/ObjectVersioner.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_OBJECTVERSIONER_H
+#define CEPH_OSD_OBJECTVERSIONER_H
+
+class ObjectVersioner {
+ public:
+ pobject_t oid;
+
+ void get_versions(list<version_t>& ls);
+ version_t head(); // newest
+ version_t committed(); // last committed
+ version_t tail(); // oldest
+
+ /*
+   * prepare a new version, starting with a "raw" transaction t.
+ */
+ void prepare(ObjectStore::Transaction& t, version_t v);
+ void rollback_to(version_t v);
+ void commit_to(version_t v);
+};
+
+#endif
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
new file mode 100644
index 000000000..0eb92c23a
--- /dev/null
+++ b/src/osd/OpRequest.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "OpRequest.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <vector>
+#include "common/debug.h"
+#include "common/config.h"
+#include "msg/Message.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "include/ceph_assert.h"
+#include "osd/osd_types.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/oprequest.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+using std::ostream;
+using std::set;
+using std::string;
+using std::stringstream;
+
+using ceph::Formatter;
+
+OpRequest::OpRequest(Message* req, OpTracker* tracker)
+ : TrackedOp(tracker, req->get_throttle_stamp()),
+ request(req),
+ hit_flag_points(0),
+ latest_flag_point(0),
+ hitset_inserted(false) {
+ if (req->get_priority() < tracker->cct->_conf->osd_client_op_priority) {
+ // don't warn as quickly for low priority ops
+ warn_interval_multiplier = tracker->cct->_conf->osd_recovery_op_warn_multiple;
+ }
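+  // remember the request id for the message types we track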
+ if (req->get_type() == CEPH_MSG_OSD_OP) {
+ reqid = static_cast<MOSDOp*>(req)->get_reqid();
+ } else if (req->get_type() == MSG_OSD_REPOP) {
+ reqid = static_cast<MOSDRepOp*>(req)->reqid;
+ } else if (req->get_type() == MSG_OSD_REPOPREPLY) {
+ reqid = static_cast<MOSDRepOpReply*>(req)->reqid;
+ }
+ req_src_inst = req->get_source_inst();
+}
+
+void OpRequest::_dump(Formatter *f) const
+{
+ Message *m = request;
+ f->dump_string("flag_point", state_string());
+ if (m->get_orig_source().is_client()) {
+ f->open_object_section("client_info");
+ stringstream client_name, client_addr;
+ client_name << req_src_inst.name;
+ client_addr << req_src_inst.addr;
+ f->dump_string("client", client_name.str());
+ f->dump_string("client_addr", client_addr.str());
+ f->dump_unsigned("tid", m->get_tid());
+ f->close_section(); // client_info
+ }
+
+ {
+ f->open_array_section("events");
+ std::lock_guard l(lock);
+
+ for (auto i = events.begin(); i != events.end(); ++i) {
+ f->open_object_section("event");
+ f->dump_string("event", i->str);
+ f->dump_stream("time") << i->stamp;
+
+ auto i_next = i + 1;
+
+ if (i_next < events.end()) {
+ f->dump_float("duration", i_next->stamp - i->stamp);
+ } else {
+ f->dump_float("duration", events.rbegin()->stamp - get_initiated());
+ }
+
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void OpRequest::_dump_op_descriptor_unlocked(ostream& stream) const
+{
+ get_req()->print(stream);
+}
+
+void OpRequest::_unregistered() {
+ request->clear_data();
+ request->clear_payload();
+ request->release_message_throttle();
+ request->set_connection(nullptr);
+}
+
+int OpRequest::maybe_init_op_info(const OSDMap &osdmap) {
+ if (op_info.get_flags())
+ return 0;
+
+ auto m = get_req<MOSDOp>();
+
+#ifdef WITH_LTTNG
+ auto old_rmw_flags = op_info.get_flags();
+#endif
+ auto ret = op_info.set_from_op(m, osdmap);
+ tracepoint(oprequest, set_rmw_flags, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc,
+ op_info.get_flags(), old_rmw_flags, op_info.get_flags());
+ return ret;
+}
+
+void OpRequest::mark_flag_point(uint8_t flag, const char *s) {
+#ifdef WITH_LTTNG
+ uint8_t old_flags = hit_flag_points;
+#endif
+ mark_event(s);
+ hit_flag_points |= flag;
+ latest_flag_point = flag;
+ tracepoint(oprequest, mark_flag_point, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(),
+ flag, s, old_flags, hit_flag_points);
+}
+
+void OpRequest::mark_flag_point_string(uint8_t flag, const string& s) {
+#ifdef WITH_LTTNG
+ uint8_t old_flags = hit_flag_points;
+#endif
+ mark_event(s);
+ hit_flag_points |= flag;
+ latest_flag_point = flag;
+ tracepoint(oprequest, mark_flag_point, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc, op_info.get_flags(),
+ flag, s.c_str(), old_flags, hit_flag_points);
+}
+
+bool OpRequest::filter_out(const set<string>& filters)
+{
+ set<entity_addr_t> addrs;
+ for (auto it = filters.begin(); it != filters.end(); it++) {
+ entity_addr_t addr;
+ if (addr.parse((*it).c_str())) {
+ addrs.insert(addr);
+ }
+ }
+ if (addrs.empty())
+ return true;
+
+ entity_addr_t cmp_addr = req_src_inst.addr;
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
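+  // relax the match: retry with the nonce zeroed, then with the port zeroed as well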
+ cmp_addr.set_nonce(0);
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
+ cmp_addr.set_port(0);
+ if (addrs.count(cmp_addr)) {
+ return true;
+ }
+
+ return false;
+}
+
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
new file mode 100644
index 000000000..daa0e1993
--- /dev/null
+++ b/src/osd/OpRequest.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 New Dream Network/Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef OPREQUEST_H_
+#define OPREQUEST_H_
+
+#include "osd/osd_op_util.h"
+#include "osd/osd_types.h"
+#include "common/TrackedOp.h"
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+/**
+ * The OpRequest takes in a Message* and takes over a single reference
+ * to it, which it puts() when destroyed.
+ */
+struct OpRequest : public TrackedOp {
+ friend class OpTracker;
+
+private:
+ OpInfo op_info;
+
+public:
+ int maybe_init_op_info(const OSDMap &osdmap);
+
+ auto get_flags() const { return op_info.get_flags(); }
+ bool op_info_needs_init() const { return op_info.get_flags() == 0; }
+ bool check_rmw(int flag) const { return op_info.check_rmw(flag); }
+ bool may_read() const { return op_info.may_read(); }
+ bool may_write() const { return op_info.may_write(); }
+ bool may_cache() const { return op_info.may_cache(); }
+ bool rwordered_forced() const { return op_info.rwordered_forced(); }
+ bool rwordered() const { return op_info.rwordered(); }
+ bool includes_pg_op() const { return op_info.includes_pg_op(); }
+ bool need_read_cap() const { return op_info.need_read_cap(); }
+ bool need_write_cap() const { return op_info.need_write_cap(); }
+ bool need_promote() const { return op_info.need_promote(); }
+ bool need_skip_handle_cache() const { return op_info.need_skip_handle_cache(); }
+ bool need_skip_promote() const { return op_info.need_skip_promote(); }
+ bool allows_returnvec() const { return op_info.allows_returnvec(); }
+
+ std::vector<OpInfo::ClassInfo> classes() const {
+ return op_info.get_classes();
+ }
+
+ void _dump(ceph::Formatter *f) const override;
+
+ bool has_feature(uint64_t f) const {
+ return request->get_connection()->has_feature(f);
+ }
+
+private:
+ Message *request; /// the logical request we are tracking
+ osd_reqid_t reqid;
+ entity_inst_t req_src_inst;
+ uint8_t hit_flag_points;
+ uint8_t latest_flag_point;
+ utime_t dequeued_time;
+ static const uint8_t flag_queued_for_pg=1 << 0;
+ static const uint8_t flag_reached_pg = 1 << 1;
+ static const uint8_t flag_delayed = 1 << 2;
+ static const uint8_t flag_started = 1 << 3;
+ static const uint8_t flag_sub_op_sent = 1 << 4;
+ static const uint8_t flag_commit_sent = 1 << 5;
+
+ OpRequest(Message *req, OpTracker *tracker);
+
+protected:
+ void _dump_op_descriptor_unlocked(std::ostream& stream) const override;
+ void _unregistered() override;
+ bool filter_out(const std::set<std::string>& filters) override;
+
+public:
+ ~OpRequest() override {
+ request->put();
+ }
+
+ bool check_send_map = true; ///< true until we check if sender needs a map
+ epoch_t sent_epoch = 0; ///< client's map epoch
+ epoch_t min_epoch = 0; ///< min epoch needed to handle this msg
+
+ bool hitset_inserted;
+#ifdef HAVE_JAEGER
+ jspan osd_parent_span = nullptr;
+ void set_osd_parent_span(jspan& span) {
+ if(osd_parent_span){
+ jaeger_tracing::finish_span(osd_parent_span);
+ }
+      osd_parent_span = std::move(span);
+ }
+#else
+ void set_osd_parent_span(...) {}
+#endif
+ template<class T>
+ const T* get_req() const { return static_cast<const T*>(request); }
+
+ const Message *get_req() const { return request; }
+ Message *get_nonconst_req() { return request; }
+
+ entity_name_t get_source() {
+ if (request) {
+ return request->get_source();
+ } else {
+ return entity_name_t();
+ }
+ }
+ uint8_t state_flag() const {
+ return latest_flag_point;
+ }
+
+ std::string_view state_string() const override {
+ switch(latest_flag_point) {
+ case flag_queued_for_pg: return "queued for pg";
+ case flag_reached_pg: return "reached pg";
+ case flag_delayed: return "delayed";
+ case flag_started: return "started";
+ case flag_sub_op_sent: return "waiting for sub ops";
+ case flag_commit_sent: return "commit sent; apply or cleanup";
+ default: break;
+ }
+ return "no flag points reached";
+ }
+
+ static std::string get_state_string(uint8_t flag) {
+ std::string flag_point;
+
+ switch(flag) {
+ case flag_queued_for_pg:
+ flag_point = "queued for pg";
+ break;
+ case flag_reached_pg:
+ flag_point = "reached pg";
+ break;
+ case flag_delayed:
+ flag_point = "delayed";
+ break;
+ case flag_started:
+ flag_point = "started";
+ break;
+ case flag_sub_op_sent:
+ flag_point = "waiting for sub ops";
+ break;
+ case flag_commit_sent:
+ flag_point = "commit sent; apply or cleanup";
+ break;
+ }
+ return flag_point;
+ }
+
+ void mark_queued_for_pg() {
+ mark_flag_point(flag_queued_for_pg, "queued_for_pg");
+ }
+ void mark_reached_pg() {
+ mark_flag_point(flag_reached_pg, "reached_pg");
+ }
+ void mark_delayed(const std::string& s) {
+ mark_flag_point_string(flag_delayed, s);
+ }
+ void mark_started() {
+ mark_flag_point(flag_started, "started");
+ }
+ void mark_sub_op_sent(const std::string& s) {
+ mark_flag_point_string(flag_sub_op_sent, s);
+ }
+ void mark_commit_sent() {
+ mark_flag_point(flag_commit_sent, "commit_sent");
+ }
+
+ utime_t get_dequeued_time() const {
+ return dequeued_time;
+ }
+ void set_dequeued_time(utime_t deq_time) {
+ dequeued_time = deq_time;
+ }
+
+ osd_reqid_t get_reqid() const {
+ return reqid;
+ }
+
+ typedef boost::intrusive_ptr<OpRequest> Ref;
+
+private:
+ void mark_flag_point(uint8_t flag, const char *s);
+ void mark_flag_point_string(uint8_t flag, const std::string& s);
+};
+
+typedef OpRequest::Ref OpRequestRef;
+
+#endif /* OPREQUEST_H_ */
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
new file mode 100644
index 000000000..5b10f1466
--- /dev/null
+++ b/src/osd/PG.cc
@@ -0,0 +1,2753 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PG.h"
+#include "messages/MOSDRepScrub.h"
+
+#include "common/errno.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "OSD.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "pg_scrubber.h"
+#include "Session.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+#include "common/Timer.h"
+#include "common/perf_counters.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+#include "common/BackTrace.h"
+#include "common/EventTrace.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/pg.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#include <sstream>
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::list;
+using std::map;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+using namespace ceph::osd::scheduler;
+
+template <class T>
+static ostream& _prefix(std::ostream *_dout, T *t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+void PG::get(const char* tag)
+{
+ int after = ++ref;
+ lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " "
+ << "tag " << (tag ? tag : "(none") << " "
+ << (after - 1) << " -> " << after << dendl;
+#ifdef PG_DEBUG_REFS
+ std::lock_guard l(_ref_id_lock);
+ _tag_counts[tag]++;
+#endif
+}
+
+void PG::put(const char* tag)
+{
+#ifdef PG_DEBUG_REFS
+ {
+ std::lock_guard l(_ref_id_lock);
+ auto tag_counts_entry = _tag_counts.find(tag);
+ ceph_assert(tag_counts_entry != _tag_counts.end());
+ --tag_counts_entry->second;
+ if (tag_counts_entry->second == 0) {
+ _tag_counts.erase(tag_counts_entry);
+ }
+ }
+#endif
+ auto local_cct = cct;
+ int after = --ref;
+ lgeneric_subdout(local_cct, refs, 5) << "PG::put " << this << " "
+ << "tag " << (tag ? tag : "(none") << " "
+ << (after + 1) << " -> " << after
+ << dendl;
+ if (after == 0)
+ delete this;
+}
+
+#ifdef PG_DEBUG_REFS
+uint64_t PG::get_with_id()
+{
+ ref++;
+ std::lock_guard l(_ref_id_lock);
+ uint64_t id = ++_ref_id;
+ BackTrace bt(0);
+ stringstream ss;
+ bt.print(ss);
+ lgeneric_subdout(cct, refs, 5) << "PG::get " << this << " " << info.pgid
+ << " got id " << id << " "
+ << (ref - 1) << " -> " << ref
+ << dendl;
+ ceph_assert(!_live_ids.count(id));
+ _live_ids.insert(make_pair(id, ss.str()));
+ return id;
+}
+
+void PG::put_with_id(uint64_t id)
+{
+ int newref = --ref;
+ lgeneric_subdout(cct, refs, 5) << "PG::put " << this << " " << info.pgid
+ << " put id " << id << " "
+ << (newref + 1) << " -> " << newref
+ << dendl;
+ {
+ std::lock_guard l(_ref_id_lock);
+ ceph_assert(_live_ids.count(id));
+ _live_ids.erase(id);
+ }
+ if (newref)
+ delete this;
+}
+
+void PG::dump_live_ids()
+{
+ std::lock_guard l(_ref_id_lock);
+ dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
+ for (map<uint64_t, string>::iterator i = _live_ids.begin();
+ i != _live_ids.end();
+ ++i) {
+ dout(0) << "\t\tid: " << *i << dendl;
+ }
+ dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
+ for (map<string, uint64_t>::iterator i = _tag_counts.begin();
+ i != _tag_counts.end();
+ ++i) {
+ dout(0) << "\t\tid: " << *i << dendl;
+ }
+}
+#endif
+
+PG::PG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool, spg_t p) :
+ pg_whoami(o->whoami, p.shard),
+ pg_id(p),
+ coll(p),
+ osd(o),
+ cct(o->cct),
+ osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
+ snap_mapper(
+ cct,
+ &osdriver,
+ p.ps(),
+ p.get_split_bits(_pool.info.get_pg_num()),
+ _pool.id,
+ p.shard),
+ trace_endpoint("0.0.0.0", 0, "PG"),
+ info_struct_v(0),
+ pgmeta_oid(p.make_pgmeta_oid()),
+ stat_queue_item(this),
+ recovery_queued(false),
+ recovery_ops_active(0),
+ backfill_reserving(false),
+ pg_stats_publish_valid(false),
+ finish_sync_event(NULL),
+ scrub_after_recovery(false),
+ active_pushes(0),
+ recovery_state(
+ o->cct,
+ pg_whoami,
+ p,
+ _pool,
+ curmap,
+ this,
+ this),
+ pool(recovery_state.get_pool()),
+ info(recovery_state.get_info())
+{
+#ifdef PG_DEBUG_REFS
+ osd->add_pgid(p, this);
+#endif
+#ifdef WITH_BLKIN
+ std::stringstream ss;
+ ss << "PG " << info.pgid;
+ trace_endpoint.copy_name(ss.str());
+#endif
+}
+
+PG::~PG()
+{
+#ifdef PG_DEBUG_REFS
+ osd->remove_pgid(info.pgid, this);
+#endif
+}
+
+void PG::lock(bool no_lockdep) const
+{
+#ifdef CEPH_DEBUG_MUTEX
+ _lock.lock(no_lockdep);
+#else
+ _lock.lock();
+ locked_by = std::this_thread::get_id();
+#endif
+ // if we have unrecorded dirty state with the lock dropped, there is a bug
+ ceph_assert(!recovery_state.debug_has_dirty_state());
+
+ dout(30) << "lock" << dendl;
+}
+
+bool PG::is_locked() const
+{
+ return ceph_mutex_is_locked(_lock);
+}
+
+void PG::unlock() const
+{
+ //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
+ ceph_assert(!recovery_state.debug_has_dirty_state());
+#ifndef CEPH_DEBUG_MUTEX
+ locked_by = {};
+#endif
+ _lock.unlock();
+}
+
+std::ostream& PG::gen_prefix(std::ostream& out) const
+{
+ OSDMapRef mapref = recovery_state.get_osdmap();
+#ifdef CEPH_DEBUG_MUTEX
+ if (_lock.is_locked_by_me()) {
+#else
+ if (locked_by == std::this_thread::get_id()) {
+#endif
+ out << "osd." << osd->whoami
+ << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
+ << " " << *this << " ";
+ } else {
+ out << "osd." << osd->whoami
+ << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
+ << " pg[" << pg_id.pgid << "(unlocked)] ";
+ }
+ return out;
+}
+
+PerfCounters &PG::get_peering_perf() {
+ return *(osd->recoverystate_perf);
+}
+
+PerfCounters &PG::get_perf_logger() {
+ return *(osd->logger);
+}
+
+void PG::log_state_enter(const char *state) {
+ osd->pg_recovery_stats.log_enter(state);
+}
+
+void PG::log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) {
+ osd->pg_recovery_stats.log_exit(
+ state_name, ceph_clock_now() - enter_time, events, event_dur);
+}
+
+/********* PG **********/
+
+void PG::remove_snap_mapped_object(
+ ObjectStore::Transaction &t, const hobject_t &soid)
+{
+ t.remove(
+ coll,
+ ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
+ clear_object_snap_mapping(&t, soid);
+}
+
+void PG::clear_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid)
+{
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ if (soid.snap < CEPH_MAXSNAP) {
+ int r = snap_mapper.remove_oid(
+ soid,
+ &_t);
+ if (!(r == 0 || r == -ENOENT)) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ }
+}
+
+void PG::update_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
+{
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ ceph_assert(soid.snap < CEPH_MAXSNAP);
+ int r = snap_mapper.remove_oid(
+ soid,
+ &_t);
+ if (!(r == 0 || r == -ENOENT)) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ snap_mapper.add_oid(
+ soid,
+ snaps,
+ &_t);
+}
+
+/******* PG ***********/
+void PG::clear_primary_state()
+{
+ dout(20) << __func__ << dendl;
+
+ projected_log = PGLog::IndexedLog();
+
+ snap_trimq.clear();
+ snap_trimq_repeat.clear();
+ finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
+ release_pg_backoffs();
+
+ if (m_scrubber) {
+ m_scrubber->discard_replica_reservations();
+ }
+ scrub_after_recovery = false;
+
+ agent_clear();
+}
+
+
+bool PG::op_has_sufficient_caps(OpRequestRef& op)
+{
+ // only check MOSDOp
+ if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
+ return true;
+
+ auto req = op->get_req<MOSDOp>();
+ auto priv = req->get_connection()->get_priv();
+ auto session = static_cast<Session*>(priv.get());
+ if (!session) {
+ dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
+ return false;
+ }
+ OSDCap& caps = session->caps;
+ priv.reset();
+
+ const string &key = req->get_hobj().get_key().empty() ?
+ req->get_oid().name :
+ req->get_hobj().get_key();
+
+ bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
+ pool.info.application_metadata,
+ key,
+ op->need_read_cap(),
+ op->need_write_cap(),
+ op->classes(),
+ session->get_peer_socket_addr());
+
+ dout(20) << "op_has_sufficient_caps "
+ << "session=" << session
+ << " pool=" << pool.id << " (" << pool.name
+ << " " << req->get_hobj().nspace
+ << ")"
+ << " pool_app_metadata=" << pool.info.application_metadata
+ << " need_read_cap=" << op->need_read_cap()
+ << " need_write_cap=" << op->need_write_cap()
+ << " classes=" << op->classes()
+ << " -> " << (cap ? "yes" : "NO")
+ << dendl;
+ return cap;
+}
+
+void PG::queue_recovery()
+{
+ if (!is_primary() || !is_peered()) {
+ dout(10) << "queue_recovery -- not primary or not peered " << dendl;
+ ceph_assert(!recovery_queued);
+ } else if (recovery_queued) {
+ dout(10) << "queue_recovery -- already queued" << dendl;
+ } else {
+ dout(10) << "queue_recovery -- queuing" << dendl;
+ recovery_queued = true;
+ osd->queue_for_recovery(this);
+ }
+}
+
+void PG::queue_scrub_after_repair()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(ceph_mutex_is_locked(_lock));
+
+ m_planned_scrub.must_deep_scrub = true;
+ m_planned_scrub.check_repair = true;
+ m_planned_scrub.must_scrub = true;
+
+ if (is_scrub_queued_or_active()) {
+ dout(10) << __func__ << ": scrubbing already ("
+ << (is_scrubbing() ? "active)" : "queued)") << dendl;
+ return;
+ }
+
+ m_scrubber->set_op_parameters(m_planned_scrub);
+ dout(15) << __func__ << ": queueing" << dendl;
+
+ m_scrubber->set_queued_or_active();
+ osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority);
+}
+
+unsigned PG::get_scrub_priority()
+{
+ // a higher value -> a higher priority
+ int64_t pool_scrub_priority =
+ pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0);
+ return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
+}
+
+Context *PG::finish_recovery()
+{
+ dout(10) << "finish_recovery" << dendl;
+ ceph_assert(info.last_complete == info.last_update);
+
+ clear_recovery_state();
+
+ /*
+ * sync all this before purging strays. but don't block!
+ */
+ finish_sync_event = new C_PG_FinishRecovery(this);
+ return finish_sync_event;
+}
+
+void PG::_finish_recovery(Context* c)
+{
+ dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? "
+ << is_clean() << dendl;
+
+ std::scoped_lock locker{*this};
+ if (recovery_state.is_deleting() || !is_clean()) {
+ dout(10) << __func__ << " raced with delete or repair" << dendl;
+ return;
+ }
+ // When recovery is initiated by a repair, that flag is left on
+ state_clear(PG_STATE_REPAIR);
+ if (c == finish_sync_event) {
+ dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl;
+ finish_sync_event = 0;
+ recovery_state.purge_strays();
+
+ publish_stats_to_osd();
+
+ if (scrub_after_recovery) {
+ dout(10) << "_finish_recovery requeueing for scrub" << dendl;
+ scrub_after_recovery = false;
+ queue_scrub_after_repair();
+ }
+ } else {
+ dout(10) << "_finish_recovery -- stale" << dendl;
+ }
+}
+
+void PG::start_recovery_op(const hobject_t& soid)
+{
+ dout(10) << "start_recovery_op " << soid
+#ifdef DEBUG_RECOVERY_OIDS
+ << " (" << recovering_oids << ")"
+#endif
+ << dendl;
+ ceph_assert(recovery_ops_active >= 0);
+ recovery_ops_active++;
+#ifdef DEBUG_RECOVERY_OIDS
+ recovering_oids.insert(soid);
+#endif
+ osd->start_recovery_op(this, soid);
+}
+
+void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
+{
+ dout(10) << "finish_recovery_op " << soid
+#ifdef DEBUG_RECOVERY_OIDS
+ << " (" << recovering_oids << ")"
+#endif
+ << dendl;
+ ceph_assert(recovery_ops_active > 0);
+ recovery_ops_active--;
+#ifdef DEBUG_RECOVERY_OIDS
+ ceph_assert(recovering_oids.count(soid));
+ recovering_oids.erase(recovering_oids.find(soid));
+#endif
+ osd->finish_recovery_op(this, soid, dequeue);
+
+ if (!dequeue) {
+ queue_recovery();
+ }
+}
+
+void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
+{
+ recovery_state.split_into(child_pgid, &child->recovery_state, split_bits);
+
+ child->update_snap_mapper_bits(split_bits);
+
+ child->snap_trimq = snap_trimq;
+ child->snap_trimq_repeat = snap_trimq_repeat;
+
+ _split_into(child_pgid, child, split_bits);
+
+ // release all backoffs for simplicity
+ release_backoffs(hobject_t(), hobject_t::get_max());
+}
+
+void PG::start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
+{
+ recovery_state.start_split_stats(childpgs, out);
+}
+
+void PG::finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction &t)
+{
+ recovery_state.finish_split_stats(stats, t);
+}
+
+void PG::merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta)
+{
+ dout(10) << __func__ << " from " << sources << " split_bits " << split_bits
+ << dendl;
+ map<spg_t, PeeringState*> source_ps;
+ for (auto &&source : sources) {
+ source_ps.emplace(source.first, &source.second->recovery_state);
+ }
+ recovery_state.merge_from(source_ps, rctx, split_bits, last_pg_merge_meta);
+
+ for (auto& i : sources) {
+ auto& source = i.second;
+ // wipe out source's pgmeta
+ rctx.transaction.remove(source->coll, source->pgmeta_oid);
+
+ // merge (and destroy source collection)
+ rctx.transaction.merge_collection(source->coll, coll, split_bits);
+ }
+
+ // merge_collection does this, but maybe all of our sources were missing.
+ rctx.transaction.collection_set_bits(coll, split_bits);
+
+ snap_mapper.update_bits(split_bits);
+}
+
+void PG::add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end)
+{
+ auto con = s->con;
+ if (!con) // OSD::ms_handle_reset clears s->con without a lock
+ return;
+ auto b = s->have_backoff(info.pgid, begin);
+ if (b) {
+ derr << __func__ << " already have backoff for " << s << " begin " << begin
+ << " " << *b << dendl;
+ ceph_abort();
+ }
+ std::lock_guard l(backoff_lock);
+ b = ceph::make_ref<Backoff>(info.pgid, this, s, ++s->backoff_seq, begin, end);
+ backoffs[begin].insert(b);
+ s->add_backoff(b);
+ dout(10) << __func__ << " session " << s << " added " << *b << dendl;
+ con->send_message(
+ new MOSDBackoff(
+ info.pgid,
+ get_osdmap_epoch(),
+ CEPH_OSD_BACKOFF_OP_BLOCK,
+ b->id,
+ begin,
+ end));
+}
+
+void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
+{
+ dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
+ vector<ceph::ref_t<Backoff>> bv;
+ {
+ std::lock_guard l(backoff_lock);
+ auto p = backoffs.lower_bound(begin);
+ while (p != backoffs.end()) {
+ int r = cmp(p->first, end);
+ dout(20) << __func__ << " ? " << r << " " << p->first
+ << " " << p->second << dendl;
+ // note: must still examine begin=end=p->first case
+ if (r > 0 || (r == 0 && begin < end)) {
+ break;
+ }
+ dout(20) << __func__ << " checking " << p->first
+ << " " << p->second << dendl;
+ auto q = p->second.begin();
+ while (q != p->second.end()) {
+ dout(20) << __func__ << " checking " << *q << dendl;
+ int r = cmp((*q)->begin, begin);
+ if (r == 0 || (r > 0 && (*q)->end < end)) {
+ bv.push_back(*q);
+ q = p->second.erase(q);
+ } else {
+ ++q;
+ }
+ }
+ if (p->second.empty()) {
+ p = backoffs.erase(p);
+ } else {
+ ++p;
+ }
+ }
+ }
+ for (auto b : bv) {
+ std::lock_guard l(b->lock);
+ dout(10) << __func__ << " " << *b << dendl;
+ if (b->session) {
+ ceph_assert(b->pg == this);
+ ConnectionRef con = b->session->con;
+ if (con) { // OSD::ms_handle_reset clears s->con without a lock
+ con->send_message(
+ new MOSDBackoff(
+ info.pgid,
+ get_osdmap_epoch(),
+ CEPH_OSD_BACKOFF_OP_UNBLOCK,
+ b->id,
+ b->begin,
+ b->end));
+ }
+ if (b->is_new()) {
+ b->state = Backoff::STATE_DELETING;
+ } else {
+ b->session->rm_backoff(b);
+ b->session.reset();
+ }
+ b->pg.reset();
+ }
+ }
+}
+
+void PG::clear_backoffs()
+{
+ dout(10) << __func__ << " " << dendl;
+ map<hobject_t,set<ceph::ref_t<Backoff>>> ls;
+ {
+ std::lock_guard l(backoff_lock);
+ ls.swap(backoffs);
+ }
+ for (auto& p : ls) {
+ for (auto& b : p.second) {
+ std::lock_guard l(b->lock);
+ dout(10) << __func__ << " " << *b << dendl;
+ if (b->session) {
+ ceph_assert(b->pg == this);
+ if (b->is_new()) {
+ b->state = Backoff::STATE_DELETING;
+ } else {
+ b->session->rm_backoff(b);
+ b->session.reset();
+ }
+ b->pg.reset();
+ }
+ }
+ }
+}
+
+// called by Session::clear_backoffs()
+void PG::rm_backoff(const ceph::ref_t<Backoff>& b)
+{
+ dout(10) << __func__ << " " << *b << dendl;
+ std::lock_guard l(backoff_lock);
+ ceph_assert(ceph_mutex_is_locked_by_me(b->lock));
+ ceph_assert(b->pg == this);
+ auto p = backoffs.find(b->begin);
+ // may race with release_backoffs()
+ if (p != backoffs.end()) {
+ auto q = p->second.find(b);
+ if (q != p->second.end()) {
+ p->second.erase(q);
+ if (p->second.empty()) {
+ backoffs.erase(p);
+ }
+ }
+ }
+}
+
+void PG::clear_recovery_state()
+{
+ dout(10) << "clear_recovery_state" << dendl;
+
+ finish_sync_event = 0;
+
+ hobject_t soid;
+ while (recovery_ops_active > 0) {
+#ifdef DEBUG_RECOVERY_OIDS
+ soid = *recovering_oids.begin();
+#endif
+ finish_recovery_op(soid, true);
+ }
+
+ backfill_info.clear();
+ peer_backfill_info.clear();
+ waiting_on_backfill.clear();
+ _clear_recovery_state(); // pg impl specific hook
+}
+
+void PG::cancel_recovery()
+{
+ dout(10) << "cancel_recovery" << dendl;
+ clear_recovery_state();
+}
+
+void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
+{
+ std::lock_guard l(heartbeat_peer_lock);
+ probe_targets.clear();
+ for (set<pg_shard_t>::iterator i = probe_set.begin();
+ i != probe_set.end();
+ ++i) {
+ probe_targets.insert(i->osd);
+ }
+}
+
+void PG::send_cluster_message(
+ int target, MessageRef m,
+ epoch_t epoch, bool share_map_update=false)
+{
+ ConnectionRef con = osd->get_con_osd_cluster(
+ target, get_osdmap_epoch());
+ if (!con) {
+ return;
+ }
+
+ if (share_map_update) {
+ osd->maybe_share_map(con.get(), get_osdmap());
+ }
+ osd->send_message_osd_cluster(m, con.get());
+}
+
+void PG::clear_probe_targets()
+{
+ std::lock_guard l(heartbeat_peer_lock);
+ probe_targets.clear();
+}
+
+void PG::update_heartbeat_peers(set<int> new_peers)
+{
+ bool need_update = false;
+ heartbeat_peer_lock.lock();
+ if (new_peers == heartbeat_peers) {
+ dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
+ } else {
+ dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
+ heartbeat_peers.swap(new_peers);
+ need_update = true;
+ }
+ heartbeat_peer_lock.unlock();
+
+ if (need_update)
+ osd->need_heartbeat_peer_update();
+}
+
+
+bool PG::check_in_progress_op(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ vector<pg_log_op_return_item_t> *op_returns
+ ) const
+{
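+ // check the projected (in-flight) log first, then fall back to the persisted pg log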
+ return (
+ projected_log.get_request(r, version, user_version, return_code,
+ op_returns) ||
+ recovery_state.get_pg_log().get_log().get_request(
+ r, version, user_version, return_code, op_returns));
+}
+
+void PG::publish_stats_to_osd()
+{
+ if (!is_primary())
+ return;
+
+ std::lock_guard l{pg_stats_publish_lock};
+ auto stats = recovery_state.prepare_stats_for_publish(
+ pg_stats_publish_valid,
+ pg_stats_publish,
+ unstable_stats);
+ if (stats) {
+ pg_stats_publish = stats.value();
+ pg_stats_publish_valid = true;
+ }
+}
+
+unsigned PG::get_target_pg_log_entries() const
+{
+ return osd->get_target_pg_log_entries();
+}
+
+void PG::clear_publish_stats()
+{
+ dout(15) << "clear_stats" << dendl;
+ std::lock_guard l{pg_stats_publish_lock};
+ pg_stats_publish_valid = false;
+}
+
+/**
+ * initialize a newly instantiated pg
+ *
+ * Initialize PG state, as when a PG is initially created, or when it
+ * is first instantiated on the current node.
+ *
+ * @param role our role/rank
+ * @param newup up set
+ * @param newacting acting set
+ * @param history pg history
+ * @param pi past_intervals
+ * @param backfill true if info should be marked as backfill
+ * @param t transaction to write out our new state in
+ */
+void PG::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t)
+{
+ recovery_state.init(
+ role, newup, new_up_primary, newacting,
+ new_acting_primary, history, pi, backfill, t);
+}
+
+void PG::shutdown()
+{
+ ch->flush();
+ std::scoped_lock l{*this};
+ recovery_state.shutdown();
+ on_shutdown();
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void PG::upgrade(ObjectStore *store)
+{
+ dout(0) << __func__ << " " << info_struct_v << " -> " << pg_latest_struct_v
+ << dendl;
+ ceph_assert(info_struct_v <= 10);
+ ObjectStore::Transaction t;
+
+ // <do upgrade steps here>
+
+ // finished upgrade!
+ ceph_assert(info_struct_v == 10);
+
+ // update infover_key
+ if (info_struct_v < pg_latest_struct_v) {
+ map<string,bufferlist> v;
+ __u8 ver = pg_latest_struct_v;
+ encode(ver, v[string(infover_key)]);
+ t.omap_setkeys(coll, pgmeta_oid, v);
+ }
+
+ recovery_state.force_write_state(t);
+
+ ObjectStore::CollectionHandle ch = store->open_collection(coll);
+ int r = store->queue_transaction(ch, std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": queue_transaction returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ ceph_assert(r == 0);
+
+ C_SaferCond waiter;
+ if (!ch->flush_commit(&waiter)) {
+ waiter.wait();
+ }
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+void PG::prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t)
+{
+ info.stats.stats.add(unstable_stats);
+ unstable_stats.clear();
+ map<string,bufferlist> km;
+ string key_to_remove;
+ if (dirty_big_info || dirty_info) {
+ int ret = prepare_info_keymap(
+ cct,
+ &km,
+ &key_to_remove,
+ get_osdmap_epoch(),
+ info,
+ last_written_info,
+ past_intervals,
+ dirty_big_info,
+ need_write_epoch,
+ cct->_conf->osd_fast_info,
+ osd->logger,
+ this);
+ ceph_assert(ret == 0);
+ }
+ pglog.write_log_and_missing(
+ t, &km, coll, pgmeta_oid, pool.info.require_rollback());
+ if (!km.empty())
+ t.omap_setkeys(coll, pgmeta_oid, km);
+ if (!key_to_remove.empty())
+ t.omap_rmkey(coll, pgmeta_oid, key_to_remove);
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+bool PG::_has_removal_flag(ObjectStore *store,
+ spg_t pgid)
+{
+ coll_t coll(pgid);
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+
+ // first try new way
+ set<string> keys;
+ keys.insert("_remove");
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ if (store->omap_get_values(ch, pgmeta_oid, keys, &values) == 0 &&
+ values.size() == 1)
+ return true;
+
+ return false;
+}
+
+int PG::peek_map_epoch(ObjectStore *store,
+ spg_t pgid,
+ epoch_t *pepoch)
+{
+ coll_t coll(pgid);
+ ghobject_t legacy_infos_oid(OSD::make_infos_oid());
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ epoch_t cur_epoch = 0;
+
+ // validate collection name
+ ceph_assert(coll.is_pg());
+
+ // try for v8
+ set<string> keys;
+ keys.insert(string(infover_key));
+ keys.insert(string(epoch_key));
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
+ if (r == 0) {
+ ceph_assert(values.size() == 2);
+
+ // sanity check version
+ auto bp = values[string(infover_key)].cbegin();
+ __u8 struct_v = 0;
+ decode(struct_v, bp);
+ ceph_assert(struct_v >= 8);
+
+ // get epoch
+ bp = values[string(epoch_key)].begin();
+ decode(cur_epoch, bp);
+ } else {
+ // probably bug 10617; see OSD::load_pgs()
+ return -1;
+ }
+
+ *pepoch = cur_epoch;
+ return 0;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+bool PG::check_log_for_corruption(ObjectStore *store)
+{
+ /// TODO: this method needs to work with the omap log
+ return true;
+}
+
+//! Get the name we're going to save our corrupt pg log as
+std::string PG::get_corrupt_pg_log_name() const
+{
+ const int MAX_BUF = 512;
+ char buf[MAX_BUF];
+ struct tm tm_buf;
+ time_t my_time(time(NULL));
+ const struct tm *t = localtime_r(&my_time, &tm_buf);
+ int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
+ if (ret == 0) {
+ dout(0) << "strftime failed" << dendl;
+ return "corrupt_log_unknown_time";
+ }
+ string out(buf);
+ out += stringify(info.pgid);
+ return out;
+}
+
+int PG::read_info(
+ ObjectStore *store, spg_t pgid, const coll_t &coll,
+ pg_info_t &info, PastIntervals &past_intervals,
+ __u8 &struct_v)
+{
+ set<string> keys;
+ keys.insert(string(infover_key));
+ keys.insert(string(info_key));
+ keys.insert(string(biginfo_key));
+ keys.insert(string(fastinfo_key));
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ map<string,bufferlist> values;
+ auto ch = store->open_collection(coll);
+ ceph_assert(ch);
+ int r = store->omap_get_values(ch, pgmeta_oid, keys, &values);
+ ceph_assert(r == 0);
+ ceph_assert(values.size() == 3 ||
+ values.size() == 4);
+
+ auto p = values[string(infover_key)].cbegin();
+ decode(struct_v, p);
+ ceph_assert(struct_v >= 10);
+
+ p = values[string(info_key)].begin();
+ decode(info, p);
+
+ p = values[string(biginfo_key)].begin();
+ decode(past_intervals, p);
+ decode(info.purged_snaps, p);
+
+ p = values[string(fastinfo_key)].begin();
+ if (!p.end()) {
+ pg_fast_info_t fast;
+ decode(fast, p);
+ fast.try_apply_to(&info);
+ }
+ return 0;
+}
+
+void PG::read_state(ObjectStore *store)
+{
+ PastIntervals past_intervals_from_disk;
+ pg_info_t info_from_disk;
+ int r = read_info(
+ store,
+ pg_id,
+ coll,
+ info_from_disk,
+ past_intervals_from_disk,
+ info_struct_v);
+ ceph_assert(r >= 0);
+
+ if (info_struct_v < pg_compat_struct_v) {
+ derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
+ << " an older version first." << dendl;
+ ceph_abort_msg("PG too old to upgrade");
+ }
+
+ recovery_state.init_from_disk_state(
+ std::move(info_from_disk),
+ std::move(past_intervals_from_disk),
+ [this, store] (PGLog &pglog) {
+ ostringstream oss;
+ pglog.read_log_and_missing(
+ store,
+ ch,
+ pgmeta_oid,
+ info,
+ oss,
+ cct->_conf->osd_ignore_stale_divergent_priors,
+ cct->_conf->osd_debug_verify_missing_on_start);
+
+ if (oss.tellp())
+ osd->clog->error() << oss.str();
+ return 0;
+ });
+
+ if (info_struct_v < pg_latest_struct_v) {
+ upgrade(store);
+ }
+
+ // initialize current mapping
+ {
+ int primary, up_primary;
+ vector<int> acting, up;
+ get_osdmap()->pg_to_up_acting_osds(
+ pg_id.pgid, &up, &up_primary, &acting, &primary);
+ recovery_state.init_primary_up_acting(
+ up,
+ acting,
+ up_primary,
+ primary);
+ recovery_state.set_role(OSDMap::calc_pg_role(pg_whoami, acting));
+ }
+
+ // init pool options
+ store->set_collection_opts(ch, pool.info.opts);
+
+ PeeringCtx rctx(ceph_release_t::unknown);
+ handle_initialize(rctx);
+ // note: we don't activate here because we know the OSD will advance maps
+ // during boot.
+ write_if_dirty(rctx.transaction);
+ store->queue_transaction(ch, std::move(rctx.transaction));
+}
+
+void PG::update_snap_map(
+ const vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction &t)
+{
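+ // mirror the log entries into the SnapMapper: drop mappings on delete,
+ // add them on clone/promote, and update them on modify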
+ for (auto i = log_entries.cbegin(); i != log_entries.cend(); ++i) {
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ if (i->soid.snap < CEPH_MAXSNAP) {
+ if (i->is_delete()) {
+ int r = snap_mapper.remove_oid(
+ i->soid,
+ &_t);
+ if (r)
+ derr << __func__ << " remove_oid " << i->soid << " failed with " << r << dendl;
+ // On removal tolerate missing key corruption
+ ceph_assert(r == 0 || r == -ENOENT);
+ } else if (i->is_update()) {
+ ceph_assert(i->snaps.length() > 0);
+ vector<snapid_t> snaps;
+ bufferlist snapbl = i->snaps;
+ auto p = snapbl.cbegin();
+ try {
+ decode(snaps, p);
+ } catch (...) {
+ derr << __func__ << " decode snaps failure on " << *i << dendl;
+ snaps.clear();
+ }
+ set<snapid_t> _snaps(snaps.begin(), snaps.end());
+
+ if (i->is_clone() || i->is_promote()) {
+ snap_mapper.add_oid(
+ i->soid,
+ _snaps,
+ &_t);
+ } else if (i->is_modify()) {
+ int r = snap_mapper.update_snaps(
+ i->soid,
+ _snaps,
+ 0,
+ &_t);
+ ceph_assert(r == 0);
+ } else {
+ ceph_assert(i->is_clean());
+ }
+ }
+ }
+ }
+}
+
+/**
+ * filter trimming|trimmed snaps out of snapcontext
+ */
+void PG::filter_snapc(vector<snapid_t> &snaps)
+{
+ // nothing needs trimming; we can return immediately
+ if (snap_trimq.empty() && info.purged_snaps.empty())
+ return;
+
+ bool filtering = false;
+ vector<snapid_t> newsnaps;
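+ // lazy copy: only start building 'newsnaps' once the first snap that must be filtered out is found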
+ for (vector<snapid_t>::iterator p = snaps.begin();
+ p != snaps.end();
+ ++p) {
+ if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
+ if (!filtering) {
+ // start building a new vector with what we've seen so far
+ dout(10) << "filter_snapc filtering " << snaps << dendl;
+ newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
+ filtering = true;
+ }
+ dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
+ } else {
+ if (filtering)
+ newsnaps.push_back(*p); // continue building new vector
+ }
+ }
+ if (filtering) {
+ snaps.swap(newsnaps);
+ dout(10) << "filter_snapc result " << snaps << dendl;
+ }
+}
+
+void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
+{
+ for (auto it = m.begin(); it != m.end(); ++it)
+ requeue_ops(it->second);
+ m.clear();
+}
+
+void PG::requeue_op(OpRequestRef op)
+{
+ auto p = waiting_for_map.find(op->get_source());
+ if (p != waiting_for_map.end()) {
+ dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
+ << dendl;
+ p->second.push_front(op);
+ } else {
+ dout(20) << __func__ << " " << op << dendl;
+ osd->enqueue_front(
+ OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
+ op->get_req()->get_cost(),
+ op->get_req()->get_priority(),
+ op->get_req()->get_recv_stamp(),
+ op->get_req()->get_source().num(),
+ get_osdmap_epoch()));
+ }
+}
+
+void PG::requeue_ops(list<OpRequestRef> &ls)
+{
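+ // walk the list back to front so the repeated push-to-front calls preserve the original op order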
+ for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
+ i != ls.rend();
+ ++i) {
+ requeue_op(*i);
+ }
+ ls.clear();
+}
+
+void PG::requeue_map_waiters()
+{
+ epoch_t epoch = get_osdmap_epoch();
+ auto p = waiting_for_map.begin();
+ while (p != waiting_for_map.end()) {
+ if (epoch < p->second.front()->min_epoch) {
+ dout(20) << __func__ << " " << p->first << " front op "
+ << p->second.front() << " must still wait, doing nothing"
+ << dendl;
+ ++p;
+ } else {
+ dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
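+ // re-inject newest-first so enqueue_front keeps the ops in their original order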
+ for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
+ auto req = *q;
+ osd->enqueue_front(OpSchedulerItem(
+ unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, req)),
+ req->get_req()->get_cost(),
+ req->get_req()->get_priority(),
+ req->get_req()->get_recv_stamp(),
+ req->get_req()->get_source().num(),
+ epoch));
+ }
+ p = waiting_for_map.erase(p);
+ }
+ }
+}
+
+bool PG::get_must_scrub() const
+{
+ dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? "true" : "false") << dendl;
+ return m_planned_scrub.must_scrub;
+}
+
+unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+ return m_scrubber->scrub_requeue_priority(with_priority);
+}
+
+unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const
+{
+ return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority);
+}
+
+// ==========================================================================================
+// SCRUB
+
+/*
+ * implementation note:
+ * PG::sched_scrub() is called only once per specific scrub session.
+ * That call commits us to whatever choices are made (deep/shallow, etc.).
+ * Unless we fail to start scrubbing, the 'planned scrub' flag set is 'frozen' into
+ * PgScrubber's m_flags and then cleared.
+ */
+bool PG::sched_scrub()
+{
+ dout(15) << __func__ << " pg(" << info.pgid
+ << (is_active() ? ") <active>" : ") <not-active>")
+ << (is_clean() ? " <clean>" : " <not-clean>") << dendl;
+ ceph_assert(ceph_mutex_is_locked(_lock));
+
+ if (!is_primary() || !is_active() || !is_clean()) {
+ return false;
+ }
+
+ if (is_scrub_queued_or_active()) {
+ return false;
+ }
+
+ // analyse the combination of the requested scrub flags, the osd/pool configuration
+ // and the PG status to determine whether we should scrub now, and what type of
+ // scrub it should be.
+ auto updated_flags = verify_scrub_mode();
+ if (!updated_flags) {
+ // the stars do not align for starting a scrub for this PG at this time
+ // (due to configuration or priority issues)
+ // The reason was already reported by the callee.
+ dout(10) << __func__ << ": failed to initiate a scrub" << dendl;
+ return false;
+ }
+
+ // try to reserve the local OSD resources. If failing: no harm. We will
+ // be retried by the OSD later on.
+ if (!m_scrubber->reserve_local()) {
+ dout(10) << __func__ << ": failed to reserve locally" << dendl;
+ return false;
+ }
+
+ // can commit to the updated flags now, as nothing will stop the scrub
+ m_planned_scrub = *updated_flags;
+
+ // An interrupted recovery repair could leave this set.
+ state_clear(PG_STATE_REPAIR);
+
+ // Pass control to the scrubber. It is the scrubber that handles the replicas'
+ // resource reservations.
+ m_scrubber->set_op_parameters(m_planned_scrub);
+
+ dout(10) << __func__ << ": queueing" << dendl;
+ m_scrubber->set_queued_or_active();
+ osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority);
+ return true;
+}
+
+double PG::next_deepscrub_interval() const
+{
+ double deep_scrub_interval =
+ pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0);
+ if (deep_scrub_interval <= 0.0)
+ deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+ return info.history.last_deep_scrub_stamp + deep_scrub_interval;
+}
+
+bool PG::is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const
+{
+ dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? "
+ << allow_deep_scrub << dendl;
+
+ if (!allow_deep_scrub)
+ return false;
+
+ if (planned.need_auto) {
+ dout(10) << __func__ << ": need repair after scrub errors" << dendl;
+ return true;
+ }
+
+ if (ceph_clock_now() >= next_deepscrub_interval()) {
+ dout(20) << __func__ << ": now (" << ceph_clock_now() << ") >= time for deep ("
+ << next_deepscrub_interval() << ")" << dendl;
+ return true;
+ }
+
+ if (has_deep_errors) {
+ osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Deep scrub errors, upgrading scrub to deep-scrub";
+ return true;
+ }
+
+ // we only flip the coin if 'allow_scrub' is asserted; otherwise, as this function
+ // is called often, we would end up deep-scrubbing most of the time.
+ if (allow_scrub) {
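+ // a positive coin flip promotes this periodic scrub to a deep scrub
+ // (with probability osd_deep_scrub_randomize_ratio)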
+ bool deep_coin_flip =
+ (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
+
+ dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep
+ << " deep_coin_flip=" << deep_coin_flip << dendl;
+
+ if (deep_coin_flip)
+ return true;
+ }
+
+ return false;
+}
+
+bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const
+
+{
+ ceph_assert(!planned.must_deep_scrub && !planned.must_repair);
+
+ if (!allow_deep_scrub && has_deep_errors) {
+ osd->clog->error()
+ << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
+ return false;
+ }
+
+ if (allow_deep_scrub) {
+ // Initial entry and scheduled scrubs without nodeep_scrub set get here
+
+ planned.time_for_deep =
+ is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned);
+
+ if (try_to_auto_repair) {
+ if (planned.time_for_deep) {
+ dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+ planned.auto_repair = true;
+ } else if (allow_regular_scrub) {
+ dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found"
+ << dendl;
+ planned.deep_scrub_on_error = true;
+ }
+ }
+ }
+
+ dout(20) << __func__ << " updated flags: " << planned
+ << " allow_regular_scrub: " << allow_regular_scrub << dendl;
+
+ // NOSCRUB so skip regular scrubs
+ if (!allow_regular_scrub && !planned.time_for_deep) {
+ return false;
+ }
+
+ return true;
+}
+
+std::optional<requested_scrub_t> PG::verify_scrub_mode() const
+{
+ dout(10) << __func__ << " processing pg " << info.pgid << dendl;
+
+ bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+ bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
+ bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
+ bool try_to_auto_repair =
+ (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported());
+
+ auto upd_flags = m_planned_scrub;
+
+ upd_flags.time_for_deep = false;
+ // Clear these in case the user issues the scrub/repair command during
+ // the scheduling of the scrub/repair (e.g. while requesting reservations)
+ upd_flags.deep_scrub_on_error = false;
+ upd_flags.auto_repair = false;
+
+ if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) {
+ osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid
+ << " Regular scrub request, deep-scrub details will be lost";
+ }
+
+ if (!upd_flags.must_scrub) {
+ // All periodic scrub handling goes here because must_scrub is
+ // always set for must_deep_scrub and must_repair.
+
+ bool can_start_periodic =
+ verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair,
+ allow_regular_scrub, has_deep_errors, upd_flags);
+ if (!can_start_periodic) {
+ return std::nullopt;
+ }
+ }
+
+ // scrubbing while recovering?
+
+ bool prevented_by_recovery =
+ osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery &&
+ (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair);
+
+ if (prevented_by_recovery) {
+ dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl;
+ return std::nullopt;
+ }
+
+ upd_flags.need_auto = false;
+ return upd_flags;
+}
+
+void PG::reg_next_scrub()
+{
+ m_scrubber->reg_next_scrub(m_planned_scrub);
+}
+
+void PG::on_info_history_change()
+{
+ dout(20) << __func__ << dendl;
+ if (m_scrubber) {
+ m_scrubber->unreg_next_scrub();
+ m_scrubber->reg_next_scrub(m_planned_scrub);
+ }
+}
+
+void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type)
+{
+ if (m_scrubber) {
+ m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub);
+ }
+}
+
+void PG::clear_ready_to_merge() {
+ osd->clear_ready_to_merge(this);
+}
+
+void PG::queue_want_pg_temp(const vector<int> &wanted) {
+ osd->queue_want_pg_temp(get_pgid().pgid, wanted);
+}
+
+void PG::clear_want_pg_temp() {
+ osd->remove_want_pg_temp(get_pgid().pgid);
+}
+
+void PG::on_role_change() {
+ requeue_ops(waiting_for_peered);
+ plpg_on_role_change();
+}
+
+void PG::on_new_interval()
+{
+ dout(20) << __func__ << ": scrub flags on new interval: " << m_planned_scrub
+ << dendl;
+ projected_last_update = eversion_t();
+ cancel_recovery();
+}
+
+epoch_t PG::oldest_stored_osdmap() {
+ return osd->get_superblock().oldest_map;
+}
+
+OstreamTemp PG::get_clog_info() {
+ return osd->clog->info();
+}
+
+OstreamTemp PG::get_clog_debug() {
+ return osd->clog->debug();
+}
+
+OstreamTemp PG::get_clog_error() {
+ return osd->clog->error();
+}
+
+void PG::schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) {
+ std::lock_guard lock(osd->recovery_request_lock);
+ osd->recovery_request_timer.add_event_after(
+ delay,
+ new QueuePeeringEvt(
+ this,
+ std::move(event)));
+}
+
+void PG::request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) {
+ osd->local_reserver.request_reservation(
+ pg_id,
+ on_grant ? new QueuePeeringEvt(
+ this, std::move(on_grant)) : nullptr,
+ priority,
+ on_preempt ? new QueuePeeringEvt(
+ this, std::move(on_preempt)) : nullptr);
+}
+
+void PG::update_local_background_io_priority(
+ unsigned priority) {
+ osd->local_reserver.update_priority(
+ pg_id,
+ priority);
+}
+
+void PG::cancel_local_background_io_reservation() {
+ osd->local_reserver.cancel_reservation(
+ pg_id);
+}
+
+void PG::request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) {
+ osd->remote_reserver.request_reservation(
+ pg_id,
+ on_grant ? new QueuePeeringEvt(
+ this, std::move(on_grant)) : nullptr,
+ priority,
+ on_preempt ? new QueuePeeringEvt(
+ this, std::move(on_preempt)) : nullptr);
+}
+
+void PG::cancel_remote_recovery_reservation() {
+ osd->remote_reserver.cancel_reservation(
+ pg_id);
+}
+
+void PG::schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit)
+{
+ t.register_on_commit(new QueuePeeringEvt(this, on_commit));
+}
+
+void PG::on_activate(interval_set<snapid_t> snaps)
+{
+ ceph_assert(!m_scrubber->are_callbacks_pending());
+ ceph_assert(callbacks_for_degraded_object.empty());
+ snap_trimq = snaps;
+ release_pg_backoffs();
+ projected_last_update = info.last_update;
+}
+
+void PG::on_active_exit()
+{
+ backfill_reserving = false;
+ agent_stop();
+}
+
+void PG::on_active_advmap(const OSDMapRef &osdmap)
+{
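+ // queue any snaps newly removed in this map for trimming, flagging (but tolerating)
+ // any that were somehow already queued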
+ const auto& new_removed_snaps = osdmap->get_new_removed_snaps();
+ auto i = new_removed_snaps.find(get_pgid().pool());
+ if (i != new_removed_snaps.end()) {
+ bool bad = false;
+ for (auto j : i->second) {
+ if (snap_trimq.intersects(j.first, j.second)) {
+ decltype(snap_trimq) added, overlap;
+ added.insert(j.first, j.second);
+ overlap.intersection_of(snap_trimq, added);
+ derr << __func__ << " removed_snaps already contains "
+ << overlap << dendl;
+ bad = true;
+ snap_trimq.union_of(added);
+ } else {
+ snap_trimq.insert(j.first, j.second);
+ }
+ }
+ dout(10) << __func__ << " new removed_snaps " << i->second
+ << ", snap_trimq now " << snap_trimq << dendl;
+ ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
+ }
+
+ const auto& new_purged_snaps = osdmap->get_new_purged_snaps();
+ auto j = new_purged_snaps.find(get_pgid().pgid.pool());
+ if (j != new_purged_snaps.end()) {
+ bool bad = false;
+ for (auto k : j->second) {
+ if (!recovery_state.get_info().purged_snaps.contains(k.first, k.second)) {
+ interval_set<snapid_t> rm, overlap;
+ rm.insert(k.first, k.second);
+ overlap.intersection_of(recovery_state.get_info().purged_snaps, rm);
+ derr << __func__ << " purged_snaps does not contain "
+ << rm << ", only " << overlap << dendl;
+ recovery_state.adjust_purged_snaps(
+ [&overlap](auto &purged_snaps) {
+ purged_snaps.subtract(overlap);
+ });
+ // This can currently happen in the normal (if unlikely) course of
+ // events. Because adding snaps to purged_snaps does not increase
+ // the pg version or add a pg log entry, we don't reliably propagate
+ // purged_snaps additions to other OSDs.
+ // One example:
+ // - purge S
+ // - primary and replicas update purged_snaps
+ // - no object updates
+ // - pg mapping changes, new primary on different node
+ // - new primary pg version == eversion_t(), so info is not
+ // propagated.
+ //bad = true;
+ } else {
+ recovery_state.adjust_purged_snaps(
+ [&k](auto &purged_snaps) {
+ purged_snaps.erase(k.first, k.second);
+ });
+ }
+ }
+ dout(10) << __func__ << " new purged_snaps " << j->second
+ << ", now " << recovery_state.get_info().purged_snaps << dendl;
+ ceph_assert(!bad || !cct->_conf->osd_debug_verify_cached_snaps);
+ }
+}
+
+void PG::queue_snap_retrim(snapid_t snap)
+{
+ if (!is_active() ||
+ !is_primary()) {
+ dout(10) << __func__ << " snap " << snap << " - not active and primary"
+ << dendl;
+ return;
+ }
+ if (!snap_trimq.contains(snap)) {
+ snap_trimq.insert(snap);
+ snap_trimq_repeat.insert(snap);
+ dout(20) << __func__ << " snap " << snap
+ << ", trimq now " << snap_trimq
+ << ", repeat " << snap_trimq_repeat << dendl;
+ kick_snap_trim();
+ } else {
+ dout(20) << __func__ << " snap " << snap
+ << " already in trimq " << snap_trimq << dendl;
+ }
+}
+
+void PG::on_active_actmap()
+{
+ if (cct->_conf->osd_check_for_log_corruption)
+ check_log_for_corruption(osd->store);
+
+
+ if (recovery_state.is_active()) {
+ dout(10) << "Active: kicking snap trim" << dendl;
+ kick_snap_trim();
+ }
+
+ if (recovery_state.is_peered() &&
+ !recovery_state.is_clean() &&
+ !recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
+ (!recovery_state.get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) ||
+ recovery_state.is_degraded())) {
+ queue_recovery();
+ }
+}
+
+void PG::on_backfill_reserved()
+{
+ backfill_reserving = false;
+ queue_recovery();
+}
+
+void PG::on_backfill_canceled()
+{
+ if (!waiting_on_backfill.empty()) {
+ waiting_on_backfill.clear();
+ finish_recovery_op(hobject_t::get_max());
+ }
+}
+
+void PG::on_recovery_reserved()
+{
+ queue_recovery();
+}
+
+void PG::set_not_ready_to_merge_target(pg_t pgid, pg_t src)
+{
+ osd->set_not_ready_to_merge_target(pgid, src);
+}
+
+void PG::set_not_ready_to_merge_source(pg_t pgid)
+{
+ osd->set_not_ready_to_merge_source(pgid);
+}
+
+void PG::set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec)
+{
+ osd->set_ready_to_merge_target(this, lu, les, lec);
+}
+
+void PG::set_ready_to_merge_source(eversion_t lu)
+{
+ osd->set_ready_to_merge_source(this, lu);
+}
+
+void PG::send_pg_created(pg_t pgid)
+{
+ osd->send_pg_created(pgid);
+}
+
+ceph::signedspan PG::get_mnow()
+{
+ return osd->get_mnow();
+}
+
+HeartbeatStampsRef PG::get_hb_stamps(int peer)
+{
+ return osd->get_hb_stamps(peer);
+}
+
+void PG::schedule_renew_lease(epoch_t lpr, ceph::timespan delay)
+{
+ auto spgid = info.pgid;
+ auto o = osd;
+ osd->mono_timer.add_event(
+ delay,
+ [o, lpr, spgid]() {
+ o->queue_renew_lease(lpr, spgid);
+ });
+}
+
+void PG::queue_check_readable(epoch_t lpr, ceph::timespan delay)
+{
+ osd->queue_check_readable(info.pgid, lpr, delay);
+}
+
+void PG::rebuild_missing_set_with_deletes(PGLog &pglog)
+{
+ pglog.rebuild_missing_set_with_deletes(
+ osd->store,
+ ch,
+ recovery_state.get_info());
+}
+
+void PG::on_activate_committed()
+{
+ if (!is_primary()) {
+ // waiters
+ if (recovery_state.needs_flush() == 0) {
+ requeue_ops(waiting_for_peered);
+ } else if (!waiting_for_peered.empty()) {
+ dout(10) << __func__ << " flushes in progress, moving "
+ << waiting_for_peered.size() << " items to waiting_for_flush"
+ << dendl;
+ ceph_assert(waiting_for_flush.empty());
+ waiting_for_flush.swap(waiting_for_peered);
+ }
+ }
+}
+
+// Compute pending backfill data
+static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
+{
+ lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
+ << (local_bytes >> 10) << "KiB"
+ << " primary usage " << (bf_bytes >> 10)
+ << "KiB" << dendl;
+
+ return std::max((int64_t)0, bf_bytes - local_bytes);
+}
+
+
+// Zeroing primary_num_bytes needs only the atomic itself.
+// Setting it above zero, however, reserves space for backfill and requires
+// OSDService::stat_lock, which protects all OSD usage stats.
+bool PG::try_reserve_recovery_space(
+ int64_t primary_bytes, int64_t local_bytes) {
+ // Use tentative_backfill_full() to make sure enough
+ // space is available to handle target bytes from primary.
+
+ // TODO: If we passed num_objects from primary we could account for
+ // an estimate of the metadata overhead.
+
+ // TODO: If we had compressed_allocated and compressed_original from primary
+ // we could compute compression ratio and adjust accordingly.
+
+ // XXX: There is no way to get the omap overhead, and it would only apply
+ // to the (possibly different) partition that stores the database.
+
+ // update_osd_stat() from heartbeat will do this on a new
+ // statfs using ps->primary_bytes.
+ uint64_t pending_adjustment = 0;
+ if (primary_bytes) {
+ // For an erasure-coded pool, overestimate by a full stripe per object
+ // because we don't know how each object rounded up to the nearest stripe
+ if (pool.info.is_erasure()) {
+ primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+ info.stats.stats.sum.num_objects;
+ local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+ info.stats.stats.sum.num_objects;
+ }
+ pending_adjustment = pending_backfill(
+ cct,
+ primary_bytes,
+ local_bytes);
+ dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
+ << "KiB"
+ << " local " << (local_bytes >> 10) << "KiB"
+ << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
+ << dendl;
+ }
+
+ // This lock protects not only the OSDService stats but also the setting of
+ // the pg's primary_bytes; that's why we don't unlock immediately.
+ std::lock_guard l{osd->stat_lock};
+ osd_stat_t cur_stat = osd->osd_stat;
+ if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+ (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+ dout(10) << "backfill reservation rejected: failure injection"
+ << dendl;
+ return false;
+ } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+ osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
+ dout(10) << "backfill reservation rejected: backfill full"
+ << dendl;
+ return false;
+ } else {
+ // Don't reserve space if the reservation check was skipped; this is used
+ // to test the other backfill-full check, and also for the case where a
+ // corrupted num_bytes requires ignoring that value and attempting the
+ // backfill anyway.
+ if (primary_bytes &&
+ !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+ primary_num_bytes.store(primary_bytes);
+ local_num_bytes.store(local_bytes);
+ } else {
+ unreserve_recovery_space();
+ }
+ return true;
+ }
+}
+
+void PG::unreserve_recovery_space() {
+ primary_num_bytes.store(0);
+ local_num_bytes.store(0);
+}
+
+void PG::_scan_rollback_obs(const vector<ghobject_t> &rollback_obs)
+{
+ ObjectStore::Transaction t;
+ eversion_t trimmed_to = recovery_state.get_last_rollback_info_trimmed_to_applied();
+ for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
+ i != rollback_obs.end();
+ ++i) {
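+ // rollback data older than the trimmed-to version can never be needed again; remove it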
+ if (i->generation < trimmed_to.version) {
+ dout(10) << __func__ << "osd." << osd->whoami
+ << " pg " << info.pgid
+ << " found obsolete rollback obj "
+ << *i << " generation < trimmed_to "
+ << trimmed_to
+ << "...repaired" << dendl;
+ t.remove(coll, *i);
+ }
+ }
+ if (!t.empty()) {
+ derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
+ << dendl;
+ osd->store->queue_transaction(ch, std::move(t), NULL);
+ }
+}
+
+
+void PG::_repair_oinfo_oid(ScrubMap &smap)
+{
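+ // if the oid recorded inside an object's OI_ATTR does not match the object it is
+ // stored on, rewrite the attribute with the correct oid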
+ for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
+ i != smap.objects.rend();
+ ++i) {
+ const hobject_t &hoid = i->first;
+ ScrubMap::object &o = i->second;
+
+ bufferlist bl;
+ if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[OI_ATTR]);
+ object_info_t oi;
+ try {
+ oi.decode(bl);
+ } catch(...) {
+ continue;
+ }
+ if (oi.soid != hoid) {
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ osd->clog->error() << "osd." << osd->whoami
+ << " found object info error on pg "
+ << info.pgid
+ << " oid " << hoid << " oid in object info: "
+ << oi.soid
+ << "...repaired";
+ // Fix object info
+ oi.soid = hoid;
+ bl.clear();
+ encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ bufferptr bp(bl.c_str(), bl.length());
+ o.attrs[OI_ATTR] = bp;
+
+ t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
+ int r = osd->store->queue_transaction(ch, std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": queue_transaction got " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+}
+
+void PG::repair_object(
+ const hobject_t &soid,
+ const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers,
+ const set<pg_shard_t> &bad_peers)
+{
+ set<pg_shard_t> ok_shards;
+ for (auto &&peer: ok_peers) ok_shards.insert(peer.second);
+
+ dout(10) << "repair_object " << soid
+ << " bad_peers osd.{" << bad_peers << "},"
+ << " ok_peers osd.{" << ok_shards << "}" << dendl;
+
+ const ScrubMap::object &po = ok_peers.back().first;
+ eversion_t v;
+ object_info_t oi;
+ try {
+ bufferlist bv;
+ if (po.attrs.count(OI_ATTR)) {
+ bv.push_back(po.attrs.find(OI_ATTR)->second);
+ }
+ auto bliter = bv.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": Need version of replica, bad object_info_t: "
+ << soid << dendl;
+ ceph_abort();
+ }
+
+ if (bad_peers.count(get_primary())) {
+ // We should only be scrubbing if the PG is clean.
+ ceph_assert(waiting_for_unreadable_object.empty());
+ dout(10) << __func__ << ": primary = " << get_primary() << dendl;
+ }
+
+ /* No need to pass ok_peers, they must not be missing the object, so
+ * force_object_missing will add them to missing_loc anyway */
+ recovery_state.force_object_missing(bad_peers, soid, oi.version);
+}
+
+void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc)
+{
+ dout(20) << __func__ << ": " << desc << " queued at: " << epoch_queued << dendl;
+ ceph_assert(m_scrubber);
+ if (is_active()) {
+ ((*m_scrubber).*fn)(epoch_queued);
+ } else {
+ // pg might be in the process of being deleted
+ dout(5) << __func__ << " refusing to forward. "
+ << (is_clean() ? "(clean) " : "(not clean) ")
+ << (is_active() ? "(active) " : "(not active) ") << dendl;
+ }
+}
+
+void PG::forward_scrub_event(ScrubSafeAPI fn,
+ epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ std::string_view desc)
+{
+ dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued
+ << " token: " << act_token << dendl;
+ ceph_assert(m_scrubber);
+ if (is_active()) {
+ ((*m_scrubber).*fn)(epoch_queued, act_token);
+ } else {
+ // pg might be in the process of being deleted
+ dout(5) << __func__ << " refusing to forward. "
+ << (is_clean() ? "(clean) " : "(not clean) ")
+ << (is_active() ? "(active) " : "(not active) ") << dendl;
+ }
+}
+
+void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " (op)" << dendl;
+ ceph_assert(m_scrubber);
+ m_scrubber->replica_scrub_op(op);
+}
+
+void PG::replica_scrub(epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ [[maybe_unused]] ThreadPool::TPHandle& handle)
+{
+ dout(10) << __func__ << " queued at: " << epoch_queued
+ << (is_primary() ? " (primary)" : " (replica)") << dendl;
+ forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token,
+ "StartReplica/nw");
+}
+
+bool PG::ops_blocked_by_scrub() const
+{
+ return !waiting_for_scrub.empty();
+}
+
+Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const
+{
+ return waiting_for_scrub.empty() ? Scrub::scrub_prio_t::low_priority
+ : Scrub::scrub_prio_t::high_priority;
+}
+
+bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
+{
+ if (auto last_reset = get_last_peering_reset();
+ last_reset > reply_epoch || last_reset > query_epoch) {
+ dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch "
+ << query_epoch << " last_peering_reset " << last_reset << dendl;
+ return true;
+ }
+ return false;
+}
+
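+// RAII trigger: destroyed once both the on_applied and on_commit callbacks drop
+// their references, at which point the in-progress flush is marked complete
+// (unless the PG has been reset since).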
+struct FlushState {
+ PGRef pg;
+ epoch_t epoch;
+ FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
+ ~FlushState() {
+ std::scoped_lock l{*pg};
+ if (!pg->pg_has_reset_since(epoch)) {
+ pg->recovery_state.complete_flush();
+ }
+ }
+};
+typedef std::shared_ptr<FlushState> FlushStateRef;
+
+void PG::start_flush_on_transaction(ObjectStore::Transaction &t)
+{
+ // flush in progress ops
+ FlushStateRef flush_trigger (std::make_shared<FlushState>(
+ this, get_osdmap_epoch()));
+ t.register_on_applied(new ContainerContext<FlushStateRef>(flush_trigger));
+ t.register_on_commit(new ContainerContext<FlushStateRef>(flush_trigger));
+}
+
+bool PG::try_flush_or_schedule_async()
+{
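+ // flush_commit() returns true when everything has already committed; in that case
+ // the callback is never queued, so delete it and report the flush as already done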
+ Context *c = new QueuePeeringEvt(
+ this, get_osdmap_epoch(), PeeringState::IntervalFlush());
+ if (!ch->flush_commit(c)) {
+ return false;
+ } else {
+ delete c;
+ return true;
+ }
+}
+
+ostream& operator<<(ostream& out, const PG& pg)
+{
+ out << pg.recovery_state;
+
+ // listing all scrub-related flags - both current and "planned next scrub"
+ if (pg.is_scrubbing()) {
+ out << *pg.m_scrubber;
+ }
+ out << pg.m_planned_scrub;
+
+ if (pg.recovery_ops_active)
+ out << " rops=" << pg.recovery_ops_active;
+
+ //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
+ if (pg.recovery_state.have_missing()) {
+ out << " m=" << pg.recovery_state.get_num_missing();
+ if (pg.is_primary()) {
+ uint64_t unfound = pg.recovery_state.get_num_unfound();
+ if (unfound)
+ out << " u=" << unfound;
+ }
+ }
+ if (!pg.is_clean()) {
+ out << " mbc=" << pg.recovery_state.get_missing_by_count();
+ }
+ if (!pg.snap_trimq.empty()) {
+ out << " trimq=";
+ // only show a count if the set is large
+ if (pg.snap_trimq.num_intervals() > 16) {
+ out << pg.snap_trimq.size();
+ if (!pg.snap_trimq_repeat.empty()) {
+ out << "(" << pg.snap_trimq_repeat.size() << ")";
+ }
+ } else {
+ out << pg.snap_trimq;
+ if (!pg.snap_trimq_repeat.empty()) {
+ out << "(" << pg.snap_trimq_repeat << ")";
+ }
+ }
+ }
+ if (!pg.recovery_state.get_info().purged_snaps.empty()) {
+ out << " ps="; // snap trim queue / purged snaps
+ if (pg.recovery_state.get_info().purged_snaps.num_intervals() > 16) {
+ out << pg.recovery_state.get_info().purged_snaps.size();
+ } else {
+ out << pg.recovery_state.get_info().purged_snaps;
+ }
+ }
+
+ out << "]";
+ return out;
+}
+
+bool PG::can_discard_op(OpRequestRef& op)
+{
+ auto m = op->get_req<MOSDOp>();
+ if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
+ dout(20) << " discard " << *m << dendl;
+ return true;
+ }
+
+ if (m->get_map_epoch() < info.history.same_primary_since) {
+ dout(7) << " changed after " << m->get_map_epoch()
+ << ", dropping " << *m << dendl;
+ return true;
+ }
+
+ if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_primary() &&
+ m->get_map_epoch() < info.history.same_interval_since) {
+ // Note: the Objecter will resend on interval change without the primary
+ // changing if it actually sent to a replica. If the primary hasn't
+ // changed since the send epoch, we got it, and we're primary, it won't
+ // have resent even if the interval did change as it sent it to the primary
+ // (us).
+ return true;
+ }
+
+
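+ // which force-resend epoch applies depends on the client's resend-related feature bits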
+ if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
+ // >= luminous client
+ if (m->get_connection()->has_feature(CEPH_FEATURE_SERVER_NAUTILUS)) {
+ // >= nautilus client
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
+ dout(7) << __func__ << " sent before last_force_op_resend "
+ << pool.info.last_force_op_resend
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ } else {
+ // < nautilus client (luminous or mimic)
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend_prenautilus()) {
+ dout(7) << __func__ << " sent before last_force_op_resend_prenautilus "
+ << pool.info.last_force_op_resend_prenautilus
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ }
+ if (m->get_map_epoch() < info.history.last_epoch_split) {
+ dout(7) << __func__ << " pg split in "
+ << info.history.last_epoch_split << ", dropping" << dendl;
+ return true;
+ }
+ } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
+ // < luminous client
+ if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
+ dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
+ << pool.info.last_force_op_resend_preluminous
+ << ", dropping" << *m << dendl;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+template<typename T, int MSGTYPE>
+bool PG::can_discard_replica_op(OpRequestRef& op)
+{
+ auto m = op->get_req<T>();
+ ceph_assert(m->get_type() == MSGTYPE);
+
+ int from = m->get_source().num();
+
+ // if a repop is replied after a replica goes down in a new osdmap, and
+ // before the pg advances to this new osdmap, the repop replies before this
+ // repop can be discarded by that replica OSD, because the primary resets the
+ // connection to it when handling the new osdmap marking it down, and also
+ // resets the messenger session when the replica reconnects. To avoid
+ // out-of-order replies, the messages from that replica should be discarded.
+ OSDMapRef next_map = osd->get_next_osdmap();
+ if (next_map->is_down(from)) {
+ dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl;
+ return true;
+ }
+ /* Mostly, this overlaps with the old_peering_msg
+ * condition. An important exception is pushes
+ * sent by replicas not in the acting set, since
+ * if such a replica goes down it does not cause
+ * a new interval. */
+ if (next_map->get_down_at(from) >= m->map_epoch) {
+ dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl;
+ return true;
+ }
+
+ // same pg?
+ // if pg changes _at all_, we reset and repeer!
+ if (old_peering_msg(m->map_epoch, m->map_epoch)) {
+ dout(10) << "can_discard_replica_op pg changed " << info.history
+ << " after " << m->map_epoch
+ << ", dropping" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_scan(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGScan>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
+
+ if (old_peering_msg(m->map_epoch, m->query_epoch)) {
+ dout(10) << " got old scan, ignoring" << dendl;
+ return true;
+ }
+ return false;
+}
+
+bool PG::can_discard_backfill(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGBackfill>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
+
+ if (old_peering_msg(m->map_epoch, m->query_epoch)) {
+ dout(10) << " got old backfill, ignoring" << dendl;
+ return true;
+ }
+
+ return false;
+
+}
+
+bool PG::can_discard_request(OpRequestRef& op)
+{
+ switch (op->get_req()->get_type()) {
+ case CEPH_MSG_OSD_OP:
+ return can_discard_op(op);
+ case CEPH_MSG_OSD_BACKOFF:
+ return false; // never discard
+ case MSG_OSD_REPOP:
+ return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
+ case MSG_OSD_PG_PUSH:
+ return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
+ case MSG_OSD_PG_PULL:
+ return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
+ case MSG_OSD_PG_PUSH_REPLY:
+ return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
+ case MSG_OSD_REPOPREPLY:
+ return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
+
+ case MSG_OSD_EC_WRITE:
+ return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
+ case MSG_OSD_EC_WRITE_REPLY:
+ return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
+ case MSG_OSD_EC_READ:
+ return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
+ case MSG_OSD_EC_READ_REPLY:
+ return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
+ case MSG_OSD_REP_SCRUB:
+ return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
+ case MSG_OSD_SCRUB_RESERVE:
+ return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
+ case MSG_OSD_REP_SCRUBMAP:
+ return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ return can_discard_replica_op<
+ MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ return can_discard_replica_op<
+ MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
+
+ case MSG_OSD_PG_SCAN:
+ return can_discard_scan(op);
+ case MSG_OSD_PG_BACKFILL:
+ return can_discard_backfill(op);
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ return can_discard_replica_op<MOSDPGBackfillRemove,
+ MSG_OSD_PG_BACKFILL_REMOVE>(op);
+ }
+ return true;
+}
+
+void PG::do_peering_event(PGPeeringEventRef evt, PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << evt->get_desc() << dendl;
+ ceph_assert(have_same_or_newer_map(evt->get_epoch_sent()));
+ if (old_peering_evt(evt)) {
+ dout(10) << "discard old " << evt->get_desc() << dendl;
+ } else {
+ recovery_state.handle_event(evt, &rctx);
+ }
+ // write_if_dirty regardless of path above to ensure we capture any work
+ // done by OSD::advance_pg().
+ write_if_dirty(rctx.transaction);
+}
+
+void PG::queue_peering_event(PGPeeringEventRef evt)
+{
+ if (old_peering_evt(evt))
+ return;
+ osd->osd->enqueue_peering_evt(info.pgid, evt);
+}
+
+void PG::queue_null(epoch_t msg_epoch,
+ epoch_t query_epoch)
+{
+ dout(10) << "null" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(std::make_shared<PGPeeringEvent>(msg_epoch, query_epoch,
+ NullEvt())));
+}
+
+void PG::find_unfound(epoch_t queued, PeeringCtx &rctx)
+{
+ /*
+ * if we couldn't start any recovery ops and things are still
+ * unfound, see if we can discover more missing object locations.
+ * It may be that our initial locations were bad and we errored
+ * out while trying to pull.
+ */
+ if (!recovery_state.discover_all_missing(rctx)) {
+ string action;
+ if (state_test(PG_STATE_BACKFILLING)) {
+ auto evt = PGPeeringEventRef(
+ new PGPeeringEvent(
+ queued,
+ queued,
+ PeeringState::UnfoundBackfill()));
+ queue_peering_event(evt);
+ action = "in backfill";
+ } else if (state_test(PG_STATE_RECOVERING)) {
+ auto evt = PGPeeringEventRef(
+ new PGPeeringEvent(
+ queued,
+ queued,
+ PeeringState::UnfoundRecovery()));
+ queue_peering_event(evt);
+ action = "in recovery";
+ } else {
+ action = "already out of recovery/backfill";
+ }
+ dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
+ } else {
+ dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
+ queue_recovery();
+ }
+}
+
+void PG::handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << osdmap->get_epoch() << dendl;
+ osd_shard->update_pg_epoch(pg_slot, osdmap->get_epoch());
+ recovery_state.advance_map(
+ osdmap,
+ lastmap,
+ newup,
+ up_primary,
+ newacting,
+ acting_primary,
+ rctx);
+}
+
+void PG::handle_activate_map(PeeringCtx &rctx)
+{
+ dout(10) << __func__ << ": " << get_osdmap()->get_epoch()
+ << dendl;
+ recovery_state.activate_map(rctx);
+
+ requeue_map_waiters();
+}
+
+void PG::handle_initialize(PeeringCtx &rctx)
+{
+ dout(10) << __func__ << dendl;
+ PeeringState::Initialize evt;
+ recovery_state.handle_event(evt, &rctx);
+}
+
+
+void PG::handle_query_state(Formatter *f)
+{
+ dout(10) << "handle_query_state" << dendl;
+ PeeringState::QueryState q(f);
+ recovery_state.handle_event(q, 0);
+
+ // Scrub status is reported here, after the recovery_state output, since
+ // scrubbing is not part of the recovery state machine.
+ if (is_primary() && is_active() && m_scrubber && m_scrubber->is_scrub_active()) {
+ m_scrubber->handle_query_state(f);
+ }
+}
+
+void PG::init_collection_pool_opts()
+{
+ auto r = osd->store->set_collection_opts(ch, pool.info.opts);
+ if (r < 0 && r != -EOPNOTSUPP) {
+ derr << __func__ << " set_collection_opts returns error:" << r << dendl;
+ }
+}
+
+void PG::on_pool_change()
+{
+ init_collection_pool_opts();
+ plpg_on_pool_change();
+}
+
+void PG::C_DeleteMore::complete(int r) {
+ ceph_assert(r == 0);
+ pg->lock();
+ if (!pg->pg_has_reset_since(epoch)) {
+ pg->osd->queue_for_pg_delete(pg->get_pgid(), epoch);
+ }
+ pg->unlock();
+ delete this;
+}
+
+std::pair<ghobject_t, bool> PG::do_delete_work(
+ ObjectStore::Transaction &t,
+ ghobject_t _next)
+{
+ dout(10) << __func__ << dendl;
+
+ {
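+ // throttle deletion: when osd_delete_sleep is set, defer this round and re-queue
+ // the delete after the configured interval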
+ float osd_delete_sleep = osd->osd->get_osd_delete_sleep();
+ if (osd_delete_sleep > 0 && delete_needs_sleep) {
+ epoch_t e = get_osdmap()->get_epoch();
+ PGRef pgref(this);
+ auto delete_requeue_callback = new LambdaContext([this, pgref, e](int r) {
+ dout(20) << "do_delete_work() [cb] wake up at "
+ << ceph_clock_now()
+ << ", re-queuing delete" << dendl;
+ std::scoped_lock locker{*this};
+ delete_needs_sleep = false;
+ if (!pg_has_reset_since(e)) {
+ osd->queue_for_pg_delete(get_pgid(), e);
+ }
+ });
+
+ auto delete_schedule_time = ceph::real_clock::now();
+ delete_schedule_time += ceph::make_timespan(osd_delete_sleep);
+ std::lock_guard l{osd->sleep_lock};
+ osd->sleep_timer.add_event_at(delete_schedule_time,
+ delete_requeue_callback);
+ dout(20) << __func__ << " Delete scheduled at " << delete_schedule_time << dendl;
+ return std::make_pair(_next, true);
+ }
+ }
+
+ delete_needs_sleep = true;
+
+ ghobject_t next;
+
+ vector<ghobject_t> olist;
+ int max = std::min(osd->store->get_ideal_list_max(),
+ (int)cct->_conf->osd_target_transaction_size);
+
+ osd->store->collection_list(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max,
+ &olist,
+ &next);
+ dout(20) << __func__ << " " << olist << dendl;
+
+ // make sure we've removed everything
+ // by one more listing from the beginning
+ if (_next != ghobject_t() && olist.empty()) {
+ next = ghobject_t();
+ osd->store->collection_list(
+ ch,
+ next,
+ ghobject_t::get_max(),
+ max,
+ &olist,
+ &next);
+ if (!olist.empty()) {
+ for (auto& oid : olist) {
+ if (oid == pgmeta_oid) {
+ dout(20) << __func__ << " removing pgmeta object " << oid << dendl;
+ } else {
+ dout(0) << __func__ << " additional unexpected onode"
+ <<" new onode has appeared since PG removal started"
+ << oid << dendl;
+ }
+ }
+ }
+ }
+
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ int64_t num = 0;
+ for (auto& oid : olist) {
+ if (oid == pgmeta_oid) {
+ continue;
+ }
+ if (oid.is_pgmeta()) {
+ osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
+ << " during PG removal";
+ }
+ int r = snap_mapper.remove_oid(oid.hobj, &_t);
+ if (r != 0 && r != -ENOENT) {
+ ceph_abort();
+ }
+ t.remove(coll, oid);
+ ++num;
+ }
+ bool running = true;
+ if (num) {
+ dout(20) << __func__ << " deleting " << num << " objects" << dendl;
+ Context *fin = new C_DeleteMore(this, get_osdmap_epoch());
+ t.register_on_commit(fin);
+ } else {
+ if (cct->_conf->osd_inject_failure_on_pg_removal) {
+ _exit(1);
+ }
+
+ // final flush here to ensure completions drop refs. Of particular concern
+ // are the SnapMapper ContainerContexts.
+ {
+ PGRef pgref(this);
+ PGLog::clear_info_log(info.pgid, &t);
+ t.remove_collection(coll);
+ t.register_on_commit(new ContainerContext<PGRef>(pgref));
+ t.register_on_applied(new ContainerContext<PGRef>(pgref));
+ osd->store->queue_transaction(ch, std::move(t));
+ }
+ ch->flush();
+
+ if (!osd->try_finish_pg_delete(this, pool.info.get_pg_num())) {
+ dout(1) << __func__ << " raced with merge, reinstantiating" << dendl;
+ ch = osd->store->create_new_collection(coll);
+ create_pg_collection(t,
+ info.pgid,
+ info.pgid.get_split_bits(pool.info.get_pg_num()));
+ init_pg_ondisk(t, info.pgid, &pool.info);
+ recovery_state.reset_last_persisted();
+ } else {
+ recovery_state.set_delete_complete();
+
+ // cancel reserver here, since the PG is about to get deleted and the
+ // exit() methods don't run when that happens.
+ osd->local_reserver.cancel_reservation(info.pgid);
+
+ running = false;
+ }
+ }
+ return {next, running};
+}
+
+int PG::pg_stat_adjust(osd_stat_t *ns)
+{
+ osd_stat_t &new_stat = *ns;
+ if (is_primary()) {
+ return 0;
+ }
+ // Adjust the kb_used by adding pending backfill data
+ uint64_t reserved_num_bytes = get_reserved_num_bytes();
+
+ // For now we don't consider projected space gains here.
+ // I suggest we add an optional two-pass backfill that frees up
+ // space in a first pass. This could be triggered when at nearfull
+ // or near backfillfull.
+ if (reserved_num_bytes > 0) {
+ // TODO: Handle compression by adjusting by the PG's average
+ // compression percentage.
+ dout(20) << __func__ << " reserved_num_bytes " << (reserved_num_bytes >> 10) << "KiB"
+ << " Before kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
+ if (new_stat.statfs.available > reserved_num_bytes)
+ new_stat.statfs.available -= reserved_num_bytes;
+ else
+ new_stat.statfs.available = 0;
+ dout(20) << __func__ << " After kb_used " << new_stat.statfs.kb_used() << "KiB" << dendl;
+ return 1;
+ }
+ return 0;
+}
+
+void PG::dump_pgstate_history(Formatter *f)
+{
+ std::scoped_lock l{*this};
+ recovery_state.dump_history(f);
+}
+
+void PG::dump_missing(Formatter *f)
+{
+ for (auto& i : recovery_state.get_pg_log().get_missing().get_items()) {
+ f->open_object_section("object");
+ f->dump_object("oid", i.first);
+ f->dump_object("missing_info", i.second);
+ if (recovery_state.get_missing_loc().needs_recovery(i.first)) {
+ f->dump_bool(
+ "unfound",
+ recovery_state.get_missing_loc().is_unfound(i.first));
+ f->open_array_section("locations");
+ for (auto l : recovery_state.get_missing_loc().get_locations(i.first)) {
+ f->dump_object("shard", l);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void PG::get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f)
+{
+ std::lock_guard l{pg_stats_publish_lock};
+ if (pg_stats_publish_valid) {
+ f(pg_stats_publish, pg_stats_publish.get_effective_last_epoch_clean());
+ }
+}
+
+void PG::with_heartbeat_peers(std::function<void(int)> f)
+{
+ std::lock_guard l{heartbeat_peer_lock};
+ for (auto p : heartbeat_peers) {
+ f(p);
+ }
+ for (auto p : probe_targets) {
+ f(p);
+ }
+}
+
+uint64_t PG::get_min_alloc_size() const {
+ return osd->store->get_min_alloc_size();
+}
diff --git a/src/osd/PG.h b/src/osd/PG.h
new file mode 100644
index 000000000..61adae120
--- /dev/null
+++ b/src/osd/PG.h
@@ -0,0 +1,1341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PG_H
+#define CEPH_PG_H
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/container/flat_set.hpp>
+#include "include/mempool.h"
+
+// re-include our assert to clobber boost's
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "include/types.h"
+#include "include/stringify.h"
+#include "osd_types.h"
+#include "include/xlist.h"
+#include "SnapMapper.h"
+#include "Session.h"
+#include "common/Timer.h"
+
+#include "PGLog.h"
+#include "OSDMap.h"
+#include "messages/MOSDPGLog.h"
+#include "include/str_list.h"
+#include "PGBackend.h"
+#include "PGPeeringEvent.h"
+#include "PeeringState.h"
+#include "recovery_types.h"
+#include "MissingLoc.h"
+#include "scrubber_common.h"
+
+#include "mgr/OSDPerfMetricTypes.h"
+
+#include <atomic>
+#include <list>
+#include <memory>
+#include <string>
+#include <tuple>
+
+//#define DEBUG_RECOVERY_OIDS // track the set of recovering oids explicitly, to find counting bugs
+//#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
+
+class OSD;
+class OSDService;
+class OSDShard;
+class OSDShardPGSlot;
+class MOSDPGScan;
+class MOSDPGBackfill;
+class MOSDPGInfo;
+
+class PG;
+struct OpRequest;
+typedef OpRequest::Ref OpRequestRef;
+class MOSDPGLog;
+class DynamicPerfStats;
+class PgScrubber;
+
+namespace Scrub {
+ class Store;
+ class ReplicaReservations;
+ class LocalReservation;
+ class ReservedByRemotePrimary;
+}
+
+#ifdef PG_DEBUG_REFS
+#include "common/tracked_int_ptr.hpp"
+ uint64_t get_with_id(PG *pg);
+ void put_with_id(PG *pg, uint64_t id);
+ typedef TrackedIntPtr<PG> PGRef;
+#else
+ typedef boost::intrusive_ptr<PG> PGRef;
+#endif
+
+class PGRecoveryStats {
+ struct per_state_info {
+ uint64_t enter, exit; // enter/exit counts
+ uint64_t events;
+ utime_t event_time; // time spent processing events
+ utime_t total_time; // total time in state
+ utime_t min_time, max_time;
+
+ // cppcheck-suppress unreachableCode
+ per_state_info() : enter(0), exit(0), events(0) {}
+ };
+ std::map<const char *,per_state_info> info;
+ ceph::mutex lock = ceph::make_mutex("PGRecoveryStats::lock");
+
+ public:
+ PGRecoveryStats() = default;
+
+ void reset() {
+ std::lock_guard l(lock);
+ info.clear();
+ }
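+ // Plain-text dump: one tab-separated row per state, in the column order
+ // enter exit events event_time total_time min_time max_time state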
+ void dump(ostream& out) {
+ std::lock_guard l(lock);
+ for (std::map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
+ per_state_info& i = p->second;
+ out << i.enter << "\t" << i.exit << "\t"
+ << i.events << "\t" << i.event_time << "\t"
+ << i.total_time << "\t"
+ << i.min_time << "\t" << i.max_time << "\t"
+ << p->first << "\n";
+ }
+ }
+
+ void dump_formatted(ceph::Formatter *f) {
+ std::lock_guard l(lock);
+ f->open_array_section("pg_recovery_stats");
+ for (std::map<const char *,per_state_info>::iterator p = info.begin();
+ p != info.end(); ++p) {
+ per_state_info& i = p->second;
+ f->open_object_section("recovery_state");
+ f->dump_int("enter", i.enter);
+ f->dump_int("exit", i.exit);
+ f->dump_int("events", i.events);
+ f->dump_stream("event_time") << i.event_time;
+ f->dump_stream("total_time") << i.total_time;
+ f->dump_stream("min_time") << i.min_time;
+ f->dump_stream("max_time") << i.max_time;
+ std::vector<std::string> states;
+ get_str_vec(p->first, "/", states);
+ f->open_array_section("nested_states");
+ for (std::vector<std::string>::iterator st = states.begin();
+ st != states.end(); ++st) {
+ f->dump_string("state", *st);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ }
+
+ void log_enter(const char *s) {
+ std::lock_guard l(lock);
+ info[s].enter++;
+ }
+ void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
+ std::lock_guard l(lock);
+ per_state_info &i = info[s];
+ i.exit++;
+ i.total_time += dur;
+ if (dur > i.max_time)
+ i.max_time = dur;
+ if (dur < i.min_time || i.min_time == utime_t())
+ i.min_time = dur;
+ i.events += events;
+ i.event_time += event_dur;
+ }
+};
+
+/** PG - Replica Placement Group
+ *
+ */
+
+class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
+ friend struct NamedState;
+ friend class PeeringState;
+ friend class PgScrubber;
+ friend class PrimaryLogScrub;
+ friend class Scrub::ReplicaReservations;
+
+public:
+ const pg_shard_t pg_whoami;
+ const spg_t pg_id;
+
+ std::unique_ptr<ScrubPgIF> m_scrubber;
+
+ /// flags detailing scheduling/operation characteristics of the next scrub
+ requested_scrub_t m_planned_scrub;
+ /// scrubbing state for both Primary & replicas
+ bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }
+
+ /// set when the scrub request is queued, and reset once scrubbing has
+ /// fully cleaned up.
+ bool is_scrub_queued_or_active() const { return m_scrubber->is_queued_or_active(); }
+
+public:
+ // -- members --
+ const coll_t coll;
+
+ ObjectStore::CollectionHandle ch;
+
+ // -- methods --
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ CephContext *get_cct() const override {
+ return cct;
+ }
+ unsigned get_subsys() const override {
+ return ceph_subsys_osd;
+ }
+
+ const char* const get_current_state() const {
+ return recovery_state.get_current_state();
+ }
+
+ const OSDMapRef& get_osdmap() const {
+ ceph_assert(is_locked());
+ return recovery_state.get_osdmap();
+ }
+
+ epoch_t get_osdmap_epoch() const override final {
+ return recovery_state.get_osdmap()->get_epoch();
+ }
+
+ PerfCounters &get_peering_perf() override;
+ PerfCounters &get_perf_logger() override;
+ void log_state_enter(const char *state) override;
+ void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) override;
+
+ void lock(bool no_lockdep = false) const;
+ void unlock() const;
+ bool is_locked() const;
+
+ const spg_t& get_pgid() const {
+ return pg_id;
+ }
+
+ const PGPool& get_pool() const {
+ return pool;
+ }
+ uint64_t get_last_user_version() const {
+ return info.last_user_version;
+ }
+ const pg_history_t& get_history() const {
+ return info.history;
+ }
+ bool get_need_up_thru() const {
+ return recovery_state.get_need_up_thru();
+ }
+ epoch_t get_same_interval_since() const {
+ return info.history.same_interval_since;
+ }
+
+ static void set_last_scrub_stamp(
+ utime_t t, pg_history_t &history, pg_stat_t &stats) {
+ stats.last_scrub_stamp = t;
+ history.last_scrub_stamp = t;
+ }
+
+ void set_last_scrub_stamp(utime_t t) {
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ set_last_scrub_stamp(t, history, stats);
+ return true;
+ });
+ }
+
+ static void set_last_deep_scrub_stamp(
+ utime_t t, pg_history_t &history, pg_stat_t &stats) {
+ stats.last_deep_scrub_stamp = t;
+ history.last_deep_scrub_stamp = t;
+ }
+
+ void set_last_deep_scrub_stamp(utime_t t) {
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ set_last_deep_scrub_stamp(t, history, stats);
+ return true;
+ });
+ }
+
+ bool is_deleting() const {
+ return recovery_state.is_deleting();
+ }
+ bool is_deleted() const {
+ return recovery_state.is_deleted();
+ }
+ bool is_nonprimary() const {
+ return recovery_state.is_nonprimary();
+ }
+ bool is_primary() const {
+ return recovery_state.is_primary();
+ }
+ bool pg_has_reset_since(epoch_t e) {
+ ceph_assert(is_locked());
+ return recovery_state.pg_has_reset_since(e);
+ }
+
+ bool is_ec_pg() const {
+ return recovery_state.is_ec_pg();
+ }
+ int get_role() const {
+ return recovery_state.get_role();
+ }
+ const std::vector<int> get_acting() const {
+ return recovery_state.get_acting();
+ }
+ const std::set<pg_shard_t> &get_actingset() const {
+ return recovery_state.get_actingset();
+ }
+ int get_acting_primary() const {
+ return recovery_state.get_acting_primary();
+ }
+ pg_shard_t get_primary() const {
+ return recovery_state.get_primary();
+ }
+ const std::vector<int> get_up() const {
+ return recovery_state.get_up();
+ }
+ int get_up_primary() const {
+ return recovery_state.get_up_primary();
+ }
+ const PastIntervals& get_past_intervals() const {
+ return recovery_state.get_past_intervals();
+ }
+ bool is_acting_recovery_backfill(pg_shard_t osd) const {
+ return recovery_state.is_acting_recovery_backfill(osd);
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return recovery_state.get_acting_recovery_backfill();
+ }
+ bool is_acting(pg_shard_t osd) const {
+ return recovery_state.is_acting(osd);
+ }
+ bool is_up(pg_shard_t osd) const {
+ return recovery_state.is_up(osd);
+ }
+ static bool has_shard(bool ec, const std::vector<int>& v, pg_shard_t osd) {
+ return PeeringState::has_shard(ec, v, osd);
+ }
+
+ /// initialize created PG
+ void init(
+ int role,
+ const std::vector<int>& up,
+ int up_primary,
+ const std::vector<int>& acting,
+ int acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pim,
+ bool backfill,
+ ObjectStore::Transaction &t);
+
+ /// read existing pg state off disk
+ void read_state(ObjectStore *store);
+ static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
+
+ static int get_latest_struct_v() {
+ return pg_latest_struct_v;
+ }
+ static int get_compat_struct_v() {
+ return pg_compat_struct_v;
+ }
+ static int read_info(
+ ObjectStore *store, spg_t pgid, const coll_t &coll,
+ pg_info_t &info, PastIntervals &past_intervals,
+ __u8 &);
+ static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
+
+ void rm_backoff(const ceph::ref_t<Backoff>& b);
+
+ void update_snap_mapper_bits(uint32_t bits) {
+ snap_mapper.update_bits(bits);
+ }
+ void start_split_stats(const std::set<spg_t>& childpgs, std::vector<object_stat_sum_t> *v);
+ virtual void split_colls(
+ spg_t child,
+ int split_bits,
+ int seed,
+ const pg_pool_t *pool,
+ ObjectStore::Transaction &t) = 0;
+ void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
+ void merge_from(std::map<spg_t,PGRef>& sources, PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta);
+ void finish_split_stats(const object_stat_sum_t& stats,
+ ObjectStore::Transaction &t);
+
+ void scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ // a new scrub
+ forward_scrub_event(&ScrubPgIF::initiate_regular_scrub, queued, "StartScrub"sv);
+ }
+
+ /**
+ * a special version of PG::scrub(), which:
+ * - is initiated after repair, and
+ * - (no longer true:) is not required to allocate local/remote OSD
+ * scrub resources
+ */
+ void recovery_scrub(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ // a new scrub
+ forward_scrub_event(&ScrubPgIF::initiate_scrub_after_repair, queued,
+ "AfterRepairScrub"sv);
+ }
+
+ void replica_scrub(epoch_t queued,
+ Scrub::act_token_t act_token,
+ ThreadPool::TPHandle& handle);
+
+ void replica_scrub_resched(epoch_t queued,
+ Scrub::act_token_t act_token,
+ ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, act_token,
+ "SchedReplica");
+ }
+
+ void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_remotes_reserved, queued, "RemotesReserved"sv);
+ }
+
+ void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_reservation_failure, queued,
+ "ReservationFailure"sv);
+ }
+
+ void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_resched, queued, "InternalSchedScrub");
+ }
+
+ void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::active_pushes_notification, queued,
+ "ActivePushesUpd"sv);
+ }
+
+ void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::update_applied_notification, queued,
+ "UpdatesApplied"sv);
+ }
+
+ void scrub_send_unblocking(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_unblock, queued, "Unblocked"sv);
+ }
+
+ void scrub_send_digest_update(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::digest_update_notification, queued, "DigestUpdate"sv);
+ }
+
+ void scrub_send_local_map_ready(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_local_map_done, queued, "IntLocalMapDone"sv);
+ }
+
+ void scrub_send_replmaps_ready(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_replica_maps_ready, queued, "GotReplicas"sv);
+ }
+
+ void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_replica_pushes_upd, queued,
+ "ReplicaPushesUpd"sv);
+ }
+
+ void scrub_send_maps_compared(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_maps_compared, queued, "MapsCompared"sv);
+ }
+
+ void scrub_send_get_next_chunk(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_get_next_chunk, queued, "NextChunk"sv);
+ }
+
+ void scrub_send_scrub_is_finished(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_scrub_is_finished, queued, "ScrubFinished"sv);
+ }
+
+ void scrub_send_chunk_free(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_chunk_free, queued, "SelectedChunkFree"sv);
+ }
+
+ void scrub_send_chunk_busy(epoch_t queued, ThreadPool::TPHandle& handle)
+ {
+ forward_scrub_event(&ScrubPgIF::send_chunk_busy, queued, "ChunkIsBusy"sv);
+ }
+
+ void reg_next_scrub();
+
+ void queue_want_pg_temp(const std::vector<int> &wanted) override;
+ void clear_want_pg_temp() override;
+
+ void on_new_interval() override;
+
+ void on_role_change() override;
+ virtual void plpg_on_role_change() = 0;
+
+ void init_collection_pool_opts();
+ void on_pool_change() override;
+ virtual void plpg_on_pool_change() = 0;
+
+ void on_info_history_change() override;
+
+ void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override;
+
+ uint64_t get_snap_trimq_size() const override {
+ return snap_trimq.size();
+ }
+ unsigned get_target_pg_log_entries() const override;
+
+ void clear_publish_stats() override;
+ void clear_primary_state() override;
+
+ epoch_t oldest_stored_osdmap() override;
+ OstreamTemp get_clog_error() override;
+ OstreamTemp get_clog_info() override;
+ OstreamTemp get_clog_debug() override;
+
+ void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) override;
+ void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) override;
+ void update_local_background_io_priority(
+ unsigned priority) override;
+ void cancel_local_background_io_reservation() override;
+
+ void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) override;
+ void cancel_remote_recovery_reservation() override;
+
+ void schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit) override;
+
+ void on_active_exit() override;
+
+ Context *on_clean() override {
+ if (is_active()) {
+ kick_snap_trim();
+ }
+ requeue_ops(waiting_for_clean_to_primary_repair);
+ return finish_recovery();
+ }
+
+ void on_activate(interval_set<snapid_t> snaps) override;
+
+ void on_activate_committed() override;
+
+ void on_active_actmap() override;
+ void on_active_advmap(const OSDMapRef &osdmap) override;
+
+ void queue_snap_retrim(snapid_t snap);
+
+ void on_backfill_reserved() override;
+ void on_backfill_canceled() override;
+ void on_recovery_reserved() override;
+
+ bool is_forced_recovery_or_backfill() const {
+ return recovery_state.is_forced_recovery_or_backfill();
+ }
+
+ PGLog::LogEntryHandlerRef get_log_handler(
+ ObjectStore::Transaction &t) override {
+ return std::make_unique<PG::PGLogEntryHandler>(this, &t);
+ }
+
+ std::pair<ghobject_t, bool> do_delete_work(ObjectStore::Transaction &t,
+ ghobject_t _next) override;
+
+ void clear_ready_to_merge() override;
+ void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override;
+ void set_not_ready_to_merge_source(pg_t pgid) override;
+ void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override;
+ void set_ready_to_merge_source(eversion_t lu) override;
+
+ void send_pg_created(pg_t pgid) override;
+
+ ceph::signedspan get_mnow() override;
+ HeartbeatStampsRef get_hb_stamps(int peer) override;
+ void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override;
+ void queue_check_readable(epoch_t lpr, ceph::timespan delay) override;
+
+ void rebuild_missing_set_with_deletes(PGLog &pglog) override;
+
+ void queue_peering_event(PGPeeringEventRef evt);
+ void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx);
+ void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
+ void queue_flushed(epoch_t started_at);
+ void handle_advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ std::vector<int>& newup, int up_primary,
+ std::vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx);
+ void handle_activate_map(PeeringCtx &rctx);
+ void handle_initialize(PeeringCtx &rxcx);
+ void handle_query_state(ceph::Formatter *f);
+
+ /**
+ * @param ops_begun returns how many recovery ops the function started
+ * @returns true if any useful work was accomplished; false otherwise
+ */
+ virtual bool start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle,
+ uint64_t *ops_begun) = 0;
+
+ // more work after the above, but with a PeeringCtx
+ void find_unfound(epoch_t queued, PeeringCtx &rctx);
+
+ virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
+
+ void dump_pgstate_history(ceph::Formatter *f);
+ void dump_missing(ceph::Formatter *f);
+
+ void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
+ void with_heartbeat_peers(std::function<void(int)> f);
+
+ void shutdown();
+ virtual void on_shutdown() = 0;
+
+ bool get_must_scrub() const;
+ bool sched_scrub();
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const;
+ /// the version that refers to flags_.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const;
+private:
+ // auxiliaries used by sched_scrub():
+ double next_deepscrub_interval() const;
+
+ /// should we perform deep scrub?
+ bool is_time_for_deep(bool allow_deep_scrub,
+ bool allow_scrub,
+ bool has_deep_errors,
+ const requested_scrub_t& planned) const;
+
+ /**
+ * Verify the various 'next scrub' flags in m_planned_scrub against configuration
+ * and scrub-related timestamps.
+ *
+ * @returns an updated copy of m_planned_scrub (or nothing if no scrubbing)
+ */
+ std::optional<requested_scrub_t> verify_scrub_mode() const;
+
+ bool verify_periodic_scrub_mode(bool allow_deep_scrub,
+ bool try_to_auto_repair,
+ bool allow_regular_scrub,
+ bool has_deep_errors,
+ requested_scrub_t& planned) const;
+
+ using ScrubAPI = void (ScrubPgIF::*)(epoch_t epoch_queued);
+ void forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc);
+ // and for events that carry a meaningful 'activation token'
+ using ScrubSafeAPI = void (ScrubPgIF::*)(epoch_t epoch_queued,
+ Scrub::act_token_t act_token);
+ void forward_scrub_event(ScrubSafeAPI fn,
+ epoch_t epoch_queued,
+ Scrub::act_token_t act_token,
+ std::string_view desc);
+
+public:
+ virtual void do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle
+ ) = 0;
+ virtual void clear_cache() = 0;
+ virtual int get_cache_obj_count() = 0;
+
+ virtual void snap_trimmer(epoch_t epoch_queued) = 0;
+ virtual void do_command(
+ const std::string_view& prefix,
+ const cmdmap_t& cmdmap,
+ const ceph::buffer::list& idata,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish) = 0;
+
+ virtual bool agent_work(int max) = 0;
+ virtual bool agent_work(int max, int agent_flush_quota) = 0;
+ virtual void agent_stop() = 0;
+ virtual void agent_delay() = 0;
+ virtual void agent_clear() = 0;
+ virtual void agent_choose_mode_restart() = 0;
+
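+ // C_DeleteMore overrides complete() directly (defined in PG.cc), so the
+ // base-class path that would invoke finish() must never be taken; the
+ // ceph_abort() in finish() enforces that invariant.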
+ struct C_DeleteMore : public Context {
+ PGRef pg;
+ epoch_t epoch;
+ C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
+ void finish(int r) override {
+ ceph_abort();
+ }
+ void complete(int r) override;
+ };
+
+ void _delete_some(ObjectStore::Transaction *t);
+
+ virtual void set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries) {
+ }
+ virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
+ }
+
+ uint64_t get_min_alloc_size() const;
+
+ // reference counting
+#ifdef PG_DEBUG_REFS
+ uint64_t get_with_id();
+ void put_with_id(uint64_t);
+ void dump_live_ids();
+#endif
+ void get(const char* tag);
+ void put(const char* tag);
+ int get_num_ref() {
+ return ref;
+ }
+
+ // ctor
+ PG(OSDService *o, OSDMapRef curmap,
+ const PGPool &pool, spg_t p);
+ ~PG() override;
+
+ // prevent copying
+ explicit PG(const PG& rhs) = delete;
+ PG& operator=(const PG& rhs) = delete;
+
+protected:
+ // -------------
+ // protected
+ OSDService *osd;
+public:
+ OSDShard *osd_shard = nullptr;
+ OSDShardPGSlot *pg_slot = nullptr;
+protected:
+ CephContext *cct;
+
+ // locking and reference counting.
+ // I destroy myself when the reference count hits zero.
+ // lock() should be called before doing anything.
+ // get() should be called on pointer copy (to another thread, etc.).
+ // put() should be called on destruction of some previously copied pointer.
+ // unlock() when done with the current pointer (_most common_).
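+ //
+ // A minimal usage sketch of the protocol above (illustrative only; the
+ // "example" tag is arbitrary and real callers normally go through PGRef
+ // and the op queue):
+ //
+ //   pg->get("example"); // take a ref before copying the pointer
+ //   pg->lock(); // serialize access to PG state
+ //   /* ... inspect or mutate PG state ... */
+ //   pg->unlock();
+ //   pg->put("example"); // drop the ref when the copy goes away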
+ mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock");
+#ifndef CEPH_DEBUG_MUTEX
+ mutable std::thread::id locked_by;
+#endif
+ std::atomic<unsigned int> ref{0};
+
+#ifdef PG_DEBUG_REFS
+ ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock");
+ std::map<uint64_t, std::string> _live_ids;
+ std::map<std::string, uint64_t> _tag_counts;
+ uint64_t _ref_id = 0;
+
+ friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
+ friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+private:
+ friend void intrusive_ptr_add_ref(PG *pg) {
+ pg->get("intptr");
+ }
+ friend void intrusive_ptr_release(PG *pg) {
+ pg->put("intptr");
+ }
+
+
+ // =====================
+
+protected:
+ OSDriver osdriver;
+ SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
+ virtual const PGBackend* get_pgbackend() const = 0;
+
+protected:
+ void requeue_map_waiters();
+
+protected:
+
+ ZTracer::Endpoint trace_endpoint;
+
+
+protected:
+ __u8 info_struct_v = 0;
+ void upgrade(ObjectStore *store);
+
+protected:
+ ghobject_t pgmeta_oid;
+
+ // ------------------
+ interval_set<snapid_t> snap_trimq;
+ std::set<snapid_t> snap_trimq_repeat;
+
+ /* You should not use these items without taking their respective queue locks
+ * (if they have one) */
+ xlist<PG*>::item stat_queue_item;
+ bool recovery_queued;
+
+ int recovery_ops_active;
+ std::set<pg_shard_t> waiting_on_backfill;
+#ifdef DEBUG_RECOVERY_OIDS
+ multiset<hobject_t> recovering_oids;
+#endif
+
+public:
+ bool dne() { return info.dne(); }
+
+ void send_cluster_message(
+ int osd, MessageRef m, epoch_t epoch, bool share_map_update) override;
+
+protected:
+ epoch_t get_last_peering_reset() const {
+ return recovery_state.get_last_peering_reset();
+ }
+
+ /* heartbeat peers */
+ void set_probe_targets(const std::set<pg_shard_t> &probe_set) override;
+ void clear_probe_targets() override;
+
+ ceph::mutex heartbeat_peer_lock =
+ ceph::make_mutex("PG::heartbeat_peer_lock");
+ std::set<int> heartbeat_peers;
+ std::set<int> probe_targets;
+
+protected:
+ BackfillInterval backfill_info;
+ std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
+ bool backfill_reserving;
+
+ // The primary's num_bytes and local num_bytes for this pg, only valid
+ // during backfill for non-primary shards.
+ // Both of these are adjusted for EC to reflect the on-disk bytes
+ std::atomic<int64_t> primary_num_bytes = 0;
+ std::atomic<int64_t> local_num_bytes = 0;
+
+public:
+ // Space reserved for backfill is primary_num_bytes - local_num_bytes
+ // Don't care that difference itself isn't atomic
+ uint64_t get_reserved_num_bytes() {
+ int64_t primary = primary_num_bytes.load();
+ int64_t local = local_num_bytes.load();
+ if (primary > local)
+ return primary - local;
+ else
+ return 0;
+ }
+
+ bool is_remote_backfilling() {
+ return primary_num_bytes.load() > 0;
+ }
+
+ bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
+ void unreserve_recovery_space() override;
+
+ // If the num_bytes values are inconsistent and local_num_bytes would go
+ // negative, that's OK, because the value would then be ignored.
+
+ // The value of num_bytes could be negative,
+ // but we don't let local_num_bytes go negative.
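+ // The helpers below use a compare_exchange_weak loop so that concurrent
+ // callers can adjust local_num_bytes without holding the PG lock: each
+ // iteration recomputes the clamped-at-zero result from the freshly loaded
+ // value and retries if another thread updated it in the meantime.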
+ void add_local_num_bytes(int64_t num_bytes) {
+ if (num_bytes) {
+ int64_t prev_bytes = local_num_bytes.load();
+ int64_t new_bytes;
+ do {
+ new_bytes = prev_bytes + num_bytes;
+ if (new_bytes < 0)
+ new_bytes = 0;
+ } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+ }
+ }
+ void sub_local_num_bytes(int64_t num_bytes) {
+ ceph_assert(num_bytes >= 0);
+ if (num_bytes) {
+ int64_t prev_bytes = local_num_bytes.load();
+ int64_t new_bytes;
+ do {
+ new_bytes = prev_bytes - num_bytes;
+ if (new_bytes < 0)
+ new_bytes = 0;
+ } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+ }
+ }
+ // The value of num_bytes could be negative,
+ // but we don't let info.stats.stats.sum.num_bytes go negative.
+ void add_num_bytes(int64_t num_bytes) {
+ ceph_assert(ceph_mutex_is_locked_by_me(_lock));
+ if (num_bytes) {
+ recovery_state.update_stats(
+ [num_bytes](auto &history, auto &stats) {
+ stats.stats.sum.num_bytes += num_bytes;
+ if (stats.stats.sum.num_bytes < 0) {
+ stats.stats.sum.num_bytes = 0;
+ }
+ return false;
+ });
+ }
+ }
+ void sub_num_bytes(int64_t num_bytes) {
+ ceph_assert(ceph_mutex_is_locked_by_me(_lock));
+ ceph_assert(num_bytes >= 0);
+ if (num_bytes) {
+ recovery_state.update_stats(
+ [num_bytes](auto &history, auto &stats) {
+ stats.stats.sum.num_bytes -= num_bytes;
+ if (stats.stats.sum.num_bytes < 0) {
+ stats.stats.sum.num_bytes = 0;
+ }
+ return false;
+ });
+ }
+ }
+
+ // Only used in testing so not worried about needing the PG lock here
+ int64_t get_stats_num_bytes() {
+ std::lock_guard l{_lock};
+ int64_t num_bytes = info.stats.stats.sum.num_bytes;
+ if (pool.info.is_erasure()) {
+ num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+ // Round up each object by a stripe
+ num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
+ }
+ int64_t lnb = local_num_bytes.load();
+ if (lnb && lnb != num_bytes) {
+ lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
+ << lnb << " vs stats "
+ << info.stats.stats.sum.num_bytes << " / chunk "
+ << get_pgbackend()->get_ec_data_chunk_count()
+ << dendl;
+ }
+ return num_bytes;
+ }
+
+protected:
+
+ /*
+ * blocked request wait hierarchy
+ *
+ * In order to preserve request ordering we need to be careful about the
+ * order in which blocked requests get requeued. Generally speaking, we
+ * push the requests back up to the op_wq in reverse order (most recent
+ * request first) so that they come back out again in the original order.
+ * However, because there are multiple wait queues, we also need to requeue
+ * the wait lists themselves in a consistent order: the lists that are
+ * checked first during request processing are requeued first.
+ *
+ * Here are the various wait lists, in the order they are used during
+ * request processing, with notes:
+ *
+ * - waiting_for_map
+ * - may start or stop blocking at any time (depending on client epoch)
+ * - waiting_for_peered
+ * - !is_peered()
+ * - only starts blocking on interval change; never restarts
+ * - waiting_for_flush
+ * - flushes_in_progress
+ * - waiting for final flush during activate
+ * - waiting_for_active
+ * - !is_active()
+ * - only starts blocking on interval change; never restarts
+ * - waiting_for_readable
+ * - now > readable_until
+ * - unblocks when we get fresh(er) osd_pings
+ * - waiting_for_scrub
+ * - starts and stops blocking for varying intervals during scrub
+ * - waiting_for_unreadable_object
+ * - never restarts once object is readable (* except for EIO?)
+ * - waiting_for_degraded_object
+ * - never restarts once object is writeable (* except for EIO?)
+ * - waiting_for_blocked_object
+ * - starts and stops based on proxied op activity
+ * - obc rwlocks
+ * - starts and stops based on read/write activity
+ *
+ * Notes:
+ *
+ * 1. During an interval change, we requeue *everything* in the above order.
+ *
+ * 2. When an obc rwlock is released, we check for a scrub block and requeue
+ * the op there if it applies. We ignore the unreadable/degraded/blocked
+ * queues because we assume they cannot apply at that time (this is
+ * probably mostly true).
+ *
+ * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
+ * it is non-empty.
+ *
+ * These three behaviors are generally sufficient to maintain ordering, with
+ * the possible exception of cases where we make an object degraded or
+ * unreadable that was previously okay, e.g. when scrub or op processing
+ * encounter an unexpected error. FIXME.
+ */
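+ // Illustrative sketch of the reverse-order requeue described above
+ // (hypothetical names; the real work is done by the requeue_ops helpers):
+ //
+ //   // ops arrived as A, B, C and were parked on one wait list
+ //   for (auto it = waiting.rbegin(); it != waiting.rend(); ++it) {
+ //     op_wq.push_front(*it); // push C, then B, then A to the front
+ //   }
+ //   // the queue now pops A, B, C again, preserving the original order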
+
+ // ops with newer maps than ours (or blocked behind them)
+ // track these by client, since inter-request ordering doesn't otherwise
+ // matter.
+ std::unordered_map<entity_name_t,std::list<OpRequestRef>> waiting_for_map;
+
+ // ops waiting on peered
+ std::list<OpRequestRef> waiting_for_peered;
+
+ /// ops waiting on readable
+ std::list<OpRequestRef> waiting_for_readable;
+
+ // ops waiting on active (require peered as well)
+ std::list<OpRequestRef> waiting_for_active;
+ std::list<OpRequestRef> waiting_for_flush;
+ std::list<OpRequestRef> waiting_for_scrub;
+
+ std::list<OpRequestRef> waiting_for_cache_not_full;
+ std::list<OpRequestRef> waiting_for_clean_to_primary_repair;
+ std::map<hobject_t, std::list<OpRequestRef>> waiting_for_unreadable_object,
+ waiting_for_degraded_object,
+ waiting_for_blocked_object;
+
+ std::set<hobject_t> objects_blocked_on_cache_full;
+ std::map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
+ std::map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
+
+ // Callbacks should assume pg (and nothing else) is locked
+ std::map<hobject_t, std::list<Context*>> callbacks_for_degraded_object;
+
+ std::map<eversion_t,
+ std::list<
+ std::tuple<OpRequestRef, version_t, int,
+ std::vector<pg_log_op_return_item_t>>>> waiting_for_ondisk;
+
+ void requeue_object_waiters(std::map<hobject_t, std::list<OpRequestRef>>& m);
+ void requeue_op(OpRequestRef op);
+ void requeue_ops(std::list<OpRequestRef> &l);
+
+ // stats that persist lazily
+ object_stat_collection_t unstable_stats;
+
+ // publish stats
+ ceph::mutex pg_stats_publish_lock =
+ ceph::make_mutex("PG::pg_stats_publish_lock");
+ bool pg_stats_publish_valid;
+ pg_stat_t pg_stats_publish;
+
+ friend class TestOpsSocketHook;
+ void publish_stats_to_osd() override;
+
+ bool needs_recovery() const {
+ return recovery_state.needs_recovery();
+ }
+ bool needs_backfill() const {
+ return recovery_state.needs_backfill();
+ }
+
+ bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
+
+ struct PGLogEntryHandler : public PGLog::LogEntryHandler {
+ PG *pg;
+ ObjectStore::Transaction *t;
+ PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
+
+ // LogEntryHandler
+ void remove(const hobject_t &hoid) override {
+ pg->get_pgbackend()->remove(hoid, t);
+ }
+ void try_stash(const hobject_t &hoid, version_t v) override {
+ pg->get_pgbackend()->try_stash(hoid, v, t);
+ }
+ void rollback(const pg_log_entry_t &entry) override {
+ ceph_assert(entry.can_rollback());
+ pg->get_pgbackend()->rollback(entry, t);
+ }
+ void rollforward(const pg_log_entry_t &entry) override {
+ pg->get_pgbackend()->rollforward(entry, t);
+ }
+ void trim(const pg_log_entry_t &entry) override {
+ pg->get_pgbackend()->trim(entry, t);
+ }
+ };
+
+ void update_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid,
+ const std::set<snapid_t> &snaps);
+ void clear_object_snap_mapping(
+ ObjectStore::Transaction *t, const hobject_t &soid);
+ void remove_snap_mapped_object(
+ ObjectStore::Transaction& t, const hobject_t& soid);
+
+ bool have_unfound() const {
+ return recovery_state.have_unfound();
+ }
+ uint64_t get_num_unfound() const {
+ return recovery_state.get_num_unfound();
+ }
+
+ virtual void check_local() = 0;
+
+ void purge_strays();
+
+ void update_heartbeat_peers(std::set<int> peers) override;
+
+ Context *finish_sync_event;
+
+ Context *finish_recovery();
+ void _finish_recovery(Context *c);
+ struct C_PG_FinishRecovery : public Context {
+ PGRef pg;
+ explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
+ void finish(int r) override {
+ pg->_finish_recovery(this);
+ }
+ };
+ void cancel_recovery();
+ void clear_recovery_state();
+ virtual void _clear_recovery_state() = 0;
+ void start_recovery_op(const hobject_t& soid);
+ void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
+
+ virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
+
+ friend class C_OSD_RepModify_Commit;
+ friend struct C_DeleteMore;
+
+ // -- backoff --
+ ceph::mutex backoff_lock = // orders inside Backoff::lock
+ ceph::make_mutex("PG::backoff_lock");
+ std::map<hobject_t,std::set<ceph::ref_t<Backoff>>> backoffs;
+
+ void add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end);
+ void release_backoffs(const hobject_t& begin, const hobject_t& end);
+ void release_backoffs(const hobject_t& o) {
+ release_backoffs(o, o);
+ }
+ void clear_backoffs();
+
+ void add_pg_backoff(const ceph::ref_t<Session>& s) {
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ add_backoff(s, begin, end);
+ }
+public:
+ void release_pg_backoffs() {
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ release_backoffs(begin, end);
+ }
+
+ // -- scrub --
+protected:
+ bool scrub_after_recovery;
+
+ int active_pushes;
+
+ void repair_object(
+ const hobject_t &soid,
+ const std::list<std::pair<ScrubMap::object, pg_shard_t> > &ok_peers,
+ const std::set<pg_shard_t> &bad_peers);
+
+ [[nodiscard]] bool ops_blocked_by_scrub() const;
+ [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const;
+
+ void _repair_oinfo_oid(ScrubMap &map);
+ void _scan_rollback_obs(const std::vector<ghobject_t> &rollback_obs);
+ /**
+ * returns true if [begin, end) is good to scrub at this time
+ * a false return value obliges the implementer to requeue scrub when the
+ * condition preventing scrub clears
+ */
+ virtual bool _range_available_for_scrub(
+ const hobject_t &begin, const hobject_t &end) = 0;
+
+ /**
+ * Initiate the process that will create our scrub map for the Primary.
+ * (triggered by MSG_OSD_REP_SCRUB)
+ */
+ void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle);
+
+ // -- recovery state --
+
+ struct QueuePeeringEvt : Context {
+ PGRef pg;
+ PGPeeringEventRef evt;
+
+ template <class EVT>
+ QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
+ pg(pg), evt(std::make_shared<PGPeeringEvent>(epoch, epoch, evt)) {}
+
+ QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) :
+ pg(pg), evt(std::move(evt)) {}
+
+ void finish(int r) override {
+ pg->lock();
+ pg->queue_peering_event(std::move(evt));
+ pg->unlock();
+ }
+ };
+
+
+public:
+ int pg_stat_adjust(osd_stat_t *new_stat);
+protected:
+ bool delete_needs_sleep = false;
+
+protected:
+ bool state_test(uint64_t m) const { return recovery_state.state_test(m); }
+ void state_set(uint64_t m) { recovery_state.state_set(m); }
+ void state_clear(uint64_t m) { recovery_state.state_clear(m); }
+
+ bool is_complete() const {
+ return recovery_state.is_complete();
+ }
+ bool should_send_notify() const {
+ return recovery_state.should_send_notify();
+ }
+
+ bool is_active() const { return recovery_state.is_active(); }
+ bool is_activating() const { return recovery_state.is_activating(); }
+ bool is_peering() const { return recovery_state.is_peering(); }
+ bool is_down() const { return recovery_state.is_down(); }
+ bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); }
+ bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); }
+ bool is_incomplete() const { return recovery_state.is_incomplete(); }
+ bool is_clean() const { return recovery_state.is_clean(); }
+ bool is_degraded() const { return recovery_state.is_degraded(); }
+ bool is_undersized() const { return recovery_state.is_undersized(); }
+ bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only
+ bool is_remapped() const { return recovery_state.is_remapped(); }
+ bool is_peered() const { return recovery_state.is_peered(); }
+ bool is_recovering() const { return recovery_state.is_recovering(); }
+ bool is_premerge() const { return recovery_state.is_premerge(); }
+ bool is_repair() const { return recovery_state.is_repair(); }
+ bool is_laggy() const { return state_test(PG_STATE_LAGGY); }
+ bool is_wait() const { return state_test(PG_STATE_WAIT); }
+
+ bool is_empty() const { return recovery_state.is_empty(); }
+
+ // pg on-disk state
+ void do_pending_flush();
+
+public:
+ void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t) override;
+
+ void write_if_dirty(PeeringCtx &rctx) {
+ write_if_dirty(rctx.transaction);
+ }
+protected:
+ void write_if_dirty(ObjectStore::Transaction& t) {
+ recovery_state.write_if_dirty(t);
+ }
+
+ PGLog::IndexedLog projected_log;
+ bool check_in_progress_op(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const;
+ eversion_t projected_last_update;
+ eversion_t get_next_version() const {
+ eversion_t at_version(
+ get_osdmap_epoch(),
+ projected_last_update.version+1);
+ ceph_assert(at_version > info.last_update);
+ ceph_assert(at_version > recovery_state.get_pg_log().get_head());
+ ceph_assert(at_version > projected_last_update);
+ return at_version;
+ }
+
+ bool check_log_for_corruption(ObjectStore *store);
+
+ std::string get_corrupt_pg_log_name() const;
+
+ void update_snap_map(
+ const std::vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction& t);
+
+ void filter_snapc(std::vector<snapid_t> &snaps);
+
+ virtual void kick_snap_trim() = 0;
+ virtual void snap_trimmer_scrub_complete() = 0;
+
+ void queue_recovery();
+ void queue_scrub_after_repair();
+ unsigned int get_scrub_priority();
+
+ bool try_flush_or_schedule_async() override;
+ void start_flush_on_transaction(
+ ObjectStore::Transaction &t) override;
+
+ void update_history(const pg_history_t& history) {
+ recovery_state.update_history(history);
+ }
+
+ // OpRequest queueing
+ bool can_discard_op(OpRequestRef& op);
+ bool can_discard_scan(OpRequestRef op);
+ bool can_discard_backfill(OpRequestRef op);
+ bool can_discard_request(OpRequestRef& op);
+
+ template<typename T, int MSGTYPE>
+ bool can_discard_replica_op(OpRequestRef& op);
+
+ bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
+ bool old_peering_evt(PGPeeringEventRef evt) {
+ return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
+ }
+ bool have_same_or_newer_map(epoch_t e) {
+ return e <= get_osdmap_epoch();
+ }
+
+ bool op_has_sufficient_caps(OpRequestRef& op);
+
+ // abstract bits
+ friend struct FlushState;
+
+ friend ostream& operator<<(ostream& out, const PG& pg);
+
+protected:
+ PeeringState recovery_state;
+
+ // ref to recovery_state.pool
+ const PGPool &pool;
+
+ // ref to recovery_state.info
+ const pg_info_t &info;
+};
+
+#endif
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
new file mode 100644
index 000000000..ef2eb5381
--- /dev/null
+++ b/src/osd/PGBackend.cc
@@ -0,0 +1,1324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "ReplicatedBackend.h"
+#include "ScrubStore.h"
+#include "ECBackend.h"
+#include "PGBackend.h"
+#include "OSD.h"
+#include "erasure-code/ErasureCodePlugin.h"
+#include "OSDMap.h"
+#include "PGLog.h"
+#include "common/LogClient.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::ErasureCodeProfile;
+using ceph::ErasureCodeInterfaceRef;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h)
+{
+ ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " will remove " << oid << " " << v << " from "
+ << shard << dendl;
+ h->deletes[shard].push_back(make_pair(oid, v));
+ get_parent()->begin_peer_recover(shard, oid);
+ }
+ }
+}
+
+void PGBackend::send_recovery_deletes(int prio,
+ const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
+{
+ epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
+ for (const auto& p : deletes) {
+ const auto& shard = p.first;
+ const auto& objects = p.second;
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ shard.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ auto it = objects.begin();
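+ // Batch this shard's deletes into MOSDPGRecoveryDelete messages, capping
+ // each message by osd_max_push_cost and osd_max_push_objects so that a
+ // long delete list is spread over several messages.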
+ while (it != objects.end()) {
+ uint64_t cost = 0;
+ uint64_t deletes = 0;
+ spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
+ MOSDPGRecoveryDelete *msg =
+ new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
+ target_pg,
+ get_osdmap_epoch(),
+ min_epoch);
+ msg->set_priority(prio);
+
+ while (it != objects.end() &&
+ cost < cct->_conf->osd_max_push_cost &&
+ deletes < cct->_conf->osd_max_push_objects) {
+ dout(20) << __func__ << ": sending recovery delete << " << it->first
+ << " " << it->second << " to osd." << shard << dendl;
+ msg->objects.push_back(*it);
+ cost += cct->_conf->osd_push_per_object_cost;
+ ++deletes;
+ ++it;
+ }
+
+ msg->set_cost(cost);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+ }
+}
+
+bool PGBackend::handle_message(OpRequestRef op)
+{
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ handle_recovery_delete(op);
+ return true;
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ handle_recovery_delete_reply(op);
+ return true;
+
+ default:
+ break;
+ }
+
+ return _handle_message(op);
+}
+
+void PGBackend::handle_recovery_delete(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGRecoveryDelete>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
+ dout(20) << __func__ << " " << op << dendl;
+
+ op->mark_started();
+
+ C_GatherBuilder gather(cct);
+ for (const auto &p : m->objects) {
+ get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
+ }
+
+ auto reply = make_message<MOSDPGRecoveryDeleteReply>();
+ reply->from = get_parent()->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ ConnectionRef conn = m->get_connection();
+
+ gather.set_finisher(new LambdaContext(
+ [=](int r) {
+ if (r != -EAGAIN) {
+ get_parent()->send_message_osd_cluster(reply, conn.get());
+ }
+ }));
+ gather.activate();
+}
+
+void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
+ dout(20) << __func__ << " " << op << dendl;
+
+ for (const auto &p : m->objects) {
+ ObjectRecoveryInfo recovery_info;
+ hobject_t oid = p.first;
+ recovery_info.version = p.second;
+ get_parent()->on_peer_recover(m->from, oid, recovery_info);
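+ // Only declare the object globally recovered once no other
+ // acting/recovery/backfill shard is still missing it and it has also
+ // dropped out of our local missing set.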
+ bool peers_recovered = true;
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " " << oid << " still missing on at least "
+ << shard << dendl;
+ peers_recovered = false;
+ break;
+ }
+ }
+ if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
+ dout(20) << __func__ << " completed recovery, local_missing = "
+ << get_parent()->get_local_missing() << dendl;
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ get_parent()->on_global_recover(p.first, stat_diff, true);
+ }
+ }
+}
+
+void PGBackend::rollback(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+
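+ // Each visitor callback below builds its undo operations in a scratch
+ // transaction and then prepends them via temp.append(t); temp.swap(t),
+ // so the accumulated transaction replays the undo steps in the reverse
+ // of the visit order.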
+ struct RollbackVisitor : public ObjectModDesc::Visitor {
+ const hobject_t &hoid;
+ PGBackend *pg;
+ ObjectStore::Transaction t;
+ RollbackVisitor(
+ const hobject_t &hoid,
+ PGBackend *pg) : hoid(hoid), pg(pg) {}
+ void append(uint64_t old_size) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_append(hoid, old_size, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_setattrs(hoid, attrs, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void rmobject(version_t old_version) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_stash(hoid, old_version, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void try_rmobject(version_t old_version) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_try_stash(hoid, old_version, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void create() override {
+ ObjectStore::Transaction temp;
+ pg->rollback_create(hoid, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void update_snaps(const set<snapid_t> &snaps) override {
+ ObjectStore::Transaction temp;
+ pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ ObjectStore::Transaction temp;
+ pg->rollback_extents(gen, extents, hoid, &temp);
+ temp.append(t);
+ temp.swap(t);
+ }
+ };
+
+ ceph_assert(entry.mod_desc.can_rollback());
+ RollbackVisitor vis(entry.soid, this);
+ entry.mod_desc.visit(&vis);
+ t->append(vis.t);
+}
+
+struct Trimmer : public ObjectModDesc::Visitor {
+ const hobject_t &soid;
+ PGBackend *pg;
+ ObjectStore::Transaction *t;
+ Trimmer(
+ const hobject_t &soid,
+ PGBackend *pg,
+ ObjectStore::Transaction *t)
+ : soid(soid), pg(pg), t(t) {}
+ void rmobject(version_t old_version) override {
+ pg->trim_rollback_object(
+ soid,
+ old_version,
+ t);
+ }
+ // try_rmobject defaults to rmobject
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ pg->trim_rollback_object(
+ soid,
+ gen,
+ t);
+ }
+};
+
+void PGBackend::rollforward(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+ auto dpp = get_parent()->get_dpp();
+ ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
+ if (!entry.can_rollback())
+ return;
+ Trimmer trimmer(entry.soid, this, t);
+ entry.mod_desc.visit(&trimmer);
+}
+
+void PGBackend::trim(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t)
+{
+ if (!entry.can_rollback())
+ return;
+ Trimmer trimmer(entry.soid, this, t);
+ entry.mod_desc.visit(&trimmer);
+}
+
+void PGBackend::try_stash(
+ const hobject_t &hoid,
+ version_t v,
+ ObjectStore::Transaction *t)
+{
+ t->try_rename(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
+}
+
+void PGBackend::remove(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ get_parent()->pgb_clear_object_snap_mapping(hoid, t);
+}
+
+void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
+{
+ dout(10) << __func__ << dendl;
+ // clear temp
+ for (set<hobject_t>::iterator i = temp_contents.begin();
+ i != temp_contents.end();
+ ++i) {
+ dout(10) << __func__ << ": Removing oid "
+ << *i << " from the temp collection" << dendl;
+ t->remove(
+ coll,
+ ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ }
+ temp_contents.clear();
+}
+
+int PGBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{
+ ceph_assert(ls);
+ // Starts with the smallest generation to make sure the result list
+ // has the marker object (it might have multiple generations
+ // though, which would be filtered).
+ ghobject_t _next;
+ if (!begin.is_min())
+ _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
+ ls->reserve(max);
+ int r = 0;
+
+ if (min > max)
+ min = max;
+
+ while (!_next.is_max() && ls->size() < (unsigned)min) {
+ vector<ghobject_t> objects;
+ if (HAVE_FEATURE(parent->min_upacting_features(),
+ OSD_FIXED_COLLECTION_LIST)) {
+ r = store->collection_list(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max - ls->size(),
+ &objects,
+ &_next);
+ } else {
+ r = store->collection_list_legacy(
+ ch,
+ _next,
+ ghobject_t::get_max(),
+ max - ls->size(),
+ &objects,
+ &_next);
+ }
+ if (r != 0) {
+ derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
+ break;
+ }
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->is_pgmeta() || i->hobj.is_temp()) {
+ continue;
+ }
+ if (i->is_no_gen()) {
+ ls->push_back(i->hobj);
+ }
+ }
+ }
+ if (r == 0)
+ *next = _next.hobj;
+ return r;
+}
+
+int PGBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ vector<hobject_t> *ls,
+ vector<ghobject_t> *gen_obs)
+{
+ ceph_assert(ls);
+ vector<ghobject_t> objects;
+ int r;
+ if (HAVE_FEATURE(parent->min_upacting_features(),
+ OSD_FIXED_COLLECTION_LIST)) {
+ r = store->collection_list(
+ ch,
+ ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ INT_MAX,
+ &objects,
+ NULL);
+ } else {
+ r = store->collection_list_legacy(
+ ch,
+ ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ INT_MAX,
+ &objects,
+ NULL);
+ }
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ if (i->is_pgmeta() || i->hobj.is_temp()) {
+ continue;
+ }
+ if (i->is_no_gen()) {
+ ls->push_back(i->hobj);
+ } else if (gen_obs) {
+ gen_obs->push_back(*i);
+ }
+ }
+ return r;
+}
+
+int PGBackend::objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out)
+{
+ bufferptr bp;
+ int r = store->getattr(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ attr.c_str(),
+ bp);
+ if (r >= 0 && out) {
+ out->clear();
+ out->push_back(std::move(bp));
+ }
+ return r;
+}
+
+int PGBackend::objects_get_attrs(
+ const hobject_t &hoid,
+ map<string, bufferlist> *out)
+{
+ return store->getattrs(
+ ch,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ *out);
+}
+
+void PGBackend::rollback_setattrs(
+ const hobject_t &hoid,
+ map<string, std::optional<bufferlist> > &old_attrs,
+ ObjectStore::Transaction *t) {
+ map<string, bufferlist> to_set;
+ ceph_assert(!hoid.is_temp());
+ for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
+ i != old_attrs.end();
+ ++i) {
+ if (i->second) {
+ to_set[i->first] = *(i->second);
+ } else {
+ t->rmattr(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ i->first);
+ }
+ }
+ t->setattrs(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ to_set);
+}
+
+void PGBackend::rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->truncate(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ old_size);
+}
+
+void PGBackend::rollback_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ t->collection_move_rename(
+ coll,
+ ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+}
+
+void PGBackend::rollback_try_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll,
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ t->try_rename(
+ coll,
+ ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
+ ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+}
+
+void PGBackend::rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents,
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ auto shard = get_parent()->whoami_shard().shard;
+ for (auto &&extent: extents) {
+ t->clone_range(
+ coll,
+ ghobject_t(hoid, gen, shard),
+ ghobject_t(hoid, ghobject_t::NO_GEN, shard),
+ extent.first,
+ extent.second,
+ extent.first);
+ }
+ t->remove(
+ coll,
+ ghobject_t(hoid, gen, shard));
+}
+
+void PGBackend::trim_rollback_object(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t) {
+ ceph_assert(!hoid.is_temp());
+ t->remove(
+ coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
+}
+
+PGBackend *PGBackend::build_pg_backend(
+ const pg_pool_t &pool,
+ const map<string,string>& profile,
+ Listener *l,
+ coll_t coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct)
+{
+ ErasureCodeProfile ec_profile = profile;
+ switch (pool.type) {
+ case pg_pool_t::TYPE_REPLICATED: {
+ return new ReplicatedBackend(l, coll, ch, store, cct);
+ }
+ case pg_pool_t::TYPE_ERASURE: {
+ ErasureCodeInterfaceRef ec_impl;
+ stringstream ss;
+ ceph::ErasureCodePluginRegistry::instance().factory(
+ profile.find("plugin")->second,
+ cct->_conf.get_val<std::string>("erasure_code_dir"),
+ ec_profile,
+ &ec_impl,
+ &ss);
+ ceph_assert(ec_impl);
+ return new ECBackend(
+ l,
+ coll,
+ ch,
+ store,
+ cct,
+ ec_impl,
+ pool.stripe_width);
+ }
+ default:
+ ceph_abort();
+ return NULL;
+ }
+}
+
+int PGBackend::be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos)
+{
+ dout(10) << __func__ << " " << pos << dendl;
+ ceph_assert(!pos.done());
+ ceph_assert(pos.pos < pos.ls.size());
+ hobject_t& poid = pos.ls[pos.pos];
+
+ struct stat st;
+ int r = store->stat(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &st,
+ true);
+ if (r == 0) {
+ ScrubMap::object &o = map.objects[poid];
+ o.size = st.st_size;
+ ceph_assert(!o.negative);
+ store->getattrs(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ o.attrs);
+
+ if (pos.deep) {
+ r = be_deep_scrub(poid, map, pos, o);
+ }
+ dout(25) << __func__ << " " << poid << dendl;
+ } else if (r == -ENOENT) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", skipping" << dendl;
+ } else if (r == -EIO) {
+ dout(25) << __func__ << " " << poid << " got " << r
+ << ", stat_error" << dendl;
+ ScrubMap::object &o = map.objects[poid];
+ o.stat_error = true;
+ } else {
+ derr << __func__ << " got: " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ if (r == -EINPROGRESS) {
+ return -EINPROGRESS;
+ }
+ pos.next_object();
+ return 0;
+}
+
+bool PGBackend::be_compare_scrub_objects(
+ pg_shard_t auth_shard,
+ const ScrubMap::object &auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object &candidate,
+ shard_info_wrapper &shard_result,
+ inconsistent_obj_wrapper &obj_result,
+ ostream &errorstream,
+ bool has_snapset)
+{
+ enum { CLEAN, FOUND_ERROR } error = CLEAN;
+ if (auth.digest_present && candidate.digest_present) {
+ if (auth.digest != candidate.digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "data_digest 0x" << std::hex << candidate.digest
+ << " != data_digest 0x" << auth.digest << std::dec
+ << " from shard " << auth_shard;
+ obj_result.set_data_digest_mismatch();
+ }
+ }
+ if (auth.omap_digest_present && candidate.omap_digest_present) {
+ if (auth.omap_digest != candidate.omap_digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
+ << " != omap_digest 0x" << auth.omap_digest << std::dec
+ << " from shard " << auth_shard;
+ obj_result.set_omap_digest_mismatch();
+ }
+ }
+ if (parent->get_pool().is_replicated()) {
+ if (auth_oi.is_data_digest() && candidate.digest_present) {
+ if (auth_oi.data_digest != candidate.digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "data_digest 0x" << std::hex << candidate.digest
+ << " != data_digest 0x" << auth_oi.data_digest << std::dec
+ << " from auth oi " << auth_oi;
+ shard_result.set_data_digest_mismatch_info();
+ }
+ }
+ if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
+ if (auth_oi.omap_digest != candidate.omap_digest) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
+ << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
+ << " from auth oi " << auth_oi;
+ shard_result.set_omap_digest_mismatch_info();
+ }
+ }
+ }
+ if (candidate.stat_error)
+ return error == FOUND_ERROR;
+ if (!shard_result.has_info_missing()
+ && !shard_result.has_info_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_attr = candidate.attrs.find(OI_ATTR);
+ auto auth_attr = auth.attrs.find(OI_ATTR);
+
+ ceph_assert(auth_attr != auth.attrs.end());
+ ceph_assert(can_attr != candidate.attrs.end());
+
+ can_bl.push_back(can_attr->second);
+ auth_bl.push_back(auth_attr->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_object_info_inconsistency();
+ errorstream << "object info inconsistent ";
+ }
+ }
+ if (has_snapset) {
+ if (!shard_result.has_snapset_missing()
+ && !shard_result.has_snapset_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_attr = candidate.attrs.find(SS_ATTR);
+ auto auth_attr = auth.attrs.find(SS_ATTR);
+
+ ceph_assert(auth_attr != auth.attrs.end());
+ ceph_assert(can_attr != candidate.attrs.end());
+
+ can_bl.push_back(can_attr->second);
+ auth_bl.push_back(auth_attr->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_snapset_inconsistency();
+ errorstream << "snapset inconsistent ";
+ }
+ }
+ }
+ if (parent->get_pool().is_erasure()) {
+ if (!shard_result.has_hinfo_missing()
+ && !shard_result.has_hinfo_corrupted()) {
+ bufferlist can_bl, auth_bl;
+ auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
+ auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());
+
+ ceph_assert(auth_hi != auth.attrs.end());
+ ceph_assert(can_hi != candidate.attrs.end());
+
+ can_bl.push_back(can_hi->second);
+ auth_bl.push_back(auth_hi->second);
+ if (!can_bl.contents_equal(auth_bl)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ obj_result.set_hinfo_inconsistency();
+ errorstream << "hinfo inconsistent ";
+ }
+ }
+ }
+ uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
+ if (oi_size != candidate.size) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " != size " << oi_size
+ << " from auth oi " << auth_oi;
+ shard_result.set_size_mismatch_info();
+ }
+ if (auth.size != candidate.size) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " != size " << auth.size
+ << " from shard " << auth_shard;
+ obj_result.set_size_mismatch();
+ }
+  // If the replica is too large and we didn't already count it for this
+  // object, flag it as a size_too_large error.
+ if (candidate.size > cct->_conf->osd_max_object_size
+ && !obj_result.has_size_too_large()) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "size " << candidate.size
+ << " > " << cct->_conf->osd_max_object_size
+ << " is too large";
+ obj_result.set_size_too_large();
+ }
+ for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
+ i != auth.attrs.end();
+ ++i) {
+    // We check system keys separately
+ if (i->first == OI_ATTR || i->first[0] != '_')
+ continue;
+ if (!candidate.attrs.count(i->first)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr name mismatch '" << i->first << "'";
+ obj_result.set_attr_name_mismatch();
+ } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr value mismatch '" << i->first << "'";
+ obj_result.set_attr_value_mismatch();
+ }
+ }
+ for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
+ i != candidate.attrs.end();
+ ++i) {
+    // We check system keys separately
+ if (i->first == OI_ATTR || i->first[0] != '_')
+ continue;
+ if (!auth.attrs.count(i->first)) {
+ if (error != CLEAN)
+ errorstream << ", ";
+ error = FOUND_ERROR;
+ errorstream << "attr name mismatch '" << i->first << "'";
+ obj_result.set_attr_name_mismatch();
+ }
+ }
+ return error == FOUND_ERROR;
+}
+
+static int dcount(const object_info_t &oi)
+{
+ int count = 0;
+ if (oi.is_data_digest())
+ count++;
+ if (oi.is_omap_digest())
+ count++;
+ return count;
+}
+
+map<pg_shard_t, ScrubMap *>::const_iterator
+ PGBackend::be_select_auth_object(
+ const hobject_t &obj,
+ const map<pg_shard_t,ScrubMap*> &maps,
+ object_info_t *auth_oi,
+ map<pg_shard_t, shard_info_wrapper> &shard_map,
+ bool &digest_match,
+ spg_t pgid,
+ ostream &errorstream)
+{
+ eversion_t auth_version;
+
+  // Create a list of shards with the primary first so that, all other
+  // things being equal, it will be chosen as the auth copy.
+ list<pg_shard_t> shards;
+ for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
+ j != maps.end();
+ ++j) {
+ if (j->first == get_parent()->whoami_shard())
+ continue;
+ shards.push_back(j->first);
+ }
+ shards.push_front(get_parent()->whoami_shard());
+
+ map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ digest_match = true;
+ for (auto &l : shards) {
+ ostringstream shard_errorstream;
+ bool error = false;
+ map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
+ map<hobject_t, ScrubMap::object>::iterator i =
+ j->second->objects.find(obj);
+ if (i == j->second->objects.end()) {
+ continue;
+ }
+ auto& shard_info = shard_map[j->first];
+ if (j->first == get_parent()->whoami_shard())
+ shard_info.primary = true;
+ if (i->second.read_error) {
+ shard_info.set_read_error();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a read error";
+ }
+ if (i->second.ec_hash_mismatch) {
+ shard_info.set_ec_hash_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had an ec hash mismatch";
+ }
+ if (i->second.ec_size_mismatch) {
+ shard_info.set_ec_size_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had an ec size mismatch";
+ }
+
+ object_info_t oi;
+ bufferlist bl;
+ map<string, bufferptr>::iterator k;
+ SnapSet ss;
+ bufferlist ss_bl, hk_bl;
+
+ if (i->second.stat_error) {
+ shard_info.set_stat_error();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a stat error";
+      // With a stat_error there is no further checking;
+      // we don't need to also report a missing object info attr
+ goto out;
+ }
+
+ // We won't pick an auth copy if the snapset is missing or won't decode.
+ ceph_assert(!obj.is_snapdir());
+ if (obj.is_head()) {
+ k = i->second.attrs.find(SS_ATTR);
+ if (k == i->second.attrs.end()) {
+ shard_info.set_snapset_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing snapset key";
+ } else {
+ ss_bl.push_back(k->second);
+ try {
+ auto bliter = ss_bl.cbegin();
+ decode(ss, bliter);
+ } catch (...) {
+ // invalid snapset, probably corrupt
+ shard_info.set_snapset_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt snapset";
+ }
+ }
+ }
+
+ if (parent->get_pool().is_erasure()) {
+ ECUtil::HashInfo hi;
+ k = i->second.attrs.find(ECUtil::get_hinfo_key());
+ if (k == i->second.attrs.end()) {
+ shard_info.set_hinfo_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing hinfo key";
+ } else {
+ hk_bl.push_back(k->second);
+ try {
+ auto bliter = hk_bl.cbegin();
+ decode(hi, bliter);
+ } catch (...) {
+	  // invalid hinfo, probably corrupt
+ shard_info.set_hinfo_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt hinfo";
+ }
+ }
+ }
+
+ k = i->second.attrs.find(OI_ATTR);
+ if (k == i->second.attrs.end()) {
+ // no object info on object, probably corrupt
+ shard_info.set_info_missing();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a missing info key";
+ goto out;
+ }
+ bl.push_back(k->second);
+ try {
+ auto bliter = bl.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ // invalid object info, probably corrupt
+ shard_info.set_info_corrupted();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate had a corrupt info";
+ goto out;
+ }
+
+ // This is automatically corrected in PG::_repair_oinfo_oid()
+ ceph_assert(oi.soid == obj);
+
+ if (i->second.size != be_get_ondisk_size(oi.size)) {
+ shard_info.set_obj_size_info_mismatch();
+ if (error)
+ shard_errorstream << ", ";
+ error = true;
+ shard_errorstream << "candidate size " << i->second.size << " info size "
+ << oi.size << " mismatch";
+ }
+
+ // digest_match will only be true if computed digests are the same
+ if (auth_version != eversion_t()
+ && auth->second->objects[obj].digest_present
+ && i->second.digest_present
+ && auth->second->objects[obj].digest != i->second.digest) {
+ digest_match = false;
+ dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
+ << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
+ << dendl;
+ }
+
+      // Don't use this particular shard due to previous errors
+      // XXX: For now we can't use one shard for repair while taking the object info or snapset from another
+ if (shard_info.errors)
+ goto out;
+
+ if (auth_version == eversion_t() || oi.version > auth_version ||
+ (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
+ auth = j;
+ *auth_oi = oi;
+ auth_version = oi.version;
+ }
+
+out:
+ if (error)
+ errorstream << pgid.pgid << " shard " << l << " soid " << obj
+ << " : " << shard_errorstream.str() << "\n";
+ // Keep scanning other shards
+ }
+ dout(10) << __func__ << ": selecting osd " << auth->first
+ << " for obj " << obj
+ << " with oi " << *auth_oi
+ << dendl;
+ return auth;
+}
+
+void PGBackend::be_compare_scrubmaps(
+ const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ bool repair,
+ map<hobject_t, set<pg_shard_t>> &missing,
+ map<hobject_t, set<pg_shard_t>> &inconsistent,
+ map<hobject_t, list<pg_shard_t>> &authoritative,
+ map<hobject_t, pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>> &missing_digest,
+ int &shallow_errors, int &deep_errors,
+ Scrub::Store *store,
+ const spg_t& pgid,
+ const vector<int> &acting,
+ ostream &errorstream)
+{
+ utime_t now = ceph_clock_now();
+
+ // Check maps against master set and each other
+ for (set<hobject_t>::const_iterator k = master_set.begin();
+ k != master_set.end();
+ ++k) {
+ object_info_t auth_oi;
+ map<pg_shard_t, shard_info_wrapper> shard_map;
+
+ inconsistent_obj_wrapper object_error{*k};
+
+ bool digest_match;
+ map<pg_shard_t, ScrubMap *>::const_iterator auth =
+ be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
+ pgid, errorstream);
+
+ list<pg_shard_t> auth_list;
+ set<pg_shard_t> object_errors;
+ if (auth == maps.end()) {
+ object_error.set_version(0);
+ object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
+ deep_errors, get_parent()->whoami_shard());
+ if (object_error.has_deep_errors())
+ ++deep_errors;
+ else if (object_error.has_shallow_errors())
+ ++shallow_errors;
+ store->add_object_error(k->pool, object_error);
+ errorstream << pgid.pgid << " soid " << *k
+ << " : failed to pick suitable object info\n";
+ continue;
+ }
+ object_error.set_version(auth_oi.user_version);
+ ScrubMap::object& auth_object = auth->second->objects[*k];
+ set<pg_shard_t> cur_missing;
+ set<pg_shard_t> cur_inconsistent;
+ bool fix_digest = false;
+
+ for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
+ if (j == auth)
+ shard_map[auth->first].selected_oi = true;
+ if (j->second->objects.count(*k)) {
+ shard_map[j->first].set_object(j->second->objects[*k]);
+ // Compare
+ stringstream ss;
+ bool found = be_compare_scrub_objects(auth->first,
+ auth_object,
+ auth_oi,
+ j->second->objects[*k],
+ shard_map[j->first],
+ object_error,
+ ss,
+ k->has_snapset());
+
+ dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
+ << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
+ << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
+ << dendl;
+	// If all replicas match but disagree with the object_info, we can
+	// repair it via the missing_digest mechanism
+ if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
+ && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
+ && auth_object.digest_present) {
+ // Set in missing_digests
+ fix_digest = true;
+ // Clear the error
+ shard_map[j->first].clear_data_digest_mismatch_info();
+ errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
+ }
+ // Some errors might have already been set in be_select_auth_object()
+ if (shard_map[j->first].errors != 0) {
+ cur_inconsistent.insert(j->first);
+ if (shard_map[j->first].has_deep_errors())
+ ++deep_errors;
+ else
+ ++shallow_errors;
+ // Only true if be_compare_scrub_objects() found errors and put something
+ // in ss.
+ if (found)
+ errorstream << pgid << " shard " << j->first << " soid " << *k
+ << " : " << ss.str() << "\n";
+ } else if (found) {
+	  // Track a possible shard to use as authoritative, if needed.
+	  // There are errors, but they don't identify a specific bad shard.
+ object_errors.insert(j->first);
+ errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
+ } else {
+	  // XXX: The auth shard might end up here even though we don't know
+	  // that it has the "correct" data.
+ auth_list.push_back(j->first);
+ }
+ } else {
+ cur_missing.insert(j->first);
+ shard_map[j->first].set_missing();
+ shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
+ // Can't have any other errors if there is no information available
+ ++shallow_errors;
+ errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
+ }
+ object_error.add_shard(j->first, shard_map[j->first]);
+ }
+
+ if (auth_list.empty()) {
+ if (object_errors.empty()) {
+ errorstream << pgid.pgid << " soid " << *k
+ << " : failed to pick suitable auth object\n";
+ goto out;
+ }
+ // Object errors exist and nothing in auth_list
+ // Prefer the auth shard otherwise take first from list.
+ pg_shard_t shard;
+ if (object_errors.count(auth->first)) {
+ shard = auth->first;
+ } else {
+ shard = *(object_errors.begin());
+ }
+ auth_list.push_back(shard);
+ object_errors.erase(shard);
+ }
+    // At this point auth_list is populated, so we add the shards with
+    // object errors as inconsistent.
+ cur_inconsistent.insert(object_errors.begin(), object_errors.end());
+ if (!cur_missing.empty()) {
+ missing[*k] = cur_missing;
+ }
+ if (!cur_inconsistent.empty()) {
+ inconsistent[*k] = cur_inconsistent;
+ }
+
+ if (fix_digest) {
+ std::optional<uint32_t> data_digest, omap_digest;
+ ceph_assert(auth_object.digest_present);
+ data_digest = auth_object.digest;
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
+ }
+ if (!cur_inconsistent.empty() || !cur_missing.empty()) {
+ authoritative[*k] = auth_list;
+ } else if (!fix_digest && parent->get_pool().is_replicated()) {
+ enum {
+ NO = 0,
+ MAYBE = 1,
+ FORCE = 2,
+ } update = NO;
+
+ if (auth_object.digest_present && !auth_oi.is_data_digest()) {
+ dout(20) << __func__ << " missing data digest on " << *k << dendl;
+ update = MAYBE;
+ }
+ if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
+ dout(20) << __func__ << " missing omap digest on " << *k << dendl;
+ update = MAYBE;
+ }
+
+ // recorded digest != actual digest?
+ if (auth_oi.is_data_digest() && auth_object.digest_present &&
+ auth_oi.data_digest != auth_object.digest) {
+ ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
+ errorstream << pgid << " recorded data digest 0x"
+ << std::hex << auth_oi.data_digest << " != on disk 0x"
+ << auth_object.digest << std::dec << " on " << auth_oi.soid
+ << "\n";
+ if (repair)
+ update = FORCE;
+ }
+ if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
+ auth_oi.omap_digest != auth_object.omap_digest) {
+ ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
+ errorstream << pgid << " recorded omap digest 0x"
+ << std::hex << auth_oi.omap_digest << " != on disk 0x"
+ << auth_object.omap_digest << std::dec
+ << " on " << auth_oi.soid << "\n";
+ if (repair)
+ update = FORCE;
+ }
+
+ if (update != NO) {
+ utime_t age = now - auth_oi.local_mtime;
+ if (update == FORCE ||
+ age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
+ std::optional<uint32_t> data_digest, omap_digest;
+ if (auth_object.digest_present) {
+ data_digest = auth_object.digest;
+ dout(20) << __func__ << " will update data digest on " << *k << dendl;
+ }
+ if (auth_object.omap_digest_present) {
+ omap_digest = auth_object.omap_digest;
+ dout(20) << __func__ << " will update omap digest on " << *k << dendl;
+ }
+ missing_digest[*k] = make_pair(data_digest, omap_digest);
+ } else {
+ dout(20) << __func__ << " missing digest but age " << age
+ << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
+ << " on " << *k << dendl;
+ }
+ }
+ }
+out:
+ if (object_error.has_deep_errors())
+ ++deep_errors;
+ else if (object_error.has_shallow_errors())
+ ++shallow_errors;
+ if (object_error.errors || object_error.union_shards.errors) {
+ store->add_object_error(k->pool, object_error);
+ }
+ }
+}
+
+void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
+ const set<hobject_t> &master_set,
+ omap_stat_t& omap_stats,
+ ostream &warnstream) const
+{
+ bool needs_omap_check = false;
+ for (const auto& map : maps) {
+ if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
+ needs_omap_check = true;
+ break;
+ }
+ }
+
+ if (!needs_omap_check) {
+ return; // Nothing to do
+ }
+
+ // Iterate through objects and update omap stats
+ for (const auto& k : master_set) {
+ for (const auto& map : maps) {
+ if (map.first != get_parent()->primary_shard()) {
+ // Only set omap stats for the primary
+ continue;
+ }
+ auto it = map.second->objects.find(k);
+ if (it == map.second->objects.end())
+ continue;
+ ScrubMap::object& obj = it->second;
+ omap_stats.omap_bytes += obj.object_omap_bytes;
+ omap_stats.omap_keys += obj.object_omap_keys;
+ if (obj.large_omap_object_found) {
+ pg_t pg;
+ auto osdmap = get_osdmap();
+ osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg);
+ pg_t mpg = osdmap->raw_pg_to_pg(pg);
+ omap_stats.large_omap_objects++;
+ warnstream << "Large omap object found. Object: " << k
+ << " PG: " << pg << " (" << mpg << ")"
+ << " Key count: " << obj.large_omap_object_key_count
+ << " Size (bytes): " << obj.large_omap_object_value_size
+ << '\n';
+ break;
+ }
+ }
+ }
+}
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
new file mode 100644
index 000000000..12bdfc0d1
--- /dev/null
+++ b/src/osd/PGBackend.h
@@ -0,0 +1,641 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "common/WorkQueue.h"
+#include "include/Context.h"
+#include "os/ObjectStore.h"
+#include "common/LogClient.h"
+#include <string>
+#include "PGTransaction.h"
+#include "common/ostream_temp.h"
+
+namespace Scrub {
+ class Store;
+}
+struct shard_info_wrapper;
+struct inconsistent_obj_wrapper;
+
+//forward declaration
+class OSDMap;
+class PGLog;
+typedef std::shared_ptr<const OSDMap> OSDMapRef;
+
+ /**
+ * PGBackend
+ *
+ * PGBackend defines an interface for logic handling IO and
+ * replication on RADOS objects. The PGBackend implementation
+ * is responsible for:
+ *
+ * 1) Handling client operations
+ * 2) Handling object recovery
+ * 3) Handling object access
+ * 4) Handling scrub, deep-scrub, repair
+ */
+ class PGBackend {
+ public:
+ CephContext* cct;
+ protected:
+ ObjectStore *store;
+ const coll_t coll;
+ ObjectStore::CollectionHandle &ch;
+ public:
+ /**
+ * Provides interfaces for PGBackend callbacks
+ *
+ * The intention is that the parent calls into the PGBackend
+ * implementation holding a lock and that the callbacks are
+ * called under the same locks.
+ */
+ class Listener {
+ public:
+ /// Debugging
+ virtual DoutPrefixProvider *get_dpp() = 0;
+
+ /// Recovery
+
+ /**
+ * Called with the transaction recovering oid
+ */
+ virtual void on_local_recover(
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ ) = 0;
+
+ /**
+ * Called when transaction recovering oid is durable and
+ * applied on all replicas
+ */
+ virtual void on_global_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete
+ ) = 0;
+
+ /**
+ * Called when peer is recovered
+ */
+ virtual void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info
+ ) = 0;
+
+ virtual void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t oid) = 0;
+
+ virtual void apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) = 0;
+
+ /**
+     * Called when a read from a set of replicas/the primary fails
+ */
+ virtual void on_failed_pull(
+ const std::set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &v
+ ) = 0;
+
+ /**
+ * Called when a pull on soid cannot be completed due to
+ * down peers
+ */
+ virtual void cancel_pull(
+ const hobject_t &soid) = 0;
+
+ /**
+ * Called to remove an object.
+ */
+ virtual void remove_missing_object(
+ const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) = 0;
+
+ /**
+ * Bless a context
+ *
+ * Wraps a context in whatever outer layers the parent usually
+ * uses to call into the PGBackend
+ */
+ virtual Context *bless_context(Context *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+ virtual GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual void send_message(int to_osd, Message *m) = 0;
+ virtual void queue_transaction(
+ ObjectStore::Transaction&& t,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual void queue_transactions(
+ std::vector<ObjectStore::Transaction>& tls,
+ OpRequestRef op = OpRequestRef()
+ ) = 0;
+ virtual epoch_t get_interval_start_epoch() const = 0;
+ virtual epoch_t get_last_peering_reset_epoch() const = 0;
+
+ virtual const std::set<pg_shard_t> &get_acting_recovery_backfill_shards() const = 0;
+ virtual const std::set<pg_shard_t> &get_acting_shards() const = 0;
+ virtual const std::set<pg_shard_t> &get_backfill_shards() const = 0;
+
+ virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0;
+
+ virtual const std::map<hobject_t, std::set<pg_shard_t>> &get_missing_loc_shards()
+ const = 0;
+
+ virtual const pg_missing_tracker_t &get_local_missing() const = 0;
+ virtual void add_local_next_event(const pg_log_entry_t& e) = 0;
+ virtual const std::map<pg_shard_t, pg_missing_t> &get_shard_missing()
+ const = 0;
+ virtual const pg_missing_const_i * maybe_get_shard_missing(
+ pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return &get_local_missing();
+ } else {
+ std::map<pg_shard_t, pg_missing_t>::const_iterator i =
+ get_shard_missing().find(peer);
+ if (i == get_shard_missing().end()) {
+ return nullptr;
+ } else {
+ return &(i->second);
+ }
+ }
+ }
+ virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
+ auto m = maybe_get_shard_missing(peer);
+ ceph_assert(m);
+ return *m;
+ }
+
+ virtual const std::map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+ virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
+ if (peer == primary_shard()) {
+ return get_info();
+ } else {
+ std::map<pg_shard_t, pg_info_t>::const_iterator i =
+ get_shard_info().find(peer);
+ ceph_assert(i != get_shard_info().end());
+ return i->second;
+ }
+ }
+
+ virtual const PGLog &get_log() const = 0;
+ virtual bool pgb_is_primary() const = 0;
+ virtual const OSDMapRef& pgb_get_osdmap() const = 0;
+ virtual epoch_t pgb_get_osdmap_epoch() const = 0;
+ virtual const pg_info_t &get_info() const = 0;
+ virtual const pg_pool_t &get_pool() const = 0;
+
+ virtual ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ const std::map<std::string, ceph::buffer::list> &attrs) = 0;
+
+ virtual bool try_lock_for_read(
+ const hobject_t &hoid,
+ ObcLockManager &manager) = 0;
+
+ virtual void release_locks(ObcLockManager &manager) = 0;
+
+ virtual void op_applied(
+ const eversion_t &applied_version) = 0;
+
+ virtual bool should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) = 0;
+
+ virtual bool pg_is_undersized() const = 0;
+ virtual bool pg_is_repair() const = 0;
+
+ virtual void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const std::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &t,
+ bool async = false) = 0;
+
+ virtual void pgb_set_object_snap_mapping(
+ const hobject_t &soid,
+ const std::set<snapid_t> &snaps,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void pgb_clear_object_snap_mapping(
+ const hobject_t &soid,
+ ObjectStore::Transaction *t) = 0;
+
+ virtual void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) = 0;
+
+ virtual void update_last_complete_ondisk(
+ eversion_t lcod) = 0;
+
+ virtual void update_stats(
+ const pg_stat_t &stat) = 0;
+
+ virtual void schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+ virtual pg_shard_t whoami_shard() const = 0;
+ int whoami() const {
+ return whoami_shard().osd;
+ }
+ spg_t whoami_spg_t() const {
+ return get_info().pgid;
+ }
+
+ virtual spg_t primary_spg_t() const = 0;
+ virtual pg_shard_t primary_shard() const = 0;
+ virtual uint64_t min_peer_features() const = 0;
+ virtual uint64_t min_upacting_features() const = 0;
+ virtual hobject_t get_temp_recovery_object(const hobject_t& target,
+ eversion_t version) = 0;
+
+ virtual void send_message_osd_cluster(
+ int peer, Message *m, epoch_t from_epoch) = 0;
+ virtual void send_message_osd_cluster(
+ std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) = 0;
+ virtual void send_message_osd_cluster(
+ MessageRef, Connection *con) = 0;
+ virtual void send_message_osd_cluster(
+ Message *m, const ConnectionRef& con) = 0;
+ virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
+ virtual entity_name_t get_cluster_msgr_name() = 0;
+
+ virtual PerfCounters *get_logger() = 0;
+
+ virtual ceph_tid_t get_tid() = 0;
+
+ virtual OstreamTemp clog_error() = 0;
+ virtual OstreamTemp clog_warn() = 0;
+
+ virtual bool check_failsafe_full() = 0;
+
+ virtual bool pg_is_repair() = 0;
+ virtual void inc_osd_stat_repaired() = 0;
+ virtual bool pg_is_remote_backfilling() = 0;
+ virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
+ virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
+ virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
+ virtual ~Listener() {}
+ };
+ Listener *parent;
+ Listener *get_parent() const { return parent; }
+ PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll,
+ ObjectStore::CollectionHandle &ch) :
+ cct(cct),
+ store(store),
+ coll(coll),
+ ch(ch),
+ parent(l) {}
+ bool is_primary() const { return get_parent()->pgb_is_primary(); }
+ const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+ epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
+ const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+ std::ostream& gen_prefix(std::ostream& out) const {
+ return parent->gen_dbg_prefix(out);
+ }
+
+ /**
+ * RecoveryHandle
+ *
+   * We may want to recover multiple objects in the same set of
+ * messages. RecoveryHandle is an interface for the opaque
+ * object used by the implementation to store the details of
+ * the pending recovery operations.
+ */
+ struct RecoveryHandle {
+ bool cache_dont_need;
+ std::map<pg_shard_t, std::vector<std::pair<hobject_t, eversion_t> > > deletes;
+
+ RecoveryHandle(): cache_dont_need(false) {}
+ virtual ~RecoveryHandle() {}
+ };
+
+ /// Get a fresh recovery operation
+ virtual RecoveryHandle *open_recovery_op() = 0;
+
+ /// run_recovery_op: finish the operation represented by h
+ virtual void run_recovery_op(
+ RecoveryHandle *h, ///< [in] op to finish
+ int priority ///< [in] msg priority
+ ) = 0;
+
+ void recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h);
+ void send_recovery_deletes(int prio,
+ const std::map<pg_shard_t, std::vector<std::pair<hobject_t, eversion_t> > > &deletes);
+
+ /**
+ * recover_object
+ *
+ * Triggers a recovery operation on the specified hobject_t
+ * onreadable must be called before onwriteable
+ *
+ * On each replica (primary included), get_parent()->on_not_missing()
+ * must be called when the transaction finalizing the recovery
+ * is queued. Similarly, get_parent()->on_readable() must be called
+ * when the transaction is applied in the backing store.
+ *
+ * get_parent()->on_not_degraded() should be called on the primary
+ * when writes can resume on the object.
+ *
+ * obc may be NULL if the primary lacks the object.
+ *
+ * head may be NULL only if the head/snapdir is missing
+ *
+   * @param missing [in] set of (info, missing) pairs for queried nodes
+ * @param overlaps [in] mapping of object to file offset overlaps
+ */
+ virtual int recover_object(
+ const hobject_t &hoid, ///< [in] object to recover
+ eversion_t v, ///< [in] version to recover
+ ObjectContextRef head, ///< [in] context of the head/snapdir object
+ ObjectContextRef obc, ///< [in] context of the object
+ RecoveryHandle *h ///< [in,out] handle to attach recovery op to
+ ) = 0;
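+
+  /*
+   * Illustrative call sequence (an editorial sketch, not part of the original
+   * interface documentation): a caller pairs open_recovery_op() with
+   * run_recovery_op(), queueing individual objects in between. Names such as
+   * `backend`, `hoid`, `v`, `head_obc`, `obc` and `prio` are placeholders:
+   *
+   *   RecoveryHandle *h = backend->open_recovery_op();
+   *   backend->recover_object(hoid, v, head_obc, obc, h); // may be repeated
+   *   backend->run_recovery_op(h, prio);                  // sends queued ops
+   */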
+
+ /**
+ * true if PGBackend can handle this message while inactive
+ *
+ * If it returns true, handle_message *must* also return true
+ */
+ virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
+
+ /// gives PGBackend a crack at an incoming message
+ bool handle_message(
+ OpRequestRef op ///< [in] message received
+ ); ///< @return true if the message was handled
+
+ /// the variant of handle_message that is overridden by child classes
+ virtual bool _handle_message(OpRequestRef op) = 0;
+
+ virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
+
+
+ /**
+ * clean up any temporary on-disk state due to a pg interval change
+ */
+ void on_change_cleanup(ObjectStore::Transaction *t);
+ /**
+ * implementation should clear itself, contexts blessed prior to on_change
+ * won't be called after on_change()
+ */
+ virtual void on_change() = 0;
+ virtual void clear_recovery_state() = 0;
+
+ virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
+ virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
+ virtual int get_ec_data_chunk_count() const { return 0; };
+ virtual int get_ec_stripe_chunk_size() const { return 0; };
+
+ virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+
+ private:
+ std::set<hobject_t> temp_contents;
+ public:
+ // Track contents of temp collection, clear on reset
+ void add_temp_obj(const hobject_t &oid) {
+ temp_contents.insert(oid);
+ }
+ void add_temp_objs(const std::set<hobject_t> &oids) {
+ temp_contents.insert(oids.begin(), oids.end());
+ }
+ void clear_temp_obj(const hobject_t &oid) {
+ temp_contents.erase(oid);
+ }
+ void clear_temp_objs(const std::set<hobject_t> &oids) {
+ for (std::set<hobject_t>::const_iterator i = oids.begin();
+ i != oids.end();
+ ++i) {
+ temp_contents.erase(*i);
+ }
+ }
+
+ virtual ~PGBackend() {}
+
+ /// execute implementation specific transaction
+ virtual void submit_transaction(
+ const hobject_t &hoid, ///< [in] object
+ const object_stat_sum_t &delta_stats,///< [in] stat change
+ const eversion_t &at_version, ///< [in] version
+ PGTransactionUPtr &&t, ///< [in] trans to execute (move)
+ const eversion_t &trim_to, ///< [in] trim log to here
+ const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on
+ /// committed version
+ std::vector<pg_log_entry_t>&& log_entries, ///< [in] log entries for t
+ /// [in] hitset history (if updated with this transaction)
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit, ///< [in] called when all commit
+ ceph_tid_t tid, ///< [in] tid
+ osd_reqid_t reqid, ///< [in] reqid
+ OpRequestRef op ///< [in] op
+ ) = 0;
+
+ /// submit callback to be called in order with pending writes
+ virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
+
+ void try_stash(
+ const hobject_t &hoid,
+ version_t v,
+ ObjectStore::Transaction *t);
+
+ void rollback(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ friend class LRBTrimmer;
+ void rollforward(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void trim(
+ const pg_log_entry_t &entry,
+ ObjectStore::Transaction *t);
+
+ void remove(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+
+ protected:
+
+ void handle_recovery_delete(OpRequestRef op);
+ void handle_recovery_delete_reply(OpRequestRef op);
+
+ /// Reapply old attributes
+ void rollback_setattrs(
+ const hobject_t &hoid,
+ std::map<std::string, std::optional<ceph::buffer::list> > &old_attrs,
+ ObjectStore::Transaction *t);
+
+ /// Truncate object to rollback append
+ virtual void rollback_append(
+ const hobject_t &hoid,
+ uint64_t old_size,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Unstash object to rollback stash
+ void rollback_try_stash(
+ const hobject_t &hoid,
+ version_t old_version,
+ ObjectStore::Transaction *t);
+
+ /// Delete object to rollback create
+ void rollback_create(
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t) {
+ remove(hoid, t);
+ }
+
+ /// Clone the extents back into place
+ void rollback_extents(
+ version_t gen,
+ const std::vector<std::pair<uint64_t, uint64_t> > &extents,
+ const hobject_t &hoid,
+ ObjectStore::Transaction *t);
+ public:
+
+ /// Trim object stashed at version
+ void trim_rollback_object(
+ const hobject_t &hoid,
+ version_t gen,
+ ObjectStore::Transaction *t);
+
+  /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ std::vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ std::vector<hobject_t> *ls,
+ std::vector<ghobject_t> *gen_obs=0);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const std::string &attr,
+ ceph::buffer::list *out);
+
+ virtual int objects_get_attrs(
+ const hobject_t &hoid,
+ std::map<std::string, ceph::buffer::list> *out);
+
+ virtual int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) = 0;
+
+ virtual int objects_readv_sync(
+ const hobject_t &hoid,
+ std::map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) {
+ return -EOPNOTSUPP;
+ }
+
+ virtual void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete, bool fast_read = false) = 0;
+
+ virtual bool auto_repair_supported() const = 0;
+ int be_scan_list(
+ ScrubMap &map,
+ ScrubMapBuilder &pos);
+ bool be_compare_scrub_objects(
+ pg_shard_t auth_shard,
+ const ScrubMap::object &auth,
+ const object_info_t& auth_oi,
+ const ScrubMap::object &candidate,
+ shard_info_wrapper& shard_error,
+ inconsistent_obj_wrapper &result,
+ std::ostream &errorstream,
+ bool has_snapset);
+ std::map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
+ const hobject_t &obj,
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ object_info_t *auth_oi,
+ std::map<pg_shard_t, shard_info_wrapper> &shard_map,
+ bool &digest_match,
+ spg_t pgid,
+ std::ostream &errorstream);
+ void be_compare_scrubmaps(
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ const std::set<hobject_t> &master_set,
+ bool repair,
+ std::map<hobject_t, std::set<pg_shard_t>> &missing,
+ std::map<hobject_t, std::set<pg_shard_t>> &inconsistent,
+ std::map<hobject_t, std::list<pg_shard_t>> &authoritative,
+ std::map<hobject_t, std::pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>> &missing_digest,
+ int &shallow_errors, int &deep_errors,
+ Scrub::Store *store,
+ const spg_t& pgid,
+ const std::vector<int> &acting,
+ std::ostream &errorstream);
+ virtual uint64_t be_get_ondisk_size(
+ uint64_t logical_size) = 0;
+ virtual int be_deep_scrub(
+ const hobject_t &oid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) = 0;
+ void be_omap_checks(
+ const std::map<pg_shard_t,ScrubMap*> &maps,
+ const std::set<hobject_t> &master_set,
+ omap_stat_t& omap_stats,
+ std::ostream &warnstream) const;
+
+ static PGBackend *build_pg_backend(
+ const pg_pool_t &pool,
+ const std::map<std::string,std::string>& profile,
+ Listener *l,
+ coll_t coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct);
+};
+
+#endif
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
new file mode 100644
index 000000000..c881dbabe
--- /dev/null
+++ b/src/osd/PGLog.cc
@@ -0,0 +1,1189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "PGLog.h"
+#include "include/unordered_map.h"
+#include "common/ceph_context.h"
+
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+static ostream& _prefix(std::ostream *_dout, const PGLog *pglog)
+{
+ return pglog->gen_prefix(*_dout);
+}
+
+//////////////////// PGLog::IndexedLog ////////////////////
+
+void PGLog::IndexedLog::split_out_child(
+ pg_t child_pgid,
+ unsigned split_bits,
+ PGLog::IndexedLog *target)
+{
+ unindex();
+ *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
+ index();
+ target->index();
+ reset_rollback_info_trimmed_to_riter();
+}
+
+void PGLog::IndexedLog::trim(
+ CephContext* cct,
+ eversion_t s,
+ set<eversion_t> *trimmed,
+ set<string>* trimmed_dups,
+ eversion_t *write_from_dups)
+{
+ lgeneric_subdout(cct, osd, 10) << "IndexedLog::trim s=" << s << dendl;
+ ceph_assert(s <= can_rollback_to);
+ if (complete_to != log.end())
+ lgeneric_subdout(cct, osd, 20) << " complete_to " << complete_to->version << dendl;
+
+ auto earliest_dup_version =
+ log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
+ ? 0u
+ : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked + 1;
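+  // Editorial worked example (hypothetical values): with the newest log entry
+  // at version 10000 and osd_pg_log_dups_tracked = 3000, entries with version
+  // >= 10000 - 3000 + 1 = 7001 are preserved as dups while being trimmed; if
+  // the newest version were below 3000, earliest_dup_version would be 0 and
+  // every trimmed entry would be kept as a dup.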
+
+ lgeneric_subdout(cct, osd, 20) << "earliest_dup_version = " << earliest_dup_version << dendl;
+ while (!log.empty()) {
+ const pg_log_entry_t &e = *log.begin();
+ if (e.version > s)
+ break;
+ lgeneric_subdout(cct, osd, 20) << "trim " << e << dendl;
+ if (trimmed)
+ trimmed->emplace(e.version);
+
+ unindex(e); // remove from index,
+
+ // add to dup list
+ if (e.version.version >= earliest_dup_version) {
+ if (write_from_dups != nullptr && *write_from_dups > e.version) {
+ lgeneric_subdout(cct, osd, 20) << "updating write_from_dups from " << *write_from_dups << " to " << e.version << dendl;
+ *write_from_dups = e.version;
+ }
+ dups.push_back(pg_log_dup_t(e));
+ index(dups.back());
+ uint32_t idx = 0;
+ for (const auto& extra : e.extra_reqids) {
+ int return_code = e.return_code;
+ if (return_code >= 0) {
+ auto it = e.extra_reqid_return_codes.find(idx);
+ if (it != e.extra_reqid_return_codes.end()) {
+ return_code = it->second;
+ // FIXME: we aren't setting op_returns for these extra_reqids
+ }
+ }
+ ++idx;
+
+ // note: extras have the same version as outer op
+ dups.push_back(pg_log_dup_t(e.version, extra.second,
+ extra.first, return_code));
+ index(dups.back());
+ }
+ }
+
+ bool reset_complete_to = false;
+ // we are trimming past complete_to, so reset complete_to
+ if (complete_to != log.end() && e.version >= complete_to->version)
+ reset_complete_to = true;
+ if (rollback_info_trimmed_to_riter == log.rend() ||
+ e.version == rollback_info_trimmed_to_riter->version) {
+ log.pop_front();
+ rollback_info_trimmed_to_riter = log.rend();
+ } else {
+ log.pop_front();
+ }
+
+ // reset complete_to to the beginning of the log
+ if (reset_complete_to) {
+ complete_to = log.begin();
+ if (complete_to != log.end()) {
+ lgeneric_subdout(cct, osd, 20) << " moving complete_to to "
+ << log.begin()->version << dendl;
+ } else {
+ lgeneric_subdout(cct, osd, 20) << " log is now empty" << dendl;
+ }
+ }
+ }
+
+  // We can hit an inflated `dups` because of https://tracker.ceph.com/issues/53729.
+  // The idea is to trim it slowly over a prolonged period of time and to mix
+  // omap deletes with writes (if we're here, a new log entry got added), so
+  // that we neither 1) blow up the size of a single Transaction nor
+  // 2) generate and accumulate a large number of tombstones in BlueStore's
+  // RocksDB. If trimming immediately is a must, ceph-objectstore-tool is the
+  // way to go.
+ const size_t max_dups = cct->_conf->osd_pg_log_dups_tracked;
+ for (size_t max_dups_to_trim = cct->_conf->osd_pg_log_trim_max;
+ max_dups_to_trim > 0 && dups.size() > max_dups;
+ max_dups_to_trim--) {
+ const auto& e = *dups.begin();
+ lgeneric_subdout(cct, osd, 20) << "trim dup " << e << dendl;
+ if (trimmed_dups)
+ trimmed_dups->insert(e.get_key_name());
+ unindex(e);
+ dups.pop_front();
+ }
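+  // Editorial note with hypothetical numbers: if dups had grown to ~2M entries
+  // while max_dups is 3000 and osd_pg_log_trim_max is 10000, each call removes
+  // at most 10000 dups, so the backlog drains gradually over roughly 200
+  // subsequent writes instead of in a single huge transaction.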
+
+ // raise tail?
+ if (tail < s)
+ tail = s;
+ lgeneric_subdout(cct, osd, 20) << "IndexedLog::trim after trim"
+ << " dups.size()=" << dups.size()
+ << " tail=" << tail
+ << " s=" << s << dendl;
+}
+
+ostream& PGLog::IndexedLog::print(ostream& out) const
+{
+ out << *this << std::endl;
+ for (auto p = log.begin(); p != log.end(); ++p) {
+ out << *p << " " <<
+ (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
+ std::endl;
+ ceph_assert(!p->reqid_is_indexed() || logged_req(p->reqid));
+ }
+
+ for (auto p = dups.begin(); p != dups.end(); ++p) {
+ out << *p << std::endl;
+ }
+
+ return out;
+}
+
+//////////////////// PGLog ////////////////////
+
+void PGLog::reset_backfill()
+{
+ missing.clear();
+}
+
+void PGLog::clear() {
+ missing.clear();
+ log.clear();
+ log_keys_debug.clear();
+ undirty();
+}
+
+void PGLog::clear_info_log(
+ spg_t pgid,
+ ObjectStore::Transaction *t) {
+ coll_t coll(pgid);
+ t->remove(coll, pgid.make_pgmeta_oid());
+}
+
+void PGLog::trim(
+ eversion_t trim_to,
+ pg_info_t &info,
+ bool transaction_applied,
+ bool async)
+{
+ dout(10) << __func__ << " proposed trim_to = " << trim_to << dendl;
+ // trim?
+ if (trim_to > log.tail) {
+ dout(10) << __func__ << " missing = " << missing.num_missing() << dendl;
+ // Don't assert for async_recovery_targets or backfill_targets
+ // or whenever there are missing items
+ if (transaction_applied && !async && (missing.num_missing() == 0))
+ ceph_assert(trim_to <= info.last_complete);
+
+ dout(10) << "trim " << log << " to " << trim_to << dendl;
+ log.trim(cct, trim_to, &trimmed, &trimmed_dups, &write_from_dups);
+ info.log_tail = log.tail;
+ if (log.complete_to != log.log.end())
+ dout(10) << " after trim complete_to " << log.complete_to->version << dendl;
+ }
+}
+
+void PGLog::proc_replica_log(
+ pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t& omissing,
+ pg_shard_t from) const
+{
+ dout(10) << "proc_replica_log for osd." << from << ": "
+ << oinfo << " " << olog << " " << omissing << dendl;
+
+ if (olog.head < log.tail) {
+ dout(10) << __func__ << ": osd." << from << " does not overlap, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+ if (olog.head == log.head) {
+ dout(10) << __func__ << ": osd." << from << " same log head, not looking "
+ << "for divergent objects" << dendl;
+ return;
+ }
+
+ /*
+ basically what we're doing here is rewinding the remote log,
+ dropping divergent entries, until we find something that matches
+ our master log. we then reset last_update to reflect the new
+ point up to which missing is accurate.
+
+ later, in activate(), missing will get wound forward again and
+ we will send the peer enough log to arrive at the same state.
+ */
+
+ for (auto i = omissing.get_items().begin();
+ i != omissing.get_items().end();
+ ++i) {
+ dout(20) << " before missing " << i->first << " need " << i->second.need
+ << " have " << i->second.have << dendl;
+ }
+
+ auto first_non_divergent = log.log.rbegin();
+ while (1) {
+ if (first_non_divergent == log.log.rend())
+ break;
+ if (first_non_divergent->version <= olog.head) {
+ dout(20) << "merge_log point (usually last shared) is "
+ << *first_non_divergent << dendl;
+ break;
+ }
+ ++first_non_divergent;
+ }
+
+ /* Because olog.head >= log.tail, we know that both pgs must at least have
+ * the event represented by log.tail. Similarly, because log.head >= olog.tail,
+ * we know that the event represented by olog.tail must be common to both logs.
+ * Furthermore, the event represented by a log tail was necessarily trimmed,
+ * thus neither olog.tail nor log.tail can be divergent. It's
+ * possible that olog/log contain no actual events between olog.head and
+ * max(log.tail, olog.tail), however, since they might have been split out.
+ * Thus, if we cannot find an event e such that
+ * log.tail <= e.version <= log.head, the last_update must actually be
+ * max(log.tail, olog.tail).
+ */
+ eversion_t limit = std::max(olog.tail, log.tail);
+ eversion_t lu =
+ (first_non_divergent == log.log.rend() ||
+ first_non_divergent->version < limit) ?
+ limit :
+ first_non_divergent->version;
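+  // Editorial example (illustrative values): with log.tail = 4'10 and
+  // olog.tail = 4'12, limit = 4'12. If no shared entry is found, or the newest
+  // shared entry has version 4'11 (< limit), lu falls back to 4'12; otherwise
+  // lu is the version of that shared entry.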
+
+  // We merge and adjust the replica's log: roll back the rollbackable divergent
+  // entries, remove the unrollbackable divergent entries, and mark the
+  // corresponding objects as missing. The rollback boundary must use the crt of
+  // the olog that is going to be merged. The replica log's (olog) crt will not
+  // be modified, so it can be passed to _merge_divergent_entries() directly.
+ IndexedLog folog(olog);
+ auto divergent = folog.rewind_from_head(lu);
+ _merge_divergent_entries(
+ folog,
+ divergent,
+ oinfo,
+ olog.get_can_rollback_to(),
+ omissing,
+ 0,
+ this);
+
+ if (lu < oinfo.last_update) {
+ dout(10) << " peer osd." << from << " last_update now " << lu << dendl;
+ oinfo.last_update = lu;
+ }
+
+ if (omissing.have_missing()) {
+ eversion_t first_missing =
+ omissing.get_items().at(omissing.get_rmissing().begin()->second).need;
+ oinfo.last_complete = eversion_t();
+ for (auto i = olog.log.begin(); i != olog.log.end(); ++i) {
+ if (i->version < first_missing)
+ oinfo.last_complete = i->version;
+ else
+ break;
+ }
+ } else {
+ oinfo.last_complete = oinfo.last_update;
+ }
+} // proc_replica_log
+
+/**
+ * rewind divergent entries at the head of the log
+ *
+ * This rewinds entries off the head of our log that are divergent.
+ * This is used by replicas during activation.
+ *
+ * @param newhead new head to rewind to
+ */
+void PGLog::rewind_divergent_log(eversion_t newhead,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info)
+{
+ dout(10) << "rewind_divergent_log truncate divergent future " <<
+ newhead << dendl;
+
+ // We need to preserve the original crt before it gets updated in rewind_from_head().
+ // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
+ // a divergent entry or not.
+ eversion_t original_crt = log.get_can_rollback_to();
+ dout(20) << __func__ << " original_crt = " << original_crt << dendl;
+ if (info.last_complete > newhead)
+ info.last_complete = newhead;
+
+ auto divergent = log.rewind_from_head(newhead);
+ if (!divergent.empty()) {
+ mark_dirty_from(divergent.front().version);
+ }
+ for (auto &&entry: divergent) {
+ dout(10) << "rewind_divergent_log future divergent " << entry << dendl;
+ }
+ info.last_update = newhead;
+
+ _merge_divergent_entries(
+ log,
+ divergent,
+ info,
+ original_crt,
+ missing,
+ rollbacker,
+ this);
+
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info)
+{
+ dout(10) << "merge_log " << olog << " from osd." << fromosd
+ << " into " << log << dendl;
+
+ // Check preconditions
+
+  // If our log is empty, the incoming log must not have been trimmed.
+ ceph_assert(!log.null() || olog.tail == eversion_t());
+ // The logs must overlap.
+ ceph_assert(log.head >= olog.tail && olog.head >= log.tail);
+
+ for (auto i = missing.get_items().begin();
+ i != missing.get_items().end();
+ ++i) {
+ dout(20) << "pg_missing_t sobject: " << i->first << dendl;
+ }
+
+ bool changed = false;
+
+ // extend on tail?
+ // this is just filling in history. it does not affect our
+ // missing set, as that should already be consistent with our
+ // current log.
+ eversion_t orig_tail = log.tail;
+ if (olog.tail < log.tail) {
+ dout(10) << "merge_log extending tail to " << olog.tail << dendl;
+ auto from = olog.log.begin();
+ auto to = from;
+ eversion_t last;
+ for (; to != olog.log.end(); ++to) {
+ if (to->version > log.tail)
+ break;
+ log.index(*to);
+ dout(15) << *to << dendl;
+ last = to->version;
+ }
+ mark_dirty_to(last);
+
+ // splice into our log.
+ log.log.splice(log.log.begin(),
+ std::move(olog.log), from, to);
+
+ info.log_tail = log.tail = olog.tail;
+ changed = true;
+ }
+
+ if (oinfo.stats.reported_seq < info.stats.reported_seq || // make sure reported always increases
+ oinfo.stats.reported_epoch < info.stats.reported_epoch) {
+ oinfo.stats.reported_seq = info.stats.reported_seq;
+ oinfo.stats.reported_epoch = info.stats.reported_epoch;
+ }
+ if (info.last_backfill.is_max())
+ info.stats = oinfo.stats;
+ info.hit_set = oinfo.hit_set;
+
+ // do we have divergent entries to throw out?
+ if (olog.head < log.head) {
+ rewind_divergent_log(olog.head, info, rollbacker, dirty_info, dirty_big_info);
+ changed = true;
+ }
+
+ // extend on head?
+ if (olog.head > log.head) {
+ dout(10) << "merge_log extending head to " << olog.head << dendl;
+
+ // find start point in olog
+ auto to = olog.log.end();
+ auto from = olog.log.end();
+ eversion_t lower_bound = std::max(olog.tail, orig_tail);
+ while (1) {
+ if (from == olog.log.begin())
+ break;
+ --from;
+ dout(20) << " ? " << *from << dendl;
+ if (from->version <= log.head) {
+ lower_bound = std::max(lower_bound, from->version);
+ ++from;
+ break;
+ }
+ }
+ dout(20) << "merge_log cut point (usually last shared) is "
+ << lower_bound << dendl;
+ mark_dirty_from(lower_bound);
+
+ // We need to preserve the original crt before it gets updated in rewind_from_head().
+ // Later, in merge_object_divergent_entries(), we use it to check whether we can rollback
+ // a divergent entry or not.
+ eversion_t original_crt = log.get_can_rollback_to();
+ dout(20) << __func__ << " original_crt = " << original_crt << dendl;
+ auto divergent = log.rewind_from_head(lower_bound);
+ // move aside divergent items
+ for (auto &&oe: divergent) {
+ dout(10) << "merge_log divergent " << oe << dendl;
+ }
+ log.roll_forward_to(log.head, rollbacker);
+
+ mempool::osd_pglog::list<pg_log_entry_t> new_entries;
+ new_entries.splice(new_entries.end(), olog.log, from, to);
+ append_log_entries_update_missing(
+ info.last_backfill,
+ new_entries,
+ false,
+ &log,
+ missing,
+ rollbacker,
+ this);
+
+ _merge_divergent_entries(
+ log,
+ divergent,
+ info,
+ original_crt,
+ missing,
+ rollbacker,
+ this);
+
+ info.last_update = log.head = olog.head;
+
+ // We cannot rollback into the new log entries
+ log.skip_can_rollback_to_to_head();
+
+ info.last_user_version = oinfo.last_user_version;
+ info.purged_snaps = oinfo.purged_snaps;
+ // update num_missing too
+ // we might have appended some more missing objects above
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+
+ changed = true;
+ }
+
+ // now handle dups
+ if (merge_log_dups(olog)) {
+ changed = true;
+ }
+
+ dout(10) << "merge_log result " << log << " " << missing <<
+ " changed=" << changed << dendl;
+
+ if (changed) {
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+}
+
+
+// returns true if any changes were made to log.dups
+bool PGLog::merge_log_dups(const pg_log_t& olog) {
+ dout(5) << __func__
+ << " log.dups.size()=" << log.dups.size()
+ << "olog.dups.size()=" << olog.dups.size() << dendl;
+ bool changed = false;
+
+ if (!olog.dups.empty()) {
+ if (log.dups.empty()) {
+ dout(10) << "merge_log copying olog dups to log " <<
+ olog.dups.front().version << " to " <<
+ olog.dups.back().version << dendl;
+ changed = true;
+ dirty_from_dups = eversion_t();
+ dirty_to_dups = eversion_t::max();
+ // since our log.dups is empty just copy them
+ for (const auto& i : olog.dups) {
+ log.dups.push_back(i);
+ log.index(log.dups.back());
+ }
+ } else {
+ // since our log.dups is not empty try to extend on each end
+
+ if (olog.dups.back().version > log.dups.back().version) {
+ // extend the dups's tail (i.e., newer dups)
+ dout(10) << "merge_log extending dups tail to " <<
+ olog.dups.back().version << dendl;
+ changed = true;
+
+ auto log_tail_version = log.dups.back().version;
+
+ auto insert_cursor = log.dups.end();
+ eversion_t last_shared = eversion_t::max();
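+        // walk olog.dups newest to oldest; anything newer than our current
+        // back is copied in front of insert_cursor, which is then stepped
+        // back so the copies land in ascending version order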
+ for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
+ if (i->version <= log_tail_version) break;
+ log.dups.insert(insert_cursor, *i);
+ last_shared = i->version;
+
+ auto prev = insert_cursor;
+ --prev;
+ // be sure to pass reference of copy in log.dups
+ log.index(*prev);
+
+ --insert_cursor; // make sure we insert in reverse order
+ }
+ mark_dirty_from_dups(last_shared);
+ }
+
+ if (olog.dups.front().version < log.dups.front().version) {
+ // extend the dups's head (i.e., older dups)
+ dout(10) << "merge_log extending dups head to " <<
+ olog.dups.front().version << dendl;
+ changed = true;
+
+ eversion_t last;
+ auto insert_cursor = log.dups.begin();
+ for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
+ if (i->version >= insert_cursor->version) break;
+ log.dups.insert(insert_cursor, *i);
+ last = i->version;
+ auto prev = insert_cursor;
+ --prev;
+ // be sure to pass address of copy in log.dups
+ log.index(*prev);
+ }
+ mark_dirty_to_dups(last);
+ }
+ }
+ }
+
+ // remove any dup entries that overlap with pglog
+ if (!log.dups.empty() && log.dups.back().version > log.tail) {
+ dout(10) << "merge_log removed dups overlapping log entries (" <<
+ log.tail << "," << log.dups.back().version << "]" << dendl;
+ changed = true;
+
+ while (!log.dups.empty() && log.dups.back().version > log.tail) {
+ log.unindex(log.dups.back());
+ mark_dirty_from_dups(log.dups.back().version);
+ log.dups.pop_back();
+ }
+ }
+
+ dout(5) << "end of " << __func__ << " changed=" << changed
+ << " log.dups.size()=" << log.dups.size()
+ << " olog.dups.size()=" << olog.dups.size() << dendl;
+
+ return changed;
+}
+
+void PGLog::check() {
+ if (!pg_log_debug)
+ return;
+ if (log.log.size() != log_keys_debug.size()) {
+ derr << "log.log.size() != log_keys_debug.size()" << dendl;
+ derr << "actual log:" << dendl;
+ for (auto i = log.log.begin(); i != log.log.end(); ++i) {
+ derr << " " << *i << dendl;
+ }
+ derr << "log_keys_debug:" << dendl;
+ for (auto i = log_keys_debug.begin();
+ i != log_keys_debug.end();
+ ++i) {
+ derr << " " << *i << dendl;
+ }
+ }
+ ceph_assert(log.log.size() == log_keys_debug.size());
+ for (auto i = log.log.begin(); i != log.log.end(); ++i) {
+ ceph_assert(log_keys_debug.count(i->get_key_name()));
+ }
+}
+
+// non-static
+void PGLog::write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ bool require_rollback)
+{
+ if (needs_write()) {
+ dout(6) << "write_log_and_missing with: "
+ << "dirty_to: " << dirty_to
+ << ", dirty_from: " << dirty_from
+ << ", writeout_from: " << writeout_from
+ << ", trimmed: " << trimmed
+ << ", trimmed_dups: " << trimmed_dups
+ << ", clear_divergent_priors: " << clear_divergent_priors
+ << dendl;
+ _write_log_and_missing(
+ t, km, log, coll, log_oid,
+ dirty_to,
+ dirty_from,
+ writeout_from,
+ std::move(trimmed),
+ std::move(trimmed_dups),
+ missing,
+ !touched_log,
+ require_rollback,
+ clear_divergent_priors,
+ dirty_to_dups,
+ dirty_from_dups,
+ write_from_dups,
+ &may_include_deletes_in_missing_dirty,
+ (pg_log_debug ? &log_keys_debug : nullptr),
+ this);
+ undirty();
+ } else {
+ dout(10) << "log is not dirty" << dendl;
+ }
+}
+
+// static
+void PGLog::write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
+ bool require_rollback,
+ const DoutPrefixProvider *dpp
+ )
+{
+ _write_log_and_missing_wo_missing(
+ t, km, log, coll, log_oid,
+ divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
+ true, true, require_rollback,
+ eversion_t::max(), eversion_t(), eversion_t(), nullptr, dpp);
+}
+
+// static
+void PGLog::write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ const pg_missing_tracker_t &missing,
+ bool require_rollback,
+ bool *may_include_deletes_in_missing_dirty,
+ const DoutPrefixProvider *dpp)
+{
+ _write_log_and_missing(
+ t, km, log, coll, log_oid,
+ eversion_t::max(),
+ eversion_t(),
+ eversion_t(),
+ set<eversion_t>(),
+ set<string>(),
+ missing,
+ true, require_rollback, false,
+ eversion_t::max(),
+ eversion_t(),
+ eversion_t(),
+ may_include_deletes_in_missing_dirty, nullptr, dpp);
+}
+
+// static
+void PGLog::_write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist> *km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ map<eversion_t, hobject_t> &divergent_priors,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ bool dirty_divergent_priors,
+ bool touch_log,
+ bool require_rollback,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ set<string> *log_keys_debug,
+ const DoutPrefixProvider *dpp
+ )
+{
+ ldpp_dout(dpp, 10) << "_write_log_and_missing_wo_missing, clearing up to " << dirty_to
+ << " dirty_to_dups=" << dirty_to_dups
+ << " dirty_from_dups=" << dirty_from_dups
+ << " write_from_dups=" << write_from_dups << dendl;
+ if (touch_log)
+ t.touch(coll, log_oid);
+ if (dirty_to != eversion_t()) {
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ eversion_t().get_key_name(), dirty_to.get_key_name());
+ clear_up_to(log_keys_debug, dirty_to.get_key_name());
+ }
+ if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
+ // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from.get_key_name(), eversion_t::max().get_key_name());
+ clear_after(log_keys_debug, dirty_from.get_key_name());
+ }
+
+ for (auto p = log.log.begin();
+ p != log.log.end() && p->version <= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ for (auto p = log.log.rbegin();
+ p != log.log.rend() &&
+ (p->version >= dirty_from || p->version >= writeout_from) &&
+ p->version >= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ if (log_keys_debug) {
+ for (auto i = (*km).begin();
+ i != (*km).end();
+ ++i) {
+ if (i->first[0] == '_')
+ continue;
+ ceph_assert(!log_keys_debug->count(i->first));
+ log_keys_debug->insert(i->first);
+ }
+ }
+
+ // process dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name()
+ << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup="
+ << dirty_from_dup.get_key_name()
+ << " to max=" << max.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()="
+ << log.dups.size() << dendl;
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ encode(entry, bl);
+ (*km)[entry.get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+ for (auto p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ encode(*p, bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ if (dirty_divergent_priors) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors"
+ << dendl;
+ encode(divergent_priors, (*km)["divergent_priors"]);
+ }
+ if (require_rollback) {
+ encode(
+ log.get_can_rollback_to(),
+ (*km)["can_rollback_to"]);
+ encode(
+ log.get_rollback_info_trimmed_to(),
+ (*km)["rollback_info_trimmed_to"]);
+ }
+ ldpp_dout(dpp, 10) << "end of " << __func__ << dendl;
+}
+
+// static
+void PGLog::_write_log_and_missing(
+ ObjectStore::Transaction& t,
+ map<string,bufferlist>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ set<eversion_t> &&trimmed,
+ set<string> &&trimmed_dups,
+ const pg_missing_tracker_t &missing,
+ bool touch_log,
+ bool require_rollback,
+ bool clear_divergent_priors,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ bool *may_include_deletes_in_missing_dirty, // in/out param
+ set<string> *log_keys_debug,
+ const DoutPrefixProvider *dpp
+ ) {
+ ldpp_dout(dpp, 10) << __func__ << " clearing up to " << dirty_to
+ << " dirty_to_dups=" << dirty_to_dups
+ << " dirty_from_dups=" << dirty_from_dups
+ << " write_from_dups=" << write_from_dups
+ << " trimmed_dups.size()=" << trimmed_dups.size() << dendl;
+ set<string> to_remove;
+ to_remove.swap(trimmed_dups);
+ for (auto& t : trimmed) {
+ string key = t.get_key_name();
+ if (log_keys_debug) {
+ auto it = log_keys_debug->find(key);
+ ceph_assert(it != log_keys_debug->end());
+ log_keys_debug->erase(it);
+ }
+ to_remove.emplace(std::move(key));
+ }
+ trimmed.clear();
+
+ if (touch_log)
+ t.touch(coll, log_oid);
+ if (dirty_to != eversion_t()) {
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ eversion_t().get_key_name(), dirty_to.get_key_name());
+ clear_up_to(log_keys_debug, dirty_to.get_key_name());
+ }
+ if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing, clearing from "
+ << dirty_from << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from.get_key_name(), eversion_t::max().get_key_name());
+ clear_after(log_keys_debug, dirty_from.get_key_name());
+ }
+
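+  // re-encode the log as omap values: entries <= dirty_to from the front,
+  // then entries >= dirty_from (or >= writeout_from) from the back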
+ for (auto p = log.log.begin();
+ p != log.log.end() && p->version <= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ for (auto p = log.log.rbegin();
+ p != log.log.rend() &&
+ (p->version >= dirty_from || p->version >= writeout_from) &&
+ p->version >= dirty_to;
+ ++p) {
+ bufferlist bl(sizeof(*p) * 2);
+ p->encode_with_checksum(bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+
+ if (log_keys_debug) {
+ for (auto i = (*km).begin();
+ i != (*km).end();
+ ++i) {
+ if (i->first[0] == '_')
+ continue;
+ ceph_assert(!log_keys_debug->count(i->first));
+ log_keys_debug->insert(i->first);
+ }
+ }
+
+ // process dups after log_keys_debug is filled, so dups do not
+ // end up in that set
+ if (dirty_to_dups != eversion_t()) {
+ pg_log_dup_t min, dirty_to_dup;
+ dirty_to_dup.version = dirty_to_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups min=" << min.get_key_name()
+ << " to dirty_to_dup=" << dirty_to_dup.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ min.get_key_name(), dirty_to_dup.get_key_name());
+ }
+ if (dirty_to_dups != eversion_t::max() && dirty_from_dups != eversion_t::max()) {
+ pg_log_dup_t max, dirty_from_dup;
+ max.version = eversion_t::max();
+ dirty_from_dup.version = dirty_from_dups;
+ ldpp_dout(dpp, 10) << __func__ << " remove dups dirty_from_dup="
+ << dirty_from_dup.get_key_name()
+ << " to max=" << max.get_key_name() << dendl;
+ t.omap_rmkeyrange(
+ coll, log_oid,
+ dirty_from_dup.get_key_name(), max.get_key_name());
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << " going to encode log.dups.size()="
+ << log.dups.size() << dendl;
+ for (const auto& entry : log.dups) {
+ if (entry.version > dirty_to_dups)
+ break;
+ bufferlist bl;
+ encode(entry, bl);
+ (*km)[entry.get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 1st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ for (auto p = log.dups.rbegin();
+ p != log.dups.rend() &&
+ (p->version >= dirty_from_dups || p->version >= write_from_dups) &&
+ p->version >= dirty_to_dups;
+ ++p) {
+ bufferlist bl;
+ encode(*p, bl);
+ (*km)[p->get_key_name()] = std::move(bl);
+ }
+ ldpp_dout(dpp, 10) << __func__ << " 2st round encoded log.dups.size()="
+ << log.dups.size() << dendl;
+
+ if (clear_divergent_priors) {
+ ldpp_dout(dpp, 10) << "write_log_and_missing: writing divergent_priors"
+ << dendl;
+ to_remove.insert("divergent_priors");
+ }
+ // since we encode individual missing items instead of a whole
+ // missing set, we need another key to store this bit of state
+ if (*may_include_deletes_in_missing_dirty) {
+ (*km)["may_include_deletes_in_missing"] = bufferlist();
+ *may_include_deletes_in_missing_dirty = false;
+ }
+ missing.get_changed(
+ [&](const hobject_t &obj) {
+ string key = string("missing/") + obj.to_str();
+ pg_missing_item item;
+ if (!missing.is_missing(obj, &item)) {
+ to_remove.insert(key);
+ } else {
+ encode(make_pair(obj, item), (*km)[key], CEPH_FEATUREMASK_SERVER_OCTOPUS);
+ }
+ });
+ if (require_rollback) {
+ encode(
+ log.get_can_rollback_to(),
+ (*km)["can_rollback_to"]);
+ encode(
+ log.get_rollback_info_trimmed_to(),
+ (*km)["rollback_info_trimmed_to"]);
+ }
+
+ if (!to_remove.empty())
+ t.omap_rmkeys(coll, log_oid, to_remove);
+ ldpp_dout(dpp, 10) << "end of " << __func__ << dendl;
+}
+
+void PGLog::rebuild_missing_set_with_deletes(
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const pg_info_t &info)
+{
+ // save entries not generated from the current log (e.g. added due
+ // to repair, EIO handling, or divergent_priors).
+ map<hobject_t, pg_missing_item> extra_missing;
+ for (const auto& p : missing.get_items()) {
+ if (!log.logged_object(p.first)) {
+ dout(20) << __func__ << " extra missing entry: " << p.first
+ << " " << p.second << dendl;
+ extra_missing[p.first] = p.second;
+ }
+ }
+ missing.clear();
+
+ // go through the log and add items that are not present or older
+ // versions on disk, just as if we were reading the log + metadata
+ // off disk originally
+ set<hobject_t> did;
+ for (auto i = log.log.rbegin();
+ i != log.log.rend();
+ ++i) {
+ if (i->version <= info.last_complete)
+ break;
+ if (i->soid > info.last_backfill ||
+ i->is_error() ||
+ did.find(i->soid) != did.end())
+ continue;
+ did.insert(i->soid);
+
+ bufferlist bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ dout(20) << __func__ << " check for log entry: " << *i << " = " << r << dendl;
+
+ if (r >= 0) {
+ object_info_t oi(bv);
+ dout(20) << __func__ << " store version = " << oi.version << dendl;
+ if (oi.version < i->version) {
+ missing.add(i->soid, i->version, oi.version, i->is_delete());
+ }
+ } else {
+ missing.add(i->soid, i->version, eversion_t(), i->is_delete());
+ }
+ }
+
+ for (const auto& p : extra_missing) {
+ missing.add(p.first, p.second.need, p.second.have, p.second.is_delete());
+ }
+
+ set_missing_may_contain_deletes();
+}
+
+#ifdef WITH_SEASTAR
+
+namespace {
+ struct FuturizedStoreLogReader {
+ crimson::os::FuturizedStore &store;
+ const pg_info_t &info;
+ PGLog::IndexedLog &log;
+ std::set<std::string>* log_keys_debug = NULL;
+ pg_missing_tracker_t &missing;
+ const DoutPrefixProvider *dpp;
+
+ eversion_t on_disk_can_rollback_to;
+ eversion_t on_disk_rollback_info_trimmed_to;
+
+ std::map<eversion_t, hobject_t> divergent_priors;
+ bool must_rebuild = false;
+ std::list<pg_log_entry_t> entries;
+ std::list<pg_log_dup_t> dups;
+
+ std::optional<std::string> next;
+
+ void process_entry(crimson::os::FuturizedStore::OmapIteratorRef &p) {
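+      // dispatch on the omap key: keys starting with '_' are skipped; the
+      // pg-log metadata keys, "missing/" items and "dup_" entries are
+      // decoded specially; everything else is a version-keyed pg_log_entry_t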
+ if (p->key()[0] == '_')
+ return;
+      // Copy ceph::buffer::list before creating iterator
+ auto bl = p->value();
+ auto bp = bl.cbegin();
+ if (p->key() == "divergent_priors") {
+ decode(divergent_priors, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
+ << " divergent_priors" << dendl;
+ ceph_assert("crimson shouldn't have had divergent_priors" == 0);
+ } else if (p->key() == "can_rollback_to") {
+ decode(on_disk_can_rollback_to, bp);
+ } else if (p->key() == "rollback_info_trimmed_to") {
+ decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
+ } else if (p->key().substr(0, 7) == std::string("missing")) {
+ hobject_t oid;
+ pg_missing_item item;
+ decode(oid, bp);
+ decode(item, bp);
+ if (item.is_delete()) {
+ ceph_assert(missing.may_include_deletes);
+ }
+ missing.add(oid, std::move(item));
+ } else if (p->key().substr(0, 4) == std::string("dup_")) {
+ pg_log_dup_t dup;
+ decode(dup, bp);
+ if (!dups.empty()) {
+ ceph_assert(dups.back().version < dup.version);
+ }
+ dups.push_back(dup);
+ } else {
+ pg_log_entry_t e;
+ e.decode_with_checksum(bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
+ if (!entries.empty()) {
+ pg_log_entry_t last_e(entries.back());
+ ceph_assert(last_e.version.version < e.version.version);
+ ceph_assert(last_e.version.epoch <= e.version.epoch);
+ }
+ entries.push_back(e);
+ if (log_keys_debug)
+ log_keys_debug->insert(e.get_key_name());
+ }
+ }
+
+ seastar::future<> read(crimson::os::CollectionRef ch,
+ ghobject_t pgmeta_oid) {
+ // will get overridden if recorded
+ on_disk_can_rollback_to = info.last_update;
+ missing.may_include_deletes = false;
+
+ return store.get_omap_iterator(ch, pgmeta_oid).then([this](auto iter) {
+ return seastar::do_until([iter] { return !iter->valid(); },
+ [iter, this]() mutable {
+ process_entry(iter);
+ return iter->next();
+ });
+ }).then([this] {
+ log = PGLog::IndexedLog(
+ info.last_update,
+ info.log_tail,
+ on_disk_can_rollback_to,
+ on_disk_rollback_info_trimmed_to,
+ std::move(entries),
+ std::move(dups));
+ });
+ }
+ };
+}
+
+seastar::future<> PGLog::read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ IndexedLog &log,
+ std::set<std::string>* log_keys_debug,
+ pg_missing_tracker_t &missing,
+ ghobject_t pgmeta_oid,
+ const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 20) << "read_log_and_missing coll "
+ << ch->get_cid()
+ << " " << pgmeta_oid << dendl;
+ return seastar::do_with(FuturizedStoreLogReader{
+ store, info, log, log_keys_debug,
+ missing, dpp},
+ [ch, pgmeta_oid](FuturizedStoreLogReader& reader) {
+ return reader.read(ch, pgmeta_oid);
+ });
+}
+
+#endif
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
new file mode 100644
index 000000000..69ca1d20c
--- /dev/null
+++ b/src/osd/PGLog.h
@@ -0,0 +1,1697 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#pragma once
+
+// re-include our assert to clobber boost's
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "osd_types.h"
+#include "os/ObjectStore.h"
+#include <list>
+
+#ifdef WITH_SEASTAR
+#include <seastar/core/future.hh>
+#include "crimson/os/futurized_store.h"
+#include "crimson/os/cyanstore/cyan_collection.h"
+#endif
+
+/** @name PG Log
+ *
+ * The pg log serves three primary purposes:
+ *
+ * 1) improving recovery speed
+ *
+ * 2) detecting duplicate ops
+ *
+ * 3) making erasure coded updates safe
+ *
+ * For (1), the main data type is pg_log_entry_t. this is indexed in
+ * memory by the IndexedLog class - this is where most of the logic
+ * surrounding pg log is kept, even though the low level types are in
+ * src/osd/osd_types.h
+ *
+ * (2) uses a type which is a subset of the full log entry, containing
+ * just the pieces we need to identify and respond to a duplicate
+ * request.
+ *
+ * As we trim the log, we convert pg_log_entry_t to smaller
+ * pg_log_dup_t, and finally remove them once we reach a higher
+ * limit. This is controlled by a few options:
+ *
+ * osd_min_pg_log_entries osd_max_pg_log_entries
+ * osd_pg_log_dups_tracked
+ *
+ * For example, with a min of 100, max of 1000, and dups tracked of
+ * 3000, the log entries and dups stored would span the following
+ * versions, assuming the current earliest is version 1:
+ *
+ *   version: 3000 ......... 2001 | 2000 ......... 1
+ *            [ pg log entries   ] [ pg log dups   ]
+ *
+ * after osd_pg_log_trim_min subsequent writes to this PG, the log
+ * would be trimmed to look like:
+ *
+ *   version: 3100 ......... 2101 | 2100 ....... 101
+ *            [ pg log entries   ] [ pg log dups   ]
+ *
+ * (3) means tracking the previous state of an object, so that we can
+ * rollback to that prior state if necessary. It's only used for
+ * erasure coding. Consider an erasure code of 4+2, for example.
+ *
+ * This means we split the object into 4 pieces (called shards) and
+ * compute 2 parity shards. Each of these shards is stored on a
+ * separate OSD. As long as 4 shards are the same version, we can
+ * recover the remaining 2 by computation. Imagine during a write, 3
+ * of the osds go down and restart, resulting in shards 0,1,2
+ * reflecting version A and shards 3,4,5 reflecting version B, after
+ * the write.
+ *
+ * If we had no way to reconstruct version A for another shard, we
+ * would have lost the object.
+ *
+ * The actual data for rollback is stored in a look-aside object and
+ * is removed once the EC write commits on all shards. The pg log just
+ * stores the versions so we can tell how far we can rollback, and a
+ * description of the type of operation for each log entry. Beyond
+ * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor
+ * for more details on this.
+ *
+ * An important implication of this is that although the pg log length
+ * is normally bounded, under extreme conditions, with many EC I/Os
+ * outstanding, the log may grow beyond that point because we need to
+ * keep the rollback information for all outstanding EC I/O.
+ *
+ * For more on pg log bounds, see where it is calculated in
+ * PeeringState::calc_trim_to_aggressive().
+ *
+ * For more details on how peering uses the pg log, and architectural
+ * reasons for its existence, see:
+ *
+ * doc/dev/osd_internals/log_based_pg.rst
+ *
+ */
+
+constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0;
+constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1;
+constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;
+constexpr auto PGLOG_INDEXED_DUPS = 1 << 3;
+constexpr auto PGLOG_INDEXED_ALL = PGLOG_INDEXED_OBJECTS
+ | PGLOG_INDEXED_CALLER_OPS
+ | PGLOG_INDEXED_EXTRA_CALLER_OPS
+ | PGLOG_INDEXED_DUPS;
+
+struct PGLog : DoutPrefixProvider {
+ std::ostream& gen_prefix(std::ostream& out) const override {
+ return out;
+ }
+ unsigned get_subsys() const override {
+ return static_cast<unsigned>(ceph_subsys_osd);
+ }
+ CephContext *get_cct() const override {
+ return cct;
+ }
+
+ ////////////////////////////// sub classes //////////////////////////////
+ struct LogEntryHandler {
+ virtual void rollback(
+ const pg_log_entry_t &entry) = 0;
+ virtual void rollforward(
+ const pg_log_entry_t &entry) = 0;
+ virtual void trim(
+ const pg_log_entry_t &entry) = 0;
+ virtual void remove(
+ const hobject_t &hoid) = 0;
+ virtual void try_stash(
+ const hobject_t &hoid,
+ version_t v) = 0;
+ virtual ~LogEntryHandler() {}
+ };
+ using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
+
+public:
+ /**
+   * IndexedLog - adds an in-memory index of the log, by oid,
+ * plus some methods to manipulate it all.
+ */
+ struct IndexedLog : public pg_log_t {
+ mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful!
+ mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
+ mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
+ mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
+
+ // recovery pointers
+ std::list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
+ version_t last_requested = 0; // last object requested by primary
+
+ //
+ private:
+ mutable __u16 indexed_data = 0;
+ /**
+ * rollback_info_trimmed_to_riter points to the first log entry <=
+ * rollback_info_trimmed_to
+ *
+ * It's a reverse_iterator because rend() is a natural representation for
+ * tail, and rbegin() works nicely for head.
+ */
+ mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator
+ rollback_info_trimmed_to_riter;
+
+ /*
+ * return true if we need to mark the pglog as dirty
+ */
+ template <typename F>
+ bool advance_can_rollback_to(eversion_t to, F &&f) {
+ bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to;
+ if (dirty_log) {
+ if (to > can_rollback_to)
+ can_rollback_to = to;
+
+ if (to > rollback_info_trimmed_to)
+ rollback_info_trimmed_to = to;
+ }
+
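+      // walk the trimmed-to iterator toward the head, handing each entry
+      // now covered by rollback_info_trimmed_to to f; stop at the first
+      // entry past the new trim point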
+ while (rollback_info_trimmed_to_riter != log.rbegin()) {
+ --rollback_info_trimmed_to_riter;
+ if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
+ ++rollback_info_trimmed_to_riter;
+ break;
+ }
+ f(*rollback_info_trimmed_to_riter);
+ }
+
+ return dirty_log;
+ }
+
+ void reset_rollback_info_trimmed_to_riter() {
+ rollback_info_trimmed_to_riter = log.rbegin();
+ while (rollback_info_trimmed_to_riter != log.rend() &&
+ rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
+ ++rollback_info_trimmed_to_riter;
+ }
+
+ // indexes objects, caller ops and extra caller ops
+ public:
+ IndexedLog() :
+ complete_to(log.end()),
+ last_requested(0),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ { }
+
+ template <typename... Args>
+ explicit IndexedLog(Args&&... args) :
+ pg_log_t(std::forward<Args>(args)...),
+ complete_to(log.end()),
+ last_requested(0),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
+ reset_rollback_info_trimmed_to_riter();
+ index();
+ }
+
+ IndexedLog(const IndexedLog &rhs) :
+ pg_log_t(rhs),
+ complete_to(log.end()),
+ last_requested(rhs.last_requested),
+ indexed_data(0),
+ rollback_info_trimmed_to_riter(log.rbegin())
+ {
+ reset_rollback_info_trimmed_to_riter();
+ index(rhs.indexed_data);
+ }
+
+ IndexedLog &operator=(const IndexedLog &rhs) {
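+      // destroy and re-run the copy constructor in place so the indexes
+      // and the rollback riter are rebuilt against the copied log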
+ this->~IndexedLog();
+ new (this) IndexedLog(rhs);
+ return *this;
+ }
+
+ void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
+ advance_can_rollback_to(
+ to,
+ [&](pg_log_entry_t &entry) {
+ h->trim(entry);
+ });
+ }
+ bool roll_forward_to(eversion_t to, LogEntryHandler *h) {
+ return advance_can_rollback_to(
+ to,
+ [&](pg_log_entry_t &entry) {
+ h->rollforward(entry);
+ });
+ }
+
+ void skip_can_rollback_to_to_head() {
+ advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
+ }
+
+ mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
+ auto divergent = pg_log_t::rewind_from_head(newhead);
+ index();
+ reset_rollback_info_trimmed_to_riter();
+ return divergent;
+ }
+
+ template <typename T>
+ void scan_log_after(
+ const eversion_t &bound, ///< [in] scan entries > bound
+ T &&f) const {
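+      // position iter at the newest entry <= bound, then walk back toward
+      // the head so f sees entries strictly newer than bound in ascending
+      // version order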
+ auto iter = log.rbegin();
+ while (iter != log.rend() && iter->version > bound)
+ ++iter;
+
+ while (true) {
+ if (iter == log.rbegin())
+ break;
+ f(*(--iter));
+ }
+ }
+
+ /****/
+ void claim_log_and_clear_rollback_info(const pg_log_t& o) {
+ // we must have already trimmed the old entries
+ ceph_assert(rollback_info_trimmed_to == head);
+ ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
+
+ *this = IndexedLog(o);
+
+ skip_can_rollback_to_to_head();
+ index();
+ }
+
+ void split_out_child(
+ pg_t child_pgid,
+ unsigned split_bits,
+ IndexedLog *target);
+
+ void zero() {
+ // we must have already trimmed the old entries
+ ceph_assert(rollback_info_trimmed_to == head);
+ ceph_assert(rollback_info_trimmed_to_riter == log.rbegin());
+
+ unindex();
+ pg_log_t::clear();
+ rollback_info_trimmed_to_riter = log.rbegin();
+ reset_recovery_pointers();
+ }
+ void clear() {
+ skip_can_rollback_to_to_head();
+ zero();
+ }
+ void reset_recovery_pointers() {
+ complete_to = log.end();
+ last_requested = 0;
+ }
+
+ bool logged_object(const hobject_t& oid) const {
+ if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
+ index_objects();
+ }
+ return objects.count(oid);
+ }
+
+ bool logged_req(const osd_reqid_t &r) const {
+ if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
+ index_caller_ops();
+ }
+ if (!caller_ops.count(r)) {
+ if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
+ index_extra_caller_ops();
+ }
+ return extra_caller_ops.count(r);
+ }
+ return true;
+ }
+
+ bool get_request(
+ const osd_reqid_t &r,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const
+ {
+ ceph_assert(version);
+ ceph_assert(user_version);
+ ceph_assert(return_code);
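+      // look the reqid up in caller_ops first, then in the extra_reqids
+      // recorded on other entries, and finally in the dup index built from
+      // trimmed entries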
+ if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) {
+ index_caller_ops();
+ }
+ auto p = caller_ops.find(r);
+ if (p != caller_ops.end()) {
+ *version = p->second->version;
+ *user_version = p->second->user_version;
+ *return_code = p->second->return_code;
+ *op_returns = p->second->op_returns;
+ return true;
+ }
+
+ // warning: we will return *a* request for this reqid, but not
+ // necessarily the most recent.
+ if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) {
+ index_extra_caller_ops();
+ }
+ p = extra_caller_ops.find(r);
+ if (p != extra_caller_ops.end()) {
+ uint32_t idx = 0;
+ for (auto i = p->second->extra_reqids.begin();
+ i != p->second->extra_reqids.end();
+ ++idx, ++i) {
+ if (i->first == r) {
+ *version = p->second->version;
+ *user_version = i->second;
+ *return_code = p->second->return_code;
+ *op_returns = p->second->op_returns;
+ if (*return_code >= 0) {
+ auto it = p->second->extra_reqid_return_codes.find(idx);
+ if (it != p->second->extra_reqid_return_codes.end()) {
+ *return_code = it->second;
+ }
+ }
+ return true;
+ }
+ }
+ ceph_abort_msg("in extra_caller_ops but not extra_reqids");
+ }
+
+ if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
+ index_dups();
+ }
+ auto q = dup_index.find(r);
+ if (q != dup_index.end()) {
+ *version = q->second->version;
+ *user_version = q->second->user_version;
+ *return_code = q->second->return_code;
+ *op_returns = q->second->op_returns;
+ return true;
+ }
+
+ return false;
+ }
+
+ bool has_write_since(const hobject_t &oid, const eversion_t &bound) const {
+ for (auto i = log.rbegin(); i != log.rend(); ++i) {
+ if (i->version <= bound)
+ return false;
+ if (i->soid.get_head() == oid.get_head())
+ return true;
+ }
+ return false;
+ }
+
+    /// get a (bounded) list of recent reqids for the given object
+ void get_object_reqids(const hobject_t& oid, unsigned max,
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *pls,
+ mempool::osd_pglog::map<uint32_t, int> *return_codes) const {
+ // make sure object is present at least once before we do an
+ // O(n) search.
+ if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) {
+ index_objects();
+ }
+ if (objects.count(oid) == 0)
+ return;
+
+ for (auto i = log.rbegin(); i != log.rend(); ++i) {
+ if (i->soid == oid) {
+ if (i->reqid_is_indexed()) {
+ if (i->op == pg_log_entry_t::ERROR) {
+ // propagate op errors to the cache tier's PG log
+ return_codes->emplace(pls->size(), i->return_code);
+ }
+ pls->push_back(std::make_pair(i->reqid, i->user_version));
+ }
+
+ pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end());
+ if (pls->size() >= max) {
+ if (pls->size() > max) {
+ pls->resize(max);
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
+ // if to_index is 0, no need to run any of this code, especially
+ // loop below; this can happen with copy constructor for
+ // IndexedLog (and indirectly through assignment operator)
+ if (!to_index) return;
+
+ if (to_index & PGLOG_INDEXED_OBJECTS)
+ objects.clear();
+ if (to_index & PGLOG_INDEXED_CALLER_OPS)
+ caller_ops.clear();
+ if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
+ extra_caller_ops.clear();
+ if (to_index & PGLOG_INDEXED_DUPS) {
+ dup_index.clear();
+ for (auto& i : dups) {
+ dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
+ }
+ }
+
+ constexpr __u16 any_log_entry_index =
+ PGLOG_INDEXED_OBJECTS |
+ PGLOG_INDEXED_CALLER_OPS |
+ PGLOG_INDEXED_EXTRA_CALLER_OPS;
+
+ if (to_index & any_log_entry_index) {
+ for (auto i = log.begin(); i != log.end(); ++i) {
+ if (to_index & PGLOG_INDEXED_OBJECTS) {
+ if (i->object_is_indexed()) {
+ objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
+ }
+
+ if (to_index & PGLOG_INDEXED_CALLER_OPS) {
+ if (i->reqid_is_indexed()) {
+ caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+ }
+ }
+
+ if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = i->extra_reqids.begin();
+ j != i->extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(
+ std::make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+ }
+ }
+ }
+ }
+
+ indexed_data |= to_index;
+ }
+
+ void index_objects() const {
+ index(PGLOG_INDEXED_OBJECTS);
+ }
+
+ void index_caller_ops() const {
+ index(PGLOG_INDEXED_CALLER_OPS);
+ }
+
+ void index_extra_caller_ops() const {
+ index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
+ }
+
+ void index_dups() const {
+ index(PGLOG_INDEXED_DUPS);
+ }
+
+ void index(pg_log_entry_t& e) {
+ if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
+ if (objects.count(e.soid) == 0 ||
+ objects[e.soid]->version < e.version)
+ objects[e.soid] = &e;
+ }
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ // divergent merge_log indexes new before unindexing old
+ if (e.reqid_is_indexed()) {
+ caller_ops[e.reqid] = &e;
+ }
+ }
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(std::make_pair(j->first, &e));
+ }
+ }
+ }
+
+ void unindex() {
+ objects.clear();
+ caller_ops.clear();
+ extra_caller_ops.clear();
+ dup_index.clear();
+ indexed_data = 0;
+ }
+
+ void unindex(const pg_log_entry_t& e) {
+ // NOTE: this only works if we remove from the _tail_ of the log!
+ if (indexed_data & PGLOG_INDEXED_OBJECTS) {
+ auto it = objects.find(e.soid);
+ if (it != objects.end() && it->second->version == e.version)
+ objects.erase(it);
+ }
+ if (e.reqid_is_indexed()) {
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ auto it = caller_ops.find(e.reqid);
+ // divergent merge_log indexes new before unindexing old
+ if (it != caller_ops.end() && it->second == &e)
+ caller_ops.erase(it);
+ }
+ }
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ for (auto k = extra_caller_ops.find(j->first);
+ k != extra_caller_ops.end() && k->first == j->first;
+ ++k) {
+ if (k->second == &e) {
+ extra_caller_ops.erase(k);
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ void index(pg_log_dup_t& e) {
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ dup_index[e.reqid] = &e;
+ }
+ }
+
+ void unindex(const pg_log_dup_t& e) {
+ if (indexed_data & PGLOG_INDEXED_DUPS) {
+ auto i = dup_index.find(e.reqid);
+ if (i != dup_index.end()) {
+ dup_index.erase(i);
+ }
+ }
+ }
+
+ // actors
+ void add(const pg_log_entry_t& e, bool applied = true) {
+ if (!applied) {
+ ceph_assert(get_can_rollback_to() == head);
+ }
+
+ // make sure our buffers don't pin bigger buffers
+ e.mod_desc.trim_bl();
+
+ // add to log
+ log.push_back(e);
+
+ // riter previously pointed to the previous entry
+ if (rollback_info_trimmed_to_riter == log.rbegin())
+ ++rollback_info_trimmed_to_riter;
+
+ ceph_assert(e.version > head);
+ ceph_assert(head.version == 0 || e.version.version > head.version);
+ head = e.version;
+
+ // to our index
+ if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
+ objects[e.soid] = &(log.back());
+ }
+ if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
+ if (e.reqid_is_indexed()) {
+ caller_ops[e.reqid] = &(log.back());
+ }
+ }
+
+ if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+ for (auto j = e.extra_reqids.begin();
+ j != e.extra_reqids.end();
+ ++j) {
+ extra_caller_ops.insert(std::make_pair(j->first, &(log.back())));
+ }
+ }
+
+ if (!applied) {
+ skip_can_rollback_to_to_head();
+ }
+ } // add
+
+ void trim(
+ CephContext* cct,
+ eversion_t s,
+ std::set<eversion_t> *trimmed,
+ std::set<std::string>* trimmed_dups,
+ eversion_t *write_from_dups);
+
+ std::ostream& print(std::ostream& out) const;
+ }; // IndexedLog
+
+
+protected:
+ //////////////////// data members ////////////////////
+
+ pg_missing_tracker_t missing;
+ IndexedLog log;
+
+ eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to
+ eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from
+  eversion_t writeout_from; ///< must write out keys >= writeout_from
+ std::set<eversion_t> trimmed; ///< must clear keys in trimmed
+ eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups
+ eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups
+ eversion_t write_from_dups; ///< must write keys >= write_from_dups
+ std::set<std::string> trimmed_dups; ///< must clear keys in trimmed_dups
+ CephContext *cct;
+ bool pg_log_debug;
+ /// Log is clean on [dirty_to, dirty_from)
+ bool touched_log;
+ bool dirty_log;
+ bool clear_divergent_priors;
+ bool may_include_deletes_in_missing_dirty = false;
+
+ void mark_dirty_to(eversion_t to) {
+ if (to > dirty_to)
+ dirty_to = to;
+ }
+ void mark_dirty_from(eversion_t from) {
+ if (from < dirty_from)
+ dirty_from = from;
+ }
+ void mark_writeout_from(eversion_t from) {
+ if (from < writeout_from)
+ writeout_from = from;
+ }
+ void mark_dirty_to_dups(eversion_t to) {
+ if (to > dirty_to_dups)
+ dirty_to_dups = to;
+ }
+ void mark_dirty_from_dups(eversion_t from) {
+ if (from < dirty_from_dups)
+ dirty_from_dups = from;
+ }
+public:
+ bool needs_write() const {
+ return !touched_log || is_dirty();
+ }
+
+ bool is_dirty() const {
+ return dirty_log ||
+ (dirty_to != eversion_t()) ||
+ (dirty_from != eversion_t::max()) ||
+ (writeout_from != eversion_t::max()) ||
+ !(trimmed.empty()) ||
+ !missing.is_clean() ||
+ !(trimmed_dups.empty()) ||
+ (dirty_to_dups != eversion_t()) ||
+ (dirty_from_dups != eversion_t::max()) ||
+ (write_from_dups != eversion_t::max()) ||
+ may_include_deletes_in_missing_dirty;
+ }
+
+ void mark_log_for_rewrite() {
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_from(eversion_t());
+ mark_dirty_to_dups(eversion_t::max());
+ mark_dirty_from_dups(eversion_t());
+ touched_log = false;
+ }
+ bool get_may_include_deletes_in_missing_dirty() const {
+ return may_include_deletes_in_missing_dirty;
+ }
+protected:
+
+ /// DEBUG
+ std::set<std::string> log_keys_debug;
+ static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
+ if (!log_keys_debug)
+ return;
+ for (auto i = log_keys_debug->lower_bound(lb);
+ i != log_keys_debug->end();
+ log_keys_debug->erase(i++));
+ }
+ static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
+ if (!log_keys_debug)
+ return;
+ for (auto i = log_keys_debug->begin();
+ i != log_keys_debug->end() && *i < ub;
+ log_keys_debug->erase(i++));
+ }
+
+ void check();
+ void undirty() {
+ dirty_to = eversion_t();
+ dirty_from = eversion_t::max();
+ touched_log = true;
+ dirty_log = false;
+ trimmed.clear();
+ trimmed_dups.clear();
+ writeout_from = eversion_t::max();
+ check();
+ missing.flush();
+ dirty_to_dups = eversion_t();
+ dirty_from_dups = eversion_t::max();
+ write_from_dups = eversion_t::max();
+ }
+public:
+
+ // cppcheck-suppress noExplicitConstructor
+ PGLog(CephContext *cct) :
+ dirty_from(eversion_t::max()),
+ writeout_from(eversion_t::max()),
+ dirty_from_dups(eversion_t::max()),
+ write_from_dups(eversion_t::max()),
+ cct(cct),
+ pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
+ touched_log(false),
+ dirty_log(false),
+ clear_divergent_priors(false)
+ { }
+
+ void reset_backfill();
+
+ void clear();
+
+  //////////////////// get or set missing ////////////////////
+
+ const pg_missing_tracker_t& get_missing() const { return missing; }
+
+ void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) {
+ missing.add(oid, need, have, is_delete);
+ }
+
+ void missing_add_next_entry(const pg_log_entry_t& e) {
+ missing.add_next_event(e);
+ }
+
+  //////////////////// get or set log ////////////////////
+
+ const IndexedLog &get_log() const { return log; }
+
+ const eversion_t &get_tail() const { return log.tail; }
+
+ void set_tail(eversion_t tail) { log.tail = tail; }
+
+ const eversion_t &get_head() const { return log.head; }
+
+ void set_head(eversion_t head) { log.head = head; }
+
+ void set_last_requested(version_t last_requested) {
+ log.last_requested = last_requested;
+ }
+
+ void index() { log.index(); }
+
+ void unindex() { log.unindex(); }
+
+ void add(const pg_log_entry_t& e, bool applied = true) {
+ mark_writeout_from(e.version);
+ log.add(e, applied);
+ }
+
+ void reset_recovery_pointers() { log.reset_recovery_pointers(); }
+
+ static void clear_info_log(
+ spg_t pgid,
+ ObjectStore::Transaction *t);
+
+ void trim(
+ eversion_t trim_to,
+ pg_info_t &info,
+ bool transaction_applied = true,
+ bool async = false);
+
+ void roll_forward_to(
+ eversion_t roll_forward_to,
+ LogEntryHandler *h) {
+ if (log.roll_forward_to(
+ roll_forward_to,
+ h))
+ dirty_log = true;
+ }
+
+ eversion_t get_can_rollback_to() const {
+ return log.get_can_rollback_to();
+ }
+
+ void roll_forward(LogEntryHandler *h) {
+ roll_forward_to(
+ log.head,
+ h);
+ }
+
+ void skip_rollforward() {
+ log.skip_can_rollback_to_to_head();
+ }
+
+  //////////////////// get or set log & missing ////////////////////
+
+ void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
+ log.trim_rollback_info_to(log.head, h);
+ log.claim_log_and_clear_rollback_info(o);
+ missing.clear();
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
+ }
+
+ void split_into(
+ pg_t child_pgid,
+ unsigned split_bits,
+ PGLog *opg_log) {
+ log.split_out_child(child_pgid, split_bits, &opg_log->log);
+ missing.split_into(child_pgid, split_bits, &(opg_log->missing));
+ opg_log->mark_dirty_to(eversion_t::max());
+ opg_log->mark_dirty_to_dups(eversion_t::max());
+ mark_dirty_to(eversion_t::max());
+ mark_dirty_to_dups(eversion_t::max());
+ if (missing.may_include_deletes) {
+ opg_log->set_missing_may_contain_deletes();
+ }
+ }
+
+ void merge_from(
+ const std::vector<PGLog*>& sources,
+ eversion_t last_update) {
+ unindex();
+ missing.clear();
+
+ std::vector<pg_log_t*> slogs;
+ for (auto s : sources) {
+ slogs.push_back(&s->log);
+ }
+ log.merge_from(slogs, last_update);
+
+ index();
+
+ mark_log_for_rewrite();
+ }
+
+ void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
+ if (missing.is_missing(oid, v)) {
+ missing.got(oid, v);
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+
+ // raise last_complete?
+ if (missing.get_items().empty()) {
+ log.complete_to = log.log.end();
+ info.last_complete = info.last_update;
+ }
+ auto oldest_need = missing.get_oldest_need();
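+      // advance complete_to (and last_complete) past every log entry that
+      // is older than the oldest version we are still missing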
+ while (log.complete_to != log.log.end()) {
+ if (oldest_need <= log.complete_to->version)
+ break;
+ if (info.last_complete < log.complete_to->version)
+ info.last_complete = log.complete_to->version;
+ ++log.complete_to;
+ }
+ }
+
+ ceph_assert(log.get_can_rollback_to() >= v);
+ }
+
+ void reset_complete_to(pg_info_t *info) {
+ if (log.log.empty()) // caller is split_into()
+ return;
+ log.complete_to = log.log.begin();
+ ceph_assert(log.complete_to != log.log.end());
+ auto oldest_need = missing.get_oldest_need();
+ if (oldest_need != eversion_t()) {
+ while (log.complete_to->version < oldest_need) {
+ ++log.complete_to;
+ ceph_assert(log.complete_to != log.log.end());
+ }
+ }
+ if (!info)
+ return;
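+    // last_complete is the newest version strictly before complete_to,
+    // or zero if complete_to still points at the start of the log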
+ if (log.complete_to == log.log.begin()) {
+ info->last_complete = eversion_t();
+ } else {
+ --log.complete_to;
+ info->last_complete = log.complete_to->version;
+ ++log.complete_to;
+ }
+ }
+
+ void activate_not_complete(pg_info_t &info) {
+ reset_complete_to(&info);
+ log.last_requested = 0;
+ }
+
+ void proc_replica_log(pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t& omissing, pg_shard_t from) const;
+
+ void set_missing_may_contain_deletes() {
+ missing.may_include_deletes = true;
+ may_include_deletes_in_missing_dirty = true;
+ }
+
+ void rebuild_missing_set_with_deletes(ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const pg_info_t &info);
+
+protected:
+ static void split_by_object(
+ mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) {
+ while (!entries.empty()) {
+ auto &out_list = (*out_entries)[entries.front().soid];
+ out_list.splice(out_list.end(), entries, entries.begin());
+ }
+ }
+
+ /**
+ * _merge_object_divergent_entries
+ *
+ * There are 5 distinct cases:
+ * 1) There is a more recent update: in this case we assume we adjusted the
+ * store and missing during merge_log
+ * 2) The first entry in the divergent sequence is a create. This might
+ * either be because the object is a clone or because prior_version is
+ * eversion_t(). In this case the object does not exist and we must
+ * adjust missing and the store to match.
+ * 3) We are currently missing the object. In this case, we adjust the
+ * missing to our prior_version taking care to add a divergent_prior
+ * if necessary
+ * 4) We can rollback all of the entries. In this case, we do so using
+ * the rollbacker and return -- the object does not go into missing.
+ * 5) We cannot rollback at least 1 of the entries. In this case, we
+ * clear the object out of the store and add a missing entry at
+ * prior_version taking care to add a divergent_prior if
+ * necessary.
+ */
+ template <typename missing_type>
+ static void _merge_object_divergent_entries(
+ const IndexedLog &log, ///< [in] log to merge against
+ const hobject_t &hoid, ///< [in] object we are merging
+ const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
+ const pg_info_t &info, ///< [in] info for merging entries
+    eversion_t olog_can_rollback_to,      ///< [in] rollback boundary of input IndexedLog
+ missing_type &missing, ///< [in,out] missing to adjust, use
+ LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
+ const DoutPrefixProvider *dpp ///< [in] logging provider
+ ) {
+ ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
+ << " entries: " << orig_entries << dendl;
+
+ if (hoid > info.last_backfill) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill"
+ << dendl;
+ return;
+ }
+
+ // entries is non-empty
+ ceph_assert(!orig_entries.empty());
+ // strip out and ignore ERROR entries
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ eversion_t last;
+ bool seen_non_error = false;
+ for (auto i = orig_entries.begin();
+ i != orig_entries.end();
+ ++i) {
+ // all entries are on hoid
+ ceph_assert(i->soid == hoid);
+ // did not see error entries before this entry and this entry is not error
+ // then this entry is the first non error entry
+ bool first_non_error = ! seen_non_error && ! i->is_error();
+ if (! i->is_error() ) {
+ // see a non error entry now
+ seen_non_error = true;
+ }
+
+      // No need to check the first entry since its prior_version is unavailable
+      // in the list
+ // No need to check if the prior_version is the minimal version
+ // No need to check the first non-error entry since the leading error
+ // entries are not its prior version
+ if (i != orig_entries.begin() && i->prior_version != eversion_t() &&
+ ! first_non_error) {
+ // in increasing order of version
+ ceph_assert(i->version > last);
+ // prior_version correct (unless it is an ERROR entry)
+ ceph_assert(i->prior_version == last || i->is_error());
+ }
+ if (i->is_error()) {
+ ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
+ entries.push_back(*i);
+ last = i->version;
+ }
+ }
+ if (entries.empty()) {
+ ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl;
+ return;
+ }
+
+ const eversion_t prior_version = entries.begin()->prior_version;
+ const eversion_t first_divergent_update = entries.begin()->version;
+ const eversion_t last_divergent_update = entries.rbegin()->version;
+ const bool object_not_in_store =
+ !missing.is_missing(hoid) &&
+ entries.rbegin()->is_delete();
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: "
+ << object_not_in_store << dendl;
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version: " << prior_version
+ << " first_divergent_update: " << first_divergent_update
+ << " last_divergent_update: " << last_divergent_update
+ << dendl;
+
+ auto objiter = log.objects.find(hoid);
+ if (objiter != log.objects.end() &&
+ objiter->second->version >= first_divergent_update) {
+ /// Case 1)
+ ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: "
+ << *objiter->second << ", already merged" << dendl;
+
+ ceph_assert(objiter->second->version > last_divergent_update);
+
+ // ensure missing has been updated appropriately
+ if (objiter->second->is_update() ||
+ (missing.may_include_deletes && objiter->second->is_delete())) {
+ ceph_assert(missing.is_missing(hoid) &&
+ missing.get_items().at(hoid).need == objiter->second->version);
+ } else {
+ ceph_assert(!missing.is_missing(hoid));
+ }
+ missing.revise_have(hoid, eversion_t());
+ missing.mark_fully_dirty(hoid);
+ if (rollbacker) {
+ if (!object_not_in_store) {
+ rollbacker->remove(hoid);
+ }
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ <<" has no more recent entries in log" << dendl;
+ if (prior_version == eversion_t() || entries.front().is_clone()) {
+ /// Case 2)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version or op type indicates creation,"
+ << " deleting"
+ << dendl;
+ if (missing.is_missing(hoid))
+ missing.rm(missing.get_items().find(hoid));
+ if (rollbacker) {
+ if (!object_not_in_store) {
+ rollbacker->remove(hoid);
+ }
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ if (missing.is_missing(hoid)) {
+ /// Case 3)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing, " << missing.get_items().at(hoid)
+ << " adjusting" << dendl;
+
+ if (missing.get_items().at(hoid).have == prior_version) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing.have is prior_version " << prior_version
+ << " removing from missing" << dendl;
+ missing.rm(missing.get_items().find(hoid));
+ } else {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " missing.have is " << missing.get_items().at(hoid).have
+ << ", adjusting" << dendl;
+ missing.revise_need(hoid, prior_version, false);
+ if (prior_version <= info.log_tail) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version " << prior_version
+ << " <= info.log_tail "
+ << info.log_tail << dendl;
+ }
+ }
+ if (rollbacker) {
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ return;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " must be rolled back or recovered,"
+ << " attempting to rollback"
+ << dendl;
+ bool can_rollback = true;
+ // We are going to make an important decision based on the
+    // olog_can_rollback_to value we have received, so make sure it is logged.
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " olog_can_rollback_to: "
+ << olog_can_rollback_to << dendl;
+ /// Distinguish between 4) and 5)
+ for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
+ if (!i->can_rollback() || i->version <= olog_can_rollback_to) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback "
+ << *i << dendl;
+ can_rollback = false;
+ break;
+ }
+ }
+
+ if (can_rollback) {
+ /// Case 4)
+ for (auto i = entries.rbegin(); i != entries.rend(); ++i) {
+ ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to);
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " rolling back " << *i << dendl;
+ if (rollbacker)
+ rollbacker->rollback(*i);
+ }
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " rolled back" << dendl;
+ return;
+ } else {
+ /// Case 5)
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, "
+ << "removing and adding to missing" << dendl;
+ if (rollbacker) {
+ if (!object_not_in_store)
+ rollbacker->remove(hoid);
+ for (auto &&i: entries) {
+ rollbacker->trim(i);
+ }
+ }
+ missing.add(hoid, prior_version, eversion_t(), false);
+ if (prior_version <= info.log_tail) {
+ ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid
+ << " prior_version " << prior_version
+ << " <= info.log_tail "
+ << info.log_tail << dendl;
+ }
+ }
+ }
+
+ /// Merge all entries using above
+ template <typename missing_type>
+ static void _merge_divergent_entries(
+ const IndexedLog &log, ///< [in] log to merge against
+ mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge
+ const pg_info_t &oinfo, ///< [in] info for merging entries
+ eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog
+ missing_type &omissing, ///< [in,out] missing to adjust, use
+ LogEntryHandler *rollbacker, ///< [in] optional rollbacker object
+ const DoutPrefixProvider *dpp ///< [in] logging provider
+ ) {
+ std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
+ split_by_object(entries, &split);
+ for (auto i = split.begin(); i != split.end(); ++i) {
+ _merge_object_divergent_entries(
+ log,
+ i->first,
+ i->second,
+ oinfo,
+ olog_can_rollback_to,
+ omissing,
+ rollbacker,
+ dpp);
+ }
+ }
+
+ /**
+ * Exists for use in TestPGLog for simply testing single divergent log
+ * cases
+ */
+ void merge_old_entry(
+ ObjectStore::Transaction& t,
+ const pg_log_entry_t& oe,
+ const pg_info_t& info,
+ LogEntryHandler *rollbacker) {
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ entries.push_back(oe);
+ _merge_object_divergent_entries(
+ log,
+ oe.soid,
+ entries,
+ info,
+ log.get_can_rollback_to(),
+ missing,
+ rollbacker,
+ this);
+ }
+
+ bool merge_log_dups(const pg_log_t& olog);
+
+public:
+
+ void rewind_divergent_log(eversion_t newhead,
+ pg_info_t &info,
+ LogEntryHandler *rollbacker,
+ bool &dirty_info,
+ bool &dirty_big_info);
+
+ void merge_log(pg_info_t &oinfo,
+ pg_log_t&& olog,
+ pg_shard_t from,
+ pg_info_t &info, LogEntryHandler *rollbacker,
+ bool &dirty_info, bool &dirty_big_info);
+
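+  /**
+   * append_log_entries_update_missing
+   *
+   * Appends entries to *log (when log is non-null) and updates missing for
+   * every non-error entry at or below last_backfill. Returns true if the
+   * appended entries should invalidate the PG stats.
+   */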
+ template <typename missing_type>
+ static bool append_log_entries_update_missing(
+ const hobject_t &last_backfill,
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ bool maintain_rollback,
+ IndexedLog *log,
+ missing_type &missing,
+ LogEntryHandler *rollbacker,
+ const DoutPrefixProvider *dpp) {
+ bool invalidate_stats = false;
+ if (log && !entries.empty()) {
+ ceph_assert(log->head < entries.begin()->version);
+ }
+ for (auto p = entries.begin(); p != entries.end(); ++p) {
+ invalidate_stats = invalidate_stats || !p->is_error();
+ if (log) {
+ ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl;
+ log->add(*p);
+ }
+ if (p->soid <= last_backfill &&
+ !p->is_error()) {
+ if (missing.may_include_deletes) {
+ missing.add_next_event(*p);
+ } else {
+ if (p->is_delete()) {
+ missing.rm(p->soid, p->version);
+ } else {
+ missing.add_next_event(*p);
+ }
+ if (rollbacker) {
+ // hack to match PG::mark_all_unfound_lost
+ if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
+ rollbacker->try_stash(p->soid, p->version.version);
+ } else if (p->is_delete()) {
+ rollbacker->remove(p->soid);
+ }
+ }
+ }
+ }
+ }
+ return invalidate_stats;
+ }
+ bool append_new_log_entries(
+ const hobject_t &last_backfill,
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ LogEntryHandler *rollbacker) {
+ bool invalidate_stats = append_log_entries_update_missing(
+ last_backfill,
+ entries,
+ true,
+ &log,
+ missing,
+ rollbacker,
+ this);
+ if (!entries.empty()) {
+ mark_writeout_from(entries.begin()->version);
+ if (entries.begin()->is_lost_delete()) {
+ // hack: since lost deletes queue recovery directly, and don't
+ // go through activate_not_complete() again, our complete_to
+ // iterator may still point at log.end(). Reset it to point
+ // before these new lost_delete entries. This only occurs
+ // when lost+delete entries are initially added, which is
+        // always in a list consisting solely of lost_delete entries, so it is
+ // sufficient to check whether the first entry is a
+ // lost_delete
+ reset_complete_to(nullptr);
+ }
+ }
+ return invalidate_stats;
+ }
+
+ void write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list> *km,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ bool require_rollback);
+
+ static void write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid, std::map<eversion_t, hobject_t> &divergent_priors,
+ bool require_rollback,
+ const DoutPrefixProvider *dpp = nullptr);
+
+ static void write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll,
+ const ghobject_t &log_oid,
+ const pg_missing_tracker_t &missing,
+ bool require_rollback,
+ bool *rebuilt_missing_set_with_deletes,
+ const DoutPrefixProvider *dpp = nullptr);
+
+ static void _write_log_and_missing_wo_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ std::map<eversion_t, hobject_t> &divergent_priors,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ bool dirty_divergent_priors,
+ bool touch_log,
+ bool require_rollback,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ std::set<std::string> *log_keys_debug,
+ const DoutPrefixProvider *dpp = nullptr
+ );
+
+ static void _write_log_and_missing(
+ ObjectStore::Transaction& t,
+ std::map<std::string,ceph::buffer::list>* km,
+ pg_log_t &log,
+ const coll_t& coll, const ghobject_t &log_oid,
+ eversion_t dirty_to,
+ eversion_t dirty_from,
+ eversion_t writeout_from,
+ std::set<eversion_t> &&trimmed,
+ std::set<std::string> &&trimmed_dups,
+ const pg_missing_tracker_t &missing,
+ bool touch_log,
+ bool require_rollback,
+ bool clear_divergent_priors,
+ eversion_t dirty_to_dups,
+ eversion_t dirty_from_dups,
+ eversion_t write_from_dups,
+ bool *may_include_deletes_in_missing_dirty,
+ std::set<std::string> *log_keys_debug,
+ const DoutPrefixProvider *dpp = nullptr
+ );
+
+ void read_log_and_missing(
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t pgmeta_oid,
+ const pg_info_t &info,
+ std::ostringstream &oss,
+ bool tolerate_divergent_missing_log,
+ bool debug_verify_stored_missing = false
+ ) {
+ return read_log_and_missing(
+ cct, store, ch, pgmeta_oid, info,
+ log, missing, oss,
+ tolerate_divergent_missing_log,
+ &clear_divergent_priors,
+ this,
+ (pg_log_debug ? &log_keys_debug : nullptr),
+ debug_verify_stored_missing);
+ }
+
+ template <typename missing_type>
+ static void read_log_and_missing(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle &ch,
+ ghobject_t pgmeta_oid,
+ const pg_info_t &info,
+ IndexedLog &log,
+ missing_type &missing,
+ std::ostringstream &oss,
+ bool tolerate_divergent_missing_log,
+ bool *clear_divergent_priors = nullptr,
+ const DoutPrefixProvider *dpp = nullptr,
+ std::set<std::string> *log_keys_debug = nullptr,
+ bool debug_verify_stored_missing = false
+ ) {
+ ldpp_dout(dpp, 10) << "read_log_and_missing coll " << ch->cid
+ << " " << pgmeta_oid << dendl;
+ size_t total_dups = 0;
+
+ // legacy?
+ struct stat st;
+ int r = store->stat(ch, pgmeta_oid, &st);
+ ceph_assert(r == 0);
+ ceph_assert(st.st_size == 0);
+
+ // will get overridden below if it had been recorded
+ eversion_t on_disk_can_rollback_to = info.last_update;
+ eversion_t on_disk_rollback_info_trimmed_to = eversion_t();
+ ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch,
+ pgmeta_oid);
+ std::map<eversion_t, hobject_t> divergent_priors;
+ bool must_rebuild = false;
+ missing.may_include_deletes = false;
+ std::list<pg_log_entry_t> entries;
+ std::list<pg_log_dup_t> dups;
+ const auto NUM_DUPS_WARN_THRESHOLD = 2*cct->_conf->osd_pg_log_dups_tracked;
+ if (p) {
+ using ceph::decode;
+ for (p->seek_to_first(); p->valid() ; p->next()) {
+ // non-log pgmeta_oid keys are prefixed with _; skip those
+ if (p->key()[0] == '_')
+ continue;
+        auto bl = p->value(); // copy the ceph::buffer::list before creating the iterator
+ auto bp = bl.cbegin();
+ if (p->key() == "divergent_priors") {
+ decode(divergent_priors, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size()
+ << " divergent_priors" << dendl;
+ must_rebuild = true;
+ debug_verify_stored_missing = false;
+ } else if (p->key() == "can_rollback_to") {
+ decode(on_disk_can_rollback_to, bp);
+ } else if (p->key() == "rollback_info_trimmed_to") {
+ decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
+ } else if (p->key().substr(0, 7) == std::string("missing")) {
+ hobject_t oid;
+ pg_missing_item item;
+ decode(oid, bp);
+ decode(item, bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl;
+ if (item.is_delete()) {
+ ceph_assert(missing.may_include_deletes);
+ }
+ missing.add(oid, std::move(item));
+ } else if (p->key().substr(0, 4) == std::string("dup_")) {
+ ++total_dups;
+ pg_log_dup_t dup;
+ decode(dup, bp);
+ if (!dups.empty()) {
+ ceph_assert(dups.back().version < dup.version);
+ }
+ if (dups.size() == NUM_DUPS_WARN_THRESHOLD) {
+ ldpp_dout(dpp, 0) << "read_log_and_missing WARN num of dups exceeded "
+ << NUM_DUPS_WARN_THRESHOLD << "."
+ << " You can be hit by THE DUPS BUG"
+ << " https://tracker.ceph.com/issues/53729."
+ << " Consider ceph-objectstore-tool --op trim-pg-log-dups"
+ << dendl;
+ }
+ dups.push_back(dup);
+ } else {
+ pg_log_entry_t e;
+ e.decode_with_checksum(bp);
+ ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl;
+ if (!entries.empty()) {
+ pg_log_entry_t last_e(entries.back());
+ ceph_assert(last_e.version.version < e.version.version);
+ ceph_assert(last_e.version.epoch <= e.version.epoch);
+ }
+ entries.push_back(e);
+ if (log_keys_debug)
+ log_keys_debug->insert(e.get_key_name());
+ }
+ }
+ }
+ log = IndexedLog(
+ info.last_update,
+ info.log_tail,
+ on_disk_can_rollback_to,
+ on_disk_rollback_info_trimmed_to,
+ std::move(entries),
+ std::move(dups));
+
+ if (must_rebuild || debug_verify_stored_missing) {
+ // build missing
+ if (debug_verify_stored_missing || info.last_complete < info.last_update) {
+ ldpp_dout(dpp, 10)
+ << "read_log_and_missing checking for missing items over interval ("
+ << info.last_complete
+ << "," << info.last_update << "]" << dendl;
+
+ std::set<hobject_t> did;
+ std::set<hobject_t> checked;
+ std::set<hobject_t> skipped;
+ for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) {
+ if (i->soid > info.last_backfill)
+ continue;
+ if (i->is_error())
+ continue;
+ if (did.count(i->soid)) continue;
+ did.insert(i->soid);
+
+ if (!missing.may_include_deletes && i->is_delete())
+ continue;
+
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ if (oi.version < i->version) {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i
+ << " (have " << oi.version << ")"
+ << " clean_regions " << i->clean_regions << dendl;
+
+ if (debug_verify_stored_missing) {
+ auto miter = missing.get_items().find(i->soid);
+ ceph_assert(miter != missing.get_items().end());
+ ceph_assert(miter->second.need == i->version);
+ // the 'have' version is reset if an object is deleted,
+ // then created again
+ ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t());
+ checked.insert(i->soid);
+ } else {
+ missing.add(i->soid, i->version, oi.version, i->is_delete());
+ }
+ }
+ } else {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
+ if (debug_verify_stored_missing) {
+ auto miter = missing.get_items().find(i->soid);
+ if (i->is_delete()) {
+ ceph_assert(miter == missing.get_items().end() ||
+ (miter->second.need == i->version &&
+ miter->second.have == eversion_t()));
+ } else {
+ ceph_assert(miter != missing.get_items().end());
+ ceph_assert(miter->second.need == i->version);
+ ceph_assert(miter->second.have == eversion_t());
+ }
+ checked.insert(i->soid);
+ } else {
+ missing.add(i->soid, i->version, eversion_t(), i->is_delete());
+ }
+ }
+ }
+ if (debug_verify_stored_missing) {
+ for (auto &&i: missing.get_items()) {
+ if (checked.count(i.first))
+ continue;
+ if (i.first > info.last_backfill) {
+ ldpp_dout(dpp, -1) << __func__ << ": invalid missing std::set entry "
+ << "found before last_backfill: "
+ << i.first << " " << i.second
+ << " last_backfill = " << info.last_backfill
+ << dendl;
+ ceph_abort_msg("invalid missing std::set entry found");
+ }
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have);
+ } else {
+ ceph_assert(i.second.is_delete() || eversion_t() == i.second.have);
+ }
+ }
+ } else {
+ ceph_assert(must_rebuild);
+ for (auto i = divergent_priors.rbegin();
+ i != divergent_priors.rend();
+ ++i) {
+ if (i->first <= info.last_complete) break;
+ if (i->second > info.last_backfill)
+ continue;
+ if (did.count(i->second)) continue;
+ did.insert(i->second);
+ ceph::buffer::list bv;
+ int r = store->getattr(
+ ch,
+ ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ /**
+ * 1) we see this entry in the divergent priors mapping
+ * 2) we didn't see an entry for this object in the log
+ *
+ * From 1 & 2 we know that either the object does not exist
+ * or it is at the version specified in the divergent_priors
+	      * map: the object would have been deleted atomically
+ * with the addition of the divergent_priors entry, an older
+ * version would not have been recovered, and a newer version
+ * would show up in the log above.
+ */
+ /**
+ * Unfortunately the assessment above is incorrect because of
+ * http://tracker.ceph.com/issues/17916 (we were incorrectly
+	      * not removing the divergent_priors entries from disk state!),
+ * so let's check that.
+ */
+ if (oi.version > i->first && tolerate_divergent_missing_log) {
+ ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i
+ << ") inconsistent with disk state (" << oi
+ << "), assuming it is tracker.ceph.com/issues/17916"
+ << dendl;
+ } else {
+ ceph_assert(oi.version == i->first);
+ }
+ } else {
+ ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl;
+ missing.add(i->second, i->first, eversion_t(), false);
+ }
+ }
+ }
+ if (clear_divergent_priors)
+ (*clear_divergent_priors) = true;
+ }
+ }
+
+ if (!must_rebuild) {
+ if (clear_divergent_priors)
+ (*clear_divergent_priors) = false;
+ missing.flush();
+ }
+ ldpp_dout(dpp, 10) << "read_log_and_missing done coll " << ch->cid
+ << " total_dups=" << total_dups
+ << " log.dups.size()=" << log.dups.size() << dendl;
+ } // static read_log_and_missing
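+
+  // Summary of the pgmeta omap keys consumed above: "can_rollback_to",
+  // "rollback_info_trimmed_to", "may_include_deletes_in_missing", the legacy
+  // "divergent_priors" map, keys prefixed with "missing" (missing items),
+  // keys prefixed with "dup_" (log dups), keys starting with '_' (skipped),
+  // and everything else decoded as a pg_log_entry_t.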
+
+#ifdef WITH_SEASTAR
+ seastar::future<> read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ ghobject_t pgmeta_oid
+ ) {
+ return read_log_and_missing_crimson(
+ store, ch, info,
+ log, (pg_log_debug ? &log_keys_debug : nullptr),
+ missing, pgmeta_oid, this);
+ }
+
+ static seastar::future<> read_log_and_missing_crimson(
+ crimson::os::FuturizedStore &store,
+ crimson::os::CollectionRef ch,
+ const pg_info_t &info,
+ IndexedLog &log,
+ std::set<std::string>* log_keys_debug,
+ pg_missing_tracker_t &missing,
+ ghobject_t pgmeta_oid,
+ const DoutPrefixProvider *dpp = nullptr);
+
+#endif
+
+}; // struct PGLog
diff --git a/src/osd/PGPeeringEvent.cc b/src/osd/PGPeeringEvent.cc
new file mode 100644
index 000000000..2d28c6f84
--- /dev/null
+++ b/src/osd/PGPeeringEvent.cc
@@ -0,0 +1,17 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/mempool.h"
+#include "osd/PGPeeringEvent.h"
+#include "messages/MOSDPGLog.h"
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PGPeeringEvent, pg_peering_evt, osd);
+
+MLogRec::MLogRec(pg_shard_t from, MOSDPGLog *msg)
+ : from(from), msg(msg) {}
+
+void MLogRec::print(std::ostream *out) const
+{
+ *out << "MLogRec from " << from << " ";
+ msg->inner_print(*out);
+}
diff --git a/src/osd/PGPeeringEvent.h b/src/osd/PGPeeringEvent.h
new file mode 100644
index 000000000..2828880f6
--- /dev/null
+++ b/src/osd/PGPeeringEvent.h
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/event.hpp>
+
+#include "osd/osd_types.h"
+
+class MOSDPGLog;
+
+/// what we need to instantiate a pg
+struct PGCreateInfo {
+ spg_t pgid;
+ epoch_t epoch = 0;
+ pg_history_t history;
+ PastIntervals past_intervals;
+ bool by_mon;
+ PGCreateInfo(spg_t p, epoch_t e,
+ const pg_history_t& h,
+ const PastIntervals& pi,
+ bool mon)
+ : pgid(p), epoch(e), history(h), past_intervals(pi), by_mon(mon) {}
+};
+
+class PGPeeringEvent {
+ epoch_t epoch_sent;
+ epoch_t epoch_requested;
+ std::string desc;
+public:
+ boost::intrusive_ptr< const boost::statechart::event_base > evt;
+ bool requires_pg;
+ std::unique_ptr<PGCreateInfo> create_info;
+ MEMPOOL_CLASS_HELPERS();
+ template <class T>
+ PGPeeringEvent(
+ epoch_t epoch_sent,
+ epoch_t epoch_requested,
+ const T &evt_,
+ bool req = true,
+ PGCreateInfo *ci = 0)
+ : epoch_sent(epoch_sent),
+ epoch_requested(epoch_requested),
+ evt(evt_.intrusive_from_this()),
+ requires_pg(req),
+ create_info(ci) {
+ std::stringstream out;
+ out << "epoch_sent: " << epoch_sent
+ << " epoch_requested: " << epoch_requested << " ";
+ evt_.print(&out);
+ if (create_info) {
+ out << " +create_info";
+ }
+ desc = out.str();
+ }
+ epoch_t get_epoch_sent() const {
+ return epoch_sent;
+ }
+ epoch_t get_epoch_requested() const {
+ return epoch_requested;
+ }
+ const boost::statechart::event_base &get_event() const {
+ return *evt;
+ }
+ const std::string& get_desc() const {
+ return desc;
+ }
+};
+typedef std::shared_ptr<PGPeeringEvent> PGPeeringEventRef;
+typedef std::unique_ptr<PGPeeringEvent> PGPeeringEventURef;
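+
+// Illustrative use (sketch only; map_epoch, from_osd, shard and trim_to are
+// placeholders, and the dispatch code around the event is not defined here):
+//   PGPeeringEventRef evt = std::make_shared<PGPeeringEvent>(
+//     map_epoch, map_epoch,
+//     MTrim(map_epoch, from_osd, shard, trim_to));
+//   // evt->get_desc() yields a printable summary for logging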
+
+struct MInfoRec : boost::statechart::event< MInfoRec > {
+ pg_shard_t from;
+ pg_info_t info;
+ epoch_t msg_epoch;
+ std::optional<pg_lease_t> lease;
+ std::optional<pg_lease_ack_t> lease_ack;
+ MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch,
+ std::optional<pg_lease_t> l = {},
+ std::optional<pg_lease_ack_t> la = {})
+ : from(from), info(info), msg_epoch(msg_epoch),
+ lease(l), lease_ack(la) {}
+ void print(std::ostream *out) const {
+ *out << "MInfoRec from " << from << " info: " << info;
+ if (lease) {
+ *out << " " << *lease;
+ }
+ if (lease_ack) {
+ *out << " " << *lease_ack;
+ }
+ }
+};
+
+struct MLogRec : boost::statechart::event< MLogRec > {
+ pg_shard_t from;
+ boost::intrusive_ptr<MOSDPGLog> msg;
+ MLogRec(pg_shard_t from, MOSDPGLog *msg);
+ void print(std::ostream *out) const;
+};
+
+struct MNotifyRec : boost::statechart::event< MNotifyRec > {
+ spg_t pgid;
+ pg_shard_t from;
+ pg_notify_t notify;
+ uint64_t features;
+ MNotifyRec(spg_t p, pg_shard_t from, const pg_notify_t &notify, uint64_t f)
+ : pgid(p), from(from), notify(notify), features(f) {}
+ void print(std::ostream *out) const {
+ *out << "MNotifyRec " << pgid << " from " << from << " notify: " << notify
+ << " features: 0x" << std::hex << features << std::dec;
+ }
+};
+
+struct MQuery : boost::statechart::event< MQuery > {
+ spg_t pgid;
+ pg_shard_t from;
+ pg_query_t query;
+ epoch_t query_epoch;
+ MQuery(spg_t p, pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
+ : pgid(p), from(from), query(query), query_epoch(query_epoch) {}
+ void print(std::ostream *out) const {
+ *out << "MQuery " << pgid << " from " << from
+ << " query_epoch " << query_epoch
+ << " query: " << query;
+ }
+};
+
+struct MTrim : boost::statechart::event<MTrim> {
+ epoch_t epoch;
+ int from;
+ shard_id_t shard;
+ eversion_t trim_to;
+ MTrim(epoch_t epoch, int from, shard_id_t shard, eversion_t trim_to)
+ : epoch(epoch), from(from), shard(shard), trim_to(trim_to) {}
+ void print(std::ostream *out) const {
+ *out << "MTrim epoch " << epoch << " from " << from << " shard " << shard
+ << " trim_to " << trim_to;
+ }
+};
+
+struct MLease : boost::statechart::event<MLease> {
+ epoch_t epoch;
+ int from;
+ pg_lease_t lease;
+ MLease(epoch_t epoch, int from, pg_lease_t l)
+ : epoch(epoch), from(from), lease(l) {}
+ void print(std::ostream *out) const {
+ *out << "MLease epoch " << epoch << " from osd." << from << " " << lease;
+ }
+};
+
+struct MLeaseAck : boost::statechart::event<MLeaseAck> {
+ epoch_t epoch;
+ int from;
+ pg_lease_ack_t lease_ack;
+ MLeaseAck(epoch_t epoch, int from, pg_lease_ack_t l)
+ : epoch(epoch), from(from), lease_ack(l) {}
+ void print(std::ostream *out) const {
+ *out << "MLeaseAck epoch " << epoch << " from osd." << from
+ << " " << lease_ack;
+ }
+};
+
+struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
+ unsigned priority;
+ int64_t primary_num_bytes;
+ int64_t local_num_bytes;
+ explicit RequestBackfillPrio(unsigned prio, int64_t pbytes, int64_t lbytes) :
+ boost::statechart::event< RequestBackfillPrio >(),
+ priority(prio), primary_num_bytes(pbytes), local_num_bytes(lbytes) {}
+ void print(std::ostream *out) const {
+ *out << "RequestBackfillPrio: priority " << priority
+ << " primary bytes " << primary_num_bytes
+ << " local bytes " << local_num_bytes;
+ }
+};
+
+struct RequestRecoveryPrio : boost::statechart::event< RequestRecoveryPrio > {
+ unsigned priority;
+ explicit RequestRecoveryPrio(unsigned prio) :
+ boost::statechart::event< RequestRecoveryPrio >(),
+ priority(prio) {}
+ void print(std::ostream *out) const {
+ *out << "RequestRecoveryPrio: priority " << priority;
+ }
+};
+
+#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
+ T() : boost::statechart::event< T >() {} \
+ void print(std::ostream *out) const { \
+ *out << #T; \
+ } \
+ };
+
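+// For reference, TrivialEvent(RecoveryDone) expands to roughly:
+//   struct RecoveryDone : boost::statechart::event<RecoveryDone> {
+//     RecoveryDone() : boost::statechart::event<RecoveryDone>() {}
+//     void print(std::ostream *out) const { *out << "RecoveryDone"; }
+//   };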
+TrivialEvent(NullEvt)
+TrivialEvent(RemoteBackfillReserved)
+TrivialEvent(RemoteReservationRejectedTooFull)
+TrivialEvent(RemoteReservationRevokedTooFull)
+TrivialEvent(RemoteReservationRevoked)
+TrivialEvent(RemoteReservationCanceled)
+TrivialEvent(RemoteRecoveryReserved)
+TrivialEvent(RecoveryDone)
+
+struct DeferRecovery : boost::statechart::event<DeferRecovery> {
+ float delay;
+ explicit DeferRecovery(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferRecovery: delay " << delay;
+ }
+};
+
+struct DeferBackfill : boost::statechart::event<DeferBackfill> {
+ float delay;
+ explicit DeferBackfill(float delay) : delay(delay) {}
+ void print(std::ostream *out) const {
+ *out << "DeferBackfill: delay " << delay;
+ }
+};
+
+TrivialEvent(RenewLease)
diff --git a/src/osd/PGStateUtils.cc b/src/osd/PGStateUtils.cc
new file mode 100644
index 000000000..5dbe78eb7
--- /dev/null
+++ b/src/osd/PGStateUtils.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PGStateUtils.h"
+#include "common/Clock.h"
+
+using ceph::Formatter;
+
+/*------NamedState----*/
+NamedState::NamedState(PGStateHistory *pgsh, const char *state_name)
+ : pgsh(pgsh), state_name(state_name), enter_time(ceph_clock_now()) {
+ if(pgsh) {
+ pgsh->enter(enter_time, state_name);
+ }
+}
+
+NamedState::~NamedState() {
+ if(pgsh) {
+ pgsh->exit(state_name);
+ }
+}
+
+/*---------PGStateHistory---------*/
+void PGStateHistory::enter(const utime_t entime, const char* state)
+{
+ if (pi == nullptr) {
+ pi = std::make_unique<PGStateInstance>();
+ }
+ pi->enter_state(entime, state);
+}
+
+void PGStateHistory::exit(const char* state) {
+ pi->setepoch(es.get_osdmap_epoch());
+ pi->exit_state(ceph_clock_now());
+ if (pi->empty()) {
+ reset();
+ }
+}
+
+void PGStateHistory::dump(Formatter* f) const {
+ f->open_array_section("history");
+ for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
+ f->open_object_section("epochs");
+ f->dump_stream("epoch") << (*pi)->this_epoch;
+ f->open_array_section("states");
+ for (auto she : (*pi)->state_history) {
+ f->open_object_section("state");
+ f->dump_string("state", std::get<2>(she));
+ f->dump_stream("enter") << std::get<0>(she);
+ f->dump_stream("exit") << std::get<1>(she);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+}
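+
+// With a JSON formatter the dump above produces a structure resembling
+// (values are illustrative):
+//   {"history": [{"epoch": 42,
+//                 "states": [{"state": "...", "enter": "...", "exit": "..."}]}]}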
diff --git a/src/osd/PGStateUtils.h b/src/osd/PGStateUtils.h
new file mode 100644
index 000000000..952464641
--- /dev/null
+++ b/src/osd/PGStateUtils.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/utime.h"
+#include "common/Formatter.h"
+
+#include <stack>
+#include <vector>
+#include <boost/circular_buffer.hpp>
+
+class PGStateHistory;
+
+struct EpochSource {
+ virtual epoch_t get_osdmap_epoch() const = 0;
+ virtual ~EpochSource() {}
+};
+
+struct NamedState {
+ PGStateHistory *pgsh;
+ const char *state_name;
+ utime_t enter_time;
+ const char *get_state_name() { return state_name; }
+ NamedState(
+ PGStateHistory *pgsh,
+ const char *state_name_);
+ virtual ~NamedState();
+};
+
+using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
+using embedded_state = std::pair<utime_t, const char*>;
+
+struct PGStateInstance {
+ // Time spent in pg states
+
+ void setepoch(const epoch_t current_epoch) {
+ this_epoch = current_epoch;
+ }
+
+ void enter_state(const utime_t entime, const char* state) {
+ embedded_states.push(std::make_pair(entime, state));
+ }
+
+ void exit_state(const utime_t extime) {
+ embedded_state this_state = embedded_states.top();
+ state_history.push_back(state_history_entry{
+ this_state.first, extime, this_state.second});
+ embedded_states.pop();
+ }
+
+ bool empty() const {
+ return embedded_states.empty();
+ }
+
+ epoch_t this_epoch;
+ std::vector<state_history_entry> state_history;
+ std::stack<embedded_state> embedded_states;
+};
+
+class PGStateHistory {
+public:
+ PGStateHistory(const EpochSource &es) : buffer(10), es(es) {}
+
+ void enter(const utime_t entime, const char* state);
+
+ void exit(const char* state);
+
+ void reset() {
+ buffer.push_back(std::move(pi));
+ pi = nullptr;
+ }
+
+ void dump(ceph::Formatter* f) const;
+
+ const char *get_current_state() const {
+ if (pi == nullptr) return "unknown";
+ return std::get<1>(pi->embedded_states.top());
+ }
+
+private:
+ std::unique_ptr<PGStateInstance> pi;
+ boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
+ const EpochSource &es;
+};
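+
+// Usage sketch (illustrative only): peering state structs embed NamedState so
+// that constructing the state records enter() and destroying it records exit():
+//   struct Active : NamedState {
+//     explicit Active(PGStateHistory *h) : NamedState(h, "Active") {}
+//   };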
diff --git a/src/osd/PGTransaction.h b/src/osd/PGTransaction.h
new file mode 100644
index 000000000..3b5b9e72c
--- /dev/null
+++ b/src/osd/PGTransaction.h
@@ -0,0 +1,601 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef PGTRANSACTION_H
+#define PGTRANSACTION_H
+
+#include <map>
+#include <memory>
+#include <optional>
+
+#include "common/hobject.h"
+#include "osd/osd_types.h"
+#include "osd/osd_internal_types.h"
+#include "common/interval_map.h"
+#include "common/inline_variant.h"
+
+/**
+ * This class represents transactions which can be submitted to
+ * a PGBackend. For expediency, there are some constraints on
+ * the operations submitted:
+ * 1) Rename sources may only be referenced prior to the rename
+ * operation to the destination.
+ * 2) The graph formed by edges of source->destination for clones
+ * (Create) and Renames must be acyclic.
+ * 3) clone_range sources must not be modified by the same
+ * transaction
+ */
+class PGTransaction {
+public:
+ std::map<hobject_t, ObjectContextRef> obc_map;
+
+ class ObjectOperation {
+ public:
+ struct Init
+ {
+ struct None {};
+ struct Create {};
+ struct Clone {
+ hobject_t source;
+ };
+ struct Rename {
+ hobject_t source; // must be temp object
+ };
+ };
+ using InitType = boost::variant<
+ Init::None,
+ Init::Create,
+ Init::Clone,
+ Init::Rename>;
+
+ InitType init_type = Init::None();
+ bool delete_first = false;
+
+ /**
+     * is_delete() indicates that we are deleting an object which already
+     * exists and not recreating it. delete_first means that the transaction
+     * logically removes the object before the rest of the operation applies.
+
+ * There are really 4 cases:
+
+ * 1) We are modifying an existing object (is_none() &&
+ * !is_delete())
+ * a) If it's an append, we just write into the log entry the old size
+ * b) If it's an actual overwrite, we save the old versions of the
+ * extents being overwritten and write those offsets into the log
+ * entry
+     * 2) We are removing and then recreating an object (is_fresh_object() &&
+     *    deletes_first()) -- stash
+     * 3) We are removing an object without recreating it (is_delete()) -- stash
+     * 4) We are creating an object (is_fresh_object() && !deletes_first()) -- create (no
+ * stash)
+ *
+ * Create, Clone, Rename are the three ways we can recreate it.
+ * ECBackend transaction planning needs this context
+ * to figure out how to perform the transaction.
+ */
+ bool deletes_first() const {
+ return delete_first;
+ }
+ bool is_delete() const {
+ return boost::get<Init::None>(&init_type) != nullptr && delete_first;
+ }
+ bool is_none() const {
+ return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
+ }
+ bool is_fresh_object() const {
+ return boost::get<Init::None>(&init_type) == nullptr;
+ }
+ bool is_rename() const {
+ return boost::get<Init::Rename>(&init_type) != nullptr;
+ }
+ bool has_source(hobject_t *source = nullptr) const {
+ return match(
+ init_type,
+ [&](const Init::Clone &op) -> bool {
+ if (source)
+ *source = op.source;
+ return true;
+ },
+ [&](const Init::Rename &op) -> bool {
+ if (source)
+ *source = op.source;
+ return true;
+ },
+ [&](const Init::None &) -> bool { return false; },
+ [&](const Init::Create &) -> bool { return false; });
+ }
+
+ bool clear_omap = false;
+
+ /**
+     * truncate: stored as the pair <lowest truncate offset, final object size>
+ *
+ * truncate is represented as a pair because in the event of
+ * multiple truncates within a single transaction we need to
+ * remember the lowest truncate and the final object size
+ * (the last truncate). We also adjust the buffers map
+ * to account for truncates overriding previous writes */
+ std::optional<std::pair<uint64_t, uint64_t> > truncate = std::nullopt;
+
+ std::map<std::string, std::optional<ceph::buffer::list> > attr_updates;
+
+ enum class OmapUpdateType {Remove, Insert, RemoveRange};
+ std::vector<std::pair<OmapUpdateType, ceph::buffer::list> > omap_updates;
+
+ std::optional<ceph::buffer::list> omap_header;
+
+ /// (old, new) -- only valid with no truncate or buffer updates
+ std::optional<std::pair<std::set<snapid_t>, std::set<snapid_t>>> updated_snaps;
+
+ struct alloc_hint_t {
+ uint64_t expected_object_size;
+ uint64_t expected_write_size;
+ uint32_t flags;
+ };
+ std::optional<alloc_hint_t> alloc_hint;
+
+ struct BufferUpdate {
+ struct Write {
+ ceph::buffer::list buffer;
+ uint32_t fadvise_flags;
+ };
+ struct Zero {
+ uint64_t len;
+ };
+ struct CloneRange {
+ hobject_t from;
+ uint64_t offset;
+ uint64_t len;
+ };
+ };
+ using BufferUpdateType = boost::variant<
+ BufferUpdate::Write,
+ BufferUpdate::Zero,
+ BufferUpdate::CloneRange>;
+
+ private:
+ struct SplitMerger {
+ BufferUpdateType split(
+ uint64_t offset,
+ uint64_t len,
+ const BufferUpdateType &bu) const {
+ return match(
+ bu,
+ [&](const BufferUpdate::Write &w) -> BufferUpdateType {
+ ceph::buffer::list bl;
+ bl.substr_of(w.buffer, offset, len);
+ return BufferUpdate::Write{bl, w.fadvise_flags};
+ },
+ [&](const BufferUpdate::Zero &) -> BufferUpdateType {
+ return BufferUpdate::Zero{len};
+ },
+ [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
+ return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
+ });
+ }
+ uint64_t length(
+ const BufferUpdateType &left) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> uint64_t {
+ return w.buffer.length();
+ },
+ [&](const BufferUpdate::Zero &z) -> uint64_t {
+ return z.len;
+ },
+ [&](const BufferUpdate::CloneRange &c) -> uint64_t {
+ return c.len;
+ });
+ }
+ bool can_merge(
+ const BufferUpdateType &left,
+ const BufferUpdateType &right) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> bool {
+ auto r = boost::get<BufferUpdate::Write>(&right);
+ return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
+ },
+ [&](const BufferUpdate::Zero &) -> bool {
+ auto r = boost::get<BufferUpdate::Zero>(&right);
+ return r != nullptr;
+ },
+ [&](const BufferUpdate::CloneRange &c) -> bool {
+ return false;
+ });
+ }
+ BufferUpdateType merge(
+ BufferUpdateType &&left,
+ BufferUpdateType &&right) const {
+ return match(
+ left,
+ [&](const BufferUpdate::Write &w) -> BufferUpdateType {
+ auto r = boost::get<BufferUpdate::Write>(&right);
+ ceph_assert(r && w.fadvise_flags == r->fadvise_flags);
+ ceph::buffer::list bl = w.buffer;
+ bl.append(r->buffer);
+ return BufferUpdate::Write{bl, w.fadvise_flags};
+ },
+ [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
+ auto r = boost::get<BufferUpdate::Zero>(&right);
+ ceph_assert(r);
+ return BufferUpdate::Zero{z.len + r->len};
+ },
+ [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
+ ceph_abort_msg("violates can_merge condition");
+ return left;
+ });
+ }
+ };
+ public:
+ using buffer_update_type = interval_map<
+ uint64_t, BufferUpdateType, SplitMerger>;
+ buffer_update_type buffer_updates;
+
+ friend class PGTransaction;
+ };
+ std::map<hobject_t, ObjectOperation> op_map;
+private:
+ ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
+ auto &op = op_map[hoid];
+ ceph_assert(!op.is_delete());
+ return op;
+ }
+ ObjectOperation &get_object_op(const hobject_t &hoid) {
+ return op_map[hoid];
+ }
+public:
+ void add_obc(
+ ObjectContextRef obc) {
+ ceph_assert(obc);
+ obc_map[obc->obs.oi.soid] = obc;
+ }
+ /// Sets up state for new object
+ void create(
+ const hobject_t &hoid
+ ) {
+ auto &op = op_map[hoid];
+ ceph_assert(op.is_none() || op.is_delete());
+ op.init_type = ObjectOperation::Init::Create();
+ }
+
+ /// Sets up state for target cloned from source
+ void clone(
+ const hobject_t &target, ///< [in] obj to clone to
+ const hobject_t &source ///< [in] obj to clone from
+ ) {
+ auto &op = op_map[target];
+ ceph_assert(op.is_none() || op.is_delete());
+ op.init_type = ObjectOperation::Init::Clone{source};
+ }
+
+ /// Sets up state for target renamed from source
+ void rename(
+ const hobject_t &target, ///< [in] to, must not exist, be non-temp
+ const hobject_t &source ///< [in] source (must be a temp object)
+ ) {
+ ceph_assert(source.is_temp());
+ ceph_assert(!target.is_temp());
+ auto &op = op_map[target];
+ ceph_assert(op.is_none() || op.is_delete());
+
+ bool del_first = op.is_delete();
+ auto iter = op_map.find(source);
+ if (iter != op_map.end()) {
+ op = iter->second;
+ op_map.erase(iter);
+ op.delete_first = del_first;
+ }
+
+ op.init_type = ObjectOperation::Init::Rename{source};
+ }
+
+ /// Remove -- must not be called on rename target
+ void remove(
+ const hobject_t &hoid ///< [in] obj to remove
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ if (!op.is_fresh_object()) {
+ ceph_assert(!op.updated_snaps);
+ op = ObjectOperation();
+ op.delete_first = true;
+ } else {
+ ceph_assert(!op.is_rename());
+ op_map.erase(hoid); // make it a noop if it's a fresh object
+ }
+ }
+
+ void update_snaps(
+ const hobject_t &hoid, ///< [in] object for snaps
+ const std::set<snapid_t> &old_snaps,///< [in] old snaps value
+ const std::set<snapid_t> &new_snaps ///< [in] new snaps value
+ ) {
+ auto &op = get_object_op(hoid);
+ ceph_assert(!op.updated_snaps);
+ ceph_assert(op.buffer_updates.empty());
+ ceph_assert(!op.truncate);
+ op.updated_snaps = make_pair(
+ old_snaps,
+ new_snaps);
+ }
+
+ /// Clears, truncates
+ void omap_clear(
+ const hobject_t &hoid ///< [in] object to clear omap
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.clear_omap = true;
+ op.omap_updates.clear();
+ op.omap_header = std::nullopt;
+ }
+ void truncate(
+ const hobject_t &hoid, ///< [in] object
+ uint64_t off ///< [in] offset to truncate to
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.erase(
+ off,
+ std::numeric_limits<uint64_t>::max() - off);
+ if (!op.truncate || off < op.truncate->first) {
+ op.truncate = std::pair<uint64_t, uint64_t>(off, off);
+ } else {
+ op.truncate->second = off;
+ }
+ }
+
+ /// Attr ops
+ void setattrs(
+ const hobject_t &hoid, ///< [in] object to write
+ std::map<std::string, ceph::buffer::list> &attrs ///< [in] attrs, may be cleared
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ for (auto &&i: attrs) {
+ auto& d = op.attr_updates[i.first];
+ d = i.second;
+ d->rebuild();
+ }
+ }
+ void setattr(
+ const hobject_t &hoid, ///< [in] object to write
+ const std::string &attrname, ///< [in] attr to write
+ ceph::buffer::list &bl ///< [in] val to write, may be claimed
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ auto& d = op.attr_updates[attrname];
+ d = bl;
+ d->rebuild();
+ }
+ void rmattr(
+ const hobject_t &hoid, ///< [in] object to write
+ const std::string &attrname ///< [in] attr to remove
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.attr_updates[attrname] = std::nullopt;
+ }
+
+ /// set alloc hint
+ void set_alloc_hint(
+ const hobject_t &hoid, ///< [in] object (must exist)
+ uint64_t expected_object_size, ///< [in]
+ uint64_t expected_write_size,
+ uint32_t flags
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.alloc_hint = ObjectOperation::alloc_hint_t{
+ expected_object_size, expected_write_size, flags};
+ }
+
+ /// Buffer updates
+ void write(
+ const hobject_t &hoid, ///< [in] object to write
+ uint64_t off, ///< [in] off at which to write
+ uint64_t len, ///< [in] len to write from bl
+ ceph::buffer::list &bl, ///< [in] bl to write will be claimed to len
+ uint32_t fadvise_flags = 0 ///< [in] fadvise hint
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ ceph_assert(len > 0);
+ ceph_assert(len == bl.length());
+ op.buffer_updates.insert(
+ off,
+ len,
+ ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
+ }
+ void clone_range(
+ const hobject_t &from, ///< [in] from
+ const hobject_t &to, ///< [in] to
+ uint64_t fromoff, ///< [in] offset
+ uint64_t len, ///< [in] len
+ uint64_t tooff ///< [in] offset
+ ) {
+ auto &op = get_object_op_for_modify(to);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.insert(
+ tooff,
+ len,
+ ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
+ }
+ void zero(
+ const hobject_t &hoid, ///< [in] object
+ uint64_t off, ///< [in] offset to start zeroing at
+ uint64_t len ///< [in] amount to zero
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ ceph_assert(!op.updated_snaps);
+ op.buffer_updates.insert(
+ off,
+ len,
+ ObjectOperation::BufferUpdate::Zero{len});
+ }
+
+ /// Omap updates
+ void omap_setkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &keys_bl ///< [in] encoded map<string, ceph::buffer::list>
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::Insert,
+ keys_bl));
+ }
+ void omap_setkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ std::map<std::string, ceph::buffer::list> &keys ///< [in] omap keys, may be cleared
+ ) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(keys, bl);
+ omap_setkeys(hoid, bl);
+ }
+ void omap_rmkeys(
+ const hobject_t &hoid, ///< [in] object to write
+    ceph::buffer::list &keys_bl ///< [in] encoded std::set<std::string>
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::Remove,
+ keys_bl));
+ }
+ void omap_rmkeys(
+ const hobject_t &hoid, ///< [in] object to write
+ std::set<std::string> &keys ///< [in] omap keys, may be cleared
+ ) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(keys, bl);
+ omap_rmkeys(hoid, bl);
+ }
+ void omap_rmkeyrange(
+ const hobject_t &hoid, ///< [in] object to write
+    ceph::buffer::list &range_bl ///< [in] encoded pair of std::string: [begin, end)
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_updates.emplace_back(
+ std::make_pair(
+ ObjectOperation::OmapUpdateType::RemoveRange,
+ range_bl));
+ }
+ void omap_rmkeyrange(
+ const hobject_t &hoid, ///< [in] object to write
+ std::string& key_begin, ///< [in] first key in range
+    std::string& key_end ///< [in] first key past the range; range is [key_begin, key_end)
+ ) {
+ ceph::buffer::list bl;
+ ::encode(key_begin, bl);
+ ::encode(key_end, bl);
+ omap_rmkeyrange(hoid, bl);
+ }
+ void omap_setheader(
+ const hobject_t &hoid, ///< [in] object to write
+ ceph::buffer::list &header ///< [in] header
+ ) {
+ auto &op = get_object_op_for_modify(hoid);
+ op.omap_header = header;
+ }
+
+ bool empty() const {
+ return op_map.empty();
+ }
+
+ uint64_t get_bytes_written() const {
+ uint64_t ret = 0;
+ for (auto &&i: op_map) {
+ for (auto &&j: i.second.buffer_updates) {
+ ret += j.get_len();
+ }
+ }
+ return ret;
+ }
+
+ void nop(
+ const hobject_t &hoid ///< [in] obj to which we are doing nothing
+ ) {
+ get_object_op_for_modify(hoid);
+ }
+
+  /* Calls t() on each pair<hobject_t, ObjectOperation> such that clone/rename
+   * sinks are always visited before their clone/rename sources
+ *
+ * TODO: add a fast path for the single object case and possibly the single
+ * object clone from source case (make_writeable made a clone).
+ *
+ * This structure only requires that the source->sink graph be acyclic.
+ * This is much more general than is actually required by PrimaryLogPG.
+ * Only 4 flavors of multi-object transactions actually happen:
+ * 1) rename temp -> object for copyfrom
+ * 2) clone head -> clone, modify head for make_writeable on normal head write
+ * 3) clone clone -> head for rollback
+ * 4) 2 + 3
+ *
+ * We can bypass the below logic for single object transactions trivially
+ * (including case 1 above since temp doesn't show up again).
+ * For 2-3, we could add something ad-hoc to ensure that they happen in the
+ * right order, but it actually seems easier to just do the graph construction.
+ */
+ template <typename T>
+ void safe_create_traverse(T &&t) {
+ std::map<hobject_t, std::list<hobject_t>> dgraph;
+ std::list<hobject_t> stack;
+
+ // Populate stack with roots, dgraph with edges
+ for (auto &&opair: op_map) {
+ hobject_t source;
+ if (opair.second.has_source(&source)) {
+ auto &l = dgraph[source];
+ if (l.empty() && !op_map.count(source)) {
+ /* Source oids not in op_map need to be added as roots
+ * (but only once!) */
+ stack.push_back(source);
+ }
+ l.push_back(opair.first);
+ } else {
+ stack.push_back(opair.first);
+ }
+ }
+
+ /* Why don't we need to worry about accessing the same node
+ * twice? dgraph nodes always have in-degree at most 1 because
+ * the inverse graph nodes (source->dest) can have out-degree
+ * at most 1 (only one possible source). We do a post-order
+   * depth-first traversal here to ensure we call t() on children
+   * before parents.
+ */
+ while (!stack.empty()) {
+ hobject_t &cur = stack.front();
+ auto diter = dgraph.find(cur);
+ if (diter == dgraph.end()) {
+ /* Leaf: pop and call t() */
+ auto opiter = op_map.find(cur);
+ if (opiter != op_map.end())
+ t(*opiter);
+ stack.pop_front();
+ } else {
+ /* Internal node: push children onto stack, remove edge,
+ * recurse. When this node is encountered again, it'll
+ * be a leaf */
+ ceph_assert(!diter->second.empty());
+ stack.splice(stack.begin(), diter->second);
+ dgraph.erase(diter);
+ }
+ }
+ }
+};
+using PGTransactionUPtr = std::unique_ptr<PGTransaction>;
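+
+// Illustrative construction (a sketch; head_oid and the payload are
+// hypothetical, not part of this interface):
+//   PGTransaction t;
+//   t.create(head_oid);
+//   ceph::buffer::list bl;
+//   bl.append("payload");
+//   t.write(head_oid, 0, bl.length(), bl);
+//   t.safe_create_traverse([&](auto &p) {
+//     // p.first is the hobject_t, p.second the accumulated ObjectOperation
+//   });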
+
+#endif
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
new file mode 100644
index 000000000..9709f3ce1
--- /dev/null
+++ b/src/osd/PeeringState.cc
@@ -0,0 +1,7607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PGPeeringEvent.h"
+#include "common/ceph_releases.h"
+#include "common/dout.h"
+#include "PeeringState.h"
+
+#include "messages/MOSDPGRemove.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGInfo2.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGNotify2.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGQuery2.h"
+#include "messages/MOSDPGLease.h"
+#include "messages/MOSDPGLeaseAck.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+
+using std::dec;
+using std::hex;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::stringstream;
+using std::vector;
+
+using ceph::Formatter;
+using ceph::make_message;
+
+BufferedRecoveryMessages::BufferedRecoveryMessages(
+ ceph_release_t r,
+ PeeringCtx &ctx)
+ : require_osd_release(r) {
+ // steal messages from ctx
+ message_map.swap(ctx.message_map);
+}
+
+void BufferedRecoveryMessages::send_notify(int to, const pg_notify_t &n)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ spg_t pgid(n.info.pgid.pgid, n.to);
+ send_osd_message(to, make_message<MOSDPGNotify2>(pgid, n));
+ } else {
+ send_osd_message(to, make_message<MOSDPGNotify>(n.epoch_sent, vector{n}));
+ }
+}
+
+void BufferedRecoveryMessages::send_query(
+ int to,
+ spg_t to_spgid,
+ const pg_query_t &q)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ send_osd_message(to,
+ make_message<MOSDPGQuery2>(to_spgid, q));
+ } else {
+ auto m = make_message<MOSDPGQuery>(
+ q.epoch_sent,
+ MOSDPGQuery::pg_list_t{{to_spgid, q}});
+ send_osd_message(to, m);
+ }
+}
+
+void BufferedRecoveryMessages::send_info(
+ int to,
+ spg_t to_spgid,
+ epoch_t min_epoch,
+ epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease,
+ std::optional<pg_lease_ack_t> lease_ack)
+{
+ if (require_osd_release >= ceph_release_t::octopus) {
+ send_osd_message(
+ to,
+ make_message<MOSDPGInfo2>(
+ to_spgid,
+ info,
+ cur_epoch,
+ min_epoch,
+ lease,
+ lease_ack)
+ );
+ } else {
+ send_osd_message(
+ to,
+ make_message<MOSDPGInfo>(
+ cur_epoch,
+ vector{pg_notify_t{to_spgid.shard,
+ info.pgid.shard,
+ min_epoch, cur_epoch,
+ info, PastIntervals{}}})
+ );
+ }
+}
+
+void PGPool::update(OSDMapRef map)
+{
+ const pg_pool_t *pi = map->get_pg_pool(id);
+ if (!pi) {
+ return; // pool has been deleted
+ }
+ info = *pi;
+ name = map->get_pool_name(id);
+
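+  // Decide whether the cached SnapContext needs a refresh: either we skipped
+  // an epoch (our cache may be stale) or the pool's snaps changed in this
+  // epoch; the refresh below only applies in pool-snaps mode.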
+ bool updated = false;
+ if ((map->get_epoch() != cached_epoch + 1) ||
+ (pi->get_snap_epoch() == map->get_epoch())) {
+ updated = true;
+ }
+
+ if (info.is_pool_snaps_mode() && updated) {
+ snapc = pi->get_snap_context();
+ }
+ cached_epoch = map->get_epoch();
+}
+
+/*-------------Peering State Helpers----------------*/
+#undef dout_prefix
+#define dout_prefix (dpp->gen_prefix(*_dout))
+#undef psdout
+#define psdout(x) ldout(cct, x)
+
+PeeringState::PeeringState(
+ CephContext *cct,
+ pg_shard_t pg_whoami,
+ spg_t spgid,
+ const PGPool &_pool,
+ OSDMapRef curmap,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl)
+ : state_history(*pl),
+ cct(cct),
+ spgid(spgid),
+ dpp(dpp),
+ pl(pl),
+ orig_ctx(0),
+ osdmap_ref(curmap),
+ pool(_pool),
+ pg_whoami(pg_whoami),
+ info(spgid),
+ pg_log(cct),
+ missing_loc(spgid, this, dpp, cct),
+ machine(this, cct, spgid, dpp, pl, &state_history)
+{
+ machine.initiate();
+}
+
+void PeeringState::start_handle(PeeringCtx *new_ctx) {
+ ceph_assert(!rctx);
+ ceph_assert(!orig_ctx);
+ orig_ctx = new_ctx;
+ if (new_ctx) {
+ if (messages_pending_flush) {
+ rctx.emplace(*messages_pending_flush, *new_ctx);
+ } else {
+ rctx.emplace(*new_ctx);
+ }
+ rctx->start_time = ceph_clock_now();
+ }
+}
+
+void PeeringState::begin_block_outgoing() {
+ ceph_assert(!messages_pending_flush);
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+ messages_pending_flush = BufferedRecoveryMessages(
+ orig_ctx->require_osd_release);
+ rctx.emplace(*messages_pending_flush, *orig_ctx);
+}
+
+void PeeringState::clear_blocked_outgoing() {
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+ messages_pending_flush = std::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_block_outgoing() {
+ ceph_assert(messages_pending_flush);
+ ceph_assert(orig_ctx);
+ ceph_assert(rctx);
+
+ orig_ctx->accept_buffered_messages(*messages_pending_flush);
+ rctx.emplace(*orig_ctx);
+ messages_pending_flush = std::optional<BufferedRecoveryMessages>();
+}
+
+void PeeringState::end_handle() {
+ if (rctx) {
+ utime_t dur = ceph_clock_now() - rctx->start_time;
+ machine.event_time += dur;
+ }
+
+ machine.event_count++;
+ rctx = std::nullopt;
+ orig_ctx = NULL;
+}
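+
+// Typical lifecycle (sketch, for orientation): start_handle() wraps each event
+// delivery; set_last_peering_reset() may call begin_block_outgoing() so that
+// outgoing messages accumulate in messages_pending_flush until
+// end_block_outgoing() hands them to the caller's PeeringCtx (or
+// clear_blocked_outgoing() drops them); end_handle() closes out the event.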
+
+void PeeringState::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ /*
+   * check that any peers we are planning to pull (or are currently
+   * pulling) objects from are dealt with.
+ */
+ missing_loc.check_recovery_sources(osdmap);
+ pl->check_recovery_sources(osdmap);
+
+ for (auto i = peer_log_requested.begin(); i != peer_log_requested.end();) {
+ if (!osdmap->is_up(i->osd)) {
+ psdout(10) << "peer_log_requested removing " << *i << dendl;
+ peer_log_requested.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+
+ for (auto i = peer_missing_requested.begin();
+ i != peer_missing_requested.end();) {
+ if (!osdmap->is_up(i->osd)) {
+ psdout(10) << "peer_missing_requested removing " << *i << dendl;
+ peer_missing_requested.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+void PeeringState::update_history(const pg_history_t& new_history)
+{
+ auto mnow = pl->get_mnow();
+ info.history.refresh_prior_readable_until_ub(mnow, prior_readable_until_ub);
+ if (info.history.merge(new_history)) {
+ psdout(20) << __func__ << " advanced history from " << new_history << dendl;
+ dirty_info = true;
+ if (info.history.last_epoch_clean >= info.history.same_interval_since) {
+ psdout(20) << __func__ << " clearing past_intervals" << dendl;
+ past_intervals.clear();
+ dirty_big_info = true;
+ }
+ prior_readable_until_ub = info.history.get_prior_readable_until_ub(mnow);
+ if (prior_readable_until_ub != ceph::signedspan::zero()) {
+ dout(20) << __func__
+ << " prior_readable_until_ub " << prior_readable_until_ub
+ << " (mnow " << mnow << " + "
+ << info.history.prior_readable_until_ub << ")" << dendl;
+ }
+ }
+ pl->on_info_history_change();
+}
+
+hobject_t PeeringState::earliest_backfill() const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& bt : get_backfill_targets()) {
+ const pg_info_t &pi = get_peer_info(bt);
+ e = std::min(pi.last_backfill, e);
+ }
+ return e;
+}
+
+void PeeringState::purge_strays()
+{
+ if (is_premerge()) {
+ psdout(10) << "purge_strays " << stray_set << " but premerge, doing nothing"
+ << dendl;
+ return;
+ }
+ if (cct->_conf.get_val<bool>("osd_debug_no_purge_strays")) {
+ return;
+ }
+ psdout(10) << "purge_strays " << stray_set << dendl;
+
+ bool removed = false;
+ for (auto p = stray_set.begin(); p != stray_set.end(); ++p) {
+ ceph_assert(!is_acting_recovery_backfill(*p));
+ if (get_osdmap()->is_up(p->osd)) {
+ psdout(10) << "sending PGRemove to osd." << *p << dendl;
+ vector<spg_t> to_remove;
+ to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
+ auto m = make_message<MOSDPGRemove>(
+ get_osdmap_epoch(),
+ to_remove);
+ pl->send_cluster_message(p->osd, m, get_osdmap_epoch());
+ } else {
+ psdout(10) << "not sending PGRemove to down osd." << *p << dendl;
+ }
+ peer_missing.erase(*p);
+ peer_info.erase(*p);
+ missing_loc.remove_stray_recovery_sources(*p);
+ peer_purged.insert(*p);
+ removed = true;
+ }
+
+ // if we removed anyone, update peers (which include peer_info)
+ if (removed)
+ update_heartbeat_peers();
+
+ stray_set.clear();
+
+ // clear _requested maps; we may have to peer() again if we discover
+ // (more) stray content
+ peer_log_requested.clear();
+ peer_missing_requested.clear();
+}
+
+void PeeringState::query_unfound(Formatter *f, string state)
+{
+ psdout(20) << "Enter PeeringState common QueryUnfound" << dendl;
+ {
+ f->dump_string("state", state);
+ f->dump_bool("available_might_have_unfound", true);
+ f->open_array_section("might_have_unfound");
+ for (auto p = might_have_unfound.begin();
+ p != might_have_unfound.end();
+ ++p) {
+ if (peer_missing.count(*p)) {
+ ; // Ignore already probed OSDs
+ } else {
+ f->open_object_section("osd");
+ f->dump_stream("osd") << *p;
+ if (peer_missing_requested.count(*p)) {
+ f->dump_string("status", "querying");
+ } else if (!get_osdmap()->is_up(p->osd)) {
+ f->dump_string("status", "osd is down");
+ } else {
+ f->dump_string("status", "not queried");
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+ }
+ psdout(20) << "Exit PeeringState common QueryUnfound" << dendl;
+ return;
+}
+
+bool PeeringState::proc_replica_info(
+ pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
+{
+ auto p = peer_info.find(from);
+ if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
+ psdout(10) << " got dup osd." << from << " info "
+ << oinfo << ", identical to ours" << dendl;
+ return false;
+ }
+
+ if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
+ psdout(10) << " got info " << oinfo << " from down osd." << from
+ << " discarding" << dendl;
+ return false;
+ }
+
+ psdout(10) << " got osd." << from << " " << oinfo << dendl;
+ ceph_assert(is_primary());
+ peer_info[from] = oinfo;
+ might_have_unfound.insert(from);
+
+ update_history(oinfo.history);
+
+ // stray?
+ if (!is_up(from) && !is_acting(from)) {
+ psdout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
+ stray_set.insert(from);
+ if (is_clean()) {
+ purge_strays();
+ }
+ }
+
+ // was this a new info? if so, update peers!
+ if (p == peer_info.end())
+ update_heartbeat_peers();
+
+ return true;
+}
+
+
+void PeeringState::remove_down_peer_info(const OSDMapRef &osdmap)
+{
+ // Remove any downed osds from peer_info
+ bool removed = false;
+ auto p = peer_info.begin();
+ while (p != peer_info.end()) {
+ if (!osdmap->is_up(p->first.osd)) {
+ psdout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
+ peer_missing.erase(p->first);
+ peer_log_requested.erase(p->first);
+ peer_missing_requested.erase(p->first);
+ peer_info.erase(p++);
+ removed = true;
+ } else
+ ++p;
+ }
+
+ // Remove any downed osds from peer_purged so we can re-purge if necessary
+ auto it = peer_purged.begin();
+ while (it != peer_purged.end()) {
+ if (!osdmap->is_up(it->osd)) {
+ psdout(10) << " dropping down osd." << *it << " from peer_purged" << dendl;
+ peer_purged.erase(it++);
+ } else {
+ ++it;
+ }
+ }
+
+ // if we removed anyone, update peers (which include peer_info)
+ if (removed)
+ update_heartbeat_peers();
+
+ check_recovery_sources(osdmap);
+}
+
+void PeeringState::update_heartbeat_peers()
+{
+ if (!is_primary())
+ return;
+
+ set<int> new_peers;
+ for (unsigned i=0; i<acting.size(); i++) {
+ if (acting[i] != CRUSH_ITEM_NONE)
+ new_peers.insert(acting[i]);
+ }
+ for (unsigned i=0; i<up.size(); i++) {
+ if (up[i] != CRUSH_ITEM_NONE)
+ new_peers.insert(up[i]);
+ }
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
+ new_peers.insert(p->first.osd);
+ }
+ pl->update_heartbeat_peers(std::move(new_peers));
+}
+
+void PeeringState::write_if_dirty(ObjectStore::Transaction& t)
+{
+ pl->prepare_write(
+ info,
+ last_written_info,
+ past_intervals,
+ pg_log,
+ dirty_info,
+ dirty_big_info,
+ last_persisted_osdmap < get_osdmap_epoch(),
+ t);
+ if (dirty_info || dirty_big_info) {
+ last_persisted_osdmap = get_osdmap_epoch();
+ last_written_info = info;
+ dirty_info = false;
+ dirty_big_info = false;
+ }
+}
+
+void PeeringState::advance_map(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ vector<int>& newup, int up_primary,
+ vector<int>& newacting, int acting_primary,
+ PeeringCtx &rctx)
+{
+ ceph_assert(lastmap == osdmap_ref);
+ psdout(10) << "handle_advance_map "
+ << newup << "/" << newacting
+ << " -- " << up_primary << "/" << acting_primary
+ << dendl;
+
+ update_osdmap_ref(osdmap);
+ pool.update(osdmap);
+
+ AdvMap evt(
+ osdmap, lastmap, newup, up_primary,
+ newacting, acting_primary);
+ handle_event(evt, &rctx);
+ if (pool.info.last_change == osdmap_ref->get_epoch()) {
+ pl->on_pool_change();
+ }
+ readable_interval = pool.get_readable_interval(cct->_conf);
+ last_require_osd_release = osdmap->require_osd_release;
+}
+
+void PeeringState::activate_map(PeeringCtx &rctx)
+{
+ psdout(10) << __func__ << dendl;
+ ActMap evt;
+ handle_event(evt, &rctx);
+ if (osdmap_ref->get_epoch() - last_persisted_osdmap >
+ cct->_conf->osd_pg_epoch_persisted_max_stale) {
+ psdout(20) << __func__ << ": Dirtying info: last_persisted is "
+ << last_persisted_osdmap
+ << " while current is " << osdmap_ref->get_epoch() << dendl;
+ dirty_info = true;
+ } else {
+ psdout(20) << __func__ << ": Not dirtying info: last_persisted is "
+ << last_persisted_osdmap
+ << " while current is " << osdmap_ref->get_epoch() << dendl;
+ }
+ write_if_dirty(rctx.transaction);
+
+ if (get_osdmap()->check_new_blocklist_entries()) {
+ pl->check_blocklisted_watchers();
+ }
+}
+
+void PeeringState::set_last_peering_reset()
+{
+ psdout(20) << "set_last_peering_reset " << get_osdmap_epoch() << dendl;
+ if (last_peering_reset != get_osdmap_epoch()) {
+ last_peering_reset = get_osdmap_epoch();
+ psdout(10) << "Clearing blocked outgoing recovery messages" << dendl;
+ clear_blocked_outgoing();
+ if (!pl->try_flush_or_schedule_async()) {
+ psdout(10) << "Beginning to block outgoing recovery messages" << dendl;
+ begin_block_outgoing();
+ } else {
+ psdout(10) << "Not blocking outgoing recovery messages" << dendl;
+ }
+ }
+}
+
+void PeeringState::complete_flush()
+{
+ flushes_in_progress--;
+ if (flushes_in_progress == 0) {
+ pl->on_flushed();
+ }
+}
+
+void PeeringState::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
+{
+ const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
+ if (!pi) {
+ return; // pool deleted
+ }
+ bool changed = false;
+ if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
+ const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
+ if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
+ psdout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
+ changed = true;
+ }
+ }
+ if (changed) {
+ info.history.last_epoch_marked_full = osdmap->get_epoch();
+ dirty_info = true;
+ }
+}
+
+bool PeeringState::should_restart_peering(
+ int newupprimary,
+ int newactingprimary,
+ const vector<int>& newup,
+ const vector<int>& newacting,
+ OSDMapRef lastmap,
+ OSDMapRef osdmap)
+{
+ if (PastIntervals::is_new_interval(
+ primary.osd,
+ newactingprimary,
+ acting,
+ newacting,
+ up_primary.osd,
+ newupprimary,
+ up,
+ newup,
+ osdmap.get(),
+ lastmap.get(),
+ info.pgid.pgid)) {
+ psdout(20) << "new interval newup " << newup
+ << " newacting " << newacting << dendl;
+ return true;
+ }
+ if (!lastmap->is_up(pg_whoami.osd) && osdmap->is_up(pg_whoami.osd)) {
+ psdout(10) << __func__ << " osd transitioned from down -> up"
+ << dendl;
+ return true;
+ }
+ return false;
+}
+
+/* Called before initializing peering during advance_map */
+void PeeringState::start_peering_interval(
+ const OSDMapRef lastmap,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ ObjectStore::Transaction &t)
+{
+ const OSDMapRef osdmap = get_osdmap();
+
+ set_last_peering_reset();
+
+ vector<int> oldacting, oldup;
+ int oldrole = get_role();
+
+ if (is_primary()) {
+ pl->clear_ready_to_merge();
+ }
+
+ pg_shard_t old_acting_primary = get_primary();
+ pg_shard_t old_up_primary = up_primary;
+ bool was_old_primary = is_primary();
+ bool was_old_nonprimary = is_nonprimary();
+
+ acting.swap(oldacting);
+ up.swap(oldup);
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
+
+ if (info.stats.up != up ||
+ info.stats.acting != acting ||
+ info.stats.up_primary != new_up_primary ||
+ info.stats.acting_primary != new_acting_primary) {
+ info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
+ info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
+ info.stats.mapping_epoch = osdmap->get_epoch();
+ }
+
+ pl->clear_publish_stats();
+
+ // The PG will now be flagged remapped during a backfill in cases
+ // where it would not have been before.
+ if (up != acting)
+ state_set(PG_STATE_REMAPPED);
+ else
+ state_clear(PG_STATE_REMAPPED);
+
+ int role = osdmap->calc_pg_role(pg_whoami, acting);
+ set_role(role);
+
+ // did acting, up, primary|acker change?
+ if (!lastmap) {
+ psdout(10) << " no lastmap" << dendl;
+ dirty_info = true;
+ dirty_big_info = true;
+ info.history.same_interval_since = osdmap->get_epoch();
+ } else {
+ std::stringstream debug;
+ ceph_assert(info.history.same_interval_since != 0);
+ bool new_interval = PastIntervals::check_new_interval(
+ old_acting_primary.osd,
+ new_acting_primary,
+ oldacting, newacting,
+ old_up_primary.osd,
+ new_up_primary,
+ oldup, newup,
+ info.history.same_interval_since,
+ info.history.last_epoch_clean,
+ osdmap.get(),
+ lastmap.get(),
+ info.pgid.pgid,
+ missing_loc.get_recoverable_predicate(),
+ &past_intervals,
+ &debug);
+ psdout(10) << __func__ << ": check_new_interval output: "
+ << debug.str() << dendl;
+ if (new_interval) {
+ if (osdmap->get_epoch() == pl->oldest_stored_osdmap() &&
+ info.history.last_epoch_clean < osdmap->get_epoch()) {
+ psdout(10) << " map gap, clearing past_intervals and faking" << dendl;
+ // our information is incomplete and useless; someone else may have
+ // been clean after the newest epoch we know about, and the osdmaps
+ // covering that gap have been trimmed.
+ past_intervals.clear();
+ } else {
+ psdout(10) << " noting past " << past_intervals << dendl;
+ }
+ dirty_info = true;
+ dirty_big_info = true;
+ info.history.same_interval_since = osdmap->get_epoch();
+ if (osdmap->have_pg_pool(info.pgid.pgid.pool()) &&
+ info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
+ osdmap->get_pg_num(info.pgid.pgid.pool()),
+ nullptr)) {
+ info.history.last_epoch_split = osdmap->get_epoch();
+ }
+ }
+ }
+
+ if (old_up_primary != up_primary ||
+ oldup != up) {
+ info.history.same_up_since = osdmap->get_epoch();
+ }
+ // this comparison includes primary rank via pg_shard_t
+ if (old_acting_primary != get_primary()) {
+ info.history.same_primary_since = osdmap->get_epoch();
+ }
+
+ on_new_interval();
+ pl->on_info_history_change();
+
+ psdout(1) << __func__ << " up " << oldup << " -> " << up
+ << ", acting " << oldacting << " -> " << acting
+ << ", acting_primary " << old_acting_primary << " -> "
+ << new_acting_primary
+ << ", up_primary " << old_up_primary << " -> " << new_up_primary
+ << ", role " << oldrole << " -> " << role
+ << ", features acting " << acting_features
+ << " upacting " << upacting_features
+ << dendl;
+
+ // deactivate.
+ state_clear(PG_STATE_ACTIVE);
+ state_clear(PG_STATE_PEERED);
+ state_clear(PG_STATE_PREMERGE);
+ state_clear(PG_STATE_DOWN);
+ state_clear(PG_STATE_RECOVERY_WAIT);
+ state_clear(PG_STATE_RECOVERY_TOOFULL);
+ state_clear(PG_STATE_RECOVERING);
+
+ peer_purged.clear();
+ acting_recovery_backfill.clear();
+
+ // reset primary/replica state?
+ if (was_old_primary || is_primary()) {
+ pl->clear_want_pg_temp();
+ } else if (was_old_nonprimary || is_nonprimary()) {
+ pl->clear_want_pg_temp();
+ }
+ clear_primary_state();
+
+ pl->on_change(t);
+
+ ceph_assert(!deleting);
+
+ // should we tell the primary we are here?
+ send_notify = !is_primary();
+
+ if (role != oldrole ||
+ was_old_primary != is_primary()) {
+ // did primary change?
+ if (was_old_primary != is_primary()) {
+ state_clear(PG_STATE_CLEAN);
+ }
+
+ pl->on_role_change();
+ } else {
+ // no role change.
+ // did primary change?
+ if (get_primary() != old_acting_primary) {
+ psdout(10) << oldacting << " -> " << acting
+ << ", acting primary "
+ << old_acting_primary << " -> " << get_primary()
+ << dendl;
+ } else {
+ // primary is the same.
+ if (is_primary()) {
+ // i am (still) primary. but my replica set changed.
+ state_clear(PG_STATE_CLEAN);
+
+ psdout(10) << oldacting << " -> " << acting
+ << ", replicas changed" << dendl;
+ }
+ }
+ }
+
+ if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
+ psdout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
+ pl->queue_want_pg_temp(acting);
+ }
+}
+
+void PeeringState::on_new_interval()
+{
+ dout(20) << __func__ << dendl;
+ const OSDMapRef osdmap = get_osdmap();
+
+ // initialize features
+ acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ for (auto p = acting.begin(); p != acting.end(); ++p) {
+ if (*p == CRUSH_ITEM_NONE)
+ continue;
+ uint64_t f = osdmap->get_xinfo(*p).features;
+ acting_features &= f;
+ upacting_features &= f;
+ }
+ for (auto p = up.begin(); p != up.end(); ++p) {
+ if (*p == CRUSH_ITEM_NONE)
+ continue;
+ upacting_features &= osdmap->get_xinfo(*p).features;
+ }
+ psdout(20) << __func__ << " upacting_features 0x" << std::hex
+ << upacting_features << std::dec
+ << " from " << acting << "+" << up << dendl;
+
+ psdout(20) << __func__ << " checking missing set deletes flag. missing = "
+ << get_pg_log().get_missing() << dendl;
+
+ if (!pg_log.get_missing().may_include_deletes &&
+ !perform_deletes_during_peering()) {
+ pl->rebuild_missing_set_with_deletes(pg_log);
+ }
+ ceph_assert(
+ pg_log.get_missing().may_include_deletes ==
+ !perform_deletes_during_peering());
+
+ init_hb_stamps();
+
+ // update lease bounds for a new interval
+ auto mnow = pl->get_mnow();
+ prior_readable_until_ub = std::max(prior_readable_until_ub,
+ readable_until_ub);
+ prior_readable_until_ub = info.history.refresh_prior_readable_until_ub(
+ mnow, prior_readable_until_ub);
+ psdout(10) << __func__ << " prior_readable_until_ub "
+ << prior_readable_until_ub << " (mnow " << mnow << " + "
+ << info.history.prior_readable_until_ub << ")" << dendl;
+ prior_readable_down_osds.clear(); // we populate this when we build the priorset
+
+ readable_until =
+ readable_until_ub =
+ readable_until_ub_sent =
+ readable_until_ub_from_primary = ceph::signedspan::zero();
+
+ acting_readable_until_ub.clear();
+ if (is_primary()) {
+ acting_readable_until_ub.resize(acting.size(), ceph::signedspan::zero());
+ }
+
+ pl->on_new_interval();
+}
+
+void PeeringState::init_primary_up_acting(
+ const vector<int> &newup,
+ const vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary)
+{
+ actingset.clear();
+ acting = newacting;
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] != CRUSH_ITEM_NONE)
+ actingset.insert(
+ pg_shard_t(
+ acting[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ upset.clear();
+ up = newup;
+ for (uint8_t i = 0; i < up.size(); ++i) {
+ if (up[i] != CRUSH_ITEM_NONE)
+ upset.insert(
+ pg_shard_t(
+ up[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ if (!pool.info.is_erasure()) {
+ // replicated
+ up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
+ primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
+ } else {
+ // erasure
+ up_primary = pg_shard_t();
+ primary = pg_shard_t();
+ for (uint8_t i = 0; i < up.size(); ++i) {
+ if (up[i] == new_up_primary) {
+ up_primary = pg_shard_t(up[i], shard_id_t(i));
+ break;
+ }
+ }
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == new_acting_primary) {
+ primary = pg_shard_t(acting[i], shard_id_t(i));
+ break;
+ }
+ }
+ ceph_assert(up_primary.osd == new_up_primary);
+ ceph_assert(primary.osd == new_acting_primary);
+ }
+}
+
+void PeeringState::init_hb_stamps()
+{
+ if (is_primary()) {
+ // we care about all other osds in the acting set
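+ // acting.size() - 1 is only an upper bound (holes and ourselves are
+ // skipped below); the vector is trimmed to the real count afterwards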
+ hb_stamps.resize(acting.size() - 1);
+ unsigned i = 0;
+ for (auto p : acting) {
+ if (p == CRUSH_ITEM_NONE || p == get_primary().osd) {
+ continue;
+ }
+ hb_stamps[i++] = pl->get_hb_stamps(p);
+ }
+ hb_stamps.resize(i);
+ } else if (is_nonprimary()) {
+ // we care about just the primary
+ hb_stamps.resize(1);
+ hb_stamps[0] = pl->get_hb_stamps(get_primary().osd);
+ } else {
+ hb_stamps.clear();
+ }
+ dout(10) << __func__ << " now " << hb_stamps << dendl;
+}
+
+
+void PeeringState::clear_recovery_state()
+{
+ async_recovery_targets.clear();
+ backfill_targets.clear();
+}
+
+void PeeringState::clear_primary_state()
+{
+ psdout(10) << "clear_primary_state" << dendl;
+
+ // clear peering state
+ stray_set.clear();
+ peer_log_requested.clear();
+ peer_missing_requested.clear();
+ peer_info.clear();
+ peer_bytes.clear();
+ peer_missing.clear();
+ peer_last_complete_ondisk.clear();
+ peer_activated.clear();
+ min_last_complete_ondisk = eversion_t();
+ pg_trim_to = eversion_t();
+ might_have_unfound.clear();
+ need_up_thru = false;
+ missing_loc.clear();
+ pg_log.reset_recovery_pointers();
+
+ clear_recovery_state();
+
+ last_update_ondisk = eversion_t();
+ missing_loc.clear();
+ pl->clear_primary_state();
+}
+
+/// return [start,end) bounds for required past_intervals
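+/// start is the later of last_epoch_clean (falling back to the pool
+/// creation epoch if the pg has never been clean) and the oldest stored
+/// osdmap; end is same_interval_since (at least the pool creation epoch)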
+static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
+ const pg_info_t &info,
+ epoch_t oldest_map) {
+ epoch_t start = std::max(
+ info.history.last_epoch_clean ? info.history.last_epoch_clean :
+ info.history.epoch_pool_created,
+ oldest_map);
+ epoch_t end = std::max(
+ info.history.same_interval_since,
+ info.history.epoch_pool_created);
+ return make_pair(start, end);
+}
+
+
+void PeeringState::check_past_interval_bounds() const
+{
+ auto oldest_epoch = pl->oldest_stored_osdmap();
+ auto rpib = get_required_past_interval_bounds(
+ info,
+ oldest_epoch);
+ if (rpib.first >= rpib.second) {
+ // do not warn if the start bound is dictated by oldest_map; the
+ // past intervals are presumably appropriate given the pg info.
+ if (!past_intervals.empty() &&
+ rpib.first > oldest_epoch) {
+ pl->get_clog_error() << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals;
+ derr << info.pgid << " required past_interval bounds are"
+ << " empty [" << rpib << ") but past_intervals is not: "
+ << past_intervals << dendl;
+ }
+ } else {
+ if (past_intervals.empty()) {
+ pl->get_clog_error() << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty";
+ derr << info.pgid << " required past_interval bounds are"
+ << " not empty [" << rpib << ") but past_intervals "
+ << past_intervals << " is empty" << dendl;
+ ceph_assert(!past_intervals.empty());
+ }
+
+ auto apib = past_intervals.get_bounds();
+ if (apib.first > rpib.first) {
+ pl->get_clog_error() << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start";
+ derr << info.pgid << " past_intervals [" << apib
+ << ") start interval does not contain the required"
+ << " bound [" << rpib << ") start" << dendl;
+ ceph_abort_msg("past_interval start interval mismatch");
+ }
+ if (apib.second != rpib.second) {
+ pl->get_clog_error() << info.pgid << " past_interal bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end";
+ derr << info.pgid << " past_interal bound [" << apib
+ << ") end does not match required [" << rpib
+ << ") end" << dendl;
+ ceph_abort_msg("past_interval end mismatch");
+ }
+ }
+}
+
+int PeeringState::clamp_recovery_priority(int priority, int pool_recovery_priority, int max)
+{
+ static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+ static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+
+ ceph_assert(max <= OSD_RECOVERY_PRIORITY_MAX);
+
+ // User can't set this too high anymore, but might be a legacy value
+ if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX)
+ pool_recovery_priority = OSD_POOL_PRIORITY_MAX;
+ if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN)
+ pool_recovery_priority = OSD_POOL_PRIORITY_MIN;
+ // Shift the range [min, max] to [0, max - min]
+ pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN);
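+ // e.g. if OSD_POOL_PRIORITY_MIN were -10 and OSD_POOL_PRIORITY_MAX 10,
+ // a pool priority of -10 would shift to 0 and +10 to 20; the adjustment
+ // added below is therefore always non-negative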
+ ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN));
+
+ priority += pool_recovery_priority;
+
+ // Clamp to valid range
+ if (priority > max) {
+ return max;
+ } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
+ return OSD_RECOVERY_PRIORITY_MIN;
+ } else {
+ return priority;
+ }
+}
+
+unsigned PeeringState::get_recovery_priority()
+{
+ // a higher value -> a higher priority
+ int ret = OSD_RECOVERY_PRIORITY_BASE;
+ int base = ret;
+
+ if (state & PG_STATE_FORCED_RECOVERY) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ // XXX: This priority boost isn't so much about inactive, but about data-at-risk
+ if (is_degraded() && info.stats.avail_no_missing.size() < pool.info.min_size) {
+ base = OSD_RECOVERY_INACTIVE_PRIORITY_BASE;
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = base + (pool.info.min_size - info.stats.avail_no_missing.size());
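+ // e.g. min_size 2 with only one replica having no missing objects
+ // gives base + 1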
+ }
+
+ int64_t pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+
+ ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
+ }
+ psdout(20) << __func__ << " recovery priority is " << ret << dendl;
+ return static_cast<unsigned>(ret);
+}
+
+unsigned PeeringState::get_backfill_priority()
+{
+ // a higher value -> a higher priority
+ int ret = OSD_BACKFILL_PRIORITY_BASE;
+ int base = ret;
+
+ if (state & PG_STATE_FORCED_BACKFILL) {
+ ret = OSD_BACKFILL_PRIORITY_FORCED;
+ } else {
+ if (actingset.size() < pool.info.min_size) {
+ base = OSD_BACKFILL_INACTIVE_PRIORITY_BASE;
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = base + (pool.info.min_size - actingset.size());
+
+ } else if (is_undersized()) {
+ // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
+ ceph_assert(pool.info.size > actingset.size());
+ base = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ ret = base + (pool.info.size - actingset.size());
+
+ } else if (is_degraded()) {
+ // degraded: baseline degraded
+ base = ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ }
+
+ // Adjust with pool's recovery priority
+ int64_t pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+
+ ret = clamp_recovery_priority(ret, pool_recovery_priority, max_prio_map[base]);
+ }
+
+ psdout(20) << __func__ << " backfill priority is " << ret << dendl;
+ return static_cast<unsigned>(ret);
+}
+
+unsigned PeeringState::get_delete_priority()
+{
+ auto state = get_osdmap()->get_state(pg_whoami.osd);
+ if (state & (CEPH_OSD_BACKFILLFULL |
+ CEPH_OSD_FULL)) {
+ return OSD_DELETE_PRIORITY_FULL;
+ } else if (state & CEPH_OSD_NEARFULL) {
+ return OSD_DELETE_PRIORITY_FULLISH;
+ } else {
+ return OSD_DELETE_PRIORITY_NORMAL;
+ }
+}
+
+bool PeeringState::set_force_recovery(bool b)
+{
+ bool did = false;
+ if (b) {
+ if (!(state & PG_STATE_FORCED_RECOVERY) &&
+ (state & (PG_STATE_DEGRADED |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_RECOVERING))) {
+ psdout(20) << __func__ << " set" << dendl;
+ state_set(PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ } else if (state & PG_STATE_FORCED_RECOVERY) {
+ psdout(20) << __func__ << " clear" << dendl;
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ if (did) {
+ psdout(20) << __func__ << " state " << get_current_state()
+ << dendl;
+ pl->update_local_background_io_priority(get_recovery_priority());
+ }
+ return did;
+}
+
+bool PeeringState::set_force_backfill(bool b)
+{
+ bool did = false;
+ if (b) {
+ if (!(state & PG_STATE_FORCED_BACKFILL) &&
+ (state & (PG_STATE_DEGRADED |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILLING))) {
+ psdout(10) << __func__ << " set" << dendl;
+ state_set(PG_STATE_FORCED_BACKFILL);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ } else if (state & PG_STATE_FORCED_BACKFILL) {
+ psdout(10) << __func__ << " clear" << dendl;
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ pl->publish_stats_to_osd();
+ did = true;
+ }
+ if (did) {
+ psdout(20) << __func__ << " state " << get_current_state()
+ << dendl;
+ pl->update_local_background_io_priority(get_backfill_priority());
+ }
+ return did;
+}
+
+void PeeringState::schedule_renew_lease()
+{
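+ // ask the listener to renew halfway through the readable interval so
+ // the lease is refreshed before it can expire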
+ pl->schedule_renew_lease(
+ last_peering_reset,
+ readable_interval / 2);
+}
+
+void PeeringState::send_lease()
+{
+ epoch_t epoch = pl->get_osdmap_epoch();
+ for (auto peer : actingset) {
+ if (peer == pg_whoami) {
+ continue;
+ }
+ pl->send_cluster_message(
+ peer.osd,
+ make_message<MOSDPGLease>(epoch,
+ spg_t(spgid.pgid, peer.shard),
+ get_lease()),
+ epoch);
+ }
+}
+
+void PeeringState::proc_lease(const pg_lease_t& l)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ psdout(20) << __func__ << " no-op, upacting_features 0x" << std::hex
+ << upacting_features << std::dec
+ << " does not include SERVER_OCTOPUS" << dendl;
+ return;
+ }
+ if (!is_nonprimary()) {
+ psdout(20) << __func__ << " no-op, !nonprimary" << dendl;
+ return;
+ }
+ psdout(10) << __func__ << " " << l << dendl;
+ if (l.readable_until_ub > readable_until_ub_from_primary) {
+ readable_until_ub_from_primary = l.readable_until_ub;
+ }
+
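+ // the lease values are in the primary's monotonic clock; translating
+ // readable_until with the upper bound on the clock delta keeps the
+ // local estimate conservative, while readable_until_ub below uses the
+ // lower bound so the upper bound is never underestimated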
+ ceph::signedspan ru = ceph::signedspan::zero();
+ if (l.readable_until != ceph::signedspan::zero() &&
+ hb_stamps[0]->peer_clock_delta_ub) {
+ ru = l.readable_until - *hb_stamps[0]->peer_clock_delta_ub;
+ psdout(20) << " peer_clock_delta_ub " << *hb_stamps[0]->peer_clock_delta_ub
+ << " -> ru " << ru << dendl;
+ }
+ if (ru > readable_until) {
+ readable_until = ru;
+ psdout(20) << __func__ << " readable_until now " << readable_until << dendl;
+ // NOTE: if we ever decide to block/queue ops on the replica,
+ // we'll need to wake them up here.
+ }
+
+ ceph::signedspan ruub;
+ if (hb_stamps[0]->peer_clock_delta_lb) {
+ ruub = l.readable_until_ub - *hb_stamps[0]->peer_clock_delta_lb;
+ psdout(20) << " peer_clock_delta_lb " << *hb_stamps[0]->peer_clock_delta_lb
+ << " -> ruub " << ruub << dendl;
+ } else {
+ ruub = pl->get_mnow() + l.interval;
+ psdout(20) << " no peer_clock_delta_lb -> ruub " << ruub << dendl;
+ }
+ if (ruub > readable_until_ub) {
+ readable_until_ub = ruub;
+ psdout(20) << __func__ << " readable_until_ub now " << readable_until_ub
+ << dendl;
+ }
+}
+
+void PeeringState::proc_lease_ack(int from, const pg_lease_ack_t& a)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return;
+ }
+ auto now = pl->get_mnow();
+ bool was_min = false;
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ if (from == acting[i]) {
+ // the lease_ack value is based on the primary's clock
+ if (a.readable_until_ub > acting_readable_until_ub[i]) {
+ if (acting_readable_until_ub[i] == readable_until) {
+ was_min = true;
+ }
+ acting_readable_until_ub[i] = a.readable_until_ub;
+ break;
+ }
+ }
+ }
+ if (was_min) {
+ auto old_ru = readable_until;
+ recalc_readable_until();
+ if (now < old_ru) {
+ pl->recheck_readable();
+ }
+ }
+}
+
+void PeeringState::proc_renew_lease()
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return;
+ }
+ renew_lease(pl->get_mnow());
+ send_lease();
+ schedule_renew_lease();
+}
+
+void PeeringState::recalc_readable_until()
+{
+ assert(is_primary());
+ ceph::signedspan min = readable_until_ub_sent;
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ if (acting[i] == pg_whoami.osd || acting[i] == CRUSH_ITEM_NONE) {
+ continue;
+ }
+ dout(20) << __func__ << " peer osd." << acting[i]
+ << " ruub " << acting_readable_until_ub[i] << dendl;
+ if (acting_readable_until_ub[i] < min) {
+ min = acting_readable_until_ub[i];
+ }
+ }
+ readable_until = min;
+ readable_until_ub = min;
+ dout(20) << __func__ << " readable_until[_ub] " << readable_until
+ << " (sent " << readable_until_ub_sent << ")" << dendl;
+}
+
+bool PeeringState::check_prior_readable_down_osds(const OSDMapRef& map)
+{
+ if (!HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ return false;
+ }
+ bool changed = false;
+ auto p = prior_readable_down_osds.begin();
+ while (p != prior_readable_down_osds.end()) {
+ if (map->is_dead(*p)) {
+ dout(10) << __func__ << " prior_readable_down_osds osd." << *p
+ << " is dead as of epoch " << map->get_epoch()
+ << dendl;
+ p = prior_readable_down_osds.erase(p);
+ changed = true;
+ } else {
+ ++p;
+ }
+ }
+ if (changed && prior_readable_down_osds.empty()) {
+ psdout(10) << " empty prior_readable_down_osds, clearing ub" << dendl;
+ clear_prior_readable_until_ub();
+ return true;
+ }
+ return false;
+}
+
+bool PeeringState::adjust_need_up_thru(const OSDMapRef osdmap)
+{
+ epoch_t up_thru = osdmap->get_up_thru(pg_whoami.osd);
+ if (need_up_thru &&
+ up_thru >= info.history.same_interval_since) {
+ psdout(10) << "adjust_need_up_thru now "
+ << up_thru << ", need_up_thru now false" << dendl;
+ need_up_thru = false;
+ return true;
+ }
+ return false;
+}
+
+PastIntervals::PriorSet PeeringState::build_prior()
+{
+ if (1) {
+ // sanity check
+ for (auto it = peer_info.begin(); it != peer_info.end(); ++it) {
+ ceph_assert(info.history.last_epoch_started >=
+ it->second.history.last_epoch_started);
+ }
+ }
+
+ const OSDMap &osdmap = *get_osdmap();
+ PastIntervals::PriorSet prior = past_intervals.get_prior_set(
+ pool.info.is_erasure(),
+ info.history.last_epoch_started,
+ &missing_loc.get_recoverable_predicate(),
+ [&](epoch_t start, int osd, epoch_t *lost_at) {
+ const osd_info_t *pinfo = 0;
+ if (osdmap.exists(osd)) {
+ pinfo = &osdmap.get_info(osd);
+ if (lost_at)
+ *lost_at = pinfo->lost_at;
+ }
+
+ if (osdmap.is_up(osd)) {
+ return PastIntervals::UP;
+ } else if (!pinfo) {
+ return PastIntervals::DNE;
+ } else if (pinfo->lost_at > start) {
+ return PastIntervals::LOST;
+ } else {
+ return PastIntervals::DOWN;
+ }
+ },
+ up,
+ acting,
+ dpp);
+
+ if (prior.pg_down) {
+ state_set(PG_STATE_DOWN);
+ }
+
+ if (get_osdmap()->get_up_thru(pg_whoami.osd) <
+ info.history.same_interval_since) {
+ psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
+ << " < same_since " << info.history.same_interval_since
+ << ", must notify monitor" << dendl;
+ need_up_thru = true;
+ } else {
+ psdout(10) << "up_thru " << get_osdmap()->get_up_thru(pg_whoami.osd)
+ << " >= same_since " << info.history.same_interval_since
+ << ", all is well" << dendl;
+ need_up_thru = false;
+ }
+ pl->set_probe_targets(prior.probe);
+ return prior;
+}
+
+bool PeeringState::needs_recovery() const
+{
+ ceph_assert(is_primary());
+
+ auto &missing = pg_log.get_missing();
+
+ if (missing.num_missing()) {
+ psdout(10) << __func__ << " primary has " << missing.num_missing()
+ << " missing" << dendl;
+ return true;
+ }
+
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (const pg_shard_t& peer : acting_recovery_backfill) {
+ if (peer == get_primary()) {
+ continue;
+ }
+ auto pm = peer_missing.find(peer);
+ if (pm == peer_missing.end()) {
+ psdout(10) << __func__ << " osd." << peer << " doesn't have missing set"
+ << dendl;
+ continue;
+ }
+ if (pm->second.num_missing()) {
+ psdout(10) << __func__ << " osd." << peer << " has "
+ << pm->second.num_missing() << " missing" << dendl;
+ return true;
+ }
+ }
+
+ psdout(10) << __func__ << " is recovered" << dendl;
+ return false;
+}
+
+bool PeeringState::needs_backfill() const
+{
+ ceph_assert(is_primary());
+
+ // We can assume that only possible osds that need backfill
+ // are on the backfill_targets vector nodes.
+ for (const pg_shard_t& peer : backfill_targets) {
+ auto pi = peer_info.find(peer);
+ ceph_assert(pi != peer_info.end());
+ if (!pi->second.last_backfill.is_max()) {
+ psdout(10) << __func__ << " osd." << peer
+ << " has last_backfill " << pi->second.last_backfill << dendl;
+ return true;
+ }
+ }
+
+ psdout(10) << __func__ << " does not need backfill" << dendl;
+ return false;
+}
+
+/*
+ * Returns true unless some OSD in might_have_unfound has neither been
+ * queried nor marked lost.
+ */
+bool PeeringState::all_unfound_are_queried_or_lost(
+ const OSDMapRef osdmap) const
+{
+ ceph_assert(is_primary());
+
+ auto peer = might_have_unfound.begin();
+ auto mend = might_have_unfound.end();
+ for (; peer != mend; ++peer) {
+ if (peer_missing.count(*peer))
+ continue;
+ auto iter = peer_info.find(*peer);
+ if (iter != peer_info.end() &&
+ (iter->second.is_empty() || iter->second.dne()))
+ continue;
+ if (!osdmap->exists(peer->osd))
+ continue;
+ const osd_info_t &osd_info(osdmap->get_info(peer->osd));
+ if (osd_info.lost_at <= osd_info.up_from) {
+ // If there is even one OSD in might_have_unfound that isn't lost, we
+ // still might retrieve our unfound.
+ return false;
+ }
+ }
+ psdout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound "
+ << might_have_unfound
+ << " have been queried or are marked lost" << dendl;
+ return true;
+}
+
+
+void PeeringState::reject_reservation()
+{
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REJECT_TOOFULL,
+ spg_t(info.pgid.pgid, primary.shard),
+ get_osdmap_epoch()),
+ get_osdmap_epoch());
+}
+
+/**
+ * find_best_info
+ *
+ * Returns an iterator to the best info in infos, preferring in order:
+ *  1) the newer last_update (the older last_update for pools that
+ *     require rollback, i.e. ec pools)
+ *  2) the longer tail, if it brings another info into log contiguity
+ *  3) an info without missing objects over one with missing objects
+ *  4) the current primary
+ */
+map<pg_shard_t, pg_info_t>::const_iterator PeeringState::find_best_info(
+ const map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const
+{
+ ceph_assert(history_les_bound);
+ /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
+ * to make changes to this process. Also, make sure to update it
+ * when you find bugs! */
+ epoch_t max_last_epoch_started_found = 0;
+ for (auto i = infos.begin(); i != infos.end(); ++i) {
+ if (!cct->_conf->osd_find_best_info_ignore_history_les &&
+ max_last_epoch_started_found < i->second.history.last_epoch_started) {
+ *history_les_bound = true;
+ max_last_epoch_started_found = i->second.history.last_epoch_started;
+ }
+ if (!i->second.is_incomplete() &&
+ max_last_epoch_started_found < i->second.last_epoch_started) {
+ *history_les_bound = false;
+ max_last_epoch_started_found = i->second.last_epoch_started;
+ }
+ }
+ eversion_t min_last_update_acceptable = eversion_t::max();
+ for (auto i = infos.begin(); i != infos.end(); ++i) {
+ if (max_last_epoch_started_found <= i->second.last_epoch_started) {
+ if (min_last_update_acceptable > i->second.last_update)
+ min_last_update_acceptable = i->second.last_update;
+ }
+ }
+ if (min_last_update_acceptable == eversion_t::max())
+ return infos.end();
+
+ auto best = infos.end();
+ // find osd with newest last_update (oldest for ec_pool).
+ // if there are multiples, prefer
+ // - a longer tail, if it brings another peer into log contiguity
+ // - the current primary
+ for (auto p = infos.begin(); p != infos.end(); ++p) {
+ if (restrict_to_up_acting && !is_up(p->first) &&
+ !is_acting(p->first))
+ continue;
+ // Only consider peers with last_update >= min_last_update_acceptable
+ if (p->second.last_update < min_last_update_acceptable)
+ continue;
+ // Disqualify anyone with a too old last_epoch_started
+ if (p->second.last_epoch_started < max_last_epoch_started_found)
+ continue;
+ // Disqualify anyone who is incomplete (not fully backfilled)
+ if (p->second.is_incomplete())
+ continue;
+ if (best == infos.end()) {
+ best = p;
+ continue;
+ }
+ // Prefer newer last_update
+ if (pool.info.require_rollback()) {
+ if (p->second.last_update > best->second.last_update)
+ continue;
+ if (p->second.last_update < best->second.last_update) {
+ best = p;
+ continue;
+ }
+ } else {
+ if (p->second.last_update < best->second.last_update)
+ continue;
+ if (p->second.last_update > best->second.last_update) {
+ best = p;
+ continue;
+ }
+ }
+
+ // Prefer longer tail
+ if (p->second.log_tail > best->second.log_tail) {
+ continue;
+ } else if (p->second.log_tail < best->second.log_tail) {
+ best = p;
+ continue;
+ }
+
+ if (!p->second.has_missing() && best->second.has_missing()) {
+ psdout(10) << __func__ << " prefer osd." << p->first
+ << " because it is complete while best has missing"
+ << dendl;
+ best = p;
+ continue;
+ } else if (p->second.has_missing() && !best->second.has_missing()) {
+ psdout(10) << __func__ << " skipping osd." << p->first
+ << " because it has missing while best is complete"
+ << dendl;
+ continue;
+ } else {
+ // both are complete or have missing
+ // fall through
+ }
+
+ // prefer current primary (usually the caller), all things being equal
+ if (p->first == pg_whoami) {
+ psdout(10) << "calc_acting prefer osd." << p->first
+ << " because it is current primary" << dendl;
+ best = p;
+ continue;
+ }
+ }
+ return best;
+}
+
+void PeeringState::calc_ec_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *_want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ ostream &ss)
+{
+ vector<int> want(size, CRUSH_ITEM_NONE);
+ map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
+ for (auto i = all_info.begin();
+ i != all_info.end();
+ ++i) {
+ all_info_by_shard[i->first.shard].insert(i->first);
+ }
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ ss << "For position " << (unsigned)i << ": ";
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
+ want[i] = up[i];
+ continue;
+ }
+ if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
+ ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
+ << " and ";
+ backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
+ }
+
+ if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
+ !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
+ all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
+ want[i] = acting[i];
+ } else if (!restrict_to_up_acting) {
+ for (auto j = all_info_by_shard[shard_id_t(i)].begin();
+ j != all_info_by_shard[shard_id_t(i)].end();
+ ++j) {
+ ceph_assert(j->shard == i);
+ if (!all_info.find(*j)->second.is_incomplete() &&
+ all_info.find(*j)->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ ss << " selecting stray: " << *j << std::endl;
+ want[i] = j->osd;
+ break;
+ }
+ }
+ if (want[i] == CRUSH_ITEM_NONE)
+ ss << " failed to fill position " << (int)i << std::endl;
+ }
+ }
+
+ for (uint8_t i = 0; i < want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
+ }
+ }
+ acting_backfill->insert(backfill->begin(), backfill->end());
+ _want->swap(want);
+}
+
+std::pair<map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
+PeeringState::select_replicated_primary(
+ map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ uint64_t force_auth_primary_missing_objects,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const OSDMapRef osdmap,
+ ostream &ss)
+{
+ pg_shard_t auth_log_shard_id = auth_log_shard->first;
+
+ ss << __func__ << " newest update on osd." << auth_log_shard_id
+ << " with " << auth_log_shard->second << std::endl;
+
+ // select primary
+ auto primary = all_info.find(up_primary);
+ if (up.size() &&
+ !primary->second.is_incomplete() &&
+ primary->second.last_update >=
+ auth_log_shard->second.log_tail) {
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
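+ // estimate the up primary's recovery cost as its reported missing
+ // objects plus the difference in log length from the authoritative
+ // shard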
+ auto approx_missing_objects =
+ primary->second.stats.stats.sum.num_objects_missing;
+ auto auth_version = auth_log_shard->second.last_update.version;
+ auto primary_version = primary->second.last_update.version;
+ if (auth_version > primary_version) {
+ approx_missing_objects += auth_version - primary_version;
+ } else {
+ approx_missing_objects += primary_version - auth_version;
+ }
+ if ((uint64_t)approx_missing_objects >
+ force_auth_primary_missing_objects) {
+ primary = auth_log_shard;
+ ss << "up_primary: " << up_primary << ") has approximate "
+ << approx_missing_objects
+ << "(>" << force_auth_primary_missing_objects <<") "
+ << "missing objects, osd." << auth_log_shard_id
+ << " selected as primary instead"
+ << std::endl;
+ } else {
+ ss << "up_primary: " << up_primary << ") selected as primary"
+ << std::endl;
+ }
+ } else {
+ ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
+ }
+ } else {
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ ss << "up[0] needs backfill, osd." << auth_log_shard_id
+ << " selected as primary instead" << std::endl;
+ primary = auth_log_shard;
+ }
+
+ ss << __func__ << " primary is osd." << primary->first
+ << " with " << primary->second << std::endl;
+
+ /* We include auth_log_shard->second.log_tail because in GetLog,
+ * we will request logs back to the min last_update over our
+ * acting_backfill set, which will result in our log being extended
+ * as far backwards as necessary to pick up any peers which can
+ * be log recovered by auth_log_shard's log */
+ eversion_t oldest_auth_log_entry =
+ std::min(primary->second.log_tail, auth_log_shard->second.log_tail);
+
+ return std::make_pair(primary, oldest_auth_log_entry);
+}
+
+
+/**
+ * calculate the desired acting set.
+ *
+ * Choose an appropriate acting set. Prefer up[0], unless it is
+ * incomplete, or another osd has a longer tail that allows us to
+ * bring other up nodes up to date.
+ */
+void PeeringState::calc_replicated_acting(
+ map<pg_shard_t, pg_info_t>::const_iterator primary,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ ostream &ss)
+{
+ ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
+ << std::endl;
+
+ want->push_back(primary->first.osd);
+ acting_backfill->insert(primary->first);
+
+ // select replicas that have log contiguity with primary.
+ // prefer up, then acting, then any peer_info osds
+ for (auto i : up) {
+ pg_shard_t up_cand = pg_shard_t(i, shard_id_t::NO_SHARD);
+ if (up_cand == primary->first)
+ continue;
+ const pg_info_t &cur_info = all_info.find(up_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
+ backfill->insert(up_cand);
+ acting_backfill->insert(up_cand);
+ } else {
+ want->push_back(i);
+ acting_backfill->insert(up_cand);
+ ss << " osd." << i << " (up) accepted " << cur_info << std::endl;
+ }
+ }
+
+ if (want->size() >= size) {
+ return;
+ }
+
+ std::vector<std::pair<eversion_t, int>> candidate_by_last_update;
+ candidate_by_last_update.reserve(acting.size());
+ // This no longer has backfill OSDs, but they are covered above.
+ for (auto i : acting) {
+ pg_shard_t acting_cand(i, shard_id_t::NO_SHARD);
+ // skip up osds we already considered above
+ if (acting_cand == primary->first)
+ continue;
+ auto up_it = find(up.begin(), up.end(), i);
+ if (up_it != up.end())
+ continue;
+
+ const pg_info_t &cur_info = all_info.find(acting_cand)->second;
+ if (cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry) {
+ ss << " shard " << acting_cand << " (acting) REJECTED "
+ << cur_info << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(cur_info.last_update, i);
+ }
+ }
+
+ auto sort_by_eversion = [](const std::pair<eversion_t, int> &lhs,
+ const std::pair<eversion_t, int> &rhs) {
+ return lhs.first > rhs.first;
+ };
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (acting) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+
+ if (restrict_to_up_acting) {
+ return;
+ }
+ candidate_by_last_update.clear();
+ candidate_by_last_update.reserve(all_info.size()); // overestimate but fine
+ // continue to search stray to find more suitable peers
+ for (auto &i : all_info) {
+ // skip osds we already considered above (primary, up, or acting)
+ if (i.first == primary->first)
+ continue;
+ auto up_it = find(up.begin(), up.end(), i.first.osd);
+ if (up_it != up.end())
+ continue;
+ auto acting_it = find(
+ acting.begin(), acting.end(), i.first.osd);
+ if (acting_it != acting.end())
+ continue;
+
+ if (i.second.is_incomplete() ||
+ i.second.last_update < oldest_auth_log_entry) {
+ ss << " shard " << i.first << " (stray) REJECTED " << i.second
+ << std::endl;
+ } else {
+ candidate_by_last_update.emplace_back(
+ i.second.last_update, i.first.osd);
+ }
+ }
+
+ if (candidate_by_last_update.empty()) {
+ // save us some effort
+ return;
+ }
+
+ // sort by last_update, in descending order.
+ std::sort(candidate_by_last_update.begin(),
+ candidate_by_last_update.end(), sort_by_eversion);
+
+ for (auto &p: candidate_by_last_update) {
+ ceph_assert(want->size() < size);
+ want->push_back(p.second);
+ pg_shard_t s = pg_shard_t(p.second, shard_id_t::NO_SHARD);
+ acting_backfill->insert(s);
+ ss << " shard " << s << " (stray) accepted "
+ << all_info.find(s)->second << std::endl;
+ if (want->size() >= size) {
+ return;
+ }
+ }
+}
+
+// Defines osd preference order: acting set, then larger last_update
+using osd_ord_t = std::tuple<bool, eversion_t>; // <acting, last_update>
+using osd_id_t = int;
+
+class bucket_candidates_t {
+ std::deque<std::pair<osd_ord_t, osd_id_t>> osds;
+ int selected = 0;
+
+public:
+ void add_osd(osd_ord_t ord, osd_id_t osd) {
+ // osds will be added in smallest to largest order
+ assert(osds.empty() || osds.back().first <= ord);
+ osds.push_back(std::make_pair(ord, osd));
+ }
+ osd_id_t pop_osd() {
+ ceph_assert(!is_empty());
+ auto ret = osds.front();
+ osds.pop_front();
+ return ret.second;
+ }
+
+ void inc_selected() { selected++; }
+ unsigned get_num_selected() const { return selected; }
+
+ osd_ord_t get_ord() const {
+ return osds.empty() ? std::make_tuple(false, eversion_t())
+ : osds.front().first;
+ }
+
+ bool is_empty() const { return osds.empty(); }
+
+ bool operator<(const bucket_candidates_t &rhs) const {
+ return std::make_tuple(-selected, get_ord()) <
+ std::make_tuple(-rhs.selected, rhs.get_ord());
+ }
+
+ friend std::ostream &operator<<(std::ostream &, const bucket_candidates_t &);
+};
+
+std::ostream &operator<<(std::ostream &lhs, const bucket_candidates_t &cand)
+{
+ return lhs << "candidates[" << cand.osds << "]";
+}
+
+class bucket_heap_t {
+ using elem_t = std::reference_wrapper<bucket_candidates_t>;
+ std::vector<elem_t> heap;
+
+ // Max heap -- should emit buckets in order of preference
+ struct comp {
+ bool operator()(const elem_t &lhs, const elem_t &rhs) {
+ return lhs.get() < rhs.get();
+ }
+ };
+public:
+ void push_if_nonempty(elem_t e) {
+ if (!e.get().is_empty()) {
+ heap.push_back(e);
+ std::push_heap(heap.begin(), heap.end(), comp());
+ }
+ }
+ elem_t pop() {
+ std::pop_heap(heap.begin(), heap.end(), comp());
+ auto ret = heap.back();
+ heap.pop_back();
+ return ret;
+ }
+
+ bool is_empty() const { return heap.empty(); }
+};
+
+/**
+ * calc_replicated_acting_stretch
+ *
+ * Choose an acting set using as much of the up set as possible; filling
+ * in the remaining slots so as to maximize the number of crush buckets at
+ * level pool.info.peering_crush_bucket_barrier represented.
+ *
+ * Stretch clusters are a bit special: while they have a "size" the
+ * same way as normal pools, if we happen to lose a data center
+ * (we call it a "stretch bucket", but really it'll be a data center or
+ * a cloud availability zone), we don't actually want to shove
+ * 2 DC's worth of replication into a single site -- it won't fit!
+ * So we locally calculate a bucket_max, based
+ * on the targeted number of stretch buckets for the pool and
+ * its size. Then we won't pull more than bucket_max from any
+ * given ancestor even if it leaves us undersized.
+ *
+ * There are two distinct phases: (commented below)
+ */
+void PeeringState::calc_replicated_acting_stretch(
+ map<pg_shard_t, pg_info_t>::const_iterator primary,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const vector<int> &acting,
+ const vector<int> &up,
+ pg_shard_t up_primary,
+ const map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ vector<int> *want,
+ set<pg_shard_t> *backfill,
+ set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ ostream &ss)
+{
+ ceph_assert(want);
+ ceph_assert(acting_backfill);
+ ceph_assert(backfill);
+ ss << __func__ << (restrict_to_up_acting ? " restrict_to_up_acting" : "")
+ << std::endl;
+
+ auto used = [want](int osd) {
+ return std::find(want->begin(), want->end(), osd) != want->end();
+ };
+
+ auto usable_info = [&](const auto &cur_info) mutable {
+ return !(cur_info.is_incomplete() ||
+ cur_info.last_update < oldest_auth_log_entry);
+ };
+
+ auto osd_info = [&](int osd) mutable -> const pg_info_t & {
+ pg_shard_t cand = pg_shard_t(osd, shard_id_t::NO_SHARD);
+ const pg_info_t &cur_info = all_info.find(cand)->second;
+ return cur_info;
+ };
+
+ auto usable_osd = [&](int osd) mutable {
+ return usable_info(osd_info(osd));
+ };
+
+ std::map<int, bucket_candidates_t> ancestors;
+ auto get_ancestor = [&](int osd) mutable {
+ int ancestor = osdmap->crush->get_parent_of_type(
+ osd,
+ pool.info.peering_crush_bucket_barrier,
+ pool.info.crush_rule);
+ return &ancestors[ancestor];
+ };
+
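+ // bucket_max is ceil(size / peering_crush_bucket_target): the most
+ // OSDs we will take from any one bucket at the barrier level (e.g.
+ // size 4 with a target of 2 buckets gives 2; size 5 gives 3)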
+ unsigned bucket_max = pool.info.size / pool.info.peering_crush_bucket_target;
+ if (bucket_max * pool.info.peering_crush_bucket_target < pool.info.size) {
+ ++bucket_max;
+ }
+
+ /* 1) Select all usable osds from the up set as well as the primary
+ *
+ * We also stash any unusable osds from up into backfill.
+ */
+ auto add_required = [&](int osd) {
+ if (!used(osd)) {
+ want->push_back(osd);
+ acting_backfill->insert(
+ pg_shard_t(osd, shard_id_t::NO_SHARD));
+ get_ancestor(osd)->inc_selected();
+ }
+ };
+ add_required(primary->first.osd);
+ ss << " osd " << primary->first.osd << " primary accepted "
+ << osd_info(primary->first.osd) << std::endl;
+ for (auto upcand: up) {
+ auto upshard = pg_shard_t(upcand, shard_id_t::NO_SHARD);
+ auto &curinfo = osd_info(upcand);
+ if (usable_osd(upcand)) {
+ ss << " osd " << upcand << " (up) accepted " << curinfo << std::endl;
+ add_required(upcand);
+ } else {
+ ss << " osd " << upcand << " (up) backfill " << curinfo << std::endl;
+ backfill->insert(upshard);
+ acting_backfill->insert(upshard);
+ }
+ }
+
+ if (want->size() >= pool.info.size) { // non-failed CRUSH mappings are valid
+ ss << " up set sufficient" << std::endl;
+ return;
+ }
+ ss << " up set insufficient, considering remaining osds" << std::endl;
+
+ /* 2) Fill out remaining slots from usable osds in all_info
+ * while maximizing the number of ancestor nodes at the
+ * barrier_id crush level.
+ */
+ {
+ std::vector<std::pair<osd_ord_t, osd_id_t>> candidates;
+ /* To do this, we first filter the set of usable osd into an ordered
+ * list of usable osds
+ */
+ auto get_osd_ord = [&](bool is_acting, const pg_info_t &info) -> osd_ord_t {
+ return std::make_tuple(
+ !is_acting /* acting should sort first */,
+ info.last_update);
+ };
+ for (auto &cand : acting) {
+ auto &cand_info = osd_info(cand);
+ if (!used(cand) && usable_info(cand_info)) {
+ ss << " acting candidate " << cand << " " << cand_info << std::endl;
+ candidates.push_back(std::make_pair(get_osd_ord(true, cand_info), cand));
+ }
+ }
+ if (!restrict_to_up_acting) {
+ for (auto &[cand, info] : all_info) {
+ if (!used(cand.osd) && usable_info(info) &&
+ (std::find(acting.begin(), acting.end(), cand.osd)
+ == acting.end())) {
+ ss << " other candidate " << cand << " " << info << std::endl;
+ candidates.push_back(
+ std::make_pair(get_osd_ord(false, info), cand.osd));
+ }
+ }
+ }
+ std::sort(candidates.begin(), candidates.end());
+
+ // We then filter these candidates by ancestor
+ std::for_each(candidates.begin(), candidates.end(), [&](auto cand) {
+ get_ancestor(cand.second)->add_osd(cand.first, cand.second);
+ });
+ }
+
+ auto pop_ancestor = [&](auto &ancestor) {
+ ceph_assert(!ancestor.is_empty());
+ auto osd = ancestor.pop_osd();
+
+ ss << " accepting candidate " << osd << std::endl;
+
+ ceph_assert(!used(osd));
+ ceph_assert(usable_osd(osd));
+
+ want->push_back(osd);
+ acting_backfill->insert(
+ pg_shard_t(osd, shard_id_t::NO_SHARD));
+ ancestor.inc_selected();
+ };
+
+ /* Next, we use the ancestors map to grab a descendant of the
+ * peering_crush_mandatory_member if not already represented.
+ *
+ * TODO: using 0 here to match other users. Prior to merge, I
+ * expect that this and other users should instead check against
+ * CRUSH_ITEM_NONE.
+ */
+ if (pool.info.peering_crush_mandatory_member != CRUSH_ITEM_NONE) {
+ auto aiter = ancestors.find(pool.info.peering_crush_mandatory_member);
+ if (aiter != ancestors.end() &&
+ !aiter->second.get_num_selected()) {
+ ss << " adding required ancestor " << aiter->first << std::endl;
+ ceph_assert(!aiter->second.is_empty()); // wouldn't exist otherwise
+ pop_ancestor(aiter->second);
+ }
+ }
+
+ /* We then place the ancestors in a heap ordered by fewest selected
+ * and then by the ordering token of the next osd */
+ bucket_heap_t aheap;
+ std::for_each(ancestors.begin(), ancestors.end(), [&](auto &anc) {
+ aheap.push_if_nonempty(anc.second);
+ });
+
+ /* and pull from this heap until it's empty or we have enough.
+ * "We have enough" is a sufficient check here for
+ * stretch_set_can_peer() because our heap sorting always
+ * pulls from ancestors with the least number of included OSDs,
+ * so if it is possible to satisfy the bucket_count constraints we
+ * will do so.
+ */
+ while (!aheap.is_empty() && want->size() < pool.info.size) {
+ auto next = aheap.pop();
+ pop_ancestor(next.get());
+ if (next.get().get_num_selected() < bucket_max) {
+ aheap.push_if_nonempty(next);
+ }
+ }
+
+ /* The end result is that we should have as many buckets covered as
+ * possible while respecting up, the primary selection,
+ * the pool size (given bucket count constraints),
+ * and the mandatory member.
+ */
+}
+
+
+bool PeeringState::recoverable(const vector<int> &want) const
+{
+ unsigned num_want_acting = 0;
+ set<pg_shard_t> have;
+ for (int i = 0; i < (int)want.size(); ++i) {
+ if (want[i] != CRUSH_ITEM_NONE) {
+ ++num_want_acting;
+ have.insert(
+ pg_shard_t(
+ want[i],
+ pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+
+ if (num_want_acting < pool.info.min_size) {
+ const bool recovery_ec_pool_below_min_size =
+ HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_OCTOPUS);
+
+ if (pool.info.is_erasure() && !recovery_ec_pool_below_min_size) {
+ psdout(10) << __func__ << " failed, ec recovery below min size not supported by pre-octopus" << dendl;
+ return false;
+ } else if (!cct->_conf.get_val<bool>("osd_allow_recovery_below_min_size")) {
+ psdout(10) << __func__ << " failed, recovery below min size not enabled" << dendl;
+ return false;
+ }
+ }
+ if (missing_loc.get_recoverable_predicate()(have)) {
+ return true;
+ } else {
+ psdout(10) << __func__ << " failed, not recoverable " << dendl;
+ return false;
+ }
+}
+
+void PeeringState::choose_async_recovery_ec(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (uint8_t i = 0; i < want->size(); ++i) {
+ if ((*want)[i] == CRUSH_ITEM_NONE)
+ continue;
+
+ // Considering log entries to recover is accurate enough for
+ // now. We could use minimum_to_decode_with_cost() later if
+ // necessary.
+ pg_shard_t shard_i((*want)[i], shard_id_t(i));
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // for ec pools we rollback all entries past the authoritative
+ // last_update *before* activation. This is relatively inexpensive
+ // compared to recovery, since it is purely local, so treat shards
+ // past the authoritative last_update the same as those equal to it.
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ if (auth_version > candidate_version &&
+ (auth_version - candidate_version) > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(auth_version - candidate_version, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
+ candidate_want[cur_shard.shard.id] = CRUSH_ITEM_NONE;
+ if (recoverable(candidate_want)) {
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ }
+ }
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
+void PeeringState::choose_async_recovery_replicated(
+ const map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ vector<int> *want,
+ set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const
+{
+ set<pair<int, pg_shard_t> > candidates_by_cost;
+ for (auto osd_num : *want) {
+ pg_shard_t shard_i(osd_num, shard_id_t::NO_SHARD);
+ // do not include strays
+ if (stray_set.find(shard_i) != stray_set.end())
+ continue;
+ // Do not include an osd that is not up, since choosing it as
+ // an async_recovery_target will move it out of the acting set.
+ // This results in it being identified as a stray during peering,
+ // because it is no longer in the up or acting set.
+ if (!is_up(shard_i))
+ continue;
+ auto shard_info = all_info.find(shard_i)->second;
+ // use the approximate magnitude of the difference in length of
+ // logs plus historical missing objects as the cost of recovery
+ version_t auth_version = auth_info.last_update.version;
+ version_t candidate_version = shard_info.last_update.version;
+ if (HAVE_FEATURE(osdmap->get_up_osd_features(), SERVER_NAUTILUS)) {
+ auto approx_missing_objects =
+ shard_info.stats.stats.sum.num_objects_missing;
+ if (auth_version > candidate_version) {
+ approx_missing_objects += auth_version - candidate_version;
+ } else {
+ approx_missing_objects += candidate_version - auth_version;
+ }
+ if (static_cast<uint64_t>(approx_missing_objects) >
+ cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.emplace(approx_missing_objects, shard_i);
+ }
+ } else {
+ size_t approx_entries;
+ if (auth_version > candidate_version) {
+ approx_entries = auth_version - candidate_version;
+ } else {
+ approx_entries = candidate_version - auth_version;
+ }
+ if (approx_entries > cct->_conf.get_val<uint64_t>("osd_async_recovery_min_cost")) {
+ candidates_by_cost.insert(make_pair(approx_entries, shard_i));
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " candidates by cost are: " << candidates_by_cost
+ << dendl;
+ // take out as many osds as we can for async recovery, in order of cost
+ for (auto rit = candidates_by_cost.rbegin();
+ rit != candidates_by_cost.rend(); ++rit) {
+ if (want->size() <= pool.info.min_size) {
+ break;
+ }
+ pg_shard_t cur_shard = rit->second;
+ vector<int> candidate_want(*want);
+ for (auto it = candidate_want.begin(); it != candidate_want.end(); ++it) {
+ if (*it == cur_shard.osd) {
+ candidate_want.erase(it);
+ if (pool.info.stretch_set_can_peer(candidate_want, *osdmap, NULL)) {
+ // if we're in stretch mode, we can only remove the osd if it doesn't
+ // break peering limits.
+ want->swap(candidate_want);
+ async_recovery->insert(cur_shard);
+ }
+ break;
+ }
+ }
+ }
+
+ psdout(20) << __func__ << " result want=" << *want
+ << " async_recovery=" << *async_recovery << dendl;
+}
+
+/**
+ * choose acting
+ *
+ * calculate the desired acting, and request a change with the monitor
+ * if it differs from the current acting.
+ *
+ * if restrict_to_up_acting=true, we filter out anything that's not in
+ * up/acting. in order to lift this restriction, we need to
+ * 1) check whether it's worth switching the acting set any time we get
+ * a new pg info (not just here, when recovery finishes)
+ * 2) check whether anything in want_acting went down on each new map
+ * (and, if so, calculate a new want_acting)
+ * 3) remove the assertion in PG::PeeringState::Active::react(const AdvMap)
+ * TODO!
+ */
+bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id,
+ bool restrict_to_up_acting,
+ bool *history_les_bound,
+ bool request_pg_temp_change_only)
+{
+ map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+ all_info[pg_whoami] = info;
+
+ if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
+ for (auto p = all_info.begin(); p != all_info.end(); ++p) {
+ psdout(10) << __func__ << " all_info osd." << p->first << " "
+ << p->second << dendl;
+ }
+ }
+
+ auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting,
+ history_les_bound);
+
+ if (auth_log_shard == all_info.end()) {
+ if (up != acting) {
+ psdout(10) << __func__ << " no suitable info found (incomplete backfills?),"
+ << " reverting to up" << dendl;
+ want_acting = up;
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else {
+ psdout(10) << __func__ << " failed" << dendl;
+ ceph_assert(want_acting.empty());
+ }
+ return false;
+ }
+
+ ceph_assert(!auth_log_shard->second.is_incomplete());
+ auth_log_shard_id = auth_log_shard->first;
+
+ set<pg_shard_t> want_backfill, want_acting_backfill;
+ vector<int> want;
+ stringstream ss;
+ if (pool.info.is_replicated()) {
+ auto [primary_shard, oldest_log] = select_replicated_primary(
+ auth_log_shard,
+ cct->_conf.get_val<uint64_t>(
+ "osd_force_auth_primary_missing_objects"),
+ up,
+ up_primary,
+ all_info,
+ get_osdmap(),
+ ss);
+ if (pool.info.is_stretch_pool()) {
+ calc_replicated_acting_stretch(
+ primary_shard,
+ oldest_log,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ up_primary,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ get_osdmap(),
+ pool,
+ ss);
+ } else {
+ calc_replicated_acting(
+ primary_shard,
+ oldest_log,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ up_primary,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ get_osdmap(),
+ pool,
+ ss);
+ }
+ } else {
+ calc_ec_acting(
+ auth_log_shard,
+ get_osdmap()->get_pg_size(info.pgid.pgid),
+ acting,
+ up,
+ all_info,
+ restrict_to_up_acting,
+ &want,
+ &want_backfill,
+ &want_acting_backfill,
+ ss);
+ }
+ psdout(10) << ss.str() << dendl;
+
+ if (!recoverable(want)) {
+ want_acting.clear();
+ return false;
+ }
+
+ set<pg_shard_t> want_async_recovery;
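+ // async recovery requires that every up OSD supports it (mimic or later)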
+ if (HAVE_FEATURE(get_osdmap()->get_up_osd_features(), SERVER_MIMIC)) {
+ if (pool.info.is_erasure()) {
+ choose_async_recovery_ec(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ } else {
+ choose_async_recovery_replicated(
+ all_info, auth_log_shard->second, &want, &want_async_recovery,
+ get_osdmap());
+ }
+ }
+ while (want.size() > pool.info.size) {
+ // async recovery should have taken out as many osds as it can.
+ // if not, then always evict the last peer
+ // (will get synchronously recovered later)
+ psdout(10) << __func__ << " evicting osd." << want.back()
+ << " from oversized want " << want << dendl;
+ want.pop_back();
+ }
+ if (want != acting) {
+ psdout(10) << __func__ << " want " << want << " != acting " << acting
+ << ", requesting pg_temp change" << dendl;
+ want_acting = want;
+
+ if (!cct->_conf->osd_debug_no_acting_change) {
+ if (want_acting == up) {
+ // There can't be any pending backfill if
+ // want is the same as crush map up OSDs.
+ ceph_assert(want_backfill.empty());
+ vector<int> empty;
+ pl->queue_want_pg_temp(empty);
+ } else
+ pl->queue_want_pg_temp(want);
+ }
+ return false;
+ }
+
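+ // want == acting here; record acting_recovery_backfill, backfill and async recovery targets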
+ if (request_pg_temp_change_only)
+ return true;
+ want_acting.clear();
+ acting_recovery_backfill = want_acting_backfill;
+ psdout(10) << "acting_recovery_backfill is "
+ << acting_recovery_backfill << dendl;
+ ceph_assert(
+ backfill_targets.empty() ||
+ backfill_targets == want_backfill);
+ if (backfill_targets.empty()) {
+ // Caller is GetInfo
+ backfill_targets = want_backfill;
+ }
+ // Adding !needs_recovery() to let the async_recovery_targets reset after recovery is complete
+ ceph_assert(
+ async_recovery_targets.empty() ||
+ async_recovery_targets == want_async_recovery ||
+ !needs_recovery());
+ if (async_recovery_targets.empty() || !needs_recovery()) {
+ async_recovery_targets = want_async_recovery;
+ }
+ // Will not change if already set because up would have had to change
+ // Verify that nothing in backfill is in stray_set
+ for (auto i = want_backfill.begin(); i != want_backfill.end(); ++i) {
+ ceph_assert(stray_set.find(*i) == stray_set.end());
+ }
+ psdout(10) << "choose_acting want=" << want << " backfill_targets="
+ << want_backfill << " async_recovery_targets="
+ << async_recovery_targets << dendl;
+ return true;
+}
+
+void PeeringState::log_weirdness()
+{
+ if (pg_log.get_tail() != info.log_tail)
+ pl->get_clog_error() << info.pgid
+ << " info mismatch, log.tail " << pg_log.get_tail()
+ << " != info.log_tail " << info.log_tail;
+ if (pg_log.get_head() != info.last_update)
+ pl->get_clog_error() << info.pgid
+ << " info mismatch, log.head " << pg_log.get_head()
+ << " != info.last_update " << info.last_update;
+
+ if (!pg_log.get_log().empty()) {
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
+ pl->get_clog_error() << info.pgid
+ << " log bound mismatch, info (tail,head] ("
+ << pg_log.get_tail() << ","
+ << pg_log.get_head() << "]"
+ << " actual ["
+ << pg_log.get_log().log.begin()->version << ","
+ << pg_log.get_log().log.rbegin()->version << "]";
+ }
+
+ if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
+ pl->get_clog_error() << info.pgid
+ << " caller_ops.size "
+ << pg_log.get_log().caller_ops.size()
+ << " > log size " << pg_log.get_log().log.size();
+ }
+}
+
+/*
+ * Process information from a replica to determine if it could have any
+ * objects that I need.
+ *
+ * TODO: if the missing set becomes very large, this could get expensive.
+ * Instead, we probably want to just iterate over our unfound set.
+ */
+bool PeeringState::search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t &omissing,
+ pg_shard_t from,
+ PeeringCtxWrapper &ctx)
+{
+ uint64_t num_unfound_before = missing_loc.num_unfound();
+ bool found_missing = missing_loc.add_source_info(
+ from, oinfo, omissing, ctx.handle);
+ if (found_missing && num_unfound_before != missing_loc.num_unfound())
+ pl->publish_stats_to_osd();
+ // avoid doing this if the peer is empty. This is a bit of paranoia
+ // to avoid doing something rash if add_source_info() above
+ // incorrectly decided we found something new. (if the peer has
+ // last_update=0'0 that's impossible.)
+ if (found_missing &&
+ oinfo.last_update != eversion_t()) {
+ pg_info_t tinfo(oinfo);
+ tinfo.pgid.shard = pg_whoami.shard;
+ ctx.send_info(
+ from.osd,
+ spg_t(info.pgid.pgid, from.shard),
+ get_osdmap_epoch(), // fixme: use lower epoch?
+ get_osdmap_epoch(),
+ tinfo);
+ }
+ return found_missing;
+}
+
+bool PeeringState::discover_all_missing(
+ BufferedRecoveryMessages &rctx)
+{
+ auto &missing = pg_log.get_missing();
+ uint64_t unfound = get_num_unfound();
+ bool any = false; // did we start any queries
+
+ psdout(10) << __func__ << " "
+ << missing.num_missing() << " missing, "
+ << unfound << " unfound"
+ << dendl;
+
+ auto m = might_have_unfound.begin();
+ auto mend = might_have_unfound.end();
+ for (; m != mend; ++m) {
+ pg_shard_t peer(*m);
+
+ if (!get_osdmap()->is_up(peer.osd)) {
+ psdout(20) << __func__ << " skipping down osd." << peer << dendl;
+ continue;
+ }
+
+ if (peer_purged.count(peer)) {
+ psdout(20) << __func__ << " skipping purged osd." << peer << dendl;
+ continue;
+ }
+
+ auto iter = peer_info.find(peer);
+ if (iter != peer_info.end() &&
+ (iter->second.is_empty() || iter->second.dne())) {
+ // ignore empty peers
+ continue;
+ }
+
+ // If we've requested any of this stuff, the pg_missing_t information
+ // should be on its way.
+ // TODO: coalesce requested_* into a single data structure
+ if (peer_missing.find(peer) != peer_missing.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": we already have pg_missing_t" << dendl;
+ continue;
+ }
+ if (peer_log_requested.find(peer) != peer_log_requested.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": in peer_log_requested" << dendl;
+ continue;
+ }
+ if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
+ psdout(20) << __func__ << ": osd." << peer
+ << ": in peer_missing_requested" << dendl;
+ continue;
+ }
+
+ // Request missing
+ psdout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
+ << dendl;
+ peer_missing_requested.insert(peer);
+ rctx.send_query(
+ peer.osd,
+ spg_t(info.pgid.pgid, peer.shard),
+ pg_query_t(
+ pg_query_t::FULLLOG,
+ peer.shard, pg_whoami.shard,
+ info.history, get_osdmap_epoch()));
+ any = true;
+ }
+ return any;
+}
+
+/* Build the might_have_unfound set.
+ *
+ * This is used by the primary OSD during recovery.
+ *
+ * This set tracks the OSDs which might have unfound objects that the primary
+ * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
+ * will remove the OSD from the set.
+ */
+void PeeringState::build_might_have_unfound()
+{
+ ceph_assert(might_have_unfound.empty());
+ ceph_assert(is_primary());
+
+ psdout(10) << __func__ << dendl;
+
+ check_past_interval_bounds();
+
+ might_have_unfound = past_intervals.get_might_have_unfound(
+ pg_whoami,
+ pool.info.is_erasure());
+
+ // include any (stray) peers
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p)
+ might_have_unfound.insert(p->first);
+
+ psdout(15) << __func__ << ": built " << might_have_unfound << dendl;
+}
+
+void PeeringState::activate(
+ ObjectStore::Transaction& t,
+ epoch_t activation_epoch,
+ PeeringCtxWrapper &ctx)
+{
+ ceph_assert(!is_peered());
+
+ // twiddle pg state
+ state_clear(PG_STATE_DOWN);
+
+ send_notify = false;
+
+ if (is_primary()) {
+ // only update primary last_epoch_started if we will go active
+ if (acting_set_writeable()) {
+ ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
+ info.last_epoch_started <= activation_epoch);
+ info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
+ }
+ } else if (is_acting(pg_whoami)) {
+ /* update last_epoch_started on acting replica to whatever the primary sent
+ * unless it's smaller (could happen if we are going peered rather than
+ * active, see doc/dev/osd_internals/last_epoch_started.rst) */
+ if (info.last_epoch_started < activation_epoch) {
+ info.last_epoch_started = activation_epoch;
+ info.last_interval_started = info.history.same_interval_since;
+ }
+ }
+
+ auto &missing = pg_log.get_missing();
+
+ min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
+ if (is_primary()) {
+ last_update_ondisk = info.last_update;
+ }
+ last_update_applied = info.last_update;
+ last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
+
+ need_up_thru = false;
+
+ // write pg info, log
+ dirty_info = true;
+ dirty_big_info = true; // maybe
+
+ pl->schedule_event_on_commit(
+ t,
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ ActivateCommitted(
+ get_osdmap_epoch(),
+ activation_epoch)));
+
+ // init complete pointer
+ if (missing.num_missing() == 0) {
+ psdout(10) << "activate - no missing, moving last_complete " << info.last_complete
+ << " -> " << info.last_update << dendl;
+ info.last_complete = info.last_update;
+ info.stats.stats.sum.num_objects_missing = 0;
+ pg_log.reset_recovery_pointers();
+ } else {
+ psdout(10) << "activate - not complete, " << missing << dendl;
+ info.stats.stats.sum.num_objects_missing = missing.num_missing();
+ pg_log.activate_not_complete(info);
+ }
+
+ log_weirdness();
+
+ if (is_primary()) {
+ // initialize snap_trimq
+ interval_set<snapid_t> to_trim;
+ auto& removed_snaps_queue = get_osdmap()->get_removed_snaps_queue();
+ auto p = removed_snaps_queue.find(info.pgid.pgid.pool());
+ if (p != removed_snaps_queue.end()) {
+ dout(20) << "activate - purged_snaps " << info.purged_snaps
+ << " removed_snaps " << p->second
+ << dendl;
+ for (auto q : p->second) {
+ to_trim.insert(q.first, q.second);
+ }
+ }
+ interval_set<snapid_t> purged;
+ purged.intersection_of(to_trim, info.purged_snaps);
+ to_trim.subtract(purged);
+
+ if (HAVE_FEATURE(upacting_features, SERVER_OCTOPUS)) {
+ renew_lease(pl->get_mnow());
+ // do not schedule until we are actually activated
+ }
+
+ // adjust purged_snaps: PG may have been inactive while snaps were pruned
+ // from the removed_snaps_queue in the osdmap. update local purged_snaps to
+ // reflect only those snaps that we thought were pruned and were still in
+ // the queue.
+ info.purged_snaps.swap(purged);
+
+ // start up replicas
+ if (prior_readable_down_osds.empty()) {
+ dout(10) << __func__ << " no prior_readable_down_osds to wait on, clearing ub"
+ << dendl;
+ clear_prior_readable_until_ub();
+ }
+ info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
+ prior_readable_until_ub);
+
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ pg_shard_t peer = *i;
+ ceph_assert(peer_info.count(peer));
+ pg_info_t& pi = peer_info[peer];
+
+ psdout(10) << "activate peer osd." << peer << " " << pi << dendl;
+
+ MRef<MOSDPGLog> m;
+ ceph_assert(peer_missing.count(peer));
+ pg_missing_t& pm = peer_missing[peer];
+
+ bool needs_past_intervals = pi.dne();
+
+ // Save num_bytes for backfill reservation request, can't be negative
+ peer_bytes[peer] = std::max<int64_t>(0, pi.stats.stats.sum.num_bytes);
+
+ if (pi.last_update == info.last_update) {
+ // empty log
+ if (!pi.last_backfill.is_max())
+ pl->get_clog_info() << info.pgid << " continuing backfill to osd."
+ << peer
+ << " from (" << pi.log_tail << "," << pi.last_update
+ << "] " << pi.last_backfill
+ << " to " << info.last_update;
+ if (!pi.is_empty()) {
+ psdout(10) << "activate peer osd." << peer
+ << " is up to date, queueing in pending_activators" << dendl;
+ ctx.send_info(
+ peer.osd,
+ spg_t(info.pgid.pgid, peer.shard),
+ get_osdmap_epoch(), // fixme: use lower epoch?
+ get_osdmap_epoch(),
+ info,
+ get_lease());
+ } else {
+ psdout(10) << "activate peer osd." << peer
+ << " is up to date, but sending pg_log anyway" << dendl;
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), info,
+ last_peering_reset);
+ }
+ } else if (
+ pg_log.get_tail() > pi.last_update ||
+ pi.last_backfill == hobject_t() ||
+ (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
+ /* ^ This last case covers a situation where a replica is not contiguous
+ * with the auth_log, but is contiguous with this replica. Reshuffling
+ * the active set to handle this would be tricky, so instead we just go
+ * ahead and backfill it anyway. This is probably preferable in any
+ * case since the replica in question would have to be significantly
+ * behind.
+ */
+ // backfill
+ pl->get_clog_debug() << info.pgid << " starting backfill to osd." << peer
+ << " from (" << pi.log_tail << "," << pi.last_update
+ << "] " << pi.last_backfill
+ << " to " << info.last_update;
+
+ pi.last_update = info.last_update;
+ pi.last_complete = info.last_update;
+ pi.set_last_backfill(hobject_t());
+ pi.last_epoch_started = info.last_epoch_started;
+ pi.last_interval_started = info.last_interval_started;
+ pi.history = info.history;
+ pi.hit_set = info.hit_set;
+ pi.stats.stats.clear();
+ pi.stats.stats.sum.num_bytes = peer_bytes[peer];
+
+ // initialize peer with our purged_snaps.
+ pi.purged_snaps = info.purged_snaps;
+
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), pi,
+ last_peering_reset /* epoch to create pg at */);
+
+ // send some recent log, so that op dup detection works well.
+ m->log.copy_up_to(cct, pg_log.get_log(),
+ cct->_conf->osd_max_pg_log_entries);
+ m->info.log_tail = m->log.tail;
+ pi.log_tail = m->log.tail; // sigh...
+
+ pm.clear();
+ } else {
+ // catch up
+ ceph_assert(pg_log.get_tail() <= pi.last_update);
+ m = make_message<MOSDPGLog>(
+ i->shard, pg_whoami.shard,
+ get_osdmap_epoch(), info,
+ last_peering_reset /* epoch to create pg at */);
+ // send new stuff to append to replicas log
+ m->log.copy_after(cct, pg_log.get_log(), pi.last_update);
+ }
+
+ // share past_intervals if we are creating the pg on the replica
+ // based on whether our info for that peer was dne() *before*
+ // updating pi.history in the backfill block above.
+ if (m && needs_past_intervals)
+ m->past_intervals = past_intervals;
+
+ // update local version of peer's missing list!
+ if (m && pi.last_backfill != hobject_t()) {
+ for (auto p = m->log.log.begin(); p != m->log.log.end(); ++p) {
+ if (p->soid <= pi.last_backfill &&
+ !p->is_error()) {
+ if (perform_deletes_during_peering() && p->is_delete()) {
+ pm.rm(p->soid, p->version);
+ } else {
+ pm.add_next_event(*p);
+ }
+ }
+ }
+ }
+
+ if (m) {
+ dout(10) << "activate peer osd." << peer << " sending " << m->log
+ << dendl;
+ m->lease = get_lease();
+ pl->send_cluster_message(peer.osd, m, get_osdmap_epoch());
+ }
+
+ // peer now has our last_update
+ pi.last_update = info.last_update;
+
+ // update our missing
+ if (pm.num_missing() == 0) {
+ pi.last_complete = pi.last_update;
+ psdout(10) << "activate peer osd." << peer << " " << pi
+ << " uptodate" << dendl;
+ } else {
+ psdout(10) << "activate peer osd." << peer << " " << pi
+ << " missing " << pm << dendl;
+ }
+ }
+
+ // Set up missing_loc
+ set<pg_shard_t> complete_shards;
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ psdout(20) << __func__ << " setting up missing_loc from shard " << *i
+ << " " << dendl;
+ if (*i == get_primary()) {
+ missing_loc.add_active_missing(missing);
+ if (!missing.have_missing())
+ complete_shards.insert(*i);
+ } else {
+ auto peer_missing_entry = peer_missing.find(*i);
+ ceph_assert(peer_missing_entry != peer_missing.end());
+ missing_loc.add_active_missing(peer_missing_entry->second);
+ if (!peer_missing_entry->second.have_missing() &&
+ peer_info[*i].last_backfill.is_max())
+ complete_shards.insert(*i);
+ }
+ }
+
+ // If necessary, create might_have_unfound to help us find our unfound objects.
+ // NOTE: It's important that we build might_have_unfound before trimming the
+ // past intervals.
+ might_have_unfound.clear();
+ if (needs_recovery()) {
+ // If only one shard has missing objects, we can add all of the others as
+ // recovery sources. This is considered safe since the PGLogs have already
+ // been merged locally, and it covers the vast majority of use cases, e.g.
+ // one OSD/host being down for a while for hardware repair.
+ if (complete_shards.size() + 1 == acting_recovery_backfill.size()) {
+ missing_loc.add_batch_sources_info(complete_shards, ctx.handle);
+ } else {
+ missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
+ ctx.handle);
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == pg_whoami) continue;
+ psdout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
+ ceph_assert(peer_missing.count(*i));
+ ceph_assert(peer_info.count(*i));
+ missing_loc.add_source_info(
+ *i,
+ peer_info[*i],
+ peer_missing[*i],
+ ctx.handle);
+ }
+ }
+ for (auto i = peer_missing.begin(); i != peer_missing.end(); ++i) {
+ if (is_acting_recovery_backfill(i->first))
+ continue;
+ ceph_assert(peer_info.count(i->first));
+ search_for_missing(
+ peer_info[i->first],
+ i->second,
+ i->first,
+ ctx);
+ }
+
+ build_might_have_unfound();
+
+ // Always call now so update_calc_stats() will be accurate
+ discover_all_missing(ctx.msgs);
+
+ }
+
+ // num_objects_degraded, if calculated, should reflect this too, unless
+ // nothing is missing and we are about to go clean.
+ if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
+ state_set(PG_STATE_UNDERSIZED);
+ }
+
+ state_set(PG_STATE_ACTIVATING);
+ pl->on_activate(std::move(to_trim));
+ }
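+ // if the acting set is writeable, roll the pg log forward via the store's log handler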
+ if (acting_set_writeable()) {
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.roll_forward(rollbacker.get());
+ }
+}
+
+void PeeringState::share_pg_info()
+{
+ psdout(10) << "share_pg_info" << dendl;
+
+ info.history.refresh_prior_readable_until_ub(pl->get_mnow(),
+ prior_readable_until_ub);
+
+ // share new pg_info_t with replicas
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (auto pg_shard : acting_recovery_backfill) {
+ if (pg_shard == pg_whoami) continue;
+ if (auto peer = peer_info.find(pg_shard); peer != peer_info.end()) {
+ peer->second.last_epoch_started = info.last_epoch_started;
+ peer->second.last_interval_started = info.last_interval_started;
+ peer->second.history.merge(info.history);
+ }
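+ // when require_osd_release is at least octopus, send MOSDPGInfo2 (which carries
+ // the lease); otherwise fall back to the legacy MOSDPGInfo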
+ MessageRef m;
+ if (last_require_osd_release >= ceph_release_t::octopus) {
+ m = make_message<MOSDPGInfo2>(spg_t{info.pgid.pgid, pg_shard.shard},
+ info,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ std::optional<pg_lease_t>{get_lease()},
+ std::nullopt);
+ } else {
+ m = make_message<MOSDPGInfo>(get_osdmap_epoch(),
+ MOSDPGInfo::pg_list_t{
+ pg_notify_t{pg_shard.shard,
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ info,
+ past_intervals}});
+ }
+ pl->send_cluster_message(pg_shard.osd, m, get_osdmap_epoch());
+ }
+}
+
+void PeeringState::merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t&& olog,
+ pg_shard_t from)
+{
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.merge_log(
+ oinfo, std::move(olog), from, info, rollbacker.get(),
+ dirty_info, dirty_big_info);
+}
+
+void PeeringState::rewind_divergent_log(
+ ObjectStore::Transaction& t, eversion_t newhead)
+{
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ pg_log.rewind_divergent_log(
+ newhead, info, rollbacker.get(), dirty_info, dirty_big_info);
+}
+
+
+void PeeringState::proc_primary_info(
+ ObjectStore::Transaction &t, const pg_info_t &oinfo)
+{
+ ceph_assert(!is_primary());
+
+ update_history(oinfo.history);
+ if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
+ info.stats.stats.sum.num_scrub_errors = 0;
+ info.stats.stats.sum.num_shallow_scrub_errors = 0;
+ info.stats.stats.sum.num_deep_scrub_errors = 0;
+ dirty_info = true;
+ }
+
+ if (!(info.purged_snaps == oinfo.purged_snaps)) {
+ psdout(10) << __func__ << " updating purged_snaps to "
+ << oinfo.purged_snaps
+ << dendl;
+ info.purged_snaps = oinfo.purged_snaps;
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+}
+
+void PeeringState::proc_master_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
+{
+ psdout(10) << "proc_master_log for osd." << from << ": "
+ << olog << " " << omissing << dendl;
+ ceph_assert(!is_peered() && is_primary());
+
+ // merge log into our own log to build master log. no need to
+ // make any adjustments to their missing map; we are taking their
+ // log to be authoritative (i.e., their entries are by definition
+ // non-divergent).
+ merge_log(t, oinfo, std::move(olog), from);
+ peer_info[from] = oinfo;
+ psdout(10) << " peer osd." << from << " now " << oinfo
+ << " " << omissing << dendl;
+ might_have_unfound.insert(from);
+
+ // See doc/dev/osd_internals/last_epoch_started
+ if (oinfo.last_epoch_started > info.last_epoch_started) {
+ info.last_epoch_started = oinfo.last_epoch_started;
+ dirty_info = true;
+ }
+ if (oinfo.last_interval_started > info.last_interval_started) {
+ info.last_interval_started = oinfo.last_interval_started;
+ dirty_info = true;
+ }
+ update_history(oinfo.history);
+ ceph_assert(cct->_conf->osd_find_best_info_ignore_history_les ||
+ info.last_epoch_started >= info.history.last_epoch_started);
+
+ peer_missing[from].claim(std::move(omissing));
+}
+
+void PeeringState::proc_replica_log(
+ pg_info_t &oinfo,
+ const pg_log_t &olog,
+ pg_missing_t&& omissing,
+ pg_shard_t from)
+{
+ psdout(10) << "proc_replica_log for osd." << from << ": "
+ << oinfo << " " << olog << " " << omissing << dendl;
+
+ pg_log.proc_replica_log(oinfo, olog, omissing, from);
+
+ peer_info[from] = oinfo;
+ psdout(10) << " peer osd." << from << " now "
+ << oinfo << " " << omissing << dendl;
+ might_have_unfound.insert(from);
+
+ for (auto i = omissing.get_items().begin();
+ i != omissing.get_items().end();
+ ++i) {
+ psdout(20) << " after missing " << i->first
+ << " need " << i->second.need
+ << " have " << i->second.have << dendl;
+ }
+ peer_missing[from].claim(std::move(omissing));
+}
+
+void PeeringState::fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ pair<pg_shard_t, pg_info_t> &notify_info)
+{
+ ceph_assert(from == primary);
+ ceph_assert(query.type == pg_query_t::INFO);
+
+ // info
+ psdout(10) << "sending info" << dendl;
+ notify_info = make_pair(from, info);
+}
+
+void PeeringState::fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
+{
+ psdout(10) << "log request from " << from << dendl;
+ ceph_assert(from == primary);
+ ceph_assert(query.type != pg_query_t::INFO);
+
+ auto mlog = make_message<MOSDPGLog>(
+ from.shard, pg_whoami.shard,
+ get_osdmap_epoch(),
+ info, query_epoch);
+ mlog->missing = pg_log.get_missing();
+
+ // primary -> other, when building master log
+ if (query.type == pg_query_t::LOG) {
+ psdout(10) << " sending info+missing+log since " << query.since
+ << dendl;
+ if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
+ pl->get_clog_error() << info.pgid << " got broken pg_query_t::LOG since "
+ << query.since
+ << " when my log.tail is " << pg_log.get_tail()
+ << ", sending full log instead";
+ mlog->log = pg_log.get_log(); // primary should not have requested this!!
+ } else
+ mlog->log.copy_after(cct, pg_log.get_log(), query.since);
+ }
+ else if (query.type == pg_query_t::FULLLOG) {
+ psdout(10) << " sending info+missing+full log" << dendl;
+ mlog->log = pg_log.get_log();
+ }
+
+ psdout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
+
+ pl->send_cluster_message(from.osd, mlog, get_osdmap_epoch(), true);
+}
+
+void PeeringState::fulfill_query(const MQuery& query, PeeringCtxWrapper &rctx)
+{
+ if (query.query.type == pg_query_t::INFO) {
+ pair<pg_shard_t, pg_info_t> notify_info;
+ // note this refreshes our prior_readable_until_ub value
+ update_history(query.query.history);
+ fulfill_info(query.from, query.query, notify_info);
+ rctx.send_notify(
+ notify_info.first.osd,
+ pg_notify_t(
+ notify_info.first.shard, pg_whoami.shard,
+ query.query_epoch,
+ get_osdmap_epoch(),
+ notify_info.second,
+ past_intervals));
+ } else {
+ update_history(query.query.history);
+ fulfill_log(query.from, query.query, query.query_epoch);
+ }
+}
+
+void PeeringState::try_mark_clean()
+{
+ if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+ state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ state_set(PG_STATE_CLEAN);
+ info.history.last_epoch_clean = get_osdmap_epoch();
+ info.history.last_interval_clean = info.history.same_interval_since;
+ past_intervals.clear();
+ dirty_big_info = true;
+ dirty_info = true;
+ }
+
+ if (!is_active() && is_peered()) {
+ if (is_clean()) {
+ bool target;
+ if (pool.info.is_pending_merge(info.pgid.pgid, &target)) {
+ if (target) {
+ psdout(10) << "ready to merge (target)" << dendl;
+ pl->set_ready_to_merge_target(
+ info.last_update,
+ info.history.last_epoch_started,
+ info.history.last_epoch_clean);
+ } else {
+ psdout(10) << "ready to merge (source)" << dendl;
+ pl->set_ready_to_merge_source(info.last_update);
+ }
+ }
+ } else {
+ psdout(10) << "not clean, not ready to merge" << dendl;
+ // we should have notified OSD in Active state entry point
+ }
+ }
+
+ state_clear(PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+
+ share_pg_info();
+ pl->publish_stats_to_osd();
+ clear_recovery_state();
+}
+
+void PeeringState::split_into(
+ pg_t child_pgid, PeeringState *child, unsigned split_bits)
+{
+ child->update_osdmap_ref(get_osdmap());
+ child->pool = pool;
+
+ // Log
+ pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
+ child->info.last_complete = info.last_complete;
+
+ info.last_update = pg_log.get_head();
+ child->info.last_update = child->pg_log.get_head();
+
+ child->info.last_user_version = info.last_user_version;
+
+ info.log_tail = pg_log.get_tail();
+ child->info.log_tail = child->pg_log.get_tail();
+
+ // reset last_complete, we might have modified pg_log & missing above
+ pg_log.reset_complete_to(&info);
+ child->pg_log.reset_complete_to(&child->info);
+
+ // Info
+ child->info.history = info.history;
+ child->info.history.epoch_created = get_osdmap_epoch();
+ child->info.purged_snaps = info.purged_snaps;
+
+ if (info.last_backfill.is_max()) {
+ child->info.set_last_backfill(hobject_t::get_max());
+ } else {
+ // restart backfill on parent and child to be safe. we could
+ // probably do better in the bitwise sort case, but it's more
+ // fragile (there may be special work to do on backfill completion
+ // in the future).
+ info.set_last_backfill(hobject_t());
+ child->info.set_last_backfill(hobject_t());
+ // restarting backfill implies that the missing set is empty,
+ // since it is only used for objects prior to last_backfill
+ pg_log.reset_backfill();
+ child->pg_log.reset_backfill();
+ }
+
+ child->info.stats = info.stats;
+ child->info.stats.parent_split_bits = split_bits;
+ info.stats.stats_invalid = true;
+ child->info.stats.stats_invalid = true;
+ child->info.last_epoch_started = info.last_epoch_started;
+ child->info.last_interval_started = info.last_interval_started;
+
+ // There can't be recovery/backfill going on now
+ int primary, up_primary;
+ vector<int> newup, newacting;
+ get_osdmap()->pg_to_up_acting_osds(
+ child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
+ child->init_primary_up_acting(
+ newup,
+ newacting,
+ up_primary,
+ primary);
+ child->role = OSDMap::calc_pg_role(pg_whoami, child->acting);
+
+ // this comparison includes primary rank via pg_shard_t
+ if (get_primary() != child->get_primary())
+ child->info.history.same_primary_since = get_osdmap_epoch();
+
+ child->info.stats.up = newup;
+ child->info.stats.up_primary = up_primary;
+ child->info.stats.acting = newacting;
+ child->info.stats.acting_primary = primary;
+ child->info.stats.mapping_epoch = get_osdmap_epoch();
+
+ // History
+ child->past_intervals = past_intervals;
+
+ child->on_new_interval();
+
+ child->send_notify = !child->is_primary();
+
+ child->dirty_info = true;
+ child->dirty_big_info = true;
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PeeringState::merge_from(
+ map<spg_t,PeeringState *>& sources,
+ PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta)
+{
+ bool incomplete = false;
+ if (info.last_complete != info.last_update ||
+ info.is_incomplete() ||
+ info.dne()) {
+ psdout(10) << __func__ << " target incomplete" << dendl;
+ incomplete = true;
+ }
+ if (last_pg_merge_meta.source_pgid != pg_t()) {
+ if (info.pgid.pgid != last_pg_merge_meta.source_pgid.get_parent()) {
+ psdout(10) << __func__ << " target doesn't match expected parent "
+ << last_pg_merge_meta.source_pgid.get_parent()
+ << " of source_pgid " << last_pg_merge_meta.source_pgid
+ << dendl;
+ incomplete = true;
+ }
+ if (info.last_update != last_pg_merge_meta.target_version) {
+ psdout(10) << __func__ << " target version doesn't match expected "
+ << last_pg_merge_meta.target_version << dendl;
+ incomplete = true;
+ }
+ }
+
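+ // roll forward and trim our own (target) log before merging in the source logs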
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
+ pg_log.roll_forward(handler.get());
+
+ info.last_complete = info.last_update; // to fake out trim()
+ pg_log.reset_recovery_pointers();
+ pg_log.trim(info.last_update, info);
+
+ vector<PGLog*> log_from;
+ for (auto& i : sources) {
+ auto& source = i.second;
+ if (!source) {
+ psdout(10) << __func__ << " source " << i.first << " missing" << dendl;
+ incomplete = true;
+ continue;
+ }
+ if (source->info.last_complete != source->info.last_update ||
+ source->info.is_incomplete() ||
+ source->info.dne()) {
+ psdout(10) << __func__ << " source " << source->pg_whoami
+ << " incomplete"
+ << dendl;
+ incomplete = true;
+ }
+ if (last_pg_merge_meta.source_pgid != pg_t()) {
+ if (source->info.pgid.pgid != last_pg_merge_meta.source_pgid) {
+ dout(10) << __func__ << " source " << source->info.pgid.pgid
+ << " doesn't match expected source pgid "
+ << last_pg_merge_meta.source_pgid << dendl;
+ incomplete = true;
+ }
+ if (source->info.last_update != last_pg_merge_meta.source_version) {
+ dout(10) << __func__ << " source version doesn't match expected "
+ << last_pg_merge_meta.source_version << dendl;
+ incomplete = true;
+ }
+ }
+
+ // prepare log
+ PGLog::LogEntryHandlerRef handler{
+ source->pl->get_log_handler(rctx.transaction)};
+ source->pg_log.roll_forward(handler.get());
+ source->info.last_complete = source->info.last_update; // to fake out trim()
+ source->pg_log.reset_recovery_pointers();
+ source->pg_log.trim(source->info.last_update, source->info);
+ log_from.push_back(&source->pg_log);
+
+ // combine stats
+ info.stats.add(source->info.stats);
+
+ // pull up last_update
+ info.last_update = std::max(info.last_update, source->info.last_update);
+
+ // adopt source's PastIntervals if target has none. we can do this since
+ // pgp_num has been reduced prior to the merge, so the OSD mappings for
+ // the PGs are identical.
+ if (past_intervals.empty() && !source->past_intervals.empty()) {
+ psdout(10) << __func__ << " taking source's past_intervals" << dendl;
+ past_intervals = source->past_intervals;
+ }
+ }
+
+ info.last_complete = info.last_update;
+ info.log_tail = info.last_update;
+ if (incomplete) {
+ info.last_backfill = hobject_t();
+ }
+
+ // merge logs
+ pg_log.merge_from(log_from, info.last_update);
+
+ // make sure we have a meaningful last_epoch_started/clean (if we were a
+ // placeholder)
+ if (info.history.epoch_created == 0) {
+ // start with (a) source's history, since these PGs *should* have been
+ // remapped in concert with each other...
+ info.history = sources.begin()->second->info.history;
+
+ // we use the last_epoch_{started,clean} we got from
+ // the caller, which are the epochs that were reported when the PGs were
+ // found to be ready for merge.
+ info.history.last_epoch_clean = last_pg_merge_meta.last_epoch_clean;
+ info.history.last_epoch_started = last_pg_merge_meta.last_epoch_started;
+ info.last_epoch_started = last_pg_merge_meta.last_epoch_started;
+ psdout(10) << __func__
+ << " set les/c to " << last_pg_merge_meta.last_epoch_started << "/"
+ << last_pg_merge_meta.last_epoch_clean
+ << " from pool last_dec_*, source pg history was "
+ << sources.begin()->second->info.history
+ << dendl;
+
+ // above we pulled down the source's history, so we need to check
+ // history.epoch_created again to confirm that the source is not a
+ // placeholder too. (peering requires a sane history.same_interval_since
+ // value for any pg that is not newly created, and below we are essentially
+ // iterating back through a series of past maps to fake a merge process,
+ // hence we need to fix history.same_interval_since first so that
+ // start_peering_interval() will not complain)
+ if (info.history.epoch_created == 0) {
+ dout(10) << __func__ << " both merge target and source are placeholders,"
+ << " set sis to lec " << info.history.last_epoch_clean
+ << dendl;
+ info.history.same_interval_since = info.history.last_epoch_clean;
+ }
+
+ // if the past_intervals start is later than last_epoch_clean, it
+ // implies the source repeered again but the target didn't, or
+ // that the source became clean in a later epoch than the target.
+ // avoid the discrepancy by adjusting the interval start
+ // backwards to match so that check_past_interval_bounds() will
+ // not complain.
+ auto pib = past_intervals.get_bounds();
+ if (info.history.last_epoch_clean < pib.first) {
+ psdout(10) << __func__ << " last_epoch_clean "
+ << info.history.last_epoch_clean << " < past_interval start "
+ << pib.first << ", adjusting start backwards" << dendl;
+ past_intervals.adjust_start_backwards(info.history.last_epoch_clean);
+ }
+
+ // Similarly, if the same_interval_since value is later than
+ // last_epoch_clean, the next interval change will result in a
+ // past_interval start that is later than last_epoch_clean. This
+ // can happen if we use the pg_history values from the merge
+ // source. Adjust the same_interval_since value backwards if that
+ // happens. (We trust the les and lec values more because they came from
+ // the real target, whereas the history value we stole from the source.)
+ if (info.history.last_epoch_started < info.history.same_interval_since) {
+ psdout(10) << __func__ << " last_epoch_started "
+ << info.history.last_epoch_started << " < same_interval_since "
+ << info.history.same_interval_since
+ << ", adjusting pg_history backwards" << dendl;
+ info.history.same_interval_since = info.history.last_epoch_clean;
+ // make sure same_{up,primary}_since are <= same_interval_since
+ info.history.same_up_since = std::min(
+ info.history.same_up_since, info.history.same_interval_since);
+ info.history.same_primary_since = std::min(
+ info.history.same_primary_since, info.history.same_interval_since);
+ }
+ }
+
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PeeringState::start_split_stats(
+ const set<spg_t>& childpgs, vector<object_stat_sum_t> *out)
+{
+ out->resize(childpgs.size() + 1);
+ info.stats.stats.sum.split(*out);
+}
+
+void PeeringState::finish_split_stats(
+ const object_stat_sum_t& stats, ObjectStore::Transaction &t)
+{
+ info.stats.stats.sum = stats;
+ write_if_dirty(t);
+}
+
+void PeeringState::update_blocked_by()
+{
+ // set a max on the number of blocking peers we report. if we go
+ // over, report a random subset. keep the result sorted.
+ unsigned keep = std::min<unsigned>(
+ blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
+ unsigned skip = blocked_by.size() - keep;
+ info.stats.blocked_by.clear();
+ info.stats.blocked_by.resize(keep);
+ unsigned pos = 0;
+ for (auto p = blocked_by.begin(); p != blocked_by.end() && keep > 0; ++p) {
+ if (skip > 0 && (rand() % (skip + keep) < skip)) {
+ --skip;
+ } else {
+ info.stats.blocked_by[pos++] = *p;
+ --keep;
+ }
+ }
+}
+
+static bool find_shard(const set<pg_shard_t> & pgs, shard_id_t shard)
+{
+ for (auto&p : pgs)
+ if (p.shard == shard)
+ return true;
+ return false;
+}
+
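+// return a shard from pgs (other than skip) with the given shard id, or a
+// default-constructed pg_shard_t if none exists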
+static pg_shard_t get_another_shard(const set<pg_shard_t> & pgs, pg_shard_t skip, shard_id_t shard)
+{
+ for (auto&p : pgs) {
+ if (p == skip)
+ continue;
+ if (p.shard == shard)
+ return p;
+ }
+ return pg_shard_t();
+}
+
+void PeeringState::update_calc_stats()
+{
+ info.stats.version = info.last_update;
+ info.stats.created = info.history.epoch_created;
+ info.stats.last_scrub = info.history.last_scrub;
+ info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
+ info.stats.last_deep_scrub = info.history.last_deep_scrub;
+ info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
+ info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
+ info.stats.last_epoch_clean = info.history.last_epoch_clean;
+
+ info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
+ info.stats.ondisk_log_size = info.stats.log_size;
+ info.stats.log_start = pg_log.get_tail();
+ info.stats.ondisk_log_start = pg_log.get_tail();
+ info.stats.snaptrimq_len = pl->get_snap_trimq_size();
+
+ unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
+
+ // In the rare case that upset is too large (usually transient), use it as
+ // the target for the calculations below.
+ unsigned target = std::max(num_shards, (unsigned)upset.size());
+ // For undersized PGs, the actingset may be larger with OSDs out
+ unsigned nrep = std::max(actingset.size(), upset.size());
+ // calc num_object_copies
+ info.stats.stats.calc_copies(std::max(target, nrep));
+ info.stats.stats.sum.num_objects_degraded = 0;
+ info.stats.stats.sum.num_objects_unfound = 0;
+ info.stats.stats.sum.num_objects_misplaced = 0;
+ info.stats.avail_no_missing.clear();
+ info.stats.object_location_counts.clear();
+
+ // We should never hit this condition, but if we end up hitting it,
+ // make sure to update num_objects and set PG_STATE_INCONSISTENT.
+ if (info.stats.stats.sum.num_objects < 0) {
+ psdout(0) << __func__ << " negative num_objects = "
+ << info.stats.stats.sum.num_objects << " setting it to 0 "
+ << dendl;
+ info.stats.stats.sum.num_objects = 0;
+ state_set(PG_STATE_INCONSISTENT);
+ }
+
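+ // degraded/misplaced/unfound are only computed when the PG is remapped,
+ // undersized, or not clean, and is peered or activating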
+ if ((is_remapped() || is_undersized() || !is_clean()) &&
+ (is_peered()|| is_activating())) {
+ psdout(20) << __func__ << " actingset " << actingset << " upset "
+ << upset << " acting_recovery_backfill " << acting_recovery_backfill << dendl;
+
+ ceph_assert(!acting_recovery_backfill.empty());
+
+ bool estimate = false;
+
+ // NOTE: we only generate degraded, misplaced and unfound
+ // values for the summation, not individual stat categories.
+ int64_t num_objects = info.stats.stats.sum.num_objects;
+
+ // Objects missing from up nodes, sorted by # objects.
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
+ // Objects missing from nodes not in up, sorted by # objects.
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
+
+ // Fill missing_target_objects/acting_source_objects
+
+ {
+ int64_t missing;
+
+ // Primary first
+ missing = pg_log.get_missing().num_missing();
+ ceph_assert(acting_recovery_backfill.count(pg_whoami));
+ if (upset.count(pg_whoami)) {
+ missing_target_objects.emplace(missing, pg_whoami);
+ } else {
+ acting_source_objects.emplace(missing, pg_whoami);
+ }
+ info.stats.stats.sum.num_objects_missing_on_primary = missing;
+ if (missing == 0)
+ info.stats.avail_no_missing.push_back(pg_whoami);
+ psdout(20) << __func__ << " shard " << pg_whoami
+ << " primary objects " << num_objects
+ << " missing " << missing
+ << dendl;
+ }
+
+ // All other peers
+ for (auto& peer : peer_info) {
+ // Primary should not be in the peer_info, skip if it is.
+ if (peer.first == pg_whoami) continue;
+ int64_t missing = 0;
+ int64_t peer_num_objects =
+ std::max((int64_t)0, peer.second.stats.stats.sum.num_objects);
+ // Backfill targets always track num_objects accurately;
+ // all other peers track missing accurately.
+ if (is_backfill_target(peer.first)) {
+ missing = std::max((int64_t)0, num_objects - peer_num_objects);
+ } else {
+ if (peer_missing.count(peer.first)) {
+ missing = peer_missing[peer.first].num_missing();
+ } else {
+ psdout(20) << __func__ << " no peer_missing found for "
+ << peer.first << dendl;
+ if (is_recovering()) {
+ estimate = true;
+ }
+ missing = std::max((int64_t)0, num_objects - peer_num_objects);
+ }
+ }
+ if (upset.count(peer.first)) {
+ missing_target_objects.emplace(missing, peer.first);
+ } else if (actingset.count(peer.first)) {
+ acting_source_objects.emplace(missing, peer.first);
+ }
+ peer.second.stats.stats.sum.num_objects_missing = missing;
+ if (missing == 0)
+ info.stats.avail_no_missing.push_back(peer.first);
+ psdout(20) << __func__ << " shard " << peer.first
+ << " objects " << peer_num_objects
+ << " missing " << missing
+ << dendl;
+ }
+
+ // Compute object_location_counts
+ for (auto& ml: missing_loc.get_missing_locs()) {
+ info.stats.object_location_counts[ml.second]++;
+ psdout(30) << __func__ << " " << ml.first << " object_location_counts["
+ << ml.second << "]=" << info.stats.object_location_counts[ml.second]
+ << dendl;
+ }
+ int64_t not_missing = num_objects - missing_loc.get_missing_locs().size();
+ if (not_missing) {
+ // During recovery we know upset == actingset and is being populated
+ // During backfill we know that all non-missing objects are in the actingset
+ info.stats.object_location_counts[actingset] = not_missing;
+ }
+ psdout(30) << __func__ << " object_location_counts["
+ << upset << "]=" << info.stats.object_location_counts[upset]
+ << dendl;
+ psdout(20) << __func__ << " object_location_counts "
+ << info.stats.object_location_counts << dendl;
+
+ // A misplaced object is not stored on the correct OSD
+ int64_t misplaced = 0;
+ // a degraded object has fewer replicas or EC shards than the pool specifies.
+ int64_t degraded = 0;
+
+ if (is_recovering()) {
+ for (auto& sml: missing_loc.get_missing_by_count()) {
+ for (auto& ml: sml.second) {
+ int missing_shards;
+ if (sml.first == shard_id_t::NO_SHARD) {
+ psdout(20) << __func__ << " ml " << ml.second
+ << " upset size " << upset.size()
+ << " up " << ml.first.up << dendl;
+ missing_shards = (int)upset.size() - ml.first.up;
+ } else {
+ // Handle shards not even in upset below
+ if (!find_shard(upset, sml.first))
+ continue;
+ missing_shards = std::max(0, 1 - ml.first.up);
+ psdout(20) << __func__
+ << " shard " << sml.first
+ << " ml " << ml.second
+ << " missing shards " << missing_shards << dendl;
+ }
+ int odegraded = ml.second * missing_shards;
+ // Copies on other osds but limited to the possible degraded
+ int more_osds = std::min(missing_shards, ml.first.other);
+ int omisplaced = ml.second * more_osds;
+ ceph_assert(omisplaced <= odegraded);
+ odegraded -= omisplaced;
+
+ misplaced += omisplaced;
+ degraded += odegraded;
+ }
+ }
+
+ psdout(20) << __func__ << " missing based degraded "
+ << degraded << dendl;
+ psdout(20) << __func__ << " missing based misplaced "
+ << misplaced << dendl;
+
+ // Handle undersized case
+ if (pool.info.is_replicated()) {
+ // Add degraded for missing targets (num_objects missing)
+ ceph_assert(target >= upset.size());
+ unsigned needed = target - upset.size();
+ degraded += num_objects * needed;
+ } else {
+ for (unsigned i = 0 ; i < num_shards; ++i) {
+ shard_id_t shard(i);
+
+ if (!find_shard(upset, shard)) {
+ pg_shard_t pgs = get_another_shard(actingset, pg_shard_t(), shard);
+
+ if (pgs != pg_shard_t()) {
+ int64_t missing;
+
+ if (pgs == pg_whoami)
+ missing = info.stats.stats.sum.num_objects_missing_on_primary;
+ else
+ missing = peer_info[pgs].stats.stats.sum.num_objects_missing;
+
+ degraded += missing;
+ misplaced += std::max((int64_t)0, num_objects - missing);
+ } else {
+ // No shard anywhere
+ degraded += num_objects;
+ }
+ }
+ }
+ }
+ goto out;
+ }
+
+ // Handle undersized case
+ if (pool.info.is_replicated()) {
+ // Add to missing_target_objects
+ ceph_assert(target >= missing_target_objects.size());
+ unsigned needed = target - missing_target_objects.size();
+ if (needed)
+ missing_target_objects.emplace(num_objects * needed, pg_shard_t(pg_shard_t::NO_OSD));
+ } else {
+ for (unsigned i = 0 ; i < num_shards; ++i) {
+ shard_id_t shard(i);
+ bool found = false;
+ for (const auto& t : missing_target_objects) {
+ if (std::get<1>(t).shard == shard) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ missing_target_objects.emplace(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard));
+ }
+ }
+
+ for (const auto& item : missing_target_objects)
+ psdout(20) << __func__ << " missing shard " << std::get<1>(item)
+ << " missing= " << std::get<0>(item) << dendl;
+ for (const auto& item : acting_source_objects)
+ psdout(20) << __func__ << " acting shard " << std::get<1>(item)
+ << " missing= " << std::get<0>(item) << dendl;
+
+ // Handle all objects not in missing for remapped
+ // or backfill
+ for (auto m = missing_target_objects.rbegin();
+ m != missing_target_objects.rend(); ++m) {
+
+ int64_t extra_missing = -1;
+
+ if (pool.info.is_replicated()) {
+ if (!acting_source_objects.empty()) {
+ auto extra_copy = acting_source_objects.begin();
+ extra_missing = std::get<0>(*extra_copy);
+ acting_source_objects.erase(extra_copy);
+ }
+ } else { // Erasure coded
+ // Use corresponding shard
+ for (const auto& a : acting_source_objects) {
+ if (std::get<1>(a).shard == std::get<1>(*m).shard) {
+ extra_missing = std::get<0>(a);
+ acting_source_objects.erase(a);
+ break;
+ }
+ }
+ }
+
+ if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
+ // We don't know which of the objects on the target
+ // are part of extra_missing so assume they are all degraded.
+ misplaced += std::get<0>(*m) - extra_missing;
+ degraded += extra_missing;
+ } else {
+ // 1. extra_missing == -1, more targets than sources so degraded
+ // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
+ // previously degraded are now present on the target.
+ degraded += std::get<0>(*m);
+ }
+ }
+ // If there are still acting that haven't been accounted for
+ // then they are misplaced
+ for (const auto& a : acting_source_objects) {
+ int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
+ psdout(20) << __func__ << " extra acting misplaced " << extra_misplaced
+ << dendl;
+ misplaced += extra_misplaced;
+ }
+out:
+ // NOTE: Tests use these messages to verify this code
+ psdout(20) << __func__ << " degraded " << degraded
+ << (estimate ? " (est)": "") << dendl;
+ psdout(20) << __func__ << " misplaced " << misplaced
+ << (estimate ? " (est)": "")<< dendl;
+
+ info.stats.stats.sum.num_objects_degraded = degraded;
+ info.stats.stats.sum.num_objects_unfound = get_num_unfound();
+ info.stats.stats.sum.num_objects_misplaced = misplaced;
+ }
+}
+
+std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
+ bool pg_stats_publish_valid,
+ const pg_stat_t &pg_stats_publish,
+ const object_stat_collection_t &unstable_stats)
+{
+ if (info.stats.stats.sum.num_scrub_errors) {
+ psdout(10) << __func__ << " inconsistent due to " <<
+ info.stats.stats.sum.num_scrub_errors << " scrub errors" << dendl;
+ state_set(PG_STATE_INCONSISTENT);
+ } else {
+ state_clear(PG_STATE_INCONSISTENT);
+ state_clear(PG_STATE_FAILED_REPAIR);
+ }
+
+ utime_t now = ceph_clock_now();
+ if (info.stats.state != state) {
+ info.stats.last_change = now;
+ // Optimistic estimate: if we just found out a PG is inactive,
+ // assume it was active until now.
+ if (!(state & PG_STATE_ACTIVE) &&
+ (info.stats.state & PG_STATE_ACTIVE))
+ info.stats.last_active = now;
+
+ if ((state & PG_STATE_ACTIVE) &&
+ !(info.stats.state & PG_STATE_ACTIVE))
+ info.stats.last_became_active = now;
+ if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
+ !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
+ info.stats.last_became_peered = now;
+ info.stats.state = state;
+ }
+
+ update_calc_stats();
+ if (info.stats.stats.sum.num_objects_degraded) {
+ state_set(PG_STATE_DEGRADED);
+ } else {
+ state_clear(PG_STATE_DEGRADED);
+ }
+ update_blocked_by();
+
+ pg_stat_t pre_publish = info.stats;
+ pre_publish.stats.add(unstable_stats);
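+ // stats whose last_fresh is older than this cutoff are republished even if unchanged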
+ utime_t cutoff = now;
+ cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
+
+ // share (some of) our purged_snaps via the pg_stats. limit # of intervals
+ // because we don't want to make the pg_stat_t structures too expensive.
+ unsigned max = cct->_conf->osd_max_snap_prune_intervals_per_epoch;
+ unsigned num = 0;
+ auto i = info.purged_snaps.begin();
+ while (num < max && i != info.purged_snaps.end()) {
+ pre_publish.purged_snaps.insert(i.get_start(), i.get_len());
+ ++num;
+ ++i;
+ }
+ psdout(20) << __func__ << " reporting purged_snaps "
+ << pre_publish.purged_snaps << dendl;
+
+ if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
+ info.stats.last_fresh > cutoff) {
+ psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
+ << ": no change since " << info.stats.last_fresh << dendl;
+ return std::nullopt;
+ } else {
+ // update our stat summary and timestamps
+ info.stats.reported_epoch = get_osdmap_epoch();
+ ++info.stats.reported_seq;
+
+ info.stats.last_fresh = now;
+
+ if (info.stats.state & PG_STATE_CLEAN)
+ info.stats.last_clean = now;
+ if (info.stats.state & PG_STATE_ACTIVE)
+ info.stats.last_active = now;
+ if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
+ info.stats.last_peered = now;
+ info.stats.last_unstale = now;
+ if ((info.stats.state & PG_STATE_DEGRADED) == 0)
+ info.stats.last_undegraded = now;
+ if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
+ info.stats.last_fullsized = now;
+
+ psdout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
+ << ":" << pg_stats_publish.reported_seq << dendl;
+ return std::make_optional(std::move(pre_publish));
+ }
+}
+
+void PeeringState::init(
+ int role,
+ const vector<int>& newup, int new_up_primary,
+ const vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t)
+{
+ psdout(10) << "init role " << role << " up "
+ << newup << " acting " << newacting
+ << " history " << history
+ << " past_intervals " << pi
+ << dendl;
+
+ set_role(role);
+ init_primary_up_acting(
+ newup,
+ newacting,
+ new_up_primary,
+ new_acting_primary);
+
+ info.history = history;
+ past_intervals = pi;
+
+ info.stats.up = up;
+ info.stats.up_primary = new_up_primary;
+ info.stats.acting = acting;
+ info.stats.acting_primary = new_acting_primary;
+ info.stats.mapping_epoch = info.history.same_interval_since;
+
+ if (!perform_deletes_during_peering()) {
+ pg_log.set_missing_may_contain_deletes();
+ }
+
+ if (backfill) {
+ psdout(10) << __func__ << ": Setting backfill" << dendl;
+ info.set_last_backfill(hobject_t());
+ info.last_complete = info.last_update;
+ pg_log.mark_log_for_rewrite();
+ }
+
+ on_new_interval();
+
+ dirty_info = true;
+ dirty_big_info = true;
+ write_if_dirty(t);
+}
+
+void PeeringState::dump_peering_state(Formatter *f)
+{
+ f->dump_string("state", get_pg_state_string());
+ f->dump_unsigned("epoch", get_osdmap_epoch());
+ f->open_array_section("up");
+ for (auto p = up.begin(); p != up.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.begin(); p != acting.end(); ++p)
+ f->dump_unsigned("osd", *p);
+ f->close_section();
+ if (!backfill_targets.empty()) {
+ f->open_array_section("backfill_targets");
+ for (auto p = backfill_targets.begin(); p != backfill_targets.end(); ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ if (!async_recovery_targets.empty()) {
+ f->open_array_section("async_recovery_targets");
+ for (auto p = async_recovery_targets.begin();
+ p != async_recovery_targets.end();
+ ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ if (!acting_recovery_backfill.empty()) {
+ f->open_array_section("acting_recovery_backfill");
+ for (auto p = acting_recovery_backfill.begin();
+ p != acting_recovery_backfill.end();
+ ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ }
+ f->open_object_section("info");
+ update_calc_stats();
+ info.dump(f);
+ f->close_section();
+
+ f->open_array_section("peer_info");
+ for (auto p = peer_info.begin(); p != peer_info.end(); ++p) {
+ f->open_object_section("info");
+ f->dump_stream("peer") << p->first;
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void PeeringState::update_stats(
+ std::function<bool(pg_history_t &, pg_stat_t &)> f,
+ ObjectStore::Transaction *t) {
+ if (f(info.history, info.stats)) {
+ pl->publish_stats_to_osd();
+ }
+ pl->on_info_history_change();
+
+ if (t) {
+ dirty_info = true;
+ write_if_dirty(*t);
+ }
+}
+
+bool PeeringState::append_log_entries_update_missing(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t, std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to)
+{
+ ceph_assert(!entries.empty());
+ ceph_assert(entries.begin()->version > info.last_update);
+
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ bool invalidate_stats =
+ pg_log.append_new_log_entries(
+ info.last_backfill,
+ entries,
+ rollbacker.get());
+
+ if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
+ pg_log.roll_forward(rollbacker.get());
+ }
+ if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(*roll_forward_to, rollbacker.get());
+ last_rollback_info_trimmed_to_applied = *roll_forward_to;
+ }
+
+ info.last_update = pg_log.get_head();
+
+ if (pg_log.get_missing().num_missing() == 0) {
+ // advance last_complete since nothing else is missing!
+ info.last_complete = info.last_update;
+ }
+ info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
+
+ psdout(20) << __func__ << " trim_to bool = " << bool(trim_to)
+ << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
+ if (trim_to)
+ pg_log.trim(*trim_to, info);
+ dirty_info = true;
+ write_if_dirty(t);
+ return invalidate_stats;
+}
+
+void PeeringState::merge_new_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to)
+{
+ psdout(10) << __func__ << " " << entries << dendl;
+ ceph_assert(is_primary());
+
+ bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
+ for (auto i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ pg_shard_t peer(*i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(peer_missing.count(peer));
+ ceph_assert(peer_info.count(peer));
+ pg_missing_t& pmissing(peer_missing[peer]);
+ psdout(20) << __func__ << " peer_missing for " << peer
+ << " = " << pmissing << dendl;
+ pg_info_t& pinfo(peer_info[peer]);
+ bool invalidate_stats = PGLog::append_log_entries_update_missing(
+ pinfo.last_backfill,
+ entries,
+ true,
+ NULL,
+ pmissing,
+ NULL,
+ dpp);
+ pinfo.last_update = info.last_update;
+ pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
+ rebuild_missing = rebuild_missing || invalidate_stats;
+ }
+
+ if (!rebuild_missing) {
+ return;
+ }
+
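+ // the missing sets changed; rebuild missing_loc for each object touched by these entries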
+ for (auto &&i: entries) {
+ missing_loc.rebuild(
+ i.soid,
+ pg_whoami,
+ acting_recovery_backfill,
+ info,
+ pg_log.get_missing(),
+ peer_missing,
+ peer_info);
+ }
+}
+
+void PeeringState::add_log_entry(const pg_log_entry_t& e, bool applied)
+{
+ // raise last_complete only if we were previously up to date
+ if (info.last_complete == info.last_update)
+ info.last_complete = e.version;
+
+ // raise last_update.
+ ceph_assert(e.version > info.last_update);
+ info.last_update = e.version;
+
+ // raise user_version, if it increased (it may not have been bumped
+ // by all logged updates)
+ if (e.user_version > info.last_user_version)
+ info.last_user_version = e.user_version;
+
+ // log mutation
+ pg_log.add(e, applied);
+ psdout(10) << "add_log_entry " << e << dendl;
+}
+
+
+void PeeringState::append_log(
+ vector<pg_log_entry_t>&& logv,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ eversion_t mlcod,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async)
+{
+ /* The primary has sent an info updating the history, but it may not
+ * have arrived yet. We want to make sure that we cannot remember this
+ * write without remembering that it happened in an interval which went
+ * active in epoch history.last_epoch_started.
+ */
+ if (info.last_epoch_started != info.history.last_epoch_started) {
+ info.history.last_epoch_started = info.last_epoch_started;
+ }
+ if (info.last_interval_started != info.history.last_interval_started) {
+ info.history.last_interval_started = info.last_interval_started;
+ }
+ psdout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
+
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
+ if (!transaction_applied) {
+ /* We must be a backfill or async recovery peer, so it's ok if we apply
+ * out-of-turn since we won't be considered when
+ * determining a min possible last_update.
+ *
+ * We skip_rollforward() here, which advances the crt without
+ * doing an actual rollforward. This avoids cleaning up entries
+ * from the backend, so we do not end up in a situation where the
+ * object is deleted before we can _merge_object_divergent_entries().
+ */
+ pg_log.skip_rollforward();
+ }
+
+ for (auto p = logv.begin(); p != logv.end(); ++p) {
+ add_log_entry(*p, transaction_applied);
+
+ /* We don't want to leave the rollforward artifacts around
+ * here past last_backfill. It's ok for the same reason as
+ * above */
+ if (transaction_applied &&
+ p->soid > info.last_backfill) {
+ pg_log.roll_forward(handler.get());
+ }
+ }
+ if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(
+ roll_forward_to,
+ handler.get());
+ last_rollback_info_trimmed_to_applied = roll_forward_to;
+ }
+
+ psdout(10) << __func__ << " approx pg log length = "
+ << pg_log.get_log().approx_size() << dendl;
+ psdout(10) << __func__ << " dups pg log length = "
+ << pg_log.get_log().dups.size() << dendl;
+ psdout(10) << __func__ << " transaction_applied = "
+ << transaction_applied << dendl;
+ if (!transaction_applied || async)
+ psdout(10) << __func__ << " " << pg_whoami
+ << " is async_recovery or backfill target" << dendl;
+ pg_log.trim(trim_to, info, transaction_applied, async);
+
+ // update the local pg, pg log
+ dirty_info = true;
+ write_if_dirty(t);
+
+ if (!is_primary())
+ min_last_complete_ondisk = mlcod;
+}
+
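+// Record that a missing object has been recovered locally at version v:
+// roll the log forward past v if needed, let pg_log.recover_got() update
+// the missing set and last_complete, and (on the primary) register this
+// osd as a location for the object unless the recovery was a delete.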
+void PeeringState::recover_got(
+ const hobject_t &oid, eversion_t v,
+ bool is_delete,
+ ObjectStore::Transaction &t)
+{
+ if (v > pg_log.get_can_rollback_to()) {
+ /* This can only happen during a repair, and even then, it would
+ * be one heck of a race. If we are repairing the object, the
+ * write in question must be fully committed, so it's not valid
+ * to roll it back anyway (and we'll be rolled forward shortly
+ * anyway) */
+ PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
+ pg_log.roll_forward_to(v, handler.get());
+ }
+
+ psdout(10) << "got missing " << oid << " v " << v << dendl;
+ pg_log.recover_got(oid, v, info);
+ if (pg_log.get_log().log.empty()) {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " while log is empty" << dendl;
+ } else if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " log.complete_to " << pg_log.get_log().complete_to->version
+ << dendl;
+ } else {
+ psdout(10) << "last_complete now " << info.last_complete
+ << " log.complete_to at end" << dendl;
+ //below is not true in the repair case.
+ //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
+ ceph_assert(info.last_complete == info.last_update);
+ }
+
+ if (is_primary()) {
+ ceph_assert(missing_loc.needs_recovery(oid));
+ if (!is_delete)
+ missing_loc.add_location(oid, pg_whoami);
+ }
+
+ // update pg
+ dirty_info = true;
+ write_if_dirty(t);
+}
+
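+// Advance last_backfill and adopt the provided stats; when
+// preserve_local_num_bytes is set, keep the locally tracked num_bytes
+// instead of the value carried in updated_stats.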
+void PeeringState::update_backfill_progress(
+ const hobject_t &updated_backfill,
+ const pg_stat_t &updated_stats,
+ bool preserve_local_num_bytes,
+ ObjectStore::Transaction &t) {
+ info.set_last_backfill(updated_backfill);
+ if (preserve_local_num_bytes) {
+ psdout(25) << __func__ << " primary " << updated_stats.stats.sum.num_bytes
+ << " local " << info.stats.stats.sum.num_bytes << dendl;
+ int64_t bytes = info.stats.stats.sum.num_bytes;
+ info.stats = updated_stats;
+ info.stats.stats.sum.num_bytes = bytes;
+ } else {
+ psdout(20) << __func__ << " final " << updated_stats.stats.sum.num_bytes
+ << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
+ info.stats = updated_stats;
+ }
+
+ dirty_info = true;
+ write_if_dirty(t);
+}
+
+void PeeringState::adjust_purged_snaps(
+ std::function<void(interval_set<snapid_t> &snaps)> f) {
+ f(info.purged_snaps);
+ dirty_info = true;
+ dirty_big_info = true;
+}
+
+void PeeringState::on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &soid,
+ const eversion_t &version)
+{
+ pl->publish_stats_to_osd();
+ // done!
+ peer_missing[peer].got(soid, version);
+ missing_loc.add_location(soid, peer);
+}
+
+void PeeringState::begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t soid)
+{
+ peer_missing[peer].revise_have(soid, eversion_t());
+}
+
+void PeeringState::force_object_missing(
+ const set<pg_shard_t> &peers,
+ const hobject_t &soid,
+ eversion_t version)
+{
+ for (auto &&peer : peers) {
+ if (peer != primary) {
+ peer_missing[peer].add(soid, version, eversion_t(), false);
+ } else {
+ pg_log.missing_add(soid, version, eversion_t());
+ pg_log.reset_complete_to(&info);
+ pg_log.set_last_requested(0);
+ }
+ }
+
+ missing_loc.rebuild(
+ soid,
+ pg_whoami,
+ acting_recovery_backfill,
+ info,
+ pg_log.get_missing(),
+ peer_missing,
+ peer_info);
+}
+
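+// Called just before an op is submitted: if at_version is set, bump
+// last_update (and last_complete, if the peer was caught up) in peer_info
+// for every non-primary member of acting_recovery_backfill; then, for
+// async recovery targets already missing hoid, fold the new log events
+// into their missing sets and recompute missing_loc locations for the
+// affected objects from the acting set.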
+void PeeringState::pre_submit_op(
+ const hobject_t &hoid,
+ const vector<pg_log_entry_t>& logv,
+ eversion_t at_version)
+{
+ if (at_version > eversion_t()) {
+ for (auto &&i : get_acting_recovery_backfill()) {
+ if (i == primary) continue;
+ pg_info_t &pinfo = peer_info[i];
+ // keep peer_info up to date
+ if (pinfo.last_complete == pinfo.last_update)
+ pinfo.last_complete = at_version;
+ pinfo.last_update = at_version;
+ }
+ }
+
+ bool requires_missing_loc = false;
+ for (auto &&i : get_async_recovery_targets()) {
+ if (i == primary || !get_peer_missing(i).is_missing(hoid))
+ continue;
+ requires_missing_loc = true;
+ for (auto &&entry: logv) {
+ peer_missing[i].add_next_event(entry);
+ }
+ }
+
+ if (requires_missing_loc) {
+ for (auto &&entry: logv) {
+ psdout(30) << __func__ << " missing_loc before: "
+ << missing_loc.get_locations(entry.soid) << dendl;
+ missing_loc.add_missing(entry.soid, entry.version,
+ eversion_t(), entry.is_delete());
+ // clear out missing_loc
+ missing_loc.clear_location(entry.soid);
+ for (auto &i: get_actingset()) {
+ if (!get_peer_missing(i).is_missing(entry.soid))
+ missing_loc.add_location(entry.soid, i);
+ }
+ psdout(30) << __func__ << " missing_loc after: "
+ << missing_loc.get_locations(entry.soid) << dendl;
+ }
+ }
+}
+
+void PeeringState::recovery_committed_to(eversion_t version)
+{
+ psdout(10) << __func__ << " version " << version
+ << " now ondisk" << dendl;
+ last_complete_ondisk = version;
+
+ if (last_complete_ondisk == info.last_update) {
+ if (!is_primary()) {
+ // Either we are a replica or backfill target.
+ // we are fully up to date. tell the primary!
+ pl->send_cluster_message(
+ get_primary().osd,
+ make_message<MOSDPGTrim>(
+ get_osdmap_epoch(),
+ spg_t(info.pgid.pgid, primary.shard),
+ last_complete_ondisk),
+ get_osdmap_epoch());
+ } else {
+ calc_min_last_complete_ondisk();
+ }
+ }
+}
+
+void PeeringState::complete_write(eversion_t v, eversion_t lc)
+{
+ last_update_ondisk = v;
+ last_complete_ondisk = lc;
+ calc_min_last_complete_ondisk();
+}
+
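+// Compute pg_trim_to: trim the oldest log entries down toward the target
+// length, never trimming past min(min_last_complete_ondisk,
+// can_rollback_to) and never more than osd_pg_log_trim_max entries at once
+// (nor fewer than osd_pg_log_trim_min).  For example (hypothetical
+// values): target = 3000, approx_size = 3500, trim_max = 10000 gives
+// num_to_trim = min(500, 10000) = 500, so pg_trim_to becomes the version
+// of the 500th-oldest entry, clamped to the limit above.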
+void PeeringState::calc_trim_to()
+{
+ size_t target = pl->get_target_pg_log_entries();
+
+ eversion_t limit = std::min(
+ min_last_complete_ondisk,
+ pg_log.get_can_rollback_to());
+ if (limit != eversion_t() &&
+ limit != pg_trim_to &&
+ pg_log.get_log().approx_size() > target) {
+ size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
+ if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+ return;
+ }
+ auto it = pg_log.get_log().log.begin();
+ eversion_t new_trim_to;
+ for (size_t i = 0; i < num_to_trim; ++i) {
+ new_trim_to = it->version;
+ ++it;
+ if (new_trim_to > limit) {
+ new_trim_to = limit;
+ psdout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
+ break;
+ }
+ }
+ psdout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
+ pg_trim_to = new_trim_to;
+ ceph_assert(pg_trim_to <= pg_log.get_head());
+ ceph_assert(pg_trim_to <= min_last_complete_ondisk);
+ }
+}
+
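+// A more aggressive variant of calc_trim_to: the limit is
+// min(head, can_rollback_to, last_update_ondisk).  The log is walked from
+// both ends at once to find by_n_to_keep (the version below which only the
+// newest `target` entries remain) and by_n_to_trim (the version of the
+// num_to_trim-th oldest entry); pg_trim_to is the smallest of those two
+// and the limit, i.e. the most conservative bound.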
+void PeeringState::calc_trim_to_aggressive()
+{
+ size_t target = pl->get_target_pg_log_entries();
+
+ // limit pg log trimming up to the can_rollback_to value
+ eversion_t limit = std::min({
+ pg_log.get_head(),
+ pg_log.get_can_rollback_to(),
+ last_update_ondisk});
+ psdout(10) << __func__ << " limit = " << limit << dendl;
+
+ if (limit != eversion_t() &&
+ limit != pg_trim_to &&
+ pg_log.get_log().approx_size() > target) {
+ psdout(10) << __func__ << " approx pg log length = "
+ << pg_log.get_log().approx_size() << dendl;
+ uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
+ cct->_conf->osd_pg_log_trim_max);
+ psdout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
+ if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
+ cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
+ return;
+ }
+ auto it = pg_log.get_log().log.begin(); // oldest log entry
+ auto rit = pg_log.get_log().log.rbegin();
+ eversion_t by_n_to_keep; // start from tail
+ eversion_t by_n_to_trim = eversion_t::max(); // start from head
+ for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
+ i++;
+ if (i > target && by_n_to_keep == eversion_t()) {
+ by_n_to_keep = rit->version;
+ }
+ if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
+ by_n_to_trim = it->version;
+ }
+ if (by_n_to_keep != eversion_t() &&
+ by_n_to_trim != eversion_t::max()) {
+ break;
+ }
+ }
+
+ if (by_n_to_keep == eversion_t()) {
+ return;
+ }
+
+ pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
+ psdout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
+ ceph_assert(pg_trim_to <= pg_log.get_head());
+ }
+}
+
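+// Fold an op's delta stats into our own stats and into the stats of every
+// backfill target whose last_backfill already covers soid.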
+void PeeringState::apply_op_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats)
+{
+ info.stats.stats.add(delta_stats);
+ info.stats.stats.floor(0);
+
+ for (auto i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ pg_info_t& pinfo = peer_info[bt];
+ if (soid <= pinfo.last_backfill)
+ pinfo.stats.stats.add(delta_stats);
+ }
+}
+
+void PeeringState::update_complete_backfill_object_stats(
+ const hobject_t &hoid,
+ const pg_stat_t &stats)
+{
+ for (auto &&bt: get_backfill_targets()) {
+ pg_info_t& pinfo = peer_info[bt];
+ // Add stats to all peers that were still missing the object
+ if (hoid > pinfo.last_backfill)
+ pinfo.stats.add(stats);
+ }
+}
+
+void PeeringState::update_peer_last_backfill(
+ pg_shard_t peer,
+ const hobject_t &new_last_backfill)
+{
+ pg_info_t &pinfo = peer_info[peer];
+ pinfo.last_backfill = new_last_backfill;
+ if (new_last_backfill.is_max()) {
+ /* pinfo.stats might be wrong if we did log-based recovery on the
+ * backfilled portion in addition to continuing backfill.
+ */
+ pinfo.stats = info.stats;
+ }
+}
+
+void PeeringState::set_revert_with_targets(
+ const hobject_t &soid,
+ const set<pg_shard_t> &good_peers)
+{
+ for (auto &&peer: good_peers) {
+ missing_loc.add_location(soid, peer);
+ }
+}
+
+void PeeringState::prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &version,
+ const vector<pg_shard_t> &targets) {
+ for (auto &&peer: targets) {
+ peer_missing[peer].add(soid, version, eversion_t(), false);
+ }
+}
+
+void PeeringState::update_hset(const pg_hit_set_history_t &hset_history)
+{
+ info.hit_set = hset_history;
+}
+
+/*------------ Peering State Machine ----------------*/
+#undef dout_prefix
+#define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
+ << "state<" << get_state_name() << ">: ")
+#undef psdout
+#define psdout(x) ldout(context< PeeringMachine >().cct, x)
+
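+// Pull the PeeringState and PeeringListener out of the enclosing
+// PeeringMachine context; the std::ignore assignments silence
+// unused-variable warnings in handlers that only use one of the two.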
+#define DECLARE_LOCALS \
+ PeeringState *ps = context< PeeringMachine >().state; \
+ std::ignore = ps; \
+ PeeringListener *pl = context< PeeringMachine >().pl; \
+ std::ignore = pl
+
+
+/*------Crashed-------*/
+PeeringState::Crashed::Crashed(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Crashed")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ ceph_abort_msg("we got a bad state machine event");
+}
+
+
+/*------Initial-------*/
+PeeringState::Initial::Initial(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Initial")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::Initial::react(const MNotifyRec& notify)
+{
+ DECLARE_LOCALS;
+ ps->proc_replica_info(
+ notify.from, notify.notify.info, notify.notify.epoch_sent);
+ ps->set_last_peering_reset();
+ return transit< Primary >();
+}
+
+boost::statechart::result PeeringState::Initial::react(const MInfoRec& i)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_primary());
+ post_event(i);
+ return transit< Stray >();
+}
+
+boost::statechart::result PeeringState::Initial::react(const MLogRec& i)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_primary());
+ post_event(i);
+ return transit< Stray >();
+}
+
+void PeeringState::Initial::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_initial_latency, dur);
+}
+
+/*------Started-------*/
+PeeringState::Started::Started(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::Started::react(const IntervalFlush&)
+{
+ psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< PeeringMachine >().state->end_block_outgoing();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Started advmap" << dendl;
+ ps->check_full_transition(advmap.lastmap, advmap.osdmap);
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "should_restart_peering, transitioning to Reset"
+ << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+ ps->remove_down_peer_info(advmap.osdmap);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Started::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Started");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Started::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_started_latency, dur);
+ ps->state_clear(PG_STATE_WAIT | PG_STATE_LAGGY);
+}
+
+/*--------Reset---------*/
+PeeringState::Reset::Reset(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Reset")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->flushes_in_progress = 0;
+ ps->set_last_peering_reset();
+ ps->log_weirdness();
+}
+
+boost::statechart::result
+PeeringState::Reset::react(const IntervalFlush&)
+{
+ psdout(10) << "Ending blocked outgoing recovery messages" << dendl;
+ context< PeeringMachine >().state->end_block_outgoing();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Reset advmap" << dendl;
+
+ ps->check_full_transition(advmap.lastmap, advmap.osdmap);
+
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "should restart peering, calling start_peering_interval again"
+ << dendl;
+ ps->start_peering_interval(
+ advmap.lastmap,
+ advmap.newup, advmap.up_primary,
+ advmap.newacting, advmap.acting_primary,
+ context< PeeringMachine >().get_cur_transaction());
+ }
+ ps->remove_down_peer_info(advmap.osdmap);
+ ps->check_past_interval_bounds();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(),
+ ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+
+ ps->update_heartbeat_peers();
+
+ return transit< Started >();
+}
+
+boost::statechart::result PeeringState::Reset::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Reset::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Reset");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Reset::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_reset_latency, dur);
+}
+
+/*-------Start---------*/
+PeeringState::Start::Start(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Start")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ if (ps->is_primary()) {
+ psdout(1) << "transitioning to Primary" << dendl;
+ post_event(MakePrimary());
+ } else { //is_stray
+ psdout(1) << "transitioning to Stray" << dendl;
+ post_event(MakeStray());
+ }
+}
+
+void PeeringState::Start::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_start_latency, dur);
+}
+
+/*---------Primary--------*/
+PeeringState::Primary::Primary(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ceph_assert(ps->want_acting.empty());
+
+ // set CREATING bit until we have peered for the first time.
+ if (ps->info.history.last_epoch_started == 0) {
+ ps->state_set(PG_STATE_CREATING);
+ // use the history timestamp, which ultimately comes from the
+ // monitor in the create case.
+ utime_t t = ps->info.history.last_scrub_stamp;
+ ps->info.stats.last_fresh = t;
+ ps->info.stats.last_active = t;
+ ps->info.stats.last_change = t;
+ ps->info.stats.last_peered = t;
+ ps->info.stats.last_clean = t;
+ ps->info.stats.last_unstale = t;
+ ps->info.stats.last_undegraded = t;
+ ps->info.stats.last_fullsized = t;
+ ps->info.stats.last_scrub_stamp = t;
+ ps->info.stats.last_deep_scrub_stamp = t;
+ ps->info.stats.last_clean_scrub_stamp = t;
+ }
+}
+
+boost::statechart::result PeeringState::Primary::react(const MNotifyRec& notevt)
+{
+ DECLARE_LOCALS;
+ psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
+ ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ psdout(7) << "handle ActMap primary" << dendl;
+ pl->publish_stats_to_osd();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const SetForceRecovery&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_recovery(true);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const UnsetForceRecovery&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_recovery(false);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const RequestScrub& evt)
+{
+ DECLARE_LOCALS;
+ if (ps->is_primary()) {
+ pl->scrub_requested(evt.deep, evt.repair);
+ psdout(10) << "marking for scrub" << dendl;
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const SetForceBackfill&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_backfill(true);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Primary::react(
+ const UnsetForceBackfill&)
+{
+ DECLARE_LOCALS;
+ ps->set_force_backfill(false);
+ return discard_event();
+}
+
+void PeeringState::Primary::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->want_acting.clear();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_primary_latency, dur);
+ pl->clear_primary_state();
+ ps->state_clear(PG_STATE_CREATING);
+}
+
+/*---------Peering--------*/
+PeeringState::Peering::Peering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering"),
+ history_les_bound(false)
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->is_peered());
+ ceph_assert(!ps->is_peering());
+ ceph_assert(ps->is_primary());
+ ps->state_set(PG_STATE_PEERING);
+}
+
+boost::statechart::result PeeringState::Peering::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Peering advmap" << dendl;
+ if (prior_set.affected_by_map(*(advmap.osdmap), ps->dpp)) {
+ psdout(1) << "Peering, affected_by_map, going to Reset" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ ps->adjust_need_up_thru(advmap.osdmap);
+ ps->check_prior_readable_down_osds(advmap.osdmap);
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Peering::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("past_intervals");
+ ps->past_intervals.dump(q.f);
+ q.f->close_section();
+
+ q.f->open_array_section("probing_osds");
+ for (auto p = prior_set.probe.begin(); p != prior_set.probe.end(); ++p)
+ q.f->dump_stream("osd") << *p;
+ q.f->close_section();
+
+ if (prior_set.pg_down)
+ q.f->dump_string("blocked", "peering is blocked due to down osds");
+
+ q.f->open_array_section("down_osds_we_would_probe");
+ for (auto p = prior_set.down.begin(); p != prior_set.down.end(); ++p)
+ q.f->dump_int("osd", *p);
+ q.f->close_section();
+
+ q.f->open_array_section("peering_blocked_by");
+ for (auto p = prior_set.blocked_by.begin();
+ p != prior_set.blocked_by.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_int("osd", p->first);
+ q.f->dump_int("current_lost_at", p->second);
+ q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ if (history_les_bound) {
+ q.f->open_array_section("peering_blocked_by_detail");
+ q.f->open_object_section("item");
+ q.f->dump_string("detail","peering_blocked_by_history_les_bound");
+ q.f->close_section();
+ q.f->close_section();
+ }
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Peering::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Peering");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Peering::exit()
+{
+
+ DECLARE_LOCALS;
+ psdout(10) << "Leaving Peering" << dendl;
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ ps->state_clear(PG_STATE_PEERING);
+ pl->clear_probe_targets();
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_peering_latency, dur);
+}
+
+
+/*------Backfilling-------*/
+PeeringState::Backfilling::Backfilling(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Backfilling")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ps->backfill_reserved = true;
+ pl->on_backfill_reserved();
+ ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ ps->state_set(PG_STATE_BACKFILLING);
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Backfilling::backfill_release_reservations()
+{
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+ for (auto it = ps->backfill_targets.begin();
+ it != ps->backfill_targets.end();
+ ++it) {
+ ceph_assert(*it != ps->pg_whoami);
+ pl->send_cluster_message(
+ it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::RELEASE,
+ spg_t(ps->info.pgid.pgid, it->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+}
+
+void PeeringState::Backfilling::cancel_backfill()
+{
+ DECLARE_LOCALS;
+ backfill_release_reservations();
+ pl->on_backfill_canceled();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const Backfilled &c)
+{
+ backfill_release_reservations();
+ return transit<Recovered>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const DeferBackfill &c)
+{
+ DECLARE_LOCALS;
+
+ psdout(10) << "defer backfill, retry delay " << c.delay << dendl;
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ c.delay);
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const UnfoundBackfill &c)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "backfill has unfound, can't continue" << dendl;
+ ps->state_set(PG_STATE_BACKFILL_UNFOUND);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const RemoteReservationRevokedTooFull &)
+{
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILLING);
+ cancel_backfill();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ ps->cct->_conf->osd_backfill_retry_interval);
+
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::Backfilling::react(const RemoteReservationRevoked &)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ cancel_backfill();
+ if (ps->needs_backfill()) {
+ return transit<WaitLocalBackfillReserved>();
+ } else {
+ // raced with MOSDPGBackfill::OP_BACKFILL_FINISH, ignore
+ return discard_event();
+ }
+}
+
+void PeeringState::Backfilling::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->backfill_reserved = false;
+ ps->state_clear(PG_STATE_BACKFILLING);
+ ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_backfilling_latency, dur);
+}
+
+/*--WaitRemoteBackfillReserved--*/
+
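+// Remote backfill reservations are acquired one OSD at a time: the
+// constructor posts a RemoteBackfillReserved event to itself, and each
+// grant sends the next MBackfillReserve::REQUEST and advances
+// backfill_osd_it; once the iterator is exhausted, AllBackfillsReserved is
+// posted.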
+PeeringState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteBackfillReserved"),
+ backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ pl->publish_stats_to_osd();
+ post_event(RemoteBackfillReserved());
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+ DECLARE_LOCALS;
+
+ int64_t num_bytes = ps->info.stats.stats.sum.num_bytes;
+ psdout(10) << __func__ << " num_bytes " << num_bytes << dendl;
+ if (backfill_osd_it !=
+ context< Active >().remote_shards_to_reserve_backfill.end()) {
+ // The primary never backfills itself
+ ceph_assert(*backfill_osd_it != ps->pg_whoami);
+ pl->send_cluster_message(
+ backfill_osd_it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REQUEST,
+ spg_t(context< PeeringMachine >().spgid.pgid, backfill_osd_it->shard),
+ ps->get_osdmap_epoch(),
+ ps->get_backfill_priority(),
+ num_bytes,
+ ps->peer_bytes[*backfill_osd_it]),
+ ps->get_osdmap_epoch());
+ ++backfill_osd_it;
+ } else {
+ ps->peer_bytes.clear();
+ post_event(AllBackfillsReserved());
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitRemoteBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitremotebackfillreserved_latency, dur);
+}
+
+void PeeringState::WaitRemoteBackfillReserved::retry()
+{
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+
+ // Release all previously acquired reservations
+ set<pg_shard_t>::const_iterator it, begin, end;
+ begin = context< Active >().remote_shards_to_reserve_backfill.begin();
+ end = context< Active >().remote_shards_to_reserve_backfill.end();
+ ceph_assert(begin != end);
+ for (it = begin; it != backfill_osd_it; ++it) {
+ // The primary never backfills itself
+ ceph_assert(*it != ps->pg_whoami);
+ pl->send_cluster_message(
+ it->osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::RELEASE,
+ spg_t(context< PeeringMachine >().spgid.pgid, it->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ pl->publish_stats_to_osd();
+
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RequestBackfill()),
+ ps->cct->_conf->osd_backfill_retry_interval);
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRejectedTooFull &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_BACKFILL_TOOFULL);
+ retry();
+ return transit<NotBackfilling>();
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteBackfillReserved::react(const RemoteReservationRevoked &evt)
+{
+ retry();
+ return transit<NotBackfilling>();
+}
+
+/*--WaitLocalBackfillReserved--*/
+PeeringState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalBackfillReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_set(PG_STATE_BACKFILL_WAIT);
+ pl->request_local_background_io_reservation(
+ ps->get_backfill_priority(),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ LocalBackfillReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeferBackfill(0.0)));
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::WaitLocalBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitlocalbackfillreserved_latency, dur);
+}
+
+/*----NotBackfilling------*/
+PeeringState::NotBackfilling::NotBackfilling(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotBackfilling")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_REPAIR);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::NotBackfilling::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "NotBackfilling");
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::NotBackfilling::react(const RemoteBackfillReserved &evt)
+{
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::NotBackfilling::react(const RemoteReservationRejectedTooFull &evt)
+{
+ return discard_event();
+}
+
+void PeeringState::NotBackfilling::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_BACKFILL_UNFOUND);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_notbackfilling_latency, dur);
+}
+
+/*----NotRecovering------*/
+PeeringState::NotRecovering::NotRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/NotRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_REPAIR);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::NotRecovering::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "NotRecovering");
+ return discard_event();
+}
+
+void PeeringState::NotRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_RECOVERY_UNFOUND);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_notrecovering_latency, dur);
+}
+
+/*---RepNotRecovering----*/
+PeeringState::RepNotRecovering::RepNotRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepNotRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RejectTooFullRemoteReservation &evt)
+{
+ DECLARE_LOCALS;
+ ps->reject_reservation();
+ post_event(RemoteReservationRejectedTooFull());
+ return discard_event();
+}
+
+void PeeringState::RepNotRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repnotrecovering_latency, dur);
+}
+
+/*---RepWaitRecoveryReserved--*/
+PeeringState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitRecoveryReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
+{
+ DECLARE_LOCALS;
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::GRANT,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return transit<RepRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitRecoveryReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+void PeeringState::RepWaitRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repwaitrecoveryreserved_latency, dur);
+}
+
+/*-RepWaitBackfillReserved*/
+PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepWaitBackfillReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
+{
+
+ DECLARE_LOCALS;
+
+ if (!pl->try_reserve_recovery_space(
+ evt.primary_num_bytes, evt.local_num_bytes)) {
+ post_event(RejectTooFullRemoteReservation());
+ } else {
+ PGPeeringEventURef preempt;
+ if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
+ // older peers will interpret preemption as TOOFULL
+ preempt = std::make_unique<PGPeeringEvent>(
+ pl->get_osdmap_epoch(),
+ pl->get_osdmap_epoch(),
+ RemoteBackfillPreempted());
+ }
+ pl->request_remote_recovery_reservation(
+ evt.priority,
+ std::make_unique<PGPeeringEvent>(
+ pl->get_osdmap_epoch(),
+ pl->get_osdmap_epoch(),
+ RemoteBackfillReserved()),
+ std::move(preempt));
+ }
+ return transit<RepWaitBackfillReserved>();
+}
+
+boost::statechart::result
+PeeringState::RepNotRecovering::react(const RequestRecoveryPrio &evt)
+{
+ DECLARE_LOCALS;
+
+ // fall back to a local reckoning of priority if the primary doesn't pass one
+ // (pre-mimic compat)
+ int prio = evt.priority ? evt.priority : ps->get_recovery_priority();
+
+ PGPeeringEventURef preempt;
+ if (HAVE_FEATURE(ps->upacting_features, RECOVERY_RESERVATION_2)) {
+ // older peers can't handle this
+ preempt = std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RemoteRecoveryPreempted());
+ }
+
+ pl->request_remote_recovery_reservation(
+ prio,
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ RemoteRecoveryReserved()),
+ std::move(preempt));
+ return transit<RepWaitRecoveryReserved>();
+}
+
+void PeeringState::RepWaitBackfillReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_repwaitbackfillreserved_latency, dur);
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+ DECLARE_LOCALS;
+
+
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::GRANT,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return transit<RepRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RejectTooFullRemoteReservation &evt)
+{
+ DECLARE_LOCALS;
+ ps->reject_reservation();
+ post_event(RemoteReservationRejectedTooFull());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RemoteReservationRejectedTooFull &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+boost::statechart::result
+PeeringState::RepWaitBackfillReserved::react(
+ const RemoteReservationCanceled &evt)
+{
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ return transit<RepNotRecovering>();
+}
+
+/*---RepRecovering-------*/
+PeeringState::RepRecovering::RepRecovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive/RepRecovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::REVOKE,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const BackfillTooFull &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REVOKE_TOOFULL,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+boost::statechart::result
+PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
+{
+ DECLARE_LOCALS;
+
+
+ pl->unreserve_recovery_space();
+ pl->send_cluster_message(
+ ps->primary.osd,
+ make_message<MBackfillReserve>(
+ MBackfillReserve::REVOKE,
+ spg_t(ps->info.pgid.pgid, ps->primary.shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ return discard_event();
+}
+
+void PeeringState::RepRecovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_reprecovering_latency, dur);
+}
+
+/*------Activating--------*/
+PeeringState::Activating::Activating(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Activating")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+void PeeringState::Activating::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_activating_latency, dur);
+}
+
+PeeringState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitLocalRecoveryReserved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ // Make sure all nodes that are part of the recovery aren't full
+ if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&
+ ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
+ post_event(RecoveryTooFull());
+ return;
+ }
+
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ ps->state_set(PG_STATE_RECOVERY_WAIT);
+ pl->request_local_background_io_reservation(
+ ps->get_recovery_priority(),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ LocalRecoveryReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeferRecovery(0.0)));
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result
+PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_set(PG_STATE_RECOVERY_TOOFULL);
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DoRecovery()),
+ ps->cct->_conf->osd_recovery_retry_interval);
+ return transit<NotRecovering>();
+}
+
+void PeeringState::WaitLocalRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitlocalrecoveryreserved_latency, dur);
+}
+
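+// Same one-OSD-at-a-time pattern as WaitRemoteBackfillReserved, but for
+// MRecoveryReserve::REQUEST; AllRemotesReserved is posted when every
+// remote shard has been reserved.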
+PeeringState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
+ remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
+{
+ context< PeeringMachine >().log_enter(state_name);
+ post_event(RemoteRecoveryReserved());
+}
+
+boost::statechart::result
+PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
+ DECLARE_LOCALS;
+
+ if (remote_recovery_reservation_it !=
+ context< Active >().remote_shards_to_reserve_recovery.end()) {
+ ceph_assert(*remote_recovery_reservation_it != ps->pg_whoami);
+ pl->send_cluster_message(
+ remote_recovery_reservation_it->osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::REQUEST,
+ spg_t(context< PeeringMachine >().spgid.pgid,
+ remote_recovery_reservation_it->shard),
+ ps->get_osdmap_epoch(),
+ ps->get_recovery_priority()),
+ ps->get_osdmap_epoch());
+ ++remote_recovery_reservation_it;
+ } else {
+ post_event(AllRemotesReserved());
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitRemoteRecoveryReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitremoterecoveryreserved_latency, dur);
+}
+
+PeeringState::Recovering::Recovering(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovering")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_RECOVERY_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ ps->state_set(PG_STATE_RECOVERING);
+ pl->on_recovery_reserved();
+ ceph_assert(!ps->state_test(PG_STATE_ACTIVATING));
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Recovering::release_reservations(bool cancel)
+{
+ DECLARE_LOCALS;
+ ceph_assert(cancel || !ps->pg_log.get_missing().have_missing());
+
+ // release remote reservations
+ for (auto i = context< Active >().remote_shards_to_reserve_recovery.begin();
+ i != context< Active >().remote_shards_to_reserve_recovery.end();
+ ++i) {
+ if (*i == ps->pg_whoami) // skip myself
+ continue;
+ pl->send_cluster_message(
+ i->osd,
+ make_message<MRecoveryReserve>(
+ MRecoveryReserve::RELEASE,
+ spg_t(ps->info.pgid.pgid, i->shard),
+ ps->get_osdmap_epoch()),
+ ps->get_osdmap_epoch());
+ }
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const AllReplicasRecovered &evt)
+{
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_FORCED_RECOVERY);
+ release_reservations();
+ pl->cancel_local_background_io_reservation();
+ return transit<Recovered>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const RequestBackfill &evt)
+{
+ DECLARE_LOCALS;
+
+ release_reservations();
+
+ ps->state_clear(PG_STATE_FORCED_RECOVERY);
+ pl->cancel_local_background_io_reservation();
+ pl->publish_stats_to_osd();
+ // transition any async_recovery_targets back into the acting set
+ // so the pg won't have to stay undersized for long,
+ // as backfill might take a long time to complete.
+ if (!ps->async_recovery_targets.empty()) {
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, true, &history_les_bound);
+ }
+ return transit<WaitLocalBackfillReserved>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const DeferRecovery &evt)
+{
+ DECLARE_LOCALS;
+ if (!ps->state_test(PG_STATE_RECOVERING)) {
+ // we may have finished recovery and have an AllReplicasRecovered
+ // event queued to move us to the next state.
+ psdout(10) << "got defer recovery but not recovering" << dendl;
+ return discard_event();
+ }
+ psdout(10) << "defer recovery, retry delay " << evt.delay << dendl;
+ ps->state_set(PG_STATE_RECOVERY_WAIT);
+ pl->cancel_local_background_io_reservation();
+ release_reservations(true);
+ pl->schedule_event_after(
+ std::make_shared<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DoRecovery()),
+ evt.delay);
+ return transit<NotRecovering>();
+}
+
+boost::statechart::result
+PeeringState::Recovering::react(const UnfoundRecovery &evt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "recovery has unfound, can't continue" << dendl;
+ ps->state_set(PG_STATE_RECOVERY_UNFOUND);
+ pl->cancel_local_background_io_reservation();
+ release_reservations(true);
+ return transit<NotRecovering>();
+}
+
+void PeeringState::Recovering::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ ps->state_clear(PG_STATE_RECOVERING);
+ pl->get_peering_perf().tinc(rs_recovering_latency, dur);
+}
+
+PeeringState::Recovered::Recovered(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Recovered")
+{
+ pg_shard_t auth_log_shard;
+
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->needs_recovery());
+
+ // if we finished backfill, all acting are active; recheck if
+ // DEGRADED | UNDERSIZED is appropriate.
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (ps->get_osdmap()->get_pg_size(context< PeeringMachine >().spgid.pgid) <=
+ ps->acting_recovery_backfill.size()) {
+ ps->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
+ pl->publish_stats_to_osd();
+ }
+
+ // adjust acting set? (e.g. because backfill completed...)
+ bool history_les_bound = false;
+ if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard,
+ true, &history_les_bound)) {
+ ceph_assert(ps->want_acting.size());
+ } else if (!ps->async_recovery_targets.empty()) {
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, true, &history_les_bound);
+ }
+
+ if (context< Active >().all_replicas_activated &&
+ ps->async_recovery_targets.empty())
+ post_event(GoClean());
+}
+
+void PeeringState::Recovered::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_recovered_latency, dur);
+}
+
+PeeringState::Clean::Clean(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active/Clean")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ if (ps->info.last_complete != ps->info.last_update) {
+ ceph_abort();
+ }
+
+
+ ps->try_mark_clean();
+
+ context< PeeringMachine >().get_cur_transaction().register_on_commit(
+ pl->on_clean());
+}
+
+void PeeringState::Clean::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ ps->state_clear(PG_STATE_CLEAN);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_clean_latency, dur);
+}
+
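+// Collect one shard per distinct OSD from `in`, skipping `skip` (the local
+// shard); used below so that, presumably, each remote OSD is asked for a
+// reservation at most once even if it hosts several shards of the PG.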
+template <typename T>
+set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
+{
+ set<int> osds_found;
+ set<pg_shard_t> out;
+ for (auto i = in.begin(); i != in.end(); ++i) {
+ if (*i != skip && !osds_found.count(i->osd)) {
+ osds_found.insert(i->osd);
+ out.insert(*i);
+ }
+ }
+ return out;
+}
+
+/*---------Active---------*/
+PeeringState::Active::Active(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Active"),
+ remote_shards_to_reserve_recovery(
+ unique_osd_shard_set(
+ context< PeeringMachine >().state->pg_whoami,
+ context< PeeringMachine >().state->acting_recovery_backfill)),
+ remote_shards_to_reserve_backfill(
+ unique_osd_shard_set(
+ context< PeeringMachine >().state->pg_whoami,
+ context< PeeringMachine >().state->backfill_targets)),
+ all_replicas_activated(false)
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+
+ ceph_assert(!ps->backfill_reserved);
+ ceph_assert(ps->is_primary());
+ psdout(10) << "In Active, about to call activate" << dendl;
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ ps->activate(context< PeeringMachine >().get_cur_transaction(),
+ ps->get_osdmap_epoch(),
+ context< PeeringMachine >().get_recovery_ctx());
+
+ // everyone has to commit/ack before we are truly active
+ ps->blocked_by.clear();
+ for (auto p = ps->acting_recovery_backfill.begin();
+ p != ps->acting_recovery_backfill.end();
+ ++p) {
+ if (p->shard != ps->pg_whoami.shard) {
+ ps->blocked_by.insert(p->shard);
+ }
+ }
+ pl->publish_stats_to_osd();
+ psdout(10) << "Activate Finished" << dendl;
+}
+
+boost::statechart::result PeeringState::Active::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+
+ if (ps->should_restart_peering(
+ advmap.up_primary,
+ advmap.acting_primary,
+ advmap.newup,
+ advmap.newacting,
+ advmap.lastmap,
+ advmap.osdmap)) {
+ psdout(10) << "Active advmap interval change, fast return" << dendl;
+ return forward_event();
+ }
+ psdout(10) << "Active advmap" << dendl;
+ bool need_publish = false;
+
+ pl->on_active_advmap(advmap.osdmap);
+ if (ps->dirty_big_info) {
+ // share updated purged_snaps with the mgr/mon so that we (a) stop
+ // reporting purged snaps and (b) perhaps share more snaps that we have
+ // purged but didn't fit in pg_stat_t.
+ need_publish = true;
+ ps->share_pg_info();
+ }
+
+ bool need_acting_change = false;
+ for (size_t i = 0; i < ps->want_acting.size(); i++) {
+ int osd = ps->want_acting[i];
+ if (!advmap.osdmap->is_up(osd)) {
+ pg_shard_t osd_with_shard(osd, shard_id_t(i));
+ if (!ps->is_acting(osd_with_shard) && !ps->is_up(osd_with_shard)) {
+ psdout(10) << "Active stray osd." << osd << " in want_acting is down"
+ << dendl;
+ need_acting_change = true;
+ }
+ }
+ }
+ if (need_acting_change) {
+ psdout(10) << "Active need acting change, call choose_acting again"
+ << dendl;
+ // possibly because we re-add some strays into the acting set and
+ // some of them then go down in a subsequent map before we could see
+ // the map changing the pg temp.
+ // call choose_acting again to clear them out.
+ // note that we leave restrict_to_up_acting false so as not to
+ // kick out any chosen stray that is still alive.
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ ps->remove_down_peer_info(advmap.osdmap);
+ ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
+ }
+
+ /* Check for changes in pool size (if the acting set changed as a result,
+ * this does not matter) */
+ if (advmap.lastmap->get_pg_size(ps->info.pgid.pgid) !=
+ ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid)) {
+ if (ps->get_osdmap()->get_pg_size(ps->info.pgid.pgid) <=
+ ps->actingset.size()) {
+ ps->state_clear(PG_STATE_UNDERSIZED);
+ } else {
+ ps->state_set(PG_STATE_UNDERSIZED);
+ }
+ // degraded changes will be detected by call from publish_stats_to_osd()
+ need_publish = true;
+ }
+
+ // if we haven't reported our PG stats in a long time, do so now.
+ if (ps->info.stats.reported_epoch + ps->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
+ psdout(20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - ps->info.stats.reported_epoch)
+ << " epochs" << dendl;
+ need_publish = true;
+ }
+
+ if (need_publish)
+ pl->publish_stats_to_osd();
+
+ if (ps->check_prior_readable_down_osds(advmap.osdmap)) {
+ pl->recheck_readable();
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Active: handling ActMap" << dendl;
+ ceph_assert(ps->is_primary());
+
+ pl->on_active_actmap();
+
+ if (ps->have_unfound()) {
+ // object may have become unfound
+ ps->discover_all_missing(context<PeeringMachine>().get_recovery_ctx().msgs);
+ }
+
+ uint64_t unfound = ps->missing_loc.num_unfound();
+ if (unfound > 0 &&
+ ps->all_unfound_are_queried_or_lost(ps->get_osdmap())) {
+ if (ps->cct->_conf->osd_auto_mark_unfound_lost) {
+ pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has " << unfound
+ << " objects unfound and apparently lost, would automatically "
+ << "mark these objects lost but this feature is not yet implemented "
+ << "(osd_auto_mark_unfound_lost)";
+ } else
+ pl->get_clog_error() << context< PeeringMachine >().spgid.pgid << " has "
+ << unfound << " objects unfound and apparently lost";
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt)
+{
+
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+ if (ps->peer_info.count(notevt.from)) {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", already have info from that osd, ignoring"
+ << dendl;
+ } else if (ps->peer_purged.count(notevt.from)) {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", already purged that peer, ignoring"
+ << dendl;
+ } else {
+ psdout(10) << "Active: got notify from " << notevt.from
+ << ", calling proc_replica_info and discover_all_missing"
+ << dendl;
+ ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
+ if (ps->have_unfound() || (ps->is_degraded() && ps->might_have_unfound.count(notevt.from))) {
+ ps->discover_all_missing(
+ context<PeeringMachine>().get_recovery_ctx().msgs);
+ }
+ // check if it is a previous down acting member that's coming back.
+ // if so, request pg_temp change to trigger a new interval transition
+ pg_shard_t auth_log_shard;
+ bool history_les_bound = false;
+ // FIXME: Uh-oh we have to check this return value; choose_acting can fail!
+ ps->choose_acting(auth_log_shard, false, &history_les_bound, true);
+ if (!ps->want_acting.empty() && ps->want_acting != ps->acting) {
+ psdout(10) << "Active: got notify from previous acting member "
+ << notevt.from << ", requesting pg_temp change"
+ << dendl;
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MTrim& trim)
+{
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+
+ // peer is informing us of their last_complete_ondisk
+ ldout(ps->cct,10) << " replica osd." << trim.from << " lcod " << trim.trim_to << dendl;
+ ps->update_peer_last_complete_ondisk(pg_shard_t{trim.from, trim.shard},
+ trim.trim_to);
+ // trim log when the pg is recovered
+ ps->calc_min_last_complete_ondisk();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ ceph_assert(ps->is_primary());
+
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (infoevt.lease_ack) {
+ ps->proc_lease_ack(infoevt.from.osd, *infoevt.lease_ack);
+ }
+ // don't update history (yet) if we are active and primary; the replica
+ // may be telling us they have activated (and committed) but we can't
+ // share that until _everyone_ does the same.
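+ // each such MInfoRec marks one acting_recovery_backfill peer as activated
+ // and committed; once peer_activated covers the whole set,
+ // all_activated_and_committed() posts AllReplicasActivated, which is what
+ // finally advances history.last_epoch_started and shares the updated info.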
+ if (ps->is_acting_recovery_backfill(infoevt.from) &&
+ ps->peer_activated.count(infoevt.from) == 0) {
+ psdout(10) << " peer osd." << infoevt.from
+ << " activated and committed" << dendl;
+ ps->peer_activated.insert(infoevt.from);
+ ps->blocked_by.erase(infoevt.from.shard);
+ pl->publish_stats_to_osd();
+ if (ps->peer_activated.size() == ps->acting_recovery_backfill.size()) {
+ all_activated_and_committed();
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "searching osd." << logevt.from
+ << " log for unfound items" << dendl;
+ ps->proc_replica_log(
+ logevt.msg->info, logevt.msg->log, std::move(logevt.msg->missing), logevt.from);
+ bool got_missing = ps->search_for_missing(
+ ps->peer_info[logevt.from],
+ ps->peer_missing[logevt.from],
+ logevt.from,
+ context< PeeringMachine >().get_recovery_ctx());
+ // If there are missing AND we are "fully" active then start recovery now
+ if (got_missing && ps->state_test(PG_STATE_ACTIVE)) {
+ post_event(DoRecovery());
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ {
+ q.f->open_array_section("might_have_unfound");
+ for (auto p = ps->might_have_unfound.begin();
+ p != ps->might_have_unfound.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_missing.count(*p)) {
+ q.f->dump_string("status", "already probed");
+ } else if (ps->peer_missing_requested.count(*p)) {
+ q.f->dump_string("status", "querying");
+ } else if (!ps->get_osdmap()->is_up(p->osd)) {
+ q.f->dump_string("status", "osd is down");
+ } else {
+ q.f->dump_string("status", "not queried");
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ {
+ q.f->open_object_section("recovery_progress");
+ q.f->open_array_section("backfill_targets");
+ for (auto p = ps->backfill_targets.begin();
+ p != ps->backfill_targets.end(); ++p)
+ q.f->dump_stream("replica") << *p;
+ q.f->close_section();
+ pl->dump_recovery_info(q.f);
+ q.f->close_section();
+ }
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const QueryUnfound& q)
+{
+ DECLARE_LOCALS;
+
+ ps->query_unfound(q.f, "Active");
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(
+ const ActivateCommitted &evt)
+{
+ DECLARE_LOCALS;
+ ceph_assert(!ps->peer_activated.count(ps->pg_whoami));
+ ps->peer_activated.insert(ps->pg_whoami);
+ psdout(10) << "_activate_committed " << evt.epoch
+ << " peer_activated now " << ps->peer_activated
+ << " last_interval_started "
+ << ps->info.history.last_interval_started
+ << " last_epoch_started "
+ << ps->info.history.last_epoch_started
+ << " same_interval_since "
+ << ps->info.history.same_interval_since
+ << dendl;
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ if (ps->peer_activated.size() == ps->acting_recovery_backfill.size())
+ all_activated_and_committed();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const AllReplicasActivated &evt)
+{
+
+ DECLARE_LOCALS;
+ pg_t pgid = context< PeeringMachine >().spgid.pgid;
+
+ all_replicas_activated = true;
+
+ ps->state_clear(PG_STATE_ACTIVATING);
+ ps->state_clear(PG_STATE_CREATING);
+ ps->state_clear(PG_STATE_PREMERGE);
+
+ bool merge_target;
+ if (ps->pool.info.is_pending_merge(pgid, &merge_target)) {
+ ps->state_set(PG_STATE_PEERED);
+ ps->state_set(PG_STATE_PREMERGE);
+
+ if (ps->actingset.size() != ps->get_osdmap()->get_pg_size(pgid)) {
+ if (merge_target) {
+ pg_t src = pgid;
+ src.set_ps(ps->pool.info.get_pg_num_pending());
+ assert(src.get_parent() == pgid);
+ pl->set_not_ready_to_merge_target(pgid, src);
+ } else {
+ pl->set_not_ready_to_merge_source(pgid);
+ }
+ }
+ } else if (!ps->acting_set_writeable()) {
+ ps->state_set(PG_STATE_PEERED);
+ } else {
+ ps->state_set(PG_STATE_ACTIVE);
+ }
+
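+ // hold off on serving reads until any read lease granted in the prior
+ // interval has expired: if its upper bound is still in the future, mark the
+ // PG as waiting and schedule a readable re-check for when that bound passes.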
+ auto mnow = pl->get_mnow();
+ if (ps->prior_readable_until_ub > mnow) {
+ psdout(10) << " waiting for prior_readable_until_ub "
+ << ps->prior_readable_until_ub << " > mnow " << mnow << dendl;
+ ps->state_set(PG_STATE_WAIT);
+ pl->queue_check_readable(
+ ps->last_peering_reset,
+ ps->prior_readable_until_ub - mnow);
+ } else {
+ psdout(10) << " mnow " << mnow << " >= prior_readable_until_ub "
+ << ps->prior_readable_until_ub << dendl;
+ }
+
+ if (ps->pool.info.has_flag(pg_pool_t::FLAG_CREATING)) {
+ pl->send_pg_created(pgid);
+ }
+
+ ps->info.history.last_epoch_started = ps->info.last_epoch_started;
+ ps->info.history.last_interval_started = ps->info.last_interval_started;
+ ps->dirty_info = true;
+
+ ps->share_pg_info();
+ pl->publish_stats_to_osd();
+
+ pl->on_activate_complete();
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const RenewLease& rl)
+{
+ DECLARE_LOCALS;
+ ps->proc_renew_lease();
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Active::react(const MLeaseAck& la)
+{
+ DECLARE_LOCALS;
+ ps->proc_lease_ack(la.from, la.lease_ack);
+ return discard_event();
+}
+
+
+boost::statechart::result PeeringState::Active::react(const CheckReadable &evt)
+{
+ DECLARE_LOCALS;
+ pl->recheck_readable();
+ return discard_event();
+}
+
+/*
+ * update info.history.last_epoch_started ONLY after we and all
+ * replicas have activated AND committed the activate transaction
+ * (i.e. the peering results are stable on disk).
+ */
+void PeeringState::Active::all_activated_and_committed()
+{
+ DECLARE_LOCALS;
+ psdout(10) << "all_activated_and_committed" << dendl;
+ ceph_assert(ps->is_primary());
+ ceph_assert(ps->peer_activated.size() == ps->acting_recovery_backfill.size());
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ ceph_assert(ps->blocked_by.empty());
+
+ if (HAVE_FEATURE(ps->upacting_features, SERVER_OCTOPUS)) {
+ // this is overkill when the activation is quick, but when it is slow it
+ // is important, because the lease was renewed by the activate itself but we
+ // don't know how long ago that was, and simply scheduling now may leave
+ // a gap in lease coverage. keep it simple and aggressively renew.
+ ps->renew_lease(pl->get_mnow());
+ ps->send_lease();
+ ps->schedule_renew_lease();
+ }
+
+ // Degraded?
+ ps->update_calc_stats();
+ if (ps->info.stats.stats.sum.num_objects_degraded) {
+ ps->state_set(PG_STATE_DEGRADED);
+ } else {
+ ps->state_clear(PG_STATE_DEGRADED);
+ }
+
+ post_event(PeeringState::AllReplicasActivated());
+}
+
+
+void PeeringState::Active::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+
+ DECLARE_LOCALS;
+ pl->cancel_local_background_io_reservation();
+
+ ps->blocked_by.clear();
+ ps->backfill_reserved = false;
+ ps->state_clear(PG_STATE_ACTIVATING);
+ ps->state_clear(PG_STATE_DEGRADED);
+ ps->state_clear(PG_STATE_UNDERSIZED);
+ ps->state_clear(PG_STATE_BACKFILL_TOOFULL);
+ ps->state_clear(PG_STATE_BACKFILL_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_WAIT);
+ ps->state_clear(PG_STATE_RECOVERY_TOOFULL);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_active_latency, dur);
+ pl->on_active_exit();
+}
+
+/*------ReplicaActive-----*/
+PeeringState::ReplicaActive::ReplicaActive(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ReplicaActive")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+}
+
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const Activate& actevt) {
+ DECLARE_LOCALS;
+ psdout(10) << "In ReplicaActive, about to call activate" << dendl;
+ ps->activate(
+ context< PeeringMachine >().get_cur_transaction(),
+ actevt.activation_epoch,
+ context< PeeringMachine >().get_recovery_ctx());
+ psdout(10) << "Activate Finished" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const ActivateCommitted &evt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << __func__ << " " << evt.epoch << " telling primary" << dendl;
+
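+ // tell the primary our activation transaction has committed: send our info
+ // with history.last_epoch_started set to the activation epoch (and
+ // last_interval_started to the current interval), plus a lease ack.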
+ auto &rctx = context<PeeringMachine>().get_recovery_ctx();
+ auto epoch = ps->get_osdmap_epoch();
+ pg_info_t i = ps->info;
+ i.history.last_epoch_started = evt.activation_epoch;
+ i.history.last_interval_started = i.history.same_interval_since;
+ rctx.send_info(
+ ps->get_primary().osd,
+ spg_t(ps->info.pgid.pgid, ps->get_primary().shard),
+ epoch,
+ epoch,
+ i,
+ {}, /* lease */
+ ps->get_lease_ack());
+
+ if (ps->acting_set_writeable()) {
+ ps->state_set(PG_STATE_ACTIVE);
+ } else {
+ ps->state_set(PG_STATE_PEERED);
+ }
+ pl->on_activate_committed();
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLease& l)
+{
+ DECLARE_LOCALS;
+ spg_t spgid = context< PeeringMachine >().spgid;
+ epoch_t epoch = pl->get_osdmap_epoch();
+
+ ps->proc_lease(l.lease);
+ pl->send_cluster_message(
+ ps->get_primary().osd,
+ make_message<MOSDPGLeaseAck>(epoch,
+ spg_t(spgid.pgid, ps->get_primary().shard),
+ ps->get_lease_ack()),
+ epoch);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ ps->proc_primary_info(context<PeeringMachine>().get_cur_transaction(),
+ infoevt.info);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "received log from " << logevt.from << dendl;
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+ if (logevt.msg->lease) {
+ ps->proc_lease(*logevt.msg->lease);
+ }
+
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const MTrim& trim)
+{
+ DECLARE_LOCALS;
+ // primary is instructing us to trim
+ ps->pg_log.trim(trim.trim_to, ps->info);
+ ps->dirty_info = true;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(), ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(
+ const MQuery& query)
+{
+ DECLARE_LOCALS;
+ ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::ReplicaActive::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "ReplicaActive");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::ReplicaActive::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ pl->unreserve_recovery_space();
+
+ pl->cancel_remote_recovery_reservation();
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_replicaactive_latency, dur);
+
+ ps->min_last_complete_ondisk = eversion_t();
+}
+
+/*-------Stray---*/
+PeeringState::Stray::Stray(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Stray")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ceph_assert(!ps->is_peered());
+ ceph_assert(!ps->is_peering());
+ ceph_assert(!ps->is_primary());
+
+ if (!ps->get_osdmap()->have_pg_pool(ps->info.pgid.pgid.pool())) {
+ ldout(ps->cct,10) << __func__ << " pool is deleted" << dendl;
+ post_event(DeleteStart());
+ } else {
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ }
+}
+
+boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ MOSDPGLog *msg = logevt.msg.get();
+ psdout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ if (msg->info.last_backfill == hobject_t()) {
+ // restart backfill
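+ // an empty (min) last_backfill in the incoming info means backfill is being
+ // restarted from scratch: adopt the primary's info and claim its log rather
+ // than merging it with whatever we had.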
+ ps->info = msg->info;
+ pl->on_info_history_change();
+ ps->dirty_info = true;
+ ps->dirty_big_info = true; // maybe.
+
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
+
+ ps->pg_log.reset_backfill();
+ } else {
+ ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
+ }
+ if (logevt.msg->lease) {
+ ps->proc_lease(*logevt.msg->lease);
+ }
+
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+ post_event(Activate(logevt.msg->info.last_epoch_started));
+ return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
+
+ if (ps->info.last_update > infoevt.info.last_update) {
+ // rewind divergent log entries
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+ ps->rewind_divergent_log(t, infoevt.info.last_update);
+ ps->info.stats = infoevt.info.stats;
+ ps->info.hit_set = infoevt.info.hit_set;
+ }
+
+ if (infoevt.lease) {
+ ps->proc_lease(*infoevt.lease);
+ }
+
+ ceph_assert(infoevt.info.last_update == ps->info.last_update);
+ ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
+
+ post_event(Activate(infoevt.info.last_epoch_started));
+ return transit<ReplicaActive>();
+}
+
+boost::statechart::result PeeringState::Stray::react(const MQuery& query)
+{
+ DECLARE_LOCALS;
+ ps->fulfill_query(query, context<PeeringMachine>().get_recovery_ctx());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Stray::react(const ActMap&)
+{
+ DECLARE_LOCALS;
+ if (ps->should_send_notify() && ps->get_primary().osd >= 0) {
+ ps->info.history.refresh_prior_readable_until_ub(
+ pl->get_mnow(), ps->prior_readable_until_ub);
+ context< PeeringMachine >().send_notify(
+ ps->get_primary().osd,
+ pg_notify_t(
+ ps->get_primary().shard, ps->pg_whoami.shard,
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ ps->info,
+ ps->past_intervals));
+ }
+ return discard_event();
+}
+
+void PeeringState::Stray::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_stray_latency, dur);
+}
+
+
+/*--------ToDelete----------*/
+PeeringState::ToDelete::ToDelete(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ToDelete")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ pl->get_perf_logger().inc(l_osd_pg_removing);
+}
+
+void PeeringState::ToDelete::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ // note: on a successful removal, this path doesn't execute. see
+ // _delete_some().
+ pl->get_perf_logger().dec(l_osd_pg_removing);
+
+ pl->cancel_local_background_io_reservation();
+}
+
+/*----WaitDeleteReserved----*/
+PeeringState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history,
+ "Started/ToDelete/WaitDeleteReseved")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+ context< ToDelete >().priority = ps->get_delete_priority();
+
+ pl->cancel_local_background_io_reservation();
+ pl->request_local_background_io_reservation(
+ context<ToDelete>().priority,
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeleteReserved()),
+ std::make_unique<PGPeeringEvent>(
+ ps->get_osdmap_epoch(),
+ ps->get_osdmap_epoch(),
+ DeleteInterrupted()));
+}
+
+boost::statechart::result PeeringState::ToDelete::react(
+ const ActMap& evt)
+{
+ DECLARE_LOCALS;
+ if (ps->get_delete_priority() != priority) {
+ psdout(10) << __func__ << " delete priority changed, resetting"
+ << dendl;
+ return transit<ToDelete>();
+ }
+ return discard_event();
+}
+
+void PeeringState::WaitDeleteReserved::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+}
+
+/*----Deleting-----*/
+PeeringState::Deleting::Deleting(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/ToDelete/Deleting")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->deleting = true;
+ ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+
+ // clear log
+ PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
+ ps->pg_log.roll_forward(rollbacker.get());
+
+ // adjust info to backfill
+ ps->info.set_last_backfill(hobject_t());
+ ps->pg_log.reset_backfill();
+ ps->dirty_info = true;
+
+ pl->on_removal(t);
+}
+
+boost::statechart::result PeeringState::Deleting::react(
+ const DeleteSome& evt)
+{
+ DECLARE_LOCALS;
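+ // do_delete_work() removes one batch of objects starting at 'next' and
+ // returns the updated cursor plus a flag; while the flag is true we simply
+ // wait for the next DeleteSome, and once it is false deletion has finished
+ // and the state machine terminates.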
+ std::pair<ghobject_t, bool> p;
+ p = pl->do_delete_work(context<PeeringMachine>().get_cur_transaction(),
+ next);
+ next = p.first;
+ return p.second ? discard_event() : terminate();
+}
+
+void PeeringState::Deleting::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ ps->deleting = false;
+ pl->cancel_local_background_io_reservation();
+}
+
+/*--------GetInfo---------*/
+PeeringState::GetInfo::GetInfo(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetInfo")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+
+ DECLARE_LOCALS;
+ ps->check_past_interval_bounds();
+ ps->log_weirdness();
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
+ ceph_assert(ps->blocked_by.empty());
+
+ prior_set = ps->build_prior();
+ ps->prior_readable_down_osds = prior_set.down;
+
+ if (ps->prior_readable_down_osds.empty()) {
+ psdout(10) << " no prior_set down osds, will clear prior_readable_until_ub before activating"
+ << dendl;
+ }
+
+ ps->reset_min_peer_features();
+ get_infos();
+ if (prior_set.pg_down) {
+ post_event(IsDown());
+ } else if (peer_info_requested.empty()) {
+ post_event(GotInfo());
+ }
+}
+
+void PeeringState::GetInfo::get_infos()
+{
+ DECLARE_LOCALS;
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+
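+ // (re)issue INFO queries to every probe target in the prior set we have not
+ // yet heard from; blocked_by tracks the OSDs whose replies are still pending.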
+ ps->blocked_by.clear();
+ for (auto it = prior_set.probe.begin(); it != prior_set.probe.end(); ++it) {
+ pg_shard_t peer = *it;
+ if (peer == ps->pg_whoami) {
+ continue;
+ }
+ if (ps->peer_info.count(peer)) {
+ psdout(10) << " have osd." << peer << " info " << ps->peer_info[peer] << dendl;
+ continue;
+ }
+ if (peer_info_requested.count(peer)) {
+ psdout(10) << " already requested info from osd." << peer << dendl;
+ ps->blocked_by.insert(peer.osd);
+ } else if (!ps->get_osdmap()->is_up(peer.osd)) {
+ psdout(10) << " not querying info from down osd." << peer << dendl;
+ } else {
+ psdout(10) << " querying info from osd." << peer << dendl;
+ context< PeeringMachine >().send_query(
+ peer.osd,
+ pg_query_t(pg_query_t::INFO,
+ it->shard, ps->pg_whoami.shard,
+ ps->info.history,
+ ps->get_osdmap_epoch()));
+ peer_info_requested.insert(peer);
+ ps->blocked_by.insert(peer.osd);
+ }
+ }
+
+ ps->check_prior_readable_down_osds(ps->get_osdmap());
+
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const MNotifyRec& infoevt)
+{
+
+ DECLARE_LOCALS;
+
+ auto p = peer_info_requested.find(infoevt.from);
+ if (p != peer_info_requested.end()) {
+ peer_info_requested.erase(p);
+ ps->blocked_by.erase(infoevt.from.osd);
+ }
+
+ epoch_t old_start = ps->info.history.last_epoch_started;
+ if (ps->proc_replica_info(
+ infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
+ // we got something new ...
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+ if (old_start < ps->info.history.last_epoch_started) {
+ psdout(10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
+ prior_set = ps->build_prior();
+ ps->prior_readable_down_osds = prior_set.down;
+
+ // filter out any osds that got dropped from the probe set from
+ // peer_info_requested. this is less expensive than restarting
+ // peering (which would re-probe everyone).
+ auto p = peer_info_requested.begin();
+ while (p != peer_info_requested.end()) {
+ if (prior_set.probe.count(*p) == 0) {
+ psdout(20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
+ peer_info_requested.erase(p++);
+ } else {
+ ++p;
+ }
+ }
+ get_infos();
+ }
+ psdout(20) << "Adding osd: " << infoevt.from.osd << " peer features: "
+ << hex << infoevt.features << dec << dendl;
+ ps->apply_peer_features(infoevt.features);
+
+ // are we done getting everything?
+ if (peer_info_requested.empty() && !prior_set.pg_down) {
+ psdout(20) << "Common peer features: " << hex << ps->get_min_peer_features() << dec << dendl;
+ psdout(20) << "Common acting features: " << hex << ps->get_min_acting_features() << dec << dendl;
+ psdout(20) << "Common upacting features: " << hex << ps->get_min_upacting_features() << dec << dendl;
+ post_event(GotInfo());
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("requested_info_from");
+ for (auto p = peer_info_requested.begin();
+ p != peer_info_requested.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_info.count(*p)) {
+ q.f->open_object_section("got_info");
+ ps->peer_info[*p].dump(q.f);
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetInfo::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetInfo");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetInfo::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getinfo_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------GetLog------------*/
+PeeringState::GetLog::GetLog(my_context ctx)
+ : my_base(ctx),
+ NamedState(
+ context< PeeringMachine >().state_history,
+ "Started/Primary/Peering/GetLog"),
+ msg(0)
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+
+ ps->log_weirdness();
+
+ // adjust acting?
+ if (!ps->choose_acting(auth_log_shard, false,
+ &context< Peering >().history_les_bound)) {
+ if (!ps->want_acting.empty()) {
+ post_event(NeedActingChange());
+ } else {
+ post_event(IsIncomplete());
+ }
+ return;
+ }
+
+ // am i the best?
+ if (auth_log_shard == ps->pg_whoami) {
+ post_event(GotLog());
+ return;
+ }
+
+ const pg_info_t& best = ps->peer_info[auth_log_shard];
+
+ // am i broken?
+ if (ps->info.last_update < best.log_tail) {
+ psdout(10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
+ post_event(IsIncomplete());
+ return;
+ }
+
+ // how much log to request?
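+ // start from our own last_update and lower the request point to cover any
+ // peer whose last_update falls below our log_tail (while still within the
+ // auth shard's log), so we will have enough history to handle those peers'
+ // divergent entries later.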
+ eversion_t request_log_from = ps->info.last_update;
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ for (auto p = ps->acting_recovery_backfill.begin();
+ p != ps->acting_recovery_backfill.end();
+ ++p) {
+ if (*p == ps->pg_whoami) continue;
+ pg_info_t& ri = ps->peer_info[*p];
+ if (ri.last_update < ps->info.log_tail && ri.last_update >= best.log_tail &&
+ ri.last_update < request_log_from)
+ request_log_from = ri.last_update;
+ }
+
+ // how much?
+ psdout(10) << " requesting log from osd." << auth_log_shard << dendl;
+ context<PeeringMachine>().send_query(
+ auth_log_shard.osd,
+ pg_query_t(
+ pg_query_t::LOG,
+ auth_log_shard.shard, ps->pg_whoami.shard,
+ request_log_from, ps->info.history,
+ ps->get_osdmap_epoch()));
+
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(auth_log_shard.osd);
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const AdvMap& advmap)
+{
+ // make sure our log source didn't go down. we need to check
+ // explicitly because it may not be part of the prior set, which
+ // means the Peering state check won't catch it going down.
+ if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
+ psdout(10) << "GetLog: auth_log_shard osd."
+ << auth_log_shard.osd << " went down" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ // let the Peering state do its checks.
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const MLogRec& logevt)
+{
+ ceph_assert(!msg);
+ if (logevt.from != auth_log_shard) {
+ psdout(10) << "GetLog: discarding log from "
+ << "non-auth_log_shard osd." << logevt.from << dendl;
+ return discard_event();
+ }
+ psdout(10) << "GetLog: received master log from osd."
+ << logevt.from << dendl;
+ msg = logevt.msg;
+ post_event(GotLog());
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const GotLog&)
+{
+
+ DECLARE_LOCALS;
+ psdout(10) << "leaving GetLog" << dendl;
+ if (msg) {
+ psdout(10) << "processing master log" << dendl;
+ ps->proc_master_log(context<PeeringMachine>().get_cur_transaction(),
+ msg->info, std::move(msg->log), std::move(msg->missing),
+ auth_log_shard);
+ }
+ ps->start_flush(context< PeeringMachine >().get_cur_transaction());
+ return transit< GetMissing >();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_stream("auth_log_shard") << auth_log_shard;
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetLog::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetLog");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetLog::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getlog_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------WaitActingChange--------*/
+PeeringState::WaitActingChange::WaitActingChange(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/WaitActingChange")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const AdvMap& advmap)
+{
+ DECLARE_LOCALS;
+ OSDMapRef osdmap = advmap.osdmap;
+
+ psdout(10) << "verifying no want_acting " << ps->want_acting << " targets didn't go down" << dendl;
+ for (auto p = ps->want_acting.begin(); p != ps->want_acting.end(); ++p) {
+ if (!osdmap->is_up(*p)) {
+ psdout(10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
+ post_event(advmap);
+ return transit< Reset >();
+ }
+ }
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MLogRec& logevt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MLocRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MInfoRec& evt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const MNotifyRec& evt)
+{
+ psdout(10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "waiting for pg acting set to change");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitActingChange::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "WaitActingChange");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::WaitActingChange::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitactingchange_latency, dur);
+}
+
+/*------Down--------*/
+PeeringState::Down::Down(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Down")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_PEERING);
+ ps->state_set(PG_STATE_DOWN);
+
+ auto &prior_set = context< Peering >().prior_set;
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
+ pl->publish_stats_to_osd();
+}
+
+void PeeringState::Down::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_DOWN);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_down_latency, dur);
+
+ ps->blocked_by.clear();
+}
+
+boost::statechart::result PeeringState::Down::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment",
+ "not enough up instances of this PG to go active");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Down::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Down");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::Down::react(const MNotifyRec& infoevt)
+{
+ DECLARE_LOCALS;
+
+ ceph_assert(ps->is_primary());
+ epoch_t old_start = ps->info.history.last_epoch_started;
+ if (!ps->peer_info.count(infoevt.from) &&
+ ps->get_osdmap()->has_been_up_since(infoevt.from.osd, infoevt.notify.epoch_sent)) {
+ ps->update_history(infoevt.notify.info.history);
+ }
+ // if we got something new to make pg escape down state
+ if (ps->info.history.last_epoch_started > old_start) {
+ psdout(10) << " last_epoch_started moved forward, re-enter getinfo" << dendl;
+ ps->state_clear(PG_STATE_DOWN);
+ ps->state_set(PG_STATE_PEERING);
+ return transit< GetInfo >();
+ }
+
+ return discard_event();
+}
+
+
+/*------Incomplete--------*/
+PeeringState::Incomplete::Incomplete(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/Incomplete")
+{
+ context< PeeringMachine >().log_enter(state_name);
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_PEERING);
+ ps->state_set(PG_STATE_INCOMPLETE);
+
+ PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
+ ceph_assert(ps->blocked_by.empty());
+ ps->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
+ pl->publish_stats_to_osd();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const AdvMap &advmap) {
+ DECLARE_LOCALS;
+ int64_t poolnum = ps->info.pgid.pool();
+
+ // Reset if min_size has become smaller than its previous value; the pg might now be able to go active
+ if (!advmap.osdmap->have_pg_pool(poolnum) ||
+ advmap.lastmap->get_pools().find(poolnum)->second.min_size >
+ advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
+ post_event(advmap);
+ return transit< Reset >();
+ }
+
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const MNotifyRec& notevt) {
+ DECLARE_LOCALS;
+ psdout(7) << "handle_pg_notify from osd." << notevt.from << dendl;
+ if (ps->proc_replica_info(
+ notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
+ // We got something new, try again!
+ return transit< GetLog >();
+ } else {
+ return discard_event();
+ }
+}
+
+boost::statechart::result PeeringState::Incomplete::react(
+ const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "not enough complete instances of this PG");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::Incomplete::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "Incomplete");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::Incomplete::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+
+ ps->state_clear(PG_STATE_INCOMPLETE);
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_incomplete_latency, dur);
+
+ ps->blocked_by.clear();
+}
+
+/*------GetMissing--------*/
+PeeringState::GetMissing::GetMissing(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/GetMissing")
+{
+ context< PeeringMachine >().log_enter(state_name);
+
+ DECLARE_LOCALS;
+ ps->log_weirdness();
+ ceph_assert(!ps->acting_recovery_backfill.empty());
+ eversion_t since;
+ for (auto i = ps->acting_recovery_backfill.begin();
+ i != ps->acting_recovery_backfill.end();
+ ++i) {
+ if (*i == ps->get_primary()) continue;
+ const pg_info_t& pi = ps->peer_info[*i];
+ // reset this to make sure the pg_missing_t is initialized and
+ // has the correct semantics even if we don't need to get a
+ // missing set from a shard. This way later additions due to
+ // lost+unfound delete work properly.
+ ps->peer_missing[*i].may_include_deletes = !ps->perform_deletes_during_peering();
+
+ if (pi.is_empty())
+ continue; // no pg data, nothing divergent
+
+ if (pi.last_update < ps->pg_log.get_tail()) {
+ psdout(10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+ if (pi.last_backfill == hobject_t()) {
+ psdout(10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+
+ if (pi.last_update == pi.last_complete && // peer has no missing
+ pi.last_update == ps->info.last_update) { // peer is up to date
+ // replica has no missing and identical log as us. no need to
+ // pull anything.
+ // FIXME: we can do better here. if last_update==last_complete we
+ // can infer the rest!
+ psdout(10) << " osd." << *i << " has no missing, identical log" << dendl;
+ ps->peer_missing[*i].clear();
+ continue;
+ }
+
+ // We pull the log from the peer's last_epoch_started to ensure we
+ // get enough log to detect divergent updates.
+ since.epoch = pi.last_epoch_started;
+ ceph_assert(pi.last_update >= ps->info.log_tail); // or else choose_acting() did a bad thing
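+ // if the peer's log still reaches back to 'since' we only need its entries
+ // from that point on; otherwise its log is too short and we must ask for the
+ // full log.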
+ if (pi.log_tail <= since) {
+ psdout(10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
+ context< PeeringMachine >().send_query(
+ i->osd,
+ pg_query_t(
+ pg_query_t::LOG,
+ i->shard, ps->pg_whoami.shard,
+ since, ps->info.history,
+ ps->get_osdmap_epoch()));
+ } else {
+ psdout(10) << " requesting fulllog+missing from osd." << *i
+ << " (want since " << since << " < log.tail "
+ << pi.log_tail << ")" << dendl;
+ context< PeeringMachine >().send_query(
+ i->osd, pg_query_t(
+ pg_query_t::FULLLOG,
+ i->shard, ps->pg_whoami.shard,
+ ps->info.history, ps->get_osdmap_epoch()));
+ }
+ peer_missing_requested.insert(*i);
+ ps->blocked_by.insert(i->osd);
+ }
+
+ if (peer_missing_requested.empty()) {
+ if (ps->need_up_thru) {
+ psdout(10) << " still need up_thru update before going active"
+ << dendl;
+ post_event(NeedUpThru());
+ return;
+ }
+
+ // all good!
+ post_event(Activate(ps->get_osdmap_epoch()));
+ } else {
+ pl->publish_stats_to_osd();
+ }
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+
+ peer_missing_requested.erase(logevt.from);
+ ps->proc_replica_log(logevt.msg->info,
+ logevt.msg->log,
+ std::move(logevt.msg->missing),
+ logevt.from);
+
+ if (peer_missing_requested.empty()) {
+ if (ps->need_up_thru) {
+ psdout(10) << " still need up_thru update before going active"
+ << dendl;
+ post_event(NeedUpThru());
+ } else {
+ psdout(10) << "Got last missing, don't need missing "
+ << "posting Activate" << dendl;
+ post_event(Activate(ps->get_osdmap_epoch()));
+ }
+ }
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const QueryState& q)
+{
+ DECLARE_LOCALS;
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+
+ q.f->open_array_section("peer_missing_requested");
+ for (auto p = peer_missing_requested.begin();
+ p != peer_missing_requested.end();
+ ++p) {
+ q.f->open_object_section("osd");
+ q.f->dump_stream("osd") << *p;
+ if (ps->peer_missing.count(*p)) {
+ q.f->open_object_section("got_missing");
+ ps->peer_missing[*p].dump(q.f);
+ q.f->close_section();
+ }
+ q.f->close_section();
+ }
+ q.f->close_section();
+
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::GetMissing::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "GetMising");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::GetMissing::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_getmissing_latency, dur);
+ ps->blocked_by.clear();
+}
+
+/*------WaitUpThru--------*/
+PeeringState::WaitUpThru::WaitUpThru(my_context ctx)
+ : my_base(ctx),
+ NamedState(context< PeeringMachine >().state_history, "Started/Primary/Peering/WaitUpThru")
+{
+ context< PeeringMachine >().log_enter(state_name);
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const ActMap& am)
+{
+ DECLARE_LOCALS;
+ if (!ps->need_up_thru) {
+ post_event(Activate(ps->get_osdmap_epoch()));
+ }
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const MLogRec& logevt)
+{
+ DECLARE_LOCALS;
+ psdout(10) << "Noting missing from osd." << logevt.from << dendl;
+ ps->peer_missing[logevt.from].claim(std::move(logevt.msg->missing));
+ ps->peer_info[logevt.from] = logevt.msg->info;
+ return discard_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const QueryState& q)
+{
+ q.f->open_object_section("state");
+ q.f->dump_string("name", state_name);
+ q.f->dump_stream("enter_time") << enter_time;
+ q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
+ q.f->close_section();
+ return forward_event();
+}
+
+boost::statechart::result PeeringState::WaitUpThru::react(const QueryUnfound& q)
+{
+ q.f->dump_string("state", "WaitUpThru");
+ q.f->dump_bool("available_might_have_unfound", false);
+ return discard_event();
+}
+
+void PeeringState::WaitUpThru::exit()
+{
+ context< PeeringMachine >().log_exit(state_name, enter_time);
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ pl->get_peering_perf().tinc(rs_waitupthru_latency, dur);
+}
+
+/*----PeeringState::PeeringMachine Methods-----*/
+#undef dout_prefix
+#define dout_prefix dpp->gen_prefix(*_dout)
+
+void PeeringState::PeeringMachine::log_enter(const char *state_name)
+{
+ DECLARE_LOCALS;
+ psdout(5) << "enter " << state_name << dendl;
+ pl->log_state_enter(state_name);
+}
+
+void PeeringState::PeeringMachine::log_exit(const char *state_name, utime_t enter_time)
+{
+ DECLARE_LOCALS;
+ utime_t dur = ceph_clock_now() - enter_time;
+ psdout(5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
+ pl->log_state_exit(state_name, enter_time, event_count, event_time);
+ event_count = 0;
+ event_time = utime_t();
+}
+
+ostream &operator<<(ostream &out, const PeeringState &ps) {
+ out << "pg[" << ps.info
+ << " " << pg_vector_string(ps.up);
+ if (ps.acting != ps.up)
+ out << "/" << pg_vector_string(ps.acting);
+ if (ps.is_ec_pg())
+ out << "p" << ps.get_primary();
+ if (!ps.async_recovery_targets.empty())
+ out << " async=[" << ps.async_recovery_targets << "]";
+ if (!ps.backfill_targets.empty())
+ out << " backfill=[" << ps.backfill_targets << "]";
+ out << " r=" << ps.get_role();
+ out << " lpr=" << ps.get_last_peering_reset();
+
+ if (ps.deleting)
+ out << " DELETING";
+
+ if (!ps.past_intervals.empty()) {
+ out << " pi=[" << ps.past_intervals.get_bounds()
+ << ")/" << ps.past_intervals.size();
+ }
+
+ if (ps.is_peered()) {
+ if (ps.last_update_ondisk != ps.info.last_update)
+ out << " luod=" << ps.last_update_ondisk;
+ if (ps.last_update_applied != ps.info.last_update)
+ out << " lua=" << ps.last_update_applied;
+ }
+
+ if (ps.pg_log.get_tail() != ps.info.log_tail ||
+ ps.pg_log.get_head() != ps.info.last_update)
+ out << " (info mismatch, " << ps.pg_log.get_log() << ")";
+
+ if (!ps.pg_log.get_log().empty()) {
+ if ((ps.pg_log.get_log().log.begin()->version <= ps.pg_log.get_tail())) {
+ out << " (log bound mismatch, actual=["
+ << ps.pg_log.get_log().log.begin()->version << ","
+ << ps.pg_log.get_log().log.rbegin()->version << "]";
+ out << ")";
+ }
+ }
+
+ out << " crt=" << ps.pg_log.get_can_rollback_to();
+
+ if (ps.last_complete_ondisk != ps.info.last_complete)
+ out << " lcod " << ps.last_complete_ondisk;
+
+ out << " mlcod " << ps.min_last_complete_ondisk;
+
+ out << " " << pg_state_string(ps.get_state());
+ if (ps.should_send_notify())
+ out << " NOTIFY";
+
+ if (ps.prior_readable_until_ub != ceph::signedspan::zero()) {
+ out << " pruub " << ps.prior_readable_until_ub
+ << "@" << ps.get_prior_readable_down_osds();
+ }
+ return out;
+}
+
+std::vector<pg_shard_t> PeeringState::get_replica_recovery_order() const
+{
+ std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
+ async_by_num_missing;
+ replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
+ for (auto &p : get_acting_recovery_backfill()) {
+ if (p == get_primary()) {
+ continue;
+ }
+ auto pm = get_peer_missing().find(p);
+ assert(pm != get_peer_missing().end());
+ auto nm = pm->second.num_missing();
+ if (nm != 0) {
+ if (is_async_recovery_target(p)) {
+ async_by_num_missing.push_back(make_pair(nm, p));
+ } else {
+ replicas_by_num_missing.push_back(make_pair(nm, p));
+ }
+ }
+ }
+ // sort by number of missing objects, in ascending order.
+ auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
+ const std::pair<unsigned int, pg_shard_t> &rhs) {
+ return lhs.first < rhs.first;
+ };
+ // acting goes first
+ std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
+ // then async_recovery_targets
+ std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
+ replicas_by_num_missing.insert(replicas_by_num_missing.end(),
+ async_by_num_missing.begin(), async_by_num_missing.end());
+
+ std::vector<pg_shard_t> ret;
+ ret.reserve(replicas_by_num_missing.size());
+ for (auto p : replicas_by_num_missing) {
+ ret.push_back(p.second);
+ }
+ return ret;
+}
+
+
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
new file mode 100644
index 000000000..2cc340cb9
--- /dev/null
+++ b/src/osd/PeeringState.h
@@ -0,0 +1,2442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <string>
+#include <atomic>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "PGLog.h"
+#include "PGStateUtils.h"
+#include "PGPeeringEvent.h"
+#include "osd_types.h"
+#include "os/ObjectStore.h"
+#include "OSDMap.h"
+#include "MissingLoc.h"
+#include "osd/osd_perf_counters.h"
+#include "common/ostream_temp.h"
+
+struct PGPool {
+ epoch_t cached_epoch;
+ int64_t id;
+ std::string name;
+
+ pg_pool_t info;
+ SnapContext snapc; // the default pool snapc, ready to go.
+
+ PGPool(OSDMapRef map, int64_t i, const pg_pool_t& info,
+ const std::string& name)
+ : cached_epoch(map->get_epoch()),
+ id(i),
+ name(name),
+ info(info) {
+ snapc = info.get_snap_context();
+ }
+
+ void update(OSDMapRef map);
+
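+ // The read lease interval comes from the pool's READ_LEASE_INTERVAL option
+ // when set; otherwise it is the heartbeat grace scaled by the default
+ // read-lease ratio (e.g. a grace of 20s and a ratio of 0.8 yield a 16s lease).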
+ ceph::timespan get_readable_interval(ConfigProxy &conf) const {
+ double v = 0;
+ if (info.opts.get(pool_opts_t::READ_LEASE_INTERVAL, &v)) {
+ return ceph::make_timespan(v);
+ } else {
+ auto hbi = conf->osd_heartbeat_grace;
+ auto fac = conf->osd_pool_default_read_lease_ratio;
+ return ceph::make_timespan(hbi * fac);
+ }
+ }
+};
+
+struct PeeringCtx;
+
+// [primary only] content recovery state
+struct BufferedRecoveryMessages {
+ ceph_release_t require_osd_release;
+ std::map<int, std::vector<MessageRef>> message_map;
+
+ BufferedRecoveryMessages(ceph_release_t r)
+ : require_osd_release(r) {
+ }
+ BufferedRecoveryMessages(ceph_release_t r, PeeringCtx &ctx);
+
+ void accept_buffered_messages(BufferedRecoveryMessages &m) {
+ for (auto &[target, ls] : m.message_map) {
+ auto &ovec = message_map[target];
+ // put buffered messages in front
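+ // the incoming list keeps its order, anything we had already queued for the
+ // same target is appended after it, and the merged result is swapped back
+ // into our map (leaving the source list empty).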
+ ls.reserve(ls.size() + ovec.size());
+ ls.insert(ls.end(), ovec.begin(), ovec.end());
+ ovec.clear();
+ ovec.swap(ls);
+ }
+ }
+
+ void send_osd_message(int target, MessageRef m) {
+ message_map[target].push_back(std::move(m));
+ }
+ void send_notify(int to, const pg_notify_t &n);
+ void send_query(int to, spg_t spgid, const pg_query_t &q);
+ void send_info(int to, spg_t to_spgid,
+ epoch_t min_epoch, epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease = {},
+ std::optional<pg_lease_ack_t> lease_ack = {});
+};
+
+struct HeartbeatStamps : public RefCountedObject {
+ mutable ceph::mutex lock = ceph::make_mutex("HeartbeatStamps::lock");
+
+ const int osd;
+
+ // we maintain an upper and lower bound on the delta between our local
+ // mono_clock time (minus the startup_time) to the peer OSD's mono_clock
+ // time (minus its startup_time).
+ //
+ // delta is (remote_clock_time - local_clock_time), so that
+ // local_time + delta -> peer_time, and peer_time - delta -> local_time.
+ //
+ // we have an upper and lower bound value on this delta, meaning the
+ // value of the remote clock is somewhere between [my_time + lb, my_time + ub]
+ //
+ // conversely, if we have a remote timestamp T, then that is
+ // [T - ub, T - lb] in terms of the local clock. i.e., if you are
+ // subtracting the delta, then take care that you swap the role of the
+ // lb and ub values.
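+ //
+ // e.g. with peer_clock_delta_lb = +2s and peer_clock_delta_ub = +5s, a local
+ // time of 100s maps to a peer time somewhere in [102s, 107s], while a peer
+ // timestamp of 100s corresponds to a local time in [95s, 98s].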
+
+ /// lower bound on peer clock - local clock
+ std::optional<ceph::signedspan> peer_clock_delta_lb;
+
+ /// upper bound on peer clock - local clock
+ std::optional<ceph::signedspan> peer_clock_delta_ub;
+
+ /// highest up_from we've seen from this rank
+ epoch_t up_from = 0;
+
+ void print(std::ostream& out) const {
+ std::lock_guard l(lock);
+ out << "hbstamp(osd." << osd << " up_from " << up_from
+ << " peer_clock_delta [";
+ if (peer_clock_delta_lb) {
+ out << *peer_clock_delta_lb;
+ }
+ out << ",";
+ if (peer_clock_delta_ub) {
+ out << *peer_clock_delta_ub;
+ }
+ out << "])";
+ }
+
+ void sent_ping(std::optional<ceph::signedspan> *delta_ub) {
+ std::lock_guard l(lock);
+ // the non-primaries need a lower bound on remote clock - local clock. if
+ // we assume the transit for the last ping_reply was
+ // instantaneous, that would be (the negative of) our last
+ // peer_clock_delta_lb value.
+ if (peer_clock_delta_lb) {
+ *delta_ub = - *peer_clock_delta_lb;
+ }
+ }
+
+ void got_ping(epoch_t this_up_from,
+ ceph::signedspan now,
+ ceph::signedspan peer_send_stamp,
+ std::optional<ceph::signedspan> delta_ub,
+ ceph::signedspan *out_delta_ub) {
+ std::lock_guard l(lock);
+ if (this_up_from < up_from) {
+ return;
+ }
+ if (this_up_from > up_from) {
+ up_from = this_up_from;
+ }
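+ // transit time is non-negative, so (peer_send_stamp - now) can only
+ // under-estimate the true delta and is therefore a safe lower bound.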
+ peer_clock_delta_lb = peer_send_stamp - now;
+ peer_clock_delta_ub = delta_ub;
+ *out_delta_ub = - *peer_clock_delta_lb;
+ }
+
+ void got_ping_reply(ceph::signedspan now,
+ ceph::signedspan peer_send_stamp,
+ std::optional<ceph::signedspan> delta_ub) {
+ std::lock_guard l(lock);
+ peer_clock_delta_lb = peer_send_stamp - now;
+ peer_clock_delta_ub = delta_ub;
+ }
+
+private:
+ FRIEND_MAKE_REF(HeartbeatStamps);
+ HeartbeatStamps(int o)
+ : RefCountedObject(NULL),
+ osd(o) {}
+};
+using HeartbeatStampsRef = ceph::ref_t<HeartbeatStamps>;
+
+inline std::ostream& operator<<(std::ostream& out, const HeartbeatStamps& hb)
+{
+ hb.print(out);
+ return out;
+}
+
+
+struct PeeringCtx : BufferedRecoveryMessages {
+ ObjectStore::Transaction transaction;
+ HBHandle* handle = nullptr;
+
+ PeeringCtx(ceph_release_t r)
+ : BufferedRecoveryMessages(r) {}
+
+ PeeringCtx(const PeeringCtx &) = delete;
+ PeeringCtx &operator=(const PeeringCtx &) = delete;
+
+ PeeringCtx(PeeringCtx &&) = default;
+ PeeringCtx &operator=(PeeringCtx &&) = default;
+
+ void reset_transaction() {
+ transaction = ObjectStore::Transaction();
+ }
+};
+
+/**
+ * Wraps PeeringCtx to hide the difference between buffering messages to
+ * be sent after flush or immediately.
+ */
+struct PeeringCtxWrapper {
+ utime_t start_time;
+ BufferedRecoveryMessages &msgs;
+ ObjectStore::Transaction &transaction;
+ HBHandle * const handle = nullptr;
+
+ PeeringCtxWrapper(PeeringCtx &wrapped) :
+ msgs(wrapped),
+ transaction(wrapped.transaction),
+ handle(wrapped.handle) {}
+
+ PeeringCtxWrapper(BufferedRecoveryMessages &buf, PeeringCtx &wrapped)
+ : msgs(buf),
+ transaction(wrapped.transaction),
+ handle(wrapped.handle) {}
+
+ PeeringCtxWrapper(PeeringCtxWrapper &&ctx) = default;
+
+ void send_osd_message(int target, MessageRef m) {
+ msgs.send_osd_message(target, std::move(m));
+ }
+ void send_notify(int to, const pg_notify_t &n) {
+ msgs.send_notify(to, n);
+ }
+ void send_query(int to, spg_t spgid, const pg_query_t &q) {
+ msgs.send_query(to, spgid, q);
+ }
+ void send_info(int to, spg_t to_spgid,
+ epoch_t min_epoch, epoch_t cur_epoch,
+ const pg_info_t &info,
+ std::optional<pg_lease_t> lease = {},
+ std::optional<pg_lease_ack_t> lease_ack = {}) {
+ msgs.send_info(to, to_spgid, min_epoch, cur_epoch, info,
+ lease, lease_ack);
+ }
+};
+
+/* Encapsulates PG recovery process */
+class PeeringState : public MissingLoc::MappingInfo {
+public:
+ struct PeeringListener : public EpochSource {
+ /// Prepare t with written information
+ virtual void prepare_write(
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ PGLog &pglog,
+ bool dirty_info,
+ bool dirty_big_info,
+ bool need_write_epoch,
+ ObjectStore::Transaction &t) = 0;
+
+ /// Notify that info/history changed (generally to update scrub registration)
+ virtual void on_info_history_change() = 0;
+ /// Notify that a scrub has been requested
+ virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0;
+
+ /// Return current snap_trimq size
+ virtual uint64_t get_snap_trimq_size() const = 0;
+
+ /// Send cluster message to osd
+ virtual void send_cluster_message(
+ int osd, MessageRef m, epoch_t epoch, bool share_map_update=false) = 0;
+ /// Send pg_created to mon
+ virtual void send_pg_created(pg_t pgid) = 0;
+
+ virtual ceph::signedspan get_mnow() = 0;
+ virtual HeartbeatStampsRef get_hb_stamps(int peer) = 0;
+ virtual void schedule_renew_lease(epoch_t plr, ceph::timespan delay) = 0;
+ virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0;
+ virtual void recheck_readable() = 0;
+
+ virtual unsigned get_target_pg_log_entries() const = 0;
+
+ // ============ Flush state ==================
+ /**
+ * try_flush_or_schedule_async()
+ *
+ * If true, caller may assume all past operations on this pg
+ * have been flushed. Else, caller will receive an on_flushed()
+ * call once the flush has completed.
+ */
+ virtual bool try_flush_or_schedule_async() = 0;
+ /// Arranges for a commit on t to call on_flushed() once flushed.
+ virtual void start_flush_on_transaction(
+ ObjectStore::Transaction &t) = 0;
+ /// Notification that all outstanding flushes for interval have completed
+ virtual void on_flushed() = 0;
+
+ //============= Recovery ====================
+ /// Arrange for event to be queued after delay
+ virtual void schedule_event_after(
+ PGPeeringEventRef event,
+ float delay) = 0;
+ /**
+ * request_local_background_io_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_local_background_io_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Modify pending local background reservation request priority
+ virtual void update_local_background_io_priority(
+ unsigned priority) = 0;
+ /// Cancel pending local background reservation request
+ virtual void cancel_local_background_io_reservation() = 0;
+
+ /**
+ * request_remote_background_io_reservation
+ *
+ * Request reservation at priority with on_grant queued on grant
+ * and on_preempt on preempt
+ */
+ virtual void request_remote_recovery_reservation(
+ unsigned priority,
+ PGPeeringEventURef on_grant,
+ PGPeeringEventURef on_preempt) = 0;
+ /// Cancel pending remote background reservation request
+ virtual void cancel_remote_recovery_reservation() = 0;
+
+ /// Arrange for on_commit to be queued upon commit of t
+ virtual void schedule_event_on_commit(
+ ObjectStore::Transaction &t,
+ PGPeeringEventRef on_commit) = 0;
+
+ //============================ HB =============================
+ /// Update hb set to peers
+ virtual void update_heartbeat_peers(std::set<int> peers) = 0;
+
+ /// Set targets being probed in this interval
+ virtual void set_probe_targets(const std::set<pg_shard_t> &probe_set) = 0;
+ /// Clear targets being probed in this interval
+ virtual void clear_probe_targets() = 0;
+
+ /// Queue for a pg_temp of wanted
+ virtual void queue_want_pg_temp(const std::vector<int> &wanted) = 0;
+ /// Clear queue for a pg_temp of wanted
+ virtual void clear_want_pg_temp() = 0;
+
+ /// Arrange for stats to be shipped to mon to be updated for this pg
+ virtual void publish_stats_to_osd() = 0;
+ /// Clear stats to be shipped to mon for this pg
+ virtual void clear_publish_stats() = 0;
+
+ /// Notification to check outstanding operation targets
+ virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
+ /// Notification to check outstanding blocklist
+ virtual void check_blocklisted_watchers() = 0;
+ /// Notification to clear state associated with primary
+ virtual void clear_primary_state() = 0;
+
+ // =================== Event notification ====================
+ virtual void on_pool_change() = 0;
+ virtual void on_role_change() = 0;
+ virtual void on_change(ObjectStore::Transaction &t) = 0;
+ virtual void on_activate(interval_set<snapid_t> to_trim) = 0;
+ virtual void on_activate_complete() = 0;
+ virtual void on_new_interval() = 0;
+ virtual Context *on_clean() = 0;
+ virtual void on_activate_committed() = 0;
+ virtual void on_active_exit() = 0;
+
+ // ====================== PG deletion =======================
+  /// Notification of removal complete; t must be populated to complete removal
+ virtual void on_removal(ObjectStore::Transaction &t) = 0;
+ /// Perform incremental removal work
+ virtual std::pair<ghobject_t, bool> do_delete_work(
+ ObjectStore::Transaction &t, ghobject_t _next) = 0;
+
+ // ======================= PG Merge =========================
+ virtual void clear_ready_to_merge() = 0;
+ virtual void set_not_ready_to_merge_target(pg_t pgid, pg_t src) = 0;
+ virtual void set_not_ready_to_merge_source(pg_t pgid) = 0;
+ virtual void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) = 0;
+ virtual void set_ready_to_merge_source(eversion_t lu) = 0;
+
+  // ====================== Map notifications =====================
+ virtual void on_active_actmap() = 0;
+ virtual void on_active_advmap(const OSDMapRef &osdmap) = 0;
+ virtual epoch_t oldest_stored_osdmap() = 0;
+
+ // ============ recovery reservation notifications ==========
+ virtual void on_backfill_reserved() = 0;
+ virtual void on_backfill_canceled() = 0;
+ virtual void on_recovery_reserved() = 0;
+
+ // ================recovery space accounting ================
+ virtual bool try_reserve_recovery_space(
+ int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+ virtual void unreserve_recovery_space() = 0;
+
+ // ================== Peering log events ====================
+ /// Get handler for rolling forward/back log entries
+ virtual PGLog::LogEntryHandlerRef get_log_handler(
+ ObjectStore::Transaction &t) = 0;
+
+ // ============ On disk representation changes ==============
+ virtual void rebuild_missing_set_with_deletes(PGLog &pglog) = 0;
+
+ // ======================= Logging ==========================
+ virtual PerfCounters &get_peering_perf() = 0;
+ virtual PerfCounters &get_perf_logger() = 0;
+ virtual void log_state_enter(const char *state) = 0;
+ virtual void log_state_exit(
+ const char *state_name, utime_t enter_time,
+ uint64_t events, utime_t event_dur) = 0;
+ virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
+
+ virtual OstreamTemp get_clog_info() = 0;
+ virtual OstreamTemp get_clog_error() = 0;
+ virtual OstreamTemp get_clog_debug() = 0;
+
+ virtual ~PeeringListener() {}
+ };
+
+ struct QueryState : boost::statechart::event< QueryState > {
+ ceph::Formatter *f;
+ explicit QueryState(ceph::Formatter *f) : f(f) {}
+ void print(std::ostream *out) const {
+ *out << "Query";
+ }
+ };
+
+ struct QueryUnfound : boost::statechart::event< QueryUnfound > {
+ ceph::Formatter *f;
+ explicit QueryUnfound(ceph::Formatter *f) : f(f) {}
+ void print(std::ostream *out) const {
+ *out << "QueryUnfound";
+ }
+ };
+
+ struct AdvMap : boost::statechart::event< AdvMap > {
+ OSDMapRef osdmap;
+ OSDMapRef lastmap;
+ std::vector<int> newup, newacting;
+ int up_primary, acting_primary;
+ AdvMap(
+ OSDMapRef osdmap, OSDMapRef lastmap,
+ std::vector<int>& newup, int up_primary,
+ std::vector<int>& newacting, int acting_primary):
+ osdmap(osdmap), lastmap(lastmap),
+ newup(newup),
+ newacting(newacting),
+ up_primary(up_primary),
+ acting_primary(acting_primary) {}
+ void print(std::ostream *out) const {
+ *out << "AdvMap";
+ }
+ };
+
+ struct ActMap : boost::statechart::event< ActMap > {
+ ActMap() : boost::statechart::event< ActMap >() {}
+ void print(std::ostream *out) const {
+ *out << "ActMap";
+ }
+ };
+ struct Activate : boost::statechart::event< Activate > {
+ epoch_t activation_epoch;
+ explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
+ activation_epoch(q) {}
+ void print(std::ostream *out) const {
+ *out << "Activate from " << activation_epoch;
+ }
+ };
+ struct ActivateCommitted : boost::statechart::event< ActivateCommitted > {
+ epoch_t epoch;
+ epoch_t activation_epoch;
+ explicit ActivateCommitted(epoch_t e, epoch_t ae)
+ : boost::statechart::event< ActivateCommitted >(),
+ epoch(e),
+ activation_epoch(ae) {}
+ void print(std::ostream *out) const {
+ *out << "ActivateCommitted from " << activation_epoch
+ << " processed at " << epoch;
+ }
+ };
+public:
+ struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
+ explicit UnfoundBackfill() {}
+ void print(std::ostream *out) const {
+ *out << "UnfoundBackfill";
+ }
+ };
+ struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
+ explicit UnfoundRecovery() {}
+ void print(std::ostream *out) const {
+ *out << "UnfoundRecovery";
+ }
+ };
+
+ struct RequestScrub : boost::statechart::event<RequestScrub> {
+ scrub_level_t deep;
+ scrub_type_t repair;
+ explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {}
+ void print(std::ostream *out) const {
+ *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow")
+ << ((repair==scrub_type_t::do_repair) ? " repair)" : ")");
+ }
+ };
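+  // For example (assuming, as a sketch, that scrub_level_t::deep and
+  // scrub_type_t::do_repair correspond to the boolean true values above),
+  // RequestScrub(true, false) requests a deep scrub without repair.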
+
+ TrivialEvent(Initialize)
+ TrivialEvent(GotInfo)
+ TrivialEvent(NeedUpThru)
+ TrivialEvent(Backfilled)
+ TrivialEvent(LocalBackfillReserved)
+ TrivialEvent(RejectTooFullRemoteReservation)
+ TrivialEvent(RequestBackfill)
+ TrivialEvent(RemoteRecoveryPreempted)
+ TrivialEvent(RemoteBackfillPreempted)
+ TrivialEvent(BackfillTooFull)
+ TrivialEvent(RecoveryTooFull)
+
+ TrivialEvent(MakePrimary)
+ TrivialEvent(MakeStray)
+ TrivialEvent(NeedActingChange)
+ TrivialEvent(IsIncomplete)
+ TrivialEvent(IsDown)
+
+ TrivialEvent(AllReplicasRecovered)
+ TrivialEvent(DoRecovery)
+ TrivialEvent(LocalRecoveryReserved)
+ TrivialEvent(AllRemotesReserved)
+ TrivialEvent(AllBackfillsReserved)
+ TrivialEvent(GoClean)
+
+ TrivialEvent(AllReplicasActivated)
+
+ TrivialEvent(IntervalFlush)
+
+ TrivialEvent(DeleteStart)
+ TrivialEvent(DeleteSome)
+
+ TrivialEvent(SetForceRecovery)
+ TrivialEvent(UnsetForceRecovery)
+ TrivialEvent(SetForceBackfill)
+ TrivialEvent(UnsetForceBackfill)
+
+ TrivialEvent(DeleteReserved)
+ TrivialEvent(DeleteInterrupted)
+
+ TrivialEvent(CheckReadable)
+
+ void start_handle(PeeringCtx *new_ctx);
+ void end_handle();
+ void begin_block_outgoing();
+ void end_block_outgoing();
+ void clear_blocked_outgoing();
+ private:
+
+ /* States */
+ struct Initial;
+ class PeeringMachine : public boost::statechart::state_machine< PeeringMachine, Initial > {
+ public:
+ PeeringState *state;
+ PGStateHistory *state_history;
+ CephContext *cct;
+ spg_t spgid;
+ DoutPrefixProvider *dpp;
+ PeeringListener *pl;
+
+ utime_t event_time;
+ uint64_t event_count;
+
+ void clear_event_counters() {
+ event_time = utime_t();
+ event_count = 0;
+ }
+
+ void log_enter(const char *state_name);
+ void log_exit(const char *state_name, utime_t duration);
+
+ PeeringMachine(
+ PeeringState *state, CephContext *cct,
+ spg_t spgid,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl,
+ PGStateHistory *state_history) :
+ state(state),
+ state_history(state_history),
+ cct(cct), spgid(spgid),
+ dpp(dpp), pl(pl),
+ event_count(0) {}
+
+ /* Accessor functions for state methods */
+ ObjectStore::Transaction& get_cur_transaction() {
+ ceph_assert(state->rctx);
+ return state->rctx->transaction;
+ }
+
+ PeeringCtxWrapper &get_recovery_ctx() {
+      ceph_assert(state->rctx);
+ return *(state->rctx);
+ }
+
+ void send_notify(int to, const pg_notify_t &n) {
+ ceph_assert(state->rctx);
+ state->rctx->send_notify(to, n);
+ }
+ void send_query(int to, const pg_query_t &query) {
+ state->rctx->send_query(
+ to,
+ spg_t(spgid.pgid, query.to),
+ query);
+ }
+ };
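+  /*
+   * Illustrative sketch only (SomeState is a hypothetical state): reactions
+   * reach the machine, and through it the PeeringState and listener, via the
+   * boost::statechart context, e.g.
+   *
+   *   boost::statechart::result SomeState::react(const ActMap&) {
+   *     PeeringMachine &machine = context<PeeringMachine>();
+   *     ObjectStore::Transaction &t = machine.get_cur_transaction();
+   *     machine.send_notify(osd, notify);  // osd/notify are hypothetical locals
+   *     return discard_event();
+   *   }
+   */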
+ friend class PeeringMachine;
+
+ /* States */
+ // Initial
+ // Reset
+ // Start
+ // Started
+ // Primary
+ // WaitActingChange
+ // Peering
+ // GetInfo
+ // GetLog
+ // GetMissing
+ // WaitUpThru
+ // Incomplete
+ // Active
+ // Activating
+ // Clean
+ // Recovered
+ // Backfilling
+ // WaitRemoteBackfillReserved
+ // WaitLocalBackfillReserved
+ // NotBackfilling
+ // NotRecovering
+ // Recovering
+ // WaitRemoteRecoveryReserved
+ // WaitLocalRecoveryReserved
+ // ReplicaActive
+ // RepNotRecovering
+ // RepRecovering
+ // RepWaitBackfillReserved
+ // RepWaitRecoveryReserved
+ // Stray
+ // ToDelete
+ // WaitDeleteReserved
+ // Deleting
+ // Crashed
+
+ struct Crashed : boost::statechart::state< Crashed, PeeringMachine >, NamedState {
+ explicit Crashed(my_context ctx);
+ };
+
+ struct Reset;
+
+ struct Initial : boost::statechart::state< Initial, PeeringMachine >, NamedState {
+ explicit Initial(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::transition< Initialize, Reset >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+
+ boost::statechart::result react(const MNotifyRec&);
+ boost::statechart::result react(const MInfoRec&);
+ boost::statechart::result react(const MLogRec&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Reset : boost::statechart::state< Reset, PeeringMachine >, NamedState {
+ explicit Reset(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction< IntervalFlush >,
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Start;
+
+ struct Started : boost::statechart::state< Started, PeeringMachine, Start >, NamedState {
+ explicit Started(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< IntervalFlush >,
+ // ignored
+ boost::statechart::custom_reaction< NullEvt >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<UnsetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>,
+ boost::statechart::custom_reaction<UnsetForceBackfill>,
+ boost::statechart::custom_reaction<RequestScrub>,
+ boost::statechart::custom_reaction<CheckReadable>,
+ // crash
+ boost::statechart::transition< boost::statechart::event_base, Crashed >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const IntervalFlush&);
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Primary;
+ struct Stray;
+
+ struct Start : boost::statechart::state< Start, Started >, NamedState {
+ explicit Start(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::transition< MakePrimary, Primary >,
+ boost::statechart::transition< MakeStray, Stray >
+ > reactions;
+ };
+
+ struct Peering;
+ struct WaitActingChange;
+ struct Incomplete;
+ struct Down;
+
+ struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
+ explicit Primary(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<UnsetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>,
+ boost::statechart::custom_reaction<UnsetForceBackfill>,
+ boost::statechart::custom_reaction<RequestScrub>
+ > reactions;
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const MNotifyRec&);
+ boost::statechart::result react(const SetForceRecovery&);
+ boost::statechart::result react(const UnsetForceRecovery&);
+ boost::statechart::result react(const SetForceBackfill&);
+ boost::statechart::result react(const UnsetForceBackfill&);
+ boost::statechart::result react(const RequestScrub&);
+ };
+
+ struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
+ NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MNotifyRec >
+ > reactions;
+ explicit WaitActingChange(my_context ctx);
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const MLogRec&);
+ boost::statechart::result react(const MInfoRec&);
+ boost::statechart::result react(const MNotifyRec&);
+ void exit();
+ };
+
+ struct GetInfo;
+ struct Active;
+
+ struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
+ PastIntervals::PriorSet prior_set;
+    bool history_les_bound;  ///< need osd_find_best_info_ignore_history_les
+
+ explicit Peering(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< Activate, Active >,
+ boost::statechart::custom_reaction< AdvMap >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const AdvMap &advmap);
+ };
+
+ struct WaitLocalRecoveryReserved;
+ struct Activating;
+ struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
+ explicit Active(my_context ctx);
+ void exit();
+
+ const std::set<pg_shard_t> remote_shards_to_reserve_recovery;
+ const std::set<pg_shard_t> remote_shards_to_reserve_backfill;
+ bool all_replicas_activated;
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MTrim >,
+ boost::statechart::custom_reaction< Backfilled >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< AllReplicasActivated >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+ boost::statechart::custom_reaction< RemoteReservationRevoked>,
+ boost::statechart::custom_reaction< DoRecovery>,
+ boost::statechart::custom_reaction< RenewLease>,
+ boost::statechart::custom_reaction< MLeaseAck>,
+ boost::statechart::custom_reaction< CheckReadable>
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const MNotifyRec& notevt);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MTrim& trimevt);
+ boost::statechart::result react(const Backfilled&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const ActivateCommitted&);
+ boost::statechart::result react(const AllReplicasActivated&);
+ boost::statechart::result react(const RenewLease&);
+ boost::statechart::result react(const MLeaseAck&);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationRevoked&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DoRecovery&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const CheckReadable&);
+ void all_activated_and_committed();
+ };
+
+ struct Clean : boost::statechart::state< Clean, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction<SetForceRecovery>,
+ boost::statechart::custom_reaction<SetForceBackfill>
+ > reactions;
+ explicit Clean(my_context ctx);
+ void exit();
+ boost::statechart::result react(const boost::statechart::event_base&) {
+ return discard_event();
+ }
+ };
+
+ struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< GoClean, Clean >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction< AllReplicasActivated >
+ > reactions;
+ explicit Recovered(my_context ctx);
+ void exit();
+ boost::statechart::result react(const AllReplicasActivated&) {
+ post_event(GoClean());
+ return forward_event();
+ }
+ };
+
+ struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< Backfilled >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+ boost::statechart::custom_reaction< RemoteReservationRevoked>
+ > reactions;
+ explicit Backfilling(my_context ctx);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) {
+ // for compat with old peers
+ post_event(RemoteReservationRevokedTooFull());
+ return discard_event();
+ }
+ void backfill_release_reservations();
+ boost::statechart::result react(const Backfilled& evt);
+ boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
+ boost::statechart::result react(const RemoteReservationRevoked& evt);
+ boost::statechart::result react(const DeferBackfill& evt);
+ boost::statechart::result react(const UnfoundBackfill& evt);
+ void cancel_backfill();
+ void exit();
+ };
+
+ struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationRevoked >,
+ boost::statechart::transition< AllBackfillsReserved, Backfilling >
+ > reactions;
+ std::set<pg_shard_t>::const_iterator backfill_osd_it;
+ explicit WaitRemoteBackfillReserved(my_context ctx);
+ void retry();
+ void exit();
+ boost::statechart::result react(const RemoteBackfillReserved& evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
+ boost::statechart::result react(const RemoteReservationRevoked& evt);
+ };
+
+ struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >
+ > reactions;
+ explicit WaitLocalBackfillReserved(my_context ctx);
+ boost::statechart::result react(const RemoteBackfillReserved& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >
+ > reactions;
+ explicit NotBackfilling(my_context ctx);
+ void exit();
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const RemoteBackfillReserved& evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
+ };
+
+ struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< UnfoundRecovery >
+ > reactions;
+ explicit NotRecovering(my_context ctx);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const DeferRecovery& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ /* no-op */
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct ToDelete;
+ struct RepNotRecovering;
+ struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
+ explicit ReplicaActive(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MQuery >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MTrim >,
+ boost::statechart::custom_reaction< Activate >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< DeferBackfill >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< UnfoundBackfill >,
+ boost::statechart::custom_reaction< RemoteBackfillPreempted >,
+ boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+ boost::statechart::custom_reaction< RecoveryDone >,
+ boost::statechart::transition<DeleteStart, ToDelete>,
+ boost::statechart::custom_reaction< MLease >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MTrim& trimevt);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const MQuery&);
+ boost::statechart::result react(const Activate&);
+ boost::statechart::result react(const ActivateCommitted&);
+ boost::statechart::result react(const MLease&);
+ boost::statechart::result react(const RecoveryDone&) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const DeferBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundRecovery& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const UnfoundBackfill& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteBackfillPreempted& evt) {
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
+ return discard_event();
+ }
+ };
+
+ struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::transition< RecoveryDone, RepNotRecovering >,
+ // for compat with old peers
+ boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+ boost::statechart::custom_reaction< BackfillTooFull >,
+ boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+ boost::statechart::custom_reaction< RemoteBackfillPreempted >
+ > reactions;
+ explicit RepRecovering(my_context ctx);
+ boost::statechart::result react(const RemoteRecoveryPreempted &evt);
+ boost::statechart::result react(const BackfillTooFull &evt);
+ boost::statechart::result react(const RemoteBackfillPreempted &evt);
+ void exit();
+ };
+
+ struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
+ > reactions;
+ explicit RepWaitBackfillReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RemoteBackfillReserved &evt);
+ boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull &evt);
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
+ };
+
+ struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ // for compat with old peers
+ boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
+ boost::statechart::custom_reaction< RemoteReservationCanceled >
+ > reactions;
+ explicit RepWaitRecoveryReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RemoteRecoveryReserved &evt);
+ boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) {
+ // for compat with old peers
+ post_event(RemoteReservationCanceled());
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteReservationCanceled &evt);
+ };
+
+ struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
+ typedef boost::mpl::list<
+ boost::statechart::custom_reaction< RequestRecoveryPrio >,
+ boost::statechart::custom_reaction< RequestBackfillPrio >,
+ boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
+ boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
+ boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ boost::statechart::custom_reaction< RemoteBackfillReserved >,
+ boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
+ > reactions;
+ explicit RepNotRecovering(my_context ctx);
+ boost::statechart::result react(const RequestRecoveryPrio &evt);
+ boost::statechart::result react(const RequestBackfillPrio &evt);
+ boost::statechart::result react(const RemoteBackfillReserved &evt) {
+ // my reservation completion raced with a RELEASE from primary
+ return discard_event();
+ }
+ boost::statechart::result react(const RemoteRecoveryReserved &evt) {
+ // my reservation completion raced with a RELEASE from primary
+ return discard_event();
+ }
+ boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
+ void exit();
+ };
+
+ struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< AllReplicasRecovered >,
+ boost::statechart::custom_reaction< DeferRecovery >,
+ boost::statechart::custom_reaction< UnfoundRecovery >,
+ boost::statechart::custom_reaction< RequestBackfill >
+ > reactions;
+ explicit Recovering(my_context ctx);
+ void exit();
+ void release_reservations(bool cancel = false);
+ boost::statechart::result react(const AllReplicasRecovered &evt);
+ boost::statechart::result react(const DeferRecovery& evt);
+ boost::statechart::result react(const UnfoundRecovery& evt);
+ boost::statechart::result react(const RequestBackfill &evt);
+ };
+
+ struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+ boost::statechart::transition< AllRemotesReserved, Recovering >
+ > reactions;
+ std::set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
+ explicit WaitRemoteRecoveryReserved(my_context ctx);
+ boost::statechart::result react(const RemoteRecoveryReserved &evt);
+ void exit();
+ };
+
+ struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
+ boost::statechart::custom_reaction< RecoveryTooFull >
+ > reactions;
+ explicit WaitLocalRecoveryReserved(my_context ctx);
+ void exit();
+ boost::statechart::result react(const RecoveryTooFull &evt);
+ };
+
+ struct Activating : boost::statechart::state< Activating, Active >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition< AllReplicasRecovered, Recovered >,
+ boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+ boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
+ > reactions;
+ explicit Activating(my_context ctx);
+ void exit();
+ };
+
+ struct Stray : boost::statechart::state< Stray, Started >,
+ NamedState {
+ explicit Stray(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< MQuery >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< MInfoRec >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< RecoveryDone >,
+ boost::statechart::transition<DeleteStart, ToDelete>
+ > reactions;
+ boost::statechart::result react(const MQuery& query);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const MInfoRec& infoevt);
+ boost::statechart::result react(const ActMap&);
+ boost::statechart::result react(const RecoveryDone&) {
+ return discard_event();
+ }
+ };
+
+ struct WaitDeleteReserved;
+ struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
+ unsigned priority = 0;
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< ActivateCommitted >,
+ boost::statechart::custom_reaction< DeleteSome >
+ > reactions;
+ explicit ToDelete(my_context ctx);
+ boost::statechart::result react(const ActMap &evt);
+ boost::statechart::result react(const DeleteSome &evt) {
+ // happens if we drop out of Deleting due to reprioritization etc.
+ return discard_event();
+ }
+ boost::statechart::result react(const ActivateCommitted&) {
+      // Can happen if we were activated as a stray but not actually pulled
+      // from, prior to the pg going clean and sending a delete.
+ return discard_event();
+ }
+ void exit();
+ };
+
+ struct Deleting;
+ struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
+ ToDelete>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::transition<DeleteReserved, Deleting>
+ > reactions;
+ explicit WaitDeleteReserved(my_context ctx);
+ void exit();
+ };
+
+ struct Deleting : boost::statechart::state<Deleting,
+ ToDelete>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< DeleteSome >,
+ boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
+ > reactions;
+ ghobject_t next;
+ explicit Deleting(my_context ctx);
+ boost::statechart::result react(const DeleteSome &evt);
+ void exit();
+ };
+
+ struct GetLog;
+
+ struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
+ std::set<pg_shard_t> peer_info_requested;
+
+ explicit GetInfo(my_context ctx);
+ void exit();
+ void get_infos();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::transition< GotInfo, GetLog >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::transition< IsDown, Down >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ };
+
+ struct GotLog : boost::statechart::event< GotLog > {
+ GotLog() : boost::statechart::event< GotLog >() {}
+ };
+
+ struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
+ pg_shard_t auth_log_shard;
+ boost::intrusive_ptr<MOSDPGLog> msg;
+
+ explicit GetLog(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::custom_reaction< GotLog >,
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::transition< NeedActingChange, WaitActingChange >,
+ boost::statechart::transition< IsIncomplete, Incomplete >
+ > reactions;
+ boost::statechart::result react(const AdvMap&);
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MLogRec& logevt);
+ boost::statechart::result react(const GotLog&);
+ };
+
+ struct WaitUpThru;
+
+ struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
+ std::set<pg_shard_t> peer_missing_requested;
+
+ explicit GetMissing(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MLogRec >,
+ boost::statechart::transition< NeedUpThru, WaitUpThru >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MLogRec& logevt);
+ };
+
+ struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
+ explicit WaitUpThru(my_context ctx);
+ void exit();
+
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< ActMap >,
+ boost::statechart::custom_reaction< MLogRec >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const ActMap& am);
+ boost::statechart::result react(const MLogRec& logrec);
+ };
+
+ struct Down : boost::statechart::state< Down, Peering>, NamedState {
+ explicit Down(my_context ctx);
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< QueryState >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< MNotifyRec >
+ > reactions;
+ boost::statechart::result react(const QueryState& q);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ void exit();
+ };
+
+ struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< AdvMap >,
+ boost::statechart::custom_reaction< MNotifyRec >,
+ boost::statechart::custom_reaction< QueryUnfound >,
+ boost::statechart::custom_reaction< QueryState >
+ > reactions;
+ explicit Incomplete(my_context ctx);
+ boost::statechart::result react(const AdvMap &advmap);
+ boost::statechart::result react(const MNotifyRec& infoevt);
+ boost::statechart::result react(const QueryUnfound& q);
+ boost::statechart::result react(const QueryState& q);
+ void exit();
+ };
+
+ PGStateHistory state_history;
+ CephContext* cct;
+ spg_t spgid;
+ DoutPrefixProvider *dpp;
+ PeeringListener *pl;
+
+ /// context passed in by state machine caller
+ PeeringCtx *orig_ctx;
+
+ /// populated if we are buffering messages pending a flush
+ std::optional<BufferedRecoveryMessages> messages_pending_flush;
+
+ /**
+ * populated between start_handle() and end_handle(), points into
+ * the message lists for messages_pending_flush while blocking messages
+ * or into orig_ctx otherwise
+ */
+ std::optional<PeeringCtxWrapper> rctx;
+
+ /**
+ * OSDMap state
+ */
+ OSDMapRef osdmap_ref; ///< Reference to current OSDMap
+ PGPool pool; ///< Current pool state
+ epoch_t last_persisted_osdmap = 0; ///< Last osdmap epoch persisted
+
+
+ /**
+ * Peering state information
+ */
+ int role = -1; ///< 0 = primary, 1 = replica, -1=none.
+ uint64_t state = 0; ///< PG_STATE_*
+
+ pg_shard_t primary; ///< id/shard of primary
+ pg_shard_t pg_whoami; ///< my id/shard
+ pg_shard_t up_primary; ///< id/shard of primary of up set
+ std::vector<int> up; ///< crush mapping without temp pgs
+ std::set<pg_shard_t> upset; ///< up in set form
+ std::vector<int> acting; ///< actual acting set for the current interval
+ std::set<pg_shard_t> actingset; ///< acting in set form
+
+ /// union of acting, recovery, and backfill targets
+ std::set<pg_shard_t> acting_recovery_backfill;
+
+ std::vector<HeartbeatStampsRef> hb_stamps;
+
+ ceph::signedspan readable_interval = ceph::signedspan::zero();
+
+ /// how long we can service reads in this interval
+ ceph::signedspan readable_until = ceph::signedspan::zero();
+
+ /// upper bound on any acting OSDs' readable_until in this interval
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ /// upper bound from prior interval(s)
+ ceph::signedspan prior_readable_until_ub = ceph::signedspan::zero();
+
+  /// down OSDs from prior interval(s) that may still hold readable pg instances
+ std::set<int> prior_readable_down_osds;
+
+ /// [replica] upper bound we got from the primary (primary's clock)
+ ceph::signedspan readable_until_ub_from_primary = ceph::signedspan::zero();
+
+ /// [primary] last upper bound shared by primary to replicas
+ ceph::signedspan readable_until_ub_sent = ceph::signedspan::zero();
+
+ /// [primary] readable ub acked by acting set members
+ std::vector<ceph::signedspan> acting_readable_until_ub;
+
+ bool send_notify = false; ///< True if a notify needs to be sent to the primary
+
+  bool dirty_info = false;          ///< small info structure on disk out of date
+ bool dirty_big_info = false; ///< big info structure on disk out of date
+
+ pg_info_t info; ///< current pg info
+ pg_info_t last_written_info; ///< last written info
+ PastIntervals past_intervals; ///< information about prior pg mappings
+ PGLog pg_log; ///< pg log
+
+ epoch_t last_peering_reset = 0; ///< epoch of last peering reset
+
+ /// last_update that has committed; ONLY DEFINED WHEN is_active()
+ eversion_t last_update_ondisk;
+ eversion_t last_complete_ondisk; ///< last_complete that has committed.
+ eversion_t last_update_applied; ///< last_update readable
+ /// last version to which rollback_info trimming has been applied
+ eversion_t last_rollback_info_trimmed_to_applied;
+
+ /// Counter to determine when pending flushes have completed
+ unsigned flushes_in_progress = 0;
+
+ /**
+ * Primary state
+ */
+ std::set<pg_shard_t> stray_set; ///< non-acting osds that have PG data.
+ std::map<pg_shard_t, pg_info_t> peer_info; ///< info from peers (stray or prior)
+ std::map<pg_shard_t, int64_t> peer_bytes; ///< Peer's num_bytes from peer_info
+ std::set<pg_shard_t> peer_purged; ///< peers purged
+ std::map<pg_shard_t, pg_missing_t> peer_missing; ///< peer missing sets
+ std::set<pg_shard_t> peer_log_requested; ///< logs i've requested (and start stamps)
+ std::set<pg_shard_t> peer_missing_requested; ///< missing sets requested
+
+ /// features supported by all peers
+ uint64_t peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by acting set
+ uint64_t acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ /// features supported by up and acting
+ uint64_t upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+
+ /// most recently consumed osdmap's require_osd_version
+ ceph_release_t last_require_osd_release = ceph_release_t::unknown;
+
+ std::vector<int> want_acting; ///< non-empty while peering needs a new acting set
+
+ // acting_recovery_backfill contains shards that are acting,
+ // async recovery targets, or backfill targets.
+ std::map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
+
+ /// up: min over last_complete_ondisk, peer_last_complete_ondisk
+ eversion_t min_last_complete_ondisk;
+ /// point to which the log should be trimmed
+ eversion_t pg_trim_to;
+
+ std::set<int> blocked_by; ///< osds we are blocked by (for pg stats)
+
+ bool need_up_thru = false; ///< true if osdmap with updated up_thru needed
+
+ /// I deleted these strays; ignore racing PGInfo from them
+ std::set<pg_shard_t> peer_activated;
+
+ std::set<pg_shard_t> backfill_targets; ///< osds to be backfilled
+ std::set<pg_shard_t> async_recovery_targets; ///< osds to be async recovered
+
+ /// osds which might have objects on them which are unfound on the primary
+ std::set<pg_shard_t> might_have_unfound;
+
+  bool deleting = false; ///< true while the pg is being removed or the OSD is shutting down
+  std::atomic<bool> deleted = {false}; ///< true once deletion is complete
+
+ MissingLoc missing_loc; ///< information about missing objects
+
+ bool backfill_reserved = false;
+ bool backfill_reserving = false;
+
+ PeeringMachine machine;
+
+ void update_osdmap_ref(OSDMapRef newmap) {
+ osdmap_ref = std::move(newmap);
+ }
+
+ void update_heartbeat_peers();
+  void query_unfound(ceph::Formatter *f, std::string state);
+ bool proc_replica_info(
+ pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch);
+ void remove_down_peer_info(const OSDMapRef &osdmap);
+ void check_recovery_sources(const OSDMapRef& map);
+ void set_last_peering_reset();
+ void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
+ bool should_restart_peering(
+ int newupprimary,
+ int newactingprimary,
+ const std::vector<int>& newup,
+ const std::vector<int>& newacting,
+ OSDMapRef lastmap,
+ OSDMapRef osdmap);
+ void start_peering_interval(
+ const OSDMapRef lastmap,
+ const std::vector<int>& newup, int up_primary,
+ const std::vector<int>& newacting, int acting_primary,
+ ObjectStore::Transaction &t);
+ void on_new_interval();
+ void clear_recovery_state();
+ void clear_primary_state();
+ void check_past_interval_bounds() const;
+ bool set_force_recovery(bool b);
+ bool set_force_backfill(bool b);
+
+ /// clip calculated priority to reasonable range
+ int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
+ /// get log recovery reservation priority
+ unsigned get_recovery_priority();
+ /// get backfill reservation priority
+ unsigned get_backfill_priority();
+ /// get priority for pg deletion
+ unsigned get_delete_priority();
+
+ bool check_prior_readable_down_osds(const OSDMapRef& map);
+
+ bool adjust_need_up_thru(const OSDMapRef osdmap);
+ PastIntervals::PriorSet build_prior();
+
+ void reject_reservation();
+
+  // acting set
+ std::map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
+ const std::map<pg_shard_t, pg_info_t> &infos,
+ bool restrict_to_up_acting,
+ bool *history_les_bound) const;
+
+ static void calc_ec_acting(
+ std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ std::ostream &ss);
+
+  static std::pair<std::map<pg_shard_t, pg_info_t>::const_iterator, eversion_t>
+    select_replicated_primary(
+    std::map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+    uint64_t force_auth_primary_missing_objects,
+    const std::vector<int> &up,
+    pg_shard_t up_primary,
+    const std::map<pg_shard_t, pg_info_t> &all_info,
+    const OSDMapRef osdmap,
+    std::ostream &ss);
+
+ static void calc_replicated_acting(
+    std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+ static void calc_replicated_acting_stretch(
+    std::map<pg_shard_t, pg_info_t>::const_iterator primary_shard,
+ eversion_t oldest_auth_log_entry,
+ unsigned size,
+ const std::vector<int> &acting,
+ const std::vector<int> &up,
+ pg_shard_t up_primary,
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ bool restrict_to_up_acting,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *backfill,
+ std::set<pg_shard_t> *acting_backfill,
+ const OSDMapRef osdmap,
+ const PGPool& pool,
+ std::ostream &ss);
+
+ void choose_async_recovery_ec(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+ void choose_async_recovery_replicated(
+ const std::map<pg_shard_t, pg_info_t> &all_info,
+ const pg_info_t &auth_info,
+ std::vector<int> *want,
+ std::set<pg_shard_t> *async_recovery,
+ const OSDMapRef osdmap) const;
+
+ bool recoverable(const std::vector<int> &want) const;
+ bool choose_acting(pg_shard_t &auth_log_shard,
+ bool restrict_to_up_acting,
+ bool *history_les_bound,
+ bool request_pg_temp_change_only = false);
+
+ bool search_for_missing(
+ const pg_info_t &oinfo, const pg_missing_t &omissing,
+ pg_shard_t fromosd,
+ PeeringCtxWrapper &rctx);
+ void build_might_have_unfound();
+ void log_weirdness();
+ void activate(
+ ObjectStore::Transaction& t,
+ epoch_t activation_epoch,
+ PeeringCtxWrapper &ctx);
+
+ void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
+ void merge_log(
+ ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_shard_t from);
+
+ void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
+ void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo,
+ pg_log_t&& olog, pg_missing_t&& omissing,
+ pg_shard_t from);
+ void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
+ pg_missing_t&& omissing, pg_shard_t from);
+
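+  /// Recompute min_last_complete_ondisk from our own last_complete_ondisk
+  /// and peer_last_complete_ondisk; leaves it unchanged if the value for
+  /// any non-primary acting_recovery_backfill peer is not yet known.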
+ void calc_min_last_complete_ondisk() {
+ eversion_t min = last_complete_ondisk;
+ ceph_assert(!acting_recovery_backfill.empty());
+ for (std::set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
+ i != acting_recovery_backfill.end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ if (peer_last_complete_ondisk.count(*i) == 0)
+ return; // we don't have complete info
+ eversion_t a = peer_last_complete_ondisk[*i];
+ if (a < min)
+ min = a;
+ }
+ if (min == min_last_complete_ondisk)
+ return;
+ min_last_complete_ondisk = min;
+ return;
+ }
+
+ void fulfill_info(
+ pg_shard_t from, const pg_query_t &query,
+ std::pair<pg_shard_t, pg_info_t> &notify_info);
+ void fulfill_log(
+ pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
+ void fulfill_query(const MQuery& q, PeeringCtxWrapper &rctx);
+
+ void try_mark_clean();
+
+ void update_blocked_by();
+ void update_calc_stats();
+
+ void add_log_entry(const pg_log_entry_t& e, bool applied);
+
+ void calc_trim_to();
+ void calc_trim_to_aggressive();
+
+public:
+ PeeringState(
+ CephContext *cct,
+ pg_shard_t pg_whoami,
+ spg_t spgid,
+ const PGPool &pool,
+ OSDMapRef curmap,
+ DoutPrefixProvider *dpp,
+ PeeringListener *pl);
+
+ /// Process evt
+ void handle_event(const boost::statechart::event_base &evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt);
+ end_handle();
+ }
+
+ /// Process evt
+ void handle_event(PGPeeringEventRef evt,
+ PeeringCtx *rctx) {
+ start_handle(rctx);
+ machine.process_event(evt->get_event());
+ end_handle();
+ }
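+
+  /*
+   * Illustrative sketch only: the owner drives peering by wrapping each
+   * event delivery in a PeeringCtx and then dispatching whatever the
+   * context accumulated, e.g.
+   *
+   *   PeeringCtx rctx = ...;              // construction elided
+   *   ps.handle_event(ActMap(), &rctx);   // run the state machine
+   *   // then send rctx's queued messages and queue its transaction
+   */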
+
+ /// Init fresh instance of PG
+ void init(
+ int role,
+ const std::vector<int>& newup, int new_up_primary,
+ const std::vector<int>& newacting, int new_acting_primary,
+ const pg_history_t& history,
+ const PastIntervals& pi,
+ bool backfill,
+ ObjectStore::Transaction &t);
+
+ /// Init pg instance from disk state
+ template <typename F>
+ auto init_from_disk_state(
+ pg_info_t &&info_from_disk,
+ PastIntervals &&past_intervals_from_disk,
+ F &&pg_log_init) {
+ info = std::move(info_from_disk);
+ last_written_info = info;
+ past_intervals = std::move(past_intervals_from_disk);
+ auto ret = pg_log_init(pg_log);
+ log_weirdness();
+ return ret;
+ }
+
+  /// Set initial primary/acting
+ void init_primary_up_acting(
+ const std::vector<int> &newup,
+ const std::vector<int> &newacting,
+ int new_up_primary,
+ int new_acting_primary);
+ void init_hb_stamps();
+
+  /// Set initial role
+ void set_role(int r) {
+ role = r;
+ }
+
+  /// Set predicates used for determining readable and recoverable
+ void set_backend_predicates(
+ IsPGReadablePredicate *is_readable,
+ IsPGRecoverablePredicate *is_recoverable) {
+ missing_loc.set_backend_predicates(is_readable, is_recoverable);
+ }
+
+ /// Send current pg_info to peers
+ void share_pg_info();
+
+ /// Get stats for child pgs
+ void start_split_stats(
+ const std::set<spg_t>& childpgs, std::vector<object_stat_sum_t> *out);
+
+ /// Update new child with stats
+ void finish_split_stats(
+ const object_stat_sum_t& stats, ObjectStore::Transaction &t);
+
+ /// Split state for child_pgid into *child
+ void split_into(
+ pg_t child_pgid, PeeringState *child, unsigned split_bits);
+
+ /// Merge state from sources
+ void merge_from(
+ std::map<spg_t,PeeringState *>& sources,
+ PeeringCtx &rctx,
+ unsigned split_bits,
+ const pg_merge_meta_t& last_pg_merge_meta);
+
+ /// Permit stray replicas to purge now unnecessary state
+ void purge_strays();
+
+ /**
+ * update_stats
+ *
+   * Mechanism for updating stats and/or history.  Pass t to mark the
+   * state dirty and write it out.  The callback f should return true if
+   * the updated stats should be published to the osd.
+ */
+ void update_stats(
+ std::function<bool(pg_history_t &, pg_stat_t &)> f,
+ ObjectStore::Transaction *t = nullptr);
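+  // e.g. (illustrative only; e and t are hypothetical locals):
+  //   update_stats([&](pg_history_t &history, pg_stat_t &stats) {
+  //     stats.reported_epoch = e;   // some update
+  //     return true;                // ask for the stats to be published
+  //   }, &t);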
+
+ /**
+ * adjust_purged_snaps
+ *
+   * Mechanism for updating purged_snaps.  Marks dirty_info, dirty_big_info.
+ */
+ void adjust_purged_snaps(
+ std::function<void(interval_set<snapid_t> &snaps)> f);
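+  // e.g. (illustrative only): adjust_purged_snaps([&](auto &snaps) {
+  //   snaps.insert(first_snap, num_snaps);   // hypothetical values
+  // });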
+
+  /// Updates info.hit_set to hset_history; does not mark the info dirty
+ void update_hset(const pg_hit_set_history_t &hset_history);
+
+  /// Get all pg_shards that need recovery
+ std::vector<pg_shard_t> get_replica_recovery_order() const;
+
+ /**
+ * update_history
+ *
+ * Merges new_history into info.history clearing past_intervals and
+ * dirtying as needed.
+ *
+ * Calls PeeringListener::on_info_history_change()
+ */
+ void update_history(const pg_history_t& new_history);
+
+ /**
+ * prepare_stats_for_publish
+ *
+   * Returns an updated pg_stat_t, with unstable_stats added in, if stats
+   * have changed since pg_stats_publish.
+ */
+ std::optional<pg_stat_t> prepare_stats_for_publish(
+ bool pg_stats_publish_valid,
+ const pg_stat_t &pg_stats_publish,
+ const object_stat_collection_t &unstable_stats);
+
+ /**
+ * Merge entries updating missing as necessary on all
+ * acting_recovery_backfill logs and missings (also missing_loc)
+ */
+ bool append_log_entries_update_missing(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ void append_log_with_trim_to_updated(
+ std::vector<pg_log_entry_t>&& log_entries,
+ eversion_t roll_forward_to,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async) {
+ update_trim_to();
+ append_log(std::move(log_entries), pg_trim_to, roll_forward_to,
+ min_last_complete_ondisk, t, transaction_applied, async);
+ }
+
+ /**
+ * Updates local log to reflect new write from primary.
+ */
+ void append_log(
+ std::vector<pg_log_entry_t>&& logv,
+ eversion_t trim_to,
+ eversion_t roll_forward_to,
+ eversion_t min_last_complete_ondisk,
+ ObjectStore::Transaction &t,
+ bool transaction_applied,
+ bool async);
+
+ /**
+ * retrieve the min last_backfill among backfill targets
+ */
+ hobject_t earliest_backfill() const;
+
+
+ /**
+   * Updates the local log/missing to reflect a new out-of-band log update from the primary
+ */
+ void merge_new_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObjectStore::Transaction &t,
+ std::optional<eversion_t> trim_to,
+ std::optional<eversion_t> roll_forward_to);
+
+ /// Update missing set to reflect e (TODOSAM: not sure why this is needed)
+ void add_local_next_event(const pg_log_entry_t& e) {
+ pg_log.missing_add_next_entry(e);
+ }
+
+ /// Update log trim boundary
+ void update_trim_to() {
+ bool hard_limit = (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
+ if (hard_limit)
+ calc_trim_to_aggressive();
+ else
+ calc_trim_to();
+ }
+
+ /// Pre-process pending update on hoid represented by logv
+ void pre_submit_op(
+ const hobject_t &hoid,
+ const std::vector<pg_log_entry_t>& logv,
+ eversion_t at_version);
+
+ /// Signal that oid has been locally recovered to version v
+ void recover_got(
+ const hobject_t &oid, eversion_t v,
+ bool is_delete,
+ ObjectStore::Transaction &t);
+
+ /// Signal that oid has been recovered on peer to version
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &soid,
+ const eversion_t &version);
+
+ /// Notify that soid is being recovered on peer
+ void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t soid);
+
+ /// Pull missing sets from all candidate peers
+ bool discover_all_missing(
+ BufferedRecoveryMessages &rctx);
+
+  /// Notify that hoid has been fully recovered
+ void object_recovered(
+ const hobject_t &hoid,
+ const object_stat_sum_t &stat_diff) {
+ info.stats.stats.sum.add(stat_diff);
+ missing_loc.recovered(hoid);
+ }
+
+ /// Update info/stats to reflect backfill progress
+ void update_backfill_progress(
+ const hobject_t &updated_backfill,
+ const pg_stat_t &updated_stats,
+ bool preserve_local_num_bytes,
+ ObjectStore::Transaction &t);
+
+ /// Update info/stats to reflect completed backfill on hoid
+ void update_complete_backfill_object_stats(
+ const hobject_t &hoid,
+ const pg_stat_t &stats);
+
+ /// Update last_backfill for peer to new_last_backfill
+ void update_peer_last_backfill(
+ pg_shard_t peer,
+ const hobject_t &new_last_backfill);
+
+ /// Update info.stats with delta_stats for operation on soid
+ void apply_op_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats);
+
+ /**
+ * force_object_missing
+ *
+ * Force oid on peer to be missing at version. If the object does not
+ * currently need recovery, either candidates if provided or the remainder
+   * of the acting set will be deemed to have the object.
+ */
+ void force_object_missing(
+ const pg_shard_t &peer,
+ const hobject_t &oid,
+ eversion_t version) {
+ force_object_missing(std::set<pg_shard_t>{peer}, oid, version);
+ }
+ void force_object_missing(
+ const std::set<pg_shard_t> &peer,
+ const hobject_t &oid,
+ eversion_t version);
+
+ /// Update state prior to backfilling soid on targets
+ void prepare_backfill_for_missing(
+ const hobject_t &soid,
+ const eversion_t &version,
+ const std::vector<pg_shard_t> &targets);
+
+  /// Set targets with the right version for revert (see recover_primary)
+ void set_revert_with_targets(
+ const hobject_t &soid,
+ const std::set<pg_shard_t> &good_peers);
+
+ /// Update lcod for fromosd
+ void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) {
+ peer_last_complete_ondisk[fromosd] = lcod;
+ }
+
+ /// Update lcod
+ void update_last_complete_ondisk(
+ eversion_t lcod) {
+ last_complete_ondisk = lcod;
+ }
+
+ /// Update state to reflect recovery up to version
+ void recovery_committed_to(eversion_t version);
+
+ /// Mark recovery complete
+ void local_recovery_complete() {
+ info.last_complete = info.last_update;
+ }
+
+ /// Update last_requested pointer to v
+ void set_last_requested(version_t v) {
+ pg_log.set_last_requested(v);
+ }
+
+ /// Write dirty state to t
+ void write_if_dirty(ObjectStore::Transaction& t);
+
+ /// Mark write completed to v with persisted lc
+ void complete_write(eversion_t v, eversion_t lc);
+
+ /// Update local write applied pointer
+ void local_write_applied(eversion_t v) {
+ last_update_applied = v;
+ }
+
+ /// Updates peering state with new map
+ void advance_map(
+ OSDMapRef osdmap, ///< [in] new osdmap
+ OSDMapRef lastmap, ///< [in] prev osdmap
+ std::vector<int>& newup, ///< [in] new up set
+ int up_primary, ///< [in] new up primary
+ std::vector<int>& newacting, ///< [in] new acting
+ int acting_primary, ///< [in] new acting primary
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// Activates most recently updated map
+ void activate_map(
+ PeeringCtx &rctx ///< [out] recovery context
+ );
+
+ /// resets last_persisted_osdmap
+ void reset_last_persisted() {
+ last_persisted_osdmap = 0;
+ dirty_info = true;
+ dirty_big_info = true;
+ }
+
+ /// Signal shutdown beginning
+ void shutdown() {
+ deleting = true;
+ }
+
+ /// Signal shutdown complete
+ void set_delete_complete() {
+ deleted = true;
+ }
+
+ /// Dirty info and write out
+ void force_write_state(ObjectStore::Transaction &t) {
+ dirty_info = true;
+ dirty_big_info = true;
+ write_if_dirty(t);
+ }
+
+ /// Get current interval's readable_until
+ ceph::signedspan get_readable_until() const {
+ return readable_until;
+ }
+
+ /// Get prior intervals' readable_until upper bound
+ ceph::signedspan get_prior_readable_until_ub() const {
+ return prior_readable_until_ub;
+ }
+
+ /// Get the down OSDs of note for prior intervals' readable_until
+ const std::set<int>& get_prior_readable_down_osds() const {
+ return prior_readable_down_osds;
+ }
+
+ /// Reset prior intervals' readable_until upper bound (e.g., because it has passed)
+ void clear_prior_readable_until_ub() {
+ prior_readable_until_ub = ceph::signedspan::zero();
+ prior_readable_down_osds.clear();
+ info.history.prior_readable_until_ub = ceph::signedspan::zero();
+ }
+
+ void renew_lease(ceph::signedspan now) {
+ bool was_min = (readable_until_ub == readable_until);
+ readable_until_ub_sent = now + readable_interval;
+ if (was_min) {
+ recalc_readable_until();
+ }
+ }
+
+ void send_lease();
+ void schedule_renew_lease();
+
+ pg_lease_t get_lease() {
+ return pg_lease_t(readable_until, readable_until_ub_sent, readable_interval);
+ }
+
+ void proc_lease(const pg_lease_t& l);
+ void proc_lease_ack(int from, const pg_lease_ack_t& la);
+ void proc_renew_lease();
+
+ pg_lease_ack_t get_lease_ack() {
+ return pg_lease_ack_t(readable_until_ub_from_primary);
+ }
+
+ /// [primary] recalc readable_until[_ub] for the current interval
+ void recalc_readable_until();
+
+ //============================ const helpers ================================
+ const char *get_current_state() const {
+ return state_history.get_current_state();
+ }
+ epoch_t get_last_peering_reset() const {
+ return last_peering_reset;
+ }
+ eversion_t get_last_rollback_info_trimmed_to_applied() const {
+ return last_rollback_info_trimmed_to_applied;
+ }
+ /// Returns stable reference to internal pool structure
+ const PGPool &get_pool() const {
+ return pool;
+ }
+ /// Returns reference to current osdmap
+ const OSDMapRef &get_osdmap() const {
+ ceph_assert(osdmap_ref);
+ return osdmap_ref;
+ }
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const {
+ return get_osdmap()->get_epoch();
+ }
+
+ bool is_ec_pg() const override {
+ return pool.info.is_erasure();
+ }
+ int get_pg_size() const override {
+ return pool.info.size;
+ }
+ bool is_deleting() const {
+ return deleting;
+ }
+ bool is_deleted() const {
+ return deleted;
+ }
+ const std::set<pg_shard_t> &get_upset() const override {
+ return upset;
+ }
+ bool is_acting_recovery_backfill(pg_shard_t osd) const {
+ return acting_recovery_backfill.count(osd);
+ }
+ bool is_acting(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), acting, osd);
+ }
+ bool is_up(pg_shard_t osd) const {
+ return has_shard(pool.info.is_erasure(), up, osd);
+ }
+ static bool has_shard(bool ec, const std::vector<int>& v, pg_shard_t osd) {
+ if (ec) {
+ return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
+ } else {
+ return std::find(v.begin(), v.end(), osd.osd) != v.end();
+ }
+ }
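+
+ // Example for has_shard() above (hypothetical values): for a replicated PG,
+ // has_shard(false, {3,1,7}, pg_shard_t(1, shard_id_t::NO_SHARD)) is true
+ // because osd.1 appears anywhere in the vector; for an EC PG,
+ // has_shard(true, {3,1,7}, pg_shard_t(1, shard_id_t(1))) is true only
+ // because v[1] == 1, i.e. that OSD must hold that specific shard.
+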
+ const PastIntervals& get_past_intervals() const {
+ return past_intervals;
+ }
+ /// acting osd that is not the primary
+ bool is_nonprimary() const {
+ return role >= 0 && pg_whoami != primary;
+ }
+ /// primary osd
+ bool is_primary() const {
+ return pg_whoami == primary;
+ }
+ bool pg_has_reset_since(epoch_t e) const {
+ return deleted || e < get_last_peering_reset();
+ }
+
+ int get_role() const {
+ return role;
+ }
+ const std::vector<int> &get_acting() const {
+ return acting;
+ }
+ const std::set<pg_shard_t> &get_actingset() const {
+ return actingset;
+ }
+ int get_acting_primary() const {
+ return primary.osd;
+ }
+ pg_shard_t get_primary() const {
+ return primary;
+ }
+ const std::vector<int> &get_up() const {
+ return up;
+ }
+ int get_up_primary() const {
+ return up_primary.osd;
+ }
+
+ bool is_backfill_target(pg_shard_t osd) const {
+ return backfill_targets.count(osd);
+ }
+ const std::set<pg_shard_t> &get_backfill_targets() const {
+ return backfill_targets;
+ }
+ bool is_async_recovery_target(pg_shard_t peer) const {
+ return async_recovery_targets.count(peer);
+ }
+ const std::set<pg_shard_t> &get_async_recovery_targets() const {
+ return async_recovery_targets;
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill() const {
+ return acting_recovery_backfill;
+ }
+
+ const PGLog &get_pg_log() const {
+ return pg_log;
+ }
+
+ bool state_test(uint64_t m) const { return (state & m) != 0; }
+ void state_set(uint64_t m) { state |= m; }
+ void state_clear(uint64_t m) { state &= ~m; }
+
+ bool is_complete() const { return info.last_complete == info.last_update; }
+ bool should_send_notify() const { return send_notify; }
+
+ uint64_t get_state() const { return state; }
+ bool is_active() const { return state_test(PG_STATE_ACTIVE); }
+ bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
+ bool is_peering() const { return state_test(PG_STATE_PEERING); }
+ bool is_down() const { return state_test(PG_STATE_DOWN); }
+ bool is_recovery_unfound() const {
+ return state_test(PG_STATE_RECOVERY_UNFOUND);
+ }
+ bool is_backfilling() const {
+ return state_test(PG_STATE_BACKFILLING);
+ }
+ bool is_backfill_unfound() const {
+ return state_test(PG_STATE_BACKFILL_UNFOUND);
+ }
+ bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
+ bool is_clean() const { return state_test(PG_STATE_CLEAN); }
+ bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
+ bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
+ bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
+ bool is_peered() const {
+ return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
+ }
+ bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
+ bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
+ bool is_repair() const { return state_test(PG_STATE_REPAIR); }
+ bool is_empty() const { return info.last_update == eversion_t(0,0); }
+
+ bool get_need_up_thru() const {
+ return need_up_thru;
+ }
+
+ bool is_forced_recovery_or_backfill() const {
+ return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+ }
+
+ bool is_backfill_reserved() const {
+ return backfill_reserved;
+ }
+
+ bool is_backfill_reserving() const {
+ return backfill_reserving;
+ }
+
+ ceph_release_t get_last_require_osd_release() const {
+ return last_require_osd_release;
+ }
+
+ const pg_info_t &get_info() const {
+ return info;
+ }
+
+ const decltype(peer_info) &get_peer_info() const {
+ return peer_info;
+ }
+ const decltype(peer_missing) &get_peer_missing() const {
+ return peer_missing;
+ }
+ const pg_missing_const_i &get_peer_missing(const pg_shard_t &peer) const {
+ if (peer == pg_whoami) {
+ return pg_log.get_missing();
+ } else {
+ assert(peer_missing.count(peer));
+ return peer_missing.find(peer)->second;
+ }
+ }
+ const pg_info_t&get_peer_info(pg_shard_t peer) const {
+ assert(peer_info.count(peer));
+ return peer_info.find(peer)->second;
+ }
+ bool has_peer_info(pg_shard_t peer) const {
+ return peer_info.count(peer);
+ }
+
+ bool needs_recovery() const;
+ bool needs_backfill() const;
+
+ /**
+ * Returns whether a particular object can be safely read on this replica
+ */
+ bool can_serve_replica_read(const hobject_t &hoid) {
+ ceph_assert(!is_primary());
+ return !pg_log.get_log().has_write_since(
+ hoid, get_min_last_complete_ondisk());
+ }
+
+ /**
+ * Returns whether the current acting set is able to go active
+ * and serve writes. It needs to satisfy min_size and any
+ * applicable stretch cluster constraints.
+ */
+ bool acting_set_writeable() {
+ return (actingset.size() >= pool.info.min_size) &&
+ (pool.info.stretch_set_can_peer(acting, *get_osdmap(), NULL));
+ }
+
+ /**
+ * Returns whether all peers which might have unfound objects have been
+ * queried or marked lost.
+ */
+ bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
+ bool all_missing_unfound() const {
+ const auto& missing = pg_log.get_missing();
+ if (!missing.have_missing())
+ return false;
+ for (auto& m : missing.get_items()) {
+ if (!missing_loc.is_unfound(m.first))
+ return false;
+ }
+ return true;
+ }
+
+ bool perform_deletes_during_peering() const {
+ return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
+ }
+
+
+ bool have_unfound() const {
+ return missing_loc.have_unfound();
+ }
+ uint64_t get_num_unfound() const {
+ return missing_loc.num_unfound();
+ }
+
+ bool have_missing() const {
+ return pg_log.get_missing().num_missing() > 0;
+ }
+ unsigned int get_num_missing() const {
+ return pg_log.get_missing().num_missing();
+ }
+
+ const MissingLoc &get_missing_loc() const {
+ return missing_loc;
+ }
+
+ const MissingLoc::missing_by_count_t &get_missing_by_count() const {
+ return missing_loc.get_missing_by_count();
+ }
+
+ eversion_t get_min_last_complete_ondisk() const {
+ return min_last_complete_ondisk;
+ }
+
+ eversion_t get_pg_trim_to() const {
+ return pg_trim_to;
+ }
+
+ eversion_t get_last_update_applied() const {
+ return last_update_applied;
+ }
+
+ eversion_t get_last_update_ondisk() const {
+ return last_update_ondisk;
+ }
+
+ bool debug_has_dirty_state() const {
+ return dirty_info || dirty_big_info;
+ }
+
+ std::string get_pg_state_string() const {
+ return pg_state_string(state);
+ }
+
+ /// Dump representation of past_intervals to out
+ void print_past_intervals(std::ostream &out) const {
+ out << "[" << past_intervals.get_bounds()
+ << ")/" << past_intervals.size();
+ }
+
+ void dump_history(ceph::Formatter *f) const {
+ state_history.dump(f);
+ }
+
+ /// Dump formatted peering status
+ void dump_peering_state(ceph::Formatter *f);
+
+private:
+ /// Mask feature vector with feature set from new peer
+ void apply_peer_features(uint64_t f) { peer_features &= f; }
+
+ /// Reset feature vector to default
+ void reset_min_peer_features() {
+ peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ }
+public:
+ /// Get feature vector common to all known peers with this pg
+ uint64_t get_min_peer_features() const { return peer_features; }
+
+ /// Get feature vector common to acting set
+ uint64_t get_min_acting_features() const { return acting_features; }
+
+ /// Get feature vector common to up/acting set
+ uint64_t get_min_upacting_features() const { return upacting_features; }
+
+
+ // Flush control interface
+private:
+ /**
+ * Start additional flush (blocks needs_flush/activation until
+ * complete_flush is called once for each start_flush call as
+ * required by start_flush_on_transaction).
+ */
+ void start_flush(ObjectStore::Transaction &t) {
+ flushes_in_progress++;
+ pl->start_flush_on_transaction(t);
+ }
+public:
+ /// True if there are outstanding flushes
+ bool needs_flush() const {
+ return flushes_in_progress > 0;
+ }
+ /// Must be called once per start_flush
+ void complete_flush();
+
+ friend std::ostream &operator<<(std::ostream &out, const PeeringState &ps);
+};
+
+std::ostream &operator<<(std::ostream &out, const PeeringState &ps);
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
new file mode 100644
index 000000000..c1673bf70
--- /dev/null
+++ b/src/osd/PrimaryLogPG.cc
@@ -0,0 +1,15470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "boost/tuple/tuple.hpp"
+#include "boost/intrusive_ptr.hpp"
+#include "PG.h"
+#include "pg_scrubber.h"
+#include "PrimaryLogPG.h"
+#include "OSD.h"
+#include "PrimaryLogScrub.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "Session.h"
+#include "objclass/objclass.h"
+#include "osd/ClassHandler.h"
+
+#include "cls/cas/cls_cas_ops.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "common/perf_counters.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+#include "messages/MCommandReply.h"
+#include "messages/MOSDScrubReserve.h"
+#include "common/EventTrace.h"
+
+#include "common/config.h"
+#include "include/compat.h"
+#include "mon/MonClient.h"
+#include "osdc/Objecter.h"
+#include "json_spirit/json_spirit_value.h"
+#include "json_spirit/json_spirit_reader.h"
+#include "include/ceph_assert.h" // json_spirit clobbers it
+#include "include/rados/rados_types.hpp"
+
+#ifdef WITH_LTTNG
+#include "tracing/osd.h"
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+#include <sstream>
+#include <utility>
+
+#include <errno.h>
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+#include <common/CDC.h>
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
+
+using std::list;
+using std::ostream;
+using std::pair;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::string_view;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+using ceph::decode;
+using ceph::decode_noclear;
+using ceph::encode;
+using ceph::encode_destructively;
+
+using namespace ceph::osd::scheduler;
+using TOPNSPC::common::cmd_getval;
+
+template <typename T>
+static ostream& _prefix(std::ostream *_dout, T *pg) {
+ return pg->gen_prefix(*_dout);
+}
+
+/**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation.
+ */
+class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
+protected:
+ CopyCallback() {}
+ /**
+ * results.get<0>() is the return code: 0 for success; -ECANCELED if
+ * the operation was cancelled by the local OSD; -errno for other issues.
+ * results.get<1>() is a pointer to a CopyResults object, which you are
+ * responsible for deleting.
+ */
+ void finish(CopyCallbackResults results_) override = 0;
+
+public:
+ /// Provide the final size of the copied object to the CopyCallback
+ ~CopyCallback() override {}
+};
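+
+// A minimal, hypothetical CopyCallback implementation, shown only to
+// illustrate the contract documented above (the real implementation used by
+// copy-from is CopyFromCallback, defined later in this file):
+//
+//   struct MyCopyCallback : public PrimaryLogPG::CopyCallback {
+//     void finish(PrimaryLogPG::CopyCallbackResults results) override {
+//       int r = results.get<0>();                  // 0, -ECANCELED, or -errno
+//       PrimaryLogPG::CopyResults *res = results.get<1>();
+//       if (r == 0) { /* consume res */ }
+//       delete res;                                // caller owns the results
+//     }
+//   };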
+
+template <typename T>
+class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
+ PrimaryLogPGRef pg;
+ unique_ptr<GenContext<T>> c;
+ epoch_t e;
+public:
+ BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) override {
+ std::scoped_lock locker{*pg};
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(t);
+ }
+ bool sync_finish(T t) {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(t);
+ return true;
+ }
+};
+
+GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new BlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap_epoch());
+}
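+
+// Usage sketch (hypothetical caller): "blessing" a context ties it to the
+// current osdmap epoch so it is silently dropped if the PG resets before it
+// runs, and completes it under the PG lock.  Roughly:
+//
+//   GenContext<ThreadPool::TPHandle&> *c = ...;  // some recovery continuation
+//   osd->queue_recovery_context(pg, pg->bless_gencontext(c));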
+
+template <typename T>
+class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
+ PrimaryLogPGRef pg;
+ unique_ptr<GenContext<T>> c;
+ epoch_t e;
+public:
+ UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(T t) override {
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(t);
+ }
+ bool sync_finish(T t) {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(t);
+ return true;
+ }
+};
+
+GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) {
+ return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
+ this, c, get_osdmap_epoch());
+}
+
+class PrimaryLogPG::BlessedContext : public Context {
+ PrimaryLogPGRef pg;
+ unique_ptr<Context> c;
+ epoch_t e;
+public:
+ BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
+ : pg(pg), c(c), e(e) {}
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ if (pg->pg_has_reset_since(e))
+ c.reset();
+ else
+ c.release()->complete(r);
+ }
+ bool sync_finish(int r) override {
+ // we assume here all blessed/wrapped Contexts can complete synchronously.
+ c.release()->complete(r);
+ return true;
+ }
+};
+
+Context *PrimaryLogPG::bless_context(Context *c) {
+ return new BlessedContext(this, c, get_osdmap_epoch());
+}
+
+class PrimaryLogPG::C_PG_ObjectContext : public Context {
+ PrimaryLogPGRef pg;
+ ObjectContext *obc;
+ public:
+ C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
+ pg(p), obc(o) {}
+ void finish(int r) override {
+ pg->object_context_destructor_callback(obc);
+ }
+};
+
+struct OnReadComplete : public Context {
+ PrimaryLogPG *pg;
+ PrimaryLogPG::OpContext *opcontext;
+ OnReadComplete(
+ PrimaryLogPG *pg,
+ PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
+ void finish(int r) override {
+ opcontext->finish_read(pg);
+ }
+ ~OnReadComplete() override {}
+};
+
+class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
+ PrimaryLogPGRef pg;
+ ObjectContextRef obc;
+ public:
+ C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
+ pg(p), obc(o) {}
+ bool sync_finish(int r) override {
+ pg->_applied_recovered_object(obc);
+ return true;
+ }
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ pg->_applied_recovered_object(obc);
+ }
+};
+
+class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
+ PrimaryLogPGRef pg;
+ epoch_t epoch;
+ eversion_t last_complete;
+ public:
+ C_OSD_CommittedPushedObject(
+ PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
+ pg(p), epoch(epoch), last_complete(lc) {
+ }
+ void finish(int r) override {
+ pg->_committed_pushed_object(epoch, last_complete);
+ }
+};
+
+class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
+ PrimaryLogPGRef pg;
+ public:
+ explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
+ pg(p) {}
+ bool sync_finish(int r) override {
+ pg->_applied_recovered_object_replica();
+ return true;
+ }
+ void finish(int r) override {
+ std::scoped_lock locker{*pg};
+ pg->_applied_recovered_object_replica();
+ }
+};
+
+// OpContext
+void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
+{
+ inflightreads = 1;
+ list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ pair<bufferlist*, Context*> > > in;
+ in.swap(pending_async_reads);
+ pg->pgbackend->objects_read_async(
+ obc->obs.oi.soid,
+ in,
+ new OnReadComplete(pg, this), pg->get_pool().fast_read);
+}
+void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
+{
+ ceph_assert(inflightreads > 0);
+ --inflightreads;
+ if (async_reads_complete()) {
+ ceph_assert(pg->in_progress_async_reads.size());
+ ceph_assert(pg->in_progress_async_reads.front().second == this);
+ pg->in_progress_async_reads.pop_front();
+
+ // Restart the op context now that all reads have been
+ // completed. Read failures will be handled by the op finisher
+ pg->execute_ctx(this);
+ }
+}
+
+class CopyFromCallback : public PrimaryLogPG::CopyCallback {
+public:
+ PrimaryLogPG::CopyResults *results = nullptr;
+ PrimaryLogPG::OpContext *ctx;
+ OSDOp &osd_op;
+ uint32_t truncate_seq;
+ uint64_t truncate_size;
+ bool have_truncate = false;
+
+ CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
+ : ctx(ctx), osd_op(osd_op) {
+ }
+ ~CopyFromCallback() override {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results_) override {
+ results = results_.get<1>();
+ int r = results_.get<0>();
+
+ // Only use truncate_{seq,size} from the original object if the client
+ // did not send us these parameters
+ if (!have_truncate) {
+ truncate_seq = results->truncate_seq;
+ truncate_size = results->truncate_size;
+ }
+
+ // for finish_copyfrom
+ ctx->user_at_version = results->user_version;
+
+ if (r >= 0) {
+ ctx->pg->execute_ctx(ctx);
+ } else {
+ if (r != -ECANCELED) { // on cancel just toss it out; client resends
+ if (ctx->op)
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ } else if (results->should_requeue) {
+ if (ctx->op)
+ ctx->pg->requeue_op(ctx->op);
+ }
+ ctx->pg->close_op_ctx(ctx);
+ }
+ }
+
+ bool is_temp_obj_used() {
+ return results->started_temp_obj;
+ }
+ uint64_t get_data_size() {
+ return results->object_size;
+ }
+ void set_truncate(uint32_t seq, uint64_t size) {
+ truncate_seq = seq;
+ truncate_size = size;
+ have_truncate = true;
+ }
+};
+
+struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
+ CopyFromCallback *copy_from_callback;
+
+ explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
+ : copy_from_callback(copy_from_callback) {
+ }
+
+ int execute() override {
+ // instance will be destructed after this method completes
+ copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
+ return 0;
+ }
+};
+
+// ======================
+// PGBackend::Listener
+
+void PrimaryLogPG::on_local_recover(
+ const hobject_t &hoid,
+ const ObjectRecoveryInfo &_recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+
+ ObjectRecoveryInfo recovery_info(_recovery_info);
+ clear_object_snap_mapping(t, hoid);
+ if (!is_delete && recovery_info.soid.is_snap()) {
+ OSDriver::OSTransaction _t(osdriver.get_transaction(t));
+ set<snapid_t> snaps;
+ dout(20) << " snapset " << recovery_info.ss << dendl;
+ auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
+ if (p != recovery_info.ss.clone_snaps.end()) {
+ snaps.insert(p->second.begin(), p->second.end());
+ dout(20) << " snaps " << snaps << dendl;
+ snap_mapper.add_oid(
+ recovery_info.soid,
+ snaps,
+ &_t);
+ } else {
+ derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
+ }
+ }
+ if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
+ recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
+ ceph_assert(is_primary());
+ const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
+ if (latest->op == pg_log_entry_t::LOST_REVERT &&
+ latest->reverting_to == recovery_info.version) {
+ dout(10) << " got old revert version " << recovery_info.version
+ << " for " << *latest << dendl;
+ recovery_info.version = latest->version;
+ // update the attr to the revert event version
+ recovery_info.oi.prior_version = recovery_info.oi.version;
+ recovery_info.oi.version = latest->version;
+ bufferlist bl;
+ encode(recovery_info.oi, bl,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ ceph_assert(!pool.info.is_erasure());
+ t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
+ if (obc)
+ obc->attr_cache[OI_ATTR] = bl;
+ }
+ }
+
+ // keep track of active pushes for scrub
+ ++active_pushes;
+
+ recovery_state.recover_got(
+ recovery_info.soid,
+ recovery_info.version,
+ is_delete,
+ *t);
+
+ if (is_primary()) {
+ if (!is_delete) {
+ obc->obs.exists = true;
+
+ bool got = obc->get_recovery_read();
+ ceph_assert(got);
+
+ ceph_assert(recovering.count(obc->obs.oi.soid));
+ recovering[obc->obs.oi.soid] = obc;
+ obc->obs.oi = recovery_info.oi; // may have been updated above
+ }
+
+ t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+
+ publish_stats_to_osd();
+ release_backoffs(hoid);
+ if (!is_unreadable_object(hoid)) {
+ auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
+ if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
+ dout(20) << " kicking unreadable waiters on " << hoid << dendl;
+ requeue_ops(unreadable_object_entry->second);
+ waiting_for_unreadable_object.erase(unreadable_object_entry);
+ }
+ }
+ } else {
+ t->register_on_applied(
+ new C_OSD_AppliedRecoveredObjectReplica(this));
+
+ }
+
+ t->register_on_commit(
+ new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap_epoch(),
+ info.last_complete));
+}
+
+void PrimaryLogPG::on_global_recover(
+ const hobject_t &soid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete)
+{
+ recovery_state.object_recovered(soid, stat_diff);
+ publish_stats_to_osd();
+ dout(10) << "pushed " << soid << " to all replicas" << dendl;
+ auto i = recovering.find(soid);
+ ceph_assert(i != recovering.end());
+
+ if (i->second && i->second->rwstate.recovery_read_marker) {
+ // recover missing won't have had an obc, but it gets filled in
+ // during on_local_recover
+ ceph_assert(i->second);
+ list<OpRequestRef> requeue_list;
+ i->second->drop_recovery_read(&requeue_list);
+ requeue_ops(requeue_list);
+ }
+
+ backfills_in_flight.erase(soid);
+
+ recovering.erase(i);
+ finish_recovery_op(soid);
+ release_backoffs(soid);
+ auto degraded_object_entry = waiting_for_degraded_object.find(soid);
+ if (degraded_object_entry != waiting_for_degraded_object.end()) {
+ dout(20) << " kicking degraded waiters on " << soid << dendl;
+ requeue_ops(degraded_object_entry->second);
+ waiting_for_degraded_object.erase(degraded_object_entry);
+ }
+ auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
+ if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
+ dout(20) << " kicking unreadable waiters on " << soid << dendl;
+ requeue_ops(unreadable_object_entry->second);
+ waiting_for_unreadable_object.erase(unreadable_object_entry);
+ }
+ finish_degraded_object(soid);
+}
+
+void PrimaryLogPG::schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c)
+{
+ osd->queue_recovery_context(this, c);
+}
+
+void PrimaryLogPG::replica_clear_repop_obc(
+ const vector<pg_log_entry_t> &logv,
+ ObjectStore::Transaction &t)
+{
+ for (auto &&e: logv) {
+ /* Have to blast all clones, they share a snapset */
+ object_contexts.clear_range(
+ e.soid.get_object_boundary(), e.soid.get_head());
+ ceph_assert(
+ snapset_contexts.find(e.soid.get_head()) ==
+ snapset_contexts.end());
+ }
+}
+
+bool PrimaryLogPG::should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) {
+ if (peer == get_primary())
+ return true;
+ ceph_assert(recovery_state.has_peer_info(peer));
+ bool should_send =
+ hoid.pool != (int64_t)info.pgid.pool() ||
+ hoid <= last_backfill_started ||
+ hoid <= recovery_state.get_peer_info(peer).last_backfill;
+ if (!should_send) {
+ ceph_assert(is_backfill_target(peer));
+ dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
+ << ", object " << hoid
+ << " beyond std::max(last_backfill_started "
+ << ", peer_info[peer].last_backfill "
+ << recovery_state.get_peer_info(peer).last_backfill
+ << ")" << dendl;
+ return should_send;
+ }
+ if (is_async_recovery_target(peer) &&
+ recovery_state.get_peer_missing(peer).is_missing(hoid)) {
+ should_send = false;
+ dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
+ << ", object " << hoid
+ << " which is pending recovery in async_recovery_targets" << dendl;
+ }
+ return should_send;
+}
+
+
+ConnectionRef PrimaryLogPG::get_con_osd_cluster(
+ int peer, epoch_t from_epoch)
+{
+ return osd->get_con_osd_cluster(peer, from_epoch);
+}
+
+PerfCounters *PrimaryLogPG::get_logger()
+{
+ return osd->logger;
+}
+
+
+// ====================
+// missing objects
+
+bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
+{
+ return recovery_state.get_pg_log().get_missing().get_items().count(soid);
+}
+
+void PrimaryLogPG::maybe_kick_recovery(
+ const hobject_t &soid)
+{
+ eversion_t v;
+ bool work_started = false;
+ if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
+ return;
+
+ map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
+ if (p != recovering.end()) {
+ dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
+ } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
+ } else {
+ dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ if (is_missing_object(soid)) {
+ recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
+ } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ prep_object_replica_deletes(soid, v, h, &work_started);
+ } else {
+ prep_object_replica_pushes(soid, v, h, &work_started);
+ }
+ pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
+ }
+}
+
+void PrimaryLogPG::wait_for_unreadable_object(
+ const hobject_t& soid, OpRequestRef op)
+{
+ ceph_assert(is_unreadable_object(soid));
+ maybe_kick_recovery(soid);
+ waiting_for_unreadable_object[soid].push_back(op);
+ op->mark_delayed("waiting for missing object");
+}
+
+bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
+{
+ /* The conditions below may clear (on_local_recover, before we queue
+ * the transaction) before we actually requeue the degraded waiters
+ * in on_global_recover after the transaction completes.
+ */
+ if (waiting_for_degraded_object.count(soid))
+ return true;
+ if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
+ return true;
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
+ auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
+ // If an object is missing on an async_recovery_target, return false.
+ // This will not block the op; the object will be recovered asynchronously later.
+ if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ if (is_async_recovery_target(peer))
+ continue;
+ else
+ return true;
+ }
+ // Object is degraded if after last_backfill AND
+ // we are backfilling it
+ if (is_backfill_target(peer) &&
+ recovery_state.get_peer_info(peer).last_backfill <= soid &&
+ last_backfill_started >= soid &&
+ backfills_in_flight.count(soid))
+ return true;
+ }
+ return false;
+}
+
+bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
+{
+ for (auto &i: get_async_recovery_targets()) {
+ auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
+ if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
+ peer_missing_entry->second.get_items().count(soid)) {
+ dout(30) << __func__ << " " << soid << dendl;
+ return true;
+ }
+ }
+ return false;
+}
+
+void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
+{
+ ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));
+
+ maybe_kick_recovery(soid);
+ waiting_for_degraded_object[soid].push_back(op);
+ op->mark_delayed("waiting for degraded object");
+}
+
+void PrimaryLogPG::block_write_on_full_cache(
+ const hobject_t& _oid, OpRequestRef op)
+{
+ const hobject_t oid = _oid.get_head();
+ dout(20) << __func__ << ": blocking object " << oid
+ << " on full cache" << dendl;
+ objects_blocked_on_cache_full.insert(oid);
+ waiting_for_cache_not_full.push_back(op);
+ op->mark_delayed("waiting for cache not full");
+}
+
+void PrimaryLogPG::block_for_clean(
+ const hobject_t& oid, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << oid
+ << " on primary repair" << dendl;
+ waiting_for_clean_to_primary_repair.push_back(op);
+ op->mark_delayed("waiting for clean to repair");
+}
+
+void PrimaryLogPG::block_write_on_snap_rollback(
+ const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << oid.get_head()
+ << " on snap promotion " << obc->obs.oi.soid << dendl;
+ // otherwise, we'd have blocked in do_op
+ ceph_assert(oid.is_head());
+ ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
+ objects_blocked_on_snap_promotion[oid] = obc;
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+}
+
+void PrimaryLogPG::block_write_on_degraded_snap(
+ const hobject_t& snap, OpRequestRef op)
+{
+ dout(20) << __func__ << ": blocking object " << snap.get_head()
+ << " on degraded snap " << snap << dendl;
+ // otherwise, we'd have blocked in do_op
+ ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
+ objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
+ wait_for_degraded_object(snap, op);
+}
+
+bool PrimaryLogPG::maybe_await_blocked_head(
+ const hobject_t &hoid,
+ OpRequestRef op)
+{
+ ObjectContextRef obc;
+ obc = object_contexts.lookup(hoid.get_head());
+ if (obc) {
+ if (obc->is_blocked()) {
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+ return true;
+ } else {
+ return false;
+ }
+ }
+ return false;
+}
+
+void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
+{
+ dout(10) << __func__ << " " << soid << " " << op << dendl;
+ waiting_for_blocked_object[soid].push_back(op);
+ op->mark_delayed("waiting for blocked object");
+}
+
+void PrimaryLogPG::maybe_force_recovery()
+{
+ // no force if not in degraded/recovery/backfill states
+ if (!is_degraded() &&
+ !state_test(PG_STATE_RECOVERING |
+ PG_STATE_RECOVERY_WAIT |
+ PG_STATE_BACKFILLING |
+ PG_STATE_BACKFILL_WAIT |
+ PG_STATE_BACKFILL_TOOFULL))
+ return;
+
+ if (recovery_state.get_pg_log().get_log().approx_size() <
+ cct->_conf->osd_max_pg_log_entries *
+ cct->_conf->osd_force_recovery_pg_log_entries_factor)
+ return;
+
+ // find the oldest missing object
+ version_t min_version = recovery_state.get_pg_log().get_log().head.version;
+ hobject_t soid;
+ if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
+ min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
+ soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
+ }
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
+ it != get_acting_recovery_backfill().end();
+ ++it) {
+ if (*it == get_primary()) continue;
+ pg_shard_t peer = *it;
+ auto it_missing = recovery_state.get_peer_missing().find(peer);
+ if (it_missing != recovery_state.get_peer_missing().end() &&
+ !it_missing->second.get_rmissing().empty()) {
+ const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
+ dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
+ << " oid " << min_obj->second << dendl;
+ if (min_version > min_obj->first) {
+ min_version = min_obj->first;
+ soid = min_obj->second;
+ }
+ }
+ }
+
+ // recover it
+ if (soid != hobject_t())
+ maybe_kick_recovery(soid);
+}
+
+bool PrimaryLogPG::check_laggy(OpRequestRef& op)
+{
+ if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
+ SERVER_OCTOPUS)) {
+ dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
+ return true;
+ }
+ if (state_test(PG_STATE_WAIT)) {
+ dout(10) << __func__ << " PG is WAIT state" << dendl;
+ } else if (!state_test(PG_STATE_LAGGY)) {
+ auto mnow = osd->get_mnow();
+ auto ru = recovery_state.get_readable_until();
+ if (mnow <= ru) {
+ // not laggy
+ return true;
+ }
+ dout(10) << __func__
+ << " mnow " << mnow
+ << " > readable_until " << ru << dendl;
+
+ if (!is_primary()) {
+ osd->reply_op_error(op, -EAGAIN);
+ return false;
+ }
+
+ // go to laggy state
+ state_set(PG_STATE_LAGGY);
+ publish_stats_to_osd();
+ }
+ dout(10) << __func__ << " not readable" << dendl;
+ waiting_for_readable.push_back(op);
+ op->mark_delayed("waiting for readable");
+ return false;
+}
+
+bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
+{
+ if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
+ SERVER_OCTOPUS)) {
+ return true;
+ }
+ if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
+ return true; // not laggy
+ }
+ dout(10) << __func__ << " not readable" << dendl;
+ waiting_for_readable.push_front(op);
+ op->mark_delayed("waiting for readable");
+ return false;
+}
+
+void PrimaryLogPG::recheck_readable()
+{
+ if (!is_wait() && !is_laggy()) {
+ dout(20) << __func__ << " wasn't wait or laggy" << dendl;
+ return;
+ }
+ auto mnow = osd->get_mnow();
+ bool pub = false;
+ if (is_wait()) {
+ auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
+ if (mnow < prior_readable_until_ub) {
+ dout(10) << __func__ << " still wait (mnow " << mnow
+ << " < prior_readable_until_ub " << prior_readable_until_ub
+ << ")" << dendl;
+ } else {
+ dout(10) << __func__ << " no longer wait (mnow " << mnow
+ << " >= prior_readable_until_ub " << prior_readable_until_ub
+ << ")" << dendl;
+ state_clear(PG_STATE_WAIT);
+ recovery_state.clear_prior_readable_until_ub();
+ pub = true;
+ }
+ }
+ if (is_laggy()) {
+ auto ru = recovery_state.get_readable_until();
+ if (ru == ceph::signedspan::zero()) {
+ dout(10) << __func__ << " still laggy (mnow " << mnow
+ << ", readable_until zero)" << dendl;
+ } else if (mnow >= ru) {
+ dout(10) << __func__ << " still laggy (mnow " << mnow
+ << " >= readable_until " << ru << ")" << dendl;
+ } else {
+ dout(10) << __func__ << " no longer laggy (mnow " << mnow
+ << " < readable_until " << ru << ")" << dendl;
+ state_clear(PG_STATE_LAGGY);
+ pub = true;
+ }
+ }
+ if (pub) {
+ publish_stats_to_osd();
+ }
+ if (!is_laggy() && !is_wait()) {
+ requeue_ops(waiting_for_readable);
+ }
+}
+
+bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
+{
+ bufferlist bl;
+
+ // If filter has expressed an interest in an xattr, load it.
+ if (!filter.get_xattr().empty()) {
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter.get_xattr(),
+ &bl);
+ dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
+ if (ret < 0) {
+ if (ret != -ENODATA || filter.reject_empty_xattr()) {
+ return false;
+ }
+ }
+ }
+
+ return filter.filter(sobj, bl);
+}
+
+std::pair<int, std::unique_ptr<const PGLSFilter>>
+PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
+{
+ string type;
+ // storing non-const PGLSFilter for the sake of ::init()
+ std::unique_ptr<PGLSFilter> filter;
+
+ try {
+ decode(type, iter);
+ }
+ catch (ceph::buffer::error& e) {
+ return { -EINVAL, nullptr };
+ }
+
+ if (type.compare("plain") == 0) {
+ filter = std::make_unique<PGLSPlainFilter>();
+ } else {
+ std::size_t dot = type.find(".");
+ if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
+ return { -EINVAL, nullptr };
+ }
+
+ const std::string class_name = type.substr(0, dot);
+ const std::string filter_name = type.substr(dot + 1);
+ ClassHandler::ClassData *cls = NULL;
+ int r = ClassHandler::get_instance().open_class(class_name, &cls);
+ if (r != 0) {
+ derr << "Error opening class '" << class_name << "': "
+ << cpp_strerror(r) << dendl;
+ if (r != -EPERM) // propagate permission error
+ r = -EINVAL;
+ return { r, nullptr };
+ } else {
+ ceph_assert(cls);
+ }
+
+ ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
+ if (class_filter == NULL) {
+ derr << "Error finding filter '" << filter_name << "' in class "
+ << class_name << dendl;
+ return { -EINVAL, nullptr };
+ }
+ filter.reset(class_filter->fn());
+ if (!filter) {
+ // Object classes are obliged to return us something, but let's
+ // give an error rather than asserting out.
+ derr << "Buggy class " << class_name << " failed to construct "
+ "filter " << filter_name << dendl;
+ return { -EINVAL, nullptr };
+ }
+ }
+
+ ceph_assert(filter);
+ int r = filter->init(iter);
+ if (r < 0) {
+ derr << "Error initializing filter " << type << ": "
+ << cpp_strerror(r) << dendl;
+ return { -EINVAL, nullptr };
+ } else {
+ // Successfully constructed and initialized, return it.
+ return std::make_pair(0, std::move(filter));
+ }
+}
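+
+// Informal note on the filter naming accepted above (parameter encodings are
+// filter specific and omitted here as an assumption): the client encodes a
+// type string that is either the built-in "plain" filter or
+// "<class>.<filter>" for a filter exported by an object class, followed by
+// whatever that filter's init() expects to decode, e.g.
+//
+//   bufferlist bl;
+//   encode(std::string("plain"), bl);                 // PGLSPlainFilter
+//   // or: encode(std::string("mycls.myfilter"), bl); // hypothetical class filter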
+
+
+// ==========================================================
+
+void PrimaryLogPG::do_command(
+ const string_view& orig_prefix,
+ const cmdmap_t& cmdmap,
+ const bufferlist& idata,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ string format;
+ cmd_getval(cmdmap, "format", format);
+ std::unique_ptr<Formatter> f(Formatter::create(
+ format, "json-pretty", "json-pretty"));
+ int ret = 0;
+ stringstream ss; // stderr error message stream
+ bufferlist outbl; // if empty at end, we'll dump formatter as output
+
+ // get final prefix:
+ // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
+ // - ceph tell <pgid> foo -> prefix=foo
+ string prefix(orig_prefix);
+ string command;
+ cmd_getval(cmdmap, "cmd", command);
+ if (command.size()) {
+ prefix = command;
+ }
+
+ if (prefix == "query") {
+ f->open_object_section("pg");
+ f->dump_stream("snap_trimq") << snap_trimq;
+ f->dump_unsigned("snap_trimq_len", snap_trimq.size());
+ recovery_state.dump_peering_state(f.get());
+
+ f->open_array_section("recovery_state");
+ handle_query_state(f.get());
+ f->close_section();
+
+ if (is_primary() && is_active() && m_scrubber) {
+ m_scrubber->dump(f.get());
+ }
+
+ f->open_object_section("agent_state");
+ if (agent_state)
+ agent_state->dump(f.get());
+ f->close_section();
+
+ f->close_section();
+ }
+
+ else if (prefix == "mark_unfound_lost") {
+ string mulcmd;
+ cmd_getval(cmdmap, "mulcmd", mulcmd);
+ int mode = -1;
+ if (mulcmd == "revert") {
+ if (pool.info.is_erasure()) {
+ ss << "mode must be 'delete' for ec pool";
+ ret = -EINVAL;
+ goto out;
+ }
+ mode = pg_log_entry_t::LOST_REVERT;
+ } else if (mulcmd == "delete") {
+ mode = pg_log_entry_t::LOST_DELETE;
+ } else {
+ ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
+ ret = -EINVAL;
+ goto out;
+ }
+ ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
+ mode == pg_log_entry_t::LOST_DELETE);
+
+ if (!is_primary()) {
+ ss << "not primary";
+ ret = -EROFS;
+ goto out;
+ }
+
+ uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
+ if (!unfound) {
+ ss << "pg has no unfound objects";
+ goto out; // make command idempotent
+ }
+
+ if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
+ ss << "pg has " << unfound
+ << " unfound objects but we haven't probed all sources, not marking lost";
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mark_all_unfound_lost(mode, on_finish);
+ return;
+ }
+
+ else if (prefix == "list_unfound") {
+ hobject_t offset;
+ string offset_json;
+ bool show_offset = false;
+ if (cmd_getval(cmdmap, "offset", offset_json)) {
+ json_spirit::Value v;
+ try {
+ if (!json_spirit::read(offset_json, v))
+ throw std::runtime_error("bad json");
+ offset.decode(v);
+ } catch (std::runtime_error& e) {
+ ss << "error parsing offset: " << e.what();
+ ret = -EINVAL;
+ goto out;
+ }
+ show_offset = true;
+ }
+ f->open_object_section("missing");
+ if (show_offset) {
+ f->open_object_section("offset");
+ offset.dump(f.get());
+ f->close_section();
+ }
+ auto &needs_recovery_map = recovery_state.get_missing_loc()
+ .get_needs_recovery();
+ f->dump_int("num_missing", needs_recovery_map.size());
+ f->dump_int("num_unfound", get_num_unfound());
+ map<hobject_t, pg_missing_item>::const_iterator p =
+ needs_recovery_map.upper_bound(offset);
+ {
+ f->open_array_section("objects");
+ int32_t num = 0;
+ for (; p != needs_recovery_map.end() &&
+ num < cct->_conf->osd_command_max_records;
+ ++p) {
+ if (recovery_state.get_missing_loc().is_unfound(p->first)) {
+ f->open_object_section("object");
+ {
+ f->open_object_section("oid");
+ p->first.dump(f.get());
+ f->close_section();
+ }
+ p->second.dump(f.get()); // have, need keys
+ {
+ f->open_array_section("locations");
+ for (auto &&r : recovery_state.get_missing_loc().get_locations(
+ p->first)) {
+ f->dump_stream("shard") << r;
+ }
+ f->close_section();
+ }
+ f->close_section();
+ num++;
+ }
+ }
+ f->close_section();
+ }
+ // Get possible locations of missing objects from pg information
+ PeeringState::QueryUnfound q(f.get());
+ recovery_state.handle_event(q, 0);
+ f->dump_bool("more", p != needs_recovery_map.end());
+ f->close_section();
+ }
+
+ else if (prefix == "scrub" ||
+ prefix == "deep_scrub") {
+ bool deep = (prefix == "deep_scrub");
+ int64_t time;
+ cmd_getval(cmdmap, "time", time, (int64_t)0);
+
+ if (is_primary()) {
+ const pg_pool_t *p = &pool.info;
+ double pool_scrub_max_interval = 0;
+ double scrub_max_interval;
+ if (deep) {
+ p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
+ scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
+ } else {
+ p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
+ scrub_max_interval = pool_scrub_max_interval > 0 ?
+ pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
+ }
+ // Instead of marking must_scrub, force a scheduled scrub
+ utime_t stamp = ceph_clock_now();
+ if (time == 0)
+ stamp -= scrub_max_interval;
+ else
+ stamp -= (float)time;
+ stamp -= 100.0; // push back last scrub more for good measure
+ if (deep) {
+ set_last_deep_scrub_stamp(stamp);
+ } else {
+ set_last_scrub_stamp(stamp);
+ }
+ f->open_object_section("result");
+ f->dump_bool("deep", deep);
+ f->dump_stream("stamp") << stamp;
+ f->close_section();
+ } else {
+ ss << "Not primary";
+ ret = -EPERM;
+ }
+ outbl.append(ss.str());
+ }
+
+ else {
+ ret = -ENOSYS;
+ ss << "prefix '" << prefix << "' not implemented";
+ }
+
+ out:
+ if (ret >= 0 && outbl.length() == 0) {
+ f->flush(outbl);
+ }
+ on_finish(ret, ss.str(), outbl);
+}
+
+
+// ==========================================================
+
+void PrimaryLogPG::do_pg_op(OpRequestRef op)
+{
+ const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ dout(10) << "do_pg_op " << *m << dendl;
+
+ op->mark_started();
+
+ int result = 0;
+ string cname, mname;
+
+ snapid_t snapid = m->get_snapid();
+
+ vector<OSDOp> ops = m->ops;
+
+ for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
+ std::unique_ptr<const PGLSFilter> filter;
+ OSDOp& osd_op = *p;
+ auto bp = p->indata.cbegin();
+ switch (p->op.op) {
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ try {
+ decode(cname, bp);
+ decode(mname, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+ std::tie(result, filter) = get_pgls_filter(bp);
+ if (result < 0)
+ break;
+
+ ceph_assert(filter);
+
+ // fall through
+
+ case CEPH_OSD_OP_PGNLS:
+ if (snapid != CEPH_NOSNAP) {
+ result = -EINVAL;
+ break;
+ }
+ if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+ dout(10) << " pgnls pg=" << m->get_pg()
+ << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+ << " != " << info.pgid << dendl;
+ result = 0; // hmm?
+ } else {
+ unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
+ p->op.pgls.count);
+
+ dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
+ << dendl;
+ // read into a buffer
+ vector<hobject_t> sentries;
+ pg_nls_response_t response;
+ try {
+ decode(response.handle, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t next;
+ hobject_t lower_bound = response.handle;
+ hobject_t pg_start = info.pgid.pgid.get_hobj_start();
+ hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ dout(10) << " pgnls lower_bound " << lower_bound
+ << " pg_end " << pg_end << dendl;
+ if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
+ (lower_bound != hobject_t() && lower_bound < pg_start))) {
+ // this should only happen with a buggy client.
+ dout(10) << "outside of PG bounds " << pg_start << " .. "
+ << pg_end << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t current = lower_bound;
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ &sentries,
+ &next);
+ if (r != 0) {
+ result = -EINVAL;
+ break;
+ }
+
+ map<hobject_t, pg_missing_item>::const_iterator missing_iter =
+ recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
+ vector<hobject_t>::iterator ls_iter = sentries.begin();
+ hobject_t _max = hobject_t::get_max();
+ while (1) {
+ const hobject_t &mcand =
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
+ _max :
+ missing_iter->first;
+ const hobject_t &lcand =
+ ls_iter == sentries.end() ?
+ _max :
+ *ls_iter;
+
+ hobject_t candidate;
+ if (mcand == lcand) {
+ candidate = mcand;
+ if (!mcand.is_max()) {
+ ++ls_iter;
+ ++missing_iter;
+ }
+ } else if (mcand < lcand) {
+ candidate = mcand;
+ ceph_assert(!mcand.is_max());
+ ++missing_iter;
+ } else {
+ candidate = lcand;
+ ceph_assert(!lcand.is_max());
+ ++ls_iter;
+ }
+
+ dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
+ << " vs lower bound 0x" << lower_bound.get_hash()
+ << std::dec << dendl;
+
+ if (candidate >= next) {
+ break;
+ }
+
+ if (response.entries.size() == list_size) {
+ next = candidate;
+ break;
+ }
+
+ if (candidate.snap != CEPH_NOSNAP)
+ continue;
+
+ // skip internal namespace
+ if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
+ continue;
+
+ if (recovery_state.get_missing_loc().is_deleted(candidate))
+ continue;
+
+ // skip wrong namespace
+ if (m->get_hobj().nspace != librados::all_nspaces &&
+ candidate.get_namespace() != m->get_hobj().nspace)
+ continue;
+
+ if (filter && !pgls_filter(*filter, candidate))
+ continue;
+
+ dout(20) << "pgnls item 0x" << std::hex
+ << candidate.get_hash()
+ << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
+ << std::dec << " "
+ << candidate.oid.name << dendl;
+
+ librados::ListObjectImpl item;
+ item.nspace = candidate.get_namespace();
+ item.oid = candidate.oid.name;
+ item.locator = candidate.get_key();
+ response.entries.push_back(item);
+ }
+
+ if (next.is_max() &&
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
+ ls_iter == sentries.end()) {
+ result = 1;
+
+ // Set response.handle to the start of the next PG according
+ // to the object sort order.
+ response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ } else {
+ response.handle = next;
+ }
+ dout(10) << "pgnls handle=" << response.handle << dendl;
+ encode(response, osd_op.outdata);
+ dout(10) << " pgnls result=" << result << " outdata.length()="
+ << osd_op.outdata.length() << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_PGLS_FILTER:
+ try {
+ decode(cname, bp);
+ decode(mname, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+ std::tie(result, filter) = get_pgls_filter(bp);
+ if (result < 0)
+ break;
+
+ ceph_assert(filter);
+
+ // fall through
+
+ case CEPH_OSD_OP_PGLS:
+ if (snapid != CEPH_NOSNAP) {
+ result = -EINVAL;
+ break;
+ }
+ if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
+ dout(10) << " pgls pg=" << m->get_pg()
+ << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
+ << " != " << info.pgid << dendl;
+ result = 0; // hmm?
+ } else {
+ unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
+ p->op.pgls.count);
+
+ dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
+ // read into a buffer
+ vector<hobject_t> sentries;
+ pg_ls_response_t response;
+ try {
+ decode(response.handle, bp);
+ }
+ catch (const ceph::buffer::error& e) {
+ dout(0) << "unable to decode PGLS handle in " << *m << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ hobject_t next;
+ hobject_t current = response.handle;
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ &sentries,
+ &next);
+ if (r != 0) {
+ result = -EINVAL;
+ break;
+ }
+
+ ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
+
+ map<hobject_t, pg_missing_item>::const_iterator missing_iter =
+ recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
+ vector<hobject_t>::iterator ls_iter = sentries.begin();
+ hobject_t _max = hobject_t::get_max();
+ while (1) {
+ const hobject_t &mcand =
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
+ _max :
+ missing_iter->first;
+ const hobject_t &lcand =
+ ls_iter == sentries.end() ?
+ _max :
+ *ls_iter;
+
+ hobject_t candidate;
+ if (mcand == lcand) {
+ candidate = mcand;
+ if (!mcand.is_max()) {
+ ++ls_iter;
+ ++missing_iter;
+ }
+ } else if (mcand < lcand) {
+ candidate = mcand;
+ ceph_assert(!mcand.is_max());
+ ++missing_iter;
+ } else {
+ candidate = lcand;
+ ceph_assert(!lcand.is_max());
+ ++ls_iter;
+ }
+
+ if (candidate >= next) {
+ break;
+ }
+
+ if (response.entries.size() == list_size) {
+ next = candidate;
+ break;
+ }
+
+ if (candidate.snap != CEPH_NOSNAP)
+ continue;
+
+ // skip wrong namespace
+ if (candidate.get_namespace() != m->get_hobj().nspace)
+ continue;
+
+ if (recovery_state.get_missing_loc().is_deleted(candidate))
+ continue;
+
+ if (filter && !pgls_filter(*filter, candidate))
+ continue;
+
+ response.entries.push_back(make_pair(candidate.oid,
+ candidate.get_key()));
+ }
+ if (next.is_max() &&
+ missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
+ ls_iter == sentries.end()) {
+ result = 1;
+ }
+ response.handle = next;
+ encode(response, osd_op.outdata);
+ dout(10) << " pgls result=" << result << " outdata.length()="
+ << osd_op.outdata.length() << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_PG_HITSET_LS:
+ {
+ list< pair<utime_t,utime_t> > ls;
+ for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p)
+ ls.push_back(make_pair(p->begin, p->end));
+ if (hit_set)
+ ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
+ encode(ls, osd_op.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_PG_HITSET_GET:
+ {
+ utime_t stamp(osd_op.op.hit_set_get.stamp);
+ if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
+ // read the current in-memory HitSet, not the version we've
+ // checkpointed.
+ if (!hit_set) {
+ result= -ENOENT;
+ break;
+ }
+ encode(*hit_set, osd_op.outdata);
+ result = osd_op.outdata.length();
+ } else {
+ // read an archived HitSet.
+ hobject_t oid;
+ for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ if (stamp >= p->begin && stamp <= p->end) {
+ oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ break;
+ }
+ }
+ if (oid == hobject_t()) {
+ result = -ENOENT;
+ break;
+ }
+ if (!pool.info.is_replicated()) {
+ // FIXME: EC not supported yet
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (is_unreadable_object(oid)) {
+ wait_for_unreadable_object(oid, op);
+ return;
+ }
+ result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_SCRUBLS:
+ result = do_scrub_ls(m, &osd_op);
+ break;
+
+ default:
+ result = -EINVAL;
+ break;
+ }
+
+ if (result < 0)
+ break;
+ }
+
+ // reply
+ MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+ false);
+ reply->claim_op_out_data(ops);
+ reply->set_result(result);
+ reply->set_reply_versions(info.last_update, info.last_user_version);
+ osd->send_message_osd_client(reply, m->get_connection());
+}
+
+int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
+{
+ if (m->get_pg() != info.pgid.pgid) {
+ dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
+ return -EINVAL; // hmm?
+ }
+ auto bp = osd_op->indata.cbegin();
+ scrub_ls_arg_t arg;
+ try {
+ arg.decode(bp);
+ } catch (ceph::buffer::error&) {
+ dout(10) << " corrupted scrub_ls_arg_t" << dendl;
+ return -EINVAL;
+ }
+
+ int r = 0;
+ scrub_ls_result_t result = {.interval = info.history.same_interval_since};
+
+ if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
+ r = -EAGAIN;
+ } else {
+ bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
+ if (store_queried) {
+ encode(result, osd_op->outdata);
+ } else {
+ // the scrubber's store is not initialized
+ r = -ENOENT;
+ }
+ }
+
+ return r;
+}
+
+/**
+ * Releases locks
+ *
+ * @param lock_manager [in] manager with locks to release
+ */
+void PrimaryLogPG::release_object_locks(
+ ObcLockManager &lock_manager) {
+ std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
+ bool requeue_recovery = false;
+ bool requeue_snaptrim = false;
+ lock_manager.put_locks(
+ &to_req,
+ &requeue_recovery,
+ &requeue_snaptrim);
+ if (requeue_recovery)
+ queue_recovery();
+ if (requeue_snaptrim)
+ snap_trimmer_machine.process_event(TrimWriteUnblocked());
+
+ if (!to_req.empty()) {
+ // requeue at front of scrub blocking queue if we are blocked by scrub
+ for (auto &&p: to_req) {
+ if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
+ for (auto& op : p.second) {
+ op->mark_delayed("waiting for scrub");
+ }
+
+ waiting_for_scrub.splice(
+ waiting_for_scrub.begin(),
+ p.second,
+ p.second.begin(),
+ p.second.end());
+ } else if (is_laggy()) {
+ for (auto& op : p.second) {
+ op->mark_delayed("waiting for readable");
+ }
+ waiting_for_readable.splice(
+ waiting_for_readable.begin(),
+ p.second,
+ p.second.begin(),
+ p.second.end());
+ } else {
+ requeue_ops(p.second);
+ }
+ }
+ }
+}
+
+PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool,
+ const map<string,string>& ec_profile, spg_t p) :
+ PG(o, curmap, _pool, p),
+ pgbackend(
+ PGBackend::build_pg_backend(
+ _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
+ object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
+ new_backfill(false),
+ temp_seq(0),
+ snap_trimmer_machine(this)
+{
+ recovery_state.set_backend_predicates(
+ pgbackend->get_is_readable_predicate(),
+ pgbackend->get_is_recoverable_predicate());
+ snap_trimmer_machine.initiate();
+
+ m_scrubber = make_unique<PrimaryLogScrub>(this);
+}
+
+PrimaryLogPG::~PrimaryLogPG()
+{
+ m_scrubber.reset();
+}
+
+void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
+{
+ src_oloc = oloc;
+ if (oloc.key.empty())
+ src_oloc.key = oid.name;
+}
+
+void PrimaryLogPG::handle_backoff(OpRequestRef& op)
+{
+ auto m = op->get_req<MOSDBackoff>();
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (!session)
+ return; // drop it.
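+ // Clamp the acked [begin, end) range to this PG's own hobject_t span
+ // before recording the ack on the session.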
+ hobject_t begin = info.pgid.pgid.get_hobj_start();
+ hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
+ if (begin < m->begin) {
+ begin = m->begin;
+ }
+ if (end > m->end) {
+ end = m->end;
+ }
+ dout(10) << __func__ << " backoff ack id " << m->id
+ << " [" << begin << "," << end << ")" << dendl;
+ session->ack_backoff(cct, m->pgid, m->id, begin, end);
+}
+
+void PrimaryLogPG::do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle)
+{
+ if (op->osd_trace) {
+ op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
+ op->pg_trace.event("do request");
+ }
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto do_req_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ }
+#endif
+ // make sure we have a new enough map
+ auto p = waiting_for_map.find(op->get_source());
+ if (p != waiting_for_map.end()) {
+ // preserve ordering
+ dout(20) << __func__ << " waiting_for_map "
+ << p->first << " not empty, queueing" << dendl;
+ p->second.push_back(op);
+ op->mark_delayed("waiting_for_map not empty");
+ return;
+ }
+ if (!have_same_or_newer_map(op->min_epoch)) {
+ dout(20) << __func__ << " min " << op->min_epoch
+ << ", queue on waiting_for_map " << op->get_source() << dendl;
+ waiting_for_map[op->get_source()].push_back(op);
+ op->mark_delayed("op must wait for map");
+ osd->request_osdmap_update(op->min_epoch);
+ return;
+ }
+
+ if (can_discard_request(op)) {
+ return;
+ }
+
+ // pg-wide backoffs
+ const Message *m = op->get_req();
+ int msg_type = m->get_type();
+ if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
+ auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
+ if (!session)
+ return; // drop it.
+ if (msg_type == CEPH_MSG_OSD_OP) {
+ if (session->check_backoff(cct, info.pgid,
+ info.pgid.pgid.get_hobj_start(), m)) {
+ return;
+ }
+
+ bool backoff =
+ is_down() ||
+ is_incomplete() ||
+ (!is_active() && is_peered());
+ if (g_conf()->osd_backoff_on_peering && !backoff) {
+ if (is_peering()) {
+ backoff = true;
+ }
+ }
+ if (backoff) {
+ add_pg_backoff(session);
+ return;
+ }
+ }
+ // pg backoff acks at pg-level
+ if (msg_type == CEPH_MSG_OSD_BACKOFF) {
+ const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
+ if (ba->begin != ba->end) {
+ handle_backoff(op);
+ return;
+ }
+ }
+ }
+
+ if (!is_peered()) {
+ // Delay unless PGBackend says it's ok
+ if (pgbackend->can_handle_while_inactive(op)) {
+ bool handled = pgbackend->handle_message(op);
+ ceph_assert(handled);
+ return;
+ } else {
+ waiting_for_peered.push_back(op);
+ op->mark_delayed("waiting for peered");
+ return;
+ }
+ }
+
+ if (recovery_state.needs_flush()) {
+ dout(20) << "waiting for flush on " << op << dendl;
+ waiting_for_flush.push_back(op);
+ op->mark_delayed("waiting for flush");
+ return;
+ }
+
+ ceph_assert(is_peered() && !recovery_state.needs_flush());
+ if (pgbackend->handle_message(op))
+ return;
+
+ switch (msg_type) {
+ case CEPH_MSG_OSD_OP:
+ case CEPH_MSG_OSD_BACKOFF:
+ if (!is_active()) {
+ dout(20) << " peered, not active, waiting for active on " << op << dendl;
+ waiting_for_active.push_back(op);
+ op->mark_delayed("waiting for active");
+ return;
+ }
+ switch (msg_type) {
+ case CEPH_MSG_OSD_OP:
+ // verify client features
+ if ((pool.info.has_tiers() || pool.info.is_tier()) &&
+ !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
+ osd->reply_op_error(op, -EOPNOTSUPP);
+ return;
+ }
+ do_op(op);
+ break;
+ case CEPH_MSG_OSD_BACKOFF:
+ // object-level backoff acks handled in osdop context
+ handle_backoff(op);
+ break;
+ }
+ break;
+
+ case MSG_OSD_PG_SCAN:
+ do_scan(op, handle);
+ break;
+
+ case MSG_OSD_PG_BACKFILL:
+ do_backfill(op);
+ break;
+
+ case MSG_OSD_PG_BACKFILL_REMOVE:
+ do_backfill_remove(op);
+ break;
+
+ case MSG_OSD_SCRUB_RESERVE:
+ {
+ if (!m_scrubber) {
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ auto m = op->get_req<MOSDScrubReserve>();
+ switch (m->type) {
+ case MOSDScrubReserve::REQUEST:
+ m_scrubber->handle_scrub_reserve_request(op);
+ break;
+ case MOSDScrubReserve::GRANT:
+ m_scrubber->handle_scrub_reserve_grant(op, m->from);
+ break;
+ case MOSDScrubReserve::REJECT:
+ m_scrubber->handle_scrub_reserve_reject(op, m->from);
+ break;
+ case MOSDScrubReserve::RELEASE:
+ m_scrubber->handle_scrub_reserve_release(op);
+ break;
+ }
+ }
+ break;
+
+ case MSG_OSD_REP_SCRUB:
+ replica_scrub(op, handle);
+ break;
+
+ case MSG_OSD_REP_SCRUBMAP:
+ do_replica_scrub_map(op);
+ break;
+
+ case MSG_OSD_PG_UPDATE_LOG_MISSING:
+ do_update_log_missing(op);
+ break;
+
+ case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+ do_update_log_missing_reply(op);
+ break;
+
+ default:
+ ceph_abort_msg("bad message type in do_request");
+ }
+}
+
+/** do_op - do an op
+ * pg lock will be held (if multithreaded)
+ * osd_lock NOT held.
+ */
+void PrimaryLogPG::do_op(OpRequestRef& op)
+{
+ FUNCTRACE(cct);
+ // NOTE: take a non-const pointer here; we must be careful not to
+ // change anything that will break other reads on m (operator<<).
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ if (m->finish_decode()) {
+ op->reset_desc(); // for TrackedOp
+ m->clear_payload();
+ }
+
+ dout(20) << __func__ << ": op " << *m << dendl;
+
+ const hobject_t head = m->get_hobj().get_head();
+
+ if (!info.pgid.pgid.contains(
+ info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
+ derr << __func__ << " " << info.pgid.pgid << " does not contain "
+ << head << " pg_num " << pool.info.get_pg_num() << " hash "
+ << std::hex << head.get_hash() << std::dec << dendl;
+ osd->clog->warn() << info.pgid.pgid << " does not contain " << head
+ << " op " << *m;
+ ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
+ return;
+ }
+
+ bool can_backoff =
+ m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
+ ceph::ref_t<Session> session;
+ if (can_backoff) {
+ session = static_cast<Session*>(m->get_connection()->get_priv().get());
+ if (!session.get()) {
+ dout(10) << __func__ << " no session" << dendl;
+ return;
+ }
+
+ if (session->check_backoff(cct, info.pgid, head, m)) {
+ return;
+ }
+ }
+
+ if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
+ // not implemented.
+ dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ {
+ int r = op->maybe_init_op_info(*get_osdmap());
+ if (r) {
+ osd->reply_op_error(op, r);
+ return;
+ }
+ }
+
+ if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ op->may_read() &&
+ !(op->may_write() || op->may_cache())) {
+ // balanced reads; any replica will do
+ if (!(is_primary() || is_nonprimary())) {
+ osd->handle_misdirected_op(this, op);
+ return;
+ }
+ } else {
+ // normal case; must be primary
+ if (!is_primary()) {
+ osd->handle_misdirected_op(this, op);
+ return;
+ }
+ }
+
+ if (!check_laggy(op)) {
+ return;
+ }
+
+ if (!op_has_sufficient_caps(op)) {
+ osd->reply_op_error(op, -EPERM);
+ return;
+ }
+
+ if (op->includes_pg_op()) {
+ return do_pg_op(op);
+ }
+
+ // object name too long?
+ if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
+ dout(4) << "do_op name is longer than "
+ << cct->_conf->osd_max_object_name_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
+ dout(4) << "do_op locator is longer than "
+ << cct->_conf->osd_max_object_name_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
+ dout(4) << "do_op namespace is longer than "
+ << cct->_conf->osd_max_object_namespace_len
+ << " bytes" << dendl;
+ osd->reply_op_error(op, -ENAMETOOLONG);
+ return;
+ }
+ if (m->get_hobj().oid.name.empty()) {
+ dout(4) << "do_op empty oid name is not allowed" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ if (int r = osd->store->validate_hobject_key(head)) {
+ dout(4) << "do_op object " << head << " invalid for backing store: "
+ << r << dendl;
+ osd->reply_op_error(op, r);
+ return;
+ }
+
+ // blocklisted?
+ if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
+ dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
+ osd->reply_op_error(op, -EBLOCKLISTED);
+ return;
+ }
+
+ // order this op as a write?
+ bool write_ordered = op->rwordered();
+
+ // discard due to cluster full transition? (we discard any op that
+ // originates before the cluster or pool is marked full; the client
+ // will resend after the full flag is removed or if they expect the
+ // op to succeed despite being full). The exception is FULL_FORCE and
+ // FULL_TRY ops, which there is no reason to discard because they
+ // bypass all full checks anyway. If this op isn't a write or an
+ // RWORDERED read, we skip the check.
+ // FIXME: we exclude mds writes for now.
+ if (write_ordered && !(m->get_source().is_mds() ||
+ m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
+ m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
+ info.history.last_epoch_marked_full > m->get_map_epoch()) {
+ dout(10) << __func__ << " discarding op sent before full " << m << " "
+ << *m << dendl;
+ return;
+ }
+ // mds should have stopped writing before this point.
+ // We can't allow OSD to become non-startable even if mds
+ // could be writing as part of file removals.
+ if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
+ !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
+ return;
+ }
+ int64_t poolid = get_pgid().pool();
+ if (op->may_write()) {
+
+ const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
+ if (!pi) {
+ return;
+ }
+
+ // invalid?
+ if (m->get_snapid() != CEPH_NOSNAP) {
+ dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+
+ // too big?
+ if (cct->_conf->osd_max_write_size &&
+ m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
+ // journal can't hold commit!
+ derr << "do_op msg data len " << m->get_data_len()
+ << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
+ << " on " << *m << dendl;
+ osd->reply_op_error(op, -OSD_WRITETOOBIG);
+ return;
+ }
+ }
+
+ dout(10) << "do_op " << *m
+ << (op->may_write() ? " may_write" : "")
+ << (op->may_read() ? " may_read" : "")
+ << (op->may_cache() ? " may_cache" : "")
+ << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
+ << " flags " << ceph_osd_flag_string(m->get_flags())
+ << dendl;
+
+#ifdef HAVE_JAEGER
+ if (op->osd_parent_span) {
+ auto do_op_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+ }
+#endif
+ // missing object?
+ if (is_unreadable_object(head)) {
+ if (!is_primary()) {
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ if (can_backoff &&
+ (g_conf()->osd_backoff_on_degraded ||
+ (g_conf()->osd_backoff_on_unfound &&
+ recovery_state.get_missing_loc().is_unfound(head)))) {
+ add_backoff(session, head, head);
+ maybe_kick_recovery(head);
+ } else {
+ wait_for_unreadable_object(head, op);
+ }
+ return;
+ }
+
+ if (write_ordered) {
+ // degraded object?
+ if (is_degraded_or_backfilling_object(head)) {
+ if (can_backoff && g_conf()->osd_backoff_on_degraded) {
+ add_backoff(session, head, head);
+ maybe_kick_recovery(head);
+ } else {
+ wait_for_degraded_object(head, op);
+ }
+ return;
+ }
+
+ if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
+ dout(20) << __func__ << ": waiting for scrub" << dendl;
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ return;
+ }
+ if (!check_laggy_requeue(op)) {
+ return;
+ }
+
+ // blocked on snap?
+ if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
+ blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
+ hobject_t to_wait_on(head);
+ to_wait_on.snap = blocked_iter->second;
+ wait_for_degraded_object(to_wait_on, op);
+ return;
+ }
+ if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
+ blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
+ wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
+ return;
+ }
+ if (objects_blocked_on_cache_full.count(head)) {
+ block_write_on_full_cache(head, op);
+ return;
+ }
+ }
+
+ // dup/resent?
+ if (op->may_write() || op->may_cache()) {
+ // warning: we will get back *a* request for this reqid, but not
+ // necessarily the most recent. this happens with flush and
+ // promote ops, but we can't possibly have both in our log where
+ // the original request is still not stable on disk, so for our
+ // purposes here it doesn't matter which one we get.
+ eversion_t version;
+ version_t user_version;
+ int return_code = 0;
+ vector<pg_log_op_return_item_t> op_returns;
+ bool got = check_in_progress_op(
+ m->get_reqid(), &version, &user_version, &return_code, &op_returns);
+ if (got) {
+ dout(3) << __func__ << " dup " << m->get_reqid()
+ << " version " << version << dendl;
+ if (already_complete(version)) {
+ osd->reply_op_error(op, return_code, version, user_version, op_returns);
+ } else {
+ dout(10) << " waiting for " << version << " to commit" << dendl;
+ // always queue ondisk waiters, so that we can requeue if needed
+ waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
+ op_returns);
+ op->mark_delayed("waiting for ondisk");
+ }
+ return;
+ }
+ }
+
+ ObjectContextRef obc;
+ bool can_create = op->may_write();
+ hobject_t missing_oid;
+
+ // kludge around the fact that LIST_SNAPS ops are sent with CEPH_SNAPDIR: use the head object in that case
+ const hobject_t& oid =
+ m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
+
+ // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
+ for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
+ OSDOp& osd_op = *p;
+
+ if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
+ if (m->get_snapid() != CEPH_SNAPDIR) {
+ dout(10) << "LIST_SNAPS with incorrect context" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+ } else {
+ if (m->get_snapid() == CEPH_SNAPDIR) {
+ dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
+ osd->reply_op_error(op, -EINVAL);
+ return;
+ }
+ }
+ }
+
+ // io blocked on obc?
+ if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
+ maybe_await_blocked_head(oid, op)) {
+ return;
+ }
+
+ if (!is_primary()) {
+ if (!recovery_state.can_serve_replica_read(oid)) {
+ dout(20) << __func__
+ << ": unstable write on replica, bouncing to primary "
+ << *m << dendl;
+ osd->reply_op_error(op, -EAGAIN);
+ return;
+ }
+ dout(20) << __func__ << ": serving replica read on oid " << oid
+ << dendl;
+ }
+
+ int r = find_object_context(
+ oid, &obc, can_create,
+ m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
+ &missing_oid);
+
+ // LIST_SNAPS needs the ssc too
+ if (obc &&
+ m->get_snapid() == CEPH_SNAPDIR &&
+ !obc->ssc) {
+ obc->ssc = get_snapset_context(oid, true);
+ }
+
+ if (r == -EAGAIN) {
+ // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
+ // we have to wait for the object.
+ if (is_primary()) {
+ // missing the specific snap we need; requeue and wait.
+ ceph_assert(!op->may_write()); // only happens on a read/cache
+ wait_for_unreadable_object(missing_oid, op);
+ return;
+ }
+ } else if (r == 0) {
+ if (is_unreadable_object(obc->obs.oi.soid)) {
+ dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+ << " is unreadable, waiting" << dendl;
+ wait_for_unreadable_object(obc->obs.oi.soid, op);
+ return;
+ }
+
+ // degraded object? (the check above was for head; this could be a clone)
+ if (write_ordered &&
+ obc->obs.oi.soid.snap != CEPH_NOSNAP &&
+ is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
+ dout(10) << __func__ << ": clone " << obc->obs.oi.soid
+ << " is degraded, waiting" << dendl;
+ wait_for_degraded_object(obc->obs.oi.soid, op);
+ return;
+ }
+ }
+
+ bool in_hit_set = false;
+ if (hit_set) {
+ if (obc.get()) {
+ if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
+ in_hit_set = true;
+ } else {
+ if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
+ in_hit_set = true;
+ }
+ if (!op->hitset_inserted) {
+ hit_set->insert(oid);
+ op->hitset_inserted = true;
+ if (hit_set->is_full() ||
+ hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
+ hit_set_persist();
+ }
+ }
+ }
+
+ if (agent_state) {
+ if (agent_choose_mode(false, op))
+ return;
+ }
+
+ if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ if (recover_adjacent_clones(obc, op)) {
+ return;
+ }
+ if (maybe_handle_manifest(op,
+ write_ordered,
+ obc))
+ return;
+ }
+
+ if (maybe_handle_cache(op,
+ write_ordered,
+ obc,
+ r,
+ missing_oid,
+ false,
+ in_hit_set))
+ return;
+
+ if (r && (r != -ENOENT || !obc)) {
+ // copy the reqids for copy get on ENOENT
+ if (r == -ENOENT &&
+ (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
+ fill_in_copy_get_noent(op, oid, m->ops[0]);
+ return;
+ }
+ dout(20) << __func__ << ": find_object_context got error " << r << dendl;
+ if (op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ record_write_error(op, oid, nullptr, r);
+ } else {
+ osd->reply_op_error(op, r);
+ }
+ return;
+ }
+
+ // make sure locator is consistent
+ object_locator_t oloc(obc->obs.oi.soid);
+ if (m->get_object_locator() != oloc) {
+ dout(10) << " provided locator " << m->get_object_locator()
+ << " != object's " << obc->obs.oi.soid << dendl;
+ osd->clog->warn() << "bad locator " << m->get_object_locator()
+ << " on object " << oloc
+ << " op " << *m;
+ }
+
+ // io blocked on obc?
+ if (obc->is_blocked() &&
+ !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+ return;
+ }
+
+ dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
+
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
+
+ if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
+ dout(20) << __func__ << ": skipping rw locks" << dendl;
+ } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
+ dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
+
+ // verify there is in fact a flush in progress
+ // FIXME: we could make this a stronger test.
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
+ if (p == flush_ops.end()) {
+ dout(10) << __func__ << " no flush in progress, aborting" << dendl;
+ reply_ctx(ctx, -EINVAL);
+ return;
+ }
+ } else if (!get_rw_locks(write_ordered, ctx)) {
+ dout(20) << __func__ << " waiting for rw locks " << dendl;
+ op->mark_delayed("waiting for rw locks");
+ close_op_ctx(ctx);
+ return;
+ }
+ dout(20) << __func__ << " obc " << *obc << dendl;
+
+ if (r) {
+ dout(20) << __func__ << " returned an error: " << r << dendl;
+ if (op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ record_write_error(op, oid, nullptr, r,
+ ctx->op->allows_returnvec() ? ctx : nullptr);
+ } else {
+ osd->reply_op_error(op, r);
+ }
+ close_op_ctx(ctx);
+ return;
+ }
+
+ if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
+ ctx->ignore_cache = true;
+ }
+
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
+ // This object is lost. Reading from it returns an error.
+ dout(20) << __func__ << ": object " << obc->obs.oi.soid
+ << " is lost" << dendl;
+ reply_ctx(ctx, -ENFILE);
+ return;
+ }
+ if (!op->may_write() &&
+ !op->may_cache() &&
+ (!obc->obs.exists ||
+ ((m->get_snapid() != CEPH_SNAPDIR) &&
+ obc->obs.oi.is_whiteout()))) {
+ // copy the reqids for copy get on ENOENT
+ if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
+ fill_in_copy_get_noent(op, oid, m->ops[0]);
+ close_op_ctx(ctx);
+ return;
+ }
+ reply_ctx(ctx, -ENOENT);
+ return;
+ }
+
+ op->mark_started();
+
+ execute_ctx(ctx);
+ utime_t prepare_latency = ceph_clock_now();
+ prepare_latency -= op->get_dequeued_time();
+ osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
+ if (op->may_read() && op->may_write()) {
+ osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
+ } else if (op->may_read()) {
+ osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
+ } else if (op->may_write() || op->may_cache()) {
+ osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
+ }
+
+ // force recovery of the oldest missing object if too many logs
+ maybe_force_recovery();
+}
+
+PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
+ OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc)
+{
+ ceph_assert(obc);
+ if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
+ dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ // if it is write-ordered and blocked, stop now
+ if (obc->is_blocked() && write_ordered) {
+ // we're already doing something with this object
+ dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
+ return cache_result_t::NOOP;
+ }
+
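+ // Ops that manipulate the manifest itself (set-redirect, set-chunk,
+ // unset, tier promote/flush/evict) must run through the normal op
+ // path rather than being proxied, so bail out early for those.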
+ vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
+ for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
+ OSDOp& osd_op = *p;
+ ceph_osd_op& op = osd_op.op;
+ if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
+ op.op == CEPH_OSD_OP_SET_CHUNK ||
+ op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
+ op.op == CEPH_OSD_OP_TIER_PROMOTE ||
+ op.op == CEPH_OSD_OP_TIER_FLUSH ||
+ op.op == CEPH_OSD_OP_TIER_EVICT) {
+ return cache_result_t::NOOP;
+ }
+ }
+
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ if (op->may_write() || write_ordered) {
+ do_proxy_write(op, obc);
+ } else {
+ // promoted object
+ if (obc->obs.oi.size != 0) {
+ return cache_result_t::NOOP;
+ }
+ do_proxy_read(op, obc);
+ }
+ return cache_result_t::HANDLED_PROXY;
+ case object_manifest_t::TYPE_CHUNKED:
+ {
+ if (can_proxy_chunked_read(op, obc)) {
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
+ if (p != flush_ops.end()) {
+ do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
+ return cache_result_t::HANDLED_PROXY;
+ }
+
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
+ hobject_t head = m->get_hobj();
+
+ if (is_degraded_or_backfilling_object(head)) {
+ dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
+ wait_for_degraded_object(head, op);
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+
+ if (m_scrubber->write_blocked_by_scrub(head)) {
+ dout(20) << __func__ << ": waiting for scrub" << dendl;
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+ if (!check_laggy_requeue(op)) {
+ return cache_result_t::BLOCKED_RECOVERY;
+ }
+
+ for (auto& p : obc->obs.oi.manifest.chunk_map) {
+ if (p.second.is_missing()) {
+ auto m = op->get_req<MOSDOp>();
+ const object_locator_t oloc = m->get_object_locator();
+ promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ }
+ return cache_result_t::NOOP;
+ }
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+
+ return cache_result_t::NOOP;
+}
+
+void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
+ MOSDOpReply *orig_reply, int r,
+ OpContext *ctx_for_op_returns)
+{
+ dout(20) << __func__ << " r=" << r << dendl;
+ ceph_assert(op->may_write());
+ const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
+ mempool::osd_pglog::list<pg_log_entry_t> entries;
+ entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
+ get_next_version(), eversion_t(), 0,
+ reqid, utime_t(), r));
+ if (ctx_for_op_returns) {
+ entries.back().set_op_returns(*ctx_for_op_returns->ops);
+ dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
+ }
+
+ struct OnComplete {
+ PrimaryLogPG *pg;
+ OpRequestRef op;
+ boost::intrusive_ptr<MOSDOpReply> orig_reply;
+ int r;
+ OnComplete(
+ PrimaryLogPG *pg,
+ OpRequestRef op,
+ MOSDOpReply *orig_reply,
+ int r)
+ : pg(pg), op(op),
+ orig_reply(orig_reply, false /* take over ref */), r(r)
+ {}
+ void operator()() {
+ ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
+ auto m = op->get_req<MOSDOp>();
+ MOSDOpReply *reply = orig_reply.detach();
+ ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
+ pg->osd->send_message_osd_client(reply, m->get_connection());
+ }
+ };
+
+ ObcLockManager lock_manager;
+ submit_log_entries(
+ entries,
+ std::move(lock_manager),
+ std::optional<std::function<void(void)> >(
+ OnComplete(this, op, orig_reply, r)),
+ op,
+ r);
+}
+
+PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
+ OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc,
+ int r, hobject_t missing_oid,
+ bool must_promote,
+ bool in_hit_set,
+ ObjectContextRef *promote_obc)
+{
+ // return quickly if caching is not enabled
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
+ return cache_result_t::NOOP;
+
+ if (op &&
+ op->get_req() &&
+ op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
+ (op->get_req<MOSDOp>()->get_flags() &
+ CEPH_OSD_FLAG_IGNORE_CACHE)) {
+ dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ must_promote = must_promote || op->need_promote();
+
+ if (obc)
+ dout(25) << __func__ << " " << obc->obs.oi << " "
+ << (obc->obs.exists ? "exists" : "DNE")
+ << " missing_oid " << missing_oid
+ << " must_promote " << (int)must_promote
+ << " in_hit_set " << (int)in_hit_set
+ << dendl;
+ else
+ dout(25) << __func__ << " (no obc)"
+ << " missing_oid " << missing_oid
+ << " must_promote " << (int)must_promote
+ << " in_hit_set " << (int)in_hit_set
+ << dendl;
+
+ // if it is write-ordered and blocked, stop now
+ if (obc.get() && obc->is_blocked() && write_ordered) {
+ // we're already doing something with this object
+ dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
+ return cache_result_t::NOOP;
+ }
+
+ if (r == -ENOENT && missing_oid == hobject_t()) {
+ // we know this object is logically absent (e.g., an undefined clone)
+ return cache_result_t::NOOP;
+ }
+
+ if (obc.get() && obc->obs.exists) {
+ osd->logger->inc(l_osd_op_cache_hit);
+ return cache_result_t::NOOP;
+ }
+ if (!is_primary()) {
+ dout(20) << __func__ << " cache miss; ask the primary" << dendl;
+ osd->reply_op_error(op, -EAGAIN);
+ return cache_result_t::REPLIED_WITH_EAGAIN;
+ }
+
+ if (missing_oid == hobject_t() && obc.get()) {
+ missing_oid = obc->obs.oi.soid;
+ }
+
+ auto m = op->get_req<MOSDOp>();
+ const object_locator_t oloc = m->get_object_locator();
+
+ if (op->need_skip_handle_cache()) {
+ return cache_result_t::NOOP;
+ }
+
+ OpRequestRef promote_op;
+
+ switch (pool.info.cache_mode) {
+ case pg_pool_t::CACHEMODE_WRITEBACK:
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ if (!op->may_write() && !op->may_cache() &&
+ !write_ordered && !must_promote) {
+ dout(20) << __func__ << " cache pool full, proxying read" << dendl;
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+
+ if (must_promote || (!hit_set && !op->need_skip_promote())) {
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+ if (op->may_write() || op->may_cache()) {
+ do_proxy_write(op);
+
+ // Promote too?
+ if (!op->need_skip_promote() &&
+ maybe_promote(obc, missing_oid, oloc, in_hit_set,
+ pool.info.min_write_recency_for_promote,
+ OpRequestRef(),
+ promote_obc)) {
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ return cache_result_t::HANDLED_PROXY;
+ } else {
+ do_proxy_read(op);
+
+ // Avoid duplicate promotion
+ if (obc.get() && obc->is_blocked()) {
+ if (promote_obc)
+ *promote_obc = obc;
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+ // Promote too?
+ if (!op->need_skip_promote()) {
+ (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
+ pool.info.min_read_recency_for_promote,
+ promote_op, promote_obc);
+ }
+
+ return cache_result_t::HANDLED_PROXY;
+ }
+ ceph_abort_msg("unreachable");
+ return cache_result_t::NOOP;
+
+ case pg_pool_t::CACHEMODE_READONLY:
+ // TODO: clean this case up
+ if (!obc.get() && r == -ENOENT) {
+ // we don't have the object and op's a read
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+ if (!r) { // it must be a write
+ do_cache_redirect(op);
+ return cache_result_t::HANDLED_REDIRECT;
+ }
+ // crap, there was a failure of some kind
+ return cache_result_t::NOOP;
+
+ case pg_pool_t::CACHEMODE_FORWARD:
+ // this mode is deprecated; proxy instead
+ case pg_pool_t::CACHEMODE_PROXY:
+ if (!must_promote) {
+ if (op->may_write() || op->may_cache() || write_ordered) {
+ do_proxy_write(op);
+ return cache_result_t::HANDLED_PROXY;
+ } else {
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+ }
+ }
+ // ugh, we're forced to promote.
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+
+ case pg_pool_t::CACHEMODE_READFORWARD:
+ // this mode is deprecated; proxy instead
+ case pg_pool_t::CACHEMODE_READPROXY:
+ // Do writeback to the cache tier for writes
+ if (op->may_write() || write_ordered || must_promote) {
+ if (agent_state &&
+ agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ dout(20) << __func__ << " cache pool full, waiting" << dendl;
+ block_write_on_full_cache(missing_oid, op);
+ return cache_result_t::BLOCKED_FULL;
+ }
+ promote_object(obc, missing_oid, oloc, op, promote_obc);
+ return cache_result_t::BLOCKED_PROMOTE;
+ }
+
+ // Otherwise it is a read; proxy it to the base tier.
+ do_proxy_read(op);
+ return cache_result_t::HANDLED_PROXY;
+
+ default:
+ ceph_abort_msg("unrecognized cache_mode");
+ }
+ return cache_result_t::NOOP;
+}
+
+bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ bool in_hit_set,
+ uint32_t recency,
+ OpRequestRef promote_op,
+ ObjectContextRef *promote_obc)
+{
+ dout(20) << __func__ << " missing_oid " << missing_oid
+ << " in_hit_set " << in_hit_set << dendl;
+
+ switch (recency) {
+ case 0:
+ break;
+ case 1:
+ // Check if in the current hit set
+ if (in_hit_set) {
+ break;
+ } else {
+ // not promoting
+ return false;
+ }
+ break;
+ default:
+ {
+ unsigned count = (int)in_hit_set;
+ if (count) {
+ // Check if in other hit sets
+ const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
+ for (map<time_t,HitSetRef>::reverse_iterator itor =
+ agent_state->hit_set_map.rbegin();
+ itor != agent_state->hit_set_map.rend();
+ ++itor) {
+ if (!itor->second->contains(oid)) {
+ break;
+ }
+ ++count;
+ if (count >= recency) {
+ break;
+ }
+ }
+ }
+ if (count >= recency) {
+ break;
+ }
+ return false; // not promoting
+ }
+ break;
+ }
+
+ if (osd->promote_throttle()) {
+ dout(10) << __func__ << " promote throttled" << dendl;
+ return false;
+ }
+ promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
+ return true;
+}
+
+void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDOp>();
+ int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
+ MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
+ flags, false);
+ request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
+ reply->set_redirect(redir);
+ dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
+ << op << dendl;
+ m->get_connection()->send_message(reply);
+ return;
+}
+
+struct C_ProxyRead : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyReadOpRef prdop;
+ utime_t start;
+ C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyReadOpRef& prd)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), prdop(prd), start(ceph_clock_now())
+ {}
+ void finish(int r) override {
+ if (prdop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (prdop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_proxy_read(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
+ }
+ }
+};
+
+struct C_ProxyChunkRead : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyReadOpRef prdop;
+ utime_t start;
+ ObjectOperation *obj_op;
+ int op_index = 0;
+ uint64_t req_offset = 0;
+ ObjectContextRef obc;
+ uint64_t req_total_len = 0;
+ C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyReadOpRef& prd)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
+ {}
+ void finish(int r) override {
+ if (prdop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (prdop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ if (r >= 0) {
+ if (!prdop->ops[op_index].outdata.length()) {
+ ceph_assert(req_total_len);
+ bufferlist list;
+ bufferptr bptr(req_total_len);
+ list.push_back(std::move(bptr));
+ prdop->ops[op_index].outdata.append(list);
+ }
+ ceph_assert(obj_op);
+ uint64_t copy_offset;
+ if (req_offset >= prdop->ops[op_index].op.extent.offset) {
+ copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
+ } else {
+ copy_offset = 0;
+ }
+ prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
+ obj_op->ops[0].outdata.length(),
+ obj_op->ops[0].outdata.c_str());
+ }
+
+ pg->finish_proxy_read(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
+ if (obj_op) {
+ delete obj_op;
+ }
+ }
+ }
+};
+
+void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
+{
+ // NOTE: non-const here because the ProxyReadOp needs mutable refs to
+ // stash the result in the request's OSDOp vector
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_locator_t oloc;
+ hobject_t soid;
+ /* extensible tier */
+ if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
+ soid = obc->obs.oi.manifest.redirect_target;
+ break;
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ } else {
+ /* proxy */
+ soid = m->get_hobj();
+ oloc = object_locator_t(m->get_object_locator());
+ oloc.pool = pool.info.tier_of;
+ }
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+
+ // pass through some original flags that make sense.
+ // - leave out redirection and balancing flags since we are
+ // already proxying through the primary
+ // - leave off read/write/exec flags that are derived from the op
+ flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
+ CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC |
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE);
+
+ dout(10) << __func__ << " Start proxy read for " << *m << dendl;
+
+ ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
+
+ ObjectOperation obj_op;
+ obj_op.dup(prdop->ops);
+
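+ // For a writeback cache tier that is not in full-eviction mode the
+ // object is likely to be promoted soon, so (presumably for that
+ // reason) the proxied read ops are marked FADVISE_SEQUENTIAL and the
+ // DONTNEED/NOCACHE hints are stripped.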
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+ (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
+ for (unsigned i = 0; i < obj_op.ops.size(); i++) {
+ ceph_osd_op op = obj_op.ops[i].op;
+ switch (op.op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_CHECKSUM:
+ case CEPH_OSD_OP_CMPEXT:
+ op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
+ ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ }
+ }
+ }
+
+ C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
+ prdop);
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, obj_op,
+ m->get_snapid(), NULL,
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &prdop->user_version,
+ &prdop->data_offset,
+ m->get_features());
+ fin->tid = tid;
+ prdop->objecter_tid = tid;
+ proxyread_ops[tid] = prdop;
+ in_progress_proxy_ops[soid].push_back(op);
+}
+
+void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+
+ map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
+ if (p == proxyread_ops.end()) {
+ dout(10) << __func__ << " no proxyread_op found" << dendl;
+ return;
+ }
+ ProxyReadOpRef prdop = p->second;
+ if (tid != prdop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
+ << " tid " << prdop->objecter_tid << dendl;
+ return;
+ }
+ if (oid != prdop->soid) {
+ dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
+ << " soid " << prdop->soid << dendl;
+ return;
+ }
+ proxyread_ops.erase(tid);
+
+ map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
+ if (q == in_progress_proxy_ops.end()) {
+ dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
+ return;
+ }
+ ceph_assert(q->second.size());
+ list<OpRequestRef>::iterator it = std::find(q->second.begin(),
+ q->second.end(),
+ prdop->op);
+ ceph_assert(it != q->second.end());
+ OpRequestRef op = *it;
+ q->second.erase(it);
+ if (q->second.size() == 0) {
+ in_progress_proxy_ops.erase(oid);
+ } else if (std::find(q->second.begin(),
+ q->second.end(),
+ prdop->op) != q->second.end()) {
+ /* multiple read case */
+ dout(20) << __func__ << " " << oid << " is not completed " << dendl;
+ return;
+ }
+
+ osd->logger->inc(l_osd_tier_proxy_read);
+
+ auto m = op->get_req<MOSDOp>();
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
+ ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ ctx->user_at_version = prdop->user_version;
+ ctx->data_off = prdop->data_offset;
+ ctx->ignore_log_op_stats = true;
+ complete_read_ctx(r, ctx);
+}
+
+void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
+{
+ map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
+ if (p == in_progress_proxy_ops.end())
+ return;
+
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
+ requeue_ops(ls);
+ in_progress_proxy_ops.erase(p);
+}
+
+void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << prdop->soid << dendl;
+ prdop->canceled = true;
+
+ // cancel objecter op, if we can
+ if (prdop->objecter_tid) {
+ tids->push_back(prdop->objecter_tid);
+ for (uint32_t i = 0; i < prdop->ops.size(); i++) {
+ prdop->ops[i].outdata.clear();
+ }
+ proxyread_ops.erase(prdop->objecter_tid);
+ prdop->objecter_tid = 0;
+ }
+}
+
+void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+
+ // cancel proxy reads
+ map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
+ while (p != proxyread_ops.end()) {
+ cancel_proxy_read((p++)->second, tids);
+ }
+
+ // cancel proxy writes
+ map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
+ while (q != proxywrite_ops.end()) {
+ cancel_proxy_write((q++)->second, tids);
+ }
+
+ if (requeue) {
+ map<hobject_t, list<OpRequestRef>>::iterator p =
+ in_progress_proxy_ops.begin();
+ while (p != in_progress_proxy_ops.end()) {
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
+ << " requests" << dendl;
+ requeue_ops(ls);
+ in_progress_proxy_ops.erase(p++);
+ }
+ } else {
+ in_progress_proxy_ops.clear();
+ }
+}
+
+struct C_ProxyWrite_Commit : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::ProxyWriteOpRef pwop;
+ C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::ProxyWriteOpRef& pw)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), pwop(pw)
+ {}
+ void finish(int r) override {
+ if (pwop->canceled)
+ return;
+ std::scoped_lock locker{*pg};
+ if (pwop->canceled) {
+ return;
+ }
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_proxy_write(oid, tid, r);
+ }
+ }
+};
+
+void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
+{
+ // NOTE: non-const because ProxyWriteOp takes a mutable ref
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_locator_t oloc;
+ SnapContext snapc(m->get_snap_seq(), m->get_snaps());
+ hobject_t soid;
+ /* extensible tier */
+ if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
+ switch (obc->obs.oi.manifest.type) {
+ case object_manifest_t::TYPE_REDIRECT:
+ oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
+ soid = obc->obs.oi.manifest.redirect_target;
+ break;
+ default:
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ } else {
+ /* proxy */
+ soid = m->get_hobj();
+ oloc = object_locator_t(m->get_object_locator());
+ oloc.pool = pool.info.tier_of;
+ }
+
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (!(op->may_write() || op->may_cache())) {
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+ }
+ if (op->allows_returnvec()) {
+ flags |= CEPH_OSD_FLAG_RETURNVEC;
+ }
+
+ dout(10) << __func__ << " Start proxy write for " << *m << dendl;
+
+ ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
+ pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
+ pwop->mtime = m->get_mtime();
+
+ ObjectOperation obj_op;
+ obj_op.dup(pwop->ops);
+
+ C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
+ this, soid, get_last_peering_reset(), pwop);
+ ceph_tid_t tid = osd->objecter->mutate(
+ soid.oid, oloc, obj_op, snapc,
+ ceph::real_clock::from_ceph_timespec(pwop->mtime),
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &pwop->user_version, pwop->reqid);
+ fin->tid = tid;
+ pwop->objecter_tid = tid;
+ proxywrite_ops[tid] = pwop;
+ in_progress_proxy_ops[soid].push_back(op);
+}
+
+void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
+ ObjectContextRef obc, bool write_ordered)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ OSDOp *osd_op = NULL;
+ for (unsigned int i = 0; i < m->ops.size(); i++) {
+ osd_op = &m->ops[i];
+ uint64_t cursor = osd_op->op.extent.offset;
+ uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
+ uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
+
+ while (cursor < op_length) {
+ chunk_index = 0;
+ chunk_length = 0;
+ /* find the right chunk position for cursor */
+ for (auto &p : manifest->chunk_map) {
+ if (p.first <= cursor && p.first + p.second.length > cursor) {
+ chunk_length = p.second.length;
+ chunk_index = p.first;
+ break;
+ }
+ }
+ /* no index */
+ if (!chunk_index && !chunk_length) {
+ if (cursor == osd_op->op.extent.offset) {
+ OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
+ ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ ctx->data_off = osd_op->op.extent.offset;
+ ctx->ignore_log_op_stats = true;
+ complete_read_ctx(0, ctx);
+ }
+ break;
+ }
+ uint64_t next_length = chunk_length;
+ /* the size to read -> | op length | */
+ /* | a chunk | */
+ if (cursor + next_length > op_length) {
+ next_length = op_length - cursor;
+ }
+ /* the size to read -> | op length | */
+ /* | a chunk | */
+ if (cursor + next_length > chunk_index + chunk_length) {
+ next_length = chunk_index + chunk_length - cursor;
+ }
+
+ chunk_read[cursor] = {{chunk_index, next_length}};
+ cursor += next_length;
+ }
+
+ req_len = cursor - osd_op->op.extent.offset;
+ for (auto &p : chunk_read) {
+ auto chunks = p.second.begin();
+ dout(20) << __func__ << " chunk_index: " << chunks->first
+ << " next_length: " << chunks->second << " cursor: "
+ << p.first << dendl;
+ do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
+ }
+ }
+}
+
+struct RefCountCallback : public Context {
+public:
+ PrimaryLogPG::OpContext *ctx;
+ OSDOp& osd_op;
+ bool requeue = false;
+
+ RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
+ : ctx(ctx), osd_op(osd_op) {}
+ void finish(int r) override {
+ // NB: caller must already have pg->lock held
+ ctx->obc->stop_block();
+ ctx->pg->kick_object_context_blocked(ctx->obc);
+ if (r >= 0) {
+ osd_op.rval = 0;
+ ctx->pg->execute_ctx(ctx);
+ } else {
+ // on cancel simply toss op out,
+ // or requeue as requested
+ if (r != -ECANCELED) {
+ if (ctx->op)
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ } else if (requeue) {
+ if (ctx->op)
+ ctx->pg->requeue_op(ctx->op);
+ }
+ ctx->pg->close_op_ctx(ctx);
+ }
+ }
+ void set_requeue(bool rq) {
+ requeue = rq;
+ }
+};
+
+struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
+ OSDOp& osd_op;
+
+ explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
+ }
+
+ int execute() override {
+ return osd_op.rval;
+ }
+};
+
+struct C_SetManifestRefCountDone : public Context {
+ PrimaryLogPGRef pg;
+ PrimaryLogPG::ManifestOpRef mop;
+ hobject_t soid;
+ C_SetManifestRefCountDone(PrimaryLogPG *p,
+ PrimaryLogPG::ManifestOpRef mop, hobject_t soid) :
+ pg(p), mop(mop), soid(soid) {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ auto it = pg->manifest_ops.find(soid);
+ if (it == pg->manifest_ops.end()) {
+ // raced with cancel_manifest_ops
+ return;
+ }
+ if (it->second->cb) {
+ it->second->cb->complete(r);
+ }
+ pg->manifest_ops.erase(it);
+ mop.reset();
+ }
+};
+
+struct C_SetDedupChunks : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ uint64_t offset;
+
+ C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), offset(offset)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ if (last_peering_reset != pg->get_last_peering_reset()) {
+ return;
+ }
+ pg->finish_set_dedup(oid, r, tid, offset);
+ }
+};
+
+void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ auto p = manifest_ops.begin();
+ while (p != manifest_ops.end()) {
+ auto mop = p->second;
+ // cancel objecter op, if we can
+ if (mop->objecter_tid) {
+ tids->push_back(mop->objecter_tid);
+ mop->objecter_tid = 0;
+ }
+ if (mop->cb) {
+ mop->cb->set_requeue(requeue);
+ mop->cb->complete(-ECANCELED);
+ }
+ manifest_ops.erase(p++);
+ }
+}
+
+int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
+{
+ int cnt = 0;
+ // head
+ for (auto &p : obc->obs.oi.manifest.chunk_map) {
+ if (p.second.oid.oid.name == fp_oid) {
+ cnt++;
+ }
+ }
+ // snap
+ SnapSet& ss = obc->ssc->snapset;
+ const OSDMapRef& osdmap = get_osdmap();
+ for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
+ p != ss.clones.rend();
+ ++p) {
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l = nullptr;
+ ObjectContextRef obc_g = nullptr;
+ hobject_t clone_oid = obc->obs.oi.soid;
+ clone_oid.snap = *p;
+ if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
+ return -EBUSY;
+ }
+ ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+ if (!clone_obc) {
+ break;
+ }
+ if (recover_adjacent_clones(clone_obc, op)) {
+ return -EAGAIN;
+ }
+ get_adjacent_clones(clone_obc, obc_l, obc_g);
+ clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr ,
+ nullptr,
+ refs);
+ for (auto p = refs.begin(); p != refs.end(); ++p) {
+ if (p->first.oid.name == fp_oid && p->second > 0) {
+ cnt += p->second;
+ }
+ }
+ }
+
+ return cnt;
+}
+
+bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
+{
+ if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
+ return false;
+ }
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ bool has_manifest_op = std::any_of(
+ begin(m->ops),
+ end(m->ops),
+ [](const auto& osd_op) {
+ return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
+ });
+ if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
+ return false;
+ }
+ ceph_assert(op);
+
+ const SnapSet& snapset = obc->ssc->snapset;
+ auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
+ auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
+ hobject_t cid = obc->obs.oi.soid;
+ cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
+ if (is_unreadable_object(cid)) {
+ dout(10) << __func__ << ": clone " << cid
+ << " is unreadable, waiting" << dendl;
+ wait_for_unreadable_object(cid, op);
+ return true;
+ }
+ return false;
+ };
+ if (s != snapset.clones.begin()) {
+ if (is_unreadable_snap(s - 1)) {
+ return true;
+ }
+ }
+ if (s != snapset.clones.end()) {
+ if (is_unreadable_snap(s + 1)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
+{
+ auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
+ obc->obs.oi.soid.snap);
+ if (s != obc->ssc->snapset.clones.begin()) {
+ auto s_iter = s - 1;
+ hobject_t cid = obc->obs.oi.soid;
+ object_ref_delta_t refs;
+ cid.snap = *s_iter;
+ ObjectContextRef cobc = get_object_context(cid, false, NULL);
+ ceph_assert(cobc);
+ return cobc;
+ }
+ return nullptr;
+}
+
+void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
+{
+ for (auto p = refs.begin(); p != refs.end(); ++p) {
+ int dec_ref_count = p->second;
+ ceph_assert(dec_ref_count < 0);
+ while (dec_ref_count < 0) {
+ dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
+ refcount_manifest(soid, p->first,
+ refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ dec_ref_count++;
+ }
+ }
+}
+
+
+void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
+ ObjectContextRef& _l, ObjectContextRef& _g)
+{
+ const SnapSet& snapset = src_obc->ssc->snapset;
+ const object_info_t& oi = src_obc->obs.oi;
+
+ auto get_context = [this, &oi, &snapset](auto iter)
+ -> ObjectContextRef {
+ hobject_t cid = oi.soid;
+ cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
+ ObjectContextRef obc = get_object_context(cid, false, NULL);
+ ceph_assert(obc);
+ return obc;
+ };
+
+ // check adjacent clones
+ auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
+
+ // We *must* find the clone iff it's not head,
+ // let s == snapset.clones.end() mean head
+ ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
+
+ if (s != snapset.clones.begin()) {
+ _l = get_context(s - 1);
+ }
+
+ if (s != snapset.clones.end()) {
+ _g = get_context(s + 1);
+ }
+}
+
+bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
+ OSDOp& osd_op)
+{
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(ctx->obc, obc_l, obc_g);
+ set_chunk.calc_refs_to_inc_on_set(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+ if (!refs.is_empty()) {
+ /* This is called by set-chunk, so we only consider a single chunk for the time being */
+ ceph_assert(refs.size() == 1);
+ auto p = refs.begin();
+ int inc_ref_count = p->second;
+ if (inc_ref_count > 0) {
+ /*
+ * In the set-chunk case, the first thing we should do is increment
+ * the reference count on the target object prior to updating
+ * object_manifest in object_info_t. So, call refcount_manifest directly.
+ */
+ ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
+ C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, ctx->obs->oi.soid);
+ ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, p->first,
+ refcount_t::INCREMENT_REF, fin, std::nullopt);
+ mop->objecter_tid = tid;
+ manifest_ops[ctx->obs->oi.soid] = mop;
+ ctx->obc->start_block();
+ return true;
+ } else if (inc_ref_count < 0) {
+ hobject_t src = ctx->obs->oi.soid;
+ hobject_t tgt = p->first;
+ ctx->register_on_commit(
+ [src, tgt, this](){
+ refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ });
+ return false;
+ }
+ }
+
+ return false;
+}
+
+void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
+{
+ object_ref_delta_t refs;
+ ObjectContextRef cobc = nullptr;
+ ObjectContextRef obc = ctx->obc;
+ for (auto &p : ctx->obs->oi.manifest.chunk_map) {
+ if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
+ ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
+ if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
+ ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ ctx->delta_stats.num_objects_manifest--;
+ }
+ }
+ }
+ // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
+ cobc = get_prev_clone_obc(obc);
+ obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
+ cobc ? &cobc->obs.oi.manifest : nullptr,
+ ctx->clean_regions,
+ refs);
+ if (!refs.is_empty()) {
+ hobject_t soid = obc->obs.oi.soid;
+ ctx->register_on_commit(
+ [soid, this, refs](){
+ dec_refcount(soid, refs);
+ });
+ }
+}
+
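+ // Drop all references held by a manifest object that is being removed:
+ // for chunked manifests, compute the per-chunk deltas against the
+ // adjacent clones; for redirects that hold a reference, decrement the
+ // refcount on the redirect target. All decrements run on commit.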
+void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
+{
+ ceph_assert(oi.has_manifest());
+ ceph_assert(ctx->obc->ssc);
+
+ if (oi.manifest.is_chunked()) {
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(ctx->obc, obc_l, obc_g);
+ oi.manifest.calc_refs_to_drop_on_removal(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+
+ if (!refs.is_empty()) {
+ hobject_t soid = ctx->obc->obs.oi.soid;
+ ctx->register_on_commit(
+ [soid, this, refs](){
+ dec_refcount(soid, refs);
+ });
+ }
+ } else if (oi.manifest.is_redirect() &&
+ oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
+ ctx->register_on_commit(
+ [oi, this](){
+ refcount_manifest(oi.soid, oi.manifest.redirect_target,
+ refcount_t::DECREMENT_REF, NULL, std::nullopt);
+ });
+ }
+}
+
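+ // Send a cls_cas call to the target object to adjust its reference count
+ // (get / put / create-or-get), tagged with the source object's head.
+ // The optional Context is wrapped in the objecter finisher and invoked
+ // when the mutation completes; the objecter tid is returned.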
+ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
+ Context *cb, std::optional<bufferlist> chunk)
+{
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_RWORDERED;
+
+ dout(10) << __func__ << " Start refcount from " << src_soid
+ << " to " << tgt_soid << dendl;
+
+ ObjectOperation obj_op;
+ bufferlist in;
+ if (type == refcount_t::INCREMENT_REF) {
+ cls_cas_chunk_get_ref_op call;
+ call.source = src_soid.get_head();
+ ::encode(call, in);
+ obj_op.call("cas", "chunk_get_ref", in);
+ } else if (type == refcount_t::DECREMENT_REF) {
+ cls_cas_chunk_put_ref_op call;
+ call.source = src_soid.get_head();
+ ::encode(call, in);
+ obj_op.call("cas", "chunk_put_ref", in);
+ } else if (type == refcount_t::CREATE_OR_GET_REF) {
+ cls_cas_chunk_create_or_get_ref_op get_call;
+ get_call.source = src_soid.get_head();
+ ceph_assert(chunk);
+ get_call.data = std::move(*chunk);
+ ::encode(get_call, in);
+ obj_op.call("cas", "chunk_create_or_get_ref", in);
+ } else {
+ ceph_assert(0 == "unrecognized type");
+ }
+
+ Context *c = nullptr;
+ if (cb) {
+ c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
+ }
+
+ object_locator_t oloc(tgt_soid);
+ ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
+ ceph_assert(src_obc);
+ auto tid = osd->objecter->mutate(
+ tgt_soid.oid, oloc, obj_op, SnapContext(),
+ ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
+ flags, c);
+ return tid;
+}
+
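+ // Proxy a read of a single chunk to the backing object recorded in the
+ // manifest chunk_map: translate the client's offset into the chunk's
+ // offset space, issue the read via the objecter, and track it in
+ // proxyread_ops / in_progress_proxy_ops.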
+void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
+ uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
+ uint64_t req_total_len, bool write_ordered)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ if (!manifest->chunk_map.count(chunk_index)) {
+ return;
+ }
+ uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
+ hobject_t soid = manifest->chunk_map[chunk_index].oid;
+ hobject_t ori_soid = m->get_hobj();
+ object_locator_t oloc(soid);
+ unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (write_ordered) {
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+ }
+
+ if (!chunk_length || soid == hobject_t()) {
+ return;
+ }
+
+ /* same as do_proxy_read() */
+ flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
+ CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC |
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE);
+
+ dout(10) << __func__ << " Start do chunk proxy read for " << *m
+ << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
+ << " req_length: " << req_length << dendl;
+
+ ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
+
+ ObjectOperation *pobj_op = new ObjectOperation;
+ OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
+
+ if (chunk_index <= req_offset) {
+ osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
+ } else {
+ ceph_abort_msg("chunk_index > req_offset");
+ }
+ osd_op.op.extent.length = req_length;
+
+ ObjectOperation obj_op;
+ obj_op.dup(pobj_op->ops);
+
+ C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
+ prdop);
+ fin->obj_op = pobj_op;
+ fin->op_index = op_index;
+ fin->req_offset = req_offset;
+ fin->obc = obc;
+ fin->req_total_len = req_total_len;
+
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, obj_op,
+ m->get_snapid(), NULL,
+ flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ &prdop->user_version,
+ &prdop->data_offset,
+ m->get_features());
+ fin->tid = tid;
+ prdop->objecter_tid = tid;
+ proxyread_ops[tid] = prdop;
+ in_progress_proxy_ops[ori_soid].push_back(op);
+}
+
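+ // Return true only if every (SYNC_)READ in the message is fully covered
+ // by chunk_map entries that are marked missing locally; anything else
+ // (other op types, gaps in the chunk_map, or locally present chunks)
+ // cannot be proxied.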
+bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
+ OSDOp *osd_op = NULL;
+ bool ret = true;
+ for (unsigned int i = 0; i < m->ops.size(); i++) {
+ osd_op = &m->ops[i];
+ ceph_osd_op op = osd_op->op;
+ switch (op.op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ: {
+ uint64_t cursor = osd_op->op.extent.offset;
+ uint64_t remain = osd_op->op.extent.length;
+
+ /* do the requested chunks exist in chunk_map? */
+ for (auto &p : obc->obs.oi.manifest.chunk_map) {
+ if (p.first <= cursor && p.first + p.second.length > cursor) {
+ if (!p.second.is_missing()) {
+ return false;
+ }
+ if (p.second.length >= remain) {
+ remain = 0;
+ break;
+ } else {
+ remain = remain - p.second.length;
+ }
+ cursor += p.second.length;
+ }
+ }
+
+ if (remain) {
+ dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
+ return false;
+ }
+ continue;
+ }
+ default:
+ return false;
+ }
+ }
+ return ret;
+}
+
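+ // Completion handler for a proxied write: drop the op from
+ // proxywrite_ops and in_progress_proxy_ops, and send the commit reply to
+ // the client if it has not been sent already.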
+void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+
+ map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
+ if (p == proxywrite_ops.end()) {
+ dout(10) << __func__ << " no proxywrite_op found" << dendl;
+ return;
+ }
+ ProxyWriteOpRef pwop = p->second;
+ ceph_assert(tid == pwop->objecter_tid);
+ ceph_assert(oid == pwop->soid);
+
+ proxywrite_ops.erase(tid);
+
+ map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
+ if (q == in_progress_proxy_ops.end()) {
+ dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ return;
+ }
+ list<OpRequestRef>& in_progress_op = q->second;
+ ceph_assert(in_progress_op.size());
+ list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
+ in_progress_op.end(),
+ pwop->op);
+ ceph_assert(it != in_progress_op.end());
+ in_progress_op.erase(it);
+ if (in_progress_op.size() == 0) {
+ in_progress_proxy_ops.erase(oid);
+ } else if (std::find(in_progress_op.begin(),
+ in_progress_op.end(),
+ pwop->op) != in_progress_op.end()) {
+ if (pwop->ctx)
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ dout(20) << __func__ << " " << oid << " tid " << tid
+ << " in_progress_op size: "
+ << in_progress_op.size() << dendl;
+ return;
+ }
+
+ osd->logger->inc(l_osd_tier_proxy_write);
+
+ auto m = pwop->op->get_req<MOSDOp>();
+ ceph_assert(m != NULL);
+
+ if (!pwop->sent_reply) {
+ // send commit.
+ ceph_assert(pwop->ctx->reply == nullptr);
+ MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
+ true /* we claim it below */);
+ reply->set_reply_versions(eversion_t(), pwop->user_version);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ reply->claim_op_out_data(pwop->ops);
+ dout(10) << " sending commit on " << pwop << " " << reply << dendl;
+ osd->send_message_osd_client(reply, m->get_connection());
+ pwop->sent_reply = true;
+ pwop->ctx->op->mark_commit_sent();
+ }
+
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+}
+
+void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << pwop->soid << dendl;
+ pwop->canceled = true;
+
+ // cancel objecter op, if we can
+ if (pwop->objecter_tid) {
+ tids->push_back(pwop->objecter_tid);
+ delete pwop->ctx;
+ pwop->ctx = NULL;
+ proxywrite_ops.erase(pwop->objecter_tid);
+ pwop->objecter_tid = 0;
+ }
+}
+
+class PromoteCallback: public PrimaryLogPG::CopyCallback {
+ ObjectContextRef obc;
+ PrimaryLogPG *pg;
+ utime_t start;
+public:
+ PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
+ : obc(obc_),
+ pg(pg_),
+ start(ceph_clock_now()) {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results) override {
+ PrimaryLogPG::CopyResults *results_data = results.get<1>();
+ int r = results.get<0>();
+ pg->finish_promote(r, results_data, obc);
+ pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
+ }
+};
+
+class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
+ ObjectContextRef obc;
+ PrimaryLogPG *pg;
+ utime_t start;
+ PrimaryLogPG::OpContext *ctx;
+ PrimaryLogPG::CopyCallbackResults promote_results;
+public:
+ PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
+ : obc(obc_),
+ pg(pg_),
+ start(ceph_clock_now()), ctx(ctx) {}
+
+ void finish(PrimaryLogPG::CopyCallbackResults results) override {
+ PrimaryLogPG::CopyResults *results_data = results.get<1>();
+ int r = results.get<0>();
+ if (ctx) {
+ promote_results = results;
+ pg->execute_ctx(ctx);
+ } else {
+ pg->finish_promote_manifest(r, results_data, obc);
+ }
+ pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
+ }
+ friend struct PromoteFinisher;
+};
+
+struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
+ PromoteManifestCallback *promote_callback;
+
+ explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
+ : promote_callback(promote_callback) {
+ }
+
+ int execute() override {
+ if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
+ promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
+ promote_callback->promote_results.get<1>(),
+ promote_callback->obc);
+ } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
+ promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
+ promote_callback->promote_results.get<1>(),
+ promote_callback->obc);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ return 0;
+ }
+};
+
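+ // Promote an object into this pool via copy-from: the copy source is the
+ // base tier for plain (non-manifest) objects, the object itself for
+ // chunked manifests, or the redirect target for redirects. The object,
+ // and the requesting op if any, stays blocked until the promote
+ // completes.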
+void PrimaryLogPG::promote_object(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ OpRequestRef op,
+ ObjectContextRef *promote_obc)
+{
+ hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
+ ceph_assert(hoid != hobject_t());
+ if (m_scrubber->write_blocked_by_scrub(hoid)) {
+ dout(10) << __func__ << " " << hoid
+ << " blocked by scrub" << dendl;
+ if (op) {
+ waiting_for_scrub.push_back(op);
+ op->mark_delayed("waiting for scrub");
+ dout(10) << __func__ << " " << hoid
+ << " placing op in waiting_for_scrub" << dendl;
+ } else {
+ dout(10) << __func__ << " " << hoid
+ << " no op, dropping on the floor" << dendl;
+ }
+ return;
+ }
+ if (op && !check_laggy_requeue(op)) {
+ return;
+ }
+ if (!obc) { // we need to create an ObjectContext
+ ceph_assert(missing_oid != hobject_t());
+ obc = get_object_context(missing_oid, true);
+ }
+ if (promote_obc)
+ *promote_obc = obc;
+
+ /*
+ * If there are in-flight proxy reads for this object while the promote
+ * has not yet completed, don't use DONTNEED.
+ */
+ unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
+ if (q == in_progress_proxy_ops.end()) {
+ src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ }
+
+ CopyCallback *cb;
+ object_locator_t my_oloc;
+ hobject_t src_hoid;
+ if (!obc->obs.oi.has_manifest()) {
+ my_oloc = oloc;
+ my_oloc.pool = pool.info.tier_of;
+ src_hoid = obc->obs.oi.soid;
+ cb = new PromoteCallback(obc, this);
+ } else {
+ if (obc->obs.oi.manifest.is_chunked()) {
+ src_hoid = obc->obs.oi.soid;
+ cb = new PromoteManifestCallback(obc, this);
+ } else if (obc->obs.oi.manifest.is_redirect()) {
+ object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
+ my_oloc = src_oloc;
+ src_hoid = obc->obs.oi.manifest.redirect_target;
+ cb = new PromoteCallback(obc, this);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+
+ unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
+ start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
+ obc->obs.oi.soid.snap == CEPH_NOSNAP,
+ src_fadvise_flags, 0);
+
+ ceph_assert(obc->is_blocked());
+
+ if (op)
+ wait_for_blocked_object(obc->obs.oi.soid, op);
+
+ recovery_state.update_stats(
+ [](auto &history, auto &stats) {
+ stats.stats.sum.num_promote++;
+ return false;
+ });
+}
+
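+ // Prepare and issue the transaction for an op: build the PGTransaction,
+ // pick the snap context and versions for writes, run
+ // prepare_transaction(), and then either reply directly (reads/errors),
+ // record a log-only error, or create a RepGather and issue the repop to
+ // the replicas.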
+void PrimaryLogPG::execute_ctx(OpContext *ctx)
+{
+ FUNCTRACE(cct);
+ dout(10) << __func__ << " " << ctx << dendl;
+ ctx->reset_obs(ctx->obc);
+ ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
+ OpRequestRef op = ctx->op;
+ auto m = op->get_req<MOSDOp>();
+ ObjectContextRef obc = ctx->obc;
+ const hobject_t& soid = obc->obs.oi.soid;
+
+ // this method must be idempotent since we may call it several times
+ // before we finally apply the resulting transaction.
+ ctx->op_t.reset(new PGTransaction);
+
+ if (op->may_write() || op->may_cache()) {
+ // snap
+ if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
+ pool.info.is_pool_snaps_mode()) {
+ // use pool's snapc
+ ctx->snapc = pool.snapc;
+ } else {
+ // client specified snapc
+ ctx->snapc.seq = m->get_snap_seq();
+ ctx->snapc.snaps = m->get_snaps();
+ filter_snapc(ctx->snapc.snaps);
+ }
+ if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
+ ctx->snapc.seq < obc->ssc->snapset.seq) {
+ dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
+ << " < snapset seq " << obc->ssc->snapset.seq
+ << " on " << obc->obs.oi.soid << dendl;
+ reply_ctx(ctx, -EOLDSNAPC);
+ return;
+ }
+
+ // version
+ ctx->at_version = get_next_version();
+ ctx->mtime = m->get_mtime();
+
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
+ << " ov " << obc->obs.oi.version << " av " << ctx->at_version
+ << " snapc " << ctx->snapc
+ << " snapset " << obc->ssc->snapset
+ << dendl;
+ } else {
+ dout(10) << __func__ << " " << soid << " " << *ctx->ops
+ << " ov " << obc->obs.oi.version
+ << dendl;
+ }
+
+ if (!ctx->user_at_version)
+ ctx->user_at_version = obc->obs.oi.user_version;
+ dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = ctx->op->get_reqid();
+#endif
+ tracepoint(osd, prepare_tx_enter, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto execute_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ int result = prepare_transaction(ctx);
+
+ {
+#ifdef WITH_LTTNG
+ osd_reqid_t reqid = ctx->op->get_reqid();
+#endif
+ tracepoint(osd, prepare_tx_exit, reqid.name._type,
+ reqid.name._num, reqid.tid, reqid.inc);
+ }
+
+ bool pending_async_reads = !ctx->pending_async_reads.empty();
+ if (result == -EINPROGRESS || pending_async_reads) {
+ // come back later.
+ if (pending_async_reads) {
+ ceph_assert(pool.info.is_erasure());
+ in_progress_async_reads.push_back(make_pair(op, ctx));
+ ctx->start_async_reads(this);
+ }
+ return;
+ }
+
+ if (result == -EAGAIN) {
+ // clean up after the ctx
+ close_op_ctx(ctx);
+ return;
+ }
+
+ bool ignore_out_data = false;
+ if (!ctx->op_t->empty() &&
+ op->may_write() &&
+ result >= 0) {
+ // successful update
+ if (ctx->op->allows_returnvec()) {
+ // enforce reasonable bound on the return buffer sizes
+ for (auto& i : *ctx->ops) {
+ if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
+ dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
+ result = -EOVERFLOW; // overall result is overflow
+ i.rval = -EOVERFLOW;
+ i.outdata.clear();
+ }
+ }
+ } else {
+ // legacy behavior -- zero result and return data etc.
+ ignore_out_data = true;
+ result = 0;
+ }
+ }
+
+ // prepare the reply
+ ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
+ ignore_out_data);
+ dout(20) << __func__ << " alloc reply " << ctx->reply
+ << " result " << result << dendl;
+
+ // read or error?
+ if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
+ // finish side-effects
+ if (result >= 0)
+ do_osd_op_effects(ctx, m->get_connection());
+
+ complete_read_ctx(result, ctx);
+ return;
+ }
+
+ ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
+
+ ceph_assert(op->may_write() || op->may_cache());
+
+ // trim log?
+ recovery_state.update_trim_to();
+
+ // verify that we are doing this in order?
+ if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
+ !pool.info.is_tier() && !pool.info.has_tiers()) {
+ map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
+ ceph_tid_t t = m->get_tid();
+ client_t n = m->get_source().num();
+ map<client_t,ceph_tid_t>::iterator p = cm.find(n);
+ if (p == cm.end()) {
+ dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
+ cm[n] = t;
+ } else {
+ dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
+ if (p->second > t) {
+ derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
+ ceph_abort_msg("out of order op");
+ }
+ p->second = t;
+ }
+ }
+
+ if (ctx->update_log_only) {
+ if (result >= 0)
+ do_osd_op_effects(ctx, m->get_connection());
+
+ dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
+ // save just what we need from ctx
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+ reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
+
+ if (result == -ENOENT) {
+ reply->set_enoent_reply_versions(info.last_update,
+ info.last_user_version);
+ }
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ // append to pg log for dup detection - don't save buffers for now
+ record_write_error(op, soid, reply, result,
+ ctx->op->allows_returnvec() ? ctx : nullptr);
+ close_op_ctx(ctx);
+ return;
+ }
+
+ // no need to capture PG ref, repop cancel will handle that
+ // Can capture the ctx by pointer, it's owned by the repop
+ ctx->register_on_commit(
+ [m, ctx, this](){
+ if (ctx->op)
+ log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
+
+ if (m && !ctx->sent_reply) {
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ dout(10) << " sending reply on " << *m << " " << reply << dendl;
+ osd->send_message_osd_client(reply, m->get_connection());
+ ctx->sent_reply = true;
+ ctx->op->mark_commit_sent();
+ }
+ });
+ ctx->register_on_success(
+ [ctx, this]() {
+ do_osd_op_effects(
+ ctx,
+ ctx->op ? ctx->op->get_req()->get_connection() :
+ ConnectionRef());
+ });
+ ctx->register_on_finish(
+ [ctx]() {
+ delete ctx;
+ });
+
+ // issue replica writes
+ ceph_tid_t rep_tid = osd->get_tid();
+
+ RepGather *repop = new_repop(ctx, obc, rep_tid);
+
+ issue_repop(repop, ctx);
+ eval_repop(repop);
+ repop->put();
+}
+
+void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
+ release_object_locks(ctx->lock_manager);
+
+ ctx->op_t.reset();
+
+ for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
+ ctx->on_finish.erase(p++)) {
+ (*p)();
+ }
+ delete ctx;
+}
+
+void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
+{
+ if (ctx->op)
+ osd->reply_op_error(ctx->op, r);
+ close_op_ctx(ctx);
+}
+
+void PrimaryLogPG::log_op_stats(const OpRequest& op,
+ const uint64_t inb,
+ const uint64_t outb)
+{
+ auto m = op.get_req<MOSDOp>();
+ const utime_t now = ceph_clock_now();
+
+ const utime_t latency = now - m->get_recv_stamp();
+ const utime_t process_latency = now - op.get_dequeued_time();
+
+ osd->logger->inc(l_osd_op);
+
+ osd->logger->inc(l_osd_op_outb, outb);
+ osd->logger->inc(l_osd_op_inb, inb);
+ osd->logger->tinc(l_osd_op_lat, latency);
+ osd->logger->tinc(l_osd_op_process_lat, process_latency);
+
+ if (op.may_read() && op.may_write()) {
+ osd->logger->inc(l_osd_op_rw);
+ osd->logger->inc(l_osd_op_rw_inb, inb);
+ osd->logger->inc(l_osd_op_rw_outb, outb);
+ osd->logger->tinc(l_osd_op_rw_lat, latency);
+ osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
+ osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
+ osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
+ } else if (op.may_read()) {
+ osd->logger->inc(l_osd_op_r);
+ osd->logger->inc(l_osd_op_r_outb, outb);
+ osd->logger->tinc(l_osd_op_r_lat, latency);
+ osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
+ osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
+ } else if (op.may_write() || op.may_cache()) {
+ osd->logger->inc(l_osd_op_w);
+ osd->logger->inc(l_osd_op_w_inb, inb);
+ osd->logger->tinc(l_osd_op_w_lat, latency);
+ osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
+ osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
+ } else {
+ ceph_abort();
+ }
+
+ dout(15) << "log_op_stats " << *m
+ << " inb " << inb
+ << " outb " << outb
+ << " lat " << latency << dendl;
+
+ if (m_dynamic_perf_stats.is_enabled()) {
+ m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
+ }
+}
+
+void PrimaryLogPG::set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries)
+{
+ m_dynamic_perf_stats.set_queries(queries);
+}
+
+void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
+{
+ std::swap(m_dynamic_perf_stats, *stats);
+}
+
+void PrimaryLogPG::do_scan(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle)
+{
+ auto m = op->get_req<MOSDPGScan>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
+ dout(10) << "do_scan " << *m << dendl;
+
+ op->mark_started();
+
+ switch (m->op) {
+ case MOSDPGScan::OP_SCAN_GET_DIGEST:
+ {
+ auto dpp = get_dpp();
+ if (osd->check_backfill_full(dpp)) {
+ dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::BackfillTooFull())));
+ return;
+ }
+
+ BackfillInterval bi;
+ bi.begin = m->begin;
+ // No need to flush, there won't be any in progress writes occurring
+ // past m->begin
+ scan_range(
+ cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_max,
+ &bi,
+ handle);
+ MOSDPGScan *reply = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_DIGEST,
+ pg_whoami,
+ get_osdmap_epoch(), m->query_epoch,
+ spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
+ encode(bi.objects, reply->get_data());
+ osd->send_message_osd_cluster(reply, m->get_connection());
+ }
+ break;
+
+ case MOSDPGScan::OP_SCAN_DIGEST:
+ {
+ pg_shard_t from = m->from;
+
+ // Check that from is in backfill_targets vector
+ ceph_assert(is_backfill_target(from));
+
+ BackfillInterval& bi = peer_backfill_info[from];
+ bi.begin = m->begin;
+ bi.end = m->end;
+ auto p = m->get_data().cbegin();
+
+ // take care to preserve ordering!
+ bi.clear_objects();
+ decode_noclear(bi.objects, p);
+ dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
+ << " bi.objects.size()=" << bi.objects.size() << dendl;
+
+ if (waiting_on_backfill.erase(from)) {
+ if (waiting_on_backfill.empty()) {
+ ceph_assert(
+ peer_backfill_info.size() ==
+ get_backfill_targets().size());
+ finish_recovery_op(hobject_t::get_max());
+ }
+ } else {
+ // we canceled backfill for a while due to a too-full condition, and this
+ // is an extra response from a peer that is not too full
+ dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
+ }
+ }
+ break;
+ }
+}
+
+void PrimaryLogPG::do_backfill(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGBackfill>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
+ dout(10) << "do_backfill " << *m << dendl;
+
+ op->mark_started();
+
+ switch (m->op) {
+ case MOSDPGBackfill::OP_BACKFILL_FINISH:
+ {
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
+
+ MOSDPGBackfill *reply = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
+ get_osdmap_epoch(),
+ m->query_epoch,
+ spg_t(info.pgid.pgid, get_primary().shard));
+ reply->set_priority(get_recovery_op_priority());
+ osd->send_message_osd_cluster(reply, m->get_connection());
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ RecoveryDone())));
+ }
+ // fall-thru
+
+ case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
+ {
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
+
+ ObjectStore::Transaction t;
+ recovery_state.update_backfill_progress(
+ m->last_backfill,
+ m->stats,
+ m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ t);
+
+ int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(tr == 0);
+ }
+ break;
+
+ case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
+ {
+ ceph_assert(is_primary());
+ ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
+ finish_recovery_op(hobject_t::get_max());
+ }
+ break;
+ }
+}
+
+void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
+{
+ const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
+ op->get_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
+ dout(7) << __func__ << " " << m->ls << dendl;
+
+ op->mark_started();
+
+ ObjectStore::Transaction t;
+ for (auto& p : m->ls) {
+ if (is_remote_backfilling()) {
+ struct stat st;
+ int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
+ pg_whoami.shard) , &st);
+ if (r == 0) {
+ sub_local_num_bytes(st.st_size);
+ int64_t usersize;
+ if (pool.info.is_erasure()) {
+ bufferlist bv;
+ int r = osd->store->getattr(
+ ch,
+ ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
+ OI_ATTR,
+ bv);
+ if (r >= 0) {
+ object_info_t oi(bv);
+ usersize = oi.size * pgbackend->get_ec_data_chunk_count();
+ } else {
+ dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
+ << " can't get object info" << dendl;
+ usersize = 0;
+ }
+ } else {
+ usersize = st.st_size;
+ }
+ sub_num_bytes(usersize);
+ dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
+ << " sub actual data by " << st.st_size
+ << " sub num_bytes by " << usersize
+ << dendl;
+ }
+ }
+ remove_snap_mapped_object(t, p.first);
+ }
+ int r = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(r == 0);
+}
+
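+ // Build an OpContext that trims 'snap_to_trim' from clone 'coid': either
+ // delete the clone outright when no snaps still reference it (merging
+ // overlap into the next older clone) or just rewrite its clone_snaps,
+ // and update or remove the head's snapset accordingly.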
+int PrimaryLogPG::trim_object(
+ bool first, const hobject_t &coid, snapid_t snap_to_trim,
+ PrimaryLogPG::OpContextUPtr *ctxp)
+{
+ *ctxp = NULL;
+
+ // load clone info
+ bufferlist bl;
+ ObjectContextRef obc = get_object_context(coid, false, NULL);
+ if (!obc || !obc->ssc || !obc->ssc->exists) {
+ osd->clog->error() << __func__ << ": Can not trim " << coid
+ << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
+ return -ENOENT;
+ }
+
+ hobject_t head_oid = coid.get_head();
+ ObjectContextRef head_obc = get_object_context(head_oid, false);
+ if (!head_obc) {
+ osd->clog->error() << __func__ << ": Can not trim " << coid
+ << " repair needed, no snapset obc for " << head_oid;
+ return -ENOENT;
+ }
+
+ SnapSet& snapset = obc->ssc->snapset;
+
+ object_info_t &coi = obc->obs.oi;
+ auto citer = snapset.clone_snaps.find(coid.snap);
+ if (citer == snapset.clone_snaps.end()) {
+ osd->clog->error() << "No clone_snaps in snapset " << snapset
+ << " for object " << coid << "\n";
+ return -ENOENT;
+ }
+ set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
+ if (old_snaps.empty()) {
+ osd->clog->error() << "No object info snaps for object " << coid;
+ return -ENOENT;
+ }
+
+ dout(10) << coid << " old_snaps " << old_snaps
+ << " old snapset " << snapset << dendl;
+ if (snapset.seq == 0) {
+ osd->clog->error() << "No snapset.seq for object " << coid;
+ return -ENOENT;
+ }
+
+ set<snapid_t> new_snaps;
+ const OSDMapRef& osdmap = get_osdmap();
+ for (set<snapid_t>::iterator i = old_snaps.begin();
+ i != old_snaps.end();
+ ++i) {
+ if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
+ *i != snap_to_trim) {
+ new_snaps.insert(*i);
+ }
+ }
+
+ vector<snapid_t>::iterator p = snapset.clones.end();
+
+ if (new_snaps.empty()) {
+ p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
+ if (p == snapset.clones.end()) {
+ osd->clog->error() << "Snap " << coid.snap << " not in clones";
+ return -ENOENT;
+ }
+ }
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->head_obc = head_obc;
+
+ if (!ctx->lock_manager.get_snaptrimmer_write(
+ coid,
+ obc,
+ first)) {
+ close_op_ctx(ctx.release());
+ dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
+ return -ENOLCK;
+ }
+
+ if (!ctx->lock_manager.get_snaptrimmer_write(
+ head_oid,
+ head_obc,
+ first)) {
+ close_op_ctx(ctx.release());
+ dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
+ return -ENOLCK;
+ }
+
+ ctx->at_version = get_next_version();
+
+ PGTransaction *t = ctx->op_t.get();
+
+ if (new_snaps.empty()) {
+ // remove clone
+ dout(10) << coid << " snaps " << old_snaps << " -> "
+ << new_snaps << " ... deleting" << dendl;
+
+ // ...from snapset
+ ceph_assert(p != snapset.clones.end());
+
+ snapid_t last = coid.snap;
+ ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
+
+ if (p != snapset.clones.begin()) {
+ // not the oldest... merge overlap into next older clone
+ vector<snapid_t>::iterator n = p - 1;
+ hobject_t prev_coid = coid;
+ prev_coid.snap = *n;
+ bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+ if (adjust_prev_bytes)
+ ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+ snapset.clone_overlap[*n].intersection_of(
+ snapset.clone_overlap[*p]);
+
+ if (adjust_prev_bytes)
+ ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+ }
+ ctx->delta_stats.num_objects--;
+ if (coi.is_dirty())
+ ctx->delta_stats.num_objects_dirty--;
+ if (coi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ if (coi.is_whiteout()) {
+ dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ }
+ ctx->delta_stats.num_object_clones--;
+ if (coi.is_cache_pinned())
+ ctx->delta_stats.num_objects_pinned--;
+ if (coi.has_manifest()) {
+ dec_all_refcount_manifest(coi, ctx.get());
+ ctx->delta_stats.num_objects_manifest--;
+ }
+ obc->obs.exists = false;
+
+ snapset.clones.erase(p);
+ snapset.clone_overlap.erase(last);
+ snapset.clone_size.erase(last);
+ snapset.clone_snaps.erase(last);
+
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::DELETE,
+ coid,
+ ctx->at_version,
+ ctx->obs->oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ t->remove(coid);
+ t->update_snaps(
+ coid,
+ old_snaps,
+ new_snaps);
+
+ coi = object_info_t(coid);
+
+ ctx->at_version.version++;
+ } else {
+ // save adjusted snaps for this object
+ dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
+ snapset.clone_snaps[coid.snap] =
+ vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
+ // we still do a 'modify' event on this object just to trigger a
+ // snapmapper.update ... :(
+
+ coi.prior_version = coi.version;
+ coi.version = ctx->at_version;
+ bl.clear();
+ encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ t->setattr(coid, OI_ATTR, bl);
+
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ coid,
+ coi.version,
+ coi.prior_version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ ctx->at_version.version++;
+
+ t->update_snaps(
+ coid,
+ old_snaps,
+ new_snaps);
+ }
+
+ // save head snapset
+ dout(10) << coid << " new snapset " << snapset << " on "
+ << head_obc->obs.oi << dendl;
+ if (snapset.clones.empty() &&
+ (head_obc->obs.oi.is_whiteout() &&
+ !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
+ !head_obc->obs.oi.is_cache_pinned())) {
+ // NOTE: this arguably constitutes minor interference with the
+ // tiering agent if this is a cache tier since a snap trim event
+ // is effectively evicting a whiteout we might otherwise want to
+ // keep around.
+ dout(10) << coid << " removing " << head_oid << dendl;
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::DELETE,
+ head_oid,
+ ctx->at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ dout(10) << "removing snap head" << dendl;
+ object_info_t& oi = head_obc->obs.oi;
+ ctx->delta_stats.num_objects--;
+ if (oi.is_dirty()) {
+ ctx->delta_stats.num_objects_dirty--;
+ }
+ if (oi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ if (oi.is_whiteout()) {
+ dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ }
+ if (oi.is_cache_pinned()) {
+ ctx->delta_stats.num_objects_pinned--;
+ }
+ if (oi.has_manifest()) {
+ ctx->delta_stats.num_objects_manifest--;
+ dec_all_refcount_manifest(oi, ctx.get());
+ }
+ head_obc->obs.exists = false;
+ head_obc->obs.oi = object_info_t(head_oid);
+ t->remove(head_oid);
+ } else {
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ // filter SnapSet::snaps for the benefit of pre-octopus
+ // peers. This is perhaps overly conservative in that I'm not
+ // certain they need this, but let's be conservative here.
+ dout(10) << coid << " filtering snapset on " << head_oid << dendl;
+ snapset.filter(pool.info);
+ } else {
+ snapset.snaps.clear();
+ }
+ dout(10) << coid << " writing updated snapset on " << head_oid
+ << ", snapset is " << snapset << dendl;
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ head_oid,
+ ctx->at_version,
+ head_obc->obs.oi.version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+
+ head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
+ head_obc->obs.oi.version = ctx->at_version;
+
+ map <string, bufferlist> attrs;
+ bl.clear();
+ encode(snapset, bl);
+ attrs[SS_ATTR] = std::move(bl);
+
+ bl.clear();
+ encode(head_obc->obs.oi, bl,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bl);
+ t->setattrs(head_oid, attrs);
+ }
+
+ *ctxp = std::move(ctx);
+ return 0;
+}
+
+void PrimaryLogPG::kick_snap_trim()
+{
+ ceph_assert(is_active());
+ ceph_assert(is_primary());
+ if (is_clean() &&
+ !state_test(PG_STATE_PREMERGE) &&
+ !snap_trimq.empty()) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
+ dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
+ } else {
+ dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
+ snap_trimmer_machine.process_event(KickTrim());
+ }
+ }
+}
+
+void PrimaryLogPG::snap_trimmer_scrub_complete()
+{
+ if (is_primary() && is_active() && is_clean() && !snap_trimq.empty()) {
+ dout(10) << "scrub finished - requeuing snap_trimmer" << dendl;
+ snap_trimmer_machine.process_event(ScrubComplete());
+ }
+}
+
+void PrimaryLogPG::snap_trimmer(epoch_t queued)
+{
+ if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
+ return;
+ }
+
+ ceph_assert(is_primary());
+
+ dout(10) << "snap_trimmer posting" << dendl;
+ snap_trimmer_machine.process_event(DoSnapWork());
+ dout(10) << "snap_trimmer complete" << dendl;
+ return;
+}
+
+int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
+{
+ __u64 v2;
+
+ string v2s(xattr.c_str(), xattr.length());
+ if (v2s.length())
+ v2 = strtoull(v2s.c_str(), NULL, 10);
+ else
+ v2 = 0;
+
+ dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
+
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return (v1 == v2);
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return (v1 != v2);
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return (v1 > v2);
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return (v1 >= v2);
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return (v1 < v2);
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return (v1 <= v2);
+ default:
+ return -EINVAL;
+ }
+}
+
+int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
+{
+ string v2s(xattr.c_str(), xattr.length());
+
+ dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
+
+ switch (op) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ return (v1s.compare(v2s) == 0);
+ case CEPH_OSD_CMPXATTR_OP_NE:
+ return (v1s.compare(v2s) != 0);
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ return (v1s.compare(v2s) > 0);
+ case CEPH_OSD_CMPXATTR_OP_GTE:
+ return (v1s.compare(v2s) >= 0);
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ return (v1s.compare(v2s) < 0);
+ case CEPH_OSD_CMPXATTR_OP_LTE:
+ return (v1s.compare(v2s) <= 0);
+ default:
+ return -EINVAL;
+ }
+}
+
+int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
+{
+ ceph_osd_op& op = osd_op.op;
+ vector<OSDOp> write_ops(1);
+ OSDOp& write_op = write_ops[0];
+ uint64_t write_length = op.writesame.length;
+ int result = 0;
+
+ if (!write_length)
+ return 0;
+
+ if (!op.writesame.data_length || write_length % op.writesame.data_length)
+ return -EINVAL;
+
+ if (op.writesame.data_length != osd_op.indata.length()) {
+ derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
+ return -EINVAL;
+ }
+
+ while (write_length) {
+ write_op.indata.append(osd_op.indata);
+ write_length -= op.writesame.data_length;
+ }
+
+ write_op.op.op = CEPH_OSD_OP_WRITE;
+ write_op.op.extent.offset = op.writesame.offset;
+ write_op.op.extent.length = op.writesame.length;
+ result = do_osd_ops(ctx, write_ops);
+ if (result < 0)
+ derr << "do_writesame do_osd_ops failed " << result << dendl;
+
+ return result;
+}
+
+// ========================================================================
+// low level osd ops
+
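+ // Convert a legacy tmap object to omap: read the tmap header and
+ // key/value payload, truncate the object data, then replay them as
+ // OMAPSETHEADER + OMAPSETVALS ops.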
+int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
+{
+ dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
+ bufferlist header, vals;
+ int r = _get_tmap(ctx, &header, &vals);
+ if (r < 0) {
+ if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
+ r = 0;
+ return r;
+ }
+
+ vector<OSDOp> ops(3);
+
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = 0;
+
+ ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
+ ops[1].indata = std::move(header);
+
+ ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
+ ops[2].indata = std::move(vals);
+
+ return do_osd_ops(ctx, ops);
+}
+
+int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
+ OSDOp& osd_op, bufferlist& bl)
+{
+ // decode
+ bufferlist header;
+ map<string, bufferlist> m;
+ if (bl.length()) {
+ auto p = bl.cbegin();
+ decode(header, p);
+ decode(m, p);
+ ceph_assert(p.end());
+ }
+
+ // do the update(s)
+ while (!bp.end()) {
+ __u8 op;
+ string key;
+ decode(op, bp);
+
+ switch (op) {
+ case CEPH_OSD_TMAP_SET: // insert key
+ {
+ decode(key, bp);
+ bufferlist data;
+ decode(data, bp);
+ m[key] = data;
+ }
+ break;
+ case CEPH_OSD_TMAP_RM: // remove key
+ decode(key, bp);
+ if (!m.count(key)) {
+ return -ENOENT;
+ }
+ m.erase(key);
+ break;
+ case CEPH_OSD_TMAP_RMSLOPPY: // remove key
+ decode(key, bp);
+ m.erase(key);
+ break;
+ case CEPH_OSD_TMAP_HDR: // update header
+ {
+ decode(header, bp);
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ // reencode
+ bufferlist obl;
+ encode(header, obl);
+ encode(m, obl);
+
+ // write it out
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = obl.length();
+ newop.indata = obl;
+ do_osd_ops(ctx, nops);
+ return 0;
+}
+
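+ // Apply a TMAPUP update stream to a tmap object: read the current
+ // contents, merge the encoded set/create/rm operations key by key
+ // (falling back to do_tmapup_slow() if the update keys are unsorted),
+ // and write the re-encoded tmap back with WRITEFULL.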
+int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
+{
+ bufferlist::const_iterator orig_bp = bp;
+ int result = 0;
+ if (bp.end()) {
+ dout(10) << "tmapup is a no-op" << dendl;
+ } else {
+ // read the whole object
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_READ;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = 0;
+ result = do_osd_ops(ctx, nops);
+
+ dout(10) << "tmapup read " << newop.outdata.length() << dendl;
+
+ dout(30) << " starting is \n";
+ newop.outdata.hexdump(*_dout);
+ *_dout << dendl;
+
+ auto ip = newop.outdata.cbegin();
+ bufferlist obl;
+
+ dout(30) << "the update command is: \n";
+ osd_op.indata.hexdump(*_dout);
+ *_dout << dendl;
+
+ // header
+ bufferlist header;
+ __u32 nkeys = 0;
+ if (newop.outdata.length()) {
+ decode(header, ip);
+ decode(nkeys, ip);
+ }
+ dout(10) << "tmapup header " << header.length() << dendl;
+
+ if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
+ ++bp;
+ decode(header, bp);
+ dout(10) << "tmapup new header " << header.length() << dendl;
+ }
+
+ encode(header, obl);
+
+ dout(20) << "tmapup initial nkeys " << nkeys << dendl;
+
+ // update keys
+ bufferlist newkeydata;
+ string nextkey, last_in_key;
+ bufferlist nextval;
+ bool have_next = false;
+ if (!ip.end()) {
+ have_next = true;
+ decode(nextkey, ip);
+ decode(nextval, ip);
+ }
+ while (!bp.end() && !result) {
+ __u8 op;
+ string key;
+ try {
+ decode(op, bp);
+ decode(key, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ if (key < last_in_key) {
+ dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
+ << "', falling back to an inefficient (unsorted) update" << dendl;
+ bp = orig_bp;
+ return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
+ }
+ last_in_key = key;
+
+ dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
+
+ // skip existing intervening keys
+ bool key_exists = false;
+ while (have_next && !key_exists) {
+ dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
+ if (nextkey > key)
+ break;
+ if (nextkey < key) {
+ // copy untouched.
+ encode(nextkey, newkeydata);
+ encode(nextval, newkeydata);
+ dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
+ } else {
+ // don't copy; discard old value. and stop.
+ dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
+ key_exists = true;
+ nkeys--;
+ }
+ if (!ip.end()) {
+ decode(nextkey, ip);
+ decode(nextval, ip);
+ } else {
+ have_next = false;
+ }
+ }
+
+ if (op == CEPH_OSD_TMAP_SET) {
+ bufferlist val;
+ try {
+ decode(val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ encode(key, newkeydata);
+ encode(val, newkeydata);
+ dout(20) << " set " << key << " " << val.length() << dendl;
+ nkeys++;
+ } else if (op == CEPH_OSD_TMAP_CREATE) {
+ if (key_exists) {
+ return -EEXIST;
+ }
+ bufferlist val;
+ try {
+ decode(val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ return -EINVAL;
+ }
+ encode(key, newkeydata);
+ encode(val, newkeydata);
+ dout(20) << " create " << key << " " << val.length() << dendl;
+ nkeys++;
+ } else if (op == CEPH_OSD_TMAP_RM) {
+ // do nothing.
+ if (!key_exists) {
+ return -ENOENT;
+ }
+ } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
+ // do nothing
+ } else {
+ dout(10) << " invalid tmap op " << (int)op << dendl;
+ return -EINVAL;
+ }
+ }
+
+ // copy remaining
+ if (have_next) {
+ encode(nextkey, newkeydata);
+ encode(nextval, newkeydata);
+ dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
+ }
+ if (!ip.end()) {
+ bufferlist rest;
+ rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
+ dout(20) << " keep trailing " << rest.length()
+ << " at " << newkeydata.length() << dendl;
+ newkeydata.claim_append(rest);
+ }
+
+ // encode final key count + key data
+ dout(20) << "tmapup final nkeys " << nkeys << dendl;
+ encode(nkeys, obl);
+ obl.claim_append(newkeydata);
+
+ if (0) {
+ dout(30) << " final is \n";
+ obl.hexdump(*_dout);
+ *_dout << dendl;
+
+ // sanity check
+ auto tp = obl.cbegin();
+ bufferlist h;
+ decode(h, tp);
+ map<string,bufferlist> d;
+ decode(d, tp);
+ ceph_assert(tp.end());
+ dout(0) << " **** debug sanity check, looks ok ****" << dendl;
+ }
+
+ // write it out
+ if (!result) {
+ dout(20) << "tmapput write " << obl.length() << dendl;
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = obl.length();
+ newop.indata = obl;
+ do_osd_ops(ctx, nops);
+ }
+ }
+ return result;
+}
+
+static int check_offset_and_length(uint64_t offset, uint64_t length,
+ uint64_t max, DoutPrefixProvider *dpp)
+{
+ if (offset >= max ||
+ length > max ||
+ offset + length > max) {
+ ldpp_dout(dpp, 10) << __func__ << " "
+ << "osd_max_object_size: " << max
+ << "; Hard limit of object size is 4GB." << dendl;
+ return -EFBIG;
+ }
+
+ return 0;
+}
+
+struct FillInVerifyExtent : public Context {
+ ceph_le64 *r;
+ int32_t *rval;
+ bufferlist *outdatap;
+ std::optional<uint32_t> maybe_crc;
+ uint64_t size;
+ OSDService *osd;
+ hobject_t soid;
+ uint32_t flags;
+ FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
+ std::optional<uint32_t> mc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags) :
+ r(r), rval(rv), outdatap(blp), maybe_crc(mc),
+ size(size), osd(osd), soid(soid), flags(flags) {}
+ void finish(int len) override {
+ if (len < 0) {
+ *rval = len;
+ return;
+ }
+ *r = len;
+ *rval = 0;
+
+ // whole object? can we verify the checksum?
+ if (maybe_crc && *r == size) {
+ uint32_t crc = outdatap->crc32c(-1);
+ if (maybe_crc != crc) {
+ osd->clog->error() << std::hex << " full-object read crc 0x" << crc
+ << " != expected 0x" << *maybe_crc
+ << std::dec << " on " << soid;
+ if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ *rval = -EIO;
+ *r = 0;
+ }
+ }
+ }
+ }
+};
+
+struct ToSparseReadResult : public Context {
+ int* result;
+ bufferlist* data_bl;
+ uint64_t data_offset;
+ ceph_le64* len;
+ ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
+ ceph_le64* len)
+ : result(result), data_bl(bl), data_offset(offset),len(len) {}
+ void finish(int r) override {
+ if (r < 0) {
+ *result = r;
+ return;
+ }
+ *result = 0;
+ *len = r;
+ bufferlist outdata;
+ map<uint64_t, uint64_t> extents = {{data_offset, r}};
+ encode(extents, outdata);
+ encode_destructively(*data_bl, outdata);
+ data_bl->swap(outdata);
+ }
+};
+
+template<typename V>
+static string list_keys(const map<string, V>& m) {
+ string s;
+ for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
+ if (!s.empty()) {
+ s.push_back(',');
+ }
+ s.append(itr->first);
+ }
+ return s;
+}
+
+template<typename T>
+static string list_entries(const T& m) {
+ string s;
+ for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
+ if (!s.empty()) {
+ s.push_back(',');
+ }
+ s.append(*itr);
+ }
+ return s;
+}
+
+void PrimaryLogPG::maybe_create_new_object(
+ OpContext *ctx,
+ bool ignore_transaction)
+{
+ ObjectState& obs = ctx->new_obs;
+ if (!obs.exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ ceph_assert(!obs.oi.is_whiteout());
+ obs.oi.new_object();
+ if (!ignore_transaction)
+ ctx->op_t->create(obs.oi.soid);
+ } else if (obs.oi.is_whiteout()) {
+ dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ --ctx->delta_stats.num_whiteouts;
+ }
+}
+
+struct ReadFinisher : public PrimaryLogPG::OpFinisher {
+ OSDOp& osd_op;
+
+ explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
+ }
+
+ int execute() override {
+ return osd_op.rval;
+ }
+};
+
+struct C_ChecksumRead : public Context {
+ PrimaryLogPG *primary_log_pg;
+ OSDOp &osd_op;
+ Checksummer::CSumType csum_type;
+ bufferlist init_value_bl;
+ ceph_le64 read_length;
+ bufferlist read_bl;
+ Context *fill_extent_ctx;
+
+ C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
+ Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
+ std::optional<uint32_t> maybe_crc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags)
+ : primary_log_pg(primary_log_pg), osd_op(osd_op),
+ csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
+ fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
+ &read_bl, maybe_crc, size,
+ osd, soid, flags)) {
+ }
+ ~C_ChecksumRead() override {
+ delete fill_extent_ctx;
+ }
+
+ void finish(int r) override {
+ fill_extent_ctx->complete(r);
+ fill_extent_ctx = nullptr;
+
+ if (osd_op.rval >= 0) {
+ bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
+ osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
+ &init_value_bl_it, read_bl);
+ }
+ }
+};
+
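+ // CEPH_OSD_OP_CHECKSUM: validate the requested range and chunk size,
+ // read the data (asynchronously for EC pools, synchronously otherwise),
+ // and compute per-chunk xxhash32/xxhash64/crc32c checksums via
+ // finish_checksum().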
+int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
+ bufferlist::const_iterator *bl_it)
+{
+ dout(20) << __func__ << dendl;
+
+ auto& op = osd_op.op;
+ if (op.checksum.chunk_size > 0) {
+ if (op.checksum.length == 0) {
+ dout(10) << __func__ << ": length required when chunk size provided"
+ << dendl;
+ return -EINVAL;
+ }
+ if (op.checksum.length % op.checksum.chunk_size != 0) {
+ dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ auto& oi = ctx->new_obs.oi;
+ if (op.checksum.offset == 0 && op.checksum.length == 0) {
+ // zeroed offset+length implies checksum whole object
+ op.checksum.length = oi.size;
+ } else if (op.checksum.offset >= oi.size) {
+ // read size was trimmed to zero, do nothing
+ // see PrimaryLogPG::do_read
+ return 0;
+ } else if (op.extent.offset + op.extent.length > oi.size) {
+ op.extent.length = oi.size - op.extent.offset;
+ if (op.checksum.chunk_size > 0 &&
+ op.checksum.length % op.checksum.chunk_size != 0) {
+ dout(10) << __func__ << ": length (trimmed to 0x"
+ << std::hex << op.checksum.length
+ << ") not aligned to chunk size 0x"
+ << op.checksum.chunk_size << std::dec
+ << dendl;
+ return -EINVAL;
+ }
+ }
+
+ Checksummer::CSumType csum_type;
+ switch (op.checksum.type) {
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
+ csum_type = Checksummer::CSUM_XXHASH32;
+ break;
+ case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
+ csum_type = Checksummer::CSUM_XXHASH64;
+ break;
+ case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
+ csum_type = Checksummer::CSUM_CRC32C;
+ break;
+ default:
+ dout(10) << __func__ << ": unknown crc type ("
+ << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
+ return -EINVAL;
+ }
+
+ size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
+ if (bl_it->get_remaining() < csum_init_value_size) {
+ dout(10) << __func__ << ": init value not provided" << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist init_value_bl;
+ init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
+ csum_init_value_size);
+ *bl_it += csum_init_value_size;
+
+ if (pool.info.is_erasure() && op.checksum.length > 0) {
+ // If there is a data digest and it is possible we are reading
+ // entire object, pass the digest.
+ std::optional<uint32_t> maybe_crc;
+ if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ op.checksum.length >= oi.size) {
+ maybe_crc = oi.data_digest;
+ }
+
+ // async read
+ auto& soid = oi.soid;
+ auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
+ std::move(init_value_bl), maybe_crc,
+ oi.size, osd, soid, op.flags);
+
+ ctx->pending_async_reads.push_back({
+ {op.checksum.offset, op.checksum.length, op.flags},
+ {&checksum_ctx->read_bl, checksum_ctx}});
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
+ }
+
+ // sync read
+ std::vector<OSDOp> read_ops(1);
+ auto& read_op = read_ops[0];
+ if (op.checksum.length > 0) {
+ read_op.op.op = CEPH_OSD_OP_READ;
+ read_op.op.flags = op.flags;
+ read_op.op.extent.offset = op.checksum.offset;
+ read_op.op.extent.length = op.checksum.length;
+ read_op.op.extent.truncate_size = 0;
+ read_op.op.extent.truncate_seq = 0;
+
+ int r = do_osd_ops(ctx, read_ops);
+ if (r < 0) {
+ derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
+ return finish_checksum(osd_op, csum_type, &init_value_bl_it,
+ read_op.outdata);
+}
+
+int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
+ Checksummer::CSumType csum_type,
+ bufferlist::const_iterator *init_value_bl_it,
+ const bufferlist &read_bl) {
+ dout(20) << __func__ << dendl;
+
+ auto& op = osd_op.op;
+
+ if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
+ derr << __func__ << ": bytes read " << read_bl.length() << " != "
+ << op.checksum.length << dendl;
+ return -EINVAL;
+ }
+
+ size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
+ op.checksum.chunk_size : read_bl.length());
+ uint32_t csum_count = (csum_chunk_size > 0 ?
+ read_bl.length() / csum_chunk_size : 0);
+
+ bufferlist csum;
+ bufferptr csum_data;
+ if (csum_count > 0) {
+ size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
+ csum_data = ceph::buffer::create(csum_value_size * csum_count);
+ csum_data.zero();
+ csum.append(csum_data);
+
+ switch (csum_type) {
+ case Checksummer::CSUM_XXHASH32:
+ {
+ Checksummer::xxhash32::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::xxhash32>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ case Checksummer::CSUM_XXHASH64:
+ {
+ Checksummer::xxhash64::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::xxhash64>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ case Checksummer::CSUM_CRC32C:
+ {
+ Checksummer::crc32c::init_value_t init_value;
+ decode(init_value, *init_value_bl_it);
+ Checksummer::calculate<Checksummer::crc32c>(
+ init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
+ &csum_data);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ encode(csum_count, osd_op.outdata);
+ osd_op.outdata.claim_append(csum);
+ return 0;
+}
+
+struct C_ExtentCmpRead : public Context {
+ PrimaryLogPG *primary_log_pg;
+ OSDOp &osd_op;
+ ceph_le64 read_length{};
+ bufferlist read_bl;
+ Context *fill_extent_ctx;
+
+ C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
+ std::optional<uint32_t> maybe_crc, uint64_t size,
+ OSDService *osd, hobject_t soid, uint32_t flags)
+ : primary_log_pg(primary_log_pg), osd_op(osd_op),
+ fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
+ &read_bl, maybe_crc, size,
+ osd, soid, flags)) {
+ }
+ ~C_ExtentCmpRead() override {
+ delete fill_extent_ctx;
+ }
+
+ void finish(int r) override {
+ if (r == -ENOENT) {
+ osd_op.rval = 0;
+ read_bl.clear();
+ delete fill_extent_ctx;
+ } else {
+ fill_extent_ctx->complete(r);
+ }
+ fill_extent_ctx = nullptr;
+
+ if (osd_op.rval >= 0) {
+ osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
+ }
+ }
+};
+
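+ // CEPH_OSD_OP_CMPEXT: read the requested extent (async for EC pools) and
+ // compare it byte-for-byte against the op's input data; a mismatch is
+ // reported as -MAX_ERRNO minus the offset of the first differing byte
+ // (see finish_extent_cmp()).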
+int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
+{
+ dout(20) << __func__ << dendl;
+ ceph_osd_op& op = osd_op.op;
+
+ auto& oi = ctx->new_obs.oi;
+ uint64_t size = oi.size;
+ if ((oi.truncate_seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
+ size = op.extent.truncate_size;
+ }
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ }
+
+ if (op.extent.length == 0) {
+ dout(20) << __func__ << " zero length extent" << dendl;
+ return finish_extent_cmp(osd_op, bufferlist{});
+ } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
+ dout(20) << __func__ << " object DNE" << dendl;
+ return finish_extent_cmp(osd_op, {});
+ } else if (pool.info.is_erasure()) {
+ // If there is a data digest and it is possible we are reading
+ // entire object, pass the digest.
+ std::optional<uint32_t> maybe_crc;
+ if (oi.is_data_digest() && op.checksum.offset == 0 &&
+ op.checksum.length >= oi.size) {
+ maybe_crc = oi.data_digest;
+ }
+
+ // async read
+ auto& soid = oi.soid;
+ auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
+ osd, soid, op.flags);
+ ctx->pending_async_reads.push_back({
+ {op.extent.offset, op.extent.length, op.flags},
+ {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ return -EINPROGRESS;
+ }
+
+ // sync read
+ vector<OSDOp> read_ops(1);
+ OSDOp& read_op = read_ops[0];
+
+ read_op.op.op = CEPH_OSD_OP_SYNC_READ;
+ read_op.op.extent.offset = op.extent.offset;
+ read_op.op.extent.length = op.extent.length;
+ read_op.op.extent.truncate_seq = op.extent.truncate_seq;
+ read_op.op.extent.truncate_size = op.extent.truncate_size;
+
+ int result = do_osd_ops(ctx, read_ops);
+ if (result < 0) {
+ derr << __func__ << " failed " << result << dendl;
+ return result;
+ }
+ return finish_extent_cmp(osd_op, read_op.outdata);
+}
+
+int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
+{
+ for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
+ char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
+ if (osd_op.indata[idx] != read_byte) {
+ return (-MAX_ERRNO - idx);
+ }
+ }
+
+ return 0;
+}
+
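+// Handle CEPH_OSD_OP_READ: clamp the requested extent to the (possibly
+// truncated) object size, then either queue an asynchronous read on
+// erasure-coded pools or read synchronously, verifying the full-object
+// data digest when the whole object was read.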
+int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+ __u32 seq = oi.truncate_seq;
+ uint64_t size = oi.size;
+ bool trimmed_read = false;
+
+ dout(30) << __func__ << " oi.size: " << oi.size << dendl;
+ dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
+ dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
+ dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
+
+ // are we beyond truncate_size?
+ if ( (seq < op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
+ (size > op.extent.truncate_size) )
+ size = op.extent.truncate_size;
+
+ if (op.extent.length == 0) // a length of zero means read the whole object
+ op.extent.length = size;
+
+ if (op.extent.offset >= size) {
+ op.extent.length = 0;
+ trimmed_read = true;
+ } else if (op.extent.offset + op.extent.length > size) {
+ op.extent.length = size - op.extent.offset;
+ trimmed_read = true;
+ }
+
+ dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
+
+ // read into a buffer
+ int result = 0;
+ if (trimmed_read && op.extent.length == 0) {
+ // the read was trimmed to zero bytes, so there is nothing to do.
+ // a *requested* read of 0 bytes means "read the whole object", which
+ // is why the trimmed_read flag is needed to tell the two cases apart
+ } else if (pool.info.is_erasure()) {
+ // The initialisation below is required to silence a false positive
+ // -Wmaybe-uninitialized warning
+ std::optional<uint32_t> maybe_crc;
+ // If there is a data digest and it is possible we are reading the
+ // entire object, pass the digest. FillInVerifyExtent will check
+ // oi.size again.
+ if (oi.is_data_digest() && op.extent.offset == 0 &&
+ op.extent.length >= oi.size)
+ maybe_crc = oi.data_digest;
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
+ make_pair(&osd_op.outdata,
+ new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
+ &osd_op.outdata, maybe_crc, oi.size,
+ osd, soid, op.flags))));
+ dout(10) << " async_read noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ } else {
+ int r = pgbackend->objects_read_sync(
+ soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
+ // whole object? can we verify the checksum?
+ if (r >= 0 && op.extent.offset == 0 &&
+ (uint64_t)r == oi.size && oi.is_data_digest()) {
+ uint32_t crc = osd_op.outdata.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ r = -EIO; // try repair later
+ }
+ }
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx);
+ }
+ if (r >= 0)
+ op.extent.length = r;
+ else if (r == -EAGAIN) {
+ result = -EAGAIN;
+ } else {
+ result = r;
+ op.extent.length = 0;
+ }
+ dout(10) << " read got " << r << " / " << op.extent.length
+ << " bytes from obj " << soid << dendl;
+ }
+ if (result >= 0) {
+ ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ }
+ return result;
+}
+
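+// Handle CEPH_OSD_OP_SPARSE_READ: on erasure-coded pools the op is turned
+// into a normal asynchronous read whose result is re-encoded as a single
+// extent; otherwise fiemap() supplies the extent map and the data is read
+// synchronously, with a full-object digest check when applicable.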
+int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
+ dout(20) << __func__ << dendl;
+ auto& op = osd_op.op;
+ auto& oi = ctx->new_obs.oi;
+ auto& soid = oi.soid;
+
+ if (op.extent.truncate_seq) {
+ dout(0) << "sparse_read does not support truncation sequence " << dendl;
+ return -EINVAL;
+ }
+
+ ++ctx->num_read;
+ if (pool.info.is_erasure()) {
+ // translate sparse read to a normal one if not supported
+ uint64_t offset = op.extent.offset;
+ uint64_t length = op.extent.length;
+ if (offset > oi.size) {
+ length = 0;
+ } else if (offset + length > oi.size) {
+ length = oi.size - offset;
+ }
+
+ if (length > 0) {
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(offset, length, op.flags),
+ make_pair(
+ &osd_op.outdata,
+ new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
+ &op.extent.length))));
+ dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ } else {
+ dout(10) << " sparse read ended up empty for " << soid << dendl;
+ map<uint64_t, uint64_t> extents;
+ encode(extents, osd_op.outdata);
+ }
+ } else {
+ // read into a buffer
+ map<uint64_t, uint64_t> m;
+ int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+ info.pgid.shard),
+ op.extent.offset, op.extent.length, m);
+ if (r < 0) {
+ return r;
+ }
+
+ bufferlist data_bl;
+ r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
+ if (r == -EIO) {
+ r = rep_repair_primary_object(soid, ctx);
+ }
+ if (r < 0) {
+ return r;
+ }
+
+ // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
+ // and while there may be few fully-written objects at first, more and
+ // more objects become whole with continued use. So verifying the data
+ // digest on full-object sparse reads is worthwhile.
+ if ((uint64_t)r == oi.size && oi.is_data_digest()) {
+ uint32_t crc = data_bl.crc32c(-1);
+ if (oi.data_digest != crc) {
+ osd->clog->error() << info.pgid << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << soid;
+ r = rep_repair_primary_object(soid, ctx);
+ if (r < 0) {
+ return r;
+ }
+ }
+ }
+
+ op.extent.length = r;
+
+ encode(m, osd_op.outdata); // re-encode since it might be modified
+ ::encode_destructively(data_bl, osd_op.outdata);
+
+ dout(10) << " sparse_read got " << r << " bytes from object "
+ << soid << dendl;
+ }
+
+ ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
+ ctx->delta_stats.num_rd++;
+ return 0;
+}
+
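+// Execute each sub-op of a client request in order.  Reads may complete
+// synchronously or register an OpFinisher and return -EINPROGRESS to be
+// re-executed later; writes accumulate their effects in the PGTransaction
+// and in ctx->delta_stats.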
+int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
+{
+ int result = 0;
+ SnapSetContext *ssc = ctx->obc->ssc;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ const bool skip_data_digest = osd->store->has_builtin_csum() &&
+ osd->osd_skip_data_digest;
+
+ PGTransaction* t = ctx->op_t.get();
+
+ dout(10) << "do_osd_op " << soid << " " << ops << dendl;
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto do_osd_op_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ ctx->current_osd_subop_num = 0;
+ for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
+ OSDOp& osd_op = *p;
+ ceph_osd_op& op = osd_op.op;
+
+ OpFinisher* op_finisher = nullptr;
+ {
+ auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
+ if (op_finisher_it != ctx->op_finishers.end()) {
+ op_finisher = op_finisher_it->second.get();
+ }
+ }
+
+ // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
+ // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
+ // but the code in this function seems to treat them as native-endian. What should the
+ // tracepoints do?
+ tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
+
+ dout(10) << "do_osd_op " << osd_op << dendl;
+
+ auto bp = osd_op.indata.cbegin();
+
+ // user-visible modification?
+ switch (op.op) {
+ // non user-visible modifications
+ case CEPH_OSD_OP_WATCH:
+ case CEPH_OSD_OP_CACHE_EVICT:
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ case CEPH_OSD_OP_UNDIRTY:
+ case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
+ case CEPH_OSD_OP_COPY_FROM2:
+ case CEPH_OSD_OP_CACHE_PIN:
+ case CEPH_OSD_OP_CACHE_UNPIN:
+ case CEPH_OSD_OP_SET_REDIRECT:
+ case CEPH_OSD_OP_SET_CHUNK:
+ case CEPH_OSD_OP_TIER_PROMOTE:
+ case CEPH_OSD_OP_TIER_FLUSH:
+ case CEPH_OSD_OP_TIER_EVICT:
+ break;
+ default:
+ if (op.op & CEPH_OSD_OP_MODE_WR)
+ ctx->user_modify = true;
+ }
+
+ // munge -1 truncate to 0 truncate
+ if (ceph_osd_op_uses_extent(op.op) &&
+ op.extent.truncate_seq == 1 &&
+ op.extent.truncate_size == (-1ULL)) {
+ op.extent.truncate_size = 0;
+ op.extent.truncate_seq = 0;
+ }
+
+ // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
+ if (op.op == CEPH_OSD_OP_ZERO &&
+ obs.exists &&
+ op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.length >= 1 &&
+ op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
+ op.extent.offset + op.extent.length >= oi.size) {
+ if (op.extent.offset >= oi.size) {
+ // no-op
+ goto fail;
+ }
+ dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
+ << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
+ op.op = CEPH_OSD_OP_TRUNCATE;
+ }
+
+ switch (op.op) {
+
+ // --- READS ---
+
+ case CEPH_OSD_OP_CMPEXT:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+
+ if (op_finisher == nullptr) {
+ result = do_extent_cmp(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_SYNC_READ:
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ // fall through
+ case CEPH_OSD_OP_READ:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ if (!ctx->data_off) {
+ ctx->data_off = op.extent.offset;
+ }
+ result = do_read(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_CHECKSUM:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
+ op.checksum.offset, op.checksum.length,
+ op.checksum.chunk_size);
+
+ if (op_finisher == nullptr) {
+ result = do_checksum(ctx, osd_op, &bp);
+ } else {
+ result = op_finisher->execute();
+ }
+ }
+ break;
+
+ /* map extents */
+ case CEPH_OSD_OP_MAPEXT:
+ tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_read;
+ {
+ // read into a buffer
+ bufferlist bl;
+ int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
+ info.pgid.shard),
+ op.extent.offset, op.extent.length, bl);
+ osd_op.outdata = std::move(bl);
+ if (r < 0)
+ result = r;
+ else
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); // bl was moved into outdata above
+ ctx->delta_stats.num_rd++;
+ dout(10) << " map_extents done on object " << soid << dendl;
+ }
+ break;
+
+ /* map extents */
+ case CEPH_OSD_OP_SPARSE_READ:
+ tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
+ soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
+ op.extent.length, op.extent.truncate_size,
+ op.extent.truncate_seq);
+ if (op_finisher == nullptr) {
+ result = do_sparse_read(ctx, osd_op);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_CALL:
+ {
+ string cname, mname;
+ bufferlist indata;
+ try {
+ bp.copy(op.cls.class_len, cname);
+ bp.copy(op.cls.method_len, mname);
+ bp.copy(op.cls.indata_len, indata);
+ } catch (ceph::buffer::error& e) {
+ dout(10) << "call unable to decode class + method + indata" << dendl;
+ dout(30) << "in dump: ";
+ osd_op.indata.hexdump(*_dout);
+ *_dout << dendl;
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
+ break;
+ }
+ tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
+
+ ClassHandler::ClassData *cls;
+ result = ClassHandler::get_instance().open_class(cname, &cls);
+ ceph_assert(result == 0); // init_op_flags() already verified this works.
+
+ ClassHandler::ClassMethod *method = cls->get_method(mname);
+ if (!method) {
+ dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ int flags = method->get_flags();
+ if (flags & CLS_METHOD_WR)
+ ctx->user_modify = true;
+
+ bufferlist outdata;
+ dout(10) << "call method " << cname << "." << mname << dendl;
+ int prev_rd = ctx->num_read;
+ int prev_wr = ctx->num_write;
+ result = method->exec((cls_method_context_t)&ctx, indata, outdata);
+
+ if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
+ derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
+ result = -EIO;
+ break;
+ }
+ if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
+ derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
+ result = -EIO;
+ break;
+ }
+
+ dout(10) << "method called response length=" << outdata.length() << dendl;
+ op.extent.length = outdata.length();
+ osd_op.outdata.claim_append(outdata);
+ dout(30) << "out dump: ";
+ osd_op.outdata.hexdump(*_dout);
+ *_dout << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_STAT:
+ // note: stat does not require RD
+ {
+ tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
+
+ if (obs.exists && !oi.is_whiteout()) {
+ encode(oi.size, osd_op.outdata);
+ encode(oi.mtime, osd_op.outdata);
+ dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
+ } else {
+ result = -ENOENT;
+ dout(10) << "stat oi object does not exist" << dendl;
+ }
+
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_ISDIRTY:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
+ bool is_dirty = obs.oi.is_dirty();
+ encode(is_dirty, osd_op.outdata);
+ ctx->delta_stats.num_rd++;
+ result = 0;
+ }
+ break;
+
+ case CEPH_OSD_OP_UNDIRTY:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
+ if (oi.is_dirty()) {
+ ctx->undirty = true; // see make_writeable()
+ ctx->modify = true;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
+ if (ctx->lock_type != RWState::RWNONE) {
+ dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
+ if (ctx->lock_type == RWState::RWNONE) {
+ dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ hobject_t missing;
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ // Check for the special return value which indicates that 'missing' has been set
+ if (result == -ENOENT) {
+ dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
+ ceph_assert(!missing.is_min());
+ wait_for_unreadable_object(missing, ctx->op);
+ // Error code which is used elsewhere when wait_for_unreadable_object() is used
+ result = -EAGAIN;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_EVICT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = 0;
+ break;
+ }
+ if (oi.is_cache_pinned()) {
+ dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
+ result = -EPERM;
+ break;
+ }
+ if (oi.is_dirty()) {
+ result = -EBUSY;
+ break;
+ }
+ if (!oi.watchers.empty()) {
+ result = -EBUSY;
+ break;
+ }
+ if (soid.snap == CEPH_NOSNAP) {
+ result = _verify_no_head_clones(soid, ssc->snapset);
+ if (result < 0)
+ break;
+ }
+ result = _delete_oid(ctx, true, false);
+ if (result >= 0) {
+ // mark that this is a cache eviction to avoid triggering normal
+ // make_writeable() clone creation in finish_ctx()
+ ctx->cache_operation = true;
+ }
+ osd->logger->inc(l_osd_tier_evict);
+ }
+ break;
+
+ case CEPH_OSD_OP_GETXATTR:
+ ++ctx->num_read;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ int r = getattr_maybe_cache(
+ ctx->obc,
+ name,
+ &(osd_op.outdata));
+ if (r >= 0) {
+ op.xattr.value_len = osd_op.outdata.length();
+ result = 0;
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ } else
+ result = r;
+
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_GETXATTRS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
+ map<string, bufferlist> out;
+ result = getattrs_maybe_cache(
+ ctx->obc,
+ &out);
+
+ bufferlist bl;
+ encode(out, bl);
+ ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
+ ctx->delta_stats.num_rd++;
+ osd_op.outdata.claim_append(bl);
+ }
+ break;
+
+ case CEPH_OSD_OP_CMPXATTR:
+ ++ctx->num_read;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ name[op.xattr.name_len + 1] = 0;
+
+ bufferlist xattr;
+ result = getattr_maybe_cache(
+ ctx->obc,
+ name,
+ &xattr);
+ if (result < 0 && result != -EEXIST && result != -ENODATA)
+ break;
+
+ ctx->delta_stats.num_rd++;
+ ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
+
+ switch (op.xattr.cmp_mode) {
+ case CEPH_OSD_CMPXATTR_MODE_STRING:
+ {
+ string val;
+ bp.copy(op.xattr.value_len, val);
+ val[op.xattr.value_len] = 0;
+ dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
+ << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
+ result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
+ }
+ break;
+
+ case CEPH_OSD_CMPXATTR_MODE_U64:
+ {
+ uint64_t u64val;
+ try {
+ decode(u64val, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
+ << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
+ result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
+ }
+ break;
+
+ default:
+ dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
+ result = -EINVAL;
+ }
+
+ if (!result) {
+ dout(10) << "comparison returned false" << dendl;
+ result = -ECANCELED;
+ break;
+ }
+ if (result < 0) {
+ dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
+ break;
+ }
+
+ dout(10) << "comparison returned true" << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_ASSERT_VER:
+ ++ctx->num_read;
+ {
+ uint64_t ver = op.assert_ver.ver;
+ tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
+ if (!ver)
+ result = -EINVAL;
+ else if (ver < oi.user_version)
+ result = -ERANGE;
+ else if (ver > oi.user_version)
+ result = -EOVERFLOW;
+ }
+ break;
+
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
+ obj_list_watch_response_t resp;
+
+ map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
+ for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
+ ++oi_iter) {
+ dout(20) << "key cookie=" << oi_iter->first.first
+ << " entity=" << oi_iter->first.second << " "
+ << oi_iter->second << dendl;
+ ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
+ ceph_assert(oi_iter->first.second.is_client());
+
+ watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
+ oi_iter->second.timeout_seconds, oi_iter->second.addr);
+ resp.entries.push_back(wi);
+ }
+
+ resp.encode(osd_op.outdata, ctx->get_features());
+ result = 0;
+
+ ctx->delta_stats.num_rd++;
+ break;
+ }
+
+ case CEPH_OSD_OP_LIST_SNAPS:
+ ++ctx->num_read;
+ {
+ tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
+ obj_list_snap_response_t resp;
+
+ if (!ssc) {
+ ssc = ctx->obc->ssc = get_snapset_context(soid, false);
+ }
+ ceph_assert(ssc);
+ dout(20) << " snapset " << ssc->snapset << dendl;
+
+ int clonecount = ssc->snapset.clones.size();
+ clonecount++; // for head
+ resp.clones.reserve(clonecount);
+ for (auto clone_iter = ssc->snapset.clones.begin();
+ clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
+ clone_info ci;
+ ci.cloneid = *clone_iter;
+
+ hobject_t clone_oid = soid;
+ clone_oid.snap = *clone_iter;
+
+ auto p = ssc->snapset.clone_snaps.find(*clone_iter);
+ if (p == ssc->snapset.clone_snaps.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_snaps found for oid "
+ << soid << " clone " << *clone_iter
+ << " snapset " << ssc->snapset;
+ result = -EINVAL;
+ break;
+ }
+ for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
+ ci.snaps.push_back(*q);
+ }
+
+ dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
+
+ map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
+ coi = ssc->snapset.clone_overlap.find(ci.cloneid);
+ if (coi == ssc->snapset.clone_overlap.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_overlap found for oid "
+ << soid << " clone " << *clone_iter;
+ result = -EINVAL;
+ break;
+ }
+ const interval_set<uint64_t> &o = coi->second;
+ ci.overlap.reserve(o.num_intervals());
+ for (interval_set<uint64_t>::const_iterator r = o.begin();
+ r != o.end(); ++r) {
+ ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
+ r.get_len()));
+ }
+
+ map<snapid_t, uint64_t>::const_iterator si;
+ si = ssc->snapset.clone_size.find(ci.cloneid);
+ if (si == ssc->snapset.clone_size.end()) {
+ osd->clog->error() << "osd." << osd->whoami
+ << ": inconsistent clone_size found for oid "
+ << soid << " clone " << *clone_iter;
+ result = -EINVAL;
+ break;
+ }
+ ci.size = si->second;
+
+ resp.clones.push_back(ci);
+ }
+ if (result < 0) {
+ break;
+ }
+ if (!ctx->obc->obs.oi.is_whiteout()) {
+ ceph_assert(obs.exists);
+ clone_info ci;
+ ci.cloneid = CEPH_NOSNAP;
+
+ // Size for HEAD is oi.size
+ ci.size = oi.size;
+
+ resp.clones.push_back(ci);
+ }
+ resp.seq = ssc->snapset.seq;
+
+ resp.encode(osd_op.outdata);
+ result = 0;
+
+ ctx->delta_stats.num_rd++;
+ break;
+ }
+
+ case CEPH_OSD_OP_NOTIFY:
+ ++ctx->num_read;
+ {
+ uint32_t timeout;
+ bufferlist bl;
+
+ try {
+ uint32_t ver; // obsolete
+ decode(ver, bp);
+ decode(timeout, bp);
+ decode(bl, bp);
+ } catch (const ceph::buffer::error &e) {
+ timeout = 0;
+ }
+ tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
+ if (!timeout)
+ timeout = cct->_conf->osd_default_notify_timeout;
+
+ notify_info_t n;
+ n.timeout = timeout;
+ n.notify_id = osd->get_next_id(get_osdmap_epoch());
+ n.cookie = op.notify.cookie;
+ n.bl = bl;
+ ctx->notifies.push_back(n);
+
+ // return our unique notify id to the client
+ encode(n.notify_id, osd_op.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ ++ctx->num_read;
+ {
+ try {
+ uint64_t notify_id = 0;
+ uint64_t watch_cookie = 0;
+ decode(notify_id, bp);
+ decode(watch_cookie, bp);
+ bufferlist reply_bl;
+ if (!bp.end()) {
+ decode(reply_bl, bp);
+ }
+ tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
+ OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
+ ctx->notify_acks.push_back(ack);
+ } catch (const ceph::buffer::error &e) {
+ tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
+ OpContext::NotifyAck ack(
+ // op.watch.cookie is actually the notify_id for historical reasons
+ op.watch.cookie
+ );
+ ctx->notify_acks.push_back(ack);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_SETALLOCHINT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
+ maybe_create_new_object(ctx);
+ oi.expected_object_size = op.alloc_hint.expected_object_size;
+ oi.expected_write_size = op.alloc_hint.expected_write_size;
+ oi.alloc_hint_flags = op.alloc_hint.flags;
+ t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
+ op.alloc_hint.expected_write_size,
+ op.alloc_hint.flags);
+ }
+ break;
+
+
+ // --- WRITES ---
+
+ // -- object data --
+
+ case CEPH_OSD_OP_WRITE:
+ ++ctx->num_write;
+ result = 0;
+ { // write
+ __u32 seq = oi.truncate_seq;
+ tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
+
+ if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
+ op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ if (pool.info.requires_aligned_append() &&
+ (op.extent.offset % pool.info.required_alignment() != 0)) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!obs.exists) {
+ if (pool.info.requires_aligned_append() && op.extent.offset) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ } else if (op.extent.offset != oi.size &&
+ pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ if (seq && (seq > op.extent.truncate_seq) &&
+ (op.extent.offset + op.extent.length > oi.size)) {
+ // old write, arrived after trimtrunc
+ op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
+ dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
+ << ", adjusting write length to " << op.extent.length << dendl;
+ bufferlist t;
+ t.substr_of(osd_op.indata, 0, op.extent.length);
+ osd_op.indata.swap(t);
+ }
+ if (op.extent.truncate_seq > seq) {
+ // write arrives before trimtrunc
+ if (obs.exists && !oi.is_whiteout()) {
+ dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
+ << ", truncating to " << op.extent.truncate_size << dendl;
+ t->truncate(soid, op.extent.truncate_size);
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ if (oi.size > op.extent.truncate_size) {
+ interval_set<uint64_t> trim;
+ trim.insert(op.extent.truncate_size,
+ oi.size - op.extent.truncate_size);
+ ctx->modified_ranges.union_of(trim);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
+ oi.clear_data_digest();
+ }
+ if (op.extent.truncate_size != oi.size) {
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.truncate_size);
+ }
+ } else {
+ dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
+ << ", but object is new" << dendl;
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ }
+ }
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ maybe_create_new_object(ctx);
+
+ if (op.extent.length == 0) {
+ if (op.extent.offset > oi.size) {
+ t->truncate(
+ soid, op.extent.offset);
+ truncate_update_size_and_usage(ctx->delta_stats, oi,
+ op.extent.offset);
+ } else {
+ t->nop(soid);
+ }
+ } else {
+ t->write(
+ soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
+ }
+
+ if (op.extent.offset == 0 && op.extent.length >= oi.size
+ && !skip_data_digest) {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
+ if (skip_data_digest) {
+ obs.oi.clear_data_digest();
+ } else {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
+ }
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ op.extent.offset, op.extent.length);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
+ dout(10) << "clean_regions modified " << ctx->clean_regions << dendl;
+ }
+ break;
+
+ case CEPH_OSD_OP_WRITEFULL:
+ ++ctx->num_write;
+ result = 0;
+ { // write full object
+ tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
+
+ if (op.extent.length != osd_op.indata.length()) {
+ result = -EINVAL;
+ break;
+ }
+ result = check_offset_and_length(
+ 0, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
+ op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
+ maybe_create_new_object(ctx);
+ if (pool.info.is_erasure()) {
+ t->truncate(soid, 0);
+ } else if (obs.exists && op.extent.length < oi.size) {
+ t->truncate(soid, op.extent.length);
+ }
+ if (op.extent.length) {
+ t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
+ }
+ if (!skip_data_digest) {
+ obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ ctx->clean_regions.mark_data_region_dirty(0,
+ std::max((uint64_t)op.extent.length, oi.size));
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ 0, op.extent.length, true);
+ }
+ break;
+
+ case CEPH_OSD_OP_WRITESAME:
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
+ result = do_writesame(ctx, osd_op);
+ break;
+
+ case CEPH_OSD_OP_ROLLBACK :
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
+ result = _rollback_to(ctx, op);
+ break;
+
+ case CEPH_OSD_OP_ZERO:
+ tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
+ if (pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ { // zero
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ ceph_assert(op.extent.length);
+ if (obs.exists && !oi.is_whiteout()) {
+ t->zero(soid, op.extent.offset, op.extent.length);
+ interval_set<uint64_t> ch;
+ ch.insert(op.extent.offset, op.extent.length);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
+ ctx->delta_stats.num_wr++;
+ oi.clear_data_digest();
+ } else {
+ // no-op
+ }
+ }
+ break;
+ case CEPH_OSD_OP_CREATE:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
+ if (obs.exists && !oi.is_whiteout() &&
+ (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
+ result = -EEXIST; /* this is an exclusive create */
+ } else {
+ if (osd_op.indata.length()) {
+ auto p = osd_op.indata.cbegin();
+ string category;
+ try {
+ decode(category, p);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ // category is no longer implemented.
+ }
+ maybe_create_new_object(ctx);
+ t->nop(soid);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_TRIMTRUNC:
+ op.extent.offset = op.extent.truncate_size;
+ // falling through
+
+ case CEPH_OSD_OP_TRUNCATE:
+ tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ if (pool.info.requires_aligned_append()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ // truncate
+ if (!obs.exists || oi.is_whiteout()) {
+ dout(10) << " object dne, truncate is a no-op" << dendl;
+ break;
+ }
+
+ result = check_offset_and_length(
+ op.extent.offset, op.extent.length,
+ static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
+ if (result < 0)
+ break;
+
+ if (op.extent.truncate_seq) {
+ ceph_assert(op.extent.offset == op.extent.truncate_size);
+ if (op.extent.truncate_seq <= oi.truncate_seq) {
+ dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
+ << ", no-op" << dendl;
+ break; // old
+ }
+ dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
+ << ", truncating" << dendl;
+ oi.truncate_seq = op.extent.truncate_seq;
+ oi.truncate_size = op.extent.truncate_size;
+ }
+
+ maybe_create_new_object(ctx);
+ t->truncate(soid, op.extent.offset);
+ if (oi.size > op.extent.offset) {
+ interval_set<uint64_t> trim;
+ trim.insert(op.extent.offset, oi.size-op.extent.offset);
+ ctx->modified_ranges.union_of(trim);
+ ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
+ } else if (oi.size < op.extent.offset) {
+ ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
+ }
+ if (op.extent.offset != oi.size) {
+ truncate_update_size_and_usage(ctx->delta_stats,
+ oi,
+ op.extent.offset);
+ }
+ ctx->delta_stats.num_wr++;
+ // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
+
+ oi.clear_data_digest();
+ }
+ break;
+
+ case CEPH_OSD_OP_DELETE:
+ ++ctx->num_write;
+ result = 0;
+ tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
+ {
+ result = _delete_oid(ctx, false, ctx->ignore_cache);
+ }
+ break;
+
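+ // WATCH covers several sub-ops (watch, reconnect, ping, unwatch); the
+ // registered watches live in object_info_t::watchers and are hooked up or
+ // torn down via ctx->watch_connects / ctx->watch_disconnects.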
+ case CEPH_OSD_OP_WATCH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
+ op.watch.cookie, op.watch.op);
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ result = 0;
+ uint64_t cookie = op.watch.cookie;
+ entity_name_t entity = ctx->reqid.name;
+ ObjectContextRef obc = ctx->obc;
+
+ dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
+ << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
+ << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
+ dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
+ dout(10) << "watch: peer_addr="
+ << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
+
+ uint32_t timeout = cct->_conf->osd_client_watch_timeout;
+ if (op.watch.timeout != 0) {
+ timeout = op.watch.timeout;
+ }
+
+ watch_info_t w(cookie, timeout,
+ ctx->op->get_req()->get_connection()->get_peer_addr());
+ if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
+ op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
+ if (oi.watchers.count(make_pair(cookie, entity))) {
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ } else {
+ dout(10) << " registered new watch " << w << " by " << entity << dendl;
+ oi.watchers[make_pair(cookie, entity)] = w;
+ t->nop(soid); // make sure to update the object_info on disk!
+ }
+ bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
+ ctx->watch_connects.push_back(make_pair(w, will_ping));
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
+ if (!oi.watchers.count(make_pair(cookie, entity))) {
+ result = -ENOTCONN;
+ break;
+ }
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ ctx->watch_connects.push_back(make_pair(w, true));
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
+ /* Note: WATCH with PING doesn't cause may_write() to return true,
+ * so if there is nothing else in the transaction, this is going
+ * to run do_osd_op_effects, but not write out a log entry */
+ if (!oi.watchers.count(make_pair(cookie, entity))) {
+ result = -ENOTCONN;
+ break;
+ }
+ map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
+ obc->watchers.find(make_pair(cookie, entity));
+ if (p == obc->watchers.end() ||
+ !p->second->is_connected()) {
+ // client needs to reconnect
+ result = -ETIMEDOUT;
+ break;
+ }
+ dout(10) << " found existing watch " << w << " by " << entity << dendl;
+ p->second->got_ping(ceph_clock_now());
+ result = 0;
+ } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
+ map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
+ oi.watchers.find(make_pair(cookie, entity));
+ if (oi_iter != oi.watchers.end()) {
+ dout(10) << " removed watch " << oi_iter->second << " by "
+ << entity << dendl;
+ oi.watchers.erase(oi_iter);
+ t->nop(soid); // update oi on disk
+ ctx->watch_disconnects.push_back(
+ watch_disconnect_t(cookie, entity, false));
+ } else {
+ dout(10) << " can't remove: no watch by " << entity << dendl;
+ }
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_PIN:
+ tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
+ if ((!pool.info.is_tier() ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+ result = -EINVAL;
+ dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+
+ if (!oi.is_cache_pinned()) {
+ oi.set_flag(object_info_t::FLAG_CACHE_PIN);
+ ctx->modify = true;
+ ctx->delta_stats.num_objects_pinned++;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_UNPIN:
+ tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
+ if ((!pool.info.is_tier() ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+ result = -EINVAL;
+ dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+
+ if (oi.is_cache_pinned()) {
+ oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
+ ctx->modify = true;
+ ctx->delta_stats.num_objects_pinned--;
+ ctx->delta_stats.num_wr++;
+ }
+ }
+ break;
+
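+ // SET_REDIRECT turns the object into a redirect manifest pointing at
+ // another object: local data, omap and attrs are dropped and only the
+ // redirect target is kept, optionally taking a reference on the target
+ // when CEPH_OSD_OP_FLAG_WITH_REFERENCE is set.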
+ case CEPH_OSD_OP_SET_REDIRECT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ object_t target_name;
+ object_locator_t target_oloc;
+ snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
+ version_t target_version = op.copy_from.src_version;
+ try {
+ decode(target_name, bp);
+ decode(target_oloc, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ pg_t raw_pg;
+ result = get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
+ if (result < 0) {
+ dout(5) << " pool information is invalid: " << result << dendl;
+ break;
+ }
+ hobject_t target(target_name, target_oloc.key, target_snapid,
+ raw_pg.ps(), raw_pg.pool(),
+ target_oloc.nspace);
+ if (target == soid) {
+ dout(20) << " set-redirect to self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+
+ bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
+ bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ if (has_reference) {
+ result = -EINVAL;
+ dout(5) << " the object is already a manifest " << dendl;
+ break;
+ }
+ if (op_finisher == nullptr && need_reference) {
+ // start
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new SetManifestFinisher(osd_op));
+ ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
+ C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, soid);
+ ceph_tid_t tid = refcount_manifest(soid, target,
+ refcount_t::INCREMENT_REF, fin, std::nullopt);
+ mop->objecter_tid = tid;
+ manifest_ops[soid] = mop;
+ ctx->obc->start_block();
+ result = -EINPROGRESS;
+ } else {
+ // finish
+ if (op_finisher) {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ }
+
+ if (!oi.has_manifest() && !oi.manifest.is_redirect())
+ ctx->delta_stats.num_objects_manifest++;
+
+ oi.set_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest.redirect_target = target;
+ oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
+ t->truncate(soid, 0);
+ ctx->clean_regions.mark_data_region_dirty(0, oi.size);
+ if (oi.is_omap() && pool.info.supports_omap()) {
+ t->omap_clear(soid);
+ obs.oi.clear_omap_digest();
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ }
+ write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
+ 0, oi.size, false);
+ ctx->delta_stats.num_bytes -= oi.size;
+ oi.size = 0;
+ oi.new_object();
+ oi.user_version = target_version;
+ ctx->user_at_version = target_version;
+ /* rm_attrs */
+ map<string,bufferlist> rmattrs;
+ result = getattrs_maybe_cache(ctx->obc, &rmattrs);
+ if (result < 0) {
+ dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
+ return result;
+ }
+ map<string, bufferlist>::iterator iter;
+ for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
+ const string& name = iter->first;
+ t->rmattr(soid, name);
+ }
+ if (!has_reference && need_reference) {
+ oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ }
+ dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
+ if (op_finisher) {
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+ }
+
+ break;
+
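+ // SET_CHUNK maps [src_offset, src_offset + src_length) of this object onto
+ // a chunk stored in a target object, converting the object into a chunked
+ // manifest and taking a reference on the target chunk.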
+ case CEPH_OSD_OP_SET_CHUNK:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (oi.manifest.is_redirect()) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+ object_locator_t tgt_oloc;
+ uint64_t src_offset, src_length, tgt_offset;
+ object_t tgt_name;
+ try {
+ decode(src_offset, bp);
+ decode(src_length, bp);
+ decode(tgt_oloc, bp);
+ decode(tgt_name, bp);
+ decode(tgt_offset, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+ if (!src_length) {
+ result = -EINVAL;
+ goto fail;
+ }
+ if (src_offset + src_length > oi.size) {
+ result = -ERANGE;
+ goto fail;
+ }
+ if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ for (auto &p : oi.manifest.chunk_map) {
+ interval_set<uint64_t> chunk;
+ chunk.insert(p.first, p.second.length);
+ if (chunk.intersects(src_offset, src_length)) {
+ dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
+ << " chunk_info: " << p << dendl;
+ result = -EOPNOTSUPP;
+ goto fail;
+ }
+ }
+
+ pg_t raw_pg;
+ chunk_info_t chunk_info;
+ result = get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
+ if (result < 0) {
+ dout(5) << " pool information is invalid: " << result << dendl;
+ break;
+ }
+ hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
+ raw_pg.ps(), raw_pg.pool(),
+ tgt_oloc.nspace);
+ bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
+ (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
+ if (has_reference) {
+ result = -EINVAL;
+ dout(5) << " the object is already a manifest " << dendl;
+ break;
+ }
+ chunk_info.oid = target;
+ chunk_info.offset = tgt_offset;
+ chunk_info.length = src_length;
+ if (op_finisher == nullptr) {
+ // start
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new SetManifestFinisher(osd_op));
+ object_manifest_t set_chunk;
+ bool need_inc_ref = false;
+ set_chunk.chunk_map[src_offset] = chunk_info;
+ need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
+ if (need_inc_ref) {
+ result = -EINPROGRESS;
+ break;
+ }
+ }
+ if (op_finisher) {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ }
+
+ oi.manifest.chunk_map[src_offset] = chunk_info;
+ if (!oi.has_manifest() && !oi.manifest.is_chunked())
+ ctx->delta_stats.num_objects_manifest++;
+ oi.set_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
+ if (!has_reference) {
+ oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
+ }
+ ctx->modify = true;
+ ctx->cache_operation = true;
+
+ dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
+ << " chunk_info: " << chunk_info << dendl;
+ if (op_finisher) {
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+
+ break;
+
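+ // TIER_PROMOTE pulls the manifest-backed data back into the local object
+ // via the copy-from machinery; the op returns -EINPROGRESS and is
+ // re-executed through the PromoteFinisher once the copy completes.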
+ case CEPH_OSD_OP_TIER_PROMOTE:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = 0;
+ break;
+ }
+
+ if (op_finisher == nullptr) {
+ PromoteManifestCallback *cb;
+ object_locator_t my_oloc;
+ hobject_t src_hoid;
+
+ if (obs.oi.manifest.is_chunked()) {
+ src_hoid = obs.oi.soid;
+ } else if (obs.oi.manifest.is_redirect()) {
+ object_locator_t src_oloc(obs.oi.manifest.redirect_target);
+ my_oloc = src_oloc;
+ src_hoid = obs.oi.manifest.redirect_target;
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ cb = new PromoteManifestCallback(ctx->obc, this, ctx);
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new PromoteFinisher(cb));
+ unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
+ unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+ start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
+ obs.oi.soid.snap == CEPH_NOSNAP,
+ src_fadvise_flags, 0);
+
+ dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
+ result = -EINPROGRESS;
+ } else {
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+
+ break;
+
+ case CEPH_OSD_OP_TIER_FLUSH:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = 0;
+ break;
+ }
+
+ if (oi.is_dirty()) {
+ result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
+ if (result == -EINPROGRESS)
+ result = -EAGAIN;
+ } else {
+ result = 0;
+ }
+ }
+
+ break;
+
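+ // TIER_EVICT drops the local copy of a chunked manifest object: once every
+ // byte is covered by chunks, all chunks are marked missing and the local
+ // data is punched out with zero().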
+ case CEPH_OSD_OP_TIER_EVICT:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (!obs.oi.has_manifest()) {
+ result = -EINVAL;
+ break;
+ }
+
+ // The chunks already have a reference, so it is enough to punch a hole in the local data if every byte is covered by a chunk
+ uint64_t chunk_length = 0;
+ for (auto p : obs.oi.manifest.chunk_map) {
+ chunk_length += p.second.length;
+ }
+ if (chunk_length == obs.oi.size) {
+ for (auto &p : obs.oi.manifest.chunk_map) {
+ p.second.set_flag(chunk_info_t::FLAG_MISSING);
+ }
+ // punch hole
+ t->zero(soid, 0, oi.size);
+ oi.clear_data_digest();
+ ctx->delta_stats.num_wr++;
+ ctx->cache_operation = true;
+ }
+ osd->logger->inc(l_osd_tier_evict);
+ }
+
+ break;
+
+ case CEPH_OSD_OP_UNSET_MANIFEST:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (pool.info.is_tier()) {
+ result = -EINVAL;
+ break;
+ }
+ if (!obs.exists) {
+ result = -ENOENT;
+ break;
+ }
+ if (!oi.has_manifest()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+
+ dec_all_refcount_manifest(oi, ctx);
+
+ oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ oi.manifest = object_manifest_t();
+ ctx->delta_stats.num_objects_manifest--;
+ ctx->delta_stats.num_wr++;
+ ctx->modify = true;
+ }
+
+ break;
+
+ // -- object attrs --
+
+ case CEPH_OSD_OP_SETXATTR:
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (cct->_conf->osd_max_attr_size > 0 &&
+ op.xattr.value_len > cct->_conf->osd_max_attr_size) {
+ tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
+ result = -EFBIG;
+ break;
+ }
+ unsigned max_name_len =
+ std::min<uint64_t>(osd->store->get_max_attr_name_length(),
+ cct->_conf->osd_max_attr_name_len);
+ if (op.xattr.name_len > max_name_len) {
+ result = -ENAMETOOLONG;
+ break;
+ }
+ maybe_create_new_object(ctx);
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ string name = "_" + aname;
+ bufferlist bl;
+ bp.copy(op.xattr.value_len, bl);
+ t->setattr(soid, name, bl);
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
+ case CEPH_OSD_OP_RMXATTR:
+ ++ctx->num_write;
+ result = 0;
+ {
+ string aname;
+ bp.copy(op.xattr.name_len, aname);
+ tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ string name = "_" + aname;
+ t->rmattr(soid, name);
+ ctx->delta_stats.num_wr++;
+ }
+ break;
+
+
+ // -- fancy writers --
+ case CEPH_OSD_OP_APPEND:
+ {
+ tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
+ // just do it inline; this works because we are happy to execute
+ // fancy ops on replicas as well.
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITE;
+ newop.op.extent.offset = oi.size;
+ newop.op.extent.length = op.extent.length;
+ newop.op.extent.truncate_seq = oi.truncate_seq;
+ newop.indata = osd_op.indata;
+ result = do_osd_ops(ctx, nops);
+ osd_op.outdata = std::move(newop.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_STARTSYNC:
+ result = 0;
+ t->nop(soid);
+ break;
+
+ // -- trivial map --
+ case CEPH_OSD_OP_TMAPGET:
+ tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ {
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_SYNC_READ;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = 0;
+ result = do_osd_ops(ctx, nops);
+ osd_op.outdata = std::move(newop.outdata);
+ }
+ break;
+
+ case CEPH_OSD_OP_TMAPPUT:
+ tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ {
+ //_dout_lock.Lock();
+ //osd_op.data.hexdump(*_dout);
+ //_dout_lock.Unlock();
+
+ // verify sort order
+ bool unsorted = false;
+ if (true) {
+ bufferlist header;
+ decode(header, bp);
+ uint32_t n;
+ decode(n, bp);
+ string last_key;
+ while (n--) {
+ string key;
+ decode(key, bp);
+ dout(10) << "tmapput key " << key << dendl;
+ bufferlist val;
+ decode(val, bp);
+ if (key < last_key) {
+ dout(10) << "TMAPPUT is unordered; resorting" << dendl;
+ unsorted = true;
+ break;
+ }
+ last_key = key;
+ }
+ }
+
+ // write it
+ vector<OSDOp> nops(1);
+ OSDOp& newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_WRITEFULL;
+ newop.op.extent.offset = 0;
+ newop.op.extent.length = osd_op.indata.length();
+ newop.indata = osd_op.indata;
+
+ if (unsorted) {
+ bp = osd_op.indata.begin();
+ bufferlist header;
+ map<string, bufferlist> m;
+ decode(header, bp);
+ decode(m, bp);
+ ceph_assert(bp.end());
+ bufferlist newbl;
+ encode(header, newbl);
+ encode(m, newbl);
+ newop.indata = newbl;
+ }
+ result = do_osd_ops(ctx, nops);
+ ceph_assert(result == 0);
+ }
+ break;
+
+ case CEPH_OSD_OP_TMAPUP:
+ tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
+ if (pool.info.is_erasure()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = do_tmapup(ctx, bp, osd_op);
+ break;
+
+ case CEPH_OSD_OP_TMAP2OMAP:
+ ++ctx->num_write;
+ tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
+ result = do_tmap2omap(ctx, op.tmap2omap.flags);
+ break;
+
+ // OMAP Read ops
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ ++ctx->num_read;
+ {
+ string start_after;
+ uint64_t max_return;
+ try {
+ decode(start_after, bp);
+ decode(max_return, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
+ goto fail;
+ }
+ if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
+ max_return = cct->_conf->osd_max_omap_entries_per_request;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
+
+ bufferlist bl;
+ uint32_t num = 0;
+ bool truncated = false;
+ if (oi.is_omap()) {
+ ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
+ ch, ghobject_t(soid)
+ );
+ ceph_assert(iter);
+ iter->upper_bound(start_after);
+ for (num = 0; iter->valid(); ++num, iter->next()) {
+ if (num >= max_return ||
+ bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(iter->key(), bl);
+ }
+ } // else return empty out_set
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(bl);
+ encode(truncated, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETVALS:
+ ++ctx->num_read;
+ {
+ string start_after;
+ uint64_t max_return;
+ string filter_prefix;
+ try {
+ decode(start_after, bp);
+ decode(max_return, bp);
+ decode(filter_prefix, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
+ goto fail;
+ }
+ if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
+ max_return = cct->_conf->osd_max_omap_entries_per_request;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
+
+ uint32_t num = 0;
+ bool truncated = false;
+ bufferlist bl;
+ if (oi.is_omap()) {
+ ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
+ ch, ghobject_t(soid)
+ );
+ if (!iter) {
+ result = -ENOENT;
+ goto fail;
+ }
+ iter->upper_bound(start_after);
+ if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
+ for (num = 0;
+ iter->valid() &&
+ iter->key().substr(0, filter_prefix.size()) == filter_prefix;
+ ++num, iter->next()) {
+ dout(20) << "Found key " << iter->key() << dendl;
+ if (num >= max_return ||
+ bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
+ truncated = true;
+ break;
+ }
+ encode(iter->key(), bl);
+ encode(iter->value(), bl);
+ }
+ } // else return empty out_set
+ encode(num, osd_op.outdata);
+ osd_op.outdata.claim_append(bl);
+ encode(truncated, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
+ if (!oi.is_omap()) {
+ // return empty header
+ break;
+ }
+ ++ctx->num_read;
+ {
+ osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ ++ctx->num_read;
+ {
+ set<string> keys_to_get;
+ try {
+ decode(keys_to_get, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
+ map<string, bufferlist> out;
+ if (oi.is_omap()) {
+ osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
+ } // else return empty omap entries
+ encode(out, osd_op.outdata);
+ ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
+ ctx->delta_stats.num_rd++;
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAP_CMP:
+ ++ctx->num_read;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
+ break;
+ }
+ map<string, pair<bufferlist, int> > assertions;
+ try {
+ decode(assertions, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
+
+ map<string, bufferlist> out;
+
+ if (oi.is_omap()) {
+ set<string> to_get;
+ for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
+ i != assertions.end();
+ ++i)
+ to_get.insert(i->first);
+ int r = osd->store->omap_get_values(ch, ghobject_t(soid),
+ to_get, &out);
+ if (r < 0) {
+ result = r;
+ break;
+ }
+ } // else leave out empty
+
+ // Should set num_rd_kb based on the encoded length of the map
+ ctx->delta_stats.num_rd++;
+
+ int r = 0;
+ bufferlist empty;
+ for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
+ i != assertions.end();
+ ++i) {
+ auto out_entry = out.find(i->first);
+ bufferlist &bl = (out_entry != out.end()) ?
+ out_entry->second : empty;
+ switch (i->second.second) {
+ case CEPH_OSD_CMPXATTR_OP_EQ:
+ if (!(bl == i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_LT:
+ if (!(bl < i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ case CEPH_OSD_CMPXATTR_OP_GT:
+ if (!(bl > i->second.first)) {
+ r = -ECANCELED;
+ }
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ if (r < 0)
+ break;
+ }
+ if (r < 0) {
+ result = r;
+ }
+ }
+ break;
+
+ // OMAP Write ops
+ case CEPH_OSD_OP_OMAPSETVALS:
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ maybe_create_new_object(ctx);
+ bufferlist to_set_bl;
+ try {
+ decode_str_str_map_to_bl(bp, &to_set_bl);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+ if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ dout(20) << "setting vals: " << dendl;
+ map<string,bufferlist> to_set;
+ bufferlist::const_iterator pt = to_set_bl.begin();
+ decode(to_set, pt);
+ for (map<string, bufferlist>::iterator i = to_set.begin();
+ i != to_set.end();
+ ++i) {
+ dout(20) << "\t" << i->first << dendl;
+ }
+ }
+ t->omap_setkeys(soid, to_set_bl);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
+ }
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPSETHEADER:
+ tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ maybe_create_new_object(ctx);
+ t->omap_setheader(soid, osd_op.indata);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPCLEAR:
+ tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ if (oi.is_omap()) {
+ t->omap_clear(soid);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ obs.oi.clear_omap_digest();
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+ }
+ break;
+
+ case CEPH_OSD_OP_OMAPRMKEYS:
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ break;
+ }
+ bufferlist to_rm_bl;
+ try {
+ decode_str_set_to_bl(bp, &to_rm_bl);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ goto fail;
+ }
+ tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+ t->omap_rmkeys(soid, to_rm_bl);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_OMAPRMKEYRANGE:
+ tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
+ if (!pool.info.supports_omap()) {
+ result = -EOPNOTSUPP;
+ break;
+ }
+ ++ctx->num_write;
+ result = 0;
+ {
+ if (!obs.exists || oi.is_whiteout()) {
+ result = -ENOENT;
+ break;
+ }
+ std::string key_begin, key_end;
+ try {
+ decode(key_begin, bp);
+ decode(key_end, bp);
+ } catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ goto fail;
+ }
+ t->omap_rmkeyrange(soid, key_begin, key_end);
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ }
+ obs.oi.clear_omap_digest();
+ break;
+
+ case CEPH_OSD_OP_COPY_GET:
+ ++ctx->num_read;
+ tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
+ soid.snap.val);
+ if (op_finisher == nullptr) {
+ result = do_copy_get(ctx, bp, osd_op, ctx->obc);
+ } else {
+ result = op_finisher->execute();
+ }
+ break;
+
+ case CEPH_OSD_OP_COPY_FROM:
+ case CEPH_OSD_OP_COPY_FROM2:
+ ++ctx->num_write;
+ result = 0;
+ {
+ object_t src_name;
+ object_locator_t src_oloc;
+ uint32_t truncate_seq = 0;
+ uint64_t truncate_size = 0;
+ bool have_truncate = false;
+ snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
+ version_t src_version = op.copy_from.src_version;
+
+ if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
+ (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
+ dout(20) << "invalid copy-from2 flags 0x"
+ << std::hex << (int)op.copy_from.flags << std::dec << dendl;
+ result = -EINVAL;
+ break;
+ }
+ try {
+ decode(src_name, bp);
+ decode(src_oloc, bp);
+ // check if client sent us truncate_seq and truncate_size
+ if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
+ (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
+ decode(truncate_seq, bp);
+ decode(truncate_size, bp);
+ have_truncate = true;
+ }
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ tracepoint(osd,
+ do_osd_op_pre_copy_from,
+ soid.oid.name.c_str(),
+ soid.snap.val,
+ "???",
+ 0,
+ "???",
+ "???",
+ 0,
+ src_snapid,
+ src_version);
+ goto fail;
+ }
+ tracepoint(osd,
+ do_osd_op_pre_copy_from,
+ soid.oid.name.c_str(),
+ soid.snap.val,
+ src_name.name.c_str(),
+ src_oloc.pool,
+ src_oloc.key.c_str(),
+ src_oloc.nspace.c_str(),
+ src_oloc.hash,
+ src_snapid,
+ src_version);
+ if (op_finisher == nullptr) {
+ // start
+ pg_t raw_pg;
+ get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
+ hobject_t src(src_name, src_oloc.key, src_snapid,
+ raw_pg.ps(), raw_pg.pool(),
+ src_oloc.nspace);
+ if (src == soid) {
+ dout(20) << " copy from self is invalid" << dendl;
+ result = -EINVAL;
+ break;
+ }
+ CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
+ if (have_truncate)
+ cb->set_truncate(truncate_seq, truncate_size);
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new CopyFromFinisher(cb));
+ start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ op.copy_from.flags,
+ false,
+ op.copy_from.src_fadvise_flags,
+ op.flags);
+ result = -EINPROGRESS;
+ } else {
+ // finish
+ result = op_finisher->execute();
+ ceph_assert(result == 0);
+
+ // COPY_FROM cannot be executed multiple times -- it must restart
+ ctx->op_finishers.erase(ctx->current_osd_subop_num);
+ }
+ }
+ break;
+
+ default:
+ tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
+ dout(1) << "unrecognized osd op " << op.op
+ << " " << ceph_osd_op_name(op.op)
+ << dendl;
+ result = -EOPNOTSUPP;
+ }
+
+ fail:
+ osd_op.rval = result;
+ tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
+ if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
+ result != -EAGAIN && result != -EINPROGRESS)
+ result = 0;
+
+ if (result < 0)
+ break;
+ }
+ if (result < 0) {
+ dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
+ }
+ return result;
+}
+
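+// Fetch the legacy TMAP payload for the object by issuing an internal
+// TMAPGET sub-op and splitting the result into the header and the encoded
+// key/value blob.  Returns -ENODATA for zero-sized objects and -EINVAL if
+// the payload cannot be decoded.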
+int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
+{
+ if (ctx->new_obs.oi.size == 0) {
+ dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
+ return -ENODATA;
+ }
+ vector<OSDOp> nops(1);
+ OSDOp &newop = nops[0];
+ newop.op.op = CEPH_OSD_OP_TMAPGET;
+ do_osd_ops(ctx, nops);
+ try {
+ bufferlist::const_iterator i = newop.outdata.begin();
+ decode(*header, i);
+ (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
+ } catch (...) {
+ dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
+ << dendl;
+ return -EINVAL;
+ }
+ dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
+ << dendl;
+ return 0;
+}
+
+int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
+ const SnapSet& ss)
+{
+ // verify that all clones have been evicted
+ dout(20) << __func__ << " verifying clones are absent "
+ << ss << dendl;
+ for (vector<snapid_t>::const_iterator p = ss.clones.begin();
+ p != ss.clones.end();
+ ++p) {
+ hobject_t clone_oid = soid;
+ clone_oid.snap = *p;
+ if (is_missing_object(clone_oid))
+ return -EBUSY;
+ ObjectContextRef clone_obc = get_object_context(clone_oid, false);
+ if (clone_obc && clone_obc->obs.exists) {
+ dout(10) << __func__ << " cannot evict head before clone "
+ << clone_oid << dendl;
+ return -EBUSY;
+ }
+ if (copy_ops.count(clone_oid)) {
+ dout(10) << __func__ << " cannot evict head, pending promote on clone "
+ << clone_oid << dendl;
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
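+// Delete the head object, or turn the delete into a whiteout when a cache
+// tier (or existing/pending clones) requires the head to remain visible.
+// Returns -ENOENT if the object does not exist (or is already an
+// applicable whiteout), 0 otherwise.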
+inline int PrimaryLogPG::_delete_oid(
+ OpContext *ctx,
+ bool no_whiteout, // no whiteouts, no matter what.
+ bool try_no_whiteout) // try not to whiteout
+{
+ SnapSet& snapset = ctx->new_snapset;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ PGTransaction* t = ctx->op_t.get();
+
+ // cache: set whiteout on delete?
+ bool whiteout = false;
+ if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
+ && !no_whiteout
+ && !try_no_whiteout) {
+ whiteout = true;
+ }
+
+ // in luminous or later, we can't delete the head if there are
+ // clones. we trust the caller passing no_whiteout has already
+ // verified they don't exist.
+ if (!snapset.clones.empty() ||
+ (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
+ if (no_whiteout) {
+ dout(20) << __func__ << " has or will have clones but no_whiteout=1"
+ << dendl;
+ } else {
+ dout(20) << __func__ << " has or will have clones; will whiteout"
+ << dendl;
+ whiteout = true;
+ }
+ }
+ dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
+ << " no_whiteout=" << (int)no_whiteout
+ << " try_no_whiteout=" << (int)try_no_whiteout
+ << dendl;
+ if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
+ return -ENOENT;
+
+ t->remove(soid);
+
+ if (oi.size > 0) {
+ interval_set<uint64_t> ch;
+ ch.insert(0, oi.size);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(0, oi.size);
+ }
+
+ ctx->clean_regions.mark_omap_dirty();
+ ctx->delta_stats.num_wr++;
+ if (soid.is_snap()) {
+ ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
+ ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
+ } else {
+ ctx->delta_stats.num_bytes -= oi.size;
+ }
+ oi.size = 0;
+ oi.new_object();
+
+ // disconnect all watchers
+ for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
+ oi.watchers.begin();
+ p != oi.watchers.end();
+ ++p) {
+ dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
+ ctx->watch_disconnects.push_back(
+ watch_disconnect_t(p->first.first, p->first.second, true));
+ }
+ oi.watchers.clear();
+
+ if (oi.has_manifest()) {
+ ctx->delta_stats.num_objects_manifest--;
+ dec_all_refcount_manifest(oi, ctx);
+ }
+
+ if (whiteout) {
+ dout(20) << __func__ << " setting whiteout on " << soid << dendl;
+ oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ ctx->delta_stats.num_whiteouts++;
+ t->create(soid);
+ osd->logger->inc(l_osd_tier_whiteout);
+ return 0;
+ }
+
+ // delete the head
+ ctx->delta_stats.num_objects--;
+ if (soid.is_snap())
+ ctx->delta_stats.num_object_clones--;
+ if (oi.is_whiteout()) {
+ dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
+ ctx->delta_stats.num_whiteouts--;
+ oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ }
+ if (oi.is_cache_pinned()) {
+ ctx->delta_stats.num_objects_pinned--;
+ }
+ obs.exists = false;
+ return 0;
+}
+
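+// Roll the head object back to the clone that covers the requested snapid.
+// May return -EAGAIN (and block the write) if the target clone is degraded,
+// backfilling, or must first be promoted; deletes the head if no such
+// clone or snapshot exists.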
+int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
+{
+ SnapSet& snapset = ctx->new_snapset;
+ ObjectState& obs = ctx->new_obs;
+ object_info_t& oi = obs.oi;
+ const hobject_t& soid = oi.soid;
+ PGTransaction* t = ctx->op_t.get();
+ snapid_t snapid = (uint64_t)op.snap.snapid;
+ hobject_t missing_oid;
+
+ dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
+
+ ObjectContextRef rollback_to;
+
+ int ret = find_object_context(
+ hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
+ soid.get_namespace()),
+ &rollback_to, false, false, &missing_oid);
+ if (ret == -EAGAIN) {
+ /* clone must be missing */
+ ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
+ dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
+ << missing_oid << " (requested snapid: ) " << snapid << dendl;
+ block_write_on_degraded_snap(missing_oid, ctx->op);
+ return ret;
+ }
+ {
+ ObjectContextRef promote_obc;
+ cache_result_t tier_mode_result;
+ if (obs.exists && obs.oi.has_manifest()) {
+ tier_mode_result =
+ maybe_handle_manifest_detail(
+ ctx->op,
+ true,
+ rollback_to);
+ } else {
+ tier_mode_result =
+ maybe_handle_cache_detail(
+ ctx->op,
+ true,
+ rollback_to,
+ ret,
+ missing_oid,
+ true,
+ false,
+ &promote_obc);
+ }
+ switch (tier_mode_result) {
+ case cache_result_t::NOOP:
+ break;
+ case cache_result_t::BLOCKED_PROMOTE:
+ ceph_assert(promote_obc);
+ block_write_on_snap_rollback(soid, promote_obc, ctx->op);
+ return -EAGAIN;
+ case cache_result_t::BLOCKED_FULL:
+ block_write_on_full_cache(soid, ctx->op);
+ return -EAGAIN;
+ case cache_result_t::REPLIED_WITH_EAGAIN:
+ ceph_abort_msg("this can't happen, no rollback on replica");
+ default:
+ ceph_abort_msg("must promote was set, other values are not valid");
+ return -EAGAIN;
+ }
+ }
+
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
+ } else {
+ _delete_oid(ctx, false, false);
+ ret = 0;
+ }
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ ceph_abort_msg("unexpected error code in _rollback_to");
+ } else { //we got our context, let's use it to do the rollback!
+ hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
+ if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
+ is_degraded_on_async_recovery_target(rollback_to_sobject)) {
+ dout(20) << "_rollback_to attempted to roll back to a degraded object "
+ << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
+ block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
+ ret = -EAGAIN;
+ } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
+ // rolling back to the head; we just need to clone it.
+ ctx->modify = true;
+ } else {
+ /* 1) Delete current head
+ * 2) Clone correct snapshot into head
+ * 3) Calculate clone_overlaps by following overlaps
+ * forward from rollback snapshot */
+ dout(10) << "_rollback_to deleting " << soid.oid
+ << " and rolling back to old snap" << dendl;
+
+ if (obs.exists) {
+ t->remove(soid);
+ }
+ t->clone(soid, rollback_to_sobject);
+ t->add_obc(rollback_to);
+
+ map<snapid_t, interval_set<uint64_t> >::iterator iter =
+ snapset.clone_overlap.lower_bound(snapid);
+ ceph_assert(iter != snapset.clone_overlap.end());
+ interval_set<uint64_t> overlaps = iter->second;
+ for ( ;
+ iter != snapset.clone_overlap.end();
+ ++iter)
+ overlaps.intersection_of(iter->second);
+
+ if (obs.oi.size > 0) {
+ interval_set<uint64_t> modified;
+ modified.insert(0, obs.oi.size);
+ overlaps.intersection_of(modified);
+ modified.subtract(overlaps);
+ ctx->modified_ranges.union_of(modified);
+ }
+
+ // Adjust the cached objectcontext
+ maybe_create_new_object(ctx, true);
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
+ ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
+ ctx->clean_regions.mark_omap_dirty();
+ obs.oi.size = rollback_to->obs.oi.size;
+ if (rollback_to->obs.oi.is_data_digest())
+ obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
+ else
+ obs.oi.clear_data_digest();
+ if (rollback_to->obs.oi.is_omap_digest())
+ obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
+ else
+ obs.oi.clear_omap_digest();
+
+ if (rollback_to->obs.oi.is_omap()) {
+ dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ } else {
+ dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+ }
+ }
+ return ret;
+}
+
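+// Record the clone of 'head' at 'coid' in the transaction: clone the
+// object, persist the clone's object_info_t, and drop any SnapSet attr
+// (clones do not carry one).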
+void PrimaryLogPG::_make_clone(
+ OpContext *ctx,
+ PGTransaction* t,
+ ObjectContextRef obc,
+ const hobject_t& head, const hobject_t& coid,
+ object_info_t *poi)
+{
+ bufferlist bv;
+ encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ t->clone(coid, head);
+ setattr_maybe_cache(obc, t, OI_ATTR, bv);
+ rmattr_maybe_cache(obc, t, SS_ATTR);
+}
+
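+// Prepare the head for a write under the op's SnapContext: clone the
+// existing head if the context contains snaps newer than the snapset's
+// seq, maintain the dirty/omap object counts, and update clone_overlap
+// and the snapset sequence number.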
+void PrimaryLogPG::make_writeable(OpContext *ctx)
+{
+ const hobject_t& soid = ctx->obs->oi.soid;
+ SnapContext& snapc = ctx->snapc;
+
+ // clone?
+ ceph_assert(soid.snap == CEPH_NOSNAP);
+ dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
+ << " snapc=" << snapc << dendl;
+
+ bool was_dirty = ctx->obc->obs.oi.is_dirty();
+ if (ctx->new_obs.exists) {
+ // we will mark the object dirty
+ if (ctx->undirty && was_dirty) {
+ dout(20) << " clearing DIRTY flag" << dendl;
+ ceph_assert(ctx->new_obs.oi.is_dirty());
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
+ osd->logger->inc(l_osd_tier_clean);
+ } else if (!was_dirty && !ctx->undirty) {
+ dout(20) << " setting DIRTY flag" << dendl;
+ ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
+ ++ctx->delta_stats.num_objects_dirty;
+ osd->logger->inc(l_osd_tier_dirty);
+ }
+ } else {
+ if (was_dirty) {
+ dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
+ }
+ }
+
+ if ((ctx->new_obs.exists &&
+ ctx->new_obs.oi.is_omap()) &&
+ (!ctx->obc->obs.exists ||
+ !ctx->obc->obs.oi.is_omap())) {
+ ++ctx->delta_stats.num_objects_omap;
+ }
+ if ((!ctx->new_obs.exists ||
+ !ctx->new_obs.oi.is_omap()) &&
+ (ctx->obc->obs.exists &&
+ ctx->obc->obs.oi.is_omap())) {
+ --ctx->delta_stats.num_objects_omap;
+ }
+
+ if (ctx->new_snapset.seq > snapc.seq) {
+ dout(10) << " op snapset is old" << dendl;
+ }
+
+ if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
+ snapc.snaps.size() && // there are snaps
+ !ctx->cache_operation &&
+ snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
+ // clone
+ hobject_t coid = soid;
+ coid.snap = snapc.seq;
+
+ unsigned l;
+ for (l = 1;
+ l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
+ l++) ;
+
+ vector<snapid_t> snaps(l);
+ for (unsigned i=0; i<l; i++)
+ snaps[i] = snapc.snaps[i];
+
+ // prepare clone
+ object_info_t static_snap_oi(coid);
+ object_info_t *snap_oi;
+ if (is_primary()) {
+ ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
+ ctx->clone_obc->destructor_callback =
+ new C_PG_ObjectContext(this, ctx->clone_obc.get());
+ ctx->clone_obc->obs.oi = static_snap_oi;
+ ctx->clone_obc->obs.exists = true;
+ ctx->clone_obc->ssc = ctx->obc->ssc;
+ ctx->clone_obc->ssc->ref++;
+ if (pool.info.is_erasure())
+ ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
+ snap_oi = &ctx->clone_obc->obs.oi;
+ if (ctx->obc->obs.oi.has_manifest()) {
+ if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
+ ctx->obc->obs.oi.manifest.is_redirect()) {
+ snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
+ snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
+ snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
+ } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
+ snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
+ snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
+ snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+ bool got = ctx->lock_manager.get_write_greedy(
+ coid,
+ ctx->clone_obc,
+ ctx->op);
+ ceph_assert(got);
+ dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
+ } else {
+ snap_oi = &static_snap_oi;
+ }
+ snap_oi->version = ctx->at_version;
+ snap_oi->prior_version = ctx->obs->oi.version;
+ snap_oi->copy_user_bits(ctx->obs->oi);
+
+ _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
+
+ ctx->delta_stats.num_objects++;
+ if (snap_oi->is_dirty()) {
+ ctx->delta_stats.num_objects_dirty++;
+ osd->logger->inc(l_osd_tier_dirty);
+ }
+ if (snap_oi->is_omap())
+ ctx->delta_stats.num_objects_omap++;
+ if (snap_oi->is_cache_pinned())
+ ctx->delta_stats.num_objects_pinned++;
+ if (snap_oi->has_manifest())
+ ctx->delta_stats.num_objects_manifest++;
+ ctx->delta_stats.num_object_clones++;
+ ctx->new_snapset.clones.push_back(coid.snap);
+ ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
+ ctx->new_snapset.clone_snaps[coid.snap] = snaps;
+
+ // clone_overlap should contain an entry for each clone
+ // (an empty interval_set if there is no overlap)
+ ctx->new_snapset.clone_overlap[coid.snap];
+ if (ctx->obs->oi.size)
+ ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
+
+ // log clone
+ dout(10) << " cloning v " << ctx->obs->oi.version
+ << " to " << coid << " v " << ctx->at_version
+ << " snaps=" << snaps
+ << " snapset=" << ctx->new_snapset << dendl;
+ ctx->log.push_back(pg_log_entry_t(
+ pg_log_entry_t::CLONE, coid, ctx->at_version,
+ ctx->obs->oi.version,
+ ctx->obs->oi.user_version,
+ osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
+ encode(snaps, ctx->log.back().snaps);
+
+ ctx->at_version.version++;
+ }
+
+ // update most recent clone_overlap and usage stats
+ if (ctx->new_snapset.clones.size() > 0) {
+ // clone_overlap is the range overlap between the head and each clone.
+ // Check whether the most recent clone still exists: if it has been
+ // evicted it is no longer included in the stats, but its clone_overlap
+ // entry is still present in the snapset, so update the overlap to keep
+ // the accounting consistent.
+ hobject_t last_clone_oid = soid;
+ last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
+ interval_set<uint64_t> &newest_overlap =
+ ctx->new_snapset.clone_overlap.rbegin()->second;
+ ctx->modified_ranges.intersection_of(newest_overlap);
+ if (is_present_clone(last_clone_oid)) {
+ // modified_ranges is still in use by the clone
+ ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
+ }
+ newest_overlap.subtract(ctx->modified_ranges);
+ }
+
+ if (snapc.seq > ctx->new_snapset.seq) {
+ // update snapset with latest snap context
+ ctx->new_snapset.seq = snapc.seq;
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ ctx->new_snapset.snaps = snapc.snaps;
+ } else {
+ ctx->new_snapset.snaps.clear();
+ }
+ }
+ dout(20) << "make_writeable " << soid
+ << " done, snapset=" << ctx->new_snapset << dendl;
+}
+
+
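+// Account for a write of [offset, offset+length) (or a full overwrite):
+// extend the modified ranges, grow oi.size if the write extends past the
+// current end of the object, and bump the write statistics.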
+void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
+ interval_set<uint64_t>& modified, uint64_t offset,
+ uint64_t length, bool write_full)
+{
+ interval_set<uint64_t> ch;
+ if (write_full) {
+ if (oi.size)
+ ch.insert(0, oi.size);
+ } else if (length)
+ ch.insert(offset, length);
+ modified.union_of(ch);
+ if (write_full ||
+ (offset + length > oi.size && length)) {
+ uint64_t new_size = offset + length;
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += new_size;
+ oi.size = new_size;
+ }
+
+ delta_stats.num_wr++;
+ delta_stats.num_wr_kb += shift_round_up(length, 10);
+}
+
+void PrimaryLogPG::truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size)
+{
+ if (oi.size != truncate_size) {
+ delta_stats.num_bytes -= oi.size;
+ delta_stats.num_bytes += truncate_size;
+ oi.size = truncate_size;
+ }
+}
+
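+// Tear down the watchers listed in to_disconnect: drop them from the obc
+// and remove the corresponding Watch state.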
+void PrimaryLogPG::complete_disconnect_watches(
+ ObjectContextRef obc,
+ const list<watch_disconnect_t> &to_disconnect)
+{
+ for (list<watch_disconnect_t>::const_iterator i =
+ to_disconnect.begin();
+ i != to_disconnect.end();
+ ++i) {
+ pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
+ auto watchers_entry = obc->watchers.find(watcher);
+ if (watchers_entry != obc->watchers.end()) {
+ WatchRef watch = watchers_entry->second;
+ dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
+ obc->watchers.erase(watcher);
+ watch->remove(i->send_disconnect);
+ } else {
+ dout(10) << "do_osd_op_effects disconnect failed to find watcher "
+ << watcher << dendl;
+ }
+ }
+}
+
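+// Apply the client-visible side effects gathered while executing the op:
+// watch disconnects and (re)connects, notifies to all watchers, and
+// notify acks, all bound to the requesting client's session.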
+void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
+{
+ entity_name_t entity = ctx->reqid.name;
+ dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
+
+ // disconnects first
+ complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
+
+ ceph_assert(conn);
+
+ auto session = conn->get_priv();
+ if (!session)
+ return;
+
+ for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
+ i != ctx->watch_connects.end();
+ ++i) {
+ pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
+ dout(15) << "do_osd_op_effects applying watch connect on session "
+ << session.get() << " watcher " << watcher << dendl;
+ WatchRef watch;
+ if (ctx->obc->watchers.count(watcher)) {
+ dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
+ << dendl;
+ watch = ctx->obc->watchers[watcher];
+ } else {
+ dout(15) << "do_osd_op_effects new watcher " << watcher
+ << dendl;
+ watch = Watch::makeWatchRef(
+ this, osd, ctx->obc, i->first.timeout_seconds,
+ i->first.cookie, entity, conn->get_peer_addr());
+ ctx->obc->watchers.insert(
+ make_pair(
+ watcher,
+ watch));
+ }
+ watch->connect(conn, i->second);
+ }
+
+ for (list<notify_info_t>::iterator p = ctx->notifies.begin();
+ p != ctx->notifies.end();
+ ++p) {
+ dout(10) << "do_osd_op_effects, notify " << *p << dendl;
+ ConnectionRef conn(ctx->op->get_req()->get_connection());
+ NotifyRef notif(
+ Notify::makeNotifyRef(
+ conn,
+ ctx->reqid.name.num(),
+ p->bl,
+ p->timeout,
+ p->cookie,
+ p->notify_id,
+ ctx->obc->obs.oi.user_version,
+ osd));
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
+ ctx->obc->watchers.begin();
+ i != ctx->obc->watchers.end();
+ ++i) {
+ dout(10) << "starting notify on watch " << i->first << dendl;
+ i->second->start_notify(notif);
+ }
+ notif->init();
+ }
+
+ for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
+ p != ctx->notify_acks.end();
+ ++p) {
+ if (p->watch_cookie)
+ dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
+ else
+ dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
+ ctx->obc->watchers.begin();
+ i != ctx->obc->watchers.end();
+ ++i) {
+ if (i->first.second != entity) continue;
+ if (p->watch_cookie &&
+ *(p->watch_cookie) != i->first.first) continue;
+ dout(10) << "acking notify on watch " << i->first << dendl;
+ i->second->notify_ack(p->notify_id, p->reply_bl);
+ }
+ }
+}
+
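+// Build a PG-unique temporary object name (pgid + role + global id +
+// per-PG sequence number) colocated with the target.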
+hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
+{
+ ostringstream ss;
+ ss << "temp_" << info.pgid << "_" << get_role()
+ << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
+ hobject_t hoid = target.make_temp_hobject(ss.str());
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+hobject_t PrimaryLogPG::get_temp_recovery_object(
+ const hobject_t& target,
+ eversion_t version)
+{
+ ostringstream ss;
+ ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
+ << "_" << version
+ << "_" << info.history.same_interval_since
+ << "_" << target.snap;
+ // pgid + version + interval + snapid is unique, and short
+ hobject_t hoid = target.make_temp_hobject(ss.str());
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
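+// Execute the ops for this context and, if anything was mutated, perform
+// the full-pool check, clone the head if required by the snap context,
+// and finalize the log entry and object attrs via finish_ctx().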
+int PrimaryLogPG::prepare_transaction(OpContext *ctx)
+{
+ ceph_assert(!ctx->ops->empty());
+
+ // valid snap context?
+ if (!ctx->snapc.is_valid()) {
+ dout(10) << " invalid snapc " << ctx->snapc << dendl;
+ return -EINVAL;
+ }
+
+ // prepare the actual mutation
+ int result = do_osd_ops(ctx, *ctx->ops);
+ if (result < 0) {
+ if (ctx->op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ // need to save the error code in the pg log, to detect dup ops,
+ // but do nothing else
+ ctx->update_log_only = true;
+ }
+ return result;
+ }
+
+ // read-op? write-op noop? done?
+ if (ctx->op_t->empty() && !ctx->modify) {
+ if (ctx->pending_async_reads.empty())
+ unstable_stats.add(ctx->delta_stats);
+ if (ctx->op->may_write() &&
+ get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ ctx->update_log_only = true;
+ }
+ return result;
+ }
+
+ // check for full
+ if ((ctx->delta_stats.num_bytes > 0 ||
+ ctx->delta_stats.num_objects > 0) && // FIXME: keys?
+ pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
+ auto m = ctx->op->get_req<MOSDOp>();
+ if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
+ m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
+ dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
+ << dendl;
+ } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
+ // they tried, they failed.
+ dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
+ return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
+ } else {
+ // drop request
+ dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
+ return -EAGAIN;
+ }
+ }
+
+ const hobject_t& soid = ctx->obs->oi.soid;
+ // clone, if necessary
+ if (soid.snap == CEPH_NOSNAP)
+ make_writeable(ctx);
+
+ finish_ctx(ctx,
+ ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
+ pg_log_entry_t::DELETE,
+ result);
+
+ return result;
+}
+
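+// Finalize the op: bump the user version for user-visible modifies,
+// persist the object_info_t (and, for the head, the SnapSet) attrs or
+// reset the cached oi on delete, append the pg log entry with per-op
+// return values and clean_regions, and install the new object state.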
+void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
+{
+ const hobject_t& soid = ctx->obs->oi.soid;
+ dout(20) << __func__ << " " << soid << " " << ctx
+ << " op " << pg_log_entry_t::get_op_name(log_op_type)
+ << dendl;
+ utime_t now = ceph_clock_now();
+
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto finish_ctx_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+ // Drop the reference if deduped chunk is modified
+ if (ctx->new_obs.oi.is_dirty() &&
+ (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
+ // If a clone is being created, do not drop the reference for the manifest object
+ !ctx->delta_stats.num_object_clones &&
+ ctx->new_obs.oi.size != 0 && // missing, redirect and delete
+ !ctx->cache_operation &&
+ log_op_type != pg_log_entry_t::PROMOTE) {
+ dec_refcount_by_dirty(ctx);
+ }
+
+ // finish and log the op.
+ if (ctx->user_modify) {
+ // update the user_version for any modify ops, except for the watch op
+ ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
+ /* In order for new clients and old clients to interoperate properly
+ * when exchanging versions, we need to lower bound the user_version
+ * (which our new clients pay proper attention to)
+ * by the at_version (which is all the old clients can ever see). */
+ if (ctx->at_version.version > ctx->user_at_version)
+ ctx->user_at_version = ctx->at_version.version;
+ ctx->new_obs.oi.user_version = ctx->user_at_version;
+ }
+ ctx->bytes_written = ctx->op_t->get_bytes_written();
+
+ if (ctx->new_obs.exists) {
+ ctx->new_obs.oi.version = ctx->at_version;
+ ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
+ ctx->new_obs.oi.last_reqid = ctx->reqid;
+ if (ctx->mtime != utime_t()) {
+ ctx->new_obs.oi.mtime = ctx->mtime;
+ dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
+ ctx->new_obs.oi.local_mtime = now;
+ } else {
+ dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
+ }
+
+ // object_info_t
+ map <string, bufferlist> attrs;
+ bufferlist bv(sizeof(ctx->new_obs.oi));
+ encode(ctx->new_obs.oi, bv,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ attrs[OI_ATTR] = std::move(bv);
+
+ // snapset
+ if (soid.snap == CEPH_NOSNAP) {
+ dout(10) << " final snapset " << ctx->new_snapset
+ << " in " << soid << dendl;
+ bufferlist bss;
+ encode(ctx->new_snapset, bss);
+ attrs[SS_ATTR] = std::move(bss);
+ } else {
+ dout(10) << " no snapset (this is a clone)" << dendl;
+ }
+ ctx->op_t->setattrs(soid, attrs);
+ } else {
+ // reset cached oi
+ ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
+ }
+
+ // append to log
+ ctx->log.push_back(
+ pg_log_entry_t(log_op_type, soid, ctx->at_version,
+ ctx->obs->oi.version,
+ ctx->user_at_version, ctx->reqid,
+ ctx->mtime,
+ (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
+ if (ctx->op && ctx->op->allows_returnvec()) {
+ // also the per-op values
+ ctx->log.back().set_op_returns(*ctx->ops);
+ dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
+ << dendl;
+ }
+
+ ctx->log.back().clean_regions = ctx->clean_regions;
+ dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
+
+ if (soid.snap < CEPH_NOSNAP) {
+ switch (log_op_type) {
+ case pg_log_entry_t::MODIFY:
+ case pg_log_entry_t::PROMOTE:
+ case pg_log_entry_t::CLEAN:
+ dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
+ << dendl;
+ encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!ctx->extra_reqids.empty()) {
+ dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
+ << ctx->extra_reqid_return_codes << dendl;
+ ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
+ ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
+ }
+
+ // apply new object state.
+ ctx->obc->obs = ctx->new_obs;
+
+ if (soid.is_head() && !ctx->obc->obs.exists) {
+ ctx->obc->ssc->exists = false;
+ ctx->obc->ssc->snapset = SnapSet();
+ } else {
+ ctx->obc->ssc->exists = true;
+ ctx->obc->ssc->snapset = ctx->new_snapset;
+ }
+}
+
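+// Fold the op's stat delta into the PG stats, queue it for any backfill
+// peers that have not yet backfilled past this object, and inform the
+// scrubber.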
+void PrimaryLogPG::apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) {
+
+ recovery_state.apply_op_stats(soid, delta_stats);
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
+ if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
+ pending_backfill_updates[soid].stats.add(delta_stats);
+ }
+ }
+
+ m_scrubber->stats_of_handled_objects(delta_stats, soid);
+}
+
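+// Build and send the MOSDOpReply for a read-only op once all async reads
+// have completed, then close the op context.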
+void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
+{
+ auto m = ctx->op->get_req<MOSDOp>();
+ ceph_assert(ctx->async_reads_complete());
+
+ for (auto p = ctx->ops->begin();
+ p != ctx->ops->end() && result >= 0; ++p) {
+ if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
+ result = p->rval;
+ break;
+ }
+ ctx->bytes_read += p->outdata.length();
+ }
+ ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
+
+ MOSDOpReply *reply = ctx->reply;
+ ctx->reply = nullptr;
+
+ if (result >= 0) {
+ if (!ctx->ignore_log_op_stats) {
+ log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
+
+ publish_stats_to_osd();
+ }
+
+ // on read, return the current object version
+ if (ctx->obs) {
+ reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
+ } else {
+ reply->set_reply_versions(eversion_t(), ctx->user_at_version);
+ }
+ } else if (result == -ENOENT) {
+ // on ENOENT, set a floor for what the next user version will be.
+ reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
+ }
+
+ reply->set_result(result);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ osd->send_message_osd_client(reply, m->get_connection());
+ close_op_ctx(ctx);
+}
+
+// ========================================================================
+// copyfrom
+
+struct C_Copyfrom : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
+ C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::CopyOpRef& c)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), cop(c)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock l{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->process_copy_chunk(oid, tid, r);
+ cop.reset();
+ }
+ }
+};
+
+struct C_CopyFrom_AsyncReadCb : public Context {
+ OSDOp *osd_op;
+ object_copy_data_t reply_obj;
+ uint64_t features;
+ size_t len;
+ C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
+ osd_op(osd_op), features(features), len(0) {}
+ void finish(int r) override {
+ osd_op->rval = r;
+ if (r < 0) {
+ return;
+ }
+
+ ceph_assert(len > 0);
+ ceph_assert(len <= reply_obj.data.length());
+ bufferlist bl;
+ bl.substr_of(reply_obj.data, 0, len);
+ reply_obj.data.swap(bl);
+ encode(reply_obj, osd_op->outdata, features);
+ }
+};
+
+struct C_CopyChunk : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
+ uint64_t offset = 0;
+ C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
+ const PrimaryLogPG::CopyOpRef& c)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), cop(c)
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock l{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->process_copy_chunk_manifest(oid, tid, r, offset);
+ cop.reset();
+ }
+ }
+};
+
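+// Source-side handler for COPY_GET: fill an object_copy_data_t with
+// size/mtime/snap info, attrs, and as much data and omap as fits within
+// out_max, advancing the client-supplied cursor.  On erasure-coded pools
+// the data portion is read asynchronously and encoded in the callback.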
+int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
+ OSDOp& osd_op, ObjectContextRef &obc)
+{
+ object_info_t& oi = obc->obs.oi;
+ hobject_t& soid = oi.soid;
+ int result = 0;
+ object_copy_cursor_t cursor;
+ uint64_t out_max;
+ try {
+ decode(cursor, bp);
+ decode(out_max, bp);
+ }
+ catch (ceph::buffer::error& e) {
+ result = -EINVAL;
+ return result;
+ }
+
+ const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
+ uint64_t features = op->get_features();
+
+ bool async_read_started = false;
+ object_copy_data_t _reply_obj;
+ C_CopyFrom_AsyncReadCb *cb = nullptr;
+ if (pool.info.is_erasure()) {
+ cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
+ }
+ object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
+ // size, mtime
+ reply_obj.size = oi.size;
+ reply_obj.mtime = oi.mtime;
+ ceph_assert(obc->ssc);
+ if (soid.snap < CEPH_NOSNAP) {
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
+ reply_obj.snaps = p->second;
+ } else {
+ reply_obj.snap_seq = obc->ssc->snapset.seq;
+ }
+ if (oi.is_data_digest()) {
+ reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
+ reply_obj.data_digest = oi.data_digest;
+ }
+ if (oi.is_omap_digest()) {
+ reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
+ reply_obj.omap_digest = oi.omap_digest;
+ }
+ reply_obj.truncate_seq = oi.truncate_seq;
+ reply_obj.truncate_size = oi.truncate_size;
+
+ // attrs
+ map<string,bufferlist>& out_attrs = reply_obj.attrs;
+ if (!cursor.attr_complete) {
+ result = getattrs_maybe_cache(
+ ctx->obc,
+ &out_attrs);
+ if (result < 0) {
+ if (cb) {
+ delete cb;
+ }
+ return result;
+ }
+ cursor.attr_complete = true;
+ dout(20) << " got attrs" << dendl;
+ }
+
+ int64_t left = out_max - osd_op.outdata.length();
+
+ // data
+ bufferlist& bl = reply_obj.data;
+ if (left > 0 && !cursor.data_complete) {
+ if (cursor.data_offset < oi.size) {
+ uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
+ if (cb) {
+ async_read_started = true;
+ ctx->pending_async_reads.push_back(
+ make_pair(
+ boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
+ make_pair(&bl, cb)));
+ cb->len = max_read;
+
+ ctx->op_finishers[ctx->current_osd_subop_num].reset(
+ new ReadFinisher(osd_op));
+ result = -EINPROGRESS;
+
+ dout(10) << __func__ << ": async_read noted for " << soid << dendl;
+ } else {
+ result = pgbackend->objects_read_sync(
+ oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
+ if (result < 0)
+ return result;
+ }
+ left -= max_read;
+ cursor.data_offset += max_read;
+ }
+ if (cursor.data_offset == oi.size) {
+ cursor.data_complete = true;
+ dout(20) << " got data" << dendl;
+ }
+ ceph_assert(cursor.data_offset <= oi.size);
+ }
+
+ // omap
+ uint32_t omap_keys = 0;
+ if (!pool.info.supports_omap() || !oi.is_omap()) {
+ cursor.omap_complete = true;
+ } else {
+ if (left > 0 && !cursor.omap_complete) {
+ ceph_assert(cursor.data_complete);
+ if (cursor.omap_offset.empty()) {
+ osd->store->omap_get_header(ch, ghobject_t(oi.soid),
+ &reply_obj.omap_header);
+ }
+ bufferlist omap_data;
+ ObjectMap::ObjectMapIterator iter =
+ osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
+ ceph_assert(iter);
+ iter->upper_bound(cursor.omap_offset);
+ for (; iter->valid(); iter->next()) {
+ ++omap_keys;
+ encode(iter->key(), omap_data);
+ encode(iter->value(), omap_data);
+ left -= iter->key().length() + 4 + iter->value().length() + 4;
+ if (left <= 0)
+ break;
+ }
+ if (omap_keys) {
+ encode(omap_keys, reply_obj.omap_data);
+ reply_obj.omap_data.claim_append(omap_data);
+ }
+ if (iter->valid()) {
+ cursor.omap_offset = iter->key();
+ } else {
+ cursor.omap_complete = true;
+ dout(20) << " got omap" << dendl;
+ }
+ }
+ }
+
+ if (cursor.is_complete()) {
+ // include reqids only in the final step. this is a bit fragile
+ // but it works...
+ recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
+ &reply_obj.reqids,
+ &reply_obj.reqid_return_codes);
+ dout(20) << " got reqids" << dendl;
+ }
+
+ dout(20) << " cursor.is_complete=" << cursor.is_complete()
+ << " " << out_attrs.size() << " attrs"
+ << " " << bl.length() << " bytes"
+ << " " << reply_obj.omap_header.length() << " omap header bytes"
+ << " " << reply_obj.omap_data.length() << " omap data bytes in "
+ << omap_keys << " keys"
+ << " " << reply_obj.reqids.size() << " reqids"
+ << dendl;
+ reply_obj.cursor = cursor;
+ if (!async_read_started) {
+ encode(reply_obj, osd_op.outdata, features);
+ }
+ if (cb && !async_read_started) {
+ delete cb;
+ }
+
+ if (result > 0) {
+ result = 0;
+ }
+ return result;
+}
+
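+// Reply to a COPY_GET on a nonexistent object with -ENOENT, still
+// including the recent request ids so the caller can detect dup ops.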
+void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+ OSDOp& osd_op)
+{
+ const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
+ uint64_t features = m->get_features();
+ object_copy_data_t reply_obj;
+
+ recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
+ &reply_obj.reqid_return_codes);
+ dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
+ encode(reply_obj, osd_op.outdata, features);
+ osd_op.rval = -ENOENT;
+ MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
+ reply->set_result(-ENOENT);
+ reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
+ osd->send_message_osd_client(reply, m->get_connection());
+}
+
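+// Begin a copy-from into 'dest': cancel any copy already in flight for
+// the same object, register the CopyOp, block the obc, and issue the
+// first chunk (plain, redirect, or per-chunk manifest path).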
+void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
+ hobject_t src, object_locator_t oloc,
+ version_t version, unsigned flags,
+ bool mirror_snapset,
+ unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags)
+{
+ const hobject_t& dest = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << dest
+ << " from " << src << " " << oloc << " v" << version
+ << " flags " << flags
+ << (mirror_snapset ? " mirror_snapset" : "")
+ << dendl;
+
+ ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
+
+ // cancel a previous in-progress copy?
+ if (copy_ops.count(dest)) {
+ // FIXME: if the src etc match, we could avoid restarting from the
+ // beginning.
+ CopyOpRef cop = copy_ops[dest];
+ vector<ceph_tid_t> tids;
+ cancel_copy(cop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ }
+
+ CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
+ mirror_snapset, src_obj_fadvise_flags,
+ dest_obj_fadvise_flags));
+ copy_ops[dest] = cop;
+ obc->start_block();
+
+ if (!obc->obs.oi.has_manifest()) {
+ _copy_some(obc, cop);
+ } else {
+ if (obc->obs.oi.manifest.is_redirect()) {
+ _copy_some(obc, cop);
+ } else if (obc->obs.oi.manifest.is_chunked()) {
+ auto p = obc->obs.oi.manifest.chunk_map.begin();
+ _copy_some_manifest(obc, cop, p->first);
+ } else {
+ ceph_abort_msg("unrecognized manifest type");
+ }
+ }
+}
+
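+// Issue the next COPY_GET chunk to the source via the objecter; on the
+// first chunk of a mirror_snapset copy, also list the source's snaps.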
+void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
+{
+ dout(10) << __func__ << " " << *obc << " " << cop << dendl;
+
+ unsigned flags = 0;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
+ flags |= CEPH_OSD_FLAG_FLUSH;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
+ flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
+ flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
+ flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+
+ C_GatherBuilder gather(cct);
+
+ if (cop->cursor.is_initial() && cop->mirror_snapset) {
+ // list snaps too.
+ ceph_assert(cop->src.snap == CEPH_NOSNAP);
+ ObjectOperation op;
+ op.list_snaps(&cop->results.snapset, NULL);
+ ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
+ CEPH_SNAPDIR, NULL,
+ flags, gather.new_sub(), NULL);
+ cop->objecter_tid2 = tid;
+ }
+
+ ObjectOperation op;
+ if (cop->results.user_version) {
+ op.assert_version(cop->results.user_version);
+ } else {
+ // we should learn the version after the first chunk, if we didn't know
+ // it already!
+ ceph_assert(cop->cursor.is_initial());
+ }
+ op.copy_get(&cop->cursor, get_copy_chunk_size(),
+ &cop->results.object_size, &cop->results.mtime,
+ &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
+ &cop->results.snaps, &cop->results.snap_seq,
+ &cop->results.flags,
+ &cop->results.source_data_digest,
+ &cop->results.source_omap_digest,
+ &cop->results.reqids,
+ &cop->results.reqid_return_codes,
+ &cop->results.truncate_seq,
+ &cop->results.truncate_size,
+ &cop->rval);
+ op.set_last_op_flags(cop->src_obj_fadvise_flags);
+
+ C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
+ get_last_peering_reset(), cop);
+ gather.set_finisher(new C_OnFinisher(fin,
+ osd->get_objecter_finisher(get_pg_shard())));
+
+ ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
+ cop->src.snap, NULL,
+ flags,
+ gather.new_sub(),
+ // discover the object version if we don't know it yet
+ cop->results.user_version ? NULL : &cop->results.user_version);
+ fin->tid = tid;
+ cop->objecter_tid = tid;
+ gather.activate();
+}
+
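+// For chunked manifest objects, issue reads against the chunk targets
+// starting at start_offset, batching roughly get_copy_chunk_size() bytes
+// worth of chunks per pass.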
+void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
+{
+ dout(10) << __func__ << " " << *obc << " " << cop << dendl;
+
+ unsigned flags = 0;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
+ flags |= CEPH_OSD_FLAG_FLUSH;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
+ flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
+ flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
+ flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
+ if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
+ flags |= CEPH_OSD_FLAG_RWORDERED;
+
+ int num_chunks = 0;
+ uint64_t last_offset = 0, chunks_size = 0;
+ object_manifest_t *manifest = &obc->obs.oi.manifest;
+ map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
+ for (;iter != manifest->chunk_map.end(); ++iter) {
+ num_chunks++;
+ chunks_size += iter->second.length;
+ last_offset = iter->first;
+ if (get_copy_chunk_size() < chunks_size) {
+ break;
+ }
+ }
+
+ cop->num_chunk = num_chunks;
+ cop->start_offset = start_offset;
+ cop->last_offset = last_offset;
+ dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
+ << " start_offset: " << start_offset << " chunks_size: " << chunks_size
+ << " last_offset: " << last_offset << dendl;
+
+ iter = manifest->chunk_map.find(start_offset);
+ for (;iter != manifest->chunk_map.end(); ++iter) {
+ uint64_t obj_offset = iter->first;
+ uint64_t length = manifest->chunk_map[iter->first].length;
+ hobject_t soid = manifest->chunk_map[iter->first].oid;
+ object_locator_t oloc(soid);
+ CopyCallback * cb = NULL;
+ CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
+ cop->results.user_version, cop->flags, cop->mirror_snapset,
+ cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
+ sub_cop->cursor.data_offset = obj_offset;
+ cop->chunk_cops[obj_offset] = sub_cop;
+
+ int s = sub_cop->chunk_ops.size();
+ sub_cop->chunk_ops.resize(s+1);
+ sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
+ sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
+ sub_cop->chunk_ops[s].op.extent.length = length;
+
+ ObjectOperation op;
+ op.dup(sub_cop->chunk_ops);
+
+ if (cop->results.user_version) {
+ op.assert_version(cop->results.user_version);
+ } else {
+ // we should learn the version after the first chunk, if we didn't know
+ // it already!
+ ceph_assert(cop->cursor.is_initial());
+ }
+ op.set_last_op_flags(cop->src_obj_fadvise_flags);
+
+ C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
+ get_last_peering_reset(), cop);
+ fin->offset = obj_offset;
+
+ ceph_tid_t tid = osd->objecter->read(
+ soid.oid, oloc, op,
+ sub_cop->src.snap, NULL,
+ flags,
+ new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
+ // discover the object version if we don't know it yet
+ sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
+ fin->tid = tid;
+ sub_cop->objecter_tid = tid;
+
+ dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
+ << manifest->chunk_map[iter->first].offset
+ << " length: " << length << " pool id: " << oloc.pool
+ << " tid: " << tid << dendl;
+
+ if (last_offset < iter->first) {
+ break;
+ }
+ }
+}
+
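+// Objecter completion for a COPY_GET chunk: accumulate digests and attrs,
+// flush partial data to a temp object and fetch more if the cursor is not
+// yet complete, or verify digests and build the final transaction filler
+// on the last chunk; on error, clean up any partial temp object and
+// complete the copy callback.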
+void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+ if (p == copy_ops.end()) {
+ dout(10) << __func__ << " no copy_op found" << dendl;
+ return;
+ }
+ CopyOpRef cop = p->second;
+ if (tid != cop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != cop " << cop
+ << " tid " << cop->objecter_tid << dendl;
+ return;
+ }
+
+ if (cop->omap_data.length() || cop->omap_header.length())
+ cop->results.has_omap = true;
+
+ if (r >= 0 && !pool.info.supports_omap() &&
+ (cop->omap_data.length() || cop->omap_header.length())) {
+ r = -EOPNOTSUPP;
+ }
+ cop->objecter_tid = 0;
+ cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
+ ObjectContextRef& cobc = cop->obc;
+
+ if (r < 0)
+ goto out;
+
+ ceph_assert(cop->rval >= 0);
+
+ if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
+ // verify snap hasn't been deleted
+ vector<snapid_t>::iterator p = cop->results.snaps.begin();
+ while (p != cop->results.snaps.end()) {
+ // make best effort to sanitize snaps/clones.
+ if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
+ dout(10) << __func__ << " clone snap " << *p << " has been deleted"
+ << dendl;
+ for (vector<snapid_t>::iterator q = p + 1;
+ q != cop->results.snaps.end();
+ ++q)
+ *(q - 1) = *q;
+ cop->results.snaps.resize(cop->results.snaps.size() - 1);
+ } else {
+ ++p;
+ }
+ }
+ if (cop->results.snaps.empty()) {
+ dout(10) << __func__ << " no more snaps for " << oid << dendl;
+ r = -ENOENT;
+ goto out;
+ }
+ }
+
+ ceph_assert(cop->rval >= 0);
+
+ if (!cop->temp_cursor.data_complete) {
+ cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
+ }
+ if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
+ if (cop->omap_header.length()) {
+ cop->results.omap_digest =
+ cop->omap_header.crc32c(cop->results.omap_digest);
+ }
+ if (cop->omap_data.length()) {
+ bufferlist keys;
+ keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
+ cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
+ }
+ }
+
+ if (!cop->temp_cursor.attr_complete) {
+ for (map<string,bufferlist>::iterator p = cop->attrs.begin();
+ p != cop->attrs.end();
+ ++p) {
+ cop->results.attrs[string("_") + p->first] = p->second;
+ }
+ cop->attrs.clear();
+ }
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ if (cop->temp_cursor.is_initial()) {
+ ceph_assert(!cop->results.started_temp_obj);
+ cop->results.started_temp_obj = true;
+ cop->results.temp_oid = generate_temp_object(oid);
+ dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
+ }
+ ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ if (cop->temp_cursor.is_initial()) {
+ ctx->new_temp_oid = cop->results.temp_oid;
+ }
+ _write_copy_chunk(cop, ctx->op_t.get());
+ simple_opc_submit(std::move(ctx));
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(cobc, cop);
+ return;
+ }
+
+ // verify digests?
+ if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
+ dout(20) << __func__ << std::hex
+ << " got digest: rx data 0x" << cop->results.data_digest
+ << " omap 0x" << cop->results.omap_digest
+ << ", source: data 0x" << cop->results.source_data_digest
+ << " omap 0x" << cop->results.source_omap_digest
+ << std::dec
+ << " flags " << cop->results.flags
+ << dendl;
+ }
+ if (cop->results.is_data_digest() &&
+ cop->results.data_digest != cop->results.source_data_digest) {
+ derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
+ << " != source 0x" << cop->results.source_data_digest << std::dec
+ << dendl;
+ osd->clog->error() << info.pgid << " copy from " << cop->src
+ << " to " << cop->obc->obs.oi.soid << std::hex
+ << " data digest 0x" << cop->results.data_digest
+ << " != source 0x" << cop->results.source_data_digest
+ << std::dec;
+ r = -EIO;
+ goto out;
+ }
+ if (cop->results.is_omap_digest() &&
+ cop->results.omap_digest != cop->results.source_omap_digest) {
+ derr << __func__ << std::hex
+ << " omap digest 0x" << cop->results.omap_digest
+ << " != source 0x" << cop->results.source_omap_digest
+ << std::dec << dendl;
+ osd->clog->error() << info.pgid << " copy from " << cop->src
+ << " to " << cop->obc->obs.oi.soid << std::hex
+ << " omap digest 0x" << cop->results.omap_digest
+ << " != source 0x" << cop->results.source_omap_digest
+ << std::dec;
+ r = -EIO;
+ goto out;
+ }
+ if (cct->_conf->osd_debug_inject_copyfrom_error) {
+ derr << __func__ << " injecting copyfrom failure" << dendl;
+ r = -EIO;
+ goto out;
+ }
+
+ cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
+ [this, &cop /* avoid ref cycle */](PGTransaction *t) {
+ ObjectState& obs = cop->obc->obs;
+ if (cop->temp_cursor.is_initial()) {
+ dout(20) << "fill_in_final_tx: writing "
+ << "directly to final object" << dendl;
+ // write directly to final object
+ cop->results.temp_oid = obs.oi.soid;
+ _write_copy_chunk(cop, t);
+ } else {
+ // finish writing to temp object, then move into place
+ dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
+ if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
+	    /* In the redirect manifest case the object already exists in the
+	     * upper tier, so remove the existing object first to avoid a
+	     * conflict when rename() is called.
+	     */
+ t->remove(obs.oi.soid);
+ }
+ _write_copy_chunk(cop, t);
+ t->rename(obs.oi.soid, cop->results.temp_oid);
+ }
+ t->setattrs(obs.oi.soid, cop->results.attrs);
+ });
+
+ dout(20) << __func__ << " success; committing" << dendl;
+
+ out:
+ dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
+ CopyCallbackResults results(r, &cop->results);
+ cop->cb->complete(results);
+
+ copy_ops.erase(cobc->obs.oi.soid);
+ cobc->stop_block();
+
+ if (r < 0 && cop->results.started_temp_obj) {
+ dout(10) << __func__ << " deleting partial temp object "
+ << cop->results.temp_oid << dendl;
+ ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ ctx->op_t->remove(cop->results.temp_oid);
+ ctx->discard_temp_oid = cop->results.temp_oid;
+ simple_opc_submit(std::move(ctx));
+ }
+
+ // cancel and requeue proxy ops on this object
+ if (!r) {
+ cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
+ }
+
+ kick_object_context_blocked(cobc);
+}
+
+void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
+ if (p == copy_ops.end()) {
+ dout(10) << __func__ << " no copy_op found" << dendl;
+ return;
+ }
+ CopyOpRef obj_cop = p->second;
+ CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
+
+ if (tid != chunk_cop->objecter_tid) {
+ dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
+ << " tid " << chunk_cop->objecter_tid << dendl;
+ return;
+ }
+
+ if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
+ r = -EOPNOTSUPP;
+ }
+
+ chunk_cop->objecter_tid = 0;
+  chunk_cop->objecter_tid2 = 0;  // assume this was ordered before us (if it happened)
+ ObjectContextRef& cobc = obj_cop->obc;
+ OSDOp &chunk_data = chunk_cop->chunk_ops[0];
+
+ if (r < 0) {
+ obj_cop->failed = true;
+ goto out;
+ }
+
+ if (obj_cop->failed) {
+ return;
+ }
+ if (!chunk_data.outdata.length()) {
+ r = -EIO;
+ obj_cop->failed = true;
+ goto out;
+ }
+
+ obj_cop->num_chunk--;
+
+  /* check whether all of the copy ops have completed */
+ if (obj_cop->num_chunk) {
+ dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
+ return;
+ }
+
+ {
+ OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
+ if (!ctx->lock_manager.take_write_lock(
+ obj_cop->obc->obs.oi.soid,
+ obj_cop->obc)) {
+      // a recovery op can take the read lock,
+      // so we need to wait for recovery to complete
+ r = -EAGAIN;
+ obj_cop->failed = true;
+ close_op_ctx(ctx.release());
+ goto out;
+ }
+ dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
+
+ PGTransaction *t = ctx->op_t.get();
+ ObjectState& obs = ctx->new_obs;
+ for (auto p : obj_cop->chunk_cops) {
+ OSDOp &sub_chunk = p.second->chunk_ops[0];
+ t->write(cobc->obs.oi.soid,
+ p.second->cursor.data_offset,
+ sub_chunk.outdata.length(),
+ sub_chunk.outdata,
+ p.second->dest_obj_fadvise_flags);
+ dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
+ << " length: " << sub_chunk.outdata.length() << dendl;
+ write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
+ p.second->cursor.data_offset, sub_chunk.outdata.length());
+ obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
+ ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
+ sub_chunk.outdata.clear();
+ }
+ obs.oi.clear_data_digest();
+ ctx->at_version = get_next_version();
+ finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
+ simple_opc_submit(std::move(ctx));
+
+ auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
+ /* check remaining work */
+ if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
+ if (obj_cop->last_offset >= p->first + p->second.length) {
+ for (auto &en : cobc->obs.oi.manifest.chunk_map) {
+ if (obj_cop->last_offset < en.first) {
+ _copy_some_manifest(cobc, obj_cop, en.first);
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ out:
+ dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
+ CopyCallbackResults results(r, &obj_cop->results);
+ obj_cop->cb->complete(results);
+
+ copy_ops.erase(cobc->obs.oi.soid);
+ cobc->stop_block();
+
+ // cancel and requeue proxy ops on this object
+ if (!r) {
+ cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
+ }
+
+ kick_object_context_blocked(cobc);
+}
+
+void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
+ vector<ceph_tid_t> tids;
+ for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
+ it != proxyread_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_read((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
+ it != proxywrite_ops.end();) {
+ if (it->second->soid == oid) {
+ cancel_proxy_write((it++)->second, &tids);
+ } else {
+ ++it;
+ }
+ }
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ kick_proxy_ops_blocked(oid);
+}
+
+void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
+{
+ dout(20) << __func__ << " " << cop
+ << " " << cop->attrs.size() << " attrs"
+ << " " << cop->data.length() << " bytes"
+ << " " << cop->omap_header.length() << " omap header bytes"
+ << " " << cop->omap_data.length() << " omap data bytes"
+ << dendl;
+ if (!cop->temp_cursor.attr_complete) {
+ t->create(cop->results.temp_oid);
+ }
+ if (!cop->temp_cursor.data_complete) {
+ ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
+ cop->cursor.data_offset);
+ if (pool.info.required_alignment() &&
+ !cop->cursor.data_complete) {
+ /**
+       * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
+       * to pick it up on the next pass.
+ */
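+      // Illustrative example (hypothetical numbers): with a required
+      // alignment of 4096 and 10240 bytes buffered, to_trim below is 2048;
+      // we write the first 8192 bytes now and back cursor.data_offset up by
+      // 2048 so the trimmed tail is fetched again on the next pass.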
+ ceph_assert(cop->temp_cursor.data_offset %
+ pool.info.required_alignment() == 0);
+ if (cop->data.length() % pool.info.required_alignment() != 0) {
+ uint64_t to_trim =
+ cop->data.length() % pool.info.required_alignment();
+ bufferlist bl;
+ bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
+ cop->data.swap(bl);
+ cop->cursor.data_offset -= to_trim;
+ ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
+ cop->cursor.data_offset);
+ }
+ }
+ if (cop->data.length()) {
+ t->write(
+ cop->results.temp_oid,
+ cop->temp_cursor.data_offset,
+ cop->data.length(),
+ cop->data,
+ cop->dest_obj_fadvise_flags);
+ }
+ cop->data.clear();
+ }
+ if (pool.info.supports_omap()) {
+ if (!cop->temp_cursor.omap_complete) {
+ if (cop->omap_header.length()) {
+ t->omap_setheader(
+ cop->results.temp_oid,
+ cop->omap_header);
+ cop->omap_header.clear();
+ }
+ if (cop->omap_data.length()) {
+ map<string,bufferlist> omap;
+ bufferlist::const_iterator p = cop->omap_data.begin();
+ decode(omap, p);
+ t->omap_setkeys(cop->results.temp_oid, omap);
+ cop->omap_data.clear();
+ }
+ }
+ } else {
+ ceph_assert(cop->omap_header.length() == 0);
+ ceph_assert(cop->omap_data.length() == 0);
+ }
+ cop->temp_cursor = cop->cursor;
+}
+
+void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
+{
+ OpContext *ctx = cb->ctx;
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+
+ ObjectState& obs = ctx->new_obs;
+ if (obs.exists) {
+ dout(20) << __func__ << ": exists, removing" << dendl;
+ ctx->op_t->remove(obs.oi.soid);
+ } else {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ }
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->results->temp_oid;
+ }
+ cb->results->fill_in_final_tx(ctx->op_t.get());
+
+ // CopyFromCallback fills this in for us
+ obs.oi.user_version = ctx->user_at_version;
+
+ if (cb->results->is_data_digest()) {
+ obs.oi.set_data_digest(cb->results->data_digest);
+ } else {
+ obs.oi.clear_data_digest();
+ }
+ if (cb->results->is_omap_digest()) {
+ obs.oi.set_omap_digest(cb->results->omap_digest);
+ } else {
+ obs.oi.clear_omap_digest();
+ }
+
+ obs.oi.truncate_seq = cb->truncate_seq;
+ obs.oi.truncate_size = cb->truncate_size;
+
+ obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
+ ctx->mtime = utime_t();
+
+ ctx->extra_reqids = cb->results->reqids;
+ ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
+
+ // cache: clear whiteout?
+ if (obs.oi.is_whiteout()) {
+ dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
+ --ctx->delta_stats.num_whiteouts;
+ }
+
+ if (cb->results->has_omap) {
+ dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
+ obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ } else {
+ dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
+ obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ }
+
+ interval_set<uint64_t> ch;
+ if (obs.oi.size > 0)
+ ch.insert(0, obs.oi.size);
+ ctx->modified_ranges.union_of(ch);
+ ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
+
+ if (cb->get_data_size() != obs.oi.size) {
+ ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
+ ctx->delta_stats.num_bytes += obs.oi.size;
+ }
+ ctx->delta_stats.num_wr++;
+ ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
+
+ osd->logger->inc(l_osd_copyfrom);
+}
+
+void PrimaryLogPG::finish_promote(int r, CopyResults *results,
+ ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << soid << " r=" << r
+ << " uv" << results->user_version << dendl;
+
+ if (r == -ECANCELED) {
+ return;
+ }
+
+ if (r != -ENOENT && soid.is_snap()) {
+ if (results->snaps.empty()) {
+ // we must have read "snap" content from the head object in the
+ // base pool. use snap_seq to construct what snaps should be
+      // for this clone (what it was before we evicted the clean clone
+ // from this pool, and what it will be when we flush and the
+ // clone eventually happens in the base pool). we want to use
+ // snaps in (results->snap_seq,soid.snap]
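+      // e.g. (illustrative): with results->snap_seq == 4 and soid.snap == 8,
+      // we keep whichever snaps in clone_snaps fall in (4,8], newest first.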
+ SnapSet& snapset = obc->ssc->snapset;
+ for (auto p = snapset.clone_snaps.rbegin();
+ p != snapset.clone_snaps.rend();
+ ++p) {
+ for (auto snap : p->second) {
+ if (snap > soid.snap) {
+ continue;
+ }
+ if (snap <= results->snap_seq) {
+ break;
+ }
+ results->snaps.push_back(snap);
+ }
+ }
+ }
+
+ dout(20) << __func__ << " snaps " << results->snaps << dendl;
+ filter_snapc(results->snaps);
+
+ dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
+ if (results->snaps.empty()) {
+ dout(20) << __func__
+ << " snaps are empty, clone is invalid,"
+ << " setting r to ENOENT" << dendl;
+ r = -ENOENT;
+ }
+ }
+
+ if (r < 0 && results->started_temp_obj) {
+ dout(10) << __func__ << " abort; will clean up partial work" << dendl;
+ ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
+ ceph_assert(tempobc);
+ OpContextUPtr ctx = simple_opc_create(tempobc);
+ ctx->op_t->remove(results->temp_oid);
+ simple_opc_submit(std::move(ctx));
+ results->started_temp_obj = false;
+ }
+
+ if (r == -ENOENT && soid.is_snap()) {
+ dout(10) << __func__
+ << ": enoent while trying to promote clone, " << soid
+ << " must have been trimmed, removing from snapset"
+ << dendl;
+ hobject_t head(soid.get_head());
+ ObjectContextRef obc = get_object_context(head, false);
+ ceph_assert(obc);
+
+ OpContextUPtr tctx = simple_opc_create(obc);
+ tctx->at_version = get_next_version();
+ if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
+ filter_snapc(tctx->new_snapset.snaps);
+ } else {
+ tctx->new_snapset.snaps.clear();
+ }
+ vector<snapid_t> new_clones;
+ map<snapid_t, vector<snapid_t>> new_clone_snaps;
+ for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
+ i != tctx->new_snapset.clones.end();
+ ++i) {
+ if (*i != soid.snap) {
+ new_clones.push_back(*i);
+ auto p = tctx->new_snapset.clone_snaps.find(*i);
+ if (p != tctx->new_snapset.clone_snaps.end()) {
+ new_clone_snaps[*i] = p->second;
+ }
+ }
+ }
+ tctx->new_snapset.clones.swap(new_clones);
+ tctx->new_snapset.clone_overlap.erase(soid.snap);
+ tctx->new_snapset.clone_size.erase(soid.snap);
+ tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
+
+ // take RWWRITE lock for duration of our local write. ignore starvation.
+ if (!tctx->lock_manager.take_write_lock(
+ head,
+ obc)) {
+ ceph_abort_msg("problem!");
+ }
+ dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
+
+ finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
+
+ simple_opc_submit(std::move(tctx));
+ return;
+ }
+
+ bool whiteout = false;
+ if (r == -ENOENT) {
+ ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
+ dout(10) << __func__ << " whiteout " << soid << dendl;
+ whiteout = true;
+ }
+
+ if (r < 0 && !whiteout) {
+ derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
+ // pass error to everyone blocked on this object
+ // FIXME: this is pretty sloppy, but at this point we got
+ // something unexpected and don't have many other options.
+ map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
+ waiting_for_blocked_object.find(soid);
+ if (blocked_iter != waiting_for_blocked_object.end()) {
+ while (!blocked_iter->second.empty()) {
+ osd->reply_op_error(blocked_iter->second.front(), r);
+ blocked_iter->second.pop_front();
+ }
+ waiting_for_blocked_object.erase(blocked_iter);
+ }
+ return;
+ }
+
+ osd->promote_finish(results->object_size);
+
+ OpContextUPtr tctx = simple_opc_create(obc);
+ tctx->at_version = get_next_version();
+
+ if (!obc->obs.oi.has_manifest()) {
+ ++tctx->delta_stats.num_objects;
+ }
+ if (soid.snap < CEPH_NOSNAP)
+ ++tctx->delta_stats.num_object_clones;
+ tctx->new_obs.exists = true;
+
+ tctx->extra_reqids = results->reqids;
+ tctx->extra_reqid_return_codes = results->reqid_return_codes;
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
+ tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
+ tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
+ tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
+ tctx->new_obs.oi.manifest.redirect_target = hobject_t();
+ tctx->delta_stats.num_objects_manifest--;
+ if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
+ dec_all_refcount_manifest(obc->obs.oi, tctx.get());
+ }
+ }
+
+ if (whiteout) {
+ // create a whiteout
+ tctx->op_t->create(soid);
+ tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
+ ++tctx->delta_stats.num_whiteouts;
+ dout(20) << __func__ << " creating whiteout on " << soid << dendl;
+ osd->logger->inc(l_osd_tier_whiteout);
+ } else {
+ if (results->has_omap) {
+ dout(10) << __func__ << " setting omap flag on " << soid << dendl;
+ tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
+ ++tctx->delta_stats.num_objects_omap;
+ }
+
+ results->fill_in_final_tx(tctx->op_t.get());
+ if (results->started_temp_obj) {
+ tctx->discard_temp_oid = results->temp_oid;
+ }
+ tctx->new_obs.oi.size = results->object_size;
+ tctx->new_obs.oi.user_version = results->user_version;
+ tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
+ tctx->mtime = utime_t();
+ if (results->is_data_digest()) {
+ tctx->new_obs.oi.set_data_digest(results->data_digest);
+ } else {
+ tctx->new_obs.oi.clear_data_digest();
+ }
+ if (results->object_size)
+ tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
+ if (results->is_omap_digest()) {
+ tctx->new_obs.oi.set_omap_digest(results->omap_digest);
+ } else {
+ tctx->new_obs.oi.clear_omap_digest();
+ }
+ if (results->has_omap)
+ tctx->clean_regions.mark_omap_dirty();
+ tctx->new_obs.oi.truncate_seq = results->truncate_seq;
+ tctx->new_obs.oi.truncate_size = results->truncate_size;
+
+ if (soid.snap != CEPH_NOSNAP) {
+ ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
+ ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
+ ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
+ results->object_size);
+ ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
+
+ tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
+ } else {
+ tctx->delta_stats.num_bytes += results->object_size;
+ }
+ }
+
+ if (results->mirror_snapset) {
+ ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
+ tctx->new_snapset.from_snap_set(
+ results->snapset,
+ get_osdmap()->require_osd_release < ceph_release_t::luminous);
+ }
+ dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
+
+ // take RWWRITE lock for duration of our local write. ignore starvation.
+ if (!tctx->lock_manager.take_write_lock(
+ obc->obs.oi.soid,
+ obc)) {
+ ceph_abort_msg("problem!");
+ }
+ dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
+
+ finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
+
+ simple_opc_submit(std::move(tctx));
+
+ osd->logger->inc(l_osd_tier_promote);
+
+ if (agent_state &&
+ agent_state->is_idle())
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
+ ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << soid << " r=" << r
+ << " uv" << results->user_version << dendl;
+
+ if (r == -ECANCELED || r == -EAGAIN) {
+ return;
+ }
+
+ if (r < 0) {
+ derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
+ // pass error to everyone blocked on this object
+ // FIXME: this is pretty sloppy, but at this point we got
+ // something unexpected and don't have many other options.
+ map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
+ waiting_for_blocked_object.find(soid);
+ if (blocked_iter != waiting_for_blocked_object.end()) {
+ while (!blocked_iter->second.empty()) {
+ osd->reply_op_error(blocked_iter->second.front(), r);
+ blocked_iter->second.pop_front();
+ }
+ waiting_for_blocked_object.erase(blocked_iter);
+ }
+ return;
+ }
+
+ osd->promote_finish(results->object_size);
+ osd->logger->inc(l_osd_tier_promote);
+
+ if (agent_state &&
+ agent_state->is_idle())
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << cop->obc->obs.oi.soid
+ << " from " << cop->src << " " << cop->oloc
+ << " v" << cop->results.user_version << dendl;
+
+ // cancel objecter op, if we can
+ if (cop->objecter_tid) {
+ tids->push_back(cop->objecter_tid);
+ cop->objecter_tid = 0;
+ if (cop->objecter_tid2) {
+ tids->push_back(cop->objecter_tid2);
+ cop->objecter_tid2 = 0;
+ }
+ }
+
+ copy_ops.erase(cop->obc->obs.oi.soid);
+ cop->obc->stop_block();
+
+ kick_object_context_blocked(cop->obc);
+ cop->results.should_requeue = requeue;
+ CopyCallbackResults result(-ECANCELED, &cop->results);
+ cop->cb->complete(result);
+
+ // There may still be an objecter callback referencing this copy op.
+ // That callback will not need the obc since it's been canceled, and
+ // we need the obc reference to go away prior to flush.
+ cop->obc = ObjectContextRef();
+}
+
+void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
+ while (p != copy_ops.end()) {
+ // requeue this op? can I queue up all of them?
+ cancel_copy((p++)->second, requeue, tids);
+ }
+}
+
+
+// ========================================================================
+// flush
+//
+// Flush a dirty object in the cache tier by writing it back to the
+// base tier. The sequence looks like:
+//
+// * send a copy-from operation to the base tier to copy the current
+// version of the object
+// * base tier will pull the object via (perhaps multiple) copy-get(s)
+// * on completion, we check if the object has been modified. if so,
+// just reply with -EAGAIN.
+// * try to take a write lock so we can clear the dirty flag. if this
+// fails, wait and retry
+// * start a repop that clears the bit.
+//
+// If we have to wait, we will retry by coming back through the
+// start_flush method. We check if a flush is already in progress
+// and, if so, try to finish it by rechecking the version and trying
+// to clear the dirty bit.
+//
+// In order for the cache-flush (a write op) to not block the copy-get
+// from reading the object, the client *must* set the SKIPRWLOCKS
+// flag.
+//
+// NOTE: normally writes are strictly ordered for the client, but
+// flushes are special in that they can be reordered with respect to
+// other writes. In particular, we can't have a flush request block
+// an update to the cache pool object!
+
+struct C_Flush : public Context {
+ PrimaryLogPGRef pg;
+ hobject_t oid;
+ epoch_t last_peering_reset;
+ ceph_tid_t tid;
+ utime_t start;
+ C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
+ : pg(p), oid(o), last_peering_reset(lpr),
+ tid(0), start(ceph_clock_now())
+ {}
+ void finish(int r) override {
+ if (r == -ECANCELED)
+ return;
+ std::scoped_lock locker{*pg};
+ if (last_peering_reset == pg->get_last_peering_reset()) {
+ pg->finish_flush(oid, tid, r);
+ pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
+ }
+ }
+};
+
+int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
+{
+ const object_info_t& oi = obc->obs.oi;
+ const hobject_t& soid = oi.soid;
+
+ ceph_assert(obc->is_blocked());
+ if (oi.size == 0) {
+ // evicted
+ return 0;
+ }
+ if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
+ dout(0) << " fingerprint algorithm is not set " << dendl;
+ return -EINVAL;
+ }
+ if (pool.info.get_dedup_tier() <= 0) {
+ dout(10) << " dedup tier is not set " << dendl;
+ return -EINVAL;
+ }
+
+ /*
+   * The operations that create the dedup chunks are tracked by a ManifestOp.
+   * The op is finished once all of those operations have completed.
+ */
+ ManifestOpRef mop(std::make_shared<ManifestOp>(nullptr));
+
+  // content-defined chunking (cdc)
+ std::map<uint64_t, bufferlist> chunks;
+ int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
+ if (r < 0) {
+ return r;
+ }
+ if (!chunks.size()) {
+ return 0;
+ }
+
+  // The chunks issued here differ from the newly generated chunk_map,
+  // because chunks that already exist in the previous snap are not issued.
+  // So we need two data structures: the issued chunk list, which tracks the
+  // issued operations, and the new chunk_map, which is used to update the
+  // object's chunk_map once all of the operations have finished.
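+  // e.g. (illustrative): if the chunks at offsets 0 and 8192 are already
+  // referenced by an adjacent clone, only the remaining chunks are issued
+  // via refcount_manifest() below, while new_manifest.chunk_map still
+  // describes the whole object.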
+ object_ref_delta_t refs;
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(obc, obc_l, obc_g);
+  // skip if the same content exists in the prev snap at the same offset
+ mop->new_manifest.calc_refs_to_inc_on_set(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+
+ for (auto p : chunks) {
+ hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
+ if (refs.find(target) == refs.end()) {
+ continue;
+ }
+ C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
+ ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
+ fin, move(chunks[p.first]));
+ mop->chunks[target] = make_pair(p.first, p.second.length());
+ mop->num_chunks++;
+ mop->tids[p.first] = tid;
+ fin->tid = tid;
+ dout(10) << __func__ << " oid: " << soid << " tid: " << tid
+ << " target: " << target << " offset: " << p.first
+ << " length: " << p.second.length() << dendl;
+ }
+
+ if (mop->tids.size()) {
+ manifest_ops[soid] = mop;
+ manifest_ops[soid]->op = op;
+ } else {
+    // no chunk refcount ops were issued
+ return 0;
+ }
+
+ return -EINPROGRESS;
+}
+
+int PrimaryLogPG::do_cdc(const object_info_t& oi,
+ std::map<uint64_t, chunk_info_t>& chunk_map,
+ std::map<uint64_t, bufferlist>& chunks)
+{
+ string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
+ int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
+ uint64_t total_length = 0;
+
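+  // cbits(chunk_size)-1 turns the configured byte size into a power-of-two
+  // exponent for the chunker; e.g. (illustrative) a dedup_cdc_chunk_size of
+  // 16384 becomes 14, i.e. a 2^14-byte target chunk.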
+ std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
+ if (!cdc) {
+ dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
+ return -EINVAL;
+ }
+
+ bufferlist bl;
+ /**
+   * EC pools are not supported as the base tier for distributed dedup,
+   * because erasure-coded pools do not support objects_read_sync().
+   * Supporting them would require substantial changes to the current
+   * implementation, so we leave that as future work.
+ */
+ int r = pgbackend->objects_read_sync(
+ oi.soid, 0, oi.size, 0, &bl);
+ if (r < 0) {
+ dout(0) << __func__ << " read fail " << oi.soid
+ << " len: " << oi.size << " r: " << r << dendl;
+ return r;
+ }
+ if (bl.length() != oi.size) {
+ dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
+ << oi.size << " during chunking " << dendl;
+ return -EIO;
+ }
+
+ dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
+ << " oi.size: " << oi.size
+ << " chunk_size: " << chunk_size << dendl;
+
+ vector<pair<uint64_t, uint64_t>> cdc_chunks;
+ cdc->calc_chunks(bl, &cdc_chunks);
+
+ // get fingerprint
+ for (auto p : cdc_chunks) {
+ bufferlist chunk;
+ chunk.substr_of(bl, p.first, p.second);
+ auto [ret, target] = get_fpoid_from_chunk(oi.soid, chunk);
+ if (ret < 0) {
+ return ret;
+ }
+ chunks[p.first] = std::move(chunk);
+ chunk_map[p.first] = chunk_info_t(0, p.second, target);
+ total_length += p.second;
+ }
+ return total_length;
+}
+
+std::pair<int, hobject_t> PrimaryLogPG::get_fpoid_from_chunk(
+ const hobject_t soid, bufferlist& chunk)
+{
+ pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
+ if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
+ return make_pair(-EINVAL, hobject_t());
+ }
+ object_t fp_oid = [&fp_algo, &chunk]() -> string {
+ switch (fp_algo) {
+ case pg_pool_t::TYPE_FINGERPRINT_SHA1:
+ return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
+ case pg_pool_t::TYPE_FINGERPRINT_SHA256:
+ return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
+ case pg_pool_t::TYPE_FINGERPRINT_SHA512:
+ return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
+ default:
+ assert(0 == "unrecognized fingerprint type");
+ return {};
+ }
+ }();
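+  // e.g. (illustrative): with SHA1 the chunk's hex digest string becomes the
+  // object name (fp_oid) used below to locate the chunk in the dedup tier.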
+
+ pg_t raw_pg;
+ object_locator_t oloc(soid);
+ oloc.pool = pool.info.get_dedup_tier();
+  // dedup_tier must have been set at this point
+ ceph_assert(oloc.pool > 0);
+ int ret = get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
+ if (ret < 0) {
+ return make_pair(ret, hobject_t());
+ }
+ hobject_t target(fp_oid, oloc.key, snapid_t(),
+ raw_pg.ps(), raw_pg.pool(),
+ oloc.nspace);
+ return make_pair(0, target);
+}
+
+int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
+ if (p == manifest_ops.end()) {
+ dout(10) << __func__ << " no manifest_op found" << dendl;
+ return -EINVAL;
+ }
+ ManifestOpRef mop = p->second;
+ mop->results[offset] = r;
+ if (r < 0) {
+    // if any chunk op fails, record the failure in results[0] so it is recognized below
+ mop->results[0] = r;
+ }
+ if (mop->num_chunks != mop->results.size()) {
+    // there is still work in flight
+ return -EINPROGRESS;
+ }
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc) {
+ if (mop->op)
+ osd->reply_op_error(mop->op, -EINVAL);
+ return -EINVAL;
+ }
+ ceph_assert(obc->is_blocked());
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ if (mop->results[0] < 0) {
+    // check whether any previous chunk op failed
+ ceph_assert(mop->num_chunks == mop->results.size());
+ manifest_ops.erase(oid);
+ osd->reply_op_error(mop->op, mop->results[0]);
+ return -EIO;
+ }
+
+ if (mop->chunks.size()) {
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ceph_assert(ctx);
+ if (ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ mop->op)) {
+ dout(20) << __func__ << " took write lock" << dendl;
+ } else if (mop->op) {
+ dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
+ close_op_ctx(ctx.release());
+ return -EAGAIN;
+ }
+
+ ctx->at_version = get_next_version();
+ ctx->new_obs = obc->obs;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+
+ /*
+ * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
+ * head: [0, 2) aaa <-- tier_flush()
+ * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
+ *
+ * In this case, if the new chunk_map is as follows,
+ * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
+ * we should drop aaa from head by using calc_refs_to_drop_on_removal().
+   * So, the procedure is
+ * 1. calc_refs_to_drop_on_removal()
+ * 2. register old references to drop after tier_flush() is committed
+ * 3. update new chunk_map
+ */
+
+ ObjectCleanRegions c_regions = ctx->clean_regions;
+ ObjectContextRef cobc = get_prev_clone_obc(obc);
+ c_regions.mark_fully_dirty();
+    // CDC was done on the entire range of the manifest object,
+    // so the first thing to do here is drop the references to the old chunks
+ ObjectContextRef obc_l, obc_g;
+ get_adjacent_clones(obc, obc_l, obc_g);
+ // clear all old references
+ object_ref_delta_t refs;
+ ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
+ obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
+ obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
+ refs);
+ if (!refs.is_empty()) {
+ ctx->register_on_commit(
+ [oid, this, refs](){
+ dec_refcount(oid, refs);
+ });
+ }
+
+ // set new references
+ ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
+
+ finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
+ simple_opc_submit(std::move(ctx));
+ }
+ if (mop->op)
+ osd->reply_op_error(mop->op, r);
+
+ manifest_ops.erase(oid);
+ return 0;
+}
+
+int PrimaryLogPG::start_flush(
+ OpRequestRef op, ObjectContextRef obc,
+ bool blocking, hobject_t *pmissing,
+ std::optional<std::function<void()>> &&on_flush)
+{
+ const object_info_t& oi = obc->obs.oi;
+ const hobject_t& soid = oi.soid;
+ dout(10) << __func__ << " " << soid
+ << " v" << oi.version
+ << " uv" << oi.user_version
+ << " " << (blocking ? "blocking" : "non-blocking/best-effort")
+ << dendl;
+
+ bool preoctopus_compat =
+ get_osdmap()->require_osd_release < ceph_release_t::octopus;
+ SnapSet snapset;
+ if (preoctopus_compat) {
+ // for pre-octopus compatibility, filter SnapSet::snaps. not
+ // certain we need this, but let's be conservative.
+ snapset = obc->ssc->snapset.get_filtered(pool.info);
+ } else {
+ // NOTE: change this to a const ref when we remove this compat code
+ snapset = obc->ssc->snapset;
+ }
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
+    // the current dedup tier only supports blocking operations
+ if (!blocking) {
+ return -EOPNOTSUPP;
+ }
+ }
+
+  // verify there are no (older) dirty clones
+ {
+ dout(20) << " snapset " << snapset << dendl;
+ vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
+ while (p != snapset.clones.rend() && *p >= soid.snap)
+ ++p;
+ if (p != snapset.clones.rend()) {
+ hobject_t next = soid;
+ next.snap = *p;
+ ceph_assert(next.snap < soid.snap);
+ if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
+ dout(10) << __func__ << " missing clone is " << next << dendl;
+ if (pmissing)
+ *pmissing = next;
+ return -ENOENT;
+ }
+ ObjectContextRef older_obc = get_object_context(next, false);
+ if (older_obc) {
+ dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
+ << dendl;
+ if (older_obc->obs.oi.is_dirty()) {
+ dout(10) << __func__ << " next oldest clone is dirty: "
+ << older_obc->obs.oi << dendl;
+ return -EBUSY;
+ }
+ } else {
+ dout(20) << __func__ << " next oldest clone " << next
+ << " is not present; implicitly clean" << dendl;
+ }
+ } else {
+ dout(20) << __func__ << " no older clones" << dendl;
+ }
+ }
+
+ if (blocking)
+ obc->start_block();
+
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
+ if (p != flush_ops.end()) {
+ FlushOpRef fop = p->second;
+ if (fop->op == op) {
+ // we couldn't take the write lock on a cache-try-flush before;
+ // now we are trying again for the lock.
+ return try_flush_mark_clean(fop);
+ }
+ if (fop->flushed_version == obc->obs.oi.user_version &&
+ (fop->blocking || !blocking)) {
+ // nonblocking can join anything
+ // blocking can only join a blocking flush
+ dout(20) << __func__ << " piggybacking on existing flush " << dendl;
+ if (op)
+ fop->dup_ops.push_back(op);
+ return -EAGAIN; // clean up this ctx; op will retry later
+ }
+
+ // cancel current flush since it will fail anyway, or because we
+ // are blocking and the existing flush is nonblocking.
+ dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
+ if (fop->op)
+ osd->reply_op_error(fop->op, -EBUSY);
+ while (!fop->dup_ops.empty()) {
+ osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
+ fop->dup_ops.pop_front();
+ }
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ }
+
+ if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
+ int r = start_dedup(op, obc);
+ if (r != -EINPROGRESS) {
+ if (blocking)
+ obc->stop_block();
+ }
+ return r;
+ }
+
+ /**
+ * In general, we need to send a delete and a copyfrom.
+ * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
+ * where 4 is marked as clean. To flush 10, we have to:
+ * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
+ * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
+ *
+   * There is a complicating case.  Suppose there had been a clone 7
+ * for snaps [7, 6] which has been trimmed since they no longer exist.
+ * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
+ * the delete, the snap will be promoted to 5, and the head will become
+ * a whiteout. When the copy-from goes through, we'll end up with
+ * 8:[8,4,3,2]:[4(4,3,2)]+head.
+ *
+ * Another complication is the case where there is an interval change
+ * after doing the delete and the flush but before marking the object
+ * clean. We'll happily delete head and then recreate it at the same
+ * sequence number, which works out ok.
+ */
+
+ SnapContext snapc, dsnapc;
+ if (snapset.seq != 0) {
+ if (soid.snap == CEPH_NOSNAP) {
+ snapc = snapset.get_ssc_as_of(snapset.seq);
+ } else {
+ snapid_t min_included_snap;
+ auto p = snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != snapset.clone_snaps.end());
+ min_included_snap = p->second.back();
+ snapc = snapset.get_ssc_as_of(min_included_snap - 1);
+ }
+
+ snapid_t prev_snapc = 0;
+ for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
+ citer != snapset.clones.rend();
+ ++citer) {
+ if (*citer < soid.snap) {
+ prev_snapc = *citer;
+ break;
+ }
+ }
+
+ dsnapc = snapset.get_ssc_as_of(prev_snapc);
+ }
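+  // e.g., continuing the example in the comment above: flushing clone 10
+  // (snaps [10,9]) gives min_included_snap 9, so snapc is the context as of
+  // snap 8 for the copy-from, while prev_snapc is 4, so dsnapc is the
+  // context as of snap 4 for the preceding delete.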
+
+ object_locator_t base_oloc(soid);
+ base_oloc.pool = pool.info.tier_of;
+
+ if (dsnapc.seq < snapc.seq) {
+ ObjectOperation o;
+ o.remove();
+ osd->objecter->mutate(
+ soid.oid,
+ base_oloc,
+ o,
+ dsnapc,
+ ceph::real_clock::from_ceph_timespec(oi.mtime),
+ (CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_ENFORCE_SNAPC),
+ NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
+ }
+
+ FlushOpRef fop(std::make_shared<FlushOp>());
+ fop->obc = obc;
+ fop->flushed_version = oi.user_version;
+ fop->blocking = blocking;
+ fop->on_flush = std::move(on_flush);
+ fop->op = op;
+
+ ObjectOperation o;
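+  // a whiteout means the cache object is a logical delete; flush it by
+  // deleting the object in the base tier instead of copying it down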
+ if (oi.is_whiteout()) {
+ fop->removal = true;
+ o.remove();
+ } else {
+ object_locator_t oloc(soid);
+ o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+
+    // means the base tier won't cache data after this
+ if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
+ o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+ }
+ C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
+
+ ceph_tid_t tid = osd->objecter->mutate(
+ soid.oid, base_oloc, o, snapc,
+ ceph::real_clock::from_ceph_timespec(oi.mtime),
+ CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
+ new C_OnFinisher(fin,
+ osd->get_objecter_finisher(get_pg_shard())));
+ /* we're under the pg lock and fin->finish() is grabbing that */
+ fin->tid = tid;
+ fop->objecter_tid = tid;
+
+ flush_ops[soid] = fop;
+
+ recovery_state.update_stats(
+ [&oi](auto &history, auto &stats) {
+ stats.stats.sum.num_flush++;
+ stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
+ return false;
+ });
+ return -EINPROGRESS;
+}
+
+void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
+{
+ dout(10) << __func__ << " " << oid << " tid " << tid
+ << " " << cpp_strerror(r) << dendl;
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
+ if (p == flush_ops.end()) {
+ dout(10) << __func__ << " no flush_op found" << dendl;
+ return;
+ }
+ FlushOpRef fop = p->second;
+ if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
+ dout(10) << __func__ << " tid " << tid << " != fop " << fop
+ << " tid " << fop->objecter_tid << dendl;
+ return;
+ }
+ ObjectContextRef obc = fop->obc;
+ fop->objecter_tid = 0;
+
+ if (r < 0 && !(r == -ENOENT && fop->removal)) {
+ if (fop->op)
+ osd->reply_op_error(fop->op, -EBUSY);
+ if (fop->blocking) {
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ }
+
+ if (!fop->dup_ops.empty()) {
+ dout(20) << __func__ << " requeueing dups" << dendl;
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ return;
+ }
+
+ r = try_flush_mark_clean(fop);
+ if (r == -EBUSY && fop->op) {
+ osd->reply_op_error(fop->op, r);
+ }
+}
+
+int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
+{
+ ObjectContextRef obc = fop->obc;
+ const hobject_t& oid = obc->obs.oi.soid;
+
+ if (fop->blocking) {
+ obc->stop_block();
+ kick_object_context_blocked(obc);
+ }
+
+ if (fop->flushed_version != obc->obs.oi.user_version ||
+ !obc->obs.exists) {
+ if (obc->obs.exists)
+ dout(10) << __func__ << " flushed_version " << fop->flushed_version
+ << " != current " << obc->obs.oi.user_version
+ << dendl;
+ else
+ dout(10) << __func__ << " object no longer exists" << dendl;
+
+ if (!fop->dup_ops.empty()) {
+ dout(20) << __func__ << " requeueing dups" << dendl;
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ if (fop->blocking)
+ osd->logger->inc(l_osd_tier_flush_fail);
+ else
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ return -EBUSY;
+ }
+
+ if (!fop->blocking &&
+ m_scrubber->write_blocked_by_scrub(oid)) {
+ if (fop->op) {
+ dout(10) << __func__ << " blocked by scrub" << dendl;
+ requeue_op(fop->op);
+ requeue_ops(fop->dup_ops);
+ return -EAGAIN; // will retry
+ } else {
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ return -ECANCELED;
+ }
+ }
+
+ // successfully flushed, can we evict this object?
+ if (!obc->obs.oi.has_manifest() && !fop->op &&
+ agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
+ agent_maybe_evict(obc, true)) {
+ osd->logger->inc(l_osd_tier_clean);
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(oid);
+ return 0;
+ }
+
+ dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
+ OpContextUPtr ctx = simple_opc_create(fop->obc);
+
+ // successfully flushed; can we clear the dirty bit?
+ // try to take the lock manually, since we don't
+ // have a ctx yet.
+ if (ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ fop->op)) {
+ dout(20) << __func__ << " took write lock" << dendl;
+ } else if (fop->op) {
+ dout(10) << __func__ << " waiting on write lock " << fop->op << " "
+ << fop->dup_ops << dendl;
+ // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
+ for (auto op : fop->dup_ops) {
+ bool locked = ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ oid,
+ obc,
+ op);
+ ceph_assert(!locked);
+ }
+ close_op_ctx(ctx.release());
+ return -EAGAIN; // will retry
+ } else {
+ dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
+ close_op_ctx(ctx.release());
+ osd->logger->inc(l_osd_tier_try_flush_fail);
+ vector<ceph_tid_t> tids;
+ cancel_flush(fop, false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+ return -ECANCELED;
+ }
+
+ if (fop->on_flush) {
+ ctx->register_on_finish(*(fop->on_flush));
+ fop->on_flush = std::nullopt;
+ }
+
+ ctx->at_version = get_next_version();
+
+ ctx->new_obs = obc->obs;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
+ --ctx->delta_stats.num_objects_dirty;
+ if (fop->obc->obs.oi.has_manifest()) {
+ ceph_assert(obc->obs.oi.manifest.is_chunked());
+ PGTransaction* t = ctx->op_t.get();
+ uint64_t chunks_size = 0;
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ chunks_size += p.second.length;
+ }
+ if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
+ t->omap_clear(oid);
+ ctx->new_obs.oi.clear_omap_digest();
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
+ ctx->clean_regions.mark_omap_dirty();
+ }
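+    // If the object size equals the sum of the chunk lengths, the whole
+    // object is covered by the manifest: truncate the local data and mark
+    // every chunk MISSING.  Otherwise keep the data and mark the chunks
+    // CLEAN.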
+ if (obc->obs.oi.size == chunks_size) {
+ t->truncate(oid, 0);
+ interval_set<uint64_t> trim;
+ trim.insert(0, ctx->new_obs.oi.size);
+ ctx->modified_ranges.union_of(trim);
+ truncate_update_size_and_usage(ctx->delta_stats,
+ ctx->new_obs.oi,
+ 0);
+ ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
+ ctx->new_obs.oi.new_object();
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ p.second.set_flag(chunk_info_t::FLAG_MISSING);
+ }
+ } else {
+ for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
+ dout(20) << __func__ << " offset: " << p.second.offset
+ << " length: " << p.second.length << dendl;
+ p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
+ }
+ }
+ }
+
+ finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
+
+ osd->logger->inc(l_osd_tier_clean);
+
+ if (!fop->dup_ops.empty() || fop->op) {
+ dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
+ list<OpRequestRef> ls;
+ if (fop->op)
+ ls.push_back(fop->op);
+ ls.splice(ls.end(), fop->dup_ops);
+ requeue_ops(ls);
+ }
+
+ simple_opc_submit(std::move(ctx));
+
+ flush_ops.erase(oid);
+
+ if (fop->blocking)
+ osd->logger->inc(l_osd_tier_flush);
+ else
+ osd->logger->inc(l_osd_tier_try_flush);
+
+ return -EINPROGRESS;
+}
+
+void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
+ vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
+ << fop->objecter_tid << dendl;
+ if (fop->objecter_tid) {
+ tids->push_back(fop->objecter_tid);
+ fop->objecter_tid = 0;
+ }
+ if (fop->io_tids.size()) {
+ for (auto &p : fop->io_tids) {
+ tids->push_back(p.second);
+ p.second = 0;
+ }
+ }
+ if (fop->blocking && fop->obc->is_blocked()) {
+ fop->obc->stop_block();
+ kick_object_context_blocked(fop->obc);
+ }
+ if (requeue) {
+ if (fop->op)
+ requeue_op(fop->op);
+ requeue_ops(fop->dup_ops);
+ }
+ if (fop->on_flush) {
+ (*(fop->on_flush))();
+ fop->on_flush = std::nullopt;
+ }
+ flush_ops.erase(fop->obc->obs.oi.soid);
+}
+
+void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
+{
+ dout(10) << __func__ << dendl;
+ map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
+ while (p != flush_ops.end()) {
+ cancel_flush((p++)->second, requeue, tids);
+ }
+}
+
+bool PrimaryLogPG::is_present_clone(hobject_t coid)
+{
+ if (!pool.info.allow_incomplete_clones())
+ return true;
+ if (is_missing_object(coid))
+ return true;
+ ObjectContextRef obc = get_object_context(coid, false);
+ return obc && obc->obs.exists;
+}
+
+// ========================================================================
+// rep op gather
+
+class C_OSD_RepopCommit : public Context {
+ PrimaryLogPGRef pg;
+ boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
+public:
+ C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
+ : pg(pg), repop(repop) {}
+ void finish(int) override {
+ pg->repop_all_committed(repop.get());
+ }
+};
+
+void PrimaryLogPG::repop_all_committed(RepGather *repop)
+{
+ dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
+ << dendl;
+ repop->all_committed = true;
+ if (!repop->rep_aborted) {
+ if (repop->v != eversion_t()) {
+ recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
+ }
+ eval_repop(repop);
+ }
+}
+
+void PrimaryLogPG::op_applied(const eversion_t &applied_version)
+{
+ dout(10) << "op_applied version " << applied_version << dendl;
+ ceph_assert(applied_version != eversion_t());
+ ceph_assert(applied_version <= info.last_update);
+ recovery_state.local_write_applied(applied_version);
+
+ if (is_primary() && m_scrubber) {
+    // if there's a scrub operation waiting for the selected chunk to be
+    // fully updated, allow it to continue
+ m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
+ }
+}
+
+void PrimaryLogPG::eval_repop(RepGather *repop)
+{
+ #ifdef HAVE_JAEGER
+ if (repop->op->osd_parent_span) {
+ auto eval_span = jaeger_tracing::child_span(__func__, repop->op->osd_parent_span);
+ }
+ #endif
+ dout(10) << "eval_repop " << *repop
+ << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
+
+ // ondisk?
+ if (repop->all_committed) {
+ dout(10) << " commit: " << *repop << dendl;
+ for (auto p = repop->on_committed.begin();
+ p != repop->on_committed.end();
+ repop->on_committed.erase(p++)) {
+ (*p)();
+ }
+ // send dup commits, in order
+ auto it = waiting_for_ondisk.find(repop->v);
+ if (it != waiting_for_ondisk.end()) {
+ ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
+ for (auto& i : it->second) {
+ int return_code = repop->r;
+ if (return_code >= 0) {
+ return_code = std::get<2>(i);
+ }
+ osd->reply_op_error(std::get<0>(i), return_code, repop->v,
+ std::get<1>(i), std::get<3>(i));
+ }
+ waiting_for_ondisk.erase(it);
+ }
+
+ publish_stats_to_osd();
+
+ dout(10) << " removing " << *repop << dendl;
+ ceph_assert(!repop_queue.empty());
+ dout(20) << " q front is " << *repop_queue.front() << dendl;
+ if (repop_queue.front() == repop) {
+ RepGather *to_remove = nullptr;
+ while (!repop_queue.empty() &&
+ (to_remove = repop_queue.front())->all_committed) {
+ repop_queue.pop_front();
+ for (auto p = to_remove->on_success.begin();
+ p != to_remove->on_success.end();
+ to_remove->on_success.erase(p++)) {
+ (*p)();
+ }
+ remove_repop(to_remove);
+ }
+ }
+ }
+}
+
+void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
+{
+ FUNCTRACE(cct);
+ const hobject_t& soid = ctx->obs->oi.soid;
+ dout(7) << "issue_repop rep_tid " << repop->rep_tid
+ << " o " << soid
+ << dendl;
+#ifdef HAVE_JAEGER
+ if (ctx->op->osd_parent_span) {
+ auto issue_repop_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
+ }
+#endif
+
+ repop->v = ctx->at_version;
+
+ ctx->op_t->add_obc(ctx->obc);
+ if (ctx->clone_obc) {
+ ctx->op_t->add_obc(ctx->clone_obc);
+ }
+ if (ctx->head_obc) {
+ ctx->op_t->add_obc(ctx->head_obc);
+ }
+
+ Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
+ if (!(ctx->log.empty())) {
+ ceph_assert(ctx->at_version >= projected_last_update);
+ projected_last_update = ctx->at_version;
+ }
+ for (auto &&entry: ctx->log) {
+ projected_log.add(entry);
+ }
+
+ recovery_state.pre_submit_op(
+ soid,
+ ctx->log,
+ ctx->at_version);
+ pgbackend->submit_transaction(
+ soid,
+ ctx->delta_stats,
+ ctx->at_version,
+ std::move(ctx->op_t),
+ recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk(),
+ std::move(ctx->log),
+ ctx->updated_hset_history,
+ on_all_commit,
+ repop->rep_tid,
+ ctx->reqid,
+ ctx->op);
+}
+
+PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
+ OpContext *ctx, ObjectContextRef obc,
+ ceph_tid_t rep_tid)
+{
+ if (ctx->op)
+ dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
+ else
+ dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
+
+ RepGather *repop = new RepGather(
+ ctx, rep_tid, info.last_complete);
+
+ repop->start = ceph_clock_now();
+
+ repop_queue.push_back(&repop->queue_item);
+ repop->get();
+
+ osd->logger->inc(l_osd_op_wip);
+
+ dout(10) << __func__ << ": " << *repop << dendl;
+ return repop;
+}
+
+boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
+ eversion_t version,
+ int r,
+ ObcLockManager &&manager,
+ OpRequestRef &&op,
+ std::optional<std::function<void(void)> > &&on_complete)
+{
+ RepGather *repop = new RepGather(
+ std::move(manager),
+ std::move(op),
+ std::move(on_complete),
+ osd->get_tid(),
+ info.last_complete,
+ r);
+ repop->v = version;
+
+ repop->start = ceph_clock_now();
+
+ repop_queue.push_back(&repop->queue_item);
+
+ osd->logger->inc(l_osd_op_wip);
+
+ dout(10) << __func__ << ": " << *repop << dendl;
+ return boost::intrusive_ptr<RepGather>(repop);
+}
+
+void PrimaryLogPG::remove_repop(RepGather *repop)
+{
+ dout(20) << __func__ << " " << *repop << dendl;
+
+ for (auto p = repop->on_finish.begin();
+ p != repop->on_finish.end();
+ repop->on_finish.erase(p++)) {
+ (*p)();
+ }
+
+ release_object_locks(
+ repop->lock_manager);
+ repop->put();
+
+ osd->logger->dec(l_osd_op_wip);
+}
+
+PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
+{
+ dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
+ ceph_tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
+ ctx->op_t.reset(new PGTransaction());
+ ctx->mtime = ceph_clock_now();
+ return ctx;
+}
+
+void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
+{
+ RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
+ dout(20) << __func__ << " " << repop << dendl;
+ issue_repop(repop, ctx.get());
+ eval_repop(repop);
+ recovery_state.update_trim_to();
+ repop->put();
+}
+
+
+void PrimaryLogPG::submit_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObcLockManager &&manager,
+ std::optional<std::function<void(void)> > &&_on_complete,
+ OpRequestRef op,
+ int r)
+{
+ dout(10) << __func__ << " " << entries << dendl;
+ ceph_assert(is_primary());
+
+ eversion_t version;
+ if (!entries.empty()) {
+ ceph_assert(entries.rbegin()->version >= projected_last_update);
+ version = projected_last_update = entries.rbegin()->version;
+ }
+
+ boost::intrusive_ptr<RepGather> repop;
+ std::optional<std::function<void(void)> > on_complete;
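+  // When require_osd_release >= jewel, log entries are pushed to peers with
+  // MOSDPGUpdateLogMissing and tracked through a RepGather; otherwise the
+  // lambda below falls back to sending a full MOSDPGLog.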
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ repop = new_repop(
+ version,
+ r,
+ std::move(manager),
+ std::move(op),
+ std::move(_on_complete));
+ } else {
+ on_complete = std::move(_on_complete);
+ }
+
+ pgbackend->call_write_ordered(
+ [this, entries, repop, on_complete]() {
+ ObjectStore::Transaction t;
+ eversion_t old_last_update = info.last_update;
+ recovery_state.merge_new_log_entries(
+ entries, t, recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk());
+
+ set<pg_shard_t> waiting_on;
+ for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ pg_shard_t peer(*i);
+ if (peer == pg_whoami) continue;
+ ceph_assert(recovery_state.get_peer_missing().count(peer));
+ ceph_assert(recovery_state.has_peer_info(peer));
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ ceph_assert(repop);
+ MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
+ entries,
+ spg_t(info.pgid.pgid, i->shard),
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_last_peering_reset(),
+ repop->rep_tid,
+ recovery_state.get_pg_trim_to(),
+ recovery_state.get_min_last_complete_ondisk());
+ osd->send_message_osd_cluster(
+ peer.osd, m, get_osdmap_epoch());
+ waiting_on.insert(peer);
+ } else {
+ MOSDPGLog *m = new MOSDPGLog(
+ peer.shard, pg_whoami.shard,
+ info.last_update.epoch,
+ info, get_last_peering_reset());
+ m->log.log = entries;
+ m->log.tail = old_last_update;
+ m->log.head = info.last_update;
+ osd->send_message_osd_cluster(
+ peer.osd, m, get_osdmap_epoch());
+ }
+ }
+ ceph_tid_t rep_tid = repop->rep_tid;
+ waiting_on.insert(pg_whoami);
+ log_entry_update_waiting_on.insert(
+ make_pair(
+ rep_tid,
+ LogUpdateCtx{std::move(repop), std::move(waiting_on)}
+ ));
+ struct OnComplete : public Context {
+ PrimaryLogPGRef pg;
+ ceph_tid_t rep_tid;
+ epoch_t epoch;
+ OnComplete(
+ PrimaryLogPGRef pg,
+ ceph_tid_t rep_tid,
+ epoch_t epoch)
+ : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
+ void finish(int) override {
+ std::scoped_lock l{*pg};
+ if (!pg->pg_has_reset_since(epoch)) {
+ auto it = pg->log_entry_update_waiting_on.find(rep_tid);
+ ceph_assert(it != pg->log_entry_update_waiting_on.end());
+ auto it2 = it->second.waiting_on.find(pg->pg_whoami);
+ ceph_assert(it2 != it->second.waiting_on.end());
+ it->second.waiting_on.erase(it2);
+ if (it->second.waiting_on.empty()) {
+ pg->repop_all_committed(it->second.repop.get());
+ pg->log_entry_update_waiting_on.erase(it);
+ }
+ }
+ }
+ };
+ t.register_on_commit(
+ new OnComplete{this, rep_tid, get_osdmap_epoch()});
+ int r = osd->store->queue_transaction(ch, std::move(t), NULL);
+ ceph_assert(r == 0);
+ op_applied(info.last_update);
+ });
+
+ recovery_state.update_trim_to();
+}
+
+void PrimaryLogPG::cancel_log_updates()
+{
+ // get rid of all the LogUpdateCtx so their references to repops are
+ // dropped
+ log_entry_update_waiting_on.clear();
+}
+
+// -------------------------------------------------------
+
+void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
+{
+ std::scoped_lock l{*this};
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ get_obc_watchers(obc, *ls);
+ }
+}
+
+void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
+{
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
+ obc->watchers.begin();
+ j != obc->watchers.end();
+ ++j) {
+ obj_watch_item_t owi;
+
+ owi.obj = obc->obs.oi.soid;
+ owi.wi.addr = j->second->get_peer_addr();
+ owi.wi.name = j->second->get_entity();
+ owi.wi.cookie = j->second->get_cookie();
+ owi.wi.timeout_seconds = j->second->get_timeout();
+
+ dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
+ << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
+
+ pg_watchers.push_back(owi);
+ }
+}
+
+void PrimaryLogPG::check_blocklisted_watchers()
+{
+ dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i))
+ check_blocklisted_obc_watchers(i.second);
+}
+
+void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
+{
+ dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
+ obc->watchers.begin();
+ k != obc->watchers.end();
+ ) {
+    // Advance the iterator now so handle_watch_timeout() can erase the element
+ map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
+ dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
+ entity_addr_t ea = j->second->get_peer_addr();
+ dout(30) << "watch: Check entity_addr_t " << ea << dendl;
+ if (get_osdmap()->is_blocklisted(ea)) {
+ dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
+ ceph_assert(j->second->get_pg() == this);
+ j->second->unregister_cb();
+ handle_watch_timeout(j->second);
+ }
+ }
+}
+
+void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
+{
+ ceph_assert(is_primary() && is_active());
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
+ ceph_assert((recovering.count(obc->obs.oi.soid) ||
+ !is_missing_object(obc->obs.oi.soid)) ||
+ (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
+ it_objects->second->op ==
+ pg_log_entry_t::LOST_REVERT &&
+ it_objects->second->reverting_to ==
+ obc->obs.oi.version));
+
+ dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
+ ceph_assert(obc->watchers.empty());
+ // populate unconnected_watchers
+ for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
+ obc->obs.oi.watchers.begin();
+ p != obc->obs.oi.watchers.end();
+ ++p) {
+ utime_t expire = info.stats.last_became_active;
+ expire += p->second.timeout_seconds;
+ dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
+ WatchRef watch(
+ Watch::makeWatchRef(
+ this, osd, obc, p->second.timeout_seconds, p->first.first,
+ p->first.second, p->second.addr));
+ watch->disconnect();
+ obc->watchers.insert(
+ make_pair(
+ make_pair(p->first.first, p->first.second),
+ watch));
+ }
+ // Look for watchers from blocklisted clients and drop them
+ check_blocklisted_obc_watchers(obc);
+}
+
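+// A watch has timed out: drop it from the object_info and persist the change
+// through a MODIFY log entry submitted as a simple op context.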
+void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
+{
+ ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
+ dout(10) << "handle_watch_timeout obc " << obc << dendl;
+
+ if (!is_active()) {
+ dout(10) << "handle_watch_timeout not active, no-op" << dendl;
+ return;
+ }
+ if (!obc->obs.exists) {
+ dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
+ return;
+ }
+ if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
+ callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
+ watch->get_delayed_cb()
+ );
+ dout(10) << "handle_watch_timeout waiting for degraded on obj "
+ << obc->obs.oi.soid
+ << dendl;
+ return;
+ }
+
+ if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
+ dout(10) << "handle_watch_timeout waiting for scrub on obj "
+ << obc->obs.oi.soid
+ << dendl;
+ m_scrubber->add_callback(
+ watch->get_delayed_cb() // retry this timeout once the scrub no longer blocks the object
+ );
+ return;
+ }
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->at_version = get_next_version();
+
+ object_info_t& oi = ctx->new_obs.oi;
+ oi.watchers.erase(make_pair(watch->get_cookie(),
+ watch->get_entity()));
+
+ list<watch_disconnect_t> watch_disconnects = {
+ watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
+ };
+ ctx->register_on_success(
+ [this, obc, watch_disconnects]() {
+ complete_disconnect_watches(obc, watch_disconnects);
+ });
+
+
+ PGTransaction *t = ctx->op_t.get();
+ ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
+ ctx->at_version,
+ oi.version,
+ 0,
+ osd_reqid_t(), ctx->mtime, 0));
+
+ oi.prior_version = obc->obs.oi.version;
+ oi.version = ctx->at_version;
+ bufferlist bl;
+ encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
+
+ // apply new object state.
+ ctx->obc->obs = ctx->new_obs;
+
+ // no ctx->delta_stats
+ simple_opc_submit(std::move(ctx));
+}
+
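+// Build (or reuse) a cached object context for an object that does not exist yet,
+// attaching the supplied snapset context.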
+ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
+ SnapSetContext *ssc)
+{
+ ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
+ ceph_assert(obc->destructor_callback == NULL);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
+ obc->obs.exists = false;
+ obc->ssc = ssc;
+ if (ssc)
+ register_snapset_context(ssc);
+ dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
+ if (is_active())
+ populate_obc_watchers(obc);
+ return obc;
+}
+
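+// Look the object context up in the cache; on a miss, reconstruct it from the
+// supplied attrs or from the on-disk OI_ATTR, optionally creating a fresh context
+// when the object does not exist and can_create is set.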
+ObjectContextRef PrimaryLogPG::get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ const map<string, bufferlist> *attrs)
+{
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
+ ceph_assert(
+ attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
+ // or this is a revert... see recover_primary()
+ (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
+ it_objects->second->op ==
+ pg_log_entry_t::LOST_REVERT));
+ ObjectContextRef obc = object_contexts.lookup(soid);
+ osd->logger->inc(l_osd_object_ctx_cache_total);
+ if (obc) {
+ osd->logger->inc(l_osd_object_ctx_cache_hit);
+ dout(10) << __func__ << ": found obc in cache: " << obc
+ << dendl;
+ } else {
+ dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
+ // check disk
+ bufferlist bv;
+ if (attrs) {
+ auto it_oi = attrs->find(OI_ATTR);
+ ceph_assert(it_oi != attrs->end());
+ bv = it_oi->second;
+ } else {
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
+ if (r < 0) {
+ if (!can_create) {
+ dout(10) << __func__ << ": no obc for soid "
+ << soid << " and !can_create"
+ << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ dout(10) << __func__ << ": no obc for soid "
+ << soid << " but can_create"
+ << dendl;
+ // new object.
+ object_info_t oi(soid);
+ SnapSetContext *ssc = get_snapset_context(
+ soid, true, 0, false);
+ ceph_assert(ssc);
+ obc = create_object_context(oi, ssc);
+ dout(10) << __func__ << ": " << obc << " " << soid
+ << " " << obc->rwstate
+ << " oi: " << obc->obs.oi
+ << " ssc: " << obc->ssc
+ << " snapset: " << obc->ssc->snapset << dendl;
+ return obc;
+ }
+ }
+
+ object_info_t oi;
+ try {
+ bufferlist::const_iterator bliter = bv.begin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
+
+ obc = object_contexts.lookup_or_create(oi.soid);
+ obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
+ obc->obs.oi = oi;
+ obc->obs.exists = true;
+
+ obc->ssc = get_snapset_context(
+ soid, true,
+ soid.has_snapset() ? attrs : 0);
+
+ if (is_primary() && is_active())
+ populate_obc_watchers(obc);
+
+ if (pool.info.is_erasure()) {
+ if (attrs) {
+ obc->attr_cache = *attrs;
+ } else {
+ int r = pgbackend->objects_get_attrs(
+ soid,
+ &obc->attr_cache);
+ ceph_assert(r == 0);
+ }
+ }
+
+ dout(10) << __func__ << ": creating obc from disk: " << obc
+ << dendl;
+ }
+
+ // XXX: Caller doesn't expect this
+ if (obc->ssc == NULL) {
+ derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
+ return ObjectContextRef(); // -ENOENT!
+ }
+
+ dout(10) << __func__ << ": " << obc << " " << soid
+ << " " << obc->rwstate
+ << " oi: " << obc->obs.oi
+ << " exists: " << (int)obc->obs.exists
+ << " ssc: " << obc->ssc
+ << " snapset: " << obc->ssc->snapset << dendl;
+ return obc;
+}
+
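+// Discard every Watch on every cached object context; used when the PG changes
+// interval or shuts down.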
+void PrimaryLogPG::context_registry_on_change()
+{
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ ObjectContextRef obc(i.second);
+ if (obc) {
+ for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
+ obc->watchers.begin();
+ j != obc->watchers.end();
+ obc->watchers.erase(j++)) {
+ j->second->discard();
+ }
+ }
+ }
+}
+
+
+/*
+ * If we return an error, and set *pmissing, then promoting that
+ * object may help.
+ *
+ * If we return -EAGAIN, we will always set *pmissing to the missing
+ * object to wait for.
+ *
+ * If we return an error but do not set *pmissing, then we know the
+ * object does not exist.
+ */
+int PrimaryLogPG::find_object_context(const hobject_t& oid,
+ ObjectContextRef *pobc,
+ bool can_create,
+ bool map_snapid_to_clone,
+ hobject_t *pmissing)
+{
+ FUNCTRACE(cct);
+ ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
+ // want the head?
+ if (oid.snap == CEPH_NOSNAP) {
+ ObjectContextRef obc = get_object_context(oid, can_create);
+ if (!obc) {
+ if (pmissing)
+ *pmissing = oid;
+ return -ENOENT;
+ }
+ dout(10) << __func__ << " " << oid
+ << " @" << oid.snap
+ << " oi=" << obc->obs.oi
+ << dendl;
+ *pobc = obc;
+
+ return 0;
+ }
+
+ // we want a snap
+
+ hobject_t head = oid.get_head();
+ SnapSetContext *ssc = get_snapset_context(oid, can_create);
+ if (!ssc || !(ssc->exists || can_create)) {
+ dout(20) << __func__ << " " << oid << " no snapset" << dendl;
+ if (pmissing)
+ *pmissing = head; // start by getting the head
+ if (ssc)
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+
+ if (map_snapid_to_clone) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " map_snapid_to_clone=true" << dendl;
+ if (oid.snap > ssc->snapset.seq) {
+ // already must be readable
+ ObjectContextRef obc = get_object_context(head, false);
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to head" << dendl;
+ *pobc = obc;
+ put_snapset_context(ssc);
+ return (obc && obc->obs.exists) ? 0 : -ENOENT;
+ } else {
+ vector<snapid_t>::const_iterator citer = std::find(
+ ssc->snapset.clones.begin(),
+ ssc->snapset.clones.end(),
+ oid.snap);
+ if (citer == ssc->snapset.clones.end()) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to nothing" << dendl;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " maps to " << oid << dendl;
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " is missing" << dendl;
+ if (pmissing)
+ *pmissing = oid;
+ put_snapset_context(ssc);
+ return -EAGAIN;
+ }
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc || !obc->obs.exists) {
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " is not present" << dendl;
+ if (pmissing)
+ *pmissing = oid;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset
+ << " " << oid << " HIT" << dendl;
+ *pobc = obc;
+ put_snapset_context(ssc);
+ return 0;
+ }
+ ceph_abort(); // unreachable
+ }
+
+ dout(10) << __func__ << " " << oid << " @" << oid.snap
+ << " snapset " << ssc->snapset << dendl;
+
+ // head?
+ if (oid.snap > ssc->snapset.seq) {
+ ObjectContextRef obc = get_object_context(head, false);
+ dout(10) << __func__ << " " << head
+ << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
+ << " -- HIT " << obc->obs
+ << dendl;
+ if (!obc->ssc)
+ obc->ssc = ssc;
+ else {
+ ceph_assert(ssc == obc->ssc);
+ put_snapset_context(ssc);
+ }
+ *pobc = obc;
+ return 0;
+ }
+
+ // which clone would it be?
+ unsigned k = 0;
+ while (k < ssc->snapset.clones.size() &&
+ ssc->snapset.clones[k] < oid.snap)
+ k++;
+ if (k == ssc->snapset.clones.size()) {
+ dout(10) << __func__ << " no clones with last >= oid.snap "
+ << oid.snap << " -- DNE" << dendl;
+ put_snapset_context(ssc);
+ return -ENOENT;
+ }
+ hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
+ info.pgid.pool(), oid.get_namespace());
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
+ dout(20) << __func__ << " " << soid << " missing, try again later"
+ << dendl;
+ if (pmissing)
+ *pmissing = soid;
+ put_snapset_context(ssc);
+ return -EAGAIN;
+ }
+
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (!obc || !obc->obs.exists) {
+ if (pmissing)
+ *pmissing = soid;
+ put_snapset_context(ssc);
+ if (is_primary()) {
+ if (is_degraded_or_backfilling_object(soid)) {
+ dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
+ return -EAGAIN;
+ } else if (is_degraded_on_async_recovery_target(soid)) {
+ dout(20) << __func__ << " clone is recovering " << soid << dendl;
+ return -EAGAIN;
+ } else {
+ dout(20) << __func__ << " missing clone " << soid << dendl;
+ return -ENOENT;
+ }
+ } else {
+ dout(20) << __func__ << " replica missing clone" << soid << dendl;
+ return -ENOENT;
+ }
+ }
+
+ if (!obc->ssc) {
+ obc->ssc = ssc;
+ } else {
+ ceph_assert(obc->ssc == ssc);
+ put_snapset_context(ssc);
+ }
+ ssc = 0;
+
+ // clone
+ dout(20) << __func__ << " " << soid
+ << " snapset " << obc->ssc->snapset
+ << dendl;
+ snapid_t first, last;
+ auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
+ ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
+ if (p->second.empty()) {
+ dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ return -ENOENT;
+ }
+ if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
+ p->second.end()) {
+ dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
+ << " does not contain " << oid.snap << " -- DNE" << dendl;
+ return -ENOENT;
+ }
+ if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
+ dout(20) << __func__ << " " << soid << " snap " << oid.snap
+ << " in removed_snaps_queue" << " -- DNE" << dendl;
+ return -ENOENT;
+ }
+ dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
+ << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
+ *pobc = obc;
+ return 0;
+}
+
+void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
+{
+ if (obc->ssc)
+ put_snapset_context(obc->ssc);
+}
+
+void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
+{
+ object_info_t& oi = obc->obs.oi;
+
+ dout(10) << __func__ << " " << oi.soid << dendl;
+ ceph_assert(!oi.soid.is_snapdir());
+
+ object_stat_sum_t stat;
+ stat.num_objects++;
+ if (oi.is_dirty())
+ stat.num_objects_dirty++;
+ if (oi.is_whiteout())
+ stat.num_whiteouts++;
+ if (oi.is_omap())
+ stat.num_objects_omap++;
+ if (oi.is_cache_pinned())
+ stat.num_objects_pinned++;
+ if (oi.has_manifest())
+ stat.num_objects_manifest++;
+
+ if (oi.soid.is_snap()) {
+ stat.num_object_clones++;
+
+ if (!obc->ssc)
+ obc->ssc = get_snapset_context(oi.soid, false);
+ ceph_assert(obc->ssc);
+ stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
+ } else {
+ stat.num_bytes += oi.size;
+ }
+
+ // add it in
+ pgstat->stats.sum.add(stat);
+}
+
+void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ if (obc->is_blocked()) {
+ dout(10) << __func__ << " " << soid << " still blocked" << dendl;
+ return;
+ }
+
+ map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
+ if (p != waiting_for_blocked_object.end()) {
+ list<OpRequestRef>& ls = p->second;
+ dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
+ requeue_ops(ls);
+ waiting_for_blocked_object.erase(p);
+ }
+
+ map<hobject_t, ObjectContextRef>::iterator i =
+ objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
+ if (i != objects_blocked_on_snap_promotion.end()) {
+ ceph_assert(i->second == obc);
+ objects_blocked_on_snap_promotion.erase(i);
+ }
+
+ if (obc->requeue_scrub_on_unblock) {
+
+ obc->requeue_scrub_on_unblock = false;
+
+ dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
+
+ // only requeue if we are still active: we may be unblocking
+ // because we are resetting for a new peering interval
+ if (is_active()) {
+ osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
+ }
+ }
+}
+
+SnapSetContext *PrimaryLogPG::get_snapset_context(
+ const hobject_t& oid,
+ bool can_create,
+ const map<string, bufferlist> *attrs,
+ bool oid_existed)
+{
+ std::lock_guard l(snapset_contexts_lock);
+ SnapSetContext *ssc;
+ map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
+ oid.get_snapdir());
+ if (p != snapset_contexts.end()) {
+ if (can_create || p->second->exists) {
+ ssc = p->second;
+ } else {
+ return NULL;
+ }
+ } else {
+ bufferlist bv;
+ if (!attrs) {
+ int r = -ENOENT;
+ if (!(oid.is_head() && !oid_existed)) {
+ r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
+ }
+ if (r < 0 && !can_create)
+ return NULL;
+ } else {
+ auto it_ss = attrs->find(SS_ATTR);
+ ceph_assert(it_ss != attrs->end());
+ bv = it_ss->second;
+ }
+ ssc = new SnapSetContext(oid.get_snapdir());
+ _register_snapset_context(ssc);
+ if (bv.length()) {
+ bufferlist::const_iterator bvp = bv.begin();
+ try {
+ ssc->snapset.decode(bvp);
+ } catch (const ceph::buffer::error& e) {
+ dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
+ return NULL;
+ }
+ ssc->exists = true;
+ } else {
+ ssc->exists = false;
+ }
+ }
+ ceph_assert(ssc);
+ ssc->ref++;
+ return ssc;
+}
+
+void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
+{
+ std::lock_guard l(snapset_contexts_lock);
+ --ssc->ref;
+ if (ssc->ref == 0) {
+ if (ssc->registered)
+ snapset_contexts.erase(ssc->oid);
+ delete ssc;
+ }
+}
+
+/*
+ * Return values:
+ * NONE - didn't pull anything
+ * YES - pulled what the caller wanted
+ * HEAD - needed to pull head first
+ */
+enum { PULL_NONE, PULL_HEAD, PULL_YES };
+
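+// Recover a single object that is missing on the primary. Objects recorded as
+// deleted are removed instead of pulled, and for a clone whose head is also
+// missing the head is recovered first.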
+int PrimaryLogPG::recover_missing(
+ const hobject_t &soid, eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h)
+{
+ if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(7) << __func__ << " " << soid
+ << " v " << v
+ << " but it is unfound" << dendl;
+ return PULL_NONE;
+ }
+
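+ // The authoritative log says this object was deleted: remove our local copy and,
+ // if any replica still needs the deletion, propagate it with a delete recovery op;
+ // otherwise mark the object globally recovered.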
+ if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+ epoch_t cur_epoch = get_osdmap_epoch();
+ remove_missing_object(soid, v, new LambdaContext(
+ [=](int) {
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(cur_epoch)) {
+ bool object_missing = false;
+ for (const auto& shard : get_acting_recovery_backfill()) {
+ if (shard == pg_whoami)
+ continue;
+ if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
+ dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
+ object_missing = true;
+ break;
+ }
+ }
+ if (!object_missing) {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ if (scrub_after_recovery)
+ stat_diff.num_objects_repaired = 1;
+ on_global_recover(soid, stat_diff, true);
+ } else {
+ auto recovery_handle = pgbackend->open_recovery_op();
+ pgbackend->recover_delete_object(soid, v, recovery_handle);
+ pgbackend->run_recovery_op(recovery_handle, priority);
+ }
+ }
+ }));
+ return PULL_YES;
+ }
+
+ // is this a snapped object? if so, consult the snapset... we may not need the entire object!
+ ObjectContextRef obc;
+ ObjectContextRef head_obc;
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head?
+ hobject_t head = soid.get_head();
+ if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return PULL_NONE;
+ } else {
+ int r = recover_missing(
+ head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
+ h);
+ if (r != PULL_NONE)
+ return PULL_HEAD;
+ return PULL_NONE;
+ }
+ }
+ head_obc = get_object_context(
+ head,
+ false,
+ 0);
+ ceph_assert(head_obc);
+ }
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, obc));
+ int r = pgbackend->recover_object(
+ soid,
+ v,
+ head_obc,
+ obc,
+ h);
+ // This is only a pull which shouldn't return an error
+ ceph_assert(r >= 0);
+ return PULL_YES;
+}
+
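+// Delete the local copy of a missing object, then record the local recovery in a
+// follow-up transaction; on_complete fires once that transaction commits (or with
+// -EAGAIN if the PG was reset in the meantime).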
+void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
+ eversion_t v, Context *on_complete)
+{
+ dout(20) << __func__ << " " << soid << " " << v << dendl;
+ ceph_assert(on_complete != nullptr);
+ // delete locally
+ ObjectStore::Transaction t;
+ remove_snap_mapped_object(t, soid);
+
+ ObjectRecoveryInfo recovery_info;
+ recovery_info.soid = soid;
+ recovery_info.version = v;
+
+ epoch_t cur_epoch = get_osdmap_epoch();
+ t.register_on_complete(new LambdaContext(
+ [=](int) {
+ std::unique_lock locker{*this};
+ if (!pg_has_reset_since(cur_epoch)) {
+ ObjectStore::Transaction t2;
+ on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
+ t2.register_on_complete(on_complete);
+ int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
+ ceph_assert(r == 0);
+ locker.unlock();
+ } else {
+ locker.unlock();
+ on_complete->complete(-EAGAIN);
+ }
+ }));
+ int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
+ ceph_assert(r == 0);
+}
+
+void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
+{
+ dout(10) << __func__ << " " << oid << dendl;
+ if (callbacks_for_degraded_object.count(oid)) {
+ list<Context*> contexts;
+ contexts.swap(callbacks_for_degraded_object[oid]);
+ callbacks_for_degraded_object.erase(oid);
+ for (list<Context*>::iterator i = contexts.begin();
+ i != contexts.end();
+ ++i) {
+ (*i)->complete(0);
+ }
+ }
+ map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
+ oid.get_head());
+ if (i != objects_blocked_on_degraded_snap.end() &&
+ i->second == oid.snap)
+ objects_blocked_on_degraded_snap.erase(i);
+}
+
+void PrimaryLogPG::_committed_pushed_object(
+ epoch_t epoch, eversion_t last_complete)
+{
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(epoch)) {
+ recovery_state.recovery_committed_to(last_complete);
+ } else {
+ dout(10) << __func__
+ << " pg has changed, not touching last_complete_ondisk" << dendl;
+ }
+}
+
+void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
+{
+ dout(20) << __func__ << dendl;
+ if (obc) {
+ dout(20) << "obc = " << *obc << dendl;
+ }
+ ceph_assert(active_pushes >= 1);
+ --active_pushes;
+
+ // requeue an active chunky scrub waiting on recovery ops
+ if (!recovery_state.is_deleting() && active_pushes == 0 &&
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
+ }
+}
+
+void PrimaryLogPG::_applied_recovered_object_replica()
+{
+ dout(20) << __func__ << dendl;
+ ceph_assert(active_pushes >= 1);
+ --active_pushes;
+
+ // requeue an active scrub waiting on recovery ops
+ if (!recovery_state.is_deleting() && active_pushes == 0 &&
+ m_scrubber->is_scrub_active()) {
+
+ osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
+ }
+}
+
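+// A pull for soid failed on the given shards: force the object missing on those
+// shards, requeue any ops blocked on its recovery read lock, and close out the
+// recovery op. If the primary itself is among them, record a primary error.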
+void PrimaryLogPG::on_failed_pull(
+ const set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &v)
+{
+ dout(20) << __func__ << ": " << soid << dendl;
+ ceph_assert(recovering.count(soid));
+ auto obc = recovering[soid];
+ if (obc) {
+ list<OpRequestRef> blocked_ops;
+ obc->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ recovering.erase(soid);
+ for (auto&& i : from) {
+ if (i != pg_whoami) { // we'll get it below in primary_error
+ recovery_state.force_object_missing(i, soid, v);
+ }
+ }
+
+ dout(0) << __func__ << " " << soid << " from shard " << from
+ << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
+ << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
+ << dendl;
+ finish_recovery_op(soid); // close out this attempt,
+ finish_degraded_object(soid);
+
+ if (from.count(pg_whoami)) {
+ dout(0) << " primary missing oid " << soid << " version " << v << dendl;
+ primary_error(soid, v);
+ backfills_in_flight.erase(soid);
+ }
+}
+
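+// Return the newest version of oid still held by some shard in the
+// acting/recovery/backfill set (the 'have' versions); used to pick the revert
+// target for LOST_REVERT.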
+eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
+{
+ eversion_t v;
+ pg_missing_item pmi;
+ bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
+ ceph_assert(is_missing);
+ v = pmi.have;
+ dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
+
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
+ i != get_acting_recovery_backfill().end();
+ ++i) {
+ if (*i == get_primary()) continue;
+ pg_shard_t peer = *i;
+ if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
+ continue;
+ }
+ eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
+ dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
+ if (h > v)
+ v = h;
+ }
+
+ dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
+ return v;
+}
+
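+// Handle MOSDPGUpdateLogMissing: append the primary's log entries and missing-set
+// updates locally, then reply with our last_complete_ondisk once they are durable.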
+void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
+{
+ const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
+ op->get_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
+ ObjectStore::Transaction t;
+ std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ if (m->pg_trim_to != eversion_t())
+ op_trim_to = m->pg_trim_to;
+ if (m->pg_roll_forward_to != eversion_t())
+ op_roll_forward_to = m->pg_roll_forward_to;
+
+ dout(20) << __func__
+ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
+
+ recovery_state.append_log_entries_update_missing(
+ m->entries, t, op_trim_to, op_roll_forward_to);
+ eversion_t new_lcod = info.last_complete;
+
+ Context *complete = new LambdaContext(
+ [=](int) {
+ const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
+ op->get_req());
+ std::scoped_lock locker{*this};
+ if (!pg_has_reset_since(msg->get_epoch())) {
+ update_last_complete_ondisk(new_lcod);
+ MOSDPGUpdateLogMissingReply *reply =
+ new MOSDPGUpdateLogMissingReply(
+ spg_t(info.pgid.pgid, primary_shard().shard),
+ pg_whoami.shard,
+ msg->get_epoch(),
+ msg->min_epoch,
+ msg->get_tid(),
+ new_lcod);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH);
+ msg->get_connection()->send_message(reply);
+ }
+ });
+
+ if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
+ t.register_on_commit(complete);
+ } else {
+ /* Hack to work around the fact that ReplicatedBackend sends
+ * ack+commit if commit happens first
+ *
+ * This behavior is no longer necessary, but we preserve it so old
+ * primaries can keep their repops in order */
+ if (pool.info.is_erasure()) {
+ t.register_on_complete(complete);
+ } else {
+ t.register_on_commit(complete);
+ }
+ }
+ int tr = osd->store->queue_transaction(
+ ch,
+ std::move(t),
+ nullptr);
+ ceph_assert(tr == 0);
+ op_applied(info.last_update);
+}
+
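+// Handle the reply on the primary: record the shard's last_complete_ondisk and,
+// once every shard we were waiting on has answered, complete the pending repop.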
+void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
+{
+ const MOSDPGUpdateLogMissingReply *m =
+ static_cast<const MOSDPGUpdateLogMissingReply*>(
+ op->get_req());
+ dout(20) << __func__ << " got reply from "
+ << m->get_from() << dendl;
+
+ auto it = log_entry_update_waiting_on.find(m->get_tid());
+ if (it != log_entry_update_waiting_on.end()) {
+ if (it->second.waiting_on.count(m->get_from())) {
+ it->second.waiting_on.erase(m->get_from());
+ if (m->last_complete_ondisk != eversion_t()) {
+ update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
+ }
+ } else {
+ osd->clog->error()
+ << info.pgid << " got reply "
+ << *m << " from shard we are not waiting for "
+ << m->get_from();
+ }
+
+ if (it->second.waiting_on.empty()) {
+ repop_all_committed(it->second.repop.get());
+ log_entry_update_waiting_on.erase(it);
+ }
+ } else {
+ osd->clog->error()
+ << info.pgid << " got reply "
+ << *m << " on unknown tid " << m->get_tid();
+ }
+}
+
+/* Mark all unfound objects as lost.
+ */
+void PrimaryLogPG::mark_all_unfound_lost(
+ int what,
+ std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+ dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
+ list<hobject_t> oids;
+
+ dout(30) << __func__ << ": log before:\n";
+ recovery_state.get_pg_log().get_log().print(*_dout);
+ *_dout << dendl;
+
+ mempool::osd_pglog::list<pg_log_entry_t> log_entries;
+
+ utime_t mtime = ceph_clock_now();
+ map<hobject_t, pg_missing_item>::const_iterator m =
+ recovery_state.get_missing_loc().get_needs_recovery().begin();
+ map<hobject_t, pg_missing_item>::const_iterator mend =
+ recovery_state.get_missing_loc().get_needs_recovery().end();
+
+ ObcLockManager manager;
+ eversion_t v = get_next_version();
+ v.epoch = get_osdmap_epoch();
+ uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
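+ // Walk the needs-recovery set and emit one LOST_* log entry per unfound object.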
+ while (m != mend) {
+ const hobject_t &oid(m->first);
+ if (!recovery_state.get_missing_loc().is_unfound(oid)) {
+ // We only care about unfound objects
+ ++m;
+ continue;
+ }
+
+ ObjectContextRef obc;
+ eversion_t prev;
+
+ switch (what) {
+ case pg_log_entry_t::LOST_MARK:
+ ceph_abort_msg("actually, not implemented yet!");
+ break;
+
+ case pg_log_entry_t::LOST_REVERT:
+ prev = pick_newest_available(oid);
+ if (prev > eversion_t()) {
+ // log it
+ pg_log_entry_t e(
+ pg_log_entry_t::LOST_REVERT, oid, v,
+ m->second.need, 0, osd_reqid_t(), mtime, 0);
+ e.reverting_to = prev;
+ e.mark_unrollbackable();
+ log_entries.push_back(e);
+ dout(10) << e << dendl;
+
+ // we are now missing the new version; recovery code will sort it out.
+ ++v.version;
+ ++m;
+ break;
+ }
+
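+ // fall through: nothing to revert to, so record a LOST_DELETE instead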
+ case pg_log_entry_t::LOST_DELETE:
+ {
+ pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
+ 0, osd_reqid_t(), mtime, 0);
+ if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
+ if (pool.info.require_rollback()) {
+ e.mod_desc.try_rmobject(v.version);
+ } else {
+ e.mark_unrollbackable();
+ }
+ } // otherwise, just do what we used to do
+ dout(10) << e << dendl;
+ log_entries.push_back(e);
+ oids.push_back(oid);
+
+ // If a context is found, mark the object as deleted, in case we are
+ // racing with a new creation. This can happen if the object was lost
+ // and the primary hit EIO.
+ obc = object_contexts.lookup(oid);
+ if (obc)
+ obc->obs.exists = false;
+
+ ++v.version;
+ ++m;
+ }
+ break;
+
+ default:
+ ceph_abort();
+ }
+ }
+
+ recovery_state.update_stats(
+ [](auto &history, auto &stats) {
+ stats.stats_invalid = true;
+ return false;
+ });
+
+ submit_log_entries(
+ log_entries,
+ std::move(manager),
+ std::optional<std::function<void(void)> >(
+ [this, oids, num_unfound, on_finish]() {
+ if (recovery_state.perform_deletes_during_peering()) {
+ for (auto oid : oids) {
+ // clear old locations - merge_new_log_entries will have
+ // handled rebuilding missing_loc for each of these
+ // objects if we have the RECOVERY_DELETES flag
+ recovery_state.object_recovered(oid, object_stat_sum_t());
+ }
+ }
+
+ if (is_recovery_unfound()) {
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+ } else if (is_backfill_unfound()) {
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ queue_recovery();
+ }
+
+ stringstream ss;
+ ss << "pg has " << num_unfound
+ << " objects unfound and apparently lost marking";
+ string rs = ss.str();
+ dout(0) << "do_command r=" << 0 << " " << rs << dendl;
+ osd->clog->info() << rs;
+ bufferlist empty;
+ on_finish(0, rs, empty);
+ }),
+ OpRequestRef());
+}
+
+void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
+{
+ ceph_assert(repop_queue.empty());
+}
+
+/*
+ * pg status change notification
+ */
+
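+// Abort every in-flight repop. If requeue is set, the originating client ops (and
+// any duplicate waiters on the same version) are requeued for replay.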
+void PrimaryLogPG::apply_and_flush_repops(bool requeue)
+{
+ list<OpRequestRef> rq;
+
+ // apply all repops
+ while (!repop_queue.empty()) {
+ RepGather *repop = repop_queue.front();
+ repop_queue.pop_front();
+ dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
+ repop->rep_aborted = true;
+ repop->on_committed.clear();
+ repop->on_success.clear();
+
+ if (requeue) {
+ if (repop->op) {
+ dout(10) << " requeuing " << *repop->op->get_req() << dendl;
+ rq.push_back(repop->op);
+ repop->op = OpRequestRef();
+ }
+
+ // also requeue any dups, interleaved into position
+ auto p = waiting_for_ondisk.find(repop->v);
+ if (p != waiting_for_ondisk.end()) {
+ dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
+ for (auto& i : p->second) {
+ rq.push_back(std::get<0>(i));
+ }
+ waiting_for_ondisk.erase(p);
+ }
+ }
+
+ remove_repop(repop);
+ }
+
+ ceph_assert(repop_queue.empty());
+
+ if (requeue) {
+ requeue_ops(rq);
+ if (!waiting_for_ondisk.empty()) {
+ for (auto& i : waiting_for_ondisk) {
+ for (auto& j : i.second) {
+ derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
+ << " waiting on " << i.first << dendl;
+ }
+ }
+ ceph_assert(waiting_for_ondisk.empty());
+ }
+ }
+
+ waiting_for_ondisk.clear();
+}
+
+void PrimaryLogPG::on_flushed()
+{
+ requeue_ops(waiting_for_flush);
+ if (!is_peered() || !is_primary()) {
+ pair<hobject_t, ObjectContextRef> i;
+ while (object_contexts.get_next(i.first, &i)) {
+ derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
+ }
+ ceph_assert(object_contexts.empty());
+ }
+}
+
+void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
+{
+ dout(10) << __func__ << dendl;
+
+ on_shutdown();
+
+ t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
+}
+
+void PrimaryLogPG::clear_async_reads()
+{
+ dout(10) << __func__ << dendl;
+ for(auto& i : in_progress_async_reads) {
+ dout(10) << "clear ctx: "
+ << "OpRequestRef " << i.first
+ << " OpContext " << i.second
+ << dendl;
+ close_op_ctx(i.second);
+ }
+}
+
+void PrimaryLogPG::clear_cache()
+{
+ object_contexts.clear();
+}
+
+void PrimaryLogPG::on_shutdown()
+{
+ dout(10) << __func__ << dendl;
+
+ if (recovery_queued) {
+ recovery_queued = false;
+ osd->clear_queued_recovery(this);
+ }
+
+ m_scrubber->scrub_clear_state();
+
+ m_scrubber->unreg_next_scrub();
+
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(false, &tids);
+ cancel_flush_ops(false, &tids);
+ cancel_proxy_ops(false, &tids);
+ cancel_manifest_ops(false, &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+
+ apply_and_flush_repops(false);
+ cancel_log_updates();
+ // we must remove PGRefs, so do this prior to release_backoffs() callers
+ clear_backoffs();
+ // clean up snap trim references
+ snap_trimmer_machine.process_event(Reset());
+
+ pgbackend->on_change();
+
+ context_registry_on_change();
+ object_contexts.clear();
+
+ clear_async_reads();
+
+ osd->remote_reserver.cancel_reservation(info.pgid);
+ osd->local_reserver.cancel_reservation(info.pgid);
+
+ clear_primary_state();
+ cancel_recovery();
+
+ if (is_primary()) {
+ osd->clear_ready_to_merge(this);
+ }
+}
+
+void PrimaryLogPG::on_activate_complete()
+{
+ check_local();
+ // waiters
+ if (!recovery_state.needs_flush()) {
+ requeue_ops(waiting_for_peered);
+ } else if (!waiting_for_peered.empty()) {
+ dout(10) << __func__ << " flushes in progress, moving "
+ << waiting_for_peered.size()
+ << " items to waiting_for_flush"
+ << dendl;
+ ceph_assert(waiting_for_flush.empty());
+ waiting_for_flush.swap(waiting_for_peered);
+ }
+
+
+ // all clean?
+ if (needs_recovery()) {
+ dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+ } else if (needs_backfill()) {
+ dout(10) << "activate queueing backfill" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ dout(10) << "activate all replicas clean, no recovery" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered())));
+ }
+
+ publish_stats_to_osd();
+
+ if (get_backfill_targets().size()) {
+ last_backfill_started = recovery_state.earliest_backfill();
+ new_backfill = true;
+ ceph_assert(!last_backfill_started.is_max());
+ dout(5) << __func__ << ": bft=" << get_backfill_targets()
+ << " from " << last_backfill_started << dendl;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ dout(5) << "target shard " << *i
+ << " from " << recovery_state.get_peer_info(*i).last_backfill
+ << dendl;
+ }
+ }
+
+ hit_set_setup();
+ agent_setup();
+}
+
+void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
+{
+ dout(10) << __func__ << dendl;
+
+ if (hit_set && hit_set->insert_count() == 0) {
+ dout(20) << " discarding empty hit_set" << dendl;
+ hit_set_clear();
+ }
+
+ if (recovery_queued) {
+ recovery_queued = false;
+ osd->clear_queued_recovery(this);
+ }
+
+ // requeue everything in the reverse order they should be
+ // reexamined.
+ requeue_ops(waiting_for_peered);
+ requeue_ops(waiting_for_flush);
+ requeue_ops(waiting_for_active);
+ requeue_ops(waiting_for_readable);
+
+ vector<ceph_tid_t> tids;
+ cancel_copy_ops(is_primary(), &tids);
+ cancel_flush_ops(is_primary(), &tids);
+ cancel_proxy_ops(is_primary(), &tids);
+ cancel_manifest_ops(is_primary(), &tids);
+ osd->objecter->op_cancel(tids, -ECANCELED);
+
+ // requeue object waiters
+ for (auto& p : waiting_for_unreadable_object) {
+ release_backoffs(p.first);
+ }
+ if (is_primary()) {
+ requeue_object_waiters(waiting_for_unreadable_object);
+ } else {
+ waiting_for_unreadable_object.clear();
+ }
+ for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
+ p != waiting_for_degraded_object.end();
+ waiting_for_degraded_object.erase(p++)) {
+ release_backoffs(p->first);
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
+ finish_degraded_object(p->first);
+ }
+
+ // requeues waiting_for_scrub
+ m_scrubber->scrub_clear_state();
+
+ for (auto p = waiting_for_blocked_object.begin();
+ p != waiting_for_blocked_object.end();
+ waiting_for_blocked_object.erase(p++)) {
+ if (is_primary())
+ requeue_ops(p->second);
+ else
+ p->second.clear();
+ }
+ for (auto i = callbacks_for_degraded_object.begin();
+ i != callbacks_for_degraded_object.end();
+ ) {
+ finish_degraded_object((i++)->first);
+ }
+ ceph_assert(callbacks_for_degraded_object.empty());
+
+ if (is_primary()) {
+ requeue_ops(waiting_for_cache_not_full);
+ } else {
+ waiting_for_cache_not_full.clear();
+ }
+ objects_blocked_on_cache_full.clear();
+
+ for (list<pair<OpRequestRef, OpContext*> >::iterator i =
+ in_progress_async_reads.begin();
+ i != in_progress_async_reads.end();
+ in_progress_async_reads.erase(i++)) {
+ close_op_ctx(i->second);
+ if (is_primary())
+ requeue_op(i->first);
+ }
+
+ // this will requeue ops we were working on but didn't finish, and
+ // any dups
+ apply_and_flush_repops(is_primary());
+ cancel_log_updates();
+
+ // do this *after* apply_and_flush_repops so that we catch any newly
+ // registered watches.
+ context_registry_on_change();
+
+ pgbackend->on_change_cleanup(&t);
+ m_scrubber->cleanup_store(&t);
+ pgbackend->on_change();
+
+ // clear snap_trimmer state
+ snap_trimmer_machine.process_event(Reset());
+
+ debug_op_order.clear();
+ unstable_stats.clear();
+
+ // we don't want to cache object_contexts through the interval change
+ // NOTE: we actually assert that all currently live references are dead
+ // by the time the flush for the next interval completes.
+ object_contexts.clear();
+
+ // should have been cleared above by finishing all of the degraded objects
+ ceph_assert(objects_blocked_on_degraded_snap.empty());
+}
+
+void PrimaryLogPG::plpg_on_role_change()
+{
+ dout(10) << __func__ << dendl;
+ if (get_role() != 0 && hit_set) {
+ dout(10) << " clearing hit set" << dendl;
+ hit_set_clear();
+ }
+}
+
+void PrimaryLogPG::plpg_on_pool_change()
+{
+ dout(10) << __func__ << dendl;
+ // requeue cache full waiters just in case the cache_mode is
+ // changing away from writeback mode. note that if we are not
+ // active the normal requeuing machinery is sufficient (and properly
+ // ordered).
+ if (is_active() &&
+ pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+ !waiting_for_cache_not_full.empty()) {
+ dout(10) << __func__ << " requeuing full waiters (not in writeback) "
+ << dendl;
+ requeue_ops(waiting_for_cache_not_full);
+ objects_blocked_on_cache_full.clear();
+ }
+ hit_set_setup();
+ agent_setup();
+}
+
+// clear state. called on recovery completion AND cancellation.
+void PrimaryLogPG::_clear_recovery_state()
+{
+#ifdef DEBUG_RECOVERY_OIDS
+ recovering_oids.clear();
+#endif
+ dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
+
+ last_backfill_started = hobject_t();
+ set<hobject_t>::iterator i = backfills_in_flight.begin();
+ while (i != backfills_in_flight.end()) {
+ backfills_in_flight.erase(i++);
+ }
+
+ list<OpRequestRef> blocked_ops;
+ for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
+ i != recovering.end();
+ recovering.erase(i++)) {
+ if (i->second) {
+ i->second->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ }
+ ceph_assert(backfills_in_flight.empty());
+ pending_backfill_updates.clear();
+ ceph_assert(recovering.empty());
+ pgbackend->clear_recovery_state();
+}
+
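+// Abort an in-progress pull of soid: drop its recovery read lock, requeue any ops
+// blocked on it, and kick waiters for the degraded/unreadable object.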
+void PrimaryLogPG::cancel_pull(const hobject_t &soid)
+{
+ dout(20) << __func__ << ": " << soid << dendl;
+ ceph_assert(recovering.count(soid));
+ ObjectContextRef obc = recovering[soid];
+ if (obc) {
+ list<OpRequestRef> blocked_ops;
+ obc->drop_recovery_read(&blocked_ops);
+ requeue_ops(blocked_ops);
+ }
+ recovering.erase(soid);
+ finish_recovery_op(soid);
+ release_backoffs(soid);
+ if (waiting_for_degraded_object.count(soid)) {
+ dout(20) << " kicking degraded waiters on " << soid << dendl;
+ requeue_ops(waiting_for_degraded_object[soid]);
+ waiting_for_degraded_object.erase(soid);
+ }
+ if (waiting_for_unreadable_object.count(soid)) {
+ dout(20) << " kicking unreadable waiters on " << soid << dendl;
+ requeue_ops(waiting_for_unreadable_object[soid]);
+ waiting_for_unreadable_object.erase(soid);
+ }
+ if (is_missing_object(soid))
+ recovery_state.set_last_requested(0);
+ finish_degraded_object(soid);
+}
+
+void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ pgbackend->check_recovery_sources(osdmap);
+}
+
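+// Entry point from the OSD recovery queue: start up to max recovery or backfill
+// ops, reporting how many were started in *ops_started.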
+bool PrimaryLogPG::start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle,
+ uint64_t *ops_started)
+{
+ uint64_t& started = *ops_started;
+ started = 0;
+ bool work_in_progress = false;
+ bool recovery_started = false;
+ ceph_assert(is_primary());
+ ceph_assert(is_peered());
+ ceph_assert(!recovery_state.is_deleting());
+
+ ceph_assert(recovery_queued);
+ recovery_queued = false;
+
+ if (!state_test(PG_STATE_RECOVERING) &&
+ !state_test(PG_STATE_BACKFILLING)) {
+ /* TODO: I think this case is broken and will make do_recovery()
+ * unhappy since we're returning false */
+ dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
+ return have_unfound();
+ }
+
+ const auto &missing = recovery_state.get_pg_log().get_missing();
+
+ uint64_t num_unfound = get_num_unfound();
+
+ if (!recovery_state.have_missing()) {
+ recovery_state.local_recovery_complete();
+ }
+
+ if (!missing.have_missing() || // Primary does not have missing
+ // or all of the missing objects are unfound.
+ recovery_state.all_missing_unfound()) {
+ // Recover the replicas.
+ started = recover_replicas(max, handle, &recovery_started);
+ }
+ if (!started) {
+ // We still have missing objects that we should grab from replicas.
+ started += recover_primary(max, handle);
+ }
+ if (!started && num_unfound != get_num_unfound()) {
+ // second chance to recover replicas
+ started = recover_replicas(max, handle, &recovery_started);
+ }
+
+ if (started || recovery_started)
+ work_in_progress = true;
+
+ bool deferred_backfill = false;
+ if (recovering.empty() &&
+ state_test(PG_STATE_BACKFILLING) &&
+ !get_backfill_targets().empty() && started < max &&
+ missing.num_missing() == 0 &&
+ waiting_on_backfill.empty()) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
+ dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
+ deferred_backfill = true;
+ } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
+ !is_degraded()) {
+ dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
+ deferred_backfill = true;
+ } else if (!recovery_state.is_backfill_reserved()) {
+ /* DNMNOTE I think this branch is dead */
+ dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
+ if (!backfill_reserving) {
+ dout(10) << "queueing RequestBackfill" << dendl;
+ backfill_reserving = true;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ }
+ deferred_backfill = true;
+ } else {
+ started += recover_backfill(max - started, handle, &work_in_progress);
+ }
+ }
+
+ dout(10) << " started " << started << dendl;
+ osd->logger->inc(l_osd_rop, started);
+
+ if (!recovering.empty() ||
+ work_in_progress || recovery_ops_active > 0 || deferred_backfill)
+ return !work_in_progress && have_unfound();
+
+ ceph_assert(recovering.empty());
+ ceph_assert(recovery_ops_active == 0);
+
+ dout(10) << __func__ << " needs_recovery: "
+ << recovery_state.get_missing_loc().get_needs_recovery()
+ << dendl;
+ dout(10) << __func__ << " missing_loc: "
+ << recovery_state.get_missing_loc().get_missing_locs()
+ << dendl;
+ int unfound = get_num_unfound();
+ if (unfound) {
+ dout(10) << " still have " << unfound << " unfound" << dendl;
+ return true;
+ }
+
+ if (missing.num_missing() > 0) {
+ // this shouldn't happen!
+ osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
+ << missing.num_missing() << ": " << missing.get_items();
+ return false;
+ }
+
+ if (needs_recovery()) {
+ // this shouldn't happen!
+ // We already checked num_missing() so we must have missing replicas
+ osd->clog->error() << info.pgid
+ << " Unexpected Error: recovery ending with missing replicas";
+ return false;
+ }
+
+ if (state_test(PG_STATE_RECOVERING)) {
+ state_clear(PG_STATE_RECOVERING);
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ if (needs_backfill()) {
+ dout(10) << "recovery done, queuing backfill" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::RequestBackfill())));
+ } else {
+ dout(10) << "recovery done, no backfill" << dendl;
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::AllReplicasRecovered())));
+ }
+ } else { // backfilling
+ state_clear(PG_STATE_BACKFILLING);
+ state_clear(PG_STATE_FORCED_BACKFILL);
+ state_clear(PG_STATE_FORCED_RECOVERY);
+ dout(10) << "recovery done, backfill done" << dendl;
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::Backfilled())));
+ }
+
+ return false;
+}
+
+/**
+ * recover objects that are missing on the primary.
+ * return the number of recovery ops started (at most max).
+ */
+uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_primary());
+
+ const auto &missing = recovery_state.get_pg_log().get_missing();
+
+ dout(10) << __func__ << " recovering " << recovering.size()
+ << " in pg,"
+ << " missing " << missing << dendl;
+
+ dout(25) << __func__ << " " << missing.get_items() << dendl;
+
+ // look at log!
+ pg_log_entry_t *latest = 0;
+ unsigned started = 0;
+ int skipped = 0;
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ map<version_t, hobject_t>::const_iterator p =
+ missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
+ while (p != missing.get_rmissing().end()) {
+ handle.reset_tp_timeout();
+ hobject_t soid;
+ version_t v = p->first;
+
+ auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
+ if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
+ latest = it_objects->second;
+ ceph_assert(latest->is_update() || latest->is_delete());
+ soid = latest->soid;
+ } else {
+ latest = 0;
+ soid = p->second;
+ }
+ const pg_missing_item& item = missing.get_items().find(p->second)->second;
+ ++p;
+
+ hobject_t head = soid.get_head();
+
+ eversion_t need = item.need;
+
+ dout(10) << __func__ << " "
+ << soid << " " << item.need
+ << (missing.is_missing(soid) ? " (missing)":"")
+ << (missing.is_missing(head) ? " (missing head)":"")
+ << (recovering.count(soid) ? " (recovering)":"")
+ << (recovering.count(head) ? " (recovering head)":"")
+ << dendl;
+
+ if (latest) {
+ switch (latest->op) {
+ case pg_log_entry_t::CLONE:
+ /*
+ * Handling for this special case removed for now, until we
+ * can correctly construct an accurate SnapSet from the old
+ * one.
+ */
+ break;
+
+ case pg_log_entry_t::LOST_REVERT:
+ {
+ if (item.have == latest->reverting_to) {
+ ObjectContextRef obc = get_object_context(soid, true);
+
+ if (obc->obs.oi.version == latest->version) {
+ // I'm already reverting
+ dout(10) << " already reverting " << soid << dendl;
+ } else {
+ dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
+ obc->obs.oi.version = latest->version;
+
+ ObjectStore::Transaction t;
+ bufferlist b2;
+ obc->obs.oi.encode(
+ b2,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+ ceph_assert(!pool.info.require_rollback());
+ t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
+
+ recovery_state.recover_got(
+ soid,
+ latest->version,
+ false,
+ t);
+
+ ++active_pushes;
+
+ t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
+ t.register_on_commit(new C_OSD_CommittedPushedObject(
+ this,
+ get_osdmap_epoch(),
+ info.last_complete));
+ osd->store->queue_transaction(ch, std::move(t));
+ continue;
+ }
+ } else {
+ /*
+ * Pull the old version of the object. Update missing_loc here to have the location
+ * of the version we want.
+ *
+ * This doesn't use the usual missing_loc paths, but that's okay:
+ * - if we have it locally, we hit the case above, and go from there.
+ * - if we don't, we always pass through this case during recovery and set up the location
+ * properly.
+ * - this way we don't need to mangle the missing code to be general about needing an old
+ * version...
+ */
+ eversion_t alternate_need = latest->reverting_to;
+ dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
+
+ set<pg_shard_t> good_peers;
+ for (auto p = recovery_state.get_peer_missing().begin();
+ p != recovery_state.get_peer_missing().end();
+ ++p) {
+ if (p->second.is_missing(soid, need) &&
+ p->second.get_items().at(soid).have == alternate_need) {
+ good_peers.insert(p->first);
+ }
+ }
+ recovery_state.set_revert_with_targets(
+ soid,
+ good_peers);
+ dout(10) << " will pull " << alternate_need << " or " << need
+ << " from one of "
+ << recovery_state.get_missing_loc().get_locations(soid)
+ << dendl;
+ }
+ }
+ break;
+ }
+ }
+
+ if (!recovering.count(soid)) {
+ if (recovering.count(head)) {
+ ++skipped;
+ } else {
+ int r = recover_missing(
+ soid, need, get_recovery_op_priority(), h);
+ switch (r) {
+ case PULL_YES:
+ ++started;
+ break;
+ case PULL_HEAD:
+ ++started;
+ case PULL_NONE:
+ ++skipped;
+ break;
+ default:
+ ceph_abort();
+ }
+ if (started >= max)
+ break;
+ }
+ }
+
+ // only advance last_requested if we haven't skipped anything
+ if (!skipped)
+ recovery_state.set_last_requested(v);
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+ return started;
+}
+
+bool PrimaryLogPG::primary_error(
+ const hobject_t& soid, eversion_t v)
+{
+ recovery_state.force_object_missing(pg_whoami, soid, v);
+ bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
+ if (uhoh)
+ osd->clog->error() << info.pgid << " missing primary copy of "
+ << soid << ", unfound";
+ else
+ osd->clog->error() << info.pgid << " missing primary copy of "
+ << soid
+ << ", will try copies on "
+ << recovery_state.get_missing_loc().get_locations(soid);
+ return uhoh;
+}
+
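+// Start a delete recovery op that removes soid from replicas still carrying a
+// stale copy; the recovery read lock is taken first when an object context exists.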
+int PrimaryLogPG::prep_object_replica_deletes(
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started)
+{
+ ceph_assert(is_primary());
+ dout(10) << __func__ << ": on " << soid << dendl;
+
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (obc) {
+ if (!obc->get_recovery_read()) {
+ dout(20) << "replica delete delayed on " << soid
+ << "; could not get rw_manager lock" << dendl;
+ *work_started = true;
+ return 0;
+ } else {
+ dout(20) << "replica delete got recovery read lock on " << soid
+ << dendl;
+ }
+ }
+
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ if (!obc)
+ recovering.insert(make_pair(soid, ObjectContextRef()));
+ else
+ recovering.insert(make_pair(soid, obc));
+
+ pgbackend->recover_delete_object(soid, v, h);
+ return 1;
+}
+
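+// Start a push of soid to the replicas missing it. For a clone whose head is also
+// missing, the head is recovered first; the recovery read lock is taken before
+// pushing.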
+int PrimaryLogPG::prep_object_replica_pushes(
+ const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started)
+{
+ ceph_assert(is_primary());
+ dout(10) << __func__ << ": on " << soid << dendl;
+
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ // do we have the head and/or snapdir?
+ hobject_t head = soid.get_head();
+ if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
+ if (recovering.count(head)) {
+ dout(10) << " missing but already recovering head " << head << dendl;
+ return 0;
+ } else {
+ int r = recover_missing(
+ head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
+ get_recovery_op_priority(), h);
+ if (r != PULL_NONE)
+ return 1;
+ return 0;
+ }
+ }
+ }
+
+ // NOTE: we know we will get a valid oloc off of disk here.
+ ObjectContextRef obc = get_object_context(soid, false);
+ if (!obc) {
+ primary_error(soid, v);
+ return 0;
+ }
+
+ if (!obc->get_recovery_read()) {
+ dout(20) << "recovery delayed on " << soid
+ << "; could not get rw_manager lock" << dendl;
+ *work_started = true;
+ return 0;
+ } else {
+ dout(20) << "recovery got recovery read lock on " << soid
+ << dendl;
+ }
+
+ start_recovery_op(soid);
+ ceph_assert(!recovering.count(soid));
+ recovering.insert(make_pair(soid, obc));
+
+ int r = pgbackend->recover_object(
+ soid,
+ v,
+ ObjectContextRef(),
+ obc, // has snapset context
+ h);
+ if (r < 0) {
+ dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
+ on_failed_pull({ pg_whoami }, soid, v);
+ return 0;
+ }
+ return 1;
+}
+
+uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *work_started)
+{
+ dout(10) << __func__ << "(" << max << ")" << dendl;
+ uint64_t started = 0;
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+
+ // this is FAR from an optimal recovery order. pretty lame, really.
+ ceph_assert(!get_acting_recovery_backfill().empty());
+ // choose which replicas to recover: the replica with the shortest missing list
+ // goes first so we can bring it back to normal ASAP
+ std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
+ async_by_num_missing;
+ replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
+ for (auto &p: get_acting_recovery_backfill()) {
+ if (p == get_primary()) {
+ continue;
+ }
+ auto pm = recovery_state.get_peer_missing().find(p);
+ ceph_assert(pm != recovery_state.get_peer_missing().end());
+ auto nm = pm->second.num_missing();
+ if (nm != 0) {
+ if (is_async_recovery_target(p)) {
+ async_by_num_missing.push_back(make_pair(nm, p));
+ } else {
+ replicas_by_num_missing.push_back(make_pair(nm, p));
+ }
+ }
+ }
+ // sort by number of missing objects, in ascending order.
+ auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
+ const std::pair<unsigned int, pg_shard_t> &rhs) {
+ return lhs.first < rhs.first;
+ };
+ // acting goes first
+ std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
+ // then async_recovery_targets
+ std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
+ replicas_by_num_missing.insert(replicas_by_num_missing.end(),
+ async_by_num_missing.begin(), async_by_num_missing.end());
+ for (auto &replica: replicas_by_num_missing) {
+ pg_shard_t &peer = replica.second;
+ ceph_assert(peer != get_primary());
+ auto pm = recovery_state.get_peer_missing().find(peer);
+ ceph_assert(pm != recovery_state.get_peer_missing().end());
+ size_t m_sz = pm->second.num_missing();
+
+ dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
+ dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
+
+ // oldest first!
+ const pg_missing_t &m(pm->second);
+ for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
+ p != m.get_rmissing().end() && started < max;
+ ++p) {
+ handle.reset_tp_timeout();
+ const hobject_t soid(p->second);
+
+ if (recovery_state.get_missing_loc().is_unfound(soid)) {
+ dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
+ continue;
+ }
+
+ const pg_info_t &pi = recovery_state.get_peer_info(peer);
+ if (soid > pi.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object " << soid << " last_backfill "
+ << pi.last_backfill << dendl;
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ ceph_abort();
+ }
+ continue;
+ }
+
+ if (recovering.count(soid)) {
+ dout(10) << __func__ << ": already recovering " << soid << dendl;
+ continue;
+ }
+
+ if (recovery_state.get_missing_loc().is_deleted(soid)) {
+ dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
+ map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
+ started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
+ continue;
+ }
+
+ if (soid.is_snap() &&
+ recovery_state.get_pg_log().get_missing().is_missing(
+ soid.get_head())) {
+ dout(10) << __func__ << ": " << soid.get_head()
+ << " still missing on primary" << dendl;
+ continue;
+ }
+
+ if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
+ dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
+ continue;
+ }
+
+ dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
+ map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
+ started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
+ }
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+ return started;
+}
+
+hobject_t PrimaryLogPG::earliest_peer_backfill() const
+{
+ hobject_t e = hobject_t::get_max();
+ for (const pg_shard_t& peer : get_backfill_targets()) {
+ const auto iter = peer_backfill_info.find(peer);
+ ceph_assert(iter != peer_backfill_info.end());
+ e = std::min(e, iter->second.begin);
+ }
+ return e;
+}
+
+bool PrimaryLogPG::all_peer_done() const
+{
+ // Primary hasn't got any more objects
+ ceph_assert(backfill_info.empty());
+
+ for (const pg_shard_t& bt : get_backfill_targets()) {
+ const auto piter = peer_backfill_info.find(bt);
+ ceph_assert(piter != peer_backfill_info.end());
+ const BackfillInterval& pbi = piter->second;
+ // See if peer has more to process
+ if (!pbi.extends_to_end() || !pbi.empty())
+ return false;
+ }
+ return true;
+}
+
+/**
+ * recover_backfill
+ *
+ * Invariants:
+ *
+ * backfilled: fully pushed to replica or present in replica's missing set (both
+ * our copy and theirs).
+ *
+ * All objects on a backfill_target in
+ * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
+ * objects have been actually deleted and all logically-valid objects are replicated.
+ * There may be PG objects in this interval yet to be backfilled.
+ *
+ * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
+ * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
+ *
+ * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
+ * backfill_info.begin) in PG are backfilled. No deleted objects in this
+ * interval remain on the backfill target.
+ *
+ * For a backfill target, all objects <= peer_info[target].last_backfill
+ * have been backfilled to target
+ *
+ * There *MAY* be missing/outdated objects between last_backfill_started and
+ * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
+ * io created objects since the last scan. For this reason, we call
+ * update_range() again before continuing backfill.
+ */
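+/*
+ * Concrete reading of the invariants above (hypothetical target t, with
+ * backfill_info.begin == B, peer_backfill_info[t].begin == C and B < C):
+ * anything t already holds below C is correct (stale copies deleted, valid
+ * copies up to date), every PG object below B has been pushed to all
+ * targets, but PG objects in [B, C) that t lacks entirely may still need
+ * to be created there, so t's last_backfill cannot advance past
+ * std::min(B, C) == B until that range has been processed.
+ */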
+uint64_t PrimaryLogPG::recover_backfill(
+ uint64_t max,
+ ThreadPool::TPHandle &handle, bool *work_started)
+{
+ dout(10) << __func__ << " (" << max << ")"
+ << " bft=" << get_backfill_targets()
+ << " last_backfill_started " << last_backfill_started
+ << (new_backfill ? " new_backfill":"")
+ << dendl;
+ ceph_assert(!get_backfill_targets().empty());
+
+ // Initialize from prior backfill state
+ if (new_backfill) {
+ // on_activate() was called prior to getting here
+ ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
+ new_backfill = false;
+
+ // initialize BackfillIntervals
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ peer_backfill_info[*i].reset(
+ recovery_state.get_peer_info(*i).last_backfill);
+ }
+ backfill_info.reset(last_backfill_started);
+
+ backfills_in_flight.clear();
+ pending_backfill_updates.clear();
+ }
+
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ dout(10) << "peer osd." << *i
+ << " info " << recovery_state.get_peer_info(*i)
+ << " interval " << peer_backfill_info[*i].begin
+ << "-" << peer_backfill_info[*i].end
+ << " " << peer_backfill_info[*i].objects.size() << " objects"
+ << dendl;
+ }
+
+ // update our local interval to cope with recent changes
+ backfill_info.begin = last_backfill_started;
+ update_range(&backfill_info, handle);
+
+ unsigned ops = 0;
+ vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
+ set<hobject_t> add_to_stat;
+
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ peer_backfill_info[*i].trim_to(
+ std::max(
+ recovery_state.get_peer_info(*i).last_backfill,
+ last_backfill_started));
+ }
+ backfill_info.trim_to(last_backfill_started);
+
+ PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+ while (ops < max) {
+ if (backfill_info.begin <= earliest_peer_backfill() &&
+ !backfill_info.extends_to_end() && backfill_info.empty()) {
+ hobject_t next = backfill_info.end;
+ backfill_info.reset(next);
+ backfill_info.end = hobject_t::get_max();
+ update_range(&backfill_info, handle);
+ backfill_info.trim();
+ }
+
+ dout(20) << " my backfill interval " << backfill_info << dendl;
+
+ bool sent_scan = false;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+
+ dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
+ if (pbi.begin <= backfill_info.begin &&
+ !pbi.extends_to_end() && pbi.empty()) {
+ dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
+ epoch_t e = get_osdmap_epoch();
+ MOSDPGScan *m = new MOSDPGScan(
+ MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard),
+ pbi.end, hobject_t());
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
+ ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
+ waiting_on_backfill.insert(bt);
+ sent_scan = true;
+ }
+ }
+
+ // Count simultaneous scans as a single op and let those complete
+ if (sent_scan) {
+ ops++;
+ start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
+ break;
+ }
+
+ if (backfill_info.empty() && all_peer_done()) {
+ dout(10) << " reached end for both local and all peers" << dendl;
+ break;
+ }
+
+ // Get object within set of peers to operate on and
+ // the set of targets for which that object applies.
+ hobject_t check = earliest_peer_backfill();
+
+ if (check < backfill_info.begin) {
+
+ set<pg_shard_t> check_targets;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ if (pbi.begin == check)
+ check_targets.insert(bt);
+ }
+ ceph_assert(!check_targets.empty());
+
+ dout(20) << " BACKFILL removing " << check
+ << " from peers " << check_targets << dendl;
+ for (set<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ ceph_assert(pbi.begin == check);
+
+ to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
+ pbi.pop_front();
+ }
+
+ last_backfill_started = check;
+
+ // Don't increment ops here because deletions
+ // are cheap and, unlike real recovery_ops, are not replied to,
+ // and we can't increment ops without requeueing ourselves
+ // for recovery.
+ } else {
+ eversion_t& obj_v = backfill_info.objects.begin()->second;
+
+ vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ // Find all check peers that have the wrong version
+ if (check == backfill_info.begin && check == pbi.begin) {
+ if (pbi.objects.begin()->second != obj_v) {
+ need_ver_targs.push_back(bt);
+ } else {
+ keep_ver_targs.push_back(bt);
+ }
+ } else {
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
+
+ // Only include peers whose backfill line we've caught up to;
+ // otherwise, they only appear to be missing this object
+ // because their pbi.begin > backfill_info.begin.
+ if (backfill_info.begin > pinfo.last_backfill)
+ missing_targs.push_back(bt);
+ else
+ skip_targs.push_back(bt);
+ }
+ }
+
+ if (!keep_ver_targs.empty()) {
+ // These peers have version obj_v
+ dout(20) << " BACKFILL keeping " << check
+ << " with ver " << obj_v
+ << " on peers " << keep_ver_targs << dendl;
+ //assert(!waiting_for_degraded_object.count(check));
+ }
+ if (!need_ver_targs.empty() || !missing_targs.empty()) {
+ ObjectContextRef obc = get_object_context(backfill_info.begin, false);
+ ceph_assert(obc);
+ if (obc->get_recovery_read()) {
+ if (!need_ver_targs.empty()) {
+ dout(20) << " BACKFILL replacing " << check
+ << " with ver " << obj_v
+ << " to peers " << need_ver_targs << dendl;
+ }
+ if (!missing_targs.empty()) {
+ dout(20) << " BACKFILL pushing " << backfill_info.begin
+ << " with ver " << obj_v
+ << " to peers " << missing_targs << dendl;
+ }
+ vector<pg_shard_t> all_push = need_ver_targs;
+ all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
+
+ handle.reset_tp_timeout();
+ int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
+ if (r < 0) {
+ *work_started = true;
+ dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
+ break;
+ }
+ ops++;
+ } else {
+ *work_started = true;
+ dout(20) << "backfill blocking on " << backfill_info.begin
+ << "; could not get rw_manager lock" << dendl;
+ break;
+ }
+ }
+ dout(20) << "need_ver_targs=" << need_ver_targs
+ << " keep_ver_targs=" << keep_ver_targs << dendl;
+ dout(20) << "backfill_targets=" << get_backfill_targets()
+ << " missing_targs=" << missing_targs
+ << " skip_targs=" << skip_targs << dendl;
+
+ last_backfill_started = backfill_info.begin;
+ add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
+ backfill_info.pop_front();
+ vector<pg_shard_t> check_targets = need_ver_targs;
+ check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
+ for (vector<pg_shard_t>::iterator i = check_targets.begin();
+ i != check_targets.end();
+ ++i) {
+ pg_shard_t bt = *i;
+ BackfillInterval& pbi = peer_backfill_info[bt];
+ pbi.pop_front();
+ }
+ }
+ }
+
+ for (set<hobject_t>::iterator i = add_to_stat.begin();
+ i != add_to_stat.end();
+ ++i) {
+ ObjectContextRef obc = get_object_context(*i, false);
+ ceph_assert(obc);
+ pg_stat_t stat;
+ add_object_context_to_pg_stat(obc, &stat);
+ pending_backfill_updates[*i] = stat;
+ }
+ map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
+ for (unsigned i = 0; i < to_remove.size(); ++i) {
+ handle.reset_tp_timeout();
+ const hobject_t& oid = to_remove[i].get<0>();
+ eversion_t v = to_remove[i].get<1>();
+ pg_shard_t peer = to_remove[i].get<2>();
+ MOSDPGBackfillRemove *m;
+ auto it = reqs.find(peer);
+ if (it != reqs.end()) {
+ m = it->second;
+ } else {
+ m = reqs[peer] = new MOSDPGBackfillRemove(
+ spg_t(info.pgid.pgid, peer.shard),
+ get_osdmap_epoch());
+ }
+ m->ls.push_back(make_pair(oid, v));
+
+ if (oid <= last_backfill_started)
+ pending_backfill_updates[oid]; // add empty stat!
+ }
+ for (auto p : reqs) {
+ osd->send_message_osd_cluster(p.first.osd, p.second,
+ get_osdmap_epoch());
+ }
+
+ pgbackend->run_recovery_op(h, get_recovery_op_priority());
+
+ hobject_t backfill_pos =
+ std::min(backfill_info.begin, earliest_peer_backfill());
+ dout(5) << "backfill_pos is " << backfill_pos << dendl;
+ for (set<hobject_t>::iterator i = backfills_in_flight.begin();
+ i != backfills_in_flight.end();
+ ++i) {
+ dout(20) << *i << " is still in flight" << dendl;
+ }
+
+ hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
+ backfill_pos : *(backfills_in_flight.begin());
+ hobject_t new_last_backfill = recovery_state.earliest_backfill();
+ dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
+ for (map<hobject_t, pg_stat_t>::iterator i =
+ pending_backfill_updates.begin();
+ i != pending_backfill_updates.end() &&
+ i->first < next_backfill_to_complete;
+ pending_backfill_updates.erase(i++)) {
+ dout(20) << " pending_backfill_update " << i->first << dendl;
+ ceph_assert(i->first > new_last_backfill);
+ // carried over from a previous round; if we are here, then we had to
+ // be requeued (e.g. by on_global_recover()) and those operations
+ // are done.
+ recovery_state.update_complete_backfill_object_stats(
+ i->first,
+ i->second);
+ new_last_backfill = i->first;
+ }
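+ // Example (hypothetical objects A < B < X < D): if backfills_in_flight
+ // still contains X, next_backfill_to_complete is X, so the pending
+ // updates for A and B are applied and erased, new_last_backfill stops at
+ // B, and the entries for X and D stay queued until X's push completes.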
+ dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
+
+ ceph_assert(!pending_backfill_updates.empty() ||
+ new_last_backfill == last_backfill_started);
+ if (pending_backfill_updates.empty() &&
+ backfill_pos.is_max()) {
+ ceph_assert(backfills_in_flight.empty());
+ new_last_backfill = backfill_pos;
+ last_backfill_started = backfill_pos;
+ }
+ dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
+
+ // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
+ // all the backfill targets. Otherwise, we will move last_backfill up on
+ // those targets that need it and send OP_BACKFILL_PROGRESS to them.
+ for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
+ i != get_backfill_targets().end();
+ ++i) {
+ pg_shard_t bt = *i;
+ const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
+
+ if (new_last_backfill > pinfo.last_backfill) {
+ recovery_state.update_peer_last_backfill(bt, new_last_backfill);
+ epoch_t e = get_osdmap_epoch();
+ MOSDPGBackfill *m = NULL;
+ if (pinfo.last_backfill.is_max()) {
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_FINISH,
+ e,
+ get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ start_recovery_op(hobject_t::get_max());
+ } else {
+ m = new MOSDPGBackfill(
+ MOSDPGBackfill::OP_BACKFILL_PROGRESS,
+ e,
+ get_last_peering_reset(),
+ spg_t(info.pgid.pgid, bt.shard));
+ // Use default priority here, must match sub_op priority
+ }
+ m->last_backfill = pinfo.last_backfill;
+ m->stats = pinfo.stats;
+ osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
+ dout(10) << " peer " << bt
+ << " num_objects now " << pinfo.stats.stats.sum.num_objects
+ << " / " << info.stats.stats.sum.num_objects << dendl;
+ }
+ }
+
+ if (ops)
+ *work_started = true;
+ return ops;
+}
+
+int PrimaryLogPG::prep_backfill_object_push(
+ hobject_t oid, eversion_t v,
+ ObjectContextRef obc,
+ vector<pg_shard_t> peers,
+ PGBackend::RecoveryHandle *h)
+{
+ dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
+ ceph_assert(!peers.empty());
+
+ backfills_in_flight.insert(oid);
+ recovery_state.prepare_backfill_for_missing(oid, v, peers);
+
+ ceph_assert(!recovering.count(oid));
+
+ start_recovery_op(oid);
+ recovering.insert(make_pair(oid, obc));
+
+ int r = pgbackend->recover_object(
+ oid,
+ v,
+ ObjectContextRef(),
+ obc,
+ h);
+ if (r < 0) {
+ dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
+ on_failed_pull({ pg_whoami }, oid, v);
+ }
+ return r;
+}
+
+void PrimaryLogPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+
+ if (bi->version < info.log_tail) {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ bi->version = info.last_update;
+ scan_range(local_min, local_max, bi, handle);
+ }
+
+ if (bi->version >= projected_last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ ceph_assert(bi->version == projected_last_update);
+ } else if (bi->version >= info.log_tail) {
+ if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
+ /* Because we don't move log_tail on split, the log might be
+ * empty even if log_tail != last_update. However, the only
+ * way to get here with an empty log is if log_tail is actually
+ * eversion_t(), because otherwise the entry which changed
+ * last_update since the last scan would have to be present.
+ */
+ ceph_assert(bi->version == eversion_t());
+ return;
+ }
+
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log to projected_last_update "
+ << projected_last_update << dendl;
+
+ auto func = [&](const pg_log_entry_t &e) {
+ dout(10) << __func__ << ": updating from version " << e.version
+ << dendl;
+ const hobject_t &soid = e.soid;
+ if (soid >= bi->begin &&
+ soid < bi->end) {
+ if (e.is_update()) {
+ dout(10) << __func__ << ": " << e.soid << " updated to version "
+ << e.version << dendl;
+ bi->objects.erase(e.soid);
+ bi->objects.insert(
+ make_pair(
+ e.soid,
+ e.version));
+ } else if (e.is_delete()) {
+ dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
+ bi->objects.erase(e.soid);
+ }
+ }
+ };
+ dout(10) << "scanning pg log first" << dendl;
+ recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
+ dout(10) << "scanning projected log" << dendl;
+ projected_log.scan_log_after(bi->version, func);
+ bi->version = projected_last_update;
+ } else {
+ ceph_abort_msg("scan_range should have raised bi->version past log_tail");
+ }
+}
+
+void PrimaryLogPG::scan_range(
+ int min, int max, BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ ceph_assert(is_locked());
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->clear_objects();
+
+ vector<hobject_t> ls;
+ ls.reserve(max);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
+ ceph_assert(r >= 0);
+ dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
+ dout(20) << ls << dendl;
+
+ for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
+ handle.reset_tp_timeout();
+ ObjectContextRef obc;
+ if (is_primary())
+ obc = object_contexts.lookup(*p);
+ if (obc) {
+ if (!obc->obs.exists) {
+ /* If the object does not exist here, it must have been removed
+ * between the objects_list_partial call and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ continue;
+ }
+ bi->objects[*p] = obc->obs.oi.version;
+ dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
+ } else {
+ bufferlist bl;
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
+ /* If the object does not exist here, it must have been removed
+ * between the objects_list_partial call and here. This can happen
+ * for the first item in the range, which is usually last_backfill.
+ */
+ if (r == -ENOENT)
+ continue;
+
+ ceph_assert(r >= 0);
+ object_info_t oi(bl);
+ bi->objects[*p] = oi.version;
+ dout(20) << " " << *p << " " << oi.version << dendl;
+ }
+ }
+}
+
+
+/** check_local
+ *
+ * verifies that stray objects have been deleted
+ */
+void PrimaryLogPG::check_local()
+{
+ dout(10) << __func__ << dendl;
+
+ ceph_assert(
+ info.last_update >=
+ recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
+
+ if (!cct->_conf->osd_debug_verify_stray_on_activate)
+ return;
+
+ // just scan the log.
+ set<hobject_t> did;
+ for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
+ p != recovery_state.get_pg_log().get_log().log.rend();
+ ++p) {
+ if (did.count(p->soid))
+ continue;
+ did.insert(p->soid);
+
+ if (p->is_delete() && !is_missing_object(p->soid)) {
+ dout(10) << " checking " << p->soid
+ << " at " << p->version << dendl;
+ struct stat st;
+ int r = osd->store->stat(
+ ch,
+ ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
+ &st);
+ if (r != -ENOENT) {
+ derr << __func__ << " " << p->soid << " exists, but should have been "
+ << "deleted" << dendl;
+ ceph_abort_msg("erroneously present object");
+ }
+ } else {
+ // ignore old(+missing) objects
+ }
+ }
+}
+
+
+
+// ===========================
+// hit sets
+
+hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
+{
+ ostringstream ss;
+ ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
+ hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
+ info.pgid.ps(), info.pgid.pool(),
+ cct->_conf->osd_hit_set_namespace);
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt)
+{
+ ostringstream ss;
+ ss << "hit_set_" << info.pgid.pgid << "_archive_";
+ if (using_gmt) {
+ start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
+ end.gmtime(ss, true /* legacy pre-octopus form */);
+ } else {
+ start.localtime(ss, true /* legacy pre-octopus form */) << "_";
+ end.localtime(ss, true /* legacy pre-octopus form */);
+ }
+ hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
+ info.pgid.ps(), info.pgid.pool(),
+ cct->_conf->osd_hit_set_namespace);
+ dout(20) << __func__ << " " << hoid << dendl;
+ return hoid;
+}
+
+void PrimaryLogPG::hit_set_clear()
+{
+ dout(20) << __func__ << dendl;
+ hit_set.reset();
+ hit_set_start_stamp = utime_t();
+}
+
+void PrimaryLogPG::hit_set_setup()
+{
+ if (!is_active() ||
+ !is_primary()) {
+ hit_set_clear();
+ return;
+ }
+
+ if (is_active() && is_primary() &&
+ (!pool.info.hit_set_count ||
+ !pool.info.hit_set_period ||
+ pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
+ hit_set_clear();
+
+ // only primary is allowed to remove all the hit set objects
+ hit_set_remove_all();
+ return;
+ }
+
+ // FIXME: discard any previous data for now
+ hit_set_create();
+
+ // include any writes we know about from the pg log. this doesn't
+ // capture reads, but it is better than nothing!
+ hit_set_apply_log();
+}
+
+void PrimaryLogPG::hit_set_remove_all()
+{
+ // If any archives are degraded we skip this
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ // Once we hit a degraded object just skip
+ if (is_degraded_or_backfilling_object(aoid))
+ return;
+ if (m_scrubber->write_blocked_by_scrub(aoid))
+ return;
+ }
+
+ if (!info.hit_set.history.empty()) {
+ auto p = info.hit_set.history.rbegin();
+ ceph_assert(p != info.hit_set.history.rend());
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ ceph_assert(!is_degraded_or_backfilling_object(oid));
+ ObjectContextRef obc = get_object_context(oid, false);
+ ceph_assert(obc);
+
+ OpContextUPtr ctx = simple_opc_create(obc);
+ ctx->at_version = get_next_version();
+ ctx->updated_hset_history = info.hit_set;
+ utime_t now = ceph_clock_now();
+ ctx->mtime = now;
+ hit_set_trim(ctx, 0);
+ simple_opc_submit(std::move(ctx));
+ }
+
+ recovery_state.update_hset(pg_hit_set_history_t());
+ if (agent_state) {
+ agent_state->discard_hit_sets();
+ }
+}
+
+void PrimaryLogPG::hit_set_create()
+{
+ utime_t now = ceph_clock_now();
+ // make a copy of the params to modify
+ HitSet::Params params(pool.info.hit_set_params);
+
+ dout(20) << __func__ << " " << params << dendl;
+ if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
+ BloomHitSet::Params *p =
+ static_cast<BloomHitSet::Params*>(params.impl.get());
+
+ // convert false positive rate so it holds up across the full period
+ p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
+ if (p->get_fpp() <= 0.0)
+ p->set_fpp(.01); // fpp cannot be zero!
+
+ // if we don't have a specified size, estimate the target size based on
+ // the previous bin!
+ if (p->target_size == 0 && hit_set) {
+ utime_t dur = now - hit_set_start_stamp;
+ unsigned unique = hit_set->approx_unique_insert_count();
+ dout(20) << __func__ << " previous set had approx " << unique
+ << " unique items over " << dur << " seconds" << dendl;
+ p->target_size = (double)unique * (double)pool.info.hit_set_period
+ / (double)dur;
+ }
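+ // Example with hypothetical numbers: if the previous bin saw ~5000
+ // unique inserts over a 900 s lifetime and hit_set_period is 3600 s,
+ // the estimate is 5000 * 3600 / 900 = 20000 entries (clamped by
+ // osd_hit_set_min_size / osd_hit_set_max_size below); likewise, with
+ // hit_set_count == 8 the fpp conversion above turns a configured 0.04
+ // into 0.005 per bin.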
+ if (p->target_size <
+ static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
+ p->target_size = cct->_conf->osd_hit_set_min_size;
+
+ if (p->target_size
+ > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
+ p->target_size = cct->_conf->osd_hit_set_max_size;
+
+ p->seed = now.sec();
+
+ dout(10) << __func__ << " target_size " << p->target_size
+ << " fpp " << p->get_fpp() << dendl;
+ }
+ hit_set.reset(new HitSet(params));
+ hit_set_start_stamp = now;
+}
+
+/**
+ * apply log entries to set
+ *
+ * this would only happen after peering, to at least capture writes
+ * during an interval that was potentially lost.
+ */
+bool PrimaryLogPG::hit_set_apply_log()
+{
+ if (!hit_set)
+ return false;
+
+ eversion_t to = info.last_update;
+ eversion_t from = info.hit_set.current_last_update;
+ if (to <= from) {
+ dout(20) << __func__ << " no update" << dendl;
+ return false;
+ }
+
+ dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
+ list<pg_log_entry_t>::const_reverse_iterator p =
+ recovery_state.get_pg_log().get_log().log.rbegin();
+ while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
+ ++p;
+ while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
+ hit_set->insert(p->soid);
+ ++p;
+ }
+
+ return true;
+}
+
+void PrimaryLogPG::hit_set_persist()
+{
+ dout(10) << __func__ << dendl;
+ bufferlist bl;
+ unsigned max = pool.info.hit_set_count;
+
+ utime_t now = ceph_clock_now();
+ hobject_t oid;
+
+ // If any archives are degraded we skip this persist request; the
+ // additional entry being added is accounted for below
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end();
+ ++p) {
+ hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ // Once we hit a degraded object just skip further trim
+ if (is_degraded_or_backfilling_object(aoid))
+ return;
+ if (m_scrubber->write_blocked_by_scrub(aoid))
+ return;
+ }
+
+ // If backfill is in progress and we could possibly overlap with the
+ // hit_set_* objects, back off. Since these all have
+ // hobject_t::hash set to pgid.ps(), and those sort first, we can
+ // look just at that. This is necessary because our transactions
+ // may include a modify of the new hit_set *and* a delete of the
+ // old one, and this may span the backfill boundary.
+ for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
+ p != get_backfill_targets().end();
+ ++p) {
+ const pg_info_t& pi = recovery_state.get_peer_info(*p);
+ if (pi.last_backfill == hobject_t() ||
+ pi.last_backfill.get_hash() == info.pgid.ps()) {
+ dout(10) << __func__ << " backfill target osd." << *p
+ << " last_backfill has not progressed past pgid ps"
+ << dendl;
+ return;
+ }
+ }
+
+
+ pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
+ new_hset.begin = hit_set_start_stamp;
+ new_hset.end = now;
+ oid = get_hit_set_archive_object(
+ new_hset.begin,
+ new_hset.end,
+ new_hset.using_gmt);
+
+ // If the current object is degraded we skip this persist request
+ if (m_scrubber->write_blocked_by_scrub(oid))
+ return;
+
+ hit_set->seal();
+ encode(*hit_set, bl);
+ dout(20) << __func__ << " archive " << oid << dendl;
+
+ if (agent_state) {
+ agent_state->add_hit_set(new_hset.begin, hit_set);
+ uint32_t size = agent_state->hit_set_map.size();
+ if (size >= pool.info.hit_set_count) {
+ size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
+ }
+ hit_set_in_memory_trim(size);
+ }
+
+ ObjectContextRef obc = get_object_context(oid, true);
+ OpContextUPtr ctx = simple_opc_create(obc);
+
+ ctx->at_version = get_next_version();
+ ctx->updated_hset_history = info.hit_set;
+ pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
+
+ updated_hit_set_hist.current_last_update = info.last_update;
+ new_hset.version = ctx->at_version;
+
+ updated_hit_set_hist.history.push_back(new_hset);
+ hit_set_create();
+
+ // fabricate an object_info_t and SnapSet
+ obc->obs.oi.version = ctx->at_version;
+ obc->obs.oi.mtime = now;
+ obc->obs.oi.size = bl.length();
+ obc->obs.exists = true;
+ obc->obs.oi.set_data_digest(bl.crc32c(-1));
+
+ ctx->new_obs = obc->obs;
+
+ ctx->new_snapset = obc->ssc->snapset;
+
+ ctx->delta_stats.num_objects++;
+ ctx->delta_stats.num_objects_hit_set_archive++;
+
+ ctx->delta_stats.num_bytes += bl.length();
+ ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
+
+ bufferlist bss;
+ encode(ctx->new_snapset, bss);
+ bufferlist boi(sizeof(ctx->new_obs.oi));
+ encode(ctx->new_obs.oi, boi,
+ get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ ctx->op_t->create(oid);
+ if (bl.length()) {
+ ctx->op_t->write(oid, 0, bl.length(), bl, 0);
+ write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
+ 0, bl.length());
+ ctx->clean_regions.mark_data_region_dirty(0, bl.length());
+ }
+ map <string, bufferlist> attrs;
+ attrs[OI_ATTR] = std::move(boi);
+ attrs[SS_ATTR] = std::move(bss);
+ setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
+ ctx->log.push_back(
+ pg_log_entry_t(
+ pg_log_entry_t::MODIFY,
+ oid,
+ ctx->at_version,
+ eversion_t(),
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0)
+ );
+ ctx->log.back().clean_regions = ctx->clean_regions;
+
+ hit_set_trim(ctx, max);
+
+ simple_opc_submit(std::move(ctx));
+}
+
+void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
+{
+ ceph_assert(ctx->updated_hset_history);
+ pg_hit_set_history_t &updated_hit_set_hist =
+ *(ctx->updated_hset_history);
+ for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
+ list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
+ ceph_assert(p != updated_hit_set_hist.history.end());
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+
+ ceph_assert(!is_degraded_or_backfilling_object(oid));
+
+ dout(20) << __func__ << " removing " << oid << dendl;
+ ++ctx->at_version.version;
+ ctx->log.push_back(
+ pg_log_entry_t(pg_log_entry_t::DELETE,
+ oid,
+ ctx->at_version,
+ p->version,
+ 0,
+ osd_reqid_t(),
+ ctx->mtime,
+ 0));
+
+ ctx->op_t->remove(oid);
+ updated_hit_set_hist.history.pop_front();
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ ceph_assert(obc);
+ --ctx->delta_stats.num_objects;
+ --ctx->delta_stats.num_objects_hit_set_archive;
+ ctx->delta_stats.num_bytes -= obc->obs.oi.size;
+ ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
+ }
+}
+
+void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
+{
+ while (agent_state->hit_set_map.size() > max_in_memory) {
+ agent_state->remove_oldest_hit_set();
+ }
+}
+
+
+// =======================================
+// cache agent
+
+void PrimaryLogPG::agent_setup()
+{
+ ceph_assert(is_locked());
+ if (!is_active() ||
+ !is_primary() ||
+ state_test(PG_STATE_PREMERGE) ||
+ pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
+ pool.info.tier_of < 0 ||
+ !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
+ agent_clear();
+ return;
+ }
+ if (!agent_state) {
+ agent_state.reset(new TierAgentState);
+
+ // choose random starting position
+ agent_state->position = hobject_t();
+ agent_state->position.pool = info.pgid.pool();
+ agent_state->position.set_hash(pool.info.get_random_pg_position(
+ info.pgid.pgid,
+ rand()));
+ agent_state->start = agent_state->position;
+
+ dout(10) << __func__ << " allocated new state, position "
+ << agent_state->position << dendl;
+ } else {
+ dout(10) << __func__ << " keeping existing state" << dendl;
+ }
+
+ if (info.stats.stats_invalid) {
+ osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
+ }
+
+ agent_choose_mode();
+}
+
+void PrimaryLogPG::agent_clear()
+{
+ agent_stop();
+ agent_state.reset(NULL);
+}
+
+// Return false if no objects operated on since start of object hash space
+bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
+{
+ std::scoped_lock locker{*this};
+ if (!agent_state) {
+ dout(10) << __func__ << " no agent state, stopping" << dendl;
+ return true;
+ }
+
+ ceph_assert(!recovery_state.is_deleting());
+
+ if (agent_state->is_idle()) {
+ dout(10) << __func__ << " idle, stopping" << dendl;
+ return true;
+ }
+
+ osd->logger->inc(l_osd_agent_wake);
+
+ dout(10) << __func__
+ << " max " << start_max
+ << ", flush " << agent_state->get_flush_mode_name()
+ << ", evict " << agent_state->get_evict_mode_name()
+ << ", pos " << agent_state->position
+ << dendl;
+ ceph_assert(is_primary());
+ ceph_assert(is_active());
+
+ agent_load_hit_sets();
+
+ const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
+ ceph_assert(base_pool);
+
+ int ls_min = 1;
+ int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
+
+ // list some objects. this conveniently lists clones (oldest to
+ // newest) before heads... the same order we want to flush in.
+ //
+ // NOTE: do not flush the Sequencer. we will assume that the
+ // listing we get back is imprecise.
+ vector<hobject_t> ls;
+ hobject_t next;
+ int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
+ &ls, &next);
+ ceph_assert(r >= 0);
+ dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
+ int started = 0;
+ for (vector<hobject_t>::iterator p = ls.begin();
+ p != ls.end();
+ ++p) {
+ if (p->nspace == cct->_conf->osd_hit_set_namespace) {
+ dout(20) << __func__ << " skip (hit set) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (is_degraded_or_backfilling_object(*p)) {
+ dout(20) << __func__ << " skip (degraded) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (is_missing_object(p->get_head())) {
+ dout(20) << __func__ << " skip (missing head) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ ObjectContextRef obc = get_object_context(*p, false, NULL);
+ if (!obc) {
+ // we didn't flush; we may miss something here.
+ dout(20) << __func__ << " skip (no obc) " << *p << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (!obc->obs.exists) {
+ dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
+ obc->obs.oi.soid.get_head())) {
+ dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (obc->is_blocked()) {
+ dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+ if (obc->is_request_pending()) {
+ dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+
+ // be careful flushing omap to an EC pool.
+ if (!base_pool->supports_omap() &&
+ obc->obs.oi.is_omap()) {
+ dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ continue;
+ }
+
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
+ agent_maybe_evict(obc, false))
+ ++started;
+ else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
+ agent_flush_quota > 0 && agent_maybe_flush(obc)) {
+ ++started;
+ --agent_flush_quota;
+ }
+ if (started >= start_max) {
+ // If finishing early, set "next" to the next object
+ if (++p != ls.end())
+ next = *p;
+ break;
+ }
+ }
+
+ if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
+ dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
+ agent_state->hist_age = 0;
+ agent_state->temp_hist.decay();
+ }
+
+ // Total objects operated on so far
+ int total_started = agent_state->started + started;
+ bool need_delay = false;
+
+ dout(20) << __func__ << " start pos " << agent_state->position
+ << " next start pos " << next
+ << " started " << total_started << dendl;
+
+ // See if we've made a full pass over the object hash space.
+ // This might check at most ls_max objects a second time to notice that
+ // we've checked every object at least once.
+ if (agent_state->position < agent_state->start &&
+ next >= agent_state->start) {
+ dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
+ if (total_started == 0)
+ need_delay = true;
+ else
+ total_started = 0;
+ agent_state->start = next;
+ }
+ agent_state->started = total_started;
+
+ // See if we are starting from beginning
+ if (next.is_max())
+ agent_state->position = hobject_t();
+ else
+ agent_state->position = next;
+
+ // Discard old in memory HitSets
+ hit_set_in_memory_trim(pool.info.hit_set_count);
+
+ if (need_delay) {
+ ceph_assert(agent_state->delaying == false);
+ agent_delay();
+ return false;
+ }
+ agent_choose_mode();
+ return true;
+}
+
+void PrimaryLogPG::agent_load_hit_sets()
+{
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
+ return;
+ }
+
+ if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
+ dout(10) << __func__ << dendl;
+ for (auto p = info.hit_set.history.begin();
+ p != info.hit_set.history.end(); ++p) {
+ if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
+ dout(10) << __func__ << " loading " << p->begin << "-"
+ << p->end << dendl;
+ if (!pool.info.is_replicated()) {
+ // FIXME: EC not supported here yet
+ derr << __func__ << " on non-replicated pool" << dendl;
+ break;
+ }
+
+ hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
+ if (is_unreadable_object(oid)) {
+ dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
+ break;
+ }
+
+ ObjectContextRef obc = get_object_context(oid, false);
+ if (!obc) {
+ derr << __func__ << ": could not load hitset " << oid << dendl;
+ break;
+ }
+
+ bufferlist bl;
+ {
+ int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
+ ceph_assert(r >= 0);
+ }
+ HitSetRef hs(new HitSet);
+ bufferlist::const_iterator pbl = bl.begin();
+ decode(*hs, pbl);
+ agent_state->add_hit_set(p->begin.sec(), hs);
+ }
+ }
+ }
+}
+
+bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
+{
+ if (!obc->obs.oi.is_dirty()) {
+ dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+ if (obc->obs.oi.is_cache_pinned()) {
+ dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ utime_t now = ceph_clock_now();
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ bool evict_mode_full =
+ (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
+ if (!evict_mode_full &&
+ obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
+ (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
+ dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
+ dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
+
+ // FIXME: flush anything dirty, regardless of what distribution of
+ // ages we expect.
+
+ hobject_t oid = obc->obs.oi.soid;
+ osd->agent_start_op(oid);
+ // no need to capture a pg ref, can't outlive fop or ctx
+ std::function<void()> on_flush = [this, oid]() {
+ osd->agent_finish_op(oid);
+ };
+
+ int result = start_flush(
+ OpRequestRef(), obc, false, NULL,
+ on_flush);
+ if (result != -EINPROGRESS) {
+ on_flush();
+ dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
+ << " with " << result << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+
+ osd->logger->inc(l_osd_agent_flush);
+ return true;
+}
+
+bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
+{
+ const hobject_t& soid = obc->obs.oi.soid;
+ if (!after_flush && obc->obs.oi.is_dirty()) {
+ dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
+ return false;
+ }
+ // This is already checked by agent_work() which passes after_flush = false
+ if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
+ dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (!obc->obs.oi.watchers.empty()) {
+ dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->is_blocked()) {
+ dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
+ return false;
+ }
+ if (obc->obs.oi.is_cache_pinned()) {
+ dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+ return false;
+ }
+
+ if (soid.snap == CEPH_NOSNAP) {
+ int result = _verify_no_head_clones(soid, obc->ssc->snapset);
+ if (result < 0) {
+ dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
+ return false;
+ }
+ }
+
+ if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
+ // is this object older than cache_min_evict_age?
+ utime_t now = ceph_clock_now();
+ utime_t ob_local_mtime;
+ if (obc->obs.oi.local_mtime != utime_t()) {
+ ob_local_mtime = obc->obs.oi.local_mtime;
+ } else {
+ ob_local_mtime = obc->obs.oi.mtime;
+ }
+ if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
+ dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
+ osd->logger->inc(l_osd_agent_skip);
+ return false;
+ }
+ // is this object old and/or cold enough?
+ int temp = 0;
+ uint64_t temp_upper = 0, temp_lower = 0;
+ if (hit_set)
+ agent_estimate_temp(soid, &temp);
+ agent_state->temp_hist.add(temp);
+ agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
+
+ dout(20) << __func__
+ << " temp " << temp
+ << " pos " << temp_lower << "-" << temp_upper
+ << ", evict_effort " << agent_state->evict_effort
+ << dendl;
+ dout(30) << "agent_state:\n";
+ Formatter *f = Formatter::create("");
+ f->open_object_section("agent_state");
+ agent_state->dump(f);
+ f->close_section();
+ f->flush(*_dout);
+ delete f;
+ *_dout << dendl;
+
+ if (1000000 - temp_upper >= agent_state->evict_effort)
+ return false;
+ }
+
+ dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
+ OpContextUPtr ctx = simple_opc_create(obc);
+
+ auto null_op_req = OpRequestRef();
+ if (!ctx->lock_manager.get_lock_type(
+ RWState::RWWRITE,
+ obc->obs.oi.soid,
+ obc,
+ null_op_req)) {
+ close_op_ctx(ctx.release());
+ dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
+ return false;
+ }
+
+ osd->agent_start_evict_op();
+ ctx->register_on_finish(
+ [this]() {
+ osd->agent_finish_evict_op();
+ });
+
+ ctx->at_version = get_next_version();
+ ceph_assert(ctx->new_obs.exists);
+ int r = _delete_oid(ctx.get(), true, false);
+ if (obc->obs.oi.is_omap())
+ ctx->delta_stats.num_objects_omap--;
+ ctx->delta_stats.num_evict++;
+ ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
+ if (obc->obs.oi.is_dirty())
+ --ctx->delta_stats.num_objects_dirty;
+ ceph_assert(r == 0);
+ finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
+ simple_opc_submit(std::move(ctx));
+ osd->logger->inc(l_osd_tier_evict);
+ osd->logger->inc(l_osd_agent_evict);
+ return true;
+}
+
+void PrimaryLogPG::agent_stop()
+{
+ dout(20) << __func__ << dendl;
+ if (agent_state && !agent_state->is_idle()) {
+ agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
+ agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
+ osd->agent_disable_pg(this, agent_state->evict_effort);
+ }
+}
+
+void PrimaryLogPG::agent_delay()
+{
+ dout(20) << __func__ << dendl;
+ if (agent_state && !agent_state->is_idle()) {
+ ceph_assert(agent_state->delaying == false);
+ agent_state->delaying = true;
+ osd->agent_disable_pg(this, agent_state->evict_effort);
+ }
+}
+
+void PrimaryLogPG::agent_choose_mode_restart()
+{
+ dout(20) << __func__ << dendl;
+ std::scoped_lock locker{*this};
+ if (agent_state && agent_state->delaying) {
+ agent_state->delaying = false;
+ agent_choose_mode(true);
+ }
+}
+
+bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
+{
+ bool requeued = false;
+ // Let delay play out
+ if (agent_state->delaying) {
+ dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
+ return requeued;
+ }
+
+ TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
+ TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
+ unsigned evict_effort = 0;
+
+ if (info.stats.stats_invalid) {
+ // idle; stats can't be trusted until we scrub.
+ dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
+ goto skip_calc;
+ }
+
+ {
+ uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
+ ceph_assert(divisor > 0);
+
+ // adjust (effective) user objects down based on the number
+ // of HitSet objects, which should not count toward our total since
+ // they cannot be flushed.
+ uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
+
+ // also exclude omap objects if ec backing pool
+ const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
+ ceph_assert(base_pool);
+ if (!base_pool->supports_omap())
+ unflushable += info.stats.stats.sum.num_objects_omap;
+
+ uint64_t num_user_objects = info.stats.stats.sum.num_objects;
+ if (num_user_objects > unflushable)
+ num_user_objects -= unflushable;
+ else
+ num_user_objects = 0;
+
+ uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
+ uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
+ num_user_bytes -= unflushable_bytes;
+ uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
+ num_user_bytes += num_overhead_bytes;
+
+ // also reduce the num_dirty by num_objects_omap
+ int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
+ if (!base_pool->supports_omap()) {
+ if (num_dirty > info.stats.stats.sum.num_objects_omap)
+ num_dirty -= info.stats.stats.sum.num_objects_omap;
+ else
+ num_dirty = 0;
+ }
+
+ dout(10) << __func__
+ << " flush_mode: "
+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+ << " evict_mode: "
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " num_objects: " << info.stats.stats.sum.num_objects
+ << " num_bytes: " << info.stats.stats.sum.num_bytes
+ << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
+ << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
+ << " num_dirty: " << num_dirty
+ << " num_user_objects: " << num_user_objects
+ << " num_user_bytes: " << num_user_bytes
+ << " num_overhead_bytes: " << num_overhead_bytes
+ << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
+ << " pool.info.target_max_objects: " << pool.info.target_max_objects
+ << dendl;
+
+ // get dirty, full ratios
+ uint64_t dirty_micro = 0;
+ uint64_t full_micro = 0;
+ if (pool.info.target_max_bytes && num_user_objects > 0) {
+ uint64_t avg_size = num_user_bytes / num_user_objects;
+ dirty_micro =
+ num_dirty * avg_size * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
+ full_micro =
+ num_user_objects * avg_size * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
+ }
+ if (pool.info.target_max_objects > 0) {
+ uint64_t dirty_objects_micro =
+ num_dirty * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
+ if (dirty_objects_micro > dirty_micro)
+ dirty_micro = dirty_objects_micro;
+ uint64_t full_objects_micro =
+ num_user_objects * 1000000 /
+ std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
+ if (full_objects_micro > full_micro)
+ full_micro = full_objects_micro;
+ }
+ dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
+ << " full " << ((float)full_micro / 1000000.0)
+ << dendl;
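+ // Example with hypothetical values: with a per-PG byte quota
+ // (target_max_bytes / divisor) of 1 GiB, 1024 user objects totalling
+ // 512 MiB (avg_size 512 KiB) and 256 of them dirty, dirty_micro is
+ // 256 * 512 KiB * 1000000 / 1 GiB = 125000 (0.125) and full_micro is
+ // 1024 * 512 KiB * 1000000 / 1 GiB = 500000 (0.5); the object-count
+ // based ratios above can only raise these values, never lower them.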
+
+ // flush mode
+ uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
+ uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
+ uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
+ if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
+ flush_target += flush_slop;
+ flush_high_target += flush_slop;
+ } else {
+ flush_target -= std::min(flush_target, flush_slop);
+ flush_high_target -= std::min(flush_high_target, flush_slop);
+ }
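+ // Example (assuming osd_agent_slop is 0.02): with a dirty target of
+ // 400000 (0.4), an idle agent only starts flushing once dirty_micro
+ // exceeds 408000, while an already-active agent keeps flushing until
+ // dirty_micro drops below 392000; the slop acts as a small hysteresis
+ // band that avoids flapping between modes around the threshold.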
+
+ if (dirty_micro > flush_high_target) {
+ flush_mode = TierAgentState::FLUSH_MODE_HIGH;
+ } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
+ flush_mode = TierAgentState::FLUSH_MODE_LOW;
+ }
+
+ // evict mode
+ uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
+ uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
+ if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
+ evict_target += evict_slop;
+ else
+ evict_target -= std::min(evict_target, evict_slop);
+
+ if (full_micro > 1000000) {
+ // evict anything clean
+ evict_mode = TierAgentState::EVICT_MODE_FULL;
+ evict_effort = 1000000;
+ } else if (full_micro > evict_target) {
+ // set effort in [0..1] range based on where we are between
+ // the evict target and completely full
+ evict_mode = TierAgentState::EVICT_MODE_SOME;
+ uint64_t over = full_micro - evict_target;
+ uint64_t span = 1000000 - evict_target;
+ evict_effort = std::max(over * 1000000 / span,
+ uint64_t(1000000.0 *
+ cct->_conf->osd_agent_min_evict_effort));
+
+ // quantize effort to avoid too much reordering in the agent_queue.
+ uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
+ ceph_assert(inc > 0);
+ uint64_t was = evict_effort;
+ evict_effort -= evict_effort % inc;
+ if (evict_effort < inc)
+ evict_effort = inc;
+ ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
+ dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
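+ // Example (assuming osd_agent_quantize_effort is 0.1, i.e. inc ==
+ // 100000): an effort of 437000 rounds down to 400000, so small load
+ // changes do not keep reshuffling the agent_queue.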
+ }
+ }
+
+ skip_calc:
+ bool old_idle = agent_state->is_idle();
+ if (flush_mode != agent_state->flush_mode) {
+ dout(5) << __func__ << " flush_mode "
+ << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+ << " -> "
+ << TierAgentState::get_flush_mode_name(flush_mode)
+ << dendl;
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+ osd->agent_inc_high_count();
+ stats.stats.sum.num_flush_mode_high = 1;
+ } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+ stats.stats.sum.num_flush_mode_low = 1;
+ }
+ if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
+ osd->agent_dec_high_count();
+ stats.stats.sum.num_flush_mode_high = 0;
+ } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
+ stats.stats.sum.num_flush_mode_low = 0;
+ }
+ return false;
+ });
+ agent_state->flush_mode = flush_mode;
+ }
+ if (evict_mode != agent_state->evict_mode) {
+ dout(5) << __func__ << " evict_mode "
+ << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ << " -> "
+ << TierAgentState::get_evict_mode_name(evict_mode)
+ << dendl;
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
+ is_active()) {
+ if (op)
+ requeue_op(op);
+ requeue_ops(waiting_for_flush);
+ requeue_ops(waiting_for_active);
+ requeue_ops(waiting_for_readable);
+ requeue_ops(waiting_for_scrub);
+ requeue_ops(waiting_for_cache_not_full);
+ objects_blocked_on_cache_full.clear();
+ requeued = true;
+ }
+ recovery_state.update_stats(
+ [=](auto &history, auto &stats) {
+ if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
+ stats.stats.sum.num_evict_mode_some = 1;
+ } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ stats.stats.sum.num_evict_mode_full = 1;
+ }
+ if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
+ stats.stats.sum.num_evict_mode_some = 0;
+ } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+ stats.stats.sum.num_evict_mode_full = 0;
+ }
+ return false;
+ });
+ agent_state->evict_mode = evict_mode;
+ }
+ uint64_t old_effort = agent_state->evict_effort;
+ if (evict_effort != agent_state->evict_effort) {
+ dout(5) << __func__ << " evict_effort "
+ << ((float)agent_state->evict_effort / 1000000.0)
+ << " -> "
+ << ((float)evict_effort / 1000000.0)
+ << dendl;
+ agent_state->evict_effort = evict_effort;
+ }
+
+ // NOTE: we are using evict_effort as a proxy for *all* agent effort
+ // (including flush). This is probably fine (they should be
+ // correlated) but it is not precisely correct.
+ if (agent_state->is_idle()) {
+ if (!restart && !old_idle) {
+ osd->agent_disable_pg(this, old_effort);
+ }
+ } else {
+ if (restart || old_idle) {
+ osd->agent_enable_pg(this, agent_state->evict_effort);
+ } else if (old_effort != agent_state->evict_effort) {
+ osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
+ }
+ }
+ return requeued;
+}
+
+void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
+{
+ ceph_assert(hit_set);
+ ceph_assert(temp);
+ *temp = 0;
+ if (hit_set->contains(oid))
+ *temp = 1000000;
+ unsigned i = 0;
+ int last_n = pool.info.hit_set_search_last_n;
+ for (map<time_t,HitSetRef>::reverse_iterator p =
+ agent_state->hit_set_map.rbegin(); last_n > 0 &&
+ p != agent_state->hit_set_map.rend(); ++p, ++i) {
+ if (p->second->contains(oid)) {
+ *temp += pool.info.get_grade(i);
+ --last_n;
+ }
+ }
+}
+
+// Dup op detection
+
+bool PrimaryLogPG::already_complete(eversion_t v)
+{
+ dout(20) << __func__ << ": " << v << dendl;
+ for (xlist<RepGather*>::iterator i = repop_queue.begin();
+ !i.end();
+ ++i) {
+ dout(20) << __func__ << ": " << **i << dendl;
+ // skip copy from temp object ops
+ if ((*i)->v == eversion_t()) {
+ dout(20) << __func__ << ": " << **i
+ << " version is empty" << dendl;
+ continue;
+ }
+ if ((*i)->v > v) {
+ dout(20) << __func__ << ": " << **i
+ << " (*i)->v past v" << dendl;
+ break;
+ }
+ if (!(*i)->all_committed) {
+ dout(20) << __func__ << ": " << **i
+ << " not committed, returning false"
+ << dendl;
+ return false;
+ }
+ }
+ dout(20) << __func__ << ": returning true" << dendl;
+ return true;
+}
+
+
+// ==========================================================================================
+// SCRUB
+
+void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
+{
+ dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
+ op->mark_started();
+
+ if (!m_scrubber->is_scrub_active()) {
+ dout(10) << __func__ << " scrub isn't active" << dendl;
+ return;
+ }
+ m_scrubber->map_from_replica(op);
+}
+
+bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
+ const hobject_t& end)
+{
+ pair<hobject_t, ObjectContextRef> next;
+ next.second = object_contexts.lookup(begin);
+ next.first = begin;
+ bool more = true;
+ while (more && next.first < end) {
+ if (next.second && next.second->is_blocked()) {
+ next.second->requeue_scrub_on_unblock = true;
+ dout(10) << __func__ << ": scrub delayed, "
+ << next.first << " is blocked"
+ << dendl;
+ return false;
+ }
+ more = object_contexts.get_next(next.first, &next);
+ }
+ return true;
+}
+
+
+int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
+{
+ OpRequestRef op = ctx->op;
+ // Only supports replicated pools
+ ceph_assert(!pool.info.is_erasure());
+ ceph_assert(is_primary());
+
+ dout(10) << __func__ << " " << soid
+ << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
+
+ if (!is_clean()) {
+ block_for_clean(soid, op);
+ return -EAGAIN;
+ }
+
+ ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
+ auto& oi = ctx->new_obs.oi;
+ eversion_t v = oi.version;
+
+ if (primary_error(soid, v)) {
+ dout(0) << __func__ << " No other replicas available for " << soid << dendl;
+ // XXX: If we knew that there is no down OSD that could contain this
+ // object, it would be nice if we could return EIO here.
+ // If a "never fail" flag were available, it could be used
+ // for rbd to NOT return EIO until the object is marked lost.
+
+ // Drop through to save this op in case an osd comes up with the object.
+ }
+
+ // Restart the op after object becomes readable again
+ waiting_for_unreadable_object[soid].push_back(op);
+ op->mark_delayed("waiting for missing object");
+
+ ceph_assert(is_clean());
+ state_set(PG_STATE_REPAIR);
+ state_clear(PG_STATE_CLEAN);
+ queue_peering_event(
+ PGPeeringEventRef(
+ std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(),
+ get_osdmap_epoch(),
+ PeeringState::DoRecovery())));
+
+ return -EAGAIN;
+}
+
+/*---SnapTrimmer Logging---*/
+#undef dout_prefix
+#define dout_prefix pg->gen_prefix(*_dout)
+
+void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
+{
+ ldout(pg->cct, 20) << "enter " << state_name << dendl;
+}
+
+void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
+{
+ ldout(pg->cct, 20) << "exit " << state_name << dendl;
+}
+
+bool PrimaryLogPG::SnapTrimmer::permit_trim() {
+ return
+ pg->is_clean() &&
+ !pg->is_scrub_queued_or_active() &&
+ !pg->snap_trimq.empty();
+}
+
+/*---SnapTrimmer states---*/
+#undef dout_prefix
+#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
+ << "SnapTrimmer state<" << get_state_name() << ">: ")
+
+/* NotTrimming */
+PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "NotTrimming")
+{
+ context< SnapTrimmer >().log_enter(state_name);
+}
+
+void PrimaryLogPG::NotTrimming::exit()
+{
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+}
+
+boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
+{
+ PrimaryLogPG *pg = context< SnapTrimmer >().pg;
+ ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
+
+ if (!(pg->is_primary() && pg->is_active())) {
+ ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
+ return discard_event();
+ }
+ if (!pg->is_clean() ||
+ pg->snap_trimq.empty()) {
+ ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
+ return discard_event();
+ }
+ if (pg->is_scrub_queued_or_active()) {
+ ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
+ return transit< WaitScrub >();
+ } else {
+ return transit< Trimming >();
+ }
+}
+
+boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
+{
+ PrimaryLogPG *pg = context< SnapTrimmer >().pg;
+ ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
+
+ pending = nullptr;
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+
+ context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
+  ldout(pg->cct, 10) << "WaitReservation: trimming "
+ << pg->snap_trimq.range_start()
+ << dendl;
+ return transit< AwaitAsyncWork >();
+}
+
+/* AwaitAsyncWork */
+PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/AwaitAsyncWork")
+{
+ auto *pg = context< SnapTrimmer >().pg;
+ context< SnapTrimmer >().log_enter(state_name);
+ context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
+ pg->state_set(PG_STATE_SNAPTRIM);
+ pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
+ pg->publish_stats_to_osd();
+}
+
+boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
+{
+ PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
+ snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
+ auto &in_flight = context<Trimming>().in_flight;
+ ceph_assert(in_flight.empty());
+
+ ceph_assert(pg->is_primary() && pg->is_active());
+ if (!context< SnapTrimmer >().can_trim()) {
+ ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+
+ ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
+
+ vector<hobject_t> to_trim;
+ unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
+ // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
+ // the ENOENT below and erase snap_to_trim.
+ ceph_assert(max > 0);
+ to_trim.reserve(max);
+ int r = pg->snap_mapper.get_next_objects_to_trim(
+ snap_to_trim,
+ max,
+ &to_trim);
+ if (r != 0 && r != -ENOENT) {
+ lderr(pg->cct) << "get_next_objects_to_trim returned "
+ << cpp_strerror(r) << dendl;
+ ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
+ } else if (r == -ENOENT) {
+ // Done!
+ ldout(pg->cct, 10) << "got ENOENT" << dendl;
+
+ pg->snap_trimq.erase(snap_to_trim);
+
+ if (pg->snap_trimq_repeat.count(snap_to_trim)) {
+ ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
+ pg->snap_trimq_repeat.erase(snap_to_trim);
+ } else {
+ ldout(pg->cct, 10) << "adding snap " << snap_to_trim
+ << " to purged_snaps"
+ << dendl;
+ ObjectStore::Transaction t;
+ pg->recovery_state.adjust_purged_snaps(
+ [snap_to_trim](auto &purged_snaps) {
+ purged_snaps.insert(snap_to_trim);
+ });
+ pg->write_if_dirty(t);
+
+ ldout(pg->cct, 10) << "purged_snaps now "
+ << pg->info.purged_snaps << ", snap_trimq now "
+ << pg->snap_trimq << dendl;
+
+ int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
+ ceph_assert(tr == 0);
+
+ pg->recovery_state.share_pg_info();
+ }
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+ ceph_assert(!to_trim.empty());
+
+ for (auto &&object: to_trim) {
+ // Get next
+ ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
+ OpContextUPtr ctx;
+ int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
+ if (error) {
+ if (error == -ENOLCK) {
+ ldout(pg->cct, 10) << "could not get write lock on obj "
+ << object << dendl;
+ } else {
+ pg->state_set(PG_STATE_SNAPTRIM_ERROR);
+ ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
+ }
+ if (!in_flight.empty()) {
+ ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
+ return transit< WaitRepops >();
+ }
+ if (error == -ENOLCK) {
+ ldout(pg->cct, 10) << "waiting for it to clear"
+ << dendl;
+ return transit< WaitRWLock >();
+ } else {
+ return transit< NotTrimming >();
+ }
+ }
+
+ in_flight.insert(object);
+ ctx->register_on_success(
+ [pg, object, &in_flight]() {
+ ceph_assert(in_flight.find(object) != in_flight.end());
+ in_flight.erase(object);
+ if (in_flight.empty()) {
+ if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
+ pg->snap_trimmer_machine.process_event(Reset());
+ } else {
+ pg->snap_trimmer_machine.process_event(RepopsComplete());
+ }
+ }
+ });
+
+ pg->simple_opc_submit(std::move(ctx));
+ }
+
+ return transit< WaitRepops >();
+}
+
+void PrimaryLogPG::setattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const string &key,
+ bufferlist &val)
+{
+ t->setattr(obc->obs.oi.soid, key, val);
+}
+
+void PrimaryLogPG::setattrs_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ map<string, bufferlist> &attrs)
+{
+ t->setattrs(obc->obs.oi.soid, attrs);
+}
+
+void PrimaryLogPG::rmattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const string &key)
+{
+ t->rmattr(obc->obs.oi.soid, key);
+}
+
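+// For erasure-coded pools, attrs are served from the obc's in-memory
+// attr_cache; replicated pools read them from the backend.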
+int PrimaryLogPG::getattr_maybe_cache(
+ ObjectContextRef obc,
+ const string &key,
+ bufferlist *val)
+{
+ if (pool.info.is_erasure()) {
+ map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
+ if (i != obc->attr_cache.end()) {
+ if (val)
+ *val = i->second;
+ return 0;
+ } else {
+ return -ENODATA;
+ }
+ }
+ return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
+}
+
+int PrimaryLogPG::getattrs_maybe_cache(
+ ObjectContextRef obc,
+ map<string, bufferlist> *out)
+{
+ int r = 0;
+ ceph_assert(out);
+ if (pool.info.is_erasure()) {
+ *out = obc->attr_cache;
+ } else {
+ r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
+ }
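+  // keep only the '_'-prefixed attrs (user xattrs) and strip that prefix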
+ map<string, bufferlist> tmp;
+ for (map<string, bufferlist>::iterator i = out->begin();
+ i != out->end();
+ ++i) {
+ if (i->first.size() > 1 && i->first[0] == '_')
+ tmp[i->first.substr(1, i->first.size())] = std::move(i->second);
+ }
+ tmp.swap(*out);
+ return r;
+}
+
+bool PrimaryLogPG::check_failsafe_full() {
+ return osd->check_failsafe_full(get_dpp());
+}
+
+bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
+{
+ return m_scrubber->write_blocked_by_scrub(oid);
+}
+
+void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
+void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
+
+#ifdef PG_DEBUG_REFS
+uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
+void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
+void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
new file mode 100644
index 000000000..68cdec24e
--- /dev/null
+++ b/src/osd/PrimaryLogPG.h
@@ -0,0 +1,1969 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_REPLICATEDPG_H
+#define CEPH_REPLICATEDPG_H
+
+#include <boost/tuple/tuple.hpp>
+#include "include/ceph_assert.h"
+#include "DynamicPerfStats.h"
+#include "OSD.h"
+#include "PG.h"
+#include "Watch.h"
+#include "TierAgentState.h"
+#include "messages/MOSDOpReply.h"
+#include "common/Checksummer.h"
+#include "common/sharedptr_registry.hpp"
+#include "common/shared_cache.hpp"
+#include "ReplicatedBackend.h"
+#include "PGTransaction.h"
+#include "cls/cas/cls_cas_ops.h"
+
+class CopyFromCallback;
+class PromoteCallback;
+struct RefCountCallback;
+
+class PrimaryLogPG;
+class PGLSFilter;
+class HitSet;
+struct TierAgentState;
+class OSDService;
+
+void intrusive_ptr_add_ref(PrimaryLogPG *pg);
+void intrusive_ptr_release(PrimaryLogPG *pg);
+uint64_t get_with_id(PrimaryLogPG *pg);
+void put_with_id(PrimaryLogPG *pg, uint64_t id);
+
+#ifdef PG_DEBUG_REFS
+ typedef TrackedIntPtr<PrimaryLogPG> PrimaryLogPGRef;
+#else
+ typedef boost::intrusive_ptr<PrimaryLogPG> PrimaryLogPGRef;
+#endif
+
+struct inconsistent_snapset_wrapper;
+
+class PrimaryLogPG : public PG, public PGBackend::Listener {
+ friend class OSD;
+ friend class Watch;
+ friend class PrimaryLogScrub;
+
+public:
+ MEMPOOL_CLASS_HELPERS();
+
+ /*
+ * state associated with a copy operation
+ */
+ struct OpContext;
+ class CopyCallback;
+
+ /**
+ * CopyResults stores the object metadata of interest to a copy initiator.
+ */
+ struct CopyResults {
+ ceph::real_time mtime; ///< the copy source's mtime
+ uint64_t object_size; ///< the copied object's size
+ bool started_temp_obj; ///< true if the callback needs to delete temp object
+ hobject_t temp_oid; ///< temp object (if any)
+
+ /**
+ * Function to fill in transaction; if non-empty the callback
+ * must execute it before any other accesses to the object
+ * (in order to complete the copy).
+ */
+ std::function<void(PGTransaction *)> fill_in_final_tx;
+
+ version_t user_version; ///< The copy source's user version
+ bool should_requeue; ///< op should be requeued on cancel
+ std::vector<snapid_t> snaps; ///< src's snaps (if clone)
+ snapid_t snap_seq; ///< src's snap_seq (if head)
+ librados::snap_set_t snapset; ///< src snapset (if head)
+ bool mirror_snapset;
+ bool has_omap;
+ uint32_t flags; // object_copy_data_t::FLAG_*
+ uint32_t source_data_digest, source_omap_digest;
+ uint32_t data_digest, omap_digest;
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
+    mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; // map reqids by index to error code
+ std::map<std::string, ceph::buffer::list> attrs; // xattrs
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
+ bool is_data_digest() {
+ return flags & object_copy_data_t::FLAG_DATA_DIGEST;
+ }
+ bool is_omap_digest() {
+ return flags & object_copy_data_t::FLAG_OMAP_DIGEST;
+ }
+ CopyResults()
+ : object_size(0), started_temp_obj(false),
+ user_version(0),
+ should_requeue(false), mirror_snapset(false),
+ has_omap(false),
+ flags(0),
+ source_data_digest(-1), source_omap_digest(-1),
+ data_digest(-1), omap_digest(-1),
+ truncate_seq(0), truncate_size(0)
+ {}
+ };
+
+ struct CopyOp;
+ typedef std::shared_ptr<CopyOp> CopyOpRef;
+
+ struct CopyOp {
+ CopyCallback *cb;
+ ObjectContextRef obc;
+ hobject_t src;
+ object_locator_t oloc;
+ unsigned flags;
+ bool mirror_snapset;
+
+ CopyResults results;
+
+ ceph_tid_t objecter_tid;
+ ceph_tid_t objecter_tid2;
+
+ object_copy_cursor_t cursor;
+ std::map<std::string,ceph::buffer::list> attrs;
+ ceph::buffer::list data;
+ ceph::buffer::list omap_header;
+ ceph::buffer::list omap_data;
+ int rval;
+
+ object_copy_cursor_t temp_cursor;
+
+    /*
+     * For CopyOp the process is:
+     * step 1: read the data (attrs/omap/data) from the source object
+     * step 2: write that data into the new (destination) object
+     * src_obj_fadvise_flags is used in step 1;
+     * dest_obj_fadvise_flags is used in step 2
+     */
+ unsigned src_obj_fadvise_flags;
+ unsigned dest_obj_fadvise_flags;
+
+ std::map<uint64_t, CopyOpRef> chunk_cops;
+ int num_chunk;
+ bool failed;
+ uint64_t start_offset = 0;
+ uint64_t last_offset = 0;
+ std::vector<OSDOp> chunk_ops;
+
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s,
+ object_locator_t l,
+ version_t v,
+ unsigned f,
+ bool ms,
+ unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags)
+ : cb(cb_), obc(_obc), src(s), oloc(l), flags(f),
+ mirror_snapset(ms),
+ objecter_tid(0),
+ objecter_tid2(0),
+ rval(-1),
+ src_obj_fadvise_flags(src_obj_fadvise_flags),
+ dest_obj_fadvise_flags(dest_obj_fadvise_flags),
+ num_chunk(0),
+ failed(false)
+ {
+ results.user_version = v;
+ results.mirror_snapset = mirror_snapset;
+ }
+ };
+
+ /**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation.
+ */
+ typedef boost::tuple<int, CopyResults*> CopyCallbackResults;
+
+ friend class CopyFromCallback;
+ friend struct CopyFromFinisher;
+ friend class PromoteCallback;
+ friend struct PromoteFinisher;
+
+ struct ProxyReadOp {
+ OpRequestRef op;
+ hobject_t soid;
+ ceph_tid_t objecter_tid;
+ std::vector<OSDOp> &ops;
+ version_t user_version;
+ int data_offset;
+ bool canceled; ///< true if canceled
+
+ ProxyReadOp(OpRequestRef _op, hobject_t oid, std::vector<OSDOp>& _ops)
+ : op(_op), soid(oid),
+ objecter_tid(0), ops(_ops),
+ user_version(0), data_offset(0),
+ canceled(false) { }
+ };
+ typedef std::shared_ptr<ProxyReadOp> ProxyReadOpRef;
+
+ struct ProxyWriteOp {
+ OpContext *ctx;
+ OpRequestRef op;
+ hobject_t soid;
+ ceph_tid_t objecter_tid;
+ std::vector<OSDOp> &ops;
+ version_t user_version;
+ bool sent_reply;
+ utime_t mtime;
+ bool canceled;
+ osd_reqid_t reqid;
+
+ ProxyWriteOp(OpRequestRef _op, hobject_t oid, std::vector<OSDOp>& _ops, osd_reqid_t _reqid)
+ : ctx(NULL), op(_op), soid(oid),
+ objecter_tid(0), ops(_ops),
+ user_version(0), sent_reply(false),
+ canceled(false),
+ reqid(_reqid) { }
+ };
+ typedef std::shared_ptr<ProxyWriteOp> ProxyWriteOpRef;
+
+ struct FlushOp {
+ ObjectContextRef obc; ///< obc we are flushing
+ OpRequestRef op; ///< initiating op
+ std::list<OpRequestRef> dup_ops; ///< bandwagon jumpers
+ version_t flushed_version; ///< user version we are flushing
+ ceph_tid_t objecter_tid; ///< copy-from request tid
+ int rval; ///< copy-from result
+ bool blocking; ///< whether we are blocking updates
+ bool removal; ///< we are removing the backend object
+ std::optional<std::function<void()>> on_flush; ///< callback, may be null
+ // for chunked object
+ std::map<uint64_t, int> io_results;
+ std::map<uint64_t, ceph_tid_t> io_tids;
+ uint64_t chunks;
+
+ FlushOp()
+ : flushed_version(0), objecter_tid(0), rval(0),
+ blocking(false), removal(false), chunks(0) {}
+ ~FlushOp() { ceph_assert(!on_flush); }
+ };
+ typedef std::shared_ptr<FlushOp> FlushOpRef;
+
+ friend struct RefCountCallback;
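+  /// tracks an in-flight manifest operation (e.g. chunk refcount updates)
+  /// and its objecter tids/results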
+ struct ManifestOp {
+ RefCountCallback *cb;
+ ceph_tid_t objecter_tid;
+ OpRequestRef op;
+ std::map<uint64_t, int> results;
+ std::map<uint64_t, ceph_tid_t> tids;
+ std::map<hobject_t, pair<uint64_t, uint64_t>> chunks;
+ uint64_t num_chunks = 0;
+ object_manifest_t new_manifest;
+
+
+ ManifestOp(RefCountCallback* cb)
+ : cb(cb), objecter_tid(0) {}
+ };
+ typedef std::shared_ptr<ManifestOp> ManifestOpRef;
+ std::map<hobject_t, ManifestOpRef> manifest_ops;
+
+ boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() override {
+ return pgbackend.get();
+ }
+
+ const PGBackend *get_pgbackend() const override {
+ return pgbackend.get();
+ }
+
+ /// Listener methods
+ DoutPrefixProvider *get_dpp() override {
+ return this;
+ }
+
+ void on_local_recover(
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectContextRef obc,
+ bool is_delete,
+ ObjectStore::Transaction *t
+ ) override;
+ void on_peer_recover(
+ pg_shard_t peer,
+ const hobject_t &oid,
+ const ObjectRecoveryInfo &recovery_info
+ ) override {
+ recovery_state.on_peer_recover(peer, oid, recovery_info.version);
+ }
+ void begin_peer_recover(
+ pg_shard_t peer,
+ const hobject_t oid) override {
+ recovery_state.begin_peer_recover(peer, oid);
+ }
+ void on_global_recover(
+ const hobject_t &oid,
+ const object_stat_sum_t &stat_diff,
+ bool is_delete) override;
+ void on_failed_pull(
+ const std::set<pg_shard_t> &from,
+ const hobject_t &soid,
+ const eversion_t &version) override;
+ void cancel_pull(const hobject_t &soid) override;
+ void apply_stats(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats) override;
+
+ bool primary_error(const hobject_t& soid, eversion_t v);
+
+ void remove_missing_object(const hobject_t &oid,
+ eversion_t v,
+ Context *on_complete) override;
+
+ template<class T> class BlessedGenContext;
+ template<class T> class UnlockedBlessedGenContext;
+ class BlessedContext;
+ Context *bless_context(Context *c) override;
+
+ GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+ GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+
+ void send_message(int to_osd, Message *m) override {
+ osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch());
+ }
+ void queue_transaction(ObjectStore::Transaction&& t,
+ OpRequestRef op) override {
+ osd->store->queue_transaction(ch, std::move(t), op);
+ }
+ void queue_transactions(std::vector<ObjectStore::Transaction>& tls,
+ OpRequestRef op) override {
+ osd->store->queue_transactions(ch, tls, op, NULL);
+ }
+ epoch_t get_interval_start_epoch() const override {
+ return info.history.same_interval_since;
+ }
+ epoch_t get_last_peering_reset_epoch() const override {
+ return get_last_peering_reset();
+ }
+ const std::set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
+ return get_acting_recovery_backfill();
+ }
+ const std::set<pg_shard_t> &get_acting_shards() const override {
+ return recovery_state.get_actingset();
+ }
+ const std::set<pg_shard_t> &get_backfill_shards() const override {
+ return get_backfill_targets();
+ }
+
+ std::ostream& gen_dbg_prefix(std::ostream& out) const override {
+ return gen_prefix(out);
+ }
+
+ const HobjToShardSetMapping& get_missing_loc_shards() const override
+ {
+ return recovery_state.get_missing_loc().get_missing_locs();
+ }
+ const std::map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
+ return recovery_state.get_peer_missing();
+ }
+ using PGBackend::Listener::get_shard_missing;
+ const std::map<pg_shard_t, pg_info_t> &get_shard_info() const override {
+ return recovery_state.get_peer_info();
+ }
+ using PGBackend::Listener::get_shard_info;
+ const pg_missing_tracker_t &get_local_missing() const override {
+ return recovery_state.get_pg_log().get_missing();
+ }
+ const PGLog &get_log() const override {
+ return recovery_state.get_pg_log();
+ }
+ void add_local_next_event(const pg_log_entry_t& e) override {
+ recovery_state.add_local_next_event(e);
+ }
+ bool pgb_is_primary() const override {
+ return is_primary();
+ }
+ const OSDMapRef& pgb_get_osdmap() const override final {
+ return get_osdmap();
+ }
+ epoch_t pgb_get_osdmap_epoch() const override final {
+ return get_osdmap_epoch();
+ }
+ const pg_info_t &get_info() const override {
+ return info;
+ }
+ const pg_pool_t &get_pool() const override {
+ return pool.info;
+ }
+
+ ObjectContextRef get_obc(
+ const hobject_t &hoid,
+ const std::map<std::string, ceph::buffer::list> &attrs) override {
+ return get_object_context(hoid, true, &attrs);
+ }
+
+ bool try_lock_for_read(
+ const hobject_t &hoid,
+ ObcLockManager &manager) override {
+ if (is_missing_object(hoid))
+ return false;
+ auto obc = get_object_context(hoid, false, nullptr);
+ if (!obc)
+ return false;
+ return manager.try_get_read_lock(hoid, obc);
+ }
+
+ void release_locks(ObcLockManager &manager) override {
+ release_object_locks(manager);
+ }
+
+ bool pg_is_repair() override {
+ return is_repair();
+ }
+ void inc_osd_stat_repaired() override {
+ osd->inc_osd_stat_repaired();
+ }
+ bool pg_is_remote_backfilling() override {
+ return is_remote_backfilling();
+ }
+ void pg_add_local_num_bytes(int64_t num_bytes) override {
+ add_local_num_bytes(num_bytes);
+ }
+ void pg_sub_local_num_bytes(int64_t num_bytes) override {
+ sub_local_num_bytes(num_bytes);
+ }
+ void pg_add_num_bytes(int64_t num_bytes) override {
+ add_num_bytes(num_bytes);
+ }
+ void pg_sub_num_bytes(int64_t num_bytes) override {
+ sub_num_bytes(num_bytes);
+ }
+
+ void pgb_set_object_snap_mapping(
+ const hobject_t &soid,
+ const std::set<snapid_t> &snaps,
+ ObjectStore::Transaction *t) override {
+ return update_object_snap_mapping(t, soid, snaps);
+ }
+ void pgb_clear_object_snap_mapping(
+ const hobject_t &soid,
+ ObjectStore::Transaction *t) override {
+ return clear_object_snap_mapping(t, soid);
+ }
+
+ void log_operation(
+ std::vector<pg_log_entry_t>&& logv,
+ const std::optional<pg_hit_set_history_t> &hset_history,
+ const eversion_t &trim_to,
+ const eversion_t &roll_forward_to,
+ const eversion_t &min_last_complete_ondisk,
+ bool transaction_applied,
+ ObjectStore::Transaction &t,
+ bool async = false) override {
+ if (is_primary()) {
+ ceph_assert(trim_to <= recovery_state.get_last_update_ondisk());
+ }
+ if (hset_history) {
+ recovery_state.update_hset(*hset_history);
+ }
+ if (transaction_applied) {
+ update_snap_map(logv, t);
+ }
+ auto last = logv.rbegin();
+ if (is_primary() && last != logv.rend()) {
+ projected_log.skip_can_rollback_to_to_head();
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ }
+ if (!is_primary() && !is_ec_pg()) {
+ replica_clear_repop_obc(logv, t);
+ }
+ recovery_state.append_log(
+ std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk,
+ t, transaction_applied, async);
+ }
+
+ void replica_clear_repop_obc(
+ const std::vector<pg_log_entry_t> &logv,
+ ObjectStore::Transaction &t);
+
+ void op_applied(const eversion_t &applied_version) override;
+
+ bool should_send_op(
+ pg_shard_t peer,
+ const hobject_t &hoid) override;
+
+ bool pg_is_undersized() const override {
+ return is_undersized();
+ }
+
+ bool pg_is_repair() const override {
+ return is_repair();
+ }
+
+ void update_peer_last_complete_ondisk(
+ pg_shard_t fromosd,
+ eversion_t lcod) override {
+ recovery_state.update_peer_last_complete_ondisk(fromosd, lcod);
+ }
+
+ void update_last_complete_ondisk(
+ eversion_t lcod) override {
+ recovery_state.update_last_complete_ondisk(lcod);
+ }
+
+ void update_stats(
+ const pg_stat_t &stat) override {
+ recovery_state.update_stats(
+ [&stat](auto &history, auto &stats) {
+ stats = stat;
+ return false;
+ });
+ }
+
+ void schedule_recovery_work(
+ GenContext<ThreadPool::TPHandle&> *c) override;
+
+ pg_shard_t whoami_shard() const override {
+ return pg_whoami;
+ }
+ spg_t primary_spg_t() const override {
+ return spg_t(info.pgid.pgid, get_primary().shard);
+ }
+ pg_shard_t primary_shard() const override {
+ return get_primary();
+ }
+ uint64_t min_peer_features() const override {
+ return recovery_state.get_min_peer_features();
+ }
+ uint64_t min_upacting_features() const override {
+ return recovery_state.get_min_upacting_features();
+ }
+ void send_message_osd_cluster(
+ int peer, Message *m, epoch_t from_epoch) override {
+ osd->send_message_osd_cluster(peer, m, from_epoch);
+ }
+ void send_message_osd_cluster(
+ std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch) override {
+ osd->send_message_osd_cluster(messages, from_epoch);
+ }
+ void send_message_osd_cluster(
+ MessageRef m, Connection *con) override {
+ osd->send_message_osd_cluster(std::move(m), con);
+ }
+ void send_message_osd_cluster(
+ Message *m, const ConnectionRef& con) override {
+ osd->send_message_osd_cluster(m, con);
+ }
+ ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override;
+ entity_name_t get_cluster_msgr_name() override {
+ return osd->get_cluster_msgr_name();
+ }
+
+ PerfCounters *get_logger() override;
+
+ ceph_tid_t get_tid() override { return osd->get_tid(); }
+
+ OstreamTemp clog_error() override { return osd->clog->error(); }
+ OstreamTemp clog_warn() override { return osd->clog->warn(); }
+
+ /**
+ * a scrub-map arrived from a replica
+ */
+ void do_replica_scrub_map(OpRequestRef op);
+
+ struct watch_disconnect_t {
+ uint64_t cookie;
+ entity_name_t name;
+ bool send_disconnect;
+ watch_disconnect_t(uint64_t c, entity_name_t n, bool sd)
+ : cookie(c), name(n), send_disconnect(sd) {}
+ };
+ void complete_disconnect_watches(
+ ObjectContextRef obc,
+ const std::list<watch_disconnect_t> &to_disconnect);
+
+ struct OpFinisher {
+ virtual ~OpFinisher() {
+ }
+
+ virtual int execute() = 0;
+ };
+
+ /*
+ * Capture all object state associated with an in-progress read or write.
+ */
+ struct OpContext {
+ OpRequestRef op;
+ osd_reqid_t reqid;
+ std::vector<OSDOp> *ops;
+
+ const ObjectState *obs; // Old objectstate
+ const SnapSet *snapset; // Old snapset
+
+ ObjectState new_obs; // resulting ObjectState
+ SnapSet new_snapset; // resulting SnapSet (in case of a write)
+ //pg_stat_t new_stats; // resulting Stats
+ object_stat_sum_t delta_stats;
+
+ bool modify; // (force) modification (even if op_t is empty)
+ bool user_modify; // user-visible modification
+ bool undirty; // user explicitly un-dirtying this object
+ bool cache_operation; ///< true if this is a cache eviction
+    bool ignore_cache;     ///< true if IGNORE_CACHE flag is set
+ bool ignore_log_op_stats; // don't log op stats
+ bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection
+ ObjectCleanRegions clean_regions;
+
+ // side effects
+ std::list<std::pair<watch_info_t,bool> > watch_connects; ///< new watch + will_ping flag
+ std::list<watch_disconnect_t> watch_disconnects; ///< old watch + send_discon
+ std::list<notify_info_t> notifies;
+ struct NotifyAck {
+ std::optional<uint64_t> watch_cookie;
+ uint64_t notify_id;
+ ceph::buffer::list reply_bl;
+ explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {}
+ NotifyAck(uint64_t notify_id, uint64_t cookie, ceph::buffer::list& rbl)
+ : watch_cookie(cookie), notify_id(notify_id) {
+ reply_bl = std::move(rbl);
+ }
+ };
+ std::list<NotifyAck> notify_acks;
+
+ uint64_t bytes_written, bytes_read;
+
+ utime_t mtime;
+ SnapContext snapc; // writer snap context
+ eversion_t at_version; // pg's current version pointer
+ version_t user_at_version; // pg's current user version pointer
+
+ /// index of the current subop - only valid inside of do_osd_ops()
+ int current_osd_subop_num;
+ /// total number of subops processed in this context for cls_cxx_subop_version()
+ int processed_subop_count = 0;
+
+ PGTransactionUPtr op_t;
+ std::vector<pg_log_entry_t> log;
+ std::optional<pg_hit_set_history_t> updated_hset_history;
+
+ interval_set<uint64_t> modified_ranges;
+ ObjectContextRef obc;
+ ObjectContextRef clone_obc; // if we created a clone
+ ObjectContextRef head_obc; // if we also update snapset (see trim_object)
+
+ // FIXME: we may want to kill this msgr hint off at some point!
+ std::optional<int> data_off = std::nullopt;
+
+ MOSDOpReply *reply;
+
+ PrimaryLogPG *pg;
+
+ int num_read; ///< count read ops
+ int num_write; ///< count update ops
+
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
+ mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
+
+ hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
+
+ std::list<std::function<void()>> on_applied;
+ std::list<std::function<void()>> on_committed;
+ std::list<std::function<void()>> on_finish;
+ std::list<std::function<void()>> on_success;
+ template <typename F>
+ void register_on_finish(F &&f) {
+ on_finish.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_success(F &&f) {
+ on_success.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_applied(F &&f) {
+ on_applied.emplace_back(std::forward<F>(f));
+ }
+ template <typename F>
+ void register_on_commit(F &&f) {
+ on_committed.emplace_back(std::forward<F>(f));
+ }
+
+ bool sent_reply = false;
+
+ // pending async reads <off, len, op_flags> -> <outbl, outr>
+ std::list<std::pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ std::pair<ceph::buffer::list*, Context*> > > pending_async_reads;
+ int inflightreads;
+ friend struct OnReadComplete;
+ void start_async_reads(PrimaryLogPG *pg);
+ void finish_read(PrimaryLogPG *pg);
+ bool async_reads_complete() {
+ return inflightreads == 0;
+ }
+
+ RWState::State lock_type;
+ ObcLockManager lock_manager;
+
+ std::map<int, std::unique_ptr<OpFinisher>> op_finishers;
+
+ OpContext(const OpContext& other);
+ const OpContext& operator=(const OpContext& other);
+
+ OpContext(OpRequestRef _op, osd_reqid_t _reqid, std::vector<OSDOp>* _ops,
+ ObjectContextRef& obc,
+ PrimaryLogPG *_pg) :
+ op(_op), reqid(_reqid), ops(_ops),
+ obs(&obc->obs),
+ snapset(0),
+ new_obs(obs->oi, obs->exists),
+ modify(false), user_modify(false), undirty(false), cache_operation(false),
+ ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
+ bytes_written(0), bytes_read(0), user_at_version(0),
+ current_osd_subop_num(0),
+ obc(obc),
+ reply(NULL), pg(_pg),
+ num_read(0),
+ num_write(0),
+ sent_reply(false),
+ inflightreads(0),
+ lock_type(RWState::RWNONE) {
+ if (obc->ssc) {
+ new_snapset = obc->ssc->snapset;
+ snapset = &obc->ssc->snapset;
+ }
+ }
+ OpContext(OpRequestRef _op, osd_reqid_t _reqid,
+ std::vector<OSDOp>* _ops, PrimaryLogPG *_pg) :
+ op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0),
+ modify(false), user_modify(false), undirty(false), cache_operation(false),
+ ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
+ bytes_written(0), bytes_read(0), user_at_version(0),
+ current_osd_subop_num(0),
+ reply(NULL), pg(_pg),
+ num_read(0),
+ num_write(0),
+ inflightreads(0),
+ lock_type(RWState::RWNONE) {}
+ void reset_obs(ObjectContextRef obc) {
+ new_obs = ObjectState(obc->obs.oi, obc->obs.exists);
+ if (obc->ssc) {
+ new_snapset = obc->ssc->snapset;
+ snapset = &obc->ssc->snapset;
+ }
+ }
+ ~OpContext() {
+ ceph_assert(!op_t);
+ if (reply)
+ reply->put();
+ for (std::list<std::pair<boost::tuple<uint64_t, uint64_t, unsigned>,
+ std::pair<ceph::buffer::list*, Context*> > >::iterator i =
+ pending_async_reads.begin();
+ i != pending_async_reads.end();
+ pending_async_reads.erase(i++)) {
+ delete i->second.second;
+ }
+ }
+ uint64_t get_features() {
+ if (op && op->get_req()) {
+ return op->get_req()->get_connection()->get_features();
+ }
+ return -1ull;
+ }
+ };
+ using OpContextUPtr = std::unique_ptr<OpContext>;
+ friend struct OpContext;
+
+ /*
+ * State on the PG primary associated with the replicated mutation
+ */
+ class RepGather {
+ public:
+ hobject_t hoid;
+ OpRequestRef op;
+ xlist<RepGather*>::item queue_item;
+ int nref;
+
+ eversion_t v;
+ int r = 0;
+
+ ceph_tid_t rep_tid;
+
+ bool rep_aborted;
+ bool all_committed;
+
+ utime_t start;
+
+ eversion_t pg_local_last_complete;
+
+ ObcLockManager lock_manager;
+
+ std::list<std::function<void()>> on_committed;
+ std::list<std::function<void()>> on_success;
+ std::list<std::function<void()>> on_finish;
+
+ RepGather(
+ OpContext *c, ceph_tid_t rt,
+ eversion_t lc) :
+ hoid(c->obc->obs.oi.soid),
+ op(c->op),
+ queue_item(this),
+ nref(1),
+ rep_tid(rt),
+ rep_aborted(false),
+ all_committed(false),
+ pg_local_last_complete(lc),
+ lock_manager(std::move(c->lock_manager)),
+ on_committed(std::move(c->on_committed)),
+ on_success(std::move(c->on_success)),
+ on_finish(std::move(c->on_finish)) {}
+
+ RepGather(
+ ObcLockManager &&manager,
+ OpRequestRef &&o,
+ std::optional<std::function<void(void)> > &&on_complete,
+ ceph_tid_t rt,
+ eversion_t lc,
+ int r) :
+ op(o),
+ queue_item(this),
+ nref(1),
+ r(r),
+ rep_tid(rt),
+ rep_aborted(false),
+ all_committed(false),
+ pg_local_last_complete(lc),
+ lock_manager(std::move(manager)) {
+ if (on_complete) {
+ on_success.push_back(std::move(*on_complete));
+ }
+ }
+
+ RepGather *get() {
+ nref++;
+ return this;
+ }
+ void put() {
+ ceph_assert(nref > 0);
+ if (--nref == 0) {
+ delete this;
+ //generic_dout(0) << "deleting " << this << dendl;
+ }
+ }
+ };
+
+
+protected:
+
+ /**
+ * Grabs locks for OpContext, should be cleaned up in close_op_ctx
+ *
+ * @param ctx [in,out] ctx to get locks for
+ * @return true on success, false if we are queued
+ */
+ bool get_rw_locks(bool write_ordered, OpContext *ctx) {
+    /* If head_obc is set, then !obc->obs.exists, and we will always take the
+     * snapdir lock *before* the head lock. Since all callers will do
+     * this (read or write), if we get the first we will be guaranteed
+     * to get the second.
+     */
+ if (write_ordered && ctx->op->may_read()) {
+ ctx->lock_type = RWState::RWEXCL;
+ } else if (write_ordered) {
+ ctx->lock_type = RWState::RWWRITE;
+ } else {
+ ceph_assert(ctx->op->may_read());
+ ctx->lock_type = RWState::RWREAD;
+ }
+
+ if (ctx->head_obc) {
+ ceph_assert(!ctx->obc->obs.exists);
+ if (!ctx->lock_manager.get_lock_type(
+ ctx->lock_type,
+ ctx->head_obc->obs.oi.soid,
+ ctx->head_obc,
+ ctx->op)) {
+ ctx->lock_type = RWState::RWNONE;
+ return false;
+ }
+ }
+ if (ctx->lock_manager.get_lock_type(
+ ctx->lock_type,
+ ctx->obc->obs.oi.soid,
+ ctx->obc,
+ ctx->op)) {
+ return true;
+ } else {
+ ceph_assert(!ctx->head_obc);
+ ctx->lock_type = RWState::RWNONE;
+ return false;
+ }
+ }
+
+ /**
+ * Cleans up OpContext
+ *
+ * @param ctx [in] ctx to clean up
+ */
+ void close_op_ctx(OpContext *ctx);
+
+ /**
+ * Releases locks
+ *
+ * @param manager [in] manager with locks to release
+ *
+ * (moved to .cc due to scrubber access)
+ */
+ void release_object_locks(ObcLockManager &lock_manager);
+
+ // replica ops
+ // [primary|tail]
+ xlist<RepGather*> repop_queue;
+
+ friend class C_OSD_RepopCommit;
+ void repop_all_committed(RepGather *repop);
+ void eval_repop(RepGather*);
+ void issue_repop(RepGather *repop, OpContext *ctx);
+ RepGather *new_repop(
+ OpContext *ctx,
+ ObjectContextRef obc,
+ ceph_tid_t rep_tid);
+ boost::intrusive_ptr<RepGather> new_repop(
+ eversion_t version,
+ int r,
+ ObcLockManager &&manager,
+ OpRequestRef &&op,
+ std::optional<std::function<void(void)> > &&on_complete);
+ void remove_repop(RepGather *repop);
+
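+  /// create/submit an OpContext for an internally generated write
+  /// (e.g. the snap-trim transactions submitted from AwaitAsyncWork)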
+ OpContextUPtr simple_opc_create(ObjectContextRef obc);
+ void simple_opc_submit(OpContextUPtr ctx);
+
+ /**
+ * Merge entries atomically into all acting_recovery_backfill osds
+ * adjusting missing and recovery state as necessary.
+ *
+ * Also used to store error log entries for dup detection.
+ */
+ void submit_log_entries(
+ const mempool::osd_pglog::list<pg_log_entry_t> &entries,
+ ObcLockManager &&manager,
+ std::optional<std::function<void(void)> > &&on_complete,
+ OpRequestRef op = OpRequestRef(),
+ int r = 0);
+ struct LogUpdateCtx {
+ boost::intrusive_ptr<RepGather> repop;
+ std::set<pg_shard_t> waiting_on;
+ };
+ void cancel_log_updates();
+ std::map<ceph_tid_t, LogUpdateCtx> log_entry_update_waiting_on;
+
+
+ // hot/cold tracking
+ HitSetRef hit_set; ///< currently accumulating HitSet
+ utime_t hit_set_start_stamp; ///< time the current HitSet started recording
+
+
+ void hit_set_clear(); ///< discard any HitSet state
+ void hit_set_setup(); ///< initialize HitSet state
+ void hit_set_create(); ///< create a new HitSet
+ void hit_set_persist(); ///< persist hit info
+ bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
+ void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets
+ void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets
+ void hit_set_remove_all();
+
+ hobject_t get_hit_set_current_object(utime_t stamp);
+ hobject_t get_hit_set_archive_object(utime_t start,
+ utime_t end,
+ bool using_gmt);
+
+ // agent
+ boost::scoped_ptr<TierAgentState> agent_state;
+
+ void agent_setup(); ///< initialize agent state
+ bool agent_work(int max) override ///< entry point to do some agent work
+ {
+ return agent_work(max, max);
+ }
+ bool agent_work(int max, int agent_flush_quota) override;
+ bool agent_maybe_flush(ObjectContextRef& obc); ///< maybe flush
+ bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush); ///< maybe evict
+
+ void agent_load_hit_sets(); ///< load HitSets, if needed
+
+ /// estimate object atime and temperature
+ ///
+ /// @param oid [in] object name
+  /// @param temperature [out] relative temperature (considers both access time and frequency)
+ void agent_estimate_temp(const hobject_t& oid, int *temperature);
+
+ /// stop the agent
+ void agent_stop() override;
+ void agent_delay() override;
+
+ /// clear agent state
+ void agent_clear() override;
+
+ /// choose (new) agent mode(s), returns true if op is requeued
+ bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef());
+ void agent_choose_mode_restart() override;
+
+ /// true if we can send an ondisk/commit for v
+ bool already_complete(eversion_t v);
+
+ // projected object info
+ SharedLRU<hobject_t, ObjectContext> object_contexts;
+ // std::map from oid.snapdir() to SnapSetContext *
+ std::map<hobject_t, SnapSetContext*> snapset_contexts;
+ ceph::mutex snapset_contexts_lock =
+ ceph::make_mutex("PrimaryLogPG::snapset_contexts_lock");
+
+ // debug order that client ops are applied
+ std::map<hobject_t, std::map<client_t, ceph_tid_t>> debug_op_order;
+
+ void populate_obc_watchers(ObjectContextRef obc);
+ void check_blocklisted_obc_watchers(ObjectContextRef obc);
+ void check_blocklisted_watchers() override;
+ void get_watchers(std::list<obj_watch_item_t> *ls) override;
+ void get_obc_watchers(ObjectContextRef obc, std::list<obj_watch_item_t> &pg_watchers);
+public:
+ void handle_watch_timeout(WatchRef watch);
+protected:
+
+ ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
+ ObjectContextRef get_object_context(
+ const hobject_t& soid,
+ bool can_create,
+ const std::map<std::string, ceph::buffer::list> *attrs = 0
+ );
+
+ void context_registry_on_change();
+ void object_context_destructor_callback(ObjectContext *obc);
+ class C_PG_ObjectContext;
+
+ int find_object_context(const hobject_t& oid,
+ ObjectContextRef *pobc,
+ bool can_create,
+ bool map_snapid_to_clone=false,
+ hobject_t *missing_oid=NULL);
+
+ void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);
+
+ void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
+
+ SnapSetContext *get_snapset_context(
+ const hobject_t& oid,
+ bool can_create,
+ const std::map<std::string, ceph::buffer::list> *attrs = 0,
+    bool oid_existed = true // indicates whether this oid already existed in the backend
+ );
+ void register_snapset_context(SnapSetContext *ssc) {
+ std::lock_guard l(snapset_contexts_lock);
+ _register_snapset_context(ssc);
+ }
+ void _register_snapset_context(SnapSetContext *ssc) {
+ ceph_assert(ceph_mutex_is_locked(snapset_contexts_lock));
+ if (!ssc->registered) {
+ ceph_assert(snapset_contexts.count(ssc->oid) == 0);
+ ssc->registered = true;
+ snapset_contexts[ssc->oid] = ssc;
+ }
+ }
+ void put_snapset_context(SnapSetContext *ssc);
+
+ std::map<hobject_t, ObjectContextRef> recovering;
+
+ /*
+ * Backfill
+ *
+ * peer_info[backfill_target].last_backfill == info.last_backfill on the peer.
+ *
+ * objects prior to peer_info[backfill_target].last_backfill
+ * - are on the peer
+ * - are included in the peer stats
+ *
+ * objects \in (last_backfill, last_backfill_started]
+ * - are on the peer or are in backfills_in_flight
+ * - are not included in pg stats (yet)
+ * - have their stats in pending_backfill_updates on the primary
+ */
+ std::set<hobject_t> backfills_in_flight;
+ std::map<hobject_t, pg_stat_t> pending_backfill_updates;
+
+ void dump_recovery_info(ceph::Formatter *f) const override {
+ f->open_array_section("waiting_on_backfill");
+ for (std::set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
+ p != waiting_on_backfill.end(); ++p)
+ f->dump_stream("osd") << *p;
+ f->close_section();
+ f->dump_stream("last_backfill_started") << last_backfill_started;
+ {
+ f->open_object_section("backfill_info");
+ backfill_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_array_section("peer_backfill_info");
+ for (std::map<pg_shard_t, BackfillInterval>::const_iterator pbi =
+ peer_backfill_info.begin();
+ pbi != peer_backfill_info.end(); ++pbi) {
+ f->dump_stream("osd") << pbi->first;
+ f->open_object_section("BackfillInterval");
+ pbi->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("backfills_in_flight");
+ for (std::set<hobject_t>::const_iterator i = backfills_in_flight.begin();
+ i != backfills_in_flight.end();
+ ++i) {
+ f->dump_stream("object") << *i;
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("recovering");
+ for (std::map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin();
+ i != recovering.end();
+ ++i) {
+ f->dump_stream("object") << i->first;
+ }
+ f->close_section();
+ }
+ {
+ f->open_object_section("pg_backend");
+ pgbackend->dump_recovery_info(f);
+ f->close_section();
+ }
+ }
+
+ /// last backfill operation started
+ hobject_t last_backfill_started;
+ bool new_backfill;
+
+ int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started);
+ int prep_object_replica_deletes(const hobject_t& soid, eversion_t v,
+ PGBackend::RecoveryHandle *h,
+ bool *work_started);
+
+ void finish_degraded_object(const hobject_t oid);
+
+ // Cancels/resets pulls from peer
+ void check_recovery_sources(const OSDMapRef& map) override ;
+
+ int recover_missing(
+ const hobject_t& oid,
+ eversion_t v,
+ int priority,
+ PGBackend::RecoveryHandle *h);
+
+ // low level ops
+
+ void _make_clone(
+ OpContext *ctx,
+ PGTransaction* t,
+ ObjectContextRef obc,
+ const hobject_t& head, const hobject_t& coid,
+ object_info_t *poi);
+ void execute_ctx(OpContext *ctx);
+ void finish_ctx(OpContext *ctx, int log_op_type, int result=0);
+ void reply_ctx(OpContext *ctx, int err);
+ void make_writeable(OpContext *ctx);
+ void log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb);
+
+ void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
+ interval_set<uint64_t>& modified, uint64_t offset,
+ uint64_t length, bool write_full=false);
+ inline void truncate_update_size_and_usage(
+ object_stat_sum_t& delta_stats,
+ object_info_t& oi,
+ uint64_t truncate_size);
+
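+  /// result codes of the cache-tier / manifest handling helpers below
+  /// (maybe_handle_cache_detail, maybe_handle_manifest_detail)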
+ enum class cache_result_t {
+ NOOP,
+ BLOCKED_FULL,
+ BLOCKED_PROMOTE,
+ HANDLED_PROXY,
+ HANDLED_REDIRECT,
+ REPLIED_WITH_EAGAIN,
+ BLOCKED_RECOVERY,
+ };
+ cache_result_t maybe_handle_cache_detail(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc, int r,
+ hobject_t missing_oid,
+ bool must_promote,
+ bool in_hit_set,
+ ObjectContextRef *promote_obc);
+ cache_result_t maybe_handle_manifest_detail(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc);
+ bool maybe_handle_manifest(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc) {
+ return cache_result_t::NOOP != maybe_handle_manifest_detail(
+ op,
+ write_ordered,
+ obc);
+ }
+
+ /**
+ * This helper function is called from do_op if the ObjectContext lookup fails.
+ * @returns true if the caching code is handling the Op, false otherwise.
+ */
+ bool maybe_handle_cache(OpRequestRef op,
+ bool write_ordered,
+ ObjectContextRef obc, int r,
+ const hobject_t& missing_oid,
+ bool must_promote,
+ bool in_hit_set = false) {
+ return cache_result_t::NOOP != maybe_handle_cache_detail(
+ op,
+ write_ordered,
+ obc,
+ r,
+ missing_oid,
+ must_promote,
+ in_hit_set,
+ nullptr);
+ }
+
+ /**
+ * This helper function checks if a promotion is needed.
+ */
+ bool maybe_promote(ObjectContextRef obc,
+ const hobject_t& missing_oid,
+ const object_locator_t& oloc,
+ bool in_hit_set,
+ uint32_t recency,
+ OpRequestRef promote_op,
+ ObjectContextRef *promote_obc = nullptr);
+ /**
+ * This helper function tells the client to redirect their request elsewhere.
+ */
+ void do_cache_redirect(OpRequestRef op);
+ /**
+ * This function attempts to start a promote. Either it succeeds,
+   * or places op on a wait list. If op is null, failure means that
+ * this is a noop. If a future user wants to be able to distinguish
+ * these cases, a return value should be added.
+ */
+ void promote_object(
+ ObjectContextRef obc, ///< [optional] obc
+ const hobject_t& missing_object, ///< oid (if !obc)
+ const object_locator_t& oloc, ///< locator for obc|oid
+ OpRequestRef op, ///< [optional] client op
+ ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object
+ );
+
+ int prepare_transaction(OpContext *ctx);
+ std::list<std::pair<OpRequestRef, OpContext*> > in_progress_async_reads;
+ void complete_read_ctx(int result, OpContext *ctx);
+
+ // pg on-disk content
+ void check_local() override;
+
+ void _clear_recovery_state() override;
+
+ bool start_recovery_ops(
+ uint64_t max,
+ ThreadPool::TPHandle &handle, uint64_t *started) override;
+
+ uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle);
+ uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *recovery_started);
+ hobject_t earliest_peer_backfill() const;
+ bool all_peer_done() const;
+ /**
+   * @param work_started will be set to true if recover_backfill got anywhere
+ * @returns the number of operations started
+ */
+ uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle,
+ bool *work_started);
+
+ /**
+ * scan a (hash) range of objects in the current pg
+ *
+ * @min return at least this many items, unless we are done
+ * @max return no more than this many items
+ * @bi.begin first item should be >= this value
+   * @bi [out] resulting map of objects to eversion_t's
+ */
+ void scan_range(
+ int min, int max, BackfillInterval *bi,
+ ThreadPool::TPHandle &handle
+ );
+
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
+ int prep_backfill_object_push(
+ hobject_t oid, eversion_t v, ObjectContextRef obc,
+ std::vector<pg_shard_t> peers,
+ PGBackend::RecoveryHandle *h);
+ void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);
+
+
+ class C_OSD_AppliedRecoveredObject;
+ class C_OSD_CommittedPushedObject;
+ class C_OSD_AppliedRecoveredObjectReplica;
+
+ void _applied_recovered_object(ObjectContextRef obc);
+ void _applied_recovered_object_replica();
+ void _committed_pushed_object(epoch_t epoch, eversion_t lc);
+ void recover_got(hobject_t oid, eversion_t v);
+
+ // -- copyfrom --
+ std::map<hobject_t, CopyOpRef> copy_ops;
+
+ int do_copy_get(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& op,
+ ObjectContextRef& obc);
+ int finish_copy_get();
+
+ void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
+ OSDOp& osd_op);
+
+ /**
+ * To copy an object, call start_copy.
+ *
+ * @param cb: The CopyCallback to be activated when the copy is complete
+ * @param obc: The ObjectContext we are copying into
+ * @param src: The source object
+ * @param oloc: the source object locator
+ * @param version: the version of the source object to copy (0 for any)
+ */
+ void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version, unsigned flags,
+ bool mirror_snapset, unsigned src_obj_fadvise_flags,
+ unsigned dest_obj_fadvise_flags);
+ void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r);
+ void _write_copy_chunk(CopyOpRef cop, PGTransaction *t);
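+  /// copy-from chunk size: osd_copyfrom_max_chunk rounded up to the pool's
+  /// required alignment, if any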
+ uint64_t get_copy_chunk_size() const {
+ uint64_t size = cct->_conf->osd_copyfrom_max_chunk;
+ if (pool.info.required_alignment()) {
+ uint64_t alignment = pool.info.required_alignment();
+ if (size % alignment) {
+ size += alignment - (size % alignment);
+ }
+ }
+ return size;
+ }
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void finish_copyfrom(CopyFromCallback *cb);
+ void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
+ void cancel_copy(CopyOpRef cop, bool requeue, std::vector<ceph_tid_t> *tids);
+ void cancel_copy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_Copyfrom;
+
+ // -- flush --
+ std::map<hobject_t, FlushOpRef> flush_ops;
+
+ /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS
+ int start_flush(
+ OpRequestRef op, ObjectContextRef obc,
+ bool blocking, hobject_t *pmissing,
+ std::optional<std::function<void()>> &&on_flush);
+ void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
+ int try_flush_mark_clean(FlushOpRef fop);
+ void cancel_flush(FlushOpRef fop, bool requeue, std::vector<ceph_tid_t> *tids);
+ void cancel_flush_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+  /// @return false if the clone has been evicted
+ bool is_present_clone(hobject_t coid);
+
+ friend struct C_Flush;
+
+ // -- scrub --
+ bool _range_available_for_scrub(
+ const hobject_t &begin, const hobject_t &end) override;
+
+ void _split_into(pg_t child_pgid, PG *child,
+ unsigned split_bits) override;
+ void apply_and_flush_repops(bool requeue);
+
+ int do_xattr_cmp_u64(int op, __u64 v1, ceph::buffer::list& xattr);
+ int do_xattr_cmp_str(int op, std::string& v1s, ceph::buffer::list& xattr);
+
+ // -- checksum --
+ int do_checksum(OpContext *ctx, OSDOp& osd_op, ceph::buffer::list::const_iterator *bl_it);
+ int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type,
+ ceph::buffer::list::const_iterator *init_value_bl_it,
+ const ceph::buffer::list &read_bl);
+
+ friend struct C_ChecksumRead;
+
+ int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
+ int finish_extent_cmp(OSDOp& osd_op, const ceph::buffer::list &read_bl);
+
+ friend struct C_ExtentCmpRead;
+
+ int do_read(OpContext *ctx, OSDOp& osd_op);
+ int do_sparse_read(OpContext *ctx, OSDOp& osd_op);
+ int do_writesame(OpContext *ctx, OSDOp& osd_op);
+
+ bool pgls_filter(const PGLSFilter& filter, const hobject_t& sobj);
+
+ std::pair<int, std::unique_ptr<const PGLSFilter>> get_pgls_filter(
+ ceph::buffer::list::const_iterator& iter);
+
+ std::map<hobject_t, std::list<OpRequestRef>> in_progress_proxy_ops;
+ void kick_proxy_ops_blocked(hobject_t& soid);
+ void cancel_proxy_ops(bool requeue, std::vector<ceph_tid_t> *tids);
+
+ // -- proxyread --
+ std::map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;
+
+ void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL);
+ void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
+ void cancel_proxy_read(ProxyReadOpRef prdop, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_ProxyRead;
+
+ // -- proxywrite --
+ std::map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;
+
+ void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL);
+ void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
+ void cancel_proxy_write(ProxyWriteOpRef pwop, std::vector<ceph_tid_t> *tids);
+
+ friend struct C_ProxyWrite_Commit;
+
+ // -- chunkop --
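+  /// reference-count operations issued against manifest/dedup chunk objects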
+ enum class refcount_t {
+ INCREMENT_REF,
+ DECREMENT_REF,
+ CREATE_OR_GET_REF,
+ };
+ void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
+ ObjectContextRef obc, bool write_ordered);
+ void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
+ uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
+ uint64_t req_total_len, bool write_ordered);
+ bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc);
+ void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset);
+ void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset);
+ void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc);
+ void cancel_and_requeue_proxy_ops(hobject_t oid);
+ void cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids);
+ ceph_tid_t refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
+ Context *cb, std::optional<bufferlist> chunk);
+ void dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx);
+ void dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs);
+ void dec_refcount_by_dirty(OpContext* ctx);
+ ObjectContextRef get_prev_clone_obc(ObjectContextRef obc);
+ bool recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op);
+ void get_adjacent_clones(ObjectContextRef src_obc,
+ ObjectContextRef& _l, ObjectContextRef& _g);
+ bool inc_refcount_by_set(OpContext* ctx, object_manifest_t& tgt,
+ OSDOp& osd_op);
+ int do_cdc(const object_info_t& oi, std::map<uint64_t, chunk_info_t>& chunk_map,
+ std::map<uint64_t, bufferlist>& chunks);
+ int start_dedup(OpRequestRef op, ObjectContextRef obc);
+ std::pair<int, hobject_t> get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk);
+ int finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset);
+
+ friend struct C_ProxyChunkRead;
+ friend class PromoteManifestCallback;
+ friend struct C_CopyChunk;
+ friend struct RefCountCallback;
+ friend struct C_SetDedupChunks;
+
+public:
+ PrimaryLogPG(OSDService *o, OSDMapRef curmap,
+ const PGPool &_pool,
+ const std::map<std::string,std::string>& ec_profile,
+ spg_t p);
+ ~PrimaryLogPG() override;
+
+ void do_command(
+ const std::string_view& prefix,
+ const cmdmap_t& cmdmap,
+ const ceph::buffer::list& idata,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish) override;
+
+ void clear_cache() override;
+ int get_cache_obj_count() override {
+ return object_contexts.get_count();
+ }
+ unsigned get_pg_shard() const {
+ return info.pgid.hash_to_shard(osd->get_num_shards());
+ }
+ void do_request(
+ OpRequestRef& op,
+ ThreadPool::TPHandle &handle) override;
+ void do_op(OpRequestRef& op);
+ void record_write_error(OpRequestRef op, const hobject_t &soid,
+ MOSDOpReply *orig_reply, int r,
+ OpContext *ctx_for_op_returns=nullptr);
+ void do_pg_op(OpRequestRef op);
+ void do_scan(
+ OpRequestRef op,
+ ThreadPool::TPHandle &handle);
+ void do_backfill(OpRequestRef op);
+ void do_backfill_remove(OpRequestRef op);
+
+ void handle_backoff(OpRequestRef& op);
+
+ int trim_object(bool first, const hobject_t &coid, snapid_t snap_to_trim,
+ OpContextUPtr *ctxp);
+ void snap_trimmer(epoch_t e) override;
+ void kick_snap_trim() override;
+ void snap_trimmer_scrub_complete() override;
+ int do_osd_ops(OpContext *ctx, std::vector<OSDOp>& ops);
+
+ int _get_tmap(OpContext *ctx, ceph::buffer::list *header, ceph::buffer::list *vals);
+ int do_tmap2omap(OpContext *ctx, unsigned flags);
+ int do_tmapup(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op);
+ int do_tmapup_slow(OpContext *ctx, ceph::buffer::list::const_iterator& bp, OSDOp& osd_op, ceph::buffer::list& bl);
+
+ void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn);
+private:
+ int do_scrub_ls(const MOSDOp *op, OSDOp *osd_op);
+ bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const;
+
+ uint64_t temp_seq; ///< last id for naming temp objects
+ /// generate a new temp object name
+ hobject_t generate_temp_object(const hobject_t& target);
+ /// generate a new temp object name (for recovery)
+ hobject_t get_temp_recovery_object(const hobject_t& target,
+ eversion_t version) override;
+ int get_recovery_op_priority() const {
+ int64_t pri = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
+ return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
+ }
+
+public:
+ coll_t get_coll() {
+ return coll;
+ }
+ void split_colls(
+ spg_t child,
+ int split_bits,
+ int seed,
+ const pg_pool_t *pool,
+ ObjectStore::Transaction &t) override {
+ coll_t target = coll_t(child);
+ create_pg_collection(t, child, split_bits);
+ t.split_collection(
+ coll,
+ split_bits,
+ seed,
+ target);
+ init_pg_ondisk(t, child, pool);
+ }
+private:
+
+ struct DoSnapWork : boost::statechart::event< DoSnapWork > {
+ DoSnapWork() : boost::statechart::event < DoSnapWork >() {}
+ };
+ struct KickTrim : boost::statechart::event< KickTrim > {
+ KickTrim() : boost::statechart::event < KickTrim >() {}
+ };
+ struct RepopsComplete : boost::statechart::event< RepopsComplete > {
+ RepopsComplete() : boost::statechart::event < RepopsComplete >() {}
+ };
+ struct ScrubComplete : boost::statechart::event< ScrubComplete > {
+ ScrubComplete() : boost::statechart::event < ScrubComplete >() {}
+ };
+ struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > {
+ TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {}
+ };
+ struct Reset : boost::statechart::event< Reset > {
+ Reset() : boost::statechart::event< Reset >() {}
+ };
+ struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > {
+ SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {}
+ };
+ struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > {
+ SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {}
+ };
+
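+ // The events and states below form the snap-trim state machine. From the
+ // reactions defined here: Trimming starts in WaitReservation; WaitRWLock and
+ // WaitTrimTimer lead back to AwaitAsyncWork, and WaitRepops leads to
+ // WaitTrimTimer, with each falling back to NotTrimming once trimming is no
+ // longer permitted. Reset always returns to NotTrimming, and leaving Trimming
+ // cancels the snap reservation.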
+ struct NotTrimming;
+ struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > {
+ PrimaryLogPG *pg;
+ explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {}
+ void log_enter(const char *state_name);
+ void log_exit(const char *state_name, utime_t duration);
+ bool permit_trim();
+ bool can_trim() {
+ return
+ permit_trim() &&
+ !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM);
+ }
+ } snap_trimmer_machine;
+
+ struct WaitReservation;
+ struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+
+ std::set<hobject_t> in_flight;
+ snapid_t snap_to_trim;
+
+ explicit Trimming(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context< SnapTrimmer >().permit_trim());
+ ceph_assert(in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ auto *pg = context< SnapTrimmer >().pg;
+ pg->osd->snap_reserver.cancel_reservation(pg->get_pgid());
+ pg->state_clear(PG_STATE_SNAPTRIM);
+ pg->publish_stats_to_osd();
+ }
+ boost::statechart::result react(const KickTrim&) {
+ return discard_event();
+ }
+ };
+
+ /* SnapTrimmerStates */
+ struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< SnapTrimTimerReady >
+ > reactions;
+ Context *wakeup = nullptr;
+ explicit WaitTrimTimer(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitTrimTimer") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ struct OnTimer : Context {
+ PrimaryLogPGRef pg;
+ epoch_t epoch;
+ OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
+ void finish(int) override {
+ pg->lock();
+ if (!pg->pg_has_reset_since(epoch))
+ pg->snap_trimmer_machine.process_event(SnapTrimTimerReady());
+ pg->unlock();
+ }
+ };
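+ // if osd_snap_trim_sleep is configured, delay the next trim batch via the
+ // sleep timer; otherwise post SnapTrimTimerReady immediately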
+ auto *pg = context< SnapTrimmer >().pg;
+ float osd_snap_trim_sleep = pg->osd->osd->get_osd_snap_trim_sleep();
+ if (osd_snap_trim_sleep > 0) {
+ std::lock_guard l(pg->osd->sleep_lock);
+ wakeup = pg->osd->sleep_timer.add_event_after(
+ osd_snap_trim_sleep,
+ new OnTimer{pg, pg->get_osdmap_epoch()});
+ } else {
+ post_event(SnapTrimTimerReady());
+ }
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ auto *pg = context< SnapTrimmer >().pg;
+ if (wakeup) {
+ std::lock_guard l(pg->osd->sleep_lock);
+ pg->osd->sleep_timer.cancel_event(wakeup);
+ wakeup = nullptr;
+ }
+ }
+ boost::statechart::result react(const SnapTrimTimerReady &) {
+ wakeup = nullptr;
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< AwaitAsyncWork >();
+ }
+ }
+ };
+
+ struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< TrimWriteUnblocked >
+ > reactions;
+ explicit WaitRWLock(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitRWLock") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const TrimWriteUnblocked&) {
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< AwaitAsyncWork >();
+ }
+ }
+ };
+
+ struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< RepopsComplete >
+ > reactions;
+ explicit WaitRepops(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitRepops") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(!context<Trimming>().in_flight.empty());
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const RepopsComplete&) {
+ if (!context< SnapTrimmer >().can_trim()) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ } else {
+ return transit< WaitTrimTimer >();
+ }
+ }
+ };
+
+ struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< DoSnapWork >
+ > reactions;
+ explicit AwaitAsyncWork(my_context ctx);
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const DoSnapWork&);
+ };
+
+ struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState {
+ /* WaitReservation is a sub-state of trimming simply so that exiting Trimming
+ * always cancels the reservation */
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< SnapTrimReserved >
+ > reactions;
+ struct ReservationCB : public Context {
+ PrimaryLogPGRef pg;
+ bool canceled;
+ explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {}
+ void finish(int) override {
+ pg->lock();
+ if (!canceled)
+ pg->snap_trimmer_machine.process_event(SnapTrimReserved());
+ pg->unlock();
+ }
+ void cancel() {
+ ceph_assert(pg->is_locked());
+ ceph_assert(!canceled);
+ canceled = true;
+ }
+ };
+ ReservationCB *pending = nullptr;
+
+ explicit WaitReservation(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitReservation") {
+ context< SnapTrimmer >().log_enter(state_name);
+ ceph_assert(context<Trimming>().in_flight.empty());
+ auto *pg = context< SnapTrimmer >().pg;
+ pending = new ReservationCB(pg);
+ pg->osd->snap_reserver.request_reservation(
+ pg->get_pgid(),
+ pending,
+ 0);
+ pg->state_set(PG_STATE_SNAPTRIM_WAIT);
+ pg->publish_stats_to_osd();
+ }
+ boost::statechart::result react(const SnapTrimReserved&);
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ if (pending)
+ pending->cancel();
+ pending = nullptr;
+ auto *pg = context< SnapTrimmer >().pg;
+ pg->state_clear(PG_STATE_SNAPTRIM_WAIT);
+ pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
+ pg->publish_stats_to_osd();
+ }
+ };
+
+ struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< ScrubComplete >,
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+ explicit WaitScrub(my_context ctx)
+ : my_base(ctx),
+ NamedState(nullptr, "Trimming/WaitScrub") {
+ context< SnapTrimmer >().log_enter(state_name);
+ }
+ void exit() {
+ context< SnapTrimmer >().log_exit(state_name, enter_time);
+ }
+ boost::statechart::result react(const ScrubComplete&) {
+ post_event(KickTrim());
+ return transit< NotTrimming >();
+ }
+ boost::statechart::result react(const KickTrim&) {
+ return discard_event();
+ }
+ };
+
+ struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState {
+ typedef boost::mpl::list <
+ boost::statechart::custom_reaction< KickTrim >,
+ boost::statechart::transition< Reset, NotTrimming >
+ > reactions;
+ explicit NotTrimming(my_context ctx);
+ void exit();
+ boost::statechart::result react(const KickTrim&);
+ };
+
+ int _verify_no_head_clones(const hobject_t& soid,
+ const SnapSet& ss);
+ // create a local object if needed; a whiteout or an unchanged object
+ // means there is nothing to create.
+ void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false);
+ int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout);
+ int _rollback_to(OpContext *ctx, ceph_osd_op& op);
+public:
+ bool is_missing_object(const hobject_t& oid) const;
+ bool is_unreadable_object(const hobject_t &oid) const {
+ return is_missing_object(oid) ||
+ !recovery_state.get_missing_loc().readable_with_acting(
+ oid, get_actingset());
+ }
+ void maybe_kick_recovery(const hobject_t &soid);
+ void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
+
+ int get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op);
+
+ bool check_laggy(OpRequestRef& op);
+ bool check_laggy_requeue(OpRequestRef& op);
+ void recheck_readable() override;
+
+ bool is_backfill_target(pg_shard_t osd) const {
+ return recovery_state.is_backfill_target(osd);
+ }
+ const std::set<pg_shard_t> &get_backfill_targets() const {
+ return recovery_state.get_backfill_targets();
+ }
+ bool is_async_recovery_target(pg_shard_t peer) const {
+ return recovery_state.is_async_recovery_target(peer);
+ }
+ const std::set<pg_shard_t> &get_async_recovery_targets() const {
+ return recovery_state.get_async_recovery_targets();
+ }
+ bool is_degraded_or_backfilling_object(const hobject_t& oid);
+ bool is_degraded_on_async_recovery_target(const hobject_t& soid);
+ void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
+
+ void block_write_on_full_cache(
+ const hobject_t& oid, OpRequestRef op);
+ void block_for_clean(
+ const hobject_t& oid, OpRequestRef op);
+ void block_write_on_snap_rollback(
+ const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
+ void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
+
+ bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
+ void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
+ void kick_object_context_blocked(ObjectContextRef obc);
+
+ void maybe_force_recovery();
+
+ void mark_all_unfound_lost(
+ int what,
+ std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish);
+ eversion_t pick_newest_available(const hobject_t& oid);
+
+ void do_update_log_missing(
+ OpRequestRef &op);
+
+ void do_update_log_missing_reply(
+ OpRequestRef &op);
+
+ void plpg_on_role_change() override;
+ void plpg_on_pool_change() override;
+ void clear_async_reads();
+ void on_change(ObjectStore::Transaction &t) override;
+ void on_activate_complete() override;
+ void on_flushed() override;
+ void on_removal(ObjectStore::Transaction &t) override;
+ void on_shutdown() override;
+ bool check_failsafe_full() override;
+ bool maybe_preempt_replica_scrub(const hobject_t& oid) override;
+ int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
+
+ // attr cache handling
+ void setattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const std::string &key,
+ ceph::buffer::list &val);
+ void setattrs_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ std::map<std::string, ceph::buffer::list> &attrs);
+ void rmattr_maybe_cache(
+ ObjectContextRef obc,
+ PGTransaction *t,
+ const std::string &key);
+ int getattr_maybe_cache(
+ ObjectContextRef obc,
+ const std::string &key,
+ ceph::buffer::list *val);
+ int getattrs_maybe_cache(
+ ObjectContextRef obc,
+ std::map<std::string, ceph::buffer::list> *out);
+
+public:
+ void set_dynamic_perf_stats_queries(
+ const std::list<OSDPerfMetricQuery> &queries) override;
+ void get_dynamic_perf_stats(DynamicPerfStats *stats) override;
+
+private:
+ DynamicPerfStats m_dynamic_perf_stats;
+};
+
+inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop)
+{
+ out << "repgather(" << &repop
+ << " " << repop.v
+ << " rep_tid=" << repop.rep_tid
+ << " committed?=" << repop.all_committed
+ << " r=" << repop.r
+ << ")";
+ return out;
+}
+
+inline ostream& operator<<(ostream& out,
+ const PrimaryLogPG::ProxyWriteOpRef& pwop)
+{
+ out << "proxywrite(" << &pwop
+ << " " << pwop->user_version
+ << " pwop_tid=" << pwop->objecter_tid;
+ if (pwop->ctx->op)
+ out << " op=" << *(pwop->ctx->op->get_req());
+ out << ")";
+ return out;
+}
+
+void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop);
+void intrusive_ptr_release(PrimaryLogPG::RepGather *repop);
+
+
+#endif
diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc
new file mode 100644
index 000000000..8cf76dd1d
--- /dev/null
+++ b/src/osd/PrimaryLogScrub.cc
@@ -0,0 +1,589 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PrimaryLogScrub.h"
+
+#include "common/scrub_types.h"
+
+#include "PeeringState.h"
+#include "PrimaryLogPG.h"
+#include "scrub_machine.h"
+
+#define dout_context (m_osds->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T* t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+using namespace Scrub;
+using Scrub::ScrubMachine;
+
+bool PrimaryLogScrub::get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const
+{
+ if (!m_store) {
+ return false;
+ }
+
+ if (arg.get_snapsets) {
+ res_inout.vals =
+ m_store->get_snap_errors(m_pg->get_pgid().pool(), arg.start_after, arg.max_return);
+ } else {
+ res_inout.vals = m_store->get_object_errors(m_pg->get_pgid().pool(), arg.start_after,
+ arg.max_return);
+ }
+ return true;
+}
+
+void PrimaryLogScrub::_scrub_finish()
+{
+ auto& info = m_pg->info; ///< a temporary alias
+
+ dout(10) << __func__
+ << " info stats: " << (info.stats.stats_invalid ? "invalid" : "valid")
+ << dendl;
+
+ if (info.stats.stats_invalid) {
+ m_pl_pg->recovery_state.update_stats([=](auto& history, auto& stats) {
+ stats.stats = m_scrub_cstat;
+ stats.stats_invalid = false;
+ return false;
+ });
+
+ if (m_pl_pg->agent_state)
+ m_pl_pg->agent_choose_mode();
+ }
+
+ dout(10) << m_mode_desc << " got " << m_scrub_cstat.sum.num_objects << "/"
+ << info.stats.stats.sum.num_objects << " objects, "
+ << m_scrub_cstat.sum.num_object_clones << "/"
+ << info.stats.stats.sum.num_object_clones << " clones, "
+ << m_scrub_cstat.sum.num_objects_dirty << "/"
+ << info.stats.stats.sum.num_objects_dirty << " dirty, "
+ << m_scrub_cstat.sum.num_objects_omap << "/"
+ << info.stats.stats.sum.num_objects_omap << " omap, "
+ << m_scrub_cstat.sum.num_objects_pinned << "/"
+ << info.stats.stats.sum.num_objects_pinned << " pinned, "
+ << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+ << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
+ << m_scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes
+ << " bytes, " << m_scrub_cstat.sum.num_objects_manifest << "/"
+ << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
+ << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+ << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
+ << dendl;
+
+ if (m_scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
+ m_scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
+ (m_scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
+ !info.stats.dirty_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
+ !info.stats.omap_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+ !info.stats.pin_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_hit_set_archive !=
+ info.stats.stats.sum.num_objects_hit_set_archive &&
+ !info.stats.hitset_stats_invalid) ||
+ (m_scrub_cstat.sum.num_bytes_hit_set_archive !=
+ info.stats.stats.sum.num_bytes_hit_set_archive &&
+ !info.stats.hitset_bytes_stats_invalid) ||
+ (m_scrub_cstat.sum.num_objects_manifest !=
+ info.stats.stats.sum.num_objects_manifest &&
+ !info.stats.manifest_stats_invalid) ||
+ m_scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
+ m_scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " : stat mismatch, got "
+ << m_scrub_cstat.sum.num_objects << "/"
+ << info.stats.stats.sum.num_objects << " objects, "
+ << m_scrub_cstat.sum.num_object_clones << "/"
+ << info.stats.stats.sum.num_object_clones << " clones, "
+ << m_scrub_cstat.sum.num_objects_dirty << "/"
+ << info.stats.stats.sum.num_objects_dirty << " dirty, "
+ << m_scrub_cstat.sum.num_objects_omap << "/"
+ << info.stats.stats.sum.num_objects_omap << " omap, "
+ << m_scrub_cstat.sum.num_objects_pinned << "/"
+ << info.stats.stats.sum.num_objects_pinned << " pinned, "
+ << m_scrub_cstat.sum.num_objects_hit_set_archive << "/"
+ << info.stats.stats.sum.num_objects_hit_set_archive
+ << " hit_set_archive, " << m_scrub_cstat.sum.num_whiteouts
+ << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
+ << m_scrub_cstat.sum.num_bytes << "/"
+ << info.stats.stats.sum.num_bytes << " bytes, "
+ << m_scrub_cstat.sum.num_objects_manifest << "/"
+ << info.stats.stats.sum.num_objects_manifest
+ << " manifest objects, "
+ << m_scrub_cstat.sum.num_bytes_hit_set_archive << "/"
+ << info.stats.stats.sum.num_bytes_hit_set_archive
+ << " hit_set_archive bytes.";
+ ++m_shallow_errors;
+
+ if (m_is_repair) {
+ ++m_fixed_count;
+ m_pl_pg->recovery_state.update_stats([this](auto& history, auto& stats) {
+ stats.stats = m_scrub_cstat;
+ stats.dirty_stats_invalid = false;
+ stats.omap_stats_invalid = false;
+ stats.hitset_stats_invalid = false;
+ stats.hitset_bytes_stats_invalid = false;
+ stats.pin_stats_invalid = false;
+ stats.manifest_stats_invalid = false;
+ return false;
+ });
+ m_pl_pg->publish_stats_to_osd();
+ m_pl_pg->recovery_state.share_pg_info();
+ }
+ }
+ // Clear object context cache to get repair information
+ if (m_is_repair)
+ m_pl_pg->object_contexts.clear();
+}
+
+static bool doing_clones(const std::optional<SnapSet>& snapset,
+ const vector<snapid_t>::reverse_iterator& curclone)
+{
+ return snapset && curclone != snapset->clones.rend();
+}
+
+void PrimaryLogScrub::log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ const char* func,
+ bool allow_incomplete_clones)
+{
+ ceph_assert(head);
+ if (allow_incomplete_clones) {
+ dout(20) << func << " " << m_mode_desc << " " << pgid << " " << *head << " skipped "
+ << missing << " clone(s) in cache tier" << dendl;
+ } else {
+ clog->info() << m_mode_desc << " " << pgid << " " << *head << " : " << missing
+ << " missing clone(s)";
+ }
+}
+
+int PrimaryLogScrub::process_clones_to(const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ bool allow_incomplete_clones,
+ std::optional<snapid_t> target,
+ vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& e)
+{
+ ceph_assert(head);
+ ceph_assert(snapset);
+ int missing_count = 0;
+
+ // NOTE: clones are in descending order, hence the **curclone > target test here
+ hobject_t next_clone(*head);
+ while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+
+ ++missing_count;
+ // it is okay to be missing one or more clones in a cache tier.
+ // skip higher-numbered clones in the list.
+ if (!allow_incomplete_clones) {
+ next_clone.snap = **curclone;
+ clog->error() << m_mode_desc << " " << pgid << " " << *head << " : expected clone "
+ << next_clone << " " << m_missing << " missing";
+ ++m_shallow_errors;
+ e.set_clone_missing(next_clone.snap);
+ }
+ // Clones are descending
+ ++(*curclone);
+ }
+ return missing_count;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are effectively comparing two lists. The main loop is over
+ * objmap.objects, but each object is compared against multiple
+ * snapset.clones. There are multiple clone lists, and between lists we
+ * expect a head.
+ *
+ * Example
+ *
+ * objects expected
+ * ======= =======
+ * obj1 snap 1 head, unexpected obj1 snap 1
+ * obj2 head head, match
+ * [SnapSet clones 6 4 2 1]
+ * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6 obj2 snap 6, match
+ * obj2 snap 4 obj2 snap 4, match
+ * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
+ * [Snapset clones 3 1]
+ * obj3 snap 3 obj3 snap 3 match
+ * obj3 snap 1 obj3 snap 1 match
+ * obj4 head head, match
+ * [Snapset clones 4]
+ * EOL obj4 snap 4, (expected)
+ */
+void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap,
+ const missing_map_t& missing_digest)
+{
+ dout(10) << __func__ << " num stat obj " << m_pl_pg->info.stats.stats.sum.num_objects
+ << dendl;
+
+ auto& info = m_pl_pg->info;
+ const PGPool& pool = m_pl_pg->pool;
+ bool allow_incomplete_clones = pool.info.allow_incomplete_clones();
+
+ std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
+
+ // traverse in reverse order.
+ std::optional<hobject_t> head;
+ std::optional<SnapSet> snapset; // if snapset is initialized, head (above) is too
+ vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+ int missing = 0;
+ inconsistent_snapset_wrapper soid_error, head_error;
+ int soid_error_count = 0;
+
+ for (auto p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
+
+ const hobject_t& soid = p->first;
+ ceph_assert(!soid.is_snapdir());
+ soid_error = inconsistent_snapset_wrapper{soid};
+ object_stat_sum_t stat;
+ std::optional<object_info_t> oi;
+
+ stat.num_objects++;
+
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_objects_hit_set_archive++;
+
+ if (soid.is_snap()) {
+ // it's a clone
+ stat.num_object_clones++;
+ }
+
+ // basic checks.
+ if (p->second.attrs.count(OI_ATTR) == 0) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << OI_ATTR << "' attr";
+ ++m_shallow_errors;
+ soid_error.set_info_missing();
+ } else {
+ bufferlist bv;
+ bv.push_back(p->second.attrs[OI_ATTR]);
+ try {
+ oi = object_info_t(); // Initialize optional<> before decode into it
+ oi->decode(bv);
+ } catch (ceph::buffer::error& e) {
+ oi = std::nullopt;
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : can't decode '" << OI_ATTR << "' attr " << e.what();
+ ++m_shallow_errors;
+ soid_error.set_info_corrupted();
+ soid_error.set_info_missing(); // Not available too
+ }
+ }
+
+ if (oi) {
+ if (m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : on disk size (" << p->second.size
+ << ") does not match object info size (" << oi->size
+ << ") adjusted for ondisk to ("
+ << m_pl_pg->pgbackend->be_get_ondisk_size(oi->size) << ")";
+ soid_error.set_size_mismatch();
+ ++m_shallow_errors;
+ }
+
+ dout(20) << m_mode_desc << " " << soid << " " << *oi << dendl;
+
+ // A clone num_bytes will be added later when we have snapset
+ if (!soid.is_snap()) {
+ stat.num_bytes += oi->size;
+ }
+ if (soid.nspace == m_pl_pg->cct->_conf->osd_hit_set_namespace)
+ stat.num_bytes_hit_set_archive += oi->size;
+
+ if (oi->is_dirty())
+ ++stat.num_objects_dirty;
+ if (oi->is_whiteout())
+ ++stat.num_whiteouts;
+ if (oi->is_omap())
+ ++stat.num_objects_omap;
+ if (oi->is_cache_pinned())
+ ++stat.num_objects_pinned;
+ if (oi->has_manifest())
+ ++stat.num_objects_manifest;
+ }
+
+ // Check for any problems while processing clones
+ if (doing_clones(snapset, curclone)) {
+ std::optional<snapid_t> target;
+ // Expecting an object with snap for current head
+ if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid << " new object " << soid
+ << " while processing " << *head << dendl;
+
+ target = all_clones;
+ } else {
+ ceph_assert(soid.is_snap());
+ target = soid.snap;
+ }
+
+ // Log any clones we were expecting to be there up to target
+ // This will set missing, but will be a no-op if soid.snap == *curclone.
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, target, &curclone, head_error);
+ }
+
+ bool expected;
+ // Check doing_clones() again in case we ran process_clones_to()
+ if (doing_clones(snapset, curclone)) {
+ // A head would have processed all clones above
+ // or all greater than *curclone.
+ ceph_assert(soid.is_snap() && *curclone <= soid.snap);
+
+ // After processing above clone snap should match the expected curclone
+ expected = (*curclone == soid.snap);
+ } else {
+ // If we aren't doing clones any longer, then expecting head
+ expected = soid.has_snapset();
+ }
+ if (!expected) {
+ // If we couldn't read the head's snapset, just ignore clones
+ if (head && !snapset) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : clone ignored due to missing snapset";
+ } else {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is an unexpected clone";
+ }
+ ++m_shallow_errors;
+ soid_error.set_headless();
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ if (head && soid.get_head() == head->get_head())
+ head_error.set_clone(soid.snap);
+ continue;
+ }
+
+ // new snapset?
+ if (soid.has_snapset()) {
+
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ pool.info.allow_incomplete_clones());
+ }
+
+ // Save previous head error information
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+ // Set this as a new head object
+ head = soid;
+ missing = 0;
+ head_error = soid_error;
+ soid_error_count = 0;
+
+ dout(20) << __func__ << " " << m_mode_desc << " new head " << head << dendl;
+
+ if (p->second.attrs.count(SS_ATTR) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid << " : no '"
+ << SS_ATTR << "' attr";
+ ++m_shallow_errors;
+ snapset = std::nullopt;
+ head_error.set_snapset_missing();
+ } else {
+ bufferlist bl;
+ bl.push_back(p->second.attrs[SS_ATTR]);
+ auto blp = bl.cbegin();
+ try {
+ snapset = SnapSet(); // Initialize optional<> before decoding into it
+ decode(*snapset, blp);
+ head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
+ } catch (ceph::buffer::error& e) {
+ snapset = std::nullopt;
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : can't decode '" << SS_ATTR
+ << "' attr " << e.what();
+ ++m_shallow_errors;
+ head_error.set_snapset_corrupted();
+ }
+ }
+
+ if (snapset) {
+ // what will be next?
+ curclone = snapset->clones.rbegin();
+
+ if (!snapset->clones.empty()) {
+ dout(20) << " snapset " << *snapset << dendl;
+ if (snapset->seq == 0) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : snaps.seq not set";
+ ++m_shallow_errors;
+ head_error.set_snapset_error();
+ }
+ }
+ }
+ } else {
+ ceph_assert(soid.is_snap());
+ ceph_assert(head);
+ ceph_assert(snapset);
+ ceph_assert(soid.snap == *curclone);
+
+ dout(20) << __func__ << " " << m_mode_desc << " matched clone " << soid << dendl;
+
+ if (snapset->clone_size.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_size";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ if (oi && oi->size != snapset->clone_size[soid.snap]) {
+ m_osds->clog->error()
+ << m_mode_desc << " " << info.pgid << " " << soid << " : size " << oi->size
+ << " != clone_size " << snapset->clone_size[*curclone];
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ }
+
+ if (snapset->clone_overlap.count(soid.snap) == 0) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : is missing in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ // This checking is based on get_clone_bytes(). The first 2 asserts
+ // can't happen because we know we have a clone_size and
+ // a clone_overlap. Now we check that the interval_set won't
+ // cause the last assert.
+ uint64_t size = snapset->clone_size.find(soid.snap)->second;
+ const interval_set<uint64_t>& overlap =
+ snapset->clone_overlap.find(soid.snap)->second;
+ bool bad_interval_set = false;
+ for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+ i != overlap.end(); ++i) {
+ if (size < i.get_len()) {
+ bad_interval_set = true;
+ break;
+ }
+ size -= i.get_len();
+ }
+
+ if (bad_interval_set) {
+ m_osds->clog->error() << m_mode_desc << " " << info.pgid << " " << soid
+ << " : bad interval_set in clone_overlap";
+ ++m_shallow_errors;
+ soid_error.set_size_mismatch();
+ } else {
+ stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+ }
+ }
+ }
+
+ // what's next?
+ ++curclone;
+ if (soid_error.errors) {
+ m_store->add_snap_error(pool.id, soid_error);
+ ++soid_error_count;
+ }
+ }
+ m_scrub_cstat.add(stat);
+ }
+
+ if (doing_clones(snapset, curclone)) {
+ dout(10) << __func__ << " " << m_mode_desc << " " << info.pgid
+ << " No more objects while processing " << *head << dendl;
+
+ missing +=
+ process_clones_to(head, snapset, m_osds->clog, info.pgid,
+ allow_incomplete_clones, all_clones, &curclone, head_error);
+ }
+
+ // There could be missing found by the test above or even
+ // before dropping out of the loop for the last head.
+ if (missing) {
+ log_missing(missing, head, m_osds->clog, info.pgid, __func__,
+ allow_incomplete_clones);
+ }
+ if (head && (head_error.errors || soid_error_count))
+ m_store->add_snap_error(pool.id, head_error);
+
+ dout(20) << __func__ << " - " << missing << " (" << missing_digest.size() << ") missing"
+ << dendl;
+ for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
+
+ ceph_assert(!p->first.is_snapdir());
+ dout(10) << __func__ << " recording digests for " << p->first << dendl;
+
+ ObjectContextRef obc = m_pl_pg->get_object_context(p->first, false);
+ if (!obc) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc
+ << " cannot get object context for object " << p->first;
+ continue;
+ }
+ if (obc->obs.oi.soid != p->first) {
+ m_osds->clog->error() << info.pgid << " " << m_mode_desc << " " << p->first
+ << " : object has a valid oi attr with a mismatched name, "
+ << " obc->obs.oi.soid: " << obc->obs.oi.soid;
+ continue;
+ }
+ PrimaryLogPG::OpContextUPtr ctx = m_pl_pg->simple_opc_create(obc);
+ ctx->at_version = m_pl_pg->get_next_version();
+ ctx->mtime = utime_t(); // do not update mtime
+ if (p->second.first) {
+ ctx->new_obs.oi.set_data_digest(*p->second.first);
+ } else {
+ ctx->new_obs.oi.clear_data_digest();
+ }
+ if (p->second.second) {
+ ctx->new_obs.oi.set_omap_digest(*p->second.second);
+ } else {
+ ctx->new_obs.oi.clear_omap_digest();
+ }
+ m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
+
+ ++num_digest_updates_pending;
+ ctx->register_on_success([this]() {
+ if ((num_digest_updates_pending >= 1) &&
+ (--num_digest_updates_pending == 0)) {
+ m_osds->queue_scrub_digest_update(m_pl_pg, m_pl_pg->is_scrub_blocking_ops());
+ }
+ });
+
+ m_pl_pg->simple_opc_submit(std::move(ctx));
+ }
+
+ dout(10) << __func__ << " (" << m_mode_desc << ") finish" << dendl;
+}
+
+PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {}
+
+void PrimaryLogScrub::_scrub_clear_state()
+{
+ dout(15) << __func__ << dendl;
+ m_scrub_cstat = object_stat_collection_t();
+}
+
+void PrimaryLogScrub::stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid)
+{
+ // We scrub objects in hobject_t order, so objects before m_start have already been
+ // scrubbed and their stats have already been added to the scrubber. Objects after that
+ // point haven't been included in the scrubber's stats accounting yet, so they will be
+ // included when the scrubber gets to that object.
+ dout(15) << __func__ << " soid: " << soid << " scrub is active? " << is_scrub_active()
+ << dendl;
+ if (is_primary() && is_scrub_active()) {
+ if (soid < m_start) {
+ dout(20) << __func__ << " " << soid << " < [" << m_start << "," << m_end << ")"
+ << dendl;
+ m_scrub_cstat.add(delta_stats);
+ } else {
+ dout(20) << __func__ << " " << soid << " >= [" << m_start << "," << m_end << ")"
+ << dendl;
+ }
+ }
+}
diff --git a/src/osd/PrimaryLogScrub.h b/src/osd/PrimaryLogScrub.h
new file mode 100644
index 000000000..78353d6db
--- /dev/null
+++ b/src/osd/PrimaryLogScrub.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+// the './' includes are marked this way to affect clang-format
+#include "./pg_scrubber.h"
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "common/scrub_types.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "OSD.h"
+#include "scrub_machine.h"
+
+class PrimaryLogPG;
+
+/**
+ * The derivative of PgScrubber that is used by PrimaryLogPG.
+ */
+class PrimaryLogScrub : public PgScrubber {
+ public:
+ explicit PrimaryLogScrub(PrimaryLogPG* pg);
+
+ void _scrub_finish() final;
+
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const final;
+
+ void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) final;
+
+ private:
+ // we know our PG is actually a PrimaryLogPG. Let's alias the pointer to that object:
+ PrimaryLogPG* const m_pl_pg;
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) final;
+
+ void log_missing(int missing,
+ const std::optional<hobject_t>& head,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ const char* func,
+ bool allow_incomplete_clones);
+
+ int process_clones_to(const std::optional<hobject_t>& head,
+ const std::optional<SnapSet>& snapset,
+ LogChannelRef clog,
+ const spg_t& pgid,
+ bool allow_incomplete_clones,
+ std::optional<snapid_t> target,
+ std::vector<snapid_t>::reverse_iterator* curclone,
+ inconsistent_snapset_wrapper& snap_error);
+
+
+ // handle our part in stats collection
+ object_stat_collection_t m_scrub_cstat;
+ void _scrub_clear_state() final; // which just clears the stats
+};
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
new file mode 100644
index 000000000..1468764c3
--- /dev/null
+++ b/src/osd/ReplicatedBackend.cc
@@ -0,0 +1,2425 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "common/errno.h"
+#include "ReplicatedBackend.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPull.h"
+#include "messages/MOSDPGPushReply.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+#include "include/util.h"
+#include "OSD.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#define DOUT_PREFIX_ARGS this
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) {
+ return pgb->get_parent()->gen_dbg_prefix(*_dout);
+}
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostringstream;
+using std::set;
+using std::pair;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferhash;
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+namespace {
+class PG_SendMessageOnConn: public Context {
+ PGBackend::Listener *pg;
+ Message *reply;
+ ConnectionRef conn;
+ public:
+ PG_SendMessageOnConn(
+ PGBackend::Listener *pg,
+ Message *reply,
+ ConnectionRef conn) : pg(pg), reply(reply), conn(conn) {}
+ void finish(int) override {
+ pg->send_message_osd_cluster(MessageRef(reply, false), conn.get());
+ }
+};
+
+class PG_RecoveryQueueAsync : public Context {
+ PGBackend::Listener *pg;
+ unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+ public:
+ PG_RecoveryQueueAsync(
+ PGBackend::Listener *pg,
+ GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+ void finish(int) override {
+ pg->schedule_recovery_work(c.release());
+ }
+};
+}
+
+struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context {
+ ReplicatedBackend *pg;
+ RepModifyRef rm;
+ C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r)
+ : pg(pg), rm(r) {}
+ void finish(int r) override {
+ pg->repop_commit(rm);
+ }
+};
+
+static void log_subop_stats(
+ PerfCounters *logger,
+ OpRequestRef op, int subop)
+{
+ utime_t latency = ceph_clock_now();
+ latency -= op->get_req()->get_recv_stamp();
+
+
+ logger->inc(l_osd_sop);
+ logger->tinc(l_osd_sop_lat, latency);
+ logger->inc(subop);
+
+ if (subop != l_osd_sop_pull) {
+ uint64_t inb = op->get_req()->get_data().length();
+ logger->inc(l_osd_sop_inb, inb);
+ if (subop == l_osd_sop_w) {
+ logger->inc(l_osd_sop_w_inb, inb);
+ logger->tinc(l_osd_sop_w_lat, latency);
+ } else if (subop == l_osd_sop_push) {
+ logger->inc(l_osd_sop_push_inb, inb);
+ logger->tinc(l_osd_sop_push_lat, latency);
+ } else
+ ceph_abort_msg("unsupported subop");
+ } else {
+ logger->tinc(l_osd_sop_pull_lat, latency);
+ }
+}
+
+ReplicatedBackend::ReplicatedBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &c,
+ ObjectStore *store,
+ CephContext *cct) :
+ PGBackend(cct, pg, store, coll, c) {}
+
+void ReplicatedBackend::run_recovery_op(
+ PGBackend::RecoveryHandle *_h,
+ int priority)
+{
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ send_pushes(priority, h->pushes);
+ send_pulls(priority, h->pulls);
+ send_recovery_deletes(priority, h->deletes);
+ delete h;
+}
+
+int ReplicatedBackend::recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *_h
+ )
+{
+ dout(10) << __func__ << ": " << hoid << dendl;
+ RPGHandle *h = static_cast<RPGHandle *>(_h);
+ if (get_parent()->get_local_missing().is_missing(hoid)) {
+ ceph_assert(!obc);
+ // pull
+ prepare_pull(
+ v,
+ hoid,
+ head,
+ h);
+ } else {
+ ceph_assert(obc);
+ int started = start_pushes(
+ hoid,
+ obc,
+ h);
+ if (started < 0) {
+ pushing[hoid].clear();
+ return started;
+ }
+ }
+ return 0;
+}
+
+void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap)
+{
+ for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ) {
+ if (osdmap->is_down(i->first.osd)) {
+ dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
+ << ", osdmap has it marked down" << dendl;
+ for (set<hobject_t>::iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ get_parent()->cancel_pull(*j);
+ clear_pull(pulling.find(*j), false);
+ }
+ pull_from_peer.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+}
+
+bool ReplicatedBackend::can_handle_while_inactive(OpRequestRef op)
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PULL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool ReplicatedBackend::_handle_message(
+ OpRequestRef op
+ )
+{
+ dout(10) << __func__ << ": " << op << dendl;
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_PUSH:
+ do_push(op);
+ return true;
+
+ case MSG_OSD_PG_PULL:
+ do_pull(op);
+ return true;
+
+ case MSG_OSD_PG_PUSH_REPLY:
+ do_push_reply(op);
+ return true;
+
+ case MSG_OSD_REPOP: {
+ do_repop(op);
+ return true;
+ }
+
+ case MSG_OSD_REPOPREPLY: {
+ do_repop_reply(op);
+ return true;
+ }
+
+ default:
+ break;
+ }
+ return false;
+}
+
+void ReplicatedBackend::clear_recovery_state()
+{
+ // clear pushing/pulling maps
+ for (auto &&i: pushing) {
+ for (auto &&j: i.second) {
+ get_parent()->release_locks(j.second.lock_manager);
+ }
+ }
+ pushing.clear();
+
+ for (auto &&i: pulling) {
+ get_parent()->release_locks(i.second.lock_manager);
+ }
+ pulling.clear();
+ pull_from_peer.clear();
+}
+
+void ReplicatedBackend::on_change()
+{
+ dout(10) << __func__ << dendl;
+ for (auto& op : in_progress_ops) {
+ delete op.second->on_commit;
+ op.second->on_commit = nullptr;
+ }
+ in_progress_ops.clear();
+ clear_recovery_state();
+}
+
+int ReplicatedBackend::objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ return store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
+}
+
+int ReplicatedBackend::objects_readv_sync(
+ const hobject_t &hoid,
+ map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ bufferlist *bl)
+{
+ interval_set<uint64_t> im(std::move(m));
+ auto r = store->readv(ch, ghobject_t(hoid), im, *bl, op_flags);
+ if (r >= 0) {
+ m = std::move(im).detach();
+ }
+ return r;
+}
+
+void ReplicatedBackend::objects_read_async(
+ const hobject_t &hoid,
+ const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ pair<bufferlist*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read)
+{
+ ceph_abort_msg("async read is not used by replica pool");
+}
+
+class C_OSD_OnOpCommit : public Context {
+ ReplicatedBackend *pg;
+ ceph::ref_t<ReplicatedBackend::InProgressOp> op;
+public:
+ C_OSD_OnOpCommit(ReplicatedBackend *pg, ceph::ref_t<ReplicatedBackend::InProgressOp> op)
+ : pg(pg), op(std::move(op)) {}
+ void finish(int) override {
+ pg->op_commit(op);
+ }
+};
+
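+ // Flatten a PGTransaction into a raw ObjectStore::Transaction: each log entry
+ // is marked unrollbackable and gets its updated snaps encoded, every
+ // per-object operation (create/clone/rename, truncate, attrs, omap updates,
+ // alloc hints, buffer writes/zero/clone-range) is translated into the
+ // corresponding ObjectStore call, and temp objects that are created or
+ // deleted are reported through *added / *removed.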
+void generate_transaction(
+ PGTransactionUPtr &pgt,
+ const coll_t &coll,
+ vector<pg_log_entry_t> &log_entries,
+ ObjectStore::Transaction *t,
+ set<hobject_t> *added,
+ set<hobject_t> *removed,
+ const ceph_release_t require_osd_release = ceph_release_t::unknown )
+{
+ ceph_assert(t);
+ ceph_assert(added);
+ ceph_assert(removed);
+
+ for (auto &&le: log_entries) {
+ le.mark_unrollbackable();
+ auto oiter = pgt->op_map.find(le.soid);
+ if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) {
+ bufferlist bl(oiter->second.updated_snaps->second.size() * 8 + 8);
+ encode(oiter->second.updated_snaps->second, bl);
+ le.snaps.swap(bl);
+ le.snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ }
+
+ pgt->safe_create_traverse(
+ [&](pair<const hobject_t, PGTransaction::ObjectOperation> &obj_op) {
+ const hobject_t &oid = obj_op.first;
+ const ghobject_t goid =
+ ghobject_t(oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD);
+ const PGTransaction::ObjectOperation &op = obj_op.second;
+
+ if (oid.is_temp()) {
+ if (op.is_fresh_object()) {
+ added->insert(oid);
+ } else if (op.is_delete()) {
+ removed->insert(oid);
+ }
+ }
+
+ if (op.delete_first) {
+ t->remove(coll, goid);
+ }
+
+ match(
+ op.init_type,
+ [&](const PGTransaction::ObjectOperation::Init::None &) {
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Create &op) {
+ if (require_osd_release >= ceph_release_t::octopus) {
+ t->create(coll, goid);
+ } else {
+ t->touch(coll, goid);
+ }
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Clone &op) {
+ t->clone(
+ coll,
+ ghobject_t(
+ op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ goid);
+ },
+ [&](const PGTransaction::ObjectOperation::Init::Rename &op) {
+ ceph_assert(op.source.is_temp());
+ t->collection_move_rename(
+ coll,
+ ghobject_t(
+ op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ coll,
+ goid);
+ });
+
+ if (op.truncate) {
+ t->truncate(coll, goid, op.truncate->first);
+ if (op.truncate->first != op.truncate->second)
+ t->truncate(coll, goid, op.truncate->second);
+ }
+
+ if (!op.attr_updates.empty()) {
+ map<string, bufferlist> attrs;
+ for (auto &&p: op.attr_updates) {
+ if (p.second)
+ attrs[p.first] = *(p.second);
+ else
+ t->rmattr(coll, goid, p.first);
+ }
+ t->setattrs(coll, goid, attrs);
+ }
+
+ if (op.clear_omap)
+ t->omap_clear(coll, goid);
+ if (op.omap_header)
+ t->omap_setheader(coll, goid, *(op.omap_header));
+
+ for (auto &&up: op.omap_updates) {
+ using UpdateType = PGTransaction::ObjectOperation::OmapUpdateType;
+ switch (up.first) {
+ case UpdateType::Remove:
+ t->omap_rmkeys(coll, goid, up.second);
+ break;
+ case UpdateType::Insert:
+ t->omap_setkeys(coll, goid, up.second);
+ break;
+ case UpdateType::RemoveRange:
+ t->omap_rmkeyrange(coll, goid, up.second);
+ break;
+ }
+ }
+
+ // updated_snaps doesn't matter since we marked unrollbackable
+
+ if (op.alloc_hint) {
+ auto &hint = *(op.alloc_hint);
+ t->set_alloc_hint(
+ coll,
+ goid,
+ hint.expected_object_size,
+ hint.expected_write_size,
+ hint.flags);
+ }
+
+ for (auto &&extent: op.buffer_updates) {
+ using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
+ match(
+ extent.get_val(),
+ [&](const BufferUpdate::Write &op) {
+ t->write(
+ coll,
+ goid,
+ extent.get_off(),
+ extent.get_len(),
+ op.buffer,
+ op.fadvise_flags);
+ },
+ [&](const BufferUpdate::Zero &op) {
+ t->zero(
+ coll,
+ goid,
+ extent.get_off(),
+ extent.get_len());
+ },
+ [&](const BufferUpdate::CloneRange &op) {
+ ceph_assert(op.len == extent.get_len());
+ t->clone_range(
+ coll,
+ ghobject_t(op.from, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
+ goid,
+ op.offset,
+ extent.get_len(),
+ extent.get_off());
+ });
+ }
+ });
+}
+
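+ // Primary-side write path: flatten the PGTransaction, register an
+ // InProgressOp that waits for a commit from every acting/recovery/backfill
+ // shard (including ourselves), ship MOSDRepOps to the replicas via issue_op(),
+ // log the operation, and queue the local transaction whose commit is reported
+ // back through C_OSD_OnOpCommit.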
+void ReplicatedBackend::submit_transaction(
+ const hobject_t &soid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&_t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ vector<pg_log_entry_t>&& _log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef orig_op)
+{
+ parent->apply_stats(
+ soid,
+ delta_stats);
+
+ vector<pg_log_entry_t> log_entries(_log_entries);
+ ObjectStore::Transaction op_t;
+ PGTransactionUPtr t(std::move(_t));
+ set<hobject_t> added, removed;
+ generate_transaction(
+ t,
+ coll,
+ log_entries,
+ &op_t,
+ &added,
+ &removed,
+ get_osdmap()->require_osd_release);
+ ceph_assert(added.size() <= 1);
+ ceph_assert(removed.size() <= 1);
+
+ auto insert_res = in_progress_ops.insert(
+ make_pair(
+ tid,
+ ceph::make_ref<InProgressOp>(
+ tid, on_all_commit,
+ orig_op, at_version)
+ )
+ );
+ ceph_assert(insert_res.second);
+ InProgressOp &op = *insert_res.first->second;
+
+#ifdef HAVE_JAEGER
+ auto rep_sub_trans = jaeger_tracing::child_span("ReplicatedBackend::submit_transaction", orig_op->osd_parent_span);
+#endif
+ op.waiting_for_commit.insert(
+ parent->get_acting_recovery_backfill_shards().begin(),
+ parent->get_acting_recovery_backfill_shards().end());
+
+ issue_op(
+ soid,
+ at_version,
+ tid,
+ reqid,
+ trim_to,
+ min_last_complete_ondisk,
+ added.size() ? *(added.begin()) : hobject_t(),
+ removed.size() ? *(removed.begin()) : hobject_t(),
+ log_entries,
+ hset_history,
+ &op,
+ op_t);
+
+ add_temp_objs(added);
+ clear_temp_objs(removed);
+
+ parent->log_operation(
+ std::move(log_entries),
+ hset_history,
+ trim_to,
+ at_version,
+ min_last_complete_ondisk,
+ true,
+ op_t);
+
+ op_t.register_on_commit(
+ parent->bless_context(
+ new C_OSD_OnOpCommit(this, &op)));
+
+ vector<ObjectStore::Transaction> tls;
+ tls.push_back(std::move(op_t));
+
+ parent->queue_transactions(tls, op.op);
+ if (at_version != eversion_t()) {
+ parent->op_applied(at_version);
+ }
+}
+
+void ReplicatedBackend::op_commit(const ceph::ref_t<InProgressOp>& op)
+{
+ if (op->on_commit == nullptr) {
+ // aborted
+ return;
+ }
+
+ FUNCTRACE(cct);
+ OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true);
+ dout(10) << __func__ << ": " << op->tid << dendl;
+ if (op->op) {
+ op->op->mark_event("op_commit");
+ op->op->pg_trace.event("op commit");
+ }
+
+ op->waiting_for_commit.erase(get_parent()->whoami_shard());
+
+ if (op->waiting_for_commit.empty()) {
+ op->on_commit->complete(0);
+ op->on_commit = 0;
+ in_progress_ops.erase(op->tid);
+ }
+}
+
+void ReplicatedBackend::do_repop_reply(OpRequestRef op)
+{
+ static_cast<MOSDRepOpReply*>(op->get_nonconst_req())->finish_decode();
+ auto r = op->get_req<MOSDRepOpReply>();
+ ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY);
+
+ op->mark_started();
+
+ // must be replication.
+ ceph_tid_t rep_tid = r->get_tid();
+ pg_shard_t from = r->from;
+
+ auto iter = in_progress_ops.find(rep_tid);
+ if (iter != in_progress_ops.end()) {
+ InProgressOp &ip_op = *iter->second;
+ const MOSDOp *m = nullptr;
+ if (ip_op.op)
+ m = ip_op.op->get_req<MOSDOp>();
+
+ if (m)
+ dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
+ << " ack_type " << (int)r->ack_type
+ << " from " << from
+ << dendl;
+ else
+ dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
+ << " ack_type " << (int)r->ack_type
+ << " from " << from
+ << dendl;
+
+ // oh, good.
+
+ if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
+ ceph_assert(ip_op.waiting_for_commit.count(from));
+ ip_op.waiting_for_commit.erase(from);
+ if (ip_op.op) {
+ ip_op.op->mark_event("sub_op_commit_rec");
+ ip_op.op->pg_trace.event("sub_op_commit_rec");
+ }
+ } else {
+ // legacy peer; ignore
+ }
+
+ parent->update_peer_last_complete_ondisk(
+ from,
+ r->get_last_complete_ondisk());
+
+ if (ip_op.waiting_for_commit.empty() &&
+ ip_op.on_commit) {
+ ip_op.on_commit->complete(0);
+ ip_op.on_commit = 0;
+ in_progress_ops.erase(iter);
+ }
+ }
+}
+
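+ // Resumable deep scrub of one object: data is hashed in osd_deep_scrub_stride
+ // sized reads and omap keys in batches of osd_deep_scrub_keys, with progress
+ // carried in ScrubMapBuilder between calls; -EINPROGRESS is returned until
+ // both the data and omap digests are complete.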
+int ReplicatedBackend::be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o)
+{
+ dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
+ int r;
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;
+
+ utime_t sleeptime;
+ sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
+ if (sleeptime != utime_t()) {
+ lgeneric_derr(cct) << __func__ << " sleeping for " << sleeptime << dendl;
+ sleeptime.sleep();
+ }
+
+ ceph_assert(poid == pos.ls[pos.pos]);
+ if (!pos.data_done()) {
+ if (pos.data_pos == 0) {
+ pos.data_hash = bufferhash(-1);
+ }
+
+ bufferlist bl;
+ r = store->read(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ pos.data_pos,
+ cct->_conf->osd_deep_scrub_stride, bl,
+ fadvise_flags);
+ if (r < 0) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r > 0) {
+ pos.data_hash << bl;
+ }
+ pos.data_pos += r;
+ if (r == cct->_conf->osd_deep_scrub_stride) {
+ dout(20) << __func__ << " " << poid << " more data, digest so far 0x"
+ << std::hex << pos.data_hash.digest() << std::dec << dendl;
+ return -EINPROGRESS;
+ }
+ // done with bytes
+ pos.data_pos = -1;
+ o.digest = pos.data_hash.digest();
+ o.digest_present = true;
+ dout(20) << __func__ << " " << poid << " done with data, digest 0x"
+ << std::hex << o.digest << std::dec << dendl;
+ }
+
+ // omap header
+ if (pos.omap_pos.empty()) {
+ pos.omap_hash = bufferhash(-1);
+
+ bufferlist hdrbl;
+ r = store->omap_get_header(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+ &hdrbl, true);
+ if (r == -EIO) {
+ dout(20) << __func__ << " " << poid << " got "
+ << r << " on omap header read, read_error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ if (r == 0 && hdrbl.length()) {
+ bool encoded = false;
+ dout(25) << "CRC header " << cleanbin(hdrbl, encoded, true) << dendl;
+ pos.omap_hash << hdrbl;
+ }
+ }
+
+ // omap
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
+ ch,
+ ghobject_t(
+ poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
+ ceph_assert(iter);
+ if (pos.omap_pos.length()) {
+ iter->lower_bound(pos.omap_pos);
+ } else {
+ iter->seek_to_first();
+ }
+ int max = g_conf()->osd_deep_scrub_keys;
+ while (iter->status() == 0 && iter->valid()) {
+ pos.omap_bytes += iter->value().length();
+ ++pos.omap_keys;
+ --max;
+ // fixme: we can do this more efficiently.
+ bufferlist bl;
+ encode(iter->key(), bl);
+ encode(iter->value(), bl);
+ pos.omap_hash << bl;
+
+ iter->next();
+
+ if (iter->valid() && max == 0) {
+ pos.omap_pos = iter->key();
+ return -EINPROGRESS;
+ }
+ if (iter->status() < 0) {
+ dout(25) << __func__ << " " << poid
+ << " on omap scan, db status error" << dendl;
+ o.read_error = true;
+ return 0;
+ }
+ }
+
+ if (pos.omap_keys > cct->_conf->
+ osd_deep_scrub_large_omap_object_key_threshold ||
+ pos.omap_bytes > cct->_conf->
+ osd_deep_scrub_large_omap_object_value_sum_threshold) {
+ dout(25) << __func__ << " " << poid
+ << " large omap object detected. Object has " << pos.omap_keys
+ << " keys and size " << pos.omap_bytes << " bytes" << dendl;
+ o.large_omap_object_found = true;
+ o.large_omap_object_key_count = pos.omap_keys;
+ o.large_omap_object_value_size = pos.omap_bytes;
+ map.has_large_omap_object_errors = true;
+ }
+
+ o.omap_digest = pos.omap_hash.digest();
+ o.omap_digest_present = true;
+ dout(20) << __func__ << " done with " << poid << " omap_digest "
+ << std::hex << o.omap_digest << std::dec << dendl;
+
+ // Sum up omap usage
+ if (pos.omap_keys > 0 || pos.omap_bytes > 0) {
+ dout(25) << __func__ << " adding " << pos.omap_keys << " keys and "
+ << pos.omap_bytes << " bytes to pg_stats sums" << dendl;
+ map.has_omap_keys = true;
+ o.object_omap_bytes = pos.omap_bytes;
+ o.object_omap_keys = pos.omap_keys;
+ }
+
+ // done!
+ return 0;
+}
+
+void ReplicatedBackend::_do_push(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPush>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH);
+ pg_shard_t from = m->from;
+
+ op->mark_started();
+
+ vector<PushReplyOp> replies;
+ ObjectStore::Transaction t;
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
+ ceph_abort();
+ }
+ for (vector<PushOp>::const_iterator i = m->pushes.begin();
+ i != m->pushes.end();
+ ++i) {
+ replies.push_back(PushReplyOp());
+ handle_push(from, *i, &(replies.back()), &t, m->is_repair);
+ }
+
+ MOSDPGPushReply *reply = new MOSDPGPushReply;
+ reply->from = get_parent()->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->replies.swap(replies);
+ reply->compute_cost(cct);
+
+ t.register_on_complete(
+ new PG_SendMessageOnConn(
+ get_parent(), reply, m->get_connection()));
+
+ get_parent()->queue_transaction(std::move(t));
+}
+
+struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
+ ReplicatedBackend *bc;
+ list<ReplicatedBackend::pull_complete_info> to_continue;
+ int priority;
+ C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
+ : bc(bc), priority(priority) {}
+
+ void finish(ThreadPool::TPHandle &handle) override {
+ ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
+ for (auto &&i: to_continue) {
+ auto j = bc->pulling.find(i.hoid);
+ ceph_assert(j != bc->pulling.end());
+ ObjectContextRef obc = j->second.obc;
+ bc->clear_pull(j, false /* already did it */);
+ int started = bc->start_pushes(i.hoid, obc, h);
+ if (started < 0) {
+ bc->pushing[i.hoid].clear();
+ bc->get_parent()->on_failed_pull(
+ { bc->get_parent()->whoami_shard() },
+ i.hoid, obc->obs.oi.version);
+ } else if (!started) {
+ bc->get_parent()->on_global_recover(
+ i.hoid, i.stat, false);
+ }
+ handle.reset_tp_timeout();
+ }
+ bc->run_recovery_op(h, priority);
+ }
+};
+
+void ReplicatedBackend::_do_pull_response(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPush>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH);
+ pg_shard_t from = m->from;
+
+ op->mark_started();
+
+ vector<PullOp> replies(1);
+ if (get_parent()->check_failsafe_full()) {
+ dout(10) << __func__ << " Out of space (failsafe) processing pull response (push)." << dendl;
+ ceph_abort();
+ }
+
+ ObjectStore::Transaction t;
+ list<pull_complete_info> to_continue;
+ for (vector<PushOp>::const_iterator i = m->pushes.begin();
+ i != m->pushes.end();
+ ++i) {
+ bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, &t);
+ if (more)
+ replies.push_back(PullOp());
+ }
+ if (!to_continue.empty()) {
+ C_ReplicatedBackend_OnPullComplete *c =
+ new C_ReplicatedBackend_OnPullComplete(
+ this,
+ m->get_priority());
+ c->to_continue.swap(to_continue);
+ t.register_on_complete(
+ new PG_RecoveryQueueAsync(
+ get_parent(),
+ get_parent()->bless_unlocked_gencontext(c)));
+ }
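+ // drop the trailing PullOp slot that was pre-allocated but never filled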
+ replies.erase(replies.end() - 1);
+
+ if (replies.size()) {
+ MOSDPGPull *reply = new MOSDPGPull;
+ reply->from = parent->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = get_info().pgid;
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->set_pulls(std::move(replies));
+ reply->compute_cost(cct);
+
+ t.register_on_complete(
+ new PG_SendMessageOnConn(
+ get_parent(), reply, m->get_connection()));
+ }
+
+ get_parent()->queue_transaction(std::move(t));
+}
+
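+// Replica-side handler for MOSDPGPull: build a PushOp for each object the
+// primary asked for and send the resulting pushes back.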
+void ReplicatedBackend::do_pull(OpRequestRef op)
+{
+ MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_nonconst_req());
+ ceph_assert(m->get_type() == MSG_OSD_PG_PULL);
+ pg_shard_t from = m->from;
+
+ map<pg_shard_t, vector<PushOp> > replies;
+ for (auto& i : m->take_pulls()) {
+ replies[from].push_back(PushOp());
+ handle_pull(from, i, &(replies[from].back()));
+ }
+ send_pushes(m->get_priority(), replies);
+}
+
+void ReplicatedBackend::do_push_reply(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDPGPushReply>();
+ ceph_assert(m->get_type() == MSG_OSD_PG_PUSH_REPLY);
+ pg_shard_t from = m->from;
+
+ vector<PushOp> replies(1);
+ for (vector<PushReplyOp>::const_iterator i = m->replies.begin();
+ i != m->replies.end();
+ ++i) {
+ bool more = handle_push_reply(from, *i, &(replies.back()));
+ if (more)
+ replies.push_back(PushOp());
+ }
+ replies.erase(replies.end() - 1);
+
+ map<pg_shard_t, vector<PushOp> > _replies;
+ _replies[from].swap(replies);
+ send_pushes(m->get_priority(), _replies);
+}
+
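+// Build the MOSDRepOp that carries a client write to one replica: the encoded
+// transaction (or an empty one if the peer should not apply it), the shipped
+// log entries, and the pg stats appropriate for that peer.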
+Message * ReplicatedBackend::generate_subop(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const bufferlist &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_hist,
+ ObjectStore::Transaction &op_t,
+ pg_shard_t peer,
+ const pg_info_t &pinfo)
+{
+ int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ // forward the write/update/whatever
+ MOSDRepOp *wr = new MOSDRepOp(
+ reqid, parent->whoami_shard(),
+ spg_t(get_info().pgid.pgid, peer.shard),
+ soid, acks_wanted,
+ get_osdmap_epoch(),
+ parent->get_last_peering_reset_epoch(),
+ tid, at_version);
+
+ // ship resulting transaction, log entries, and pg_stats
+ if (!parent->should_send_op(peer, soid)) {
+ ObjectStore::Transaction t;
+ encode(t, wr->get_data());
+ } else {
+ encode(op_t, wr->get_data());
+ wr->get_header().data_off = op_t.get_data_alignment();
+ }
+
+ wr->logbl = log_entries;
+
+ if (pinfo.is_incomplete())
+ wr->pg_stats = pinfo.stats; // reflects backfill progress
+ else
+ wr->pg_stats = get_info().stats;
+
+ wr->pg_trim_to = pg_trim_to;
+
+ if (HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD)) {
+ wr->min_last_complete_ondisk = min_last_complete_ondisk;
+ } else {
+ /* Some replicas need this field to be at_version. New replicas
+ * will ignore it */
+ wr->set_rollback_to(at_version);
+ }
+
+ wr->new_temp_oid = new_temp_oid;
+ wr->discard_temp_oid = discard_temp_oid;
+ wr->updated_hit_set_history = hset_hist;
+ return wr;
+}
+
+void ReplicatedBackend::issue_op(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const vector<pg_log_entry_t> &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_hist,
+ InProgressOp *op,
+ ObjectStore::Transaction &op_t)
+{
+ if (parent->get_acting_recovery_backfill_shards().size() > 1) {
+ if (op->op) {
+ op->op->pg_trace.event("issue replication ops");
+ ostringstream ss;
+ set<pg_shard_t> replicas = parent->get_acting_recovery_backfill_shards();
+ replicas.erase(parent->whoami_shard());
+ ss << "waiting for subops from " << replicas;
+ op->op->mark_sub_op_sent(ss.str());
+ }
+
+ // avoid doing the same work in generate_subop
+ bufferlist logs;
+ encode(log_entries, logs);
+
+ for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
+ if (shard == parent->whoami_shard()) continue;
+ const pg_info_t &pinfo = parent->get_shard_info().find(shard)->second;
+
+ Message *wr;
+ wr = generate_subop(
+ soid,
+ at_version,
+ tid,
+ reqid,
+ pg_trim_to,
+ min_last_complete_ondisk,
+ new_temp_oid,
+ discard_temp_oid,
+ logs,
+ hset_hist,
+ op_t,
+ shard,
+ pinfo);
+ if (op->op && op->op->pg_trace)
+ wr->trace.init("replicated op", nullptr, &op->op->pg_trace);
+ get_parent()->send_message_osd_cluster(
+ shard.osd, wr, get_osdmap_epoch());
+ }
+ }
+}
+
+// sub op modify
+void ReplicatedBackend::do_repop(OpRequestRef op)
+{
+ static_cast<MOSDRepOp*>(op->get_nonconst_req())->finish_decode();
+ auto m = op->get_req<MOSDRepOp>();
+ int msg_type = m->get_type();
+ ceph_assert(MSG_OSD_REPOP == msg_type);
+
+ const hobject_t& soid = m->poid;
+
+ dout(10) << __func__ << " " << soid
+ << " v " << m->version
+ << (m->logbl.length() ? " (transaction)" : " (parallel exec)")
+ << " " << m->logbl.length()
+ << dendl;
+
+#ifdef HAVE_JAEGER
+ auto do_repop_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
+#endif
+
+ // sanity checks
+ ceph_assert(m->map_epoch >= get_info().history.same_interval_since);
+
+ dout(30) << __func__ << " missing before " << get_parent()->get_log().get_missing().get_items() << dendl;
+ parent->maybe_preempt_replica_scrub(soid);
+
+ int ackerosd = m->get_source().num();
+
+ op->mark_started();
+
+ RepModifyRef rm(std::make_shared<RepModify>());
+ rm->op = op;
+ rm->ackerosd = ackerosd;
+ rm->last_complete = get_info().last_complete;
+ rm->epoch_started = get_osdmap_epoch();
+
+ ceph_assert(m->logbl.length());
+ // shipped transaction and log entries
+ vector<pg_log_entry_t> log;
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+ decode(rm->opt, p);
+
+ if (m->new_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl;
+ add_temp_obj(m->new_temp_oid);
+ }
+ if (m->discard_temp_oid != hobject_t()) {
+ dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl;
+ if (rm->opt.empty()) {
+ dout(10) << __func__ << ": removing object " << m->discard_temp_oid
+ << " since we won't get the transaction" << dendl;
+ rm->localt.remove(coll, ghobject_t(m->discard_temp_oid));
+ }
+ clear_temp_obj(m->discard_temp_oid);
+ }
+
+ p = const_cast<bufferlist&>(m->logbl).begin();
+ decode(log, p);
+ rm->opt.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+
+ bool update_snaps = false;
+ if (!rm->opt.empty()) {
+ // If the opt is non-empty, we infer we are before
+ // last_backfill (according to the primary, not our
+ // not-quite-accurate value), and should update the
+ // collections now. Otherwise, we do it later on push.
+ update_snaps = true;
+ }
+
+ // flag set to true during async recovery
+ bool async = false;
+ pg_missing_tracker_t pmissing = get_parent()->get_local_missing();
+ if (pmissing.is_missing(soid)) {
+ async = true;
+ dout(30) << __func__ << " is_missing " << pmissing.is_missing(soid) << dendl;
+ for (auto &&e: log) {
+ dout(30) << " add_next_event entry " << e << dendl;
+ get_parent()->add_local_next_event(e);
+ dout(30) << " entry is_delete " << e.is_delete() << dendl;
+ }
+ }
+
+ parent->update_stats(m->pg_stats);
+ parent->log_operation(
+ std::move(log),
+ m->updated_hit_set_history,
+ m->pg_trim_to,
+ m->version, /* Replicated PGs don't have rollback info */
+ m->min_last_complete_ondisk,
+ update_snaps,
+ rm->localt,
+ async);
+
+ rm->opt.register_on_commit(
+ parent->bless_context(
+ new C_OSD_RepModifyCommit(this, rm)));
+ vector<ObjectStore::Transaction> tls;
+ tls.reserve(2);
+ tls.push_back(std::move(rm->localt));
+ tls.push_back(std::move(rm->opt));
+ parent->queue_transactions(tls, op);
+ // op is cleaned up by oncommit/onapply when both are executed
+ dout(30) << __func__ << " missing after" << get_parent()->get_log().get_missing().get_items() << dendl;
+}
+
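+// Called on the replica once the shipped transaction commits: record the
+// commit and send an ONDISK MOSDRepOpReply back to the acker osd.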
+void ReplicatedBackend::repop_commit(RepModifyRef rm)
+{
+ rm->op->mark_commit_sent();
+ rm->op->pg_trace.event("sup_op_commit");
+ rm->committed = true;
+
+ // send commit.
+ auto m = rm->op->get_req<MOSDRepOp>();
+ ceph_assert(m->get_type() == MSG_OSD_REPOP);
+ dout(10) << __func__ << " on op " << *m
+ << ", sending commit to osd." << rm->ackerosd
+ << dendl;
+ ceph_assert(get_osdmap()->is_up(rm->ackerosd));
+
+ get_parent()->update_last_complete_ondisk(rm->last_complete);
+
+ MOSDRepOpReply *reply = new MOSDRepOpReply(
+ m,
+ get_parent()->whoami_shard(),
+ 0, get_osdmap_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
+ reply->set_last_complete_ondisk(rm->last_complete);
+ reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority!
+ reply->trace = rm->op->pg_trace;
+ get_parent()->send_message_osd_cluster(
+ rm->ackerosd, reply, get_osdmap_epoch());
+
+ log_subop_stats(get_parent()->get_logger(), rm->op, l_osd_sop_w);
+}
+
+
+// ===========================================================
+
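+// Work out which byte ranges of the head object actually need to be pushed and
+// which can instead be cloned on the peer from a clone it already has, using
+// the snapset's clone_overlap information.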
+void ReplicatedBackend::calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &manager)
+{
+ dout(10) << "calc_head_subsets " << head
+ << " clone_overlap " << snapset.clone_overlap << dendl;
+
+ uint64_t size = obc->obs.oi.size;
+ if (size)
+ data_subset.insert(0, size);
+
+ if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS)) {
+ const auto it = missing.get_items().find(head);
+ assert(it != missing.get_items().end());
+ data_subset.intersection_of(it->second.clean_regions.get_dirty_regions());
+ dout(10) << "calc_head_subsets " << head
+ << " data_subset " << data_subset << dendl;
+ }
+
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+ dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl;
+ return;
+ }
+
+
+ interval_set<uint64_t> cloning;
+ interval_set<uint64_t> prev;
+ hobject_t c = head;
+ if (size)
+ prev.insert(0, size);
+
+ for (int j=snapset.clones.size()-1; j>=0; j--) {
+ c.snap = snapset.clones[j];
+ prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_head_subsets " << head << " has prev " << c
+ << " overlap " << prev << dendl;
+ cloning = prev;
+ break;
+ }
+ dout(10) << "calc_head_subsets " << head << " does not have prev " << c
+ << " overlap " << prev << dendl;
+ }
+
+ cloning.intersection_of(data_subset);
+ if (cloning.empty()) {
+ dout(10) << "skipping clone, nothing needs to clone" << dendl;
+ return;
+ }
+
+ if (cloning.num_intervals() > g_conf().get_val<uint64_t>("osd_recover_clone_overlap_limit")) {
+ dout(10) << "skipping clone, too many holes" << dendl;
+ get_parent()->release_locks(manager);
+ clone_subsets.clear();
+ cloning.clear();
+ return;
+ }
+
+ // what's left for us to push?
+ clone_subsets[c] = cloning;
+ data_subset.subtract(cloning);
+
+ dout(10) << "calc_head_subsets " << head
+ << " data_subset " << data_subset
+ << " clone_subsets " << clone_subsets << dendl;
+}
+
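+// Same idea as calc_head_subsets, but for a clone: look for overlap with the
+// next older and next newer clones that the target already has, and subtract
+// those ranges from the data that must be copied.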
+void ReplicatedBackend::calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& soid,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &manager)
+{
+ dout(10) << "calc_clone_subsets " << soid
+ << " clone_overlap " << snapset.clone_overlap << dendl;
+
+ uint64_t size = snapset.clone_size[soid.snap];
+ if (size)
+ data_subset.insert(0, size);
+
+ if (get_parent()->get_pool().allow_incomplete_clones()) {
+ dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+ return;
+ }
+
+ if (!cct->_conf->osd_recover_clone_overlap) {
+ dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl;
+ return;
+ }
+
+ unsigned i;
+ for (i=0; i < snapset.clones.size(); i++)
+ if (snapset.clones[i] == soid.snap)
+ break;
+
+ // any overlap with next older clone?
+ interval_set<uint64_t> cloning;
+ interval_set<uint64_t> prev;
+ if (size)
+ prev.insert(0, size);
+ for (int j=i-1; j>=0; j--) {
+ hobject_t c = soid;
+ c.snap = snapset.clones[j];
+ prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_clone_subsets " << soid << " has prev " << c
+ << " overlap " << prev << dendl;
+ clone_subsets[c] = prev;
+ cloning.union_of(prev);
+ break;
+ }
+ dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c
+ << " overlap " << prev << dendl;
+ }
+
+ // overlap with next newest?
+ interval_set<uint64_t> next;
+ if (size)
+ next.insert(0, size);
+ for (unsigned j=i+1; j<snapset.clones.size(); j++) {
+ hobject_t c = soid;
+ c.snap = snapset.clones[j];
+ next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]);
+ if (!missing.is_missing(c) &&
+ c < last_backfill &&
+ get_parent()->try_lock_for_read(c, manager)) {
+ dout(10) << "calc_clone_subsets " << soid << " has next " << c
+ << " overlap " << next << dendl;
+ clone_subsets[c] = next;
+ cloning.union_of(next);
+ break;
+ }
+ dout(10) << "calc_clone_subsets " << soid << " does not have next " << c
+ << " overlap " << next << dendl;
+ }
+
+ if (cloning.num_intervals() > g_conf().get_val<uint64_t>("osd_recover_clone_overlap_limit")) {
+ dout(10) << "skipping clone, too many holes" << dendl;
+ get_parent()->release_locks(manager);
+ clone_subsets.clear();
+ cloning.clear();
+ }
+
+
+ // what's left for us to push?
+ data_subset.subtract(cloning);
+
+ dout(10) << "calc_clone_subsets " << soid
+ << " data_subset " << data_subset
+ << " clone_subsets " << clone_subsets << dendl;
+}
+
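+// Queue a pull for an object we are missing: pick a peer that has it, work out
+// the subsets to copy (for clones), and record the PullInfo so that responses
+// can be matched back up later.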
+void ReplicatedBackend::prepare_pull(
+ eversion_t v,
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h)
+{
+ const auto missing_iter = get_parent()->get_local_missing().get_items().find(soid);
+ ceph_assert(missing_iter != get_parent()->get_local_missing().get_items().end());
+ eversion_t _v = missing_iter->second.need;
+ ceph_assert(_v == v);
+ const map<hobject_t, set<pg_shard_t>> &missing_loc(
+ get_parent()->get_missing_loc_shards());
+ const map<pg_shard_t, pg_missing_t > &peer_missing(
+ get_parent()->get_shard_missing());
+ map<hobject_t, set<pg_shard_t>>::const_iterator q = missing_loc.find(soid);
+ ceph_assert(q != missing_loc.end());
+ ceph_assert(!q->second.empty());
+
+ // pick a pullee
+ auto p = q->second.end();
+ if (cct->_conf->osd_debug_feed_pullee >= 0) {
+ for (auto it = q->second.begin(); it != q->second.end(); it++) {
+ if (it->osd == cct->_conf->osd_debug_feed_pullee) {
+ p = it;
+ break;
+ }
+ }
+ }
+ if (p == q->second.end()) {
+ // probably because the user fed a wrong pullee
+ p = q->second.begin();
+ std::advance(p,
+ ceph::util::generate_random_number<int>(0,
+ q->second.size() - 1));
+ }
+ ceph_assert(get_osdmap()->is_up(p->osd));
+ pg_shard_t fromshard = *p;
+
+ dout(7) << "pull " << soid
+ << " v " << v
+ << " on osds " << q->second
+ << " from osd." << fromshard
+ << dendl;
+
+ ceph_assert(peer_missing.count(fromshard));
+ const pg_missing_t &pmissing = peer_missing.find(fromshard)->second;
+ if (pmissing.is_missing(soid, v)) {
+ ceph_assert(pmissing.get_items().find(soid)->second.have != v);
+ dout(10) << "pulling soid " << soid << " from osd " << fromshard
+ << " at version " << pmissing.get_items().find(soid)->second.have
+ << " rather than at version " << v << dendl;
+ v = pmissing.get_items().find(soid)->second.have;
+ ceph_assert(get_parent()->get_log().get_log().objects.count(soid) &&
+ (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
+ pg_log_entry_t::LOST_REVERT) &&
+ (get_parent()->get_log().get_log().objects.find(
+ soid)->second->reverting_to ==
+ v));
+ }
+
+ ObjectRecoveryInfo recovery_info;
+ ObcLockManager lock_manager;
+
+ if (soid.is_snap()) {
+ ceph_assert(!get_parent()->get_local_missing().is_missing(soid.get_head()));
+ ceph_assert(headctx);
+ // check snapset
+ SnapSetContext *ssc = headctx->ssc;
+ ceph_assert(ssc);
+ dout(10) << " snapset " << ssc->snapset << dendl;
+ recovery_info.ss = ssc->snapset;
+ calc_clone_subsets(
+ ssc->snapset, soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
+ recovery_info.copy_subset,
+ recovery_info.clone_subset,
+ lock_manager);
+ // FIXME: this may overestimate if we are pulling multiple clones in parallel...
+ dout(10) << " pulling " << recovery_info << dendl;
+
+ ceph_assert(ssc->snapset.clone_size.count(soid.snap));
+ recovery_info.size = ssc->snapset.clone_size[soid.snap];
+ recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ } else {
+ // pulling head or unversioned object.
+ // always pull the whole thing.
+ recovery_info.copy_subset.insert(0, (uint64_t)-1);
+ if (HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS))
+ recovery_info.copy_subset.intersection_of(missing_iter->second.clean_regions.get_dirty_regions());
+ recovery_info.size = ((uint64_t)-1);
+ recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ }
+
+ h->pulls[fromshard].push_back(PullOp());
+ PullOp &op = h->pulls[fromshard].back();
+ op.soid = soid;
+
+ op.recovery_info = recovery_info;
+ op.recovery_info.soid = soid;
+ op.recovery_info.version = v;
+ op.recovery_progress.data_complete = false;
+ op.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty()
+ && HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS);
+ op.recovery_progress.data_recovered_to = 0;
+ op.recovery_progress.first = true;
+
+ ceph_assert(!pulling.count(soid));
+ pull_from_peer[fromshard].insert(soid);
+ PullInfo &pi = pulling[soid];
+ pi.from = fromshard;
+ pi.soid = soid;
+ pi.head_ctx = headctx;
+ pi.recovery_info = op.recovery_info;
+ pi.recovery_progress = op.recovery_progress;
+ pi.cache_dont_need = h->cache_dont_need;
+ pi.lock_manager = std::move(lock_manager);
+}
+
+/*
+ * intelligently push an object to a replica. make use of existing
+ * clones/heads and dup data ranges where possible.
+ */
+int ReplicatedBackend::prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need)
+{
+ const object_info_t& oi = obc->obs.oi;
+ uint64_t size = obc->obs.oi.size;
+
+ dout(10) << __func__ << ": " << soid << " v" << oi.version
+ << " size " << size << " to osd." << peer << dendl;
+
+ map<hobject_t, interval_set<uint64_t>> clone_subsets;
+ interval_set<uint64_t> data_subset;
+
+ ObcLockManager lock_manager;
+ // are we doing a clone on the replica?
+ if (soid.snap && soid.snap < CEPH_NOSNAP) {
+ hobject_t head = soid;
+ head.snap = CEPH_NOSNAP;
+
+ // try to base the push off of clones that succeed/precede poid;
+ // we need the head (and current SnapSet) locally to do that.
+ if (get_parent()->get_local_missing().is_missing(head)) {
+ dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
+ return prep_push(obc, soid, peer, pop, cache_dont_need);
+ }
+
+ SnapSetContext *ssc = obc->ssc;
+ ceph_assert(ssc);
+ dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
+ pop->recovery_info.ss = ssc->snapset;
+ map<pg_shard_t, pg_missing_t>::const_iterator pm =
+ get_parent()->get_shard_missing().find(peer);
+ ceph_assert(pm != get_parent()->get_shard_missing().end());
+ map<pg_shard_t, pg_info_t>::const_iterator pi =
+ get_parent()->get_shard_info().find(peer);
+ ceph_assert(pi != get_parent()->get_shard_info().end());
+ calc_clone_subsets(
+ ssc->snapset, soid,
+ pm->second,
+ pi->second.last_backfill,
+ data_subset, clone_subsets,
+ lock_manager);
+ } else if (soid.snap == CEPH_NOSNAP) {
+ // pushing head or unversioned object.
+ // partially base this on the replica's clones?
+ SnapSetContext *ssc = obc->ssc;
+ ceph_assert(ssc);
+ dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
+ calc_head_subsets(
+ obc,
+ ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second,
+ get_parent()->get_shard_info().find(peer)->second.last_backfill,
+ data_subset, clone_subsets,
+ lock_manager);
+ }
+
+ return prep_push(
+ obc,
+ soid,
+ peer,
+ oi.version,
+ data_subset,
+ clone_subsets,
+ pop,
+ cache_dont_need,
+ std::move(lock_manager));
+}
+
+int ReplicatedBackend::prep_push(ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need)
+{
+ interval_set<uint64_t> data_subset;
+ if (obc->obs.oi.size)
+ data_subset.insert(0, obc->obs.oi.size);
+ map<hobject_t, interval_set<uint64_t>> clone_subsets;
+
+ return prep_push(obc, soid, peer,
+ obc->obs.oi.version, data_subset, clone_subsets,
+ pop, cache_dont_need, ObcLockManager());
+}
+
+int ReplicatedBackend::prep_push(
+ ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ PushOp *pop,
+ bool cache_dont_need,
+ ObcLockManager &&lock_manager)
+{
+ get_parent()->begin_peer_recover(peer, soid);
+ const auto pmissing_iter = get_parent()->get_shard_missing().find(peer);
+ const auto missing_iter = pmissing_iter->second.get_items().find(soid);
+ assert(missing_iter != pmissing_iter->second.get_items().end());
+ // take note.
+ PushInfo &pi = pushing[soid][peer];
+ pi.obc = obc;
+ pi.recovery_info.size = obc->obs.oi.size;
+ pi.recovery_info.copy_subset = data_subset;
+ pi.recovery_info.clone_subset = clone_subsets;
+ pi.recovery_info.soid = soid;
+ pi.recovery_info.oi = obc->obs.oi;
+ pi.recovery_info.ss = pop->recovery_info.ss;
+ pi.recovery_info.version = version;
+ pi.recovery_info.object_exist = missing_iter->second.clean_regions.object_is_exist();
+ pi.recovery_progress.omap_complete = !missing_iter->second.clean_regions.omap_is_dirty() &&
+ HAVE_FEATURE(parent->min_peer_features(), SERVER_OCTOPUS);
+ pi.lock_manager = std::move(lock_manager);
+
+ ObjectRecoveryProgress new_progress;
+ int r = build_push_op(pi.recovery_info,
+ pi.recovery_progress,
+ &new_progress,
+ pop,
+ &(pi.stat), cache_dont_need);
+ if (r < 0)
+ return r;
+ pi.recovery_progress = new_progress;
+ return 0;
+}
+
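+// Stage pushed data into the object store. Partial pushes are written to a
+// temporary recovery object; once the final chunk arrives the temp object is
+// renamed over the target and submit_push_complete fills in the clone ranges.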
+void ReplicatedBackend::submit_push_data(
+ const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ bool cache_dont_need,
+ interval_set<uint64_t> &data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ bufferlist data_included,
+ bufferlist omap_header,
+ const map<string, bufferlist> &attrs,
+ const map<string, bufferlist> &omap_entries,
+ ObjectStore::Transaction *t)
+{
+ hobject_t target_oid;
+ if (first && complete) {
+ target_oid = recovery_info.soid;
+ } else {
+ target_oid = get_parent()->get_temp_recovery_object(recovery_info.soid,
+ recovery_info.version);
+ if (first) {
+ dout(10) << __func__ << ": Adding oid "
+ << target_oid << " in the temp collection" << dendl;
+ add_temp_obj(target_oid);
+ }
+ }
+
+ if (first) {
+ if (!complete) {
+ t->remove(coll, ghobject_t(target_oid));
+ t->touch(coll, ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll, ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ } else {
+ if (!recovery_info.object_exist) {
+ t->remove(coll, ghobject_t(target_oid));
+ t->touch(coll, ghobject_t(target_oid));
+ bufferlist bv = attrs.at(OI_ATTR);
+ object_info_t oi(bv);
+ t->set_alloc_hint(coll, ghobject_t(target_oid),
+ oi.expected_object_size,
+ oi.expected_write_size,
+ oi.alloc_hint_flags);
+ }
+ // remove xattrs; they are re-set below if we overwrite the original object
+ t->rmattrs(coll, ghobject_t(target_oid));
+ // if the omap needs updating, clear any previous content first
+ if (clear_omap)
+ t->omap_clear(coll, ghobject_t(target_oid));
+ }
+
+ t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
+ if (omap_header.length())
+ t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
+
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(recovery_info.soid), &st);
+ if (get_parent()->pg_is_remote_backfilling()) {
+ uint64_t size = 0;
+ if (r == 0)
+ size = st.st_size;
+ // Don't need to do anything if object is still the same size
+ if (size != recovery_info.oi.size) {
+ get_parent()->pg_add_local_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
+ get_parent()->pg_add_num_bytes((int64_t)recovery_info.oi.size - (int64_t)size);
+ dout(10) << __func__ << " " << recovery_info.soid
+ << " backfill size " << recovery_info.oi.size
+ << " previous size " << size
+ << " net size " << recovery_info.oi.size - size
+ << dendl;
+ }
+ }
+ if (!complete) {
+ // clone overlapping content from the existing local object
+ if (recovery_info.object_exist) {
+ assert(r == 0);
+ uint64_t local_size = std::min(recovery_info.size, (uint64_t)st.st_size);
+ interval_set<uint64_t> local_intervals_included, local_intervals_excluded;
+ if (local_size) {
+ local_intervals_included.insert(0, local_size);
+ local_intervals_excluded.intersection_of(local_intervals_included, recovery_info.copy_subset);
+ local_intervals_included.subtract(local_intervals_excluded);
+ }
+ for (interval_set<uint64_t>::const_iterator q = local_intervals_included.begin();
+ q != local_intervals_included.end();
+ ++q) {
+ dout(15) << " clone_range " << recovery_info.soid << " "
+ << q.get_start() << "~" << q.get_len() << dendl;
+ t->clone_range(coll, ghobject_t(recovery_info.soid), ghobject_t(target_oid),
+ q.get_start(), q.get_len(), q.get_start());
+ }
+ }
+ }
+ }
+ uint64_t off = 0;
+ uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+ if (cache_dont_need)
+ fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+ // Punch zeros for data, if fiemap indicates nothing but it is marked dirty
+ if (data_zeros.size() > 0) {
+ data_zeros.intersection_of(recovery_info.copy_subset);
+ assert(intervals_included.subset_of(data_zeros));
+ data_zeros.subtract(intervals_included);
+
+ dout(20) << __func__ <<" recovering object " << recovery_info.soid
+ << " copy_subset: " << recovery_info.copy_subset
+ << " intervals_included: " << intervals_included
+ << " data_zeros: " << data_zeros << dendl;
+
+ for (auto p = data_zeros.begin(); p != data_zeros.end(); ++p)
+ t->zero(coll, ghobject_t(target_oid), p.get_start(), p.get_len());
+ }
+ for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
+ p != intervals_included.end();
+ ++p) {
+ bufferlist bit;
+ bit.substr_of(data_included, off, p.get_len());
+ t->write(coll, ghobject_t(target_oid),
+ p.get_start(), p.get_len(), bit, fadvise_flags);
+ off += p.get_len();
+ }
+
+ if (!omap_entries.empty())
+ t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries);
+ if (!attrs.empty())
+ t->setattrs(coll, ghobject_t(target_oid), attrs);
+
+ if (complete) {
+ if (!first) {
+ dout(10) << __func__ << ": Removing oid "
+ << target_oid << " from the temp collection" << dendl;
+ clear_temp_obj(target_oid);
+ t->remove(coll, ghobject_t(recovery_info.soid));
+ t->collection_move_rename(coll, ghobject_t(target_oid),
+ coll, ghobject_t(recovery_info.soid));
+ }
+
+ submit_push_complete(recovery_info, t);
+
+ }
+}
+
+void ReplicatedBackend::submit_push_complete(
+ const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t)
+{
+ for (map<hobject_t, interval_set<uint64_t>>::const_iterator p =
+ recovery_info.clone_subset.begin();
+ p != recovery_info.clone_subset.end();
+ ++p) {
+ for (interval_set<uint64_t>::const_iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ dout(15) << " clone_range " << p->first << " "
+ << q.get_start() << "~" << q.get_len() << dendl;
+ t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid),
+ q.get_start(), q.get_len(), q.get_start());
+ }
+ }
+}
+
+ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc,
+ ObcLockManager &manager)
+{
+ if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
+ return recovery_info;
+ ObjectRecoveryInfo new_info = recovery_info;
+ new_info.copy_subset.clear();
+ new_info.clone_subset.clear();
+ ceph_assert(ssc);
+ get_parent()->release_locks(manager); // might already have locks
+ calc_clone_subsets(
+ ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
+ get_info().last_backfill,
+ new_info.copy_subset, new_info.clone_subset,
+ manager);
+ return new_info;
+}
+
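+// Primary: apply one PushOp received in response to a pull. Returns true if
+// another chunk is still needed (filling *response with the next PullOp),
+// false once the object is complete or the pull has been abandoned.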
+bool ReplicatedBackend::handle_pull_response(
+ pg_shard_t from, const PushOp &pop, PullOp *response,
+ list<pull_complete_info> *to_continue,
+ ObjectStore::Transaction *t)
+{
+ interval_set<uint64_t> data_included = pop.data_included;
+ bufferlist data;
+ data = pop.data;
+ dout(10) << "handle_pull_response "
+ << pop.recovery_info
+ << pop.after_progress
+ << " data.size() is " << data.length()
+ << " data_included: " << data_included
+ << dendl;
+ if (pop.version == eversion_t()) {
+ // replica doesn't have it!
+ _failed_pull(from, pop.soid);
+ return false;
+ }
+
+ const hobject_t &hoid = pop.soid;
+ ceph_assert((data_included.empty() && data.length() == 0) ||
+ (!data_included.empty() && data.length() > 0));
+
+ auto piter = pulling.find(hoid);
+ if (piter == pulling.end()) {
+ return false;
+ }
+
+ PullInfo &pi = piter->second;
+ if (pi.recovery_info.size == (uint64_t(-1))) {
+ pi.recovery_info.size = pop.recovery_info.size;
+ pi.recovery_info.copy_subset.intersection_of(
+ pop.recovery_info.copy_subset);
+ }
+ // If primary doesn't have object info and didn't know version
+ if (pi.recovery_info.version == eversion_t()) {
+ pi.recovery_info.version = pop.version;
+ }
+
+ bool first = pi.recovery_progress.first;
+ if (first) {
+ // attrs only reference the original bufferlist (decoded from the
+ // MOSDPGPush message), which is much larger than the attrs needed in
+ // recovery. If the obc caches them (get_obc may cache the attrs), the
+ // whole original bufferlist cannot be freed until the obc is evicted
+ // from the obc cache. So rebuild the bufferlists before caching them.
+ auto attrset = pop.attrset;
+ for (auto& a : attrset) {
+ a.second.rebuild();
+ }
+ pi.obc = get_parent()->get_obc(pi.recovery_info.soid, attrset);
+ if (attrset.find(SS_ATTR) != attrset.end()) {
+ bufferlist ssbv = attrset.at(SS_ATTR);
+ SnapSet ss(ssbv);
+ assert(!pi.obc->ssc->exists || ss.seq == pi.obc->ssc->snapset.seq);
+ }
+ pi.recovery_info.oi = pi.obc->obs.oi;
+ pi.recovery_info = recalc_subsets(
+ pi.recovery_info,
+ pi.obc->ssc,
+ pi.lock_manager);
+ }
+
+
+ interval_set<uint64_t> usable_intervals;
+ bufferlist usable_data;
+ trim_pushed_data(pi.recovery_info.copy_subset,
+ data_included,
+ data,
+ &usable_intervals,
+ &usable_data);
+ data_included = usable_intervals;
+ data = std::move(usable_data);
+
+
+ pi.recovery_progress = pop.after_progress;
+
+ dout(10) << "new recovery_info " << pi.recovery_info
+ << ", new progress " << pi.recovery_progress
+ << dendl;
+ interval_set<uint64_t> data_zeros;
+ uint64_t z_offset = pop.before_progress.data_recovered_to;
+ uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+ if (z_length)
+ data_zeros.insert(z_offset, z_length);
+ bool complete = pi.is_complete();
+ bool clear_omap = !pop.before_progress.omap_complete;
+
+ submit_push_data(pi.recovery_info,
+ first,
+ complete,
+ clear_omap,
+ pi.cache_dont_need,
+ data_zeros,
+ data_included,
+ data,
+ pop.omap_header,
+ pop.attrset,
+ pop.omap_entries,
+ t);
+
+ pi.stat.num_keys_recovered += pop.omap_entries.size();
+ pi.stat.num_bytes_recovered += data.length();
+ get_parent()->get_logger()->inc(l_osd_rbytes, pop.omap_entries.size() + data.length());
+
+ if (complete) {
+ pi.stat.num_objects_recovered++;
+ // XXX: This could overcount if regular recovery is needed right after a repair
+ if (get_parent()->pg_is_repair()) {
+ pi.stat.num_objects_repaired++;
+ get_parent()->inc_osd_stat_repaired();
+ }
+ clear_pull_from(piter);
+ to_continue->push_back({hoid, pi.stat});
+ get_parent()->on_local_recover(
+ hoid, pi.recovery_info, pi.obc, false, t);
+ return false;
+ } else {
+ response->soid = pop.soid;
+ response->recovery_info = pi.recovery_info;
+ response->recovery_progress = pi.recovery_progress;
+ return true;
+ }
+}
+
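+// Replica: apply one PushOp sent by the primary, and mark the object as
+// locally recovered once the final chunk has been staged.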
+void ReplicatedBackend::handle_push(
+ pg_shard_t from, const PushOp &pop, PushReplyOp *response,
+ ObjectStore::Transaction *t, bool is_repair)
+{
+ dout(10) << "handle_push "
+ << pop.recovery_info
+ << pop.after_progress
+ << dendl;
+ bufferlist data;
+ data = pop.data;
+ bool first = pop.before_progress.first;
+ bool complete = pop.after_progress.data_complete &&
+ pop.after_progress.omap_complete;
+ bool clear_omap = !pop.before_progress.omap_complete;
+ interval_set<uint64_t> data_zeros;
+ uint64_t z_offset = pop.before_progress.data_recovered_to;
+ uint64_t z_length = pop.after_progress.data_recovered_to - pop.before_progress.data_recovered_to;
+ if (z_length)
+ data_zeros.insert(z_offset, z_length);
+ response->soid = pop.recovery_info.soid;
+
+ submit_push_data(pop.recovery_info,
+ first,
+ complete,
+ clear_omap,
+ true, // cache_dont_need: we must be a replica here
+ data_zeros,
+ pop.data_included,
+ data,
+ pop.omap_header,
+ pop.attrset,
+ pop.omap_entries,
+ t);
+
+ if (complete) {
+ if (is_repair) {
+ get_parent()->inc_osd_stat_repaired();
+ dout(20) << __func__ << " repair complete" << dendl;
+ }
+ get_parent()->on_local_recover(
+ pop.recovery_info.soid,
+ pop.recovery_info,
+ ObjectContextRef(), // ok, is replica
+ false,
+ t);
+ }
+}
+
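+// Send queued PushOps to each peer, batching them into MOSDPGPush messages
+// bounded by osd_max_push_cost and osd_max_push_objects.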
+void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
+{
+ for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin();
+ i != pushes.end();
+ ++i) {
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ i->first.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ vector<PushOp>::iterator j = i->second.begin();
+ while (j != i->second.end()) {
+ uint64_t cost = 0;
+ uint64_t pushes = 0;
+ MOSDPGPush *msg = new MOSDPGPush();
+ msg->from = get_parent()->whoami_shard();
+ msg->pgid = get_parent()->primary_spg_t();
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->set_priority(prio);
+ msg->is_repair = get_parent()->pg_is_repair();
+ for (;
+ (j != i->second.end() &&
+ cost < cct->_conf->osd_max_push_cost &&
+ pushes < cct->_conf->osd_max_push_objects) ;
+ ++j) {
+ dout(20) << __func__ << ": sending push " << *j
+ << " to osd." << i->first << dendl;
+ cost += j->cost(cct);
+ pushes += 1;
+ msg->pushes.push_back(*j);
+ }
+ msg->set_cost(cost);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+ }
+}
+
+void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls)
+{
+ for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin();
+ i != pulls.end();
+ ++i) {
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ i->first.osd,
+ get_osdmap_epoch());
+ if (!con)
+ continue;
+ dout(20) << __func__ << ": sending pulls " << i->second
+ << " to osd." << i->first << dendl;
+ MOSDPGPull *msg = new MOSDPGPull();
+ msg->from = parent->whoami_shard();
+ msg->set_priority(prio);
+ msg->pgid = get_parent()->primary_spg_t();
+ msg->map_epoch = get_osdmap_epoch();
+ msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
+ msg->set_pulls(std::move(i->second));
+ msg->compute_cost(cct);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+}
+
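+// Read the next chunk of an object (omap header, attrs, omap entries and data,
+// up to osd_recovery_max_chunk) into a PushOp, advancing the recovery progress
+// accordingly.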
+int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat,
+ bool cache_dont_need)
+{
+ ObjectRecoveryProgress _new_progress;
+ if (!out_progress)
+ out_progress = &_new_progress;
+ ObjectRecoveryProgress &new_progress = *out_progress;
+ new_progress = progress;
+
+ dout(7) << __func__ << " " << recovery_info.soid
+ << " v " << recovery_info.version
+ << " size " << recovery_info.size
+ << " recovery_info: " << recovery_info
+ << dendl;
+
+ eversion_t v = recovery_info.version;
+ object_info_t oi;
+ if (progress.first) {
+ int r = store->omap_get_header(ch, ghobject_t(recovery_info.soid), &out_op->omap_header);
+ if (r < 0) {
+ dout(1) << __func__ << " get omap header failed: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ r = store->getattrs(ch, ghobject_t(recovery_info.soid), out_op->attrset);
+ if (r < 0) {
+ dout(1) << __func__ << " getattrs failed: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ // Debug
+ bufferlist bv = out_op->attrset[OI_ATTR];
+ try {
+ auto bliter = bv.cbegin();
+ decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": bad object_info_t: " << recovery_info.soid << dendl;
+ return -EINVAL;
+ }
+
+ // If requestor didn't know the version, use ours
+ if (v == eversion_t()) {
+ v = oi.version;
+ } else if (oi.version != v) {
+ get_parent()->clog_error() << get_info().pgid << " push "
+ << recovery_info.soid << " v "
+ << recovery_info.version
+ << " failed because local copy is "
+ << oi.version;
+ return -EINVAL;
+ }
+
+ new_progress.first = false;
+ }
+ // Once we provide the version subsequent requests will have it, so
+ // at this point it must be known.
+ ceph_assert(v != eversion_t());
+
+ uint64_t available = cct->_conf->osd_recovery_max_chunk;
+ if (!progress.omap_complete) {
+ ObjectMap::ObjectMapIterator iter =
+ store->get_omap_iterator(ch,
+ ghobject_t(recovery_info.soid));
+ ceph_assert(iter);
+ for (iter->lower_bound(progress.omap_recovered_to);
+ iter->valid();
+ iter->next()) {
+ if (!out_op->omap_entries.empty() &&
+ ((cct->_conf->osd_recovery_max_omap_entries_per_chunk > 0 &&
+ out_op->omap_entries.size() >= cct->_conf->osd_recovery_max_omap_entries_per_chunk) ||
+ available <= iter->key().size() + iter->value().length()))
+ break;
+ out_op->omap_entries.insert(make_pair(iter->key(), iter->value()));
+
+ if ((iter->key().size() + iter->value().length()) <= available)
+ available -= (iter->key().size() + iter->value().length());
+ else
+ available = 0;
+ }
+ if (!iter->valid())
+ new_progress.omap_complete = true;
+ else
+ new_progress.omap_recovered_to = iter->key();
+ }
+
+ if (available > 0) {
+ if (!recovery_info.copy_subset.empty()) {
+ interval_set<uint64_t> copy_subset = recovery_info.copy_subset;
+ map<uint64_t, uint64_t> m;
+ int r = store->fiemap(ch, ghobject_t(recovery_info.soid), 0,
+ copy_subset.range_end(), m);
+ if (r >= 0) {
+ interval_set<uint64_t> fiemap_included(std::move(m));
+ copy_subset.intersection_of(fiemap_included);
+ } else {
+ // intersection of copy_subset and empty interval_set would be empty anyway
+ copy_subset.clear();
+ }
+
+ out_op->data_included.span_of(copy_subset, progress.data_recovered_to,
+ available);
+ // zero filled section, skip to end!
+ if (out_op->data_included.empty() ||
+ out_op->data_included.range_end() == copy_subset.range_end())
+ new_progress.data_recovered_to = recovery_info.copy_subset.range_end();
+ else
+ new_progress.data_recovered_to = out_op->data_included.range_end();
+ }
+ } else {
+ out_op->data_included.clear();
+ }
+
+ auto origin_size = out_op->data_included.size();
+ bufferlist bit;
+ int r = store->readv(ch, ghobject_t(recovery_info.soid),
+ out_op->data_included, bit,
+ cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
+ if (cct->_conf->osd_debug_random_push_read_error &&
+ (rand() % (int)(cct->_conf->osd_debug_random_push_read_error * 100.0)) == 0) {
+ dout(0) << __func__ << ": inject EIO " << recovery_info.soid << dendl;
+ r = -EIO;
+ }
+ if (r < 0) {
+ return r;
+ }
+ if (out_op->data_included.size() != origin_size) {
+ dout(10) << __func__ << " some extents get pruned "
+ << out_op->data_included.size() << "/" << origin_size
+ << dendl;
+ new_progress.data_complete = true;
+ }
+ out_op->data.claim_append(bit);
+ if (progress.first && !out_op->data_included.empty() &&
+ out_op->data_included.begin().get_start() == 0 &&
+ out_op->data.length() == oi.size && oi.is_data_digest()) {
+ uint32_t crc = out_op->data.crc32c(-1);
+ if (oi.data_digest != crc) {
+ dout(0) << __func__ << " " << coll << std::hex
+ << " full-object read crc 0x" << crc
+ << " != expected 0x" << oi.data_digest
+ << std::dec << " on " << recovery_info.soid << dendl;
+ return -EIO;
+ }
+ }
+
+ if (new_progress.is_complete(recovery_info)) {
+ new_progress.data_complete = true;
+ if (stat) {
+ stat->num_objects_recovered++;
+ if (get_parent()->pg_is_repair())
+ stat->num_objects_repaired++;
+ }
+ } else if (progress.first && progress.omap_complete) {
+ // If the omap did not change, we still need to recover it when recovery cannot be completed in a single pass
+ new_progress.omap_complete = false;
+ }
+
+ if (stat) {
+ stat->num_keys_recovered += out_op->omap_entries.size();
+ stat->num_bytes_recovered += out_op->data.length();
+ get_parent()->get_logger()->inc(l_osd_rbytes, out_op->omap_entries.size() + out_op->data.length());
+ }
+
+ get_parent()->get_logger()->inc(l_osd_push);
+ get_parent()->get_logger()->inc(l_osd_push_outb, out_op->data.length());
+
+ // send
+ out_op->version = v;
+ out_op->soid = recovery_info.soid;
+ out_op->recovery_info = recovery_info;
+ out_op->after_progress = new_progress;
+ out_op->before_progress = progress;
+ return 0;
+}
+
+void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op)
+{
+ op->recovery_info.version = eversion_t();
+ op->version = eversion_t();
+ op->soid = soid;
+}
+
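+// Primary: process the reply for one of our pushes. Returns true (and fills
+// in *reply) if another chunk must be sent to this peer, false once that peer
+// is done with the object.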
+bool ReplicatedBackend::handle_push_reply(
+ pg_shard_t peer, const PushReplyOp &op, PushOp *reply)
+{
+ const hobject_t &soid = op.soid;
+ if (pushing.count(soid) == 0) {
+ dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer
+ << ", or anybody else"
+ << dendl;
+ return false;
+ } else if (pushing[soid].count(peer) == 0) {
+ dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer
+ << dendl;
+ return false;
+ } else {
+ PushInfo *pi = &pushing[soid][peer];
+ bool error = pushing[soid].begin()->second.recovery_progress.error;
+
+ if (!pi->recovery_progress.data_complete && !error) {
+ dout(10) << " pushing more from, "
+ << pi->recovery_progress.data_recovered_to
+ << " of " << pi->recovery_info.copy_subset << dendl;
+ ObjectRecoveryProgress new_progress;
+ int r = build_push_op(
+ pi->recovery_info,
+ pi->recovery_progress, &new_progress, reply,
+ &(pi->stat));
+ // Handle the case of a read error right after we wrote, which is
+ // hopefully extremely rare.
+ if (r < 0) {
+ dout(5) << __func__ << ": oid " << soid << " error " << r << dendl;
+
+ error = true;
+ goto done;
+ }
+ pi->recovery_progress = new_progress;
+ return true;
+ } else {
+ // done!
+done:
+ if (!error)
+ get_parent()->on_peer_recover( peer, soid, pi->recovery_info);
+
+ get_parent()->release_locks(pi->lock_manager);
+ object_stat_sum_t stat = pi->stat;
+ eversion_t v = pi->recovery_info.version;
+ pushing[soid].erase(peer);
+ pi = NULL;
+
+ if (pushing[soid].empty()) {
+ if (!error)
+ get_parent()->on_global_recover(soid, stat, false);
+ else
+ get_parent()->on_failed_pull(
+ std::set<pg_shard_t>{ get_parent()->whoami_shard() },
+ soid,
+ v);
+ pushing.erase(soid);
+ } else {
+ // This looks weird, but we erased the current peer and need to remember
+ // the error on any other one, while getting more acks.
+ if (error)
+ pushing[soid].begin()->second.recovery_progress.error = true;
+ dout(10) << "pushed " << soid << ", still waiting for push ack from "
+ << pushing[soid].size() << " others" << dendl;
+ }
+ return false;
+ }
+ }
+}
+
+void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply)
+{
+ const hobject_t &soid = op.soid;
+ struct stat st;
+ int r = store->stat(ch, ghobject_t(soid), &st);
+ if (r != 0) {
+ get_parent()->clog_error() << get_info().pgid << " "
+ << peer << " tried to pull " << soid
+ << " but got " << cpp_strerror(-r);
+ prep_push_op_blank(soid, reply);
+ } else {
+ ObjectRecoveryInfo &recovery_info = op.recovery_info;
+ ObjectRecoveryProgress &progress = op.recovery_progress;
+ if (progress.first && recovery_info.size == ((uint64_t)-1)) {
+ // Adjust size and copy_subset
+ recovery_info.size = st.st_size;
+ if (st.st_size) {
+ interval_set<uint64_t> object_range;
+ object_range.insert(0, st.st_size);
+ recovery_info.copy_subset.intersection_of(object_range);
+ } else {
+ recovery_info.copy_subset.clear();
+ }
+ assert(recovery_info.clone_subset.empty());
+ }
+
+ r = build_push_op(recovery_info, progress, 0, reply);
+ if (r < 0)
+ prep_push_op_blank(soid, reply);
+ }
+}
+
+/**
+ * trim received data to remove what we don't want
+ *
+ * @param copy_subset intervals we want
+ * @param intervals_received intervals we got
+ * @param data_received data we got
+ * @param intervals_usable intervals we want to keep
+ * @param data_usable matching data we want to keep
+ */
+void ReplicatedBackend::trim_pushed_data(
+ const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ bufferlist data_received,
+ interval_set<uint64_t> *intervals_usable,
+ bufferlist *data_usable)
+{
+ if (intervals_received.subset_of(copy_subset)) {
+ *intervals_usable = intervals_received;
+ *data_usable = data_received;
+ return;
+ }
+
+ intervals_usable->intersection_of(copy_subset,
+ intervals_received);
+
+ uint64_t off = 0;
+ for (interval_set<uint64_t>::const_iterator p = intervals_received.begin();
+ p != intervals_received.end();
+ ++p) {
+ interval_set<uint64_t> x;
+ x.insert(p.get_start(), p.get_len());
+ x.intersection_of(copy_subset);
+ for (interval_set<uint64_t>::const_iterator q = x.begin();
+ q != x.end();
+ ++q) {
+ bufferlist sub;
+ uint64_t data_off = off + (q.get_start() - p.get_start());
+ sub.substr_of(data_received, data_off, q.get_len());
+ data_usable->claim_append(sub);
+ }
+ off += p.get_len();
+ }
+}
+
+void ReplicatedBackend::_failed_pull(pg_shard_t from, const hobject_t &soid)
+{
+ dout(20) << __func__ << ": " << soid << " from " << from << dendl;
+ auto it = pulling.find(soid);
+ assert(it != pulling.end());
+ get_parent()->on_failed_pull(
+ { from },
+ soid,
+ it->second.recovery_info.version);
+
+ clear_pull(it);
+}
+
+void ReplicatedBackend::clear_pull_from(
+ map<hobject_t, PullInfo>::iterator piter)
+{
+ auto from = piter->second.from;
+ pull_from_peer[from].erase(piter->second.soid);
+ if (pull_from_peer[from].empty())
+ pull_from_peer.erase(from);
+}
+
+void ReplicatedBackend::clear_pull(
+ map<hobject_t, PullInfo>::iterator piter,
+ bool clear_pull_from_peer)
+{
+ if (clear_pull_from_peer) {
+ clear_pull_from(piter);
+ }
+ get_parent()->release_locks(piter->second.lock_manager);
+ pulling.erase(piter);
+}
+
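+// Push an object to every acting/backfill shard that is still missing it.
+// Returns the number of pushes started, or a negative error if building any
+// push op failed (in which case all pushes for the object are backed out).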
+int ReplicatedBackend::start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obc,
+ RPGHandle *h)
+{
+ list< map<pg_shard_t, pg_missing_t>::const_iterator > shards;
+
+ dout(20) << __func__ << " soid " << soid << dendl;
+ // who needs it?
+ ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
+ for (set<pg_shard_t>::iterator i =
+ get_parent()->get_acting_recovery_backfill_shards().begin();
+ i != get_parent()->get_acting_recovery_backfill_shards().end();
+ ++i) {
+ if (*i == get_parent()->whoami_shard()) continue;
+ pg_shard_t peer = *i;
+ map<pg_shard_t, pg_missing_t>::const_iterator j =
+ get_parent()->get_shard_missing().find(peer);
+ ceph_assert(j != get_parent()->get_shard_missing().end());
+ if (j->second.is_missing(soid)) {
+ shards.push_back(j);
+ }
+ }
+
+ // If more than one read will occur, ignore any request not to cache
+ bool cache = shards.size() == 1 ? h->cache_dont_need : false;
+
+ for (auto j : shards) {
+ pg_shard_t peer = j->first;
+ h->pushes[peer].push_back(PushOp());
+ int r = prep_push_to_replica(obc, soid, peer,
+ &(h->pushes[peer].back()), cache);
+ if (r < 0) {
+ // Back out all failed reads
+ for (auto k : shards) {
+ pg_shard_t p = k->first;
+ dout(10) << __func__ << " clean up peer " << p << dendl;
+ h->pushes[p].pop_back();
+ if (p == peer) break;
+ }
+ return r;
+ }
+ }
+ return shards.size();
+}
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
new file mode 100644
index 000000000..f4b506357
--- /dev/null
+++ b/src/osd/ReplicatedBackend.h
@@ -0,0 +1,437 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef REPBACKEND_H
+#define REPBACKEND_H
+
+#include "PGBackend.h"
+
+struct C_ReplicatedBackend_OnPullComplete;
+class ReplicatedBackend : public PGBackend {
+ struct RPGHandle : public PGBackend::RecoveryHandle {
+ std::map<pg_shard_t, std::vector<PushOp> > pushes;
+ std::map<pg_shard_t, std::vector<PullOp> > pulls;
+ };
+ friend struct C_ReplicatedBackend_OnPullComplete;
+public:
+ ReplicatedBackend(
+ PGBackend::Listener *pg,
+ const coll_t &coll,
+ ObjectStore::CollectionHandle &ch,
+ ObjectStore *store,
+ CephContext *cct);
+
+ /// @see PGBackend::open_recovery_op
+ RPGHandle *_open_recovery_op() {
+ return new RPGHandle();
+ }
+ PGBackend::RecoveryHandle *open_recovery_op() override {
+ return _open_recovery_op();
+ }
+
+ /// @see PGBackend::run_recovery_op
+ void run_recovery_op(
+ PGBackend::RecoveryHandle *h,
+ int priority) override;
+
+ /// @see PGBackend::recover_object
+ int recover_object(
+ const hobject_t &hoid,
+ eversion_t v,
+ ObjectContextRef head,
+ ObjectContextRef obc,
+ RecoveryHandle *h
+ ) override;
+
+ void check_recovery_sources(const OSDMapRef& osdmap) override;
+
+ bool can_handle_while_inactive(OpRequestRef op) override;
+
+ /// @see PGBackend::handle_message
+ bool _handle_message(
+ OpRequestRef op
+ ) override;
+
+ void on_change() override;
+ void clear_recovery_state() override;
+
+ class RPCRecPred : public IsPGRecoverablePredicate {
+ public:
+ bool operator()(const std::set<pg_shard_t> &have) const override {
+ return !have.empty();
+ }
+ };
+ IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
+ return new RPCRecPred;
+ }
+
+ class RPCReadPred : public IsPGReadablePredicate {
+ pg_shard_t whoami;
+ public:
+ explicit RPCReadPred(pg_shard_t whoami) : whoami(whoami) {}
+ bool operator()(const std::set<pg_shard_t> &have) const override {
+ return have.count(whoami);
+ }
+ };
+ IsPGReadablePredicate *get_is_readable_predicate() const override {
+ return new RPCReadPred(get_parent()->whoami_shard());
+ }
+
+ void dump_recovery_info(ceph::Formatter *f) const override {
+ {
+ f->open_array_section("pull_from_peer");
+ for (std::map<pg_shard_t, std::set<hobject_t> >::const_iterator i = pull_from_peer.begin();
+ i != pull_from_peer.end();
+ ++i) {
+ f->open_object_section("pulling_from");
+ f->dump_stream("pull_from") << i->first;
+ {
+ f->open_array_section("pulls");
+ for (std::set<hobject_t>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("pull_info");
+ ceph_assert(pulling.count(*j));
+ pulling.find(*j)->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ {
+ f->open_array_section("pushing");
+ for (std::map<hobject_t, std::map<pg_shard_t, PushInfo>>::const_iterator i =
+ pushing.begin();
+ i != pushing.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("pushing") << i->first;
+ {
+ f->open_array_section("pushing_to");
+ for (std::map<pg_shard_t, PushInfo>::const_iterator j = i->second.begin();
+ j != i->second.end();
+ ++j) {
+ f->open_object_section("push_progress");
+ f->dump_stream("pushing_to") << j->first;
+ {
+ f->open_object_section("push_info");
+ j->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ }
+
+ int objects_read_sync(
+ const hobject_t &hoid,
+ uint64_t off,
+ uint64_t len,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ int objects_readv_sync(
+ const hobject_t &hoid,
+ std::map<uint64_t, uint64_t>&& m,
+ uint32_t op_flags,
+ ceph::buffer::list *bl) override;
+
+ void objects_read_async(
+ const hobject_t &hoid,
+ const std::list<std::pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+ std::pair<ceph::buffer::list*, Context*> > > &to_read,
+ Context *on_complete,
+ bool fast_read = false) override;
+
+private:
+ // push
+ struct PushInfo {
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+ ObcLockManager lock_manager;
+
+ void dump(ceph::Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+ };
+ std::map<hobject_t, std::map<pg_shard_t, PushInfo>> pushing;
+
+ // pull
+ struct PullInfo {
+ pg_shard_t from;
+ hobject_t soid;
+ ObjectRecoveryProgress recovery_progress;
+ ObjectRecoveryInfo recovery_info;
+ ObjectContextRef head_ctx;
+ ObjectContextRef obc;
+ object_stat_sum_t stat;
+ bool cache_dont_need;
+ ObcLockManager lock_manager;
+
+ void dump(ceph::Formatter *f) const {
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ }
+
+ bool is_complete() const {
+ return recovery_progress.is_complete(recovery_info);
+ }
+ };
+
+ std::map<hobject_t, PullInfo> pulling;
+
+ // Reverse mapping from osd peer to objects being pulled from that peer
+ std::map<pg_shard_t, std::set<hobject_t> > pull_from_peer;
+ void clear_pull(
+ std::map<hobject_t, PullInfo>::iterator piter,
+ bool clear_pull_from_peer = true);
+ void clear_pull_from(
+ std::map<hobject_t, PullInfo>::iterator piter);
+
+ void _do_push(OpRequestRef op);
+ void _do_pull_response(OpRequestRef op);
+ void do_push(OpRequestRef op) {
+ if (is_primary()) {
+ _do_pull_response(op);
+ } else {
+ _do_push(op);
+ }
+ }
+ void do_pull(OpRequestRef op);
+ void do_push_reply(OpRequestRef op);
+
+ bool handle_push_reply(pg_shard_t peer, const PushReplyOp &op, PushOp *reply);
+ void handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply);
+
+ struct pull_complete_info {
+ hobject_t hoid;
+ object_stat_sum_t stat;
+ };
+ bool handle_pull_response(
+ pg_shard_t from, const PushOp &op, PullOp *response,
+ std::list<pull_complete_info> *to_continue,
+ ObjectStore::Transaction *t);
+ void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response,
+ ObjectStore::Transaction *t, bool is_repair);
+
+ static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
+ const interval_set<uint64_t> &intervals_received,
+ ceph::buffer::list data_received,
+ interval_set<uint64_t> *intervals_usable,
+ ceph::buffer::list *data_usable);
+ void _failed_pull(pg_shard_t from, const hobject_t &soid);
+
+ void send_pushes(int prio, std::map<pg_shard_t, std::vector<PushOp> > &pushes);
+ void prep_push_op_blank(const hobject_t& soid, PushOp *op);
+ void send_pulls(
+ int priority,
+ std::map<pg_shard_t, std::vector<PullOp> > &pulls);
+
+ int build_push_op(const ObjectRecoveryInfo &recovery_info,
+ const ObjectRecoveryProgress &progress,
+ ObjectRecoveryProgress *out_progress,
+ PushOp *out_op,
+ object_stat_sum_t *stat = 0,
+ bool cache_dont_need = true);
+ void submit_push_data(const ObjectRecoveryInfo &recovery_info,
+ bool first,
+ bool complete,
+ bool clear_omap,
+ bool cache_dont_need,
+ interval_set<uint64_t> &data_zeros,
+ const interval_set<uint64_t> &intervals_included,
+ ceph::buffer::list data_included,
+ ceph::buffer::list omap_header,
+ const std::map<std::string, ceph::buffer::list> &attrs,
+ const std::map<std::string, ceph::buffer::list> &omap_entries,
+ ObjectStore::Transaction *t);
+ void submit_push_complete(const ObjectRecoveryInfo &recovery_info,
+ ObjectStore::Transaction *t);
+
+ void calc_clone_subsets(
+ SnapSet& snapset, const hobject_t& poid, const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &lock_manager);
+ void prepare_pull(
+ eversion_t v,
+ const hobject_t& soid,
+ ObjectContextRef headctx,
+ RPGHandle *h);
+ int start_pushes(
+ const hobject_t &soid,
+ ObjectContextRef obj,
+ RPGHandle *h);
+ int prep_push_to_replica(
+ ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
+ PushOp *pop, bool cache_dont_need = true);
+ int prep_push(
+ ObjectContextRef obc,
+ const hobject_t& oid, pg_shard_t dest,
+ PushOp *op,
+ bool cache_dont_need);
+ int prep_push(
+ ObjectContextRef obc,
+ const hobject_t& soid, pg_shard_t peer,
+ eversion_t version,
+ interval_set<uint64_t> &data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ PushOp *op,
+ bool cache,
+ ObcLockManager &&lock_manager);
+ void calc_head_subsets(
+ ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
+ const pg_missing_t& missing,
+ const hobject_t &last_backfill,
+ interval_set<uint64_t>& data_subset,
+ std::map<hobject_t, interval_set<uint64_t>>& clone_subsets,
+ ObcLockManager &lock_manager);
+ ObjectRecoveryInfo recalc_subsets(
+ const ObjectRecoveryInfo& recovery_info,
+ SnapSetContext *ssc,
+ ObcLockManager &lock_manager);
+
+ /**
+ * Client IO
+ */
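+  // Tracks an in-flight replicated write; done() returns true once no
+  // shard is still waiting to commit.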
+ struct InProgressOp : public RefCountedObject {
+ ceph_tid_t tid;
+ std::set<pg_shard_t> waiting_for_commit;
+ Context *on_commit;
+ OpRequestRef op;
+ eversion_t v;
+ bool done() const {
+ return waiting_for_commit.empty();
+ }
+ private:
+ FRIEND_MAKE_REF(InProgressOp);
+ InProgressOp(ceph_tid_t tid, Context *on_commit, OpRequestRef op, eversion_t v)
+ :
+ tid(tid), on_commit(on_commit),
+ op(op), v(v) {}
+ };
+ std::map<ceph_tid_t, ceph::ref_t<InProgressOp>> in_progress_ops;
+public:
+ friend class C_OSD_OnOpCommit;
+
+ void call_write_ordered(std::function<void(void)> &&cb) override {
+ // ReplicatedBackend submits writes inline in submit_transaction, so
+ // we can just call the callback.
+ cb();
+ }
+
+ void submit_transaction(
+ const hobject_t &hoid,
+ const object_stat_sum_t &delta_stats,
+ const eversion_t &at_version,
+ PGTransactionUPtr &&t,
+ const eversion_t &trim_to,
+ const eversion_t &min_last_complete_ondisk,
+ std::vector<pg_log_entry_t>&& log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ Context *on_all_commit,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ OpRequestRef op
+ ) override;
+
+private:
+ Message * generate_subop(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const ceph::buffer::list &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ ObjectStore::Transaction &op_t,
+ pg_shard_t peer,
+ const pg_info_t &pinfo);
+ void issue_op(
+ const hobject_t &soid,
+ const eversion_t &at_version,
+ ceph_tid_t tid,
+ osd_reqid_t reqid,
+ eversion_t pg_trim_to,
+ eversion_t min_last_complete_ondisk,
+ hobject_t new_temp_oid,
+ hobject_t discard_temp_oid,
+ const std::vector<pg_log_entry_t> &log_entries,
+ std::optional<pg_hit_set_history_t> &hset_history,
+ InProgressOp *op,
+ ObjectStore::Transaction &op_t);
+ void op_commit(const ceph::ref_t<InProgressOp>& op);
+ void do_repop_reply(OpRequestRef op);
+ void do_repop(OpRequestRef op);
+
+ struct RepModify {
+ OpRequestRef op;
+ bool committed;
+ int ackerosd;
+ eversion_t last_complete;
+ epoch_t epoch_started;
+
+ ObjectStore::Transaction opt, localt;
+
+ RepModify() : committed(false), ackerosd(-1),
+ epoch_started(0) {}
+ };
+ typedef std::shared_ptr<RepModify> RepModifyRef;
+
+ struct C_OSD_RepModifyCommit;
+
+ void repop_commit(RepModifyRef rm);
+ bool auto_repair_supported() const override { return store->has_builtin_csum(); }
+
+
+ int be_deep_scrub(
+ const hobject_t &poid,
+ ScrubMap &map,
+ ScrubMapBuilder &pos,
+ ScrubMap::object &o) override;
+ uint64_t be_get_ondisk_size(uint64_t logical_size) override { return logical_size; }
+};
+
+#endif
diff --git a/src/osd/ScrubStore.cc b/src/osd/ScrubStore.cc
new file mode 100644
index 000000000..a692a4435
--- /dev/null
+++ b/src/osd/ScrubStore.cc
@@ -0,0 +1,198 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ScrubStore.h"
+#include "osd_types.h"
+#include "common/scrub_types.h"
+#include "include/rados/rados_types.hpp"
+
+using std::ostringstream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+namespace {
+ghobject_t make_scrub_object(const spg_t& pgid)
+{
+ ostringstream ss;
+ ss << "scrub_" << pgid;
+ return pgid.make_temp_ghobject(ss.str());
+}
+
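+// first_object_key()/last_object_key() bracket the per-pool key range by
+// using the minimum (0x00000000) and maximum (0xffffffff) object hashes.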
+string first_object_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0x00000000,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+// the object_key should be unique across pools
+string to_object_key(int64_t pool, const librados::object_id_t& oid)
+{
+ auto hoid = hobject_t(object_t(oid.name),
+ oid.locator, // key
+ oid.snap,
+ 0, // hash
+ pool,
+ oid.nspace);
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+string last_object_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0xffffffff,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_OBJ_" + hoid.to_str();
+}
+
+string first_snap_key(int64_t pool)
+{
+  // the scrub object is per spg_t, so we can reuse the hash (pg.seed) to
+  // represent the minimal and maximum keys. This relies on how
+  // hobject_t::to_str() works: hex(pool).hex(revhash).
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0x00000000,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+
+string to_snap_key(int64_t pool, const librados::object_id_t& oid)
+{
+ auto hoid = hobject_t(object_t(oid.name),
+ oid.locator, // key
+ oid.snap,
+ 0x77777777, // hash
+ pool,
+ oid.nspace);
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+
+string last_snap_key(int64_t pool)
+{
+ auto hoid = hobject_t(object_t(),
+ "",
+ 0,
+ 0xffffffff,
+ pool,
+ "");
+ hoid.build_hash_cache();
+ return "SCRUB_SS_" + hoid.to_str();
+}
+}
+
+namespace Scrub {
+
+Store*
+Store::create(ObjectStore* store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll)
+{
+ ceph_assert(store);
+ ceph_assert(t);
+ ghobject_t oid = make_scrub_object(pgid);
+ t->touch(coll, oid);
+ return new Store{coll, oid, store};
+}
+
+Store::Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store)
+ : coll(coll),
+ hoid(oid),
+ driver(store, coll, hoid),
+ backend(&driver)
+{}
+
+Store::~Store()
+{
+ ceph_assert(results.empty());
+}
+
+void Store::add_object_error(int64_t pool, const inconsistent_obj_wrapper& e)
+{
+ bufferlist bl;
+ e.encode(bl);
+ results[to_object_key(pool, e.object)] = bl;
+}
+
+void Store::add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e)
+{
+ bufferlist bl;
+ e.encode(bl);
+ results[to_snap_key(pool, e.object)] = bl;
+}
+
+bool Store::empty() const
+{
+ return results.empty();
+}
+
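+// Persist the accumulated results via the MapCacher transaction and clear the
+// in-memory map; with a null transaction the pending results are dropped.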
+void Store::flush(ObjectStore::Transaction* t)
+{
+ if (t) {
+ OSDriver::OSTransaction txn = driver.get_transaction(t);
+ backend.set_keys(results, &txn);
+ }
+ results.clear();
+}
+
+void Store::cleanup(ObjectStore::Transaction* t)
+{
+ t->remove(coll, hoid);
+}
+
+std::vector<bufferlist>
+Store::get_snap_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
+{
+ const string begin = (start.name.empty() ?
+ first_snap_key(pool) : to_snap_key(pool, start));
+ const string end = last_snap_key(pool);
+ return get_errors(begin, end, max_return);
+}
+
+std::vector<bufferlist>
+Store::get_object_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const
+{
+ const string begin = (start.name.empty() ?
+ first_object_key(pool) : to_object_key(pool, start));
+ const string end = last_object_key(pool);
+ return get_errors(begin, end, max_return);
+}
+
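+// Collect up to max_return encoded errors, advancing through the omap with
+// backend.get_next() from 'begin' and stopping once a key reaches 'end'.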
+std::vector<bufferlist>
+Store::get_errors(const string& begin,
+ const string& end,
+ uint64_t max_return) const
+{
+ vector<bufferlist> errors;
+ auto next = std::make_pair(begin, bufferlist{});
+ while (max_return && !backend.get_next(next.first, &next)) {
+ if (next.first >= end)
+ break;
+ errors.push_back(next.second);
+ max_return--;
+ }
+ return errors;
+}
+
+} // namespace Scrub
diff --git a/src/osd/ScrubStore.h b/src/osd/ScrubStore.h
new file mode 100644
index 000000000..721aae092
--- /dev/null
+++ b/src/osd/ScrubStore.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_RESULT_H
+#define CEPH_SCRUB_RESULT_H
+
+#include "SnapMapper.h" // for OSDriver
+#include "common/map_cacher.hpp"
+
+namespace librados {
+ struct object_id_t;
+}
+
+struct inconsistent_obj_wrapper;
+struct inconsistent_snapset_wrapper;
+
+namespace Scrub {
+
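+// Accumulates scrub-detected inconsistencies in memory and persists them into
+// a per-PG temp object's omap, keyed so that object and snapset errors can be
+// listed per pool.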
+class Store {
+public:
+ ~Store();
+ static Store* create(ObjectStore* store,
+ ObjectStore::Transaction* t,
+ const spg_t& pgid,
+ const coll_t& coll);
+ void add_object_error(int64_t pool, const inconsistent_obj_wrapper& e);
+ void add_snap_error(int64_t pool, const inconsistent_snapset_wrapper& e);
+ bool empty() const;
+ void flush(ObjectStore::Transaction *);
+ void cleanup(ObjectStore::Transaction *);
+ std::vector<ceph::buffer::list> get_snap_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+ std::vector<ceph::buffer::list> get_object_errors(int64_t pool,
+ const librados::object_id_t& start,
+ uint64_t max_return) const;
+private:
+ Store(const coll_t& coll, const ghobject_t& oid, ObjectStore* store);
+ std::vector<ceph::buffer::list> get_errors(const std::string& start, const std::string& end,
+ uint64_t max_return) const;
+private:
+ const coll_t coll;
+ const ghobject_t hoid;
+ // a temp object holding mappings from seq-id to inconsistencies found in
+ // scrubbing
+ OSDriver driver;
+ mutable MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+ std::map<std::string, ceph::buffer::list> results;
+};
+}
+
+#endif // CEPH_SCRUB_RESULT_H
diff --git a/src/osd/Session.cc b/src/osd/Session.cc
new file mode 100644
index 000000000..454e1b857
--- /dev/null
+++ b/src/osd/Session.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "PG.h"
+#include "Session.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+
+using std::map;
+using std::set;
+
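+// Detach every backoff owned by this session: swap the map out under
+// backoff_lock, then clean each Backoff under its own lock and unlink the PG
+// side if it is still attached (see the lock ordering notes in Session.h).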
+void Session::clear_backoffs()
+{
+ map<spg_t,map<hobject_t,set<ceph::ref_t<Backoff>>>> ls;
+ {
+ std::lock_guard l(backoff_lock);
+ ls.swap(backoffs);
+ backoff_count = 0;
+ }
+ for (auto& i : ls) {
+ for (auto& p : i.second) {
+ for (auto& b : p.second) {
+ std::lock_guard l(b->lock);
+ if (b->pg) {
+ ceph_assert(b->session == this);
+ ceph_assert(b->is_new() || b->is_acked());
+ b->pg->rm_backoff(b);
+ b->pg.reset();
+ b->session.reset();
+ } else if (b->session) {
+ ceph_assert(b->session == this);
+ ceph_assert(b->is_deleting());
+ b->session.reset();
+ }
+ }
+ }
+ }
+}
+
+void Session::ack_backoff(
+ CephContext *cct,
+ spg_t pgid,
+ uint64_t id,
+ const hobject_t& begin,
+ const hobject_t& end)
+{
+ std::lock_guard l(backoff_lock);
+ auto p = backoffs.find(pgid);
+ if (p == backoffs.end()) {
+ dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << ","
+ << end << ") pg not found" << dendl;
+ return;
+ }
+ auto q = p->second.find(begin);
+ if (q == p->second.end()) {
+ dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << ","
+ << end << ") begin not found" << dendl;
+ return;
+ }
+ for (auto i = q->second.begin(); i != q->second.end(); ++i) {
+ Backoff *b = (*i).get();
+ if (b->id == id) {
+ if (b->is_new()) {
+ b->state = Backoff::STATE_ACKED;
+ dout(20) << __func__ << " now " << *b << dendl;
+ } else if (b->is_deleting()) {
+ dout(20) << __func__ << " deleting " << *b << dendl;
+ q->second.erase(i);
+ --backoff_count;
+ }
+ break;
+ }
+ }
+ if (q->second.empty()) {
+ dout(20) << __func__ << " clearing begin bin " << q->first << dendl;
+ p->second.erase(q);
+ if (p->second.empty()) {
+ dout(20) << __func__ << " clearing pg bin " << p->first << dendl;
+ backoffs.erase(p);
+ }
+ }
+ ceph_assert(!backoff_count == backoffs.empty());
+}
+
+bool Session::check_backoff(
+ CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m)
+{
+ auto b = have_backoff(pgid, oid);
+ if (b) {
+ dout(10) << __func__ << " session " << this << " has backoff " << *b
+ << " for " << *m << dendl;
+ ceph_assert(!b->is_acked() || !g_conf()->osd_debug_crash_on_ignored_backoff);
+ return true;
+ }
+ // we may race with ms_handle_reset. it clears session->con before removing
+ // backoffs, so if we see con is cleared here we have to abort this
+ // request.
+ if (!con) {
+ dout(10) << __func__ << " session " << this << " disconnected" << dendl;
+ return true;
+ }
+ return false;
+}
diff --git a/src/osd/Session.h b/src/osd/Session.h
new file mode 100644
index 000000000..a42d37bfe
--- /dev/null
+++ b/src/osd/Session.h
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_SESSION_H
+#define CEPH_OSD_SESSION_H
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_mutex.h"
+#include "global/global_context.h"
+#include "include/spinlock.h"
+#include "OSDCap.h"
+#include "Watch.h"
+#include "OSDMap.h"
+#include "PeeringState.h"
+
+//#define PG_DEBUG_REFS
+
+class PG;
+#ifdef PG_DEBUG_REFS
+#include "common/tracked_int_ptr.hpp"
+typedef TrackedIntPtr<PG> PGRef;
+#else
+typedef boost::intrusive_ptr<PG> PGRef;
+#endif
+
+/*
+ * A Backoff represents one instance of either a PG or an OID
+ * being plugged at the client. It's refcounted and linked from
+ * the PG {pg_oid}_backoffs map and from the client Session
+ * object.
+ *
+ * The Backoff has a lock that protects its internal fields.
+ *
+ * The PG has a backoff_lock that protects its maps to Backoffs.
+ * This lock is *inside* Backoff::lock.
+ *
+ * The Session has a backoff_lock that protects its map of pg and
+ * oid backoffs.  This lock is *inside* the Backoff::lock *and*
+ * PG::backoff_lock.
+ *
+ * That's
+ *
+ * Backoff::lock
+ * PG::backoff_lock
+ * Session::backoff_lock
+ *
+ * When the Session goes away, we move our backoff lists aside,
+ * then we lock each of the Backoffs we
+ * previously referenced and clear the Session* pointer. If the PG
+ * is still linked, we unlink it, too.
+ *
+ * When the PG clears the backoff, it will send an unblock message
+ * if the Session* is still non-null, and unlink the session.
+ *
+ */
+
+struct Backoff : public RefCountedObject {
+ enum {
+ STATE_NEW = 1, ///< backoff in flight to client
+ STATE_ACKED = 2, ///< backoff acked
+ STATE_DELETING = 3 ///< backoff deleted, but un-acked
+ };
+ std::atomic<int> state = {STATE_NEW};
+ spg_t pgid; ///< owning pgid
+ uint64_t id = 0; ///< unique id (within the Session)
+
+ bool is_new() const {
+ return state.load() == STATE_NEW;
+ }
+ bool is_acked() const {
+ return state.load() == STATE_ACKED;
+ }
+ bool is_deleting() const {
+ return state.load() == STATE_DELETING;
+ }
+ const char *get_state_name() const {
+ switch (state.load()) {
+ case STATE_NEW: return "new";
+ case STATE_ACKED: return "acked";
+ case STATE_DELETING: return "deleting";
+ default: return "???";
+ }
+ }
+
+ ceph::mutex lock = ceph::make_mutex("Backoff::lock");
+ // NOTE: the owning PG and session are either
+ // - *both* set, or
+ // - both null (teardown), or
+ // - only session is set (and state == DELETING)
+ PGRef pg; ///< owning pg
+ ceph::ref_t<struct Session> session; ///< owning session
+ hobject_t begin, end; ///< [) range to block, unless ==, then single obj
+
+ friend ostream& operator<<(ostream& out, const Backoff& b) {
+ return out << "Backoff(" << &b << " " << b.pgid << " " << b.id
+ << " " << b.get_state_name()
+ << " [" << b.begin << "," << b.end << ") "
+ << " session " << b.session
+ << " pg " << b.pg << ")";
+ }
+
+private:
+ FRIEND_MAKE_REF(Backoff);
+ Backoff(spg_t pgid, PGRef pg, ceph::ref_t<Session> s,
+ uint64_t i,
+ const hobject_t& b, const hobject_t& e)
+ : RefCountedObject(g_ceph_context),
+ pgid(pgid),
+ id(i),
+ pg(pg),
+ session(std::move(s)),
+ begin(b),
+ end(e) {}
+};
+
+
+
+struct Session : public RefCountedObject {
+ EntityName entity_name;
+ OSDCap caps;
+ ConnectionRef con;
+ entity_addr_t socket_addr;
+ WatchConState wstate;
+
+ ceph::mutex session_dispatch_lock =
+ ceph::make_mutex("Session::session_dispatch_lock");
+ boost::intrusive::list<OpRequest> waiting_on_map;
+
+ ceph::spinlock sent_epoch_lock;
+ epoch_t last_sent_epoch = 0;
+
+ /// protects backoffs; orders inside Backoff::lock *and* PG::backoff_lock
+ ceph::mutex backoff_lock = ceph::make_mutex("Session::backoff_lock");
+ std::atomic<int> backoff_count= {0}; ///< simple count of backoffs
+ std::map<spg_t, std::map<hobject_t, std::set<ceph::ref_t<Backoff>>>> backoffs;
+
+ std::atomic<uint64_t> backoff_seq = {0};
+
+ // for heartbeat connections only
+ int peer = -1;
+ HeartbeatStampsRef stamps;
+
+ entity_addr_t& get_peer_socket_addr() {
+ return socket_addr;
+ }
+
+ void ack_backoff(
+ CephContext *cct,
+ spg_t pgid,
+ uint64_t id,
+ const hobject_t& start,
+ const hobject_t& end);
+
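+  // Return the backoff whose [begin,end) range covers oid (or whose begin
+  // equals oid for a single-object backoff), or nullptr if none applies.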
+ ceph::ref_t<Backoff> have_backoff(spg_t pgid, const hobject_t& oid) {
+ if (!backoff_count.load()) {
+ return nullptr;
+ }
+ std::lock_guard l(backoff_lock);
+ ceph_assert(!backoff_count == backoffs.empty());
+ auto i = backoffs.find(pgid);
+ if (i == backoffs.end()) {
+ return nullptr;
+ }
+ auto p = i->second.lower_bound(oid);
+ if (p != i->second.begin() &&
+ (p == i->second.end() || p->first > oid)) {
+ --p;
+ }
+ if (p != i->second.end()) {
+ int r = cmp(oid, p->first);
+ if (r == 0 || r > 0) {
+ for (auto& q : p->second) {
+ if (r == 0 || oid < q->end) {
+ return &(*q);
+ }
+ }
+ }
+ }
+ return nullptr;
+ }
+
+ bool check_backoff(
+ CephContext *cct, spg_t pgid, const hobject_t& oid, const Message *m);
+
+ void add_backoff(ceph::ref_t<Backoff> b) {
+ std::lock_guard l(backoff_lock);
+ ceph_assert(!backoff_count == backoffs.empty());
+ backoffs[b->pgid][b->begin].insert(std::move(b));
+ ++backoff_count;
+ }
+
+ // called by PG::release_*_backoffs and PG::clear_backoffs()
+ void rm_backoff(const ceph::ref_t<Backoff>& b) {
+ std::lock_guard l(backoff_lock);
+ ceph_assert(ceph_mutex_is_locked_by_me(b->lock));
+ ceph_assert(b->session == this);
+ auto i = backoffs.find(b->pgid);
+ if (i != backoffs.end()) {
+ // may race with clear_backoffs()
+ auto p = i->second.find(b->begin);
+ if (p != i->second.end()) {
+ auto q = p->second.find(b);
+ if (q != p->second.end()) {
+ p->second.erase(q);
+ --backoff_count;
+ if (p->second.empty()) {
+ i->second.erase(p);
+ if (i->second.empty()) {
+ backoffs.erase(i);
+ }
+ }
+ }
+ }
+ }
+ ceph_assert(!backoff_count == backoffs.empty());
+ }
+ void clear_backoffs();
+
+private:
+ FRIEND_MAKE_REF(Session);
+ explicit Session(CephContext *cct, Connection *con_) :
+ RefCountedObject(cct),
+ con(con_),
+ socket_addr(con_->get_peer_socket_addr()),
+ wstate(cct)
+ {}
+};
+
+#endif
diff --git a/src/osd/SnapMapper.cc b/src/osd/SnapMapper.cc
new file mode 100644
index 000000000..804213b1f
--- /dev/null
+++ b/src/osd/SnapMapper.cc
@@ -0,0 +1,752 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "SnapMapper.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << "snap_mapper."
+
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::timespan_str;
+
+const string SnapMapper::LEGACY_MAPPING_PREFIX = "MAP_";
+const string SnapMapper::MAPPING_PREFIX = "SNA_";
+const string SnapMapper::OBJECT_PREFIX = "OBJ_";
+
+const char *SnapMapper::PURGED_SNAP_PREFIX = "PSN_";
+
+/*
+
+ We have a bidirectional mapping, (1) from each snap+obj to object,
+ sorted by snapshot, such that we can enumerate to identify all clones
+ mapped to a particular snapshot, and (2) from object to snaps, so we
+ can identify which reverse mappings exist for any given object (and,
+ e.g., clean up on deletion).
+
+ "MAP_"
+ + ("%016x" % snapid)
+ + "_"
+ + (".%x" % shard_id)
+ + "_"
+ + hobject_t::to_str() ("%llx.%8x.%lx.name...." % pool, hash, snap)
+ -> SnapMapping::Mapping { snap, hoid }
+
+ "SNA_"
+ + ("%lld" % poolid)
+ + "_"
+ + ("%016x" % snapid)
+ + "_"
+ + (".%x" % shard_id)
+ + "_"
+ + hobject_t::to_str() ("%llx.%8x.%lx.name...." % pool, hash, snap)
+ -> SnapMapping::Mapping { snap, hoid }
+
+ "OBJ_" +
+ + (".%x" % shard_id)
+ + hobject_t::to_str()
+ -> SnapMapper::object_snaps { oid, set<snapid_t> }
+
+ */
+
+int OSDriver::get_keys(
+ const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *out)
+{
+ return os->omap_get_values(ch, hoid, keys, out);
+}
+
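+// Return the first omap key strictly greater than 'key', or -ENOENT once the
+// iterator is exhausted.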
+int OSDriver::get_next(
+ const std::string &key,
+ pair<std::string, bufferlist> *next)
+{
+ ObjectMap::ObjectMapIterator iter =
+ os->get_omap_iterator(ch, hoid);
+ if (!iter) {
+ ceph_abort();
+ return -EINVAL;
+ }
+ iter->upper_bound(key);
+ if (iter->valid()) {
+ if (next)
+ *next = make_pair(iter->key(), iter->value());
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
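+// Build the "SNA_<pool>_<snap hex>_" key prefix; all mappings for a given
+// pool and snapshot share this prefix.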
+string SnapMapper::get_prefix(int64_t pool, snapid_t snap)
+{
+ char buf[100];
+ int len = snprintf(
+ buf, sizeof(buf),
+ "%lld_%.*X_",
+ (long long)pool,
+ (int)(sizeof(snap)*2), static_cast<unsigned>(snap));
+ return MAPPING_PREFIX + string(buf, len);
+}
+
+string SnapMapper::to_raw_key(
+ const pair<snapid_t, hobject_t> &in)
+{
+ return get_prefix(in.second.pool, in.first) + shard_prefix + in.second.to_str();
+}
+
+pair<string, bufferlist> SnapMapper::to_raw(
+ const pair<snapid_t, hobject_t> &in)
+{
+ bufferlist bl;
+ encode(Mapping(in), bl);
+ return make_pair(
+ to_raw_key(in),
+ bl);
+}
+
+pair<snapid_t, hobject_t> SnapMapper::from_raw(
+ const pair<std::string, bufferlist> &image)
+{
+ using ceph::decode;
+ Mapping map;
+ bufferlist bl(image.second);
+ auto bp = bl.cbegin();
+ decode(map, bp);
+ return make_pair(map.snap, map.hoid);
+}
+
+bool SnapMapper::is_mapping(const string &to_test)
+{
+ return to_test.substr(0, MAPPING_PREFIX.size()) == MAPPING_PREFIX;
+}
+
+string SnapMapper::to_object_key(const hobject_t &hoid)
+{
+ return OBJECT_PREFIX + shard_prefix + hoid.to_str();
+}
+
+void SnapMapper::object_snaps::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(oid, bl);
+ encode(snaps, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapMapper::object_snaps::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(oid, bl);
+ decode(snaps, bl);
+ DECODE_FINISH(bl);
+}
+
+bool SnapMapper::check(const hobject_t &hoid) const
+{
+ if (hoid.match(mask_bits, match)) {
+ return true;
+ }
+ derr << __func__ << " " << hoid << " mask_bits " << mask_bits
+ << " match 0x" << std::hex << match << std::dec << " is false"
+ << dendl;
+ return false;
+}
+
+int SnapMapper::get_snaps(
+ const hobject_t &oid,
+ object_snaps *out)
+{
+ ceph_assert(check(oid));
+ set<string> keys;
+ map<string, bufferlist> got;
+ keys.insert(to_object_key(oid));
+ int r = backend.get_keys(keys, &got);
+ if (r < 0) {
+ dout(20) << __func__ << " " << oid << " got err " << r << dendl;
+ return r;
+ }
+ if (got.empty()) {
+ dout(20) << __func__ << " " << oid << " got.empty()" << dendl;
+ return -ENOENT;
+ }
+ if (out) {
+ auto bp = got.begin()->second.cbegin();
+ decode(*out, bp);
+ dout(20) << __func__ << " " << oid << " " << out->snaps << dendl;
+ if (out->snaps.empty()) {
+ dout(1) << __func__ << " " << oid << " empty snapset" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ }
+ } else {
+ dout(20) << __func__ << " " << oid << " (out == NULL)" << dendl;
+ }
+ return 0;
+}
+
+void SnapMapper::clear_snaps(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ ceph_assert(check(oid));
+ set<string> to_remove;
+ to_remove.insert(to_object_key(oid));
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+}
+
+void SnapMapper::set_snaps(
+ const hobject_t &oid,
+ const object_snaps &in,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ ceph_assert(check(oid));
+ map<string, bufferlist> to_set;
+ bufferlist bl;
+ encode(in, bl);
+ to_set[to_object_key(oid)] = bl;
+ dout(20) << __func__ << " " << oid << " " << in.snaps << dendl;
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_set) {
+ dout(20) << __func__ << " set " << i.first << dendl;
+ }
+ }
+ backend.set_keys(to_set, t);
+}
+
+int SnapMapper::update_snaps(
+ const hobject_t &oid,
+ const set<snapid_t> &new_snaps,
+ const set<snapid_t> *old_snaps_check,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << " " << new_snaps
+ << " was " << (old_snaps_check ? *old_snaps_check : set<snapid_t>())
+ << dendl;
+ ceph_assert(check(oid));
+ if (new_snaps.empty())
+ return remove_oid(oid, t);
+
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ // Tolerate missing keys but not disk errors
+ if (r < 0 && r != -ENOENT)
+ return r;
+ if (old_snaps_check)
+ ceph_assert(out.snaps == *old_snaps_check);
+
+ object_snaps in(oid, new_snaps);
+ set_snaps(oid, in, t);
+
+ set<string> to_remove;
+ for (set<snapid_t>::iterator i = out.snaps.begin();
+ i != out.snaps.end();
+ ++i) {
+ if (!new_snaps.count(*i)) {
+ to_remove.insert(to_raw_key(make_pair(*i, oid)));
+ }
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+ return 0;
+}
+
+void SnapMapper::add_oid(
+ const hobject_t &oid,
+ const set<snapid_t>& snaps,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << " " << snaps << dendl;
+ ceph_assert(!snaps.empty());
+ ceph_assert(check(oid));
+ {
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r != -ENOENT) {
+ derr << __func__ << " found existing snaps mapped on " << oid
+ << ", removing" << dendl;
+ ceph_assert(!cct->_conf->osd_debug_verify_snaps);
+ remove_oid(oid, t);
+ }
+ }
+
+ object_snaps _snaps(oid, snaps);
+ set_snaps(oid, _snaps, t);
+
+ map<string, bufferlist> to_add;
+ for (set<snapid_t>::iterator i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ to_add.insert(to_raw(make_pair(*i, oid)));
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_add) {
+ dout(20) << __func__ << " set " << i.first << dendl;
+ }
+ }
+ backend.set_keys(to_add, t);
+}
+
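+// Scan each of this mapper's prefixes for keys belonging to 'snap' and return
+// up to 'max' objects still needing a trim; -ENOENT when none remain.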
+int SnapMapper::get_next_objects_to_trim(
+ snapid_t snap,
+ unsigned max,
+ vector<hobject_t> *out)
+{
+ ceph_assert(out);
+ ceph_assert(out->empty());
+
+  // if max were 0 we would return -ENOENT, and the caller would mistakenly
+  // trim the snaptrim queue
+ ceph_assert(max > 0);
+ int r = 0;
+ for (set<string>::iterator i = prefixes.begin();
+ i != prefixes.end() && out->size() < max && r == 0;
+ ++i) {
+ string prefix(get_prefix(pool, snap) + *i);
+ string pos = prefix;
+ while (out->size() < max) {
+ pair<string, bufferlist> next;
+ r = backend.get_next(pos, &next);
+ dout(20) << __func__ << " get_next(" << pos << ") returns " << r
+ << " " << next << dendl;
+ if (r != 0) {
+ break; // Done
+ }
+
+ if (next.first.substr(0, prefix.size()) !=
+ prefix) {
+ break; // Done with this prefix
+ }
+
+ ceph_assert(is_mapping(next.first));
+
+ dout(20) << __func__ << " " << next.first << dendl;
+ pair<snapid_t, hobject_t> next_decoded(from_raw(next));
+ ceph_assert(next_decoded.first == snap);
+ ceph_assert(check(next_decoded.second));
+
+ out->push_back(next_decoded.second);
+ pos = next.first;
+ }
+ }
+ if (out->size() == 0) {
+ return -ENOENT;
+ } else {
+ return 0;
+ }
+}
+
+
+int SnapMapper::remove_oid(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ ceph_assert(check(oid));
+ return _remove_oid(oid, t);
+}
+
+int SnapMapper::_remove_oid(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, bufferlist> *t)
+{
+ dout(20) << __func__ << " " << oid << dendl;
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r < 0)
+ return r;
+
+ clear_snaps(oid, t);
+
+ set<string> to_remove;
+ for (set<snapid_t>::iterator i = out.snaps.begin();
+ i != out.snaps.end();
+ ++i) {
+ to_remove.insert(to_raw_key(make_pair(*i, oid)));
+ }
+ if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
+ for (auto& i : to_remove) {
+ dout(20) << __func__ << " rm " << i << dendl;
+ }
+ }
+ backend.remove_keys(to_remove, t);
+ return 0;
+}
+
+int SnapMapper::get_snaps(
+ const hobject_t &oid,
+ std::set<snapid_t> *snaps)
+{
+ ceph_assert(check(oid));
+ object_snaps out;
+ int r = get_snaps(oid, &out);
+ if (r < 0)
+ return r;
+ if (snaps)
+ snaps->swap(out.snaps);
+ return 0;
+}
+
+
+// -- purged snaps --
+
+string SnapMapper::make_purged_snap_key(int64_t pool, snapid_t last)
+{
+ char k[80];
+ snprintf(k, sizeof(k), "%s_%llu_%016llx", PURGED_SNAP_PREFIX,
+ (unsigned long long)pool, (unsigned long long)last);
+ return k;
+}
+
+void SnapMapper::make_purged_snap_key_value(
+ int64_t pool, snapid_t begin, snapid_t end, map<string,bufferlist> *m)
+{
+ string k = make_purged_snap_key(pool, end - 1);
+ auto& v = (*m)[k];
+ ceph::encode(pool, v);
+ ceph::encode(begin, v);
+ ceph::encode(end, v);
+}
+
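+// Look up the purged-snap interval containing 'snap'. Keys are indexed by the
+// last snapid in the interval, so lower_bound() lands on the candidate record;
+// returns -ENOENT if no recorded interval overlaps 'snap'.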
+int SnapMapper::_lookup_purged_snap(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const ghobject_t& hoid,
+ int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end)
+{
+ string k = make_purged_snap_key(pool, snap);
+ auto it = store->get_omap_iterator(ch, hoid);
+ it->lower_bound(k);
+ if (!it->valid()) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " key '" << k << "' lower_bound not found" << dendl;
+ return -ENOENT;
+ }
+ if (it->key().find(PURGED_SNAP_PREFIX) != 0) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " key '" << k << "' lower_bound got mismatched prefix '"
+ << it->key() << "'" << dendl;
+ return -ENOENT;
+ }
+ bufferlist v = it->value();
+ auto p = v.cbegin();
+ int64_t gotpool;
+ decode(gotpool, p);
+ decode(*begin, p);
+ decode(*end, p);
+ if (snap < *begin || snap >= *end) {
+ dout(20) << __func__ << " pool " << pool << " snap " << snap
+ << " found [" << *begin << "," << *end << "), no overlap" << dendl;
+ return -ENOENT;
+ }
+ return 0;
+}
+
+void SnapMapper::record_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ ObjectStore::Transaction *t,
+ map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps)
+{
+ dout(10) << __func__ << " purged_snaps " << purged_snaps << dendl;
+ map<string,bufferlist> m;
+ set<string> rm;
+ for (auto& [epoch, bypool] : purged_snaps) {
+ // index by (pool, snap)
+ for (auto& [pool, snaps] : bypool) {
+ for (auto i = snaps.begin();
+ i != snaps.end();
+ ++i) {
+ snapid_t begin = i.get_start();
+ snapid_t end = i.get_end();
+ snapid_t before_begin, before_end;
+ snapid_t after_begin, after_end;
+ int b = _lookup_purged_snap(cct, store, ch, hoid,
+ pool, begin - 1, &before_begin, &before_end);
+ int a = _lookup_purged_snap(cct, store, ch, hoid,
+ pool, end, &after_begin, &after_end);
+ if (!b && !a) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - joins ["
+ << before_begin << "," << before_end << ") and ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // erase only the begin record; we'll overwrite the end one
+ rm.insert(make_purged_snap_key(pool, before_end - 1));
+ make_purged_snap_key_value(pool, before_begin, after_end, &m);
+ } else if (!b) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - join with earlier ["
+ << before_begin << "," << before_end << ")" << dendl;
+ rm.insert(make_purged_snap_key(pool, before_end - 1));
+ make_purged_snap_key_value(pool, before_begin, end, &m);
+ } else if (!a) {
+ dout(10) << __func__
+ << " [" << begin << "," << end << ") - join with later ["
+ << after_begin << "," << after_end << ")" << dendl;
+ // overwrite after record
+ make_purged_snap_key_value(pool, begin, after_end, &m);
+ } else {
+ make_purged_snap_key_value(pool, begin, end, &m);
+ }
+ }
+ }
+ }
+ t->omap_rmkeys(ch->cid, hoid, rm);
+ t->omap_setkeys(ch->cid, hoid, m);
+ dout(10) << __func__ << " rm " << rm.size() << " keys, set " << m.size()
+ << " keys" << dendl;
+}
+
+
+bool SnapMapper::Scrubber::_parse_p()
+{
+ if (!psit->valid()) {
+ pool = -1;
+ return false;
+ }
+ if (psit->key().find(PURGED_SNAP_PREFIX) != 0) {
+ pool = -1;
+ return false;
+ }
+ bufferlist v = psit->value();
+ auto p = v.cbegin();
+ ceph::decode(pool, p);
+ ceph::decode(begin, p);
+ ceph::decode(end, p);
+ dout(20) << __func__ << " purged_snaps pool " << pool
+ << " [" << begin << "," << end << ")" << dendl;
+ psit->next();
+ return true;
+}
+
+bool SnapMapper::Scrubber::_parse_m()
+{
+ if (!mapit->valid()) {
+ return false;
+ }
+ if (mapit->key().find(MAPPING_PREFIX) != 0) {
+ return false;
+ }
+ auto v = mapit->value();
+ auto p = v.cbegin();
+ mapping.decode(p);
+
+ {
+ unsigned long long p, s;
+ long sh;
+ string k = mapit->key();
+ int r = sscanf(k.c_str(), "SNA_%lld_%llx.%lx", &p, &s, &sh);
+ if (r != 1) {
+ shard = shard_id_t::NO_SHARD;
+ } else {
+ shard = shard_id_t(sh);
+ }
+ }
+ dout(20) << __func__ << " mapping pool " << mapping.hoid.pool
+ << " snap " << mapping.snap
+ << " shard " << shard
+ << " " << mapping.hoid << dendl;
+ mapit->next();
+ return true;
+}
+
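+// Walk the purged-snaps intervals and the snap mappings in parallel, and
+// collect any mapping that falls inside a purged interval as a stray entry.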
+void SnapMapper::Scrubber::run()
+{
+ dout(10) << __func__ << dendl;
+
+ psit = store->get_omap_iterator(ch, purged_snaps_hoid);
+ psit->upper_bound(PURGED_SNAP_PREFIX);
+ _parse_p();
+
+ mapit = store->get_omap_iterator(ch, mapping_hoid);
+ mapit->upper_bound(MAPPING_PREFIX);
+
+ while (_parse_m()) {
+ // advance to next purged_snaps range?
+ while (pool >= 0 &&
+ (mapping.hoid.pool > pool ||
+ (mapping.hoid.pool == pool && mapping.snap >= end))) {
+ _parse_p();
+ }
+ if (pool < 0) {
+ dout(10) << __func__ << " passed final purged_snaps interval, rest ok"
+ << dendl;
+ break;
+ }
+ if (mapping.hoid.pool < pool ||
+ mapping.snap < begin) {
+ // ok
+ dout(20) << __func__ << " ok " << mapping.hoid
+ << " snap " << mapping.snap
+ << " precedes pool " << pool
+ << " purged_snaps [" << begin << "," << end << ")" << dendl;
+ } else {
+ assert(mapping.snap >= begin &&
+ mapping.snap < end &&
+ mapping.hoid.pool == pool);
+ // invalid
+ dout(10) << __func__ << " stray " << mapping.hoid
+ << " snap " << mapping.snap
+ << " in pool " << pool
+ << " shard " << shard
+ << " purged_snaps [" << begin << "," << end << ")" << dendl;
+ stray.emplace_back(std::tuple<int64_t,snapid_t,uint32_t,shard_id_t>(
+ pool, mapping.snap, mapping.hoid.get_hash(),
+ shard
+ ));
+ }
+ }
+
+ dout(10) << __func__ << " end, found " << stray.size() << " stray" << dendl;
+ psit = ObjectMap::ObjectMapIterator();
+ mapit = ObjectMap::ObjectMapIterator();
+}
+
+
+// -------------------------------------
+// legacy conversion/support
+
+string SnapMapper::get_legacy_prefix(snapid_t snap)
+{
+ char buf[100];
+ int len = snprintf(
+ buf, sizeof(buf),
+ "%.*X_",
+ (int)(sizeof(snap)*2), static_cast<unsigned>(snap));
+ return LEGACY_MAPPING_PREFIX + string(buf, len);
+}
+
+string SnapMapper::to_legacy_raw_key(
+ const pair<snapid_t, hobject_t> &in)
+{
+ return get_legacy_prefix(in.first) + shard_prefix + in.second.to_str();
+}
+
+bool SnapMapper::is_legacy_mapping(const string &to_test)
+{
+ return to_test.substr(0, LEGACY_MAPPING_PREFIX.size()) ==
+ LEGACY_MAPPING_PREFIX;
+}
+
+/* Octopus modified the SnapMapper key format from
+ *
+ * <LEGACY_MAPPING_PREFIX><snapid>_<shardid>_<hobject_t::to_str()>
+ *
+ * to
+ *
+ * <MAPPING_PREFIX><pool>_<snapid>_<shardid>_<hobject_t::to_str()>
+ *
+ * We can't reconstruct the new key format just from the value since the
+ * Mapping object contains an hobject rather than a ghobject. Instead,
+ * we exploit the fact that the new format is identical starting at <snapid>.
+ *
+ * Note that the original version of this conversion introduced in 94ebe0ea
+ * had a crucial bug which essentially destroyed legacy keys by mapping
+ * them to
+ *
+ * <MAPPING_PREFIX><poolid>_<snapid>_
+ *
+ * without the object-unique suffix.
+ * See https://tracker.ceph.com/issues/56147
+ */
+std::string SnapMapper::convert_legacy_key(
+ const std::string& old_key,
+ const bufferlist& value)
+{
+ auto old = from_raw(make_pair(old_key, value));
+ std::string object_suffix = old_key.substr(
+ SnapMapper::LEGACY_MAPPING_PREFIX.length());
+ return SnapMapper::MAPPING_PREFIX + std::to_string(old.second.pool)
+ + "_" + object_suffix;
+}
+
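+// Rewrite legacy "MAP_" keys into the current "SNA_" format in batches of at
+// most 'max' keys per transaction, then remove the whole legacy key range.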
+int SnapMapper::convert_legacy(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ unsigned max)
+{
+ uint64_t n = 0;
+
+ ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(ch, hoid);
+ if (!iter) {
+ return -EIO;
+ }
+
+ auto start = ceph::mono_clock::now();
+
+ iter->upper_bound(SnapMapper::LEGACY_MAPPING_PREFIX);
+ map<string,bufferlist> to_set;
+ while (iter->valid()) {
+ bool valid = SnapMapper::is_legacy_mapping(iter->key());
+ if (valid) {
+ to_set.emplace(
+ convert_legacy_key(iter->key(), iter->value()),
+ iter->value());
+ ++n;
+ iter->next();
+ }
+ if (!valid || !iter->valid() || to_set.size() >= max) {
+ ObjectStore::Transaction t;
+ t.omap_setkeys(ch->cid, hoid, to_set);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ to_set.clear();
+ if (!valid) {
+ break;
+ }
+ dout(10) << __func__ << " converted " << n << " keys" << dendl;
+ }
+ }
+
+ auto end = ceph::mono_clock::now();
+
+ dout(1) << __func__ << " converted " << n << " keys in "
+ << timespan_str(end - start) << dendl;
+
+ // remove the old keys
+ {
+ ObjectStore::Transaction t;
+ string end = SnapMapper::LEGACY_MAPPING_PREFIX;
+ ++end[end.size()-1]; // turn _ to whatever comes after _
+ t.omap_rmkeyrange(ch->cid, hoid,
+ SnapMapper::LEGACY_MAPPING_PREFIX,
+ end);
+ int r = store->queue_transaction(ch, std::move(t));
+ ceph_assert(r == 0);
+ }
+ return 0;
+}
diff --git a/src/osd/SnapMapper.h b/src/osd/SnapMapper.h
new file mode 100644
index 000000000..90b0c7c8d
--- /dev/null
+++ b/src/osd/SnapMapper.h
@@ -0,0 +1,338 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef SNAPMAPPER_H
+#define SNAPMAPPER_H
+
+#include <string>
+#include <set>
+#include <utility>
+#include <cstring>
+
+#include "common/map_cacher.hpp"
+#include "common/hobject.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/object.h"
+#include "os/ObjectStore.h"
+#include "osd/OSDMap.h"
+
+class OSDriver : public MapCacher::StoreDriver<std::string, ceph::buffer::list> {
+ ObjectStore *os;
+ ObjectStore::CollectionHandle ch;
+ ghobject_t hoid;
+
+public:
+ class OSTransaction : public MapCacher::Transaction<std::string, ceph::buffer::list> {
+ friend class OSDriver;
+ coll_t cid;
+ ghobject_t hoid;
+ ObjectStore::Transaction *t;
+ OSTransaction(
+ const coll_t &cid,
+ const ghobject_t &hoid,
+ ObjectStore::Transaction *t)
+ : cid(cid), hoid(hoid), t(t) {}
+ public:
+ void set_keys(
+ const std::map<std::string, ceph::buffer::list> &to_set) override {
+ t->omap_setkeys(cid, hoid, to_set);
+ }
+ void remove_keys(
+ const std::set<std::string> &to_remove) override {
+ t->omap_rmkeys(cid, hoid, to_remove);
+ }
+ void add_callback(
+ Context *c) override {
+ t->register_on_applied(c);
+ }
+ };
+
+ OSTransaction get_transaction(
+ ObjectStore::Transaction *t) {
+ return OSTransaction(ch->cid, hoid, t);
+ }
+
+ OSDriver(ObjectStore *os, const coll_t& cid, const ghobject_t &hoid) :
+ os(os),
+ hoid(hoid) {
+ ch = os->open_collection(cid);
+ }
+ int get_keys(
+ const std::set<std::string> &keys,
+ std::map<std::string, ceph::buffer::list> *out) override;
+ int get_next(
+ const std::string &key,
+ std::pair<std::string, ceph::buffer::list> *next) override;
+};
+
+/**
+ * SnapMapper
+ *
+ * Manages two mappings:
+ * 1) hobject_t -> {snapid}
+ * 2) snapid -> {hobject_t}
+ *
+ * We accomplish this using two sets of keys:
+ * 1) OBJECT_PREFIX + obj.str() -> encoding of object_snaps
+ * 2) MAPPING_PREFIX + poolid + snapid_t + obj.str() -> encoding of std::pair<snapid_t, obj>
+ *
+ * The on disk strings and encodings are implemented in to_raw, to_raw_key,
+ * from_raw, to_object_key.
+ *
+ * The object -> {snapid} mapping is primarily included so that the
+ * SnapMapper state can be verified against the external PG state during
+ * scrub etc.
+ *
+ * The 2) mapping is arranged such that all objects in a particular
+ * snap will sort together, and so that all objects in a pg for a
+ * particular snap will group under up to 8 prefixes.
+ */
+class SnapMapper {
+ friend class MapperVerifier;
+public:
+ CephContext* cct;
+ struct object_snaps {
+ hobject_t oid;
+ std::set<snapid_t> snaps;
+ object_snaps(hobject_t oid, const std::set<snapid_t> &snaps)
+ : oid(oid), snaps(snaps) {}
+ object_snaps() {}
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bp);
+ };
+
+ struct Mapping {
+ snapid_t snap;
+ hobject_t hoid;
+ explicit Mapping(const std::pair<snapid_t, hobject_t> &in)
+ : snap(in.first), hoid(in.second) {}
+ Mapping() : snap(0) {}
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(snap, bl);
+ encode(hoid, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode(snap, bl);
+ decode(hoid, bl);
+ DECODE_FINISH(bl);
+ }
+ };
+
+ static const std::string LEGACY_MAPPING_PREFIX;
+ static const std::string MAPPING_PREFIX;
+ static const std::string OBJECT_PREFIX;
+ static const char *PURGED_SNAP_EPOCH_PREFIX;
+ static const char *PURGED_SNAP_PREFIX;
+
+ struct Scrubber {
+ CephContext *cct;
+ ObjectStore *store;
+ ObjectStore::CollectionHandle ch;
+ ghobject_t mapping_hoid;
+ ghobject_t purged_snaps_hoid;
+
+ ObjectMap::ObjectMapIterator psit;
+ int64_t pool;
+ snapid_t begin, end;
+
+ bool _parse_p(); ///< advance the purged_snaps pointer
+
+ ObjectMap::ObjectMapIterator mapit;
+ Mapping mapping;
+ shard_id_t shard;
+
+ bool _parse_m(); ///< advance the (object) mapper pointer
+
+ std::vector<std::tuple<int64_t, snapid_t, uint32_t, shard_id_t>> stray;
+
+ Scrubber(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t mapping_hoid,
+ ghobject_t purged_snaps_hoid)
+ : cct(cct),
+ store(store),
+ ch(ch),
+ mapping_hoid(mapping_hoid),
+ purged_snaps_hoid(purged_snaps_hoid) {}
+
+ void run();
+ };
+
+ static std::string convert_legacy_key(
+ const std::string& old_key,
+ const bufferlist& value);
+
+ static int convert_legacy(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ unsigned max);
+
+ static void record_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t hoid,
+ ObjectStore::Transaction *t,
+ std::map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps);
+ static void scrub_purged_snaps(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ ghobject_t mapper_hoid,
+ ghobject_t purged_snaps_hoid);
+
+private:
+ static int _lookup_purged_snap(
+ CephContext *cct,
+ ObjectStore *store,
+ ObjectStore::CollectionHandle& ch,
+ const ghobject_t& hoid,
+ int64_t pool, snapid_t snap,
+ snapid_t *begin, snapid_t *end);
+ static void make_purged_snap_key_value(
+ int64_t pool, snapid_t begin,
+ snapid_t end, std::map<std::string,ceph::buffer::list> *m);
+ static std::string make_purged_snap_key(int64_t pool, snapid_t last);
+
+
+ MapCacher::MapCacher<std::string, ceph::buffer::list> backend;
+
+ static std::string get_legacy_prefix(snapid_t snap);
+ std::string to_legacy_raw_key(
+ const std::pair<snapid_t, hobject_t> &to_map);
+ static bool is_legacy_mapping(const std::string &to_test);
+
+ static std::string get_prefix(int64_t pool, snapid_t snap);
+ std::string to_raw_key(
+ const std::pair<snapid_t, hobject_t> &to_map);
+
+ std::pair<std::string, ceph::buffer::list> to_raw(
+ const std::pair<snapid_t, hobject_t> &to_map);
+
+ static bool is_mapping(const std::string &to_test);
+
+ static std::pair<snapid_t, hobject_t> from_raw(
+ const std::pair<std::string, ceph::buffer::list> &image);
+
+ std::string to_object_key(const hobject_t &hoid);
+
+ int get_snaps(const hobject_t &oid, object_snaps *out);
+
+ void set_snaps(
+ const hobject_t &oid,
+ const object_snaps &out,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t);
+
+ void clear_snaps(
+ const hobject_t &oid,
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t);
+
+ // True if hoid belongs in this mapping based on mask_bits and match
+ bool check(const hobject_t &hoid) const;
+
+ int _remove_oid(
+ const hobject_t &oid, ///< [in] oid to remove
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ );
+
+public:
+ static std::string make_shard_prefix(shard_id_t shard) {
+ if (shard == shard_id_t::NO_SHARD)
+ return std::string();
+ char buf[20];
+ int r = snprintf(buf, sizeof(buf), ".%x", (int)shard);
+ ceph_assert(r < (int)sizeof(buf));
+ return std::string(buf, r) + '_';
+ }
+ uint32_t mask_bits;
+ const uint32_t match;
+ std::string last_key_checked;
+ const int64_t pool;
+ const shard_id_t shard;
+ const std::string shard_prefix;
+ SnapMapper(
+ CephContext* cct,
+ MapCacher::StoreDriver<std::string, ceph::buffer::list> *driver,
+ uint32_t match, ///< [in] pgid
+ uint32_t bits, ///< [in] current split bits
+ int64_t pool, ///< [in] pool
+ shard_id_t shard ///< [in] shard
+ )
+ : cct(cct), backend(driver), mask_bits(bits), match(match), pool(pool),
+ shard(shard), shard_prefix(make_shard_prefix(shard)) {
+ update_bits(mask_bits);
+ }
+
+ std::set<std::string> prefixes;
+ /// Update bits in case of pg split or merge
+ void update_bits(
+ uint32_t new_bits ///< [in] new split bits
+ ) {
+ mask_bits = new_bits;
+ std::set<std::string> _prefixes = hobject_t::get_prefixes(
+ mask_bits,
+ match,
+ pool);
+ prefixes.clear();
+ for (auto i = _prefixes.begin(); i != _prefixes.end(); ++i) {
+ prefixes.insert(shard_prefix + *i);
+ }
+ }
+
+ /// Update snaps for oid, empty new_snaps removes the mapping
+ int update_snaps(
+ const hobject_t &oid, ///< [in] oid to update
+ const std::set<snapid_t> &new_snaps, ///< [in] new snap std::set
+ const std::set<snapid_t> *old_snaps, ///< [in] old snaps (for debugging)
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+    ); ///< @return error, 0 on success
+
+ /// Add mapping for oid, must not already be mapped
+ void add_oid(
+ const hobject_t &oid, ///< [in] oid to add
+ const std::set<snapid_t>& new_snaps, ///< [in] snaps
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ );
+
+ /// Returns first object with snap as a snap
+ int get_next_objects_to_trim(
+ snapid_t snap, ///< [in] snap to check
+ unsigned max, ///< [in] max to get
+ std::vector<hobject_t> *out ///< [out] next objects to trim (must be empty)
+ ); ///< @return error, -ENOENT if no more objects
+
+ /// Remove mapping for oid
+ int remove_oid(
+ const hobject_t &oid, ///< [in] oid to remove
+ MapCacher::Transaction<std::string, ceph::buffer::list> *t ///< [out] transaction
+ ); ///< @return error, -ENOENT if the object is not mapped
+
+ /// Get snaps for oid
+ int get_snaps(
+ const hobject_t &oid, ///< [in] oid to get snaps for
+ std::set<snapid_t> *snaps ///< [out] snaps
+ ); ///< @return error, -ENOENT if oid is not recorded
+};
+WRITE_CLASS_ENCODER(SnapMapper::object_snaps)
+WRITE_CLASS_ENCODER(SnapMapper::Mapping)
+
+#endif
diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h
new file mode 100644
index 000000000..28e1598a9
--- /dev/null
+++ b/src/osd/TierAgentState.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_TIERAGENT_H
+#define CEPH_OSD_TIERAGENT_H
+
+#include <ctime>
+#include <list>
+#include <map>
+#include <utility>
+
+#include "common/Formatter.h"
+#include "common/histogram.h"
+#include "common/hobject.h"
+
+#include "osd/HitSet.h"
+
+struct TierAgentState {
+ /// current position iterating across pool
+ hobject_t position;
+ /// Count of agent_work since "start" position of object hash space
+ int started;
+ hobject_t start;
+ bool delaying;
+
+ /// histogram of ages we've encountered
+ pow2_hist_t temp_hist;
+ int hist_age;
+
+ /// past HitSet(s) (not current)
+ std::map<time_t,HitSetRef> hit_set_map;
+
+ /// a few recent things we've seen that are clean
+ std::list<hobject_t> recent_clean;
+
+ enum flush_mode_t {
+ FLUSH_MODE_IDLE, // nothing to flush
+    FLUSH_MODE_LOW,  // flush dirty objects at a low rate
+    FLUSH_MODE_HIGH, // flush dirty objects at a high rate
+ } flush_mode; ///< current flush behavior
+ static const char *get_flush_mode_name(flush_mode_t m) {
+ switch (m) {
+ case FLUSH_MODE_IDLE: return "idle";
+ case FLUSH_MODE_LOW: return "low";
+ case FLUSH_MODE_HIGH: return "high";
+ default: ceph_abort_msg("bad flush mode");
+ }
+ }
+ const char *get_flush_mode_name() const {
+ return get_flush_mode_name(flush_mode);
+ }
+
+ enum evict_mode_t {
+ EVICT_MODE_IDLE, // no need to evict anything
+ EVICT_MODE_SOME, // evict some things as we are near the target
+ EVICT_MODE_FULL, // evict anything
+ } evict_mode; ///< current evict behavior
+ static const char *get_evict_mode_name(evict_mode_t m) {
+ switch (m) {
+ case EVICT_MODE_IDLE: return "idle";
+ case EVICT_MODE_SOME: return "some";
+ case EVICT_MODE_FULL: return "full";
+ default: ceph_abort_msg("bad evict mode");
+ }
+ }
+ const char *get_evict_mode_name() const {
+ return get_evict_mode_name(evict_mode);
+ }
+
+  /// approximate ratio of objects (assuming they are uniformly
+  /// distributed) that we should aim to evict.
+ unsigned evict_effort;
+
+ TierAgentState()
+ : started(0),
+ delaying(false),
+ hist_age(0),
+ flush_mode(FLUSH_MODE_IDLE),
+ evict_mode(EVICT_MODE_IDLE),
+ evict_effort(0)
+ {}
+
+ /// false if we have any work to do
+ bool is_idle() const {
+ return
+ delaying ||
+ (flush_mode == FLUSH_MODE_IDLE &&
+ evict_mode == EVICT_MODE_IDLE);
+ }
+
+ /// add archived HitSet
+ void add_hit_set(time_t start, HitSetRef hs) {
+ hit_set_map.insert(std::make_pair(start, hs));
+ }
+
+ /// remove old/trimmed HitSet
+ void remove_oldest_hit_set() {
+ if (!hit_set_map.empty())
+ hit_set_map.erase(hit_set_map.begin());
+ }
+
+ /// discard all open hit sets
+ void discard_hit_sets() {
+ hit_set_map.clear();
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_string("flush_mode", get_flush_mode_name());
+ f->dump_string("evict_mode", get_evict_mode_name());
+ f->dump_unsigned("evict_effort", evict_effort);
+ f->dump_stream("position") << position;
+ f->open_object_section("temp_hist");
+ temp_hist.dump(f);
+ f->close_section();
+ }
+};
+
+#endif
diff --git a/src/osd/Watch.cc b/src/osd/Watch.cc
new file mode 100644
index 000000000..78aae6e2d
--- /dev/null
+++ b/src/osd/Watch.cc
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include "PG.h"
+
+#include "include/types.h"
+#include "messages/MWatchNotify.h"
+
+#include <map>
+
+#include "OSD.h"
+#include "PrimaryLogPG.h"
+#include "Watch.h"
+#include "Session.h"
+
+#include "common/config.h"
+
+#define dout_context osd->cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+using std::list;
+using std::make_pair;
+using std::pair;
+using std::ostream;
+using std::set;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+struct CancelableContext : public Context {
+ virtual void cancel() = 0;
+};
+
+
+static ostream& _prefix(
+ std::ostream* _dout,
+ Notify *notify) {
+ return notify->gen_dbg_prefix(*_dout);
+}
+
+Notify::Notify(
+ ConnectionRef client,
+ uint64_t client_gid,
+ bufferlist &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd)
+ : client(client),
+ client_gid(client_gid),
+ complete(false),
+ discarded(false),
+ timed_out(false),
+ payload(payload),
+ timeout(timeout),
+ cookie(cookie),
+ notify_id(notify_id),
+ version(version),
+ osd(osd),
+ cb(nullptr) {}
+
+NotifyRef Notify::makeNotifyRef(
+ ConnectionRef client,
+ uint64_t client_gid,
+ bufferlist &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd) {
+ NotifyRef ret(
+ new Notify(
+ client, client_gid,
+ payload, timeout,
+ cookie, notify_id,
+ version, osd));
+ ret->set_self(ret);
+ return ret;
+}
+
+class NotifyTimeoutCB : public CancelableContext {
+ NotifyRef notif;
+ bool canceled; // protected by notif lock
+public:
+ explicit NotifyTimeoutCB(NotifyRef notif) : notif(notif), canceled(false) {}
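+  // Invoked by the watch timer with osd->watch_lock held; drop it before
+  // taking the notify lock to preserve the lock ordering, then re-acquire it
+  // for the timer before returning.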
+ void finish(int) override {
+ notif->osd->watch_lock.unlock();
+ notif->lock.lock();
+ if (!canceled)
+ notif->do_timeout(); // drops lock
+ else
+ notif->lock.unlock();
+ notif->osd->watch_lock.lock();
+ }
+ void cancel() override {
+ ceph_assert(ceph_mutex_is_locked(notif->lock));
+ canceled = true;
+ }
+};
+
+void Notify::do_timeout()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ dout(10) << "timeout" << dendl;
+ cb = nullptr;
+ if (is_discarded()) {
+ lock.unlock();
+ return;
+ }
+
+ timed_out = true; // we will send the client an error code
+ maybe_complete_notify();
+ ceph_assert(complete);
+ set<WatchRef> _watchers;
+ _watchers.swap(watchers);
+ lock.unlock();
+
+ for (auto i = _watchers.begin(); i != _watchers.end(); ++i) {
+ boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg());
+ pg->lock();
+ if (!(*i)->is_discarded()) {
+ (*i)->cancel_notify(self.lock());
+ }
+ pg->unlock();
+ }
+}
+
+void Notify::register_cb()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ {
+ std::lock_guard l{osd->watch_lock};
+ cb = new NotifyTimeoutCB(self.lock());
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
+ }
+}
+
+void Notify::unregister_cb()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (!cb)
+ return;
+ cb->cancel();
+ {
+ std::lock_guard l{osd->watch_lock};
+ osd->watch_timer.cancel_event(cb);
+ cb = nullptr;
+ }
+}
+
+void Notify::start_watcher(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ dout(10) << "start_watcher" << dendl;
+ watchers.insert(watch);
+}
+
+void Notify::complete_watcher(WatchRef watch, bufferlist& reply_bl)
+{
+ std::lock_guard l(lock);
+ dout(10) << "complete_watcher" << dendl;
+ if (is_discarded())
+ return;
+ ceph_assert(watchers.count(watch));
+ watchers.erase(watch);
+ notify_replies.insert(make_pair(make_pair(watch->get_watcher_gid(),
+ watch->get_cookie()),
+ reply_bl));
+ maybe_complete_notify();
+}
+
+void Notify::complete_watcher_remove(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ dout(10) << __func__ << dendl;
+ if (is_discarded())
+ return;
+ ceph_assert(watchers.count(watch));
+ watchers.erase(watch);
+ maybe_complete_notify();
+}
+
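+// Once every watcher has replied (or the notify timed out) send the client a
+// CEPH_WATCH_EVENT_NOTIFY_COMPLETE carrying the replies and the list of
+// watchers that never responded; return_code is -ETIMEDOUT on timeout.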
+void Notify::maybe_complete_notify()
+{
+ dout(10) << "maybe_complete_notify -- "
+ << watchers.size()
+ << " in progress watchers " << dendl;
+ if (watchers.empty() || timed_out) {
+ // prepare reply
+ bufferlist bl;
+ encode(notify_replies, bl);
+ list<pair<uint64_t,uint64_t> > missed;
+ for (auto p = watchers.begin(); p != watchers.end(); ++p) {
+ missed.push_back(make_pair((*p)->get_watcher_gid(),
+ (*p)->get_cookie()));
+ }
+ encode(missed, bl);
+
+ bufferlist empty;
+ auto* const reply = new MWatchNotify(
+ cookie,
+ version,
+ notify_id,
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE,
+ empty,
+ client_gid);
+ reply->set_data(bl);
+ if (timed_out)
+ reply->return_code = -ETIMEDOUT;
+ client->send_message(reply);
+ unregister_cb();
+
+ complete = true;
+ }
+}
+
+void Notify::discard()
+{
+ std::lock_guard l(lock);
+ discarded = true;
+ unregister_cb();
+ watchers.clear();
+}
+
+void Notify::init()
+{
+ std::lock_guard l(lock);
+ register_cb();
+ maybe_complete_notify();
+}
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, watch.get())
+
+static ostream& _prefix(
+ std::ostream* _dout,
+ Watch *watch) {
+ return watch->gen_dbg_prefix(*_dout);
+}
+
+class HandleWatchTimeout : public CancelableContext {
+ WatchRef watch;
+public:
+ bool canceled; // protected by watch->pg->lock
+ explicit HandleWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {}
+ void cancel() override {
+ canceled = true;
+ }
+ void finish(int) override { ceph_abort(); /* not used */ }
+ void complete(int) override {
+ OSDService *osd(watch->osd);
+ ldout(osd->cct, 10) << "HandleWatchTimeout" << dendl;
+ boost::intrusive_ptr<PrimaryLogPG> pg(watch->pg);
+ osd->watch_lock.unlock();
+ pg->lock();
+ watch->cb = nullptr;
+ if (!watch->is_discarded() && !canceled)
+ watch->pg->handle_watch_timeout(watch);
+ delete this; // ~Watch requires pg lock!
+ pg->unlock();
+ osd->watch_lock.lock();
+ }
+};
+
+class HandleDelayedWatchTimeout : public CancelableContext {
+ WatchRef watch;
+public:
+ bool canceled;
+ explicit HandleDelayedWatchTimeout(WatchRef watch) : watch(watch), canceled(false) {}
+ void cancel() override {
+ canceled = true;
+ }
+ void finish(int) override {
+ OSDService *osd(watch->osd);
+    dout(10) << "HandleDelayedWatchTimeout" << dendl;
+ ceph_assert(watch->pg->is_locked());
+ watch->cb = nullptr;
+ if (!watch->is_discarded() && !canceled)
+ watch->pg->handle_watch_timeout(watch);
+ }
+};
+
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+std::ostream& Watch::gen_dbg_prefix(std::ostream& out) {
+ return pg->gen_prefix(out) << " -- Watch("
+ << make_pair(cookie, entity) << ") ";
+}
+
+Watch::Watch(
+ PrimaryLogPG *pg,
+ OSDService *osd,
+ ObjectContextRef obc,
+ uint32_t timeout,
+ uint64_t cookie,
+ entity_name_t entity,
+ const entity_addr_t &addr)
+ : cb(NULL),
+ osd(osd),
+ pg(pg),
+ obc(obc),
+ timeout(timeout),
+ cookie(cookie),
+ addr(addr),
+ will_ping(false),
+ entity(entity),
+ discarded(false) {
+ dout(10) << "Watch()" << dendl;
+}
+
+Watch::~Watch() {
+ dout(10) << "~Watch" << dendl;
+ // users must have called remove() or discard() prior to this point
+ ceph_assert(!obc);
+ ceph_assert(!is_connected());
+}
+
+Context *Watch::get_delayed_cb()
+{
+ ceph_assert(!cb);
+ cb = new HandleDelayedWatchTimeout(self.lock());
+ return cb;
+}
+
+void Watch::register_cb()
+{
+ std::lock_guard l(osd->watch_lock);
+ if (cb) {
+ dout(15) << "re-registering callback, timeout: " << timeout << dendl;
+ cb->cancel();
+ osd->watch_timer.cancel_event(cb);
+ } else {
+ dout(15) << "registering callback, timeout: " << timeout << dendl;
+ }
+ cb = new HandleWatchTimeout(self.lock());
+ if (!osd->watch_timer.add_event_after(timeout, cb)) {
+ cb = nullptr;
+ }
+}
+
+void Watch::unregister_cb()
+{
+ dout(15) << "unregister_cb" << dendl;
+ if (!cb)
+ return;
+ dout(15) << "actually registered, cancelling" << dendl;
+ cb->cancel();
+ {
+ std::lock_guard l(osd->watch_lock);
+ osd->watch_timer.cancel_event(cb); // harmless if not registered with timer
+ }
+ cb = nullptr;
+}
+
+void Watch::got_ping(utime_t t)
+{
+ last_ping = t;
+ if (is_connected()) {
+ register_cb();
+ }
+}
+
+void Watch::connect(ConnectionRef con, bool _will_ping)
+{
+ if (is_connected(con.get())) {
+ dout(10) << __func__ << " con " << con << " - already connected" << dendl;
+ return;
+ }
+ dout(10) << __func__ << " con " << con << dendl;
+ conn = con;
+ will_ping = _will_ping;
+ auto priv = con->get_priv();
+ if (priv) {
+ auto sessionref = static_cast<Session*>(priv.get());
+ sessionref->wstate.addWatch(self.lock());
+ priv.reset();
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ send_notify(i->second);
+ }
+ }
+ if (will_ping) {
+ last_ping = ceph_clock_now();
+ register_cb();
+ } else {
+ unregister_cb();
+ }
+}
+
+void Watch::disconnect()
+{
+ dout(10) << "disconnect (con was " << conn << ")" << dendl;
+ conn = ConnectionRef();
+ if (!will_ping)
+ register_cb();
+}
+
+void Watch::discard()
+{
+ dout(10) << "discard" << dendl;
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ i->second->discard();
+ }
+ discard_state();
+}
+
+void Watch::discard_state()
+{
+ ceph_assert(pg->is_locked());
+ ceph_assert(!discarded);
+ ceph_assert(obc);
+ in_progress_notifies.clear();
+ unregister_cb();
+ discarded = true;
+ if (is_connected()) {
+ if (auto priv = conn->get_priv(); priv) {
+ auto session = static_cast<Session*>(priv.get());
+ session->wstate.removeWatch(self.lock());
+ }
+ conn = ConnectionRef();
+ }
+ obc = ObjectContextRef();
+}
+
+bool Watch::is_discarded() const
+{
+ return discarded;
+}
+
+void Watch::remove(bool send_disconnect)
+{
+ dout(10) << "remove" << dendl;
+ if (send_disconnect && is_connected()) {
+ bufferlist empty;
+ MWatchNotify *reply(new MWatchNotify(cookie, 0, 0,
+ CEPH_WATCH_EVENT_DISCONNECT, empty));
+ conn->send_message(reply);
+ }
+ for (auto i = in_progress_notifies.begin();
+ i != in_progress_notifies.end();
+ ++i) {
+ i->second->complete_watcher_remove(self.lock());
+ }
+ discard_state();
+}
+
+void Watch::start_notify(NotifyRef notif)
+{
+ ceph_assert(in_progress_notifies.find(notif->notify_id) ==
+ in_progress_notifies.end());
+ if (will_ping) {
+ utime_t cutoff = ceph_clock_now();
+ cutoff.sec_ref() -= timeout;
+ if (last_ping < cutoff) {
+ dout(10) << __func__ << " " << notif->notify_id
+ << " last_ping " << last_ping << " < cutoff " << cutoff
+ << ", disconnecting" << dendl;
+ disconnect();
+ return;
+ }
+ }
+ dout(10) << "start_notify " << notif->notify_id << dendl;
+ in_progress_notifies[notif->notify_id] = notif;
+ notif->start_watcher(self.lock());
+ if (is_connected())
+ send_notify(notif);
+}
+
+void Watch::cancel_notify(NotifyRef notif)
+{
+ dout(10) << "cancel_notify " << notif->notify_id << dendl;
+ in_progress_notifies.erase(notif->notify_id);
+}
+
+void Watch::send_notify(NotifyRef notif)
+{
+ dout(10) << "send_notify" << dendl;
+ MWatchNotify *notify_msg = new MWatchNotify(
+ cookie,
+ notif->version,
+ notif->notify_id,
+ CEPH_WATCH_EVENT_NOTIFY,
+ notif->payload,
+ notif->client_gid);
+ conn->send_message(notify_msg);
+}
+
+void Watch::notify_ack(uint64_t notify_id, bufferlist& reply_bl)
+{
+ dout(10) << "notify_ack" << dendl;
+ auto i = in_progress_notifies.find(notify_id);
+ if (i != in_progress_notifies.end()) {
+ i->second->complete_watcher(self.lock(), reply_bl);
+ in_progress_notifies.erase(i);
+ }
+}
+
+WatchRef Watch::makeWatchRef(
+ PrimaryLogPG *pg, OSDService *osd,
+ ObjectContextRef obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t& addr)
+{
+ WatchRef ret(new Watch(pg, osd, obc, timeout, cookie, entity, addr));
+ ret->set_self(ret);
+ return ret;
+}
+
+void WatchConState::addWatch(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ watches.insert(watch);
+}
+
+void WatchConState::removeWatch(WatchRef watch)
+{
+ std::lock_guard l(lock);
+ watches.erase(watch);
+}
+
+void WatchConState::reset(Connection *con)
+{
+ set<WatchRef> _watches;
+ {
+ std::lock_guard l(lock);
+ _watches.swap(watches);
+ }
+ for (set<WatchRef>::iterator i = _watches.begin();
+ i != _watches.end();
+ ++i) {
+ boost::intrusive_ptr<PrimaryLogPG> pg((*i)->get_pg());
+ pg->lock();
+ if (!(*i)->is_discarded()) {
+ if ((*i)->is_connected(con)) {
+ (*i)->disconnect();
+ } else {
+ lgeneric_derr(cct) << __func__ << " not still connected to " << (*i) << dendl;
+ }
+ }
+ pg->unlock();
+ }
+}
diff --git a/src/osd/Watch.h b/src/osd/Watch.h
new file mode 100644
index 000000000..8d6d93a7d
--- /dev/null
+++ b/src/osd/Watch.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_WATCH_H
+#define CEPH_WATCH_H
+
+#include <set>
+#include "msg/Connection.h"
+#include "include/Context.h"
+
+enum WatcherState {
+ WATCHER_PENDING,
+ WATCHER_NOTIFIED,
+};
+
+class OSDService;
+class PrimaryLogPG;
+void intrusive_ptr_add_ref(PrimaryLogPG *pg);
+void intrusive_ptr_release(PrimaryLogPG *pg);
+struct ObjectContext;
+class MWatchNotify;
+
+class Watch;
+typedef std::shared_ptr<Watch> WatchRef;
+typedef std::weak_ptr<Watch> WWatchRef;
+
+class Notify;
+typedef std::shared_ptr<Notify> NotifyRef;
+typedef std::weak_ptr<Notify> WNotifyRef;
+
+struct CancelableContext;
+
+/**
+ * Notify tracks the progress of a particular notify
+ *
+ * References are held by Watch and the timeout callback.
+ */
+class Notify {
+ friend class NotifyTimeoutCB;
+ friend class Watch;
+ WNotifyRef self;
+ ConnectionRef client;
+ uint64_t client_gid;
+ bool complete;
+ bool discarded;
+ bool timed_out; ///< true if the notify timed out
+ std::set<WatchRef> watchers;
+
+ ceph::buffer::list payload;
+ uint32_t timeout;
+ uint64_t cookie;
+ uint64_t notify_id;
+ uint64_t version;
+
+ OSDService *osd;
+ CancelableContext *cb;
+ ceph::mutex lock = ceph::make_mutex("Notify::lock");
+
+ /// (gid,cookie) -> reply_bl for everyone who acked the notify
+ std::multimap<std::pair<uint64_t,uint64_t>, ceph::buffer::list> notify_replies;
+
+ /// true if this notify is being discarded
+  /// true if this notify has been discarded or has already completed
+ return discarded || complete;
+ }
+
+ /// Sends notify completion if watchers.empty() or timeout
+ void maybe_complete_notify();
+
+ /// Called on Notify timeout
+ void do_timeout();
+
+ Notify(
+ ConnectionRef client,
+ uint64_t client_gid,
+ ceph::buffer::list& payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd);
+
+ /// registers a timeout callback with the watch_timer
+ void register_cb();
+
+ /// removes the timeout callback, called on completion or cancellation
+ void unregister_cb();
+public:
+
+ std::ostream& gen_dbg_prefix(std::ostream& out) {
+ return out << "Notify(" << std::make_pair(cookie, notify_id) << " "
+ << " watchers=" << watchers.size()
+ << ") ";
+ }
+ void set_self(NotifyRef _self) {
+ self = _self;
+ }
+ static NotifyRef makeNotifyRef(
+ ConnectionRef client,
+ uint64_t client_gid,
+ ceph::buffer::list &payload,
+ uint32_t timeout,
+ uint64_t cookie,
+ uint64_t notify_id,
+ uint64_t version,
+ OSDService *osd);
+
+ /// Call after creation to initialize
+ void init();
+
+ /// Called once per watcher prior to init()
+ void start_watcher(
+    WatchRef watcher ///< [in] watcher to register
+ );
+
+ /// Called once per NotifyAck
+ void complete_watcher(
+ WatchRef watcher, ///< [in] watcher to complete
+ ceph::buffer::list& reply_bl ///< [in] reply buffer from the notified watcher
+ );
+ /// Called when a watcher unregisters or times out
+ void complete_watcher_remove(
+    WatchRef watcher ///< [in] watcher to remove
+ );
+
+ /// Called when the notify is canceled due to a new peering interval
+ void discard();
+};
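+
+// A minimal lifecycle sketch (illustrative only; `conn`, `osd` and the
+// numeric ids are assumed to come from whatever handled the client's notify):
+//
+//   NotifyRef n = Notify::makeNotifyRef(conn, client_gid, payload, timeout,
+//                                       cookie, notify_id, version, osd);
+//   watch->start_notify(n);  // once per interested Watch, before init()
+//   n->init();               // arms the timeout; may complete immediately
+//   // Each Watch then acks via notify_ack() -> complete_watcher(), or is
+//   // dropped via complete_watcher_remove(); once the watcher set drains
+//   // (or the timeout fires) maybe_complete_notify() sends the reply.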
+
+/**
+ * Watch is a mapping between a Connection and an ObjectContext
+ *
+ * References are held by ObjectContext and the timeout callback
+ */
+class HandleWatchTimeout;
+class HandleDelayedWatchTimeout;
+class Watch {
+ WWatchRef self;
+ friend class HandleWatchTimeout;
+ friend class HandleDelayedWatchTimeout;
+ ConnectionRef conn;
+ CancelableContext *cb;
+
+ OSDService *osd;
+ boost::intrusive_ptr<PrimaryLogPG> pg;
+ std::shared_ptr<ObjectContext> obc;
+
+ std::map<uint64_t, NotifyRef> in_progress_notifies;
+
+  // Could hold a watch_info_t here, but that would pull osd_types.h into this header
+ uint32_t timeout; ///< timeout in seconds
+ uint64_t cookie;
+ entity_addr_t addr;
+
+ bool will_ping; ///< is client new enough to ping the watch
+ utime_t last_ping; ///< last client ping
+
+ entity_name_t entity;
+ bool discarded;
+
+ Watch(
+ PrimaryLogPG *pg, OSDService *osd,
+ std::shared_ptr<ObjectContext> obc, uint32_t timeout,
+ uint64_t cookie, entity_name_t entity,
+ const entity_addr_t& addr);
+
+ /// Registers the timeout callback with watch_timer
+ void register_cb();
+
+ /// send a Notify message when connected for notif
+ void send_notify(NotifyRef notif);
+
+ /// Cleans up state on discard or remove (including Connection state, obc)
+ void discard_state();
+public:
+ /// Unregisters the timeout callback
+ void unregister_cb();
+
+ /// note receipt of a ping
+ void got_ping(utime_t t);
+ utime_t get_last_ping() const {
+ return last_ping;
+ }
+
+ /// True if currently connected
+ bool is_connected() const {
+ return conn.get() != NULL;
+ }
+ bool is_connected(Connection *con) const {
+ return conn.get() == con;
+ }
+
+ /// NOTE: must be called with pg lock held
+ ~Watch();
+
+ uint64_t get_watcher_gid() const {
+ return entity.num();
+ }
+
+ std::ostream& gen_dbg_prefix(std::ostream& out);
+ static WatchRef makeWatchRef(
+ PrimaryLogPG *pg, OSDService *osd,
+ std::shared_ptr<ObjectContext> obc, uint32_t timeout, uint64_t cookie, entity_name_t entity, const entity_addr_t &addr);
+ void set_self(WatchRef _self) {
+ self = _self;
+ }
+
+ /// Does not grant a ref count!
+ boost::intrusive_ptr<PrimaryLogPG> get_pg() { return pg; }
+
+ std::shared_ptr<ObjectContext> get_obc() { return obc; }
+
+ uint64_t get_cookie() const { return cookie; }
+ entity_name_t get_entity() const { return entity; }
+ entity_addr_t get_peer_addr() const { return addr; }
+ uint32_t get_timeout() const { return timeout; }
+
+ /// Generates context for use if watch timeout is delayed by scrub or recovery
+ Context *get_delayed_cb();
+
+ /// Transitions Watch to connected, unregister_cb, resends pending Notifies
+ void connect(
+ ConnectionRef con, ///< [in] Reference to new connection
+ bool will_ping ///< [in] client is new and will send pings
+ );
+
+ /// Transitions watch to disconnected, register_cb
+ void disconnect();
+
+ /// Called if Watch state is discarded due to new peering interval
+ void discard();
+
+ /// True if removed or discarded
+ bool is_discarded() const;
+
+ /// Called on unwatch
+ void remove(bool send_disconnect);
+
+ /// Adds notif as in-progress notify
+ void start_notify(
+ NotifyRef notif ///< [in] Reference to new in-progress notify
+ );
+
+ /// Removes timed out notify
+ void cancel_notify(
+ NotifyRef notif ///< [in] notify which timed out
+ );
+
+ /// Call when notify_ack received on notify_id
+ void notify_ack(
+ uint64_t notify_id, ///< [in] id of acked notify
+ ceph::buffer::list& reply_bl ///< [in] notify reply buffer
+ );
+};
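+
+// Rough usage sketch, assuming the caller already holds the pg lock and has
+// `pg`, `osd`, `obc`, `conn` and the watch parameters in scope (these names
+// are illustrative, not taken from this header):
+//
+//   WatchRef w = Watch::makeWatchRef(pg, osd, obc, timeout, cookie,
+//                                    entity, addr);
+//   w->connect(conn, will_ping);    // resends any in-progress notifies
+//   w->got_ping(ceph_clock_now());  // keeps the timeout callback re-armed
+//   ...
+//   w->remove(true);                // on unwatch; sends a DISCONNECT event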
+
+/**
+ * Holds weak refs to Watch structures corresponding to a connection
+ * Lives in the Session object of an OSD connection
+ */
+class WatchConState {
+ ceph::mutex lock = ceph::make_mutex("WatchConState");
+ std::set<WatchRef> watches;
+public:
+ CephContext* cct;
+ explicit WatchConState(CephContext* cct) : cct(cct) {}
+
+ /// Add a watch
+ void addWatch(
+ WatchRef watch ///< [in] Ref to new watch object
+ );
+
+ /// Remove a watch
+ void removeWatch(
+ WatchRef watch ///< [in] Ref to watch object to remove
+ );
+
+ /// Called on session reset, disconnects watchers
+ void reset(Connection *con);
+};
+
+#endif
diff --git a/src/osd/error_code.cc b/src/osd/error_code.cc
new file mode 100644
index 000000000..97f0012fd
--- /dev/null
+++ b/src/osd/error_code.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include "common/error_code.h"
+#include "common/errno.h"
+#include "error_code.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+class osd_error_category : public ceph::converting_category {
+public:
+ osd_error_category(){}
+ const char* name() const noexcept override;
+ const char* message(int ev, char*, std::size_t) const noexcept override;
+ std::string message(int ev) const override;
+ boost::system::error_condition default_error_condition(int ev) const noexcept
+ override;
+ bool equivalent(int ev, const boost::system::error_condition& c) const
+ noexcept override;
+ using ceph::converting_category::equivalent;
+ int from_code(int ev) const noexcept override;
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+const char* osd_error_category::name() const noexcept {
+ return "osd";
+}
+
+const char* osd_error_category::message(int ev, char* buf,
+ std::size_t len) const noexcept {
+ if (ev == 0)
+ return "No error";
+
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return "ORDERSNAP flag set; writer has old snapc";
+ case osd_errc::blocklisted:
+ return "Blocklisted";
+ }
+
+ if (len) {
+ auto s = cpp_strerror(ev);
+ auto n = s.copy(buf, len - 1);
+ *(buf + n) = '\0';
+ }
+ return buf;
+}
+
+std::string osd_error_category::message(int ev) const {
+ if (ev == 0)
+ return "No error";
+
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return "ORDERSNAP flag set; writer has old snapc";
+ case osd_errc::blocklisted:
+ return "Blocklisted";
+ }
+
+ return cpp_strerror(ev);
+}
+
+boost::system::error_condition osd_error_category::default_error_condition(int ev) const noexcept {
+ if (ev == static_cast<int>(osd_errc::old_snapc) ||
+ ev == static_cast<int>(osd_errc::blocklisted))
+ return { ev, *this };
+ else
+ return { ev, boost::system::generic_category() };
+}
+
+bool osd_error_category::equivalent(int ev, const boost::system::error_condition& c) const noexcept {
+ switch (static_cast<osd_errc>(ev)) {
+ case osd_errc::old_snapc:
+ return c == boost::system::errc::invalid_argument;
+ case osd_errc::blocklisted:
+ return c == boost::system::errc::operation_not_permitted;
+ }
+ return default_error_condition(ev) == c;
+}
+
+int osd_error_category::from_code(int ev) const noexcept {
+ return -ev;
+}
+
+const boost::system::error_category& osd_category() noexcept {
+ static const osd_error_category c;
+ return c;
+}
diff --git a/src/osd/error_code.h b/src/osd/error_code.h
new file mode 100644
index 000000000..d36e79db4
--- /dev/null
+++ b/src/osd/error_code.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/system/error_code.hpp>
+
+#include "include/rados.h"
+
+const boost::system::error_category& osd_category() noexcept;
+
+// Since the OSD mostly uses POSIX error codes plus a couple
+// additions, this will be a degenerate error category for now that
+// mostly forwards to POSIX.
+
+enum class osd_errc {
+ old_snapc = 85, /* ORDERSNAP flag set; writer has old snapc*/
+ blocklisted = 108 /* blocklisted */
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::osd_errc> {
+ static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::osd_errc> {
+ static const bool value = false;
+};
+}
+
+// implicit conversion:
+inline boost::system::error_code make_error_code(osd_errc e) noexcept {
+ return { static_cast<int>(e), osd_category() };
+}
+
+// explicit conversion:
+inline boost::system::error_condition make_error_condition(osd_errc e) noexcept {
+ return { static_cast<int>(e), osd_category() };
+}
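+
+// For illustration: because is_error_code_enum<osd_errc> is true above, an
+// osd_errc converts straight into a boost::system::error_code and compares
+// against the generic conditions mapped in osd_error_category::equivalent():
+//
+//   boost::system::error_code ec = osd_errc::old_snapc;
+//   assert(ec.category() == osd_category());
+//   assert(ec == boost::system::errc::invalid_argument);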
diff --git a/src/osd/objclass.cc b/src/osd/objclass.cc
new file mode 100644
index 000000000..274f5e063
--- /dev/null
+++ b/src/osd/objclass.cc
@@ -0,0 +1,702 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdarg>
+#include "common/ceph_context.h"
+#include "common/ceph_releases.h"
+#include "common/config.h"
+#include "common/debug.h"
+
+#include "objclass/objclass.h"
+#include "osd/PrimaryLogPG.h"
+
+#include "osd/ClassHandler.h"
+
+#include "auth/Crypto.h"
+#include "common/armor.h"
+
+#define dout_context ClassHandler::get_instance().cct
+
+using std::map;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::real_time;
+
+
+int cls_call(cls_method_context_t hctx, const char *cls, const char *method,
+ char *indata, int datalen, char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ bufferlist idata;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_CALL;
+ op.op.cls.class_len = strlen(cls);
+ op.op.cls.method_len = strlen(method);
+ op.op.cls.indata_len = datalen;
+ op.indata.append(cls, op.op.cls.class_len);
+ op.indata.append(method, op.op.cls.method_len);
+ op.indata.append(indata, datalen);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(op.outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, op.outdata.c_str(), op.outdata.length());
+ *outdatalen = op.outdata.length();
+
+ return r;
+}
+
+int cls_getxattr(cls_method_context_t hctx, const char *name,
+ char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(op.outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, op.outdata.c_str(), op.outdata.length());
+ *outdatalen = op.outdata.length();
+
+ return r;
+}
+
+int cls_setxattr(cls_method_context_t hctx, const char *name,
+ const char *value, int val_len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_SETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.op.xattr.value_len = val_len;
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(value, val_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+
+ return r;
+}
+
+int cls_read(cls_method_context_t hctx, int ofs, int len,
+ char **outdata, int *outdatalen)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_SYNC_READ;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ int r = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (r < 0)
+ return r;
+
+ *outdata = (char *)malloc(ops[0].outdata.length());
+ if (!*outdata)
+ return -ENOMEM;
+ memcpy(*outdata, ops[0].outdata.c_str(), ops[0].outdata.length());
+ *outdatalen = ops[0].outdata.length();
+
+ return *outdatalen;
+}
+
+int cls_get_request_origin(cls_method_context_t hctx, entity_inst_t *origin)
+{
+ PrimaryLogPG::OpContext **pctx = static_cast<PrimaryLogPG::OpContext **>(hctx);
+ *origin = (*pctx)->op->get_req()->get_orig_source_inst();
+ return 0;
+}
+
+int cls_cxx_create(cls_method_context_t hctx, bool exclusive)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_CREATE;
+ ops[0].op.flags = (exclusive ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_remove(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_DELETE;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_STAT;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ auto iter = ops[0].outdata.cbegin();
+ utime_t ut;
+ uint64_t s;
+ try {
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ if (size)
+ *size = s;
+ if (mtime)
+ *mtime = ut.sec();
+ return 0;
+}
+
+int cls_cxx_stat2(cls_method_context_t hctx, uint64_t *size, ceph::real_time *mtime)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_STAT;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ auto iter = ops[0].outdata.cbegin();
+ real_time ut;
+ uint64_t s;
+ try {
+ decode(s, iter);
+ decode(ut, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ if (size)
+ *size = s;
+ if (mtime)
+ *mtime = ut;
+ return 0;
+}
+
+int cls_cxx_read2(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *outbl, uint32_t op_flags)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ int ret;
+ ops[0].op.op = CEPH_OSD_OP_SYNC_READ;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+ *outbl = std::move(ops[0].outdata);
+ return outbl->length();
+}
+
+int cls_cxx_write2(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *inbl, uint32_t op_flags)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_WRITE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ops[0].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_write_full(cls_method_context_t hctx, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_WRITEFULL;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = inbl->length();
+ ops[0].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_replace(cls_method_context_t hctx, int ofs, int len, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(2);
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = 0;
+ ops[0].op.extent.length = 0;
+ ops[1].op.op = CEPH_OSD_OP_WRITE;
+ ops[1].op.extent.offset = ofs;
+ ops[1].op.extent.length = len;
+ ops[1].indata = *inbl;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_truncate(cls_method_context_t hctx, int ofs)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = 0;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_write_zero(cls_method_context_t hctx, int ofs, int len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_ZERO;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.indata.append(name, op.op.xattr.name_len);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ *outbl = std::move(op.outdata);
+ return outbl->length();
+}
+
+int cls_cxx_getxattrs(cls_method_context_t hctx, map<string, bufferlist> *attrset)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_GETXATTRS;
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*attrset, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_SETXATTR;
+ op.op.xattr.name_len = strlen(name);
+ op.op.xattr.value_len = inbl->length();
+ op.indata.append(name, op.op.xattr.name_len);
+ op.indata.append(*inbl);
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+
+ return r;
+}
+
+int cls_cxx_snap_revert(cls_method_context_t hctx, snapid_t snapid)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ ops[0].op.op = CEPH_OSD_OP_ROLLBACK;
+ ops[0].op.snap.snapid = snapid;
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_get_all_vals(cls_method_context_t hctx, map<string, bufferlist>* vals,
+ bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ string start_after;
+ string filter_prefix;
+ uint64_t max = (uint64_t)-1;
+
+ encode(start_after, op.indata);
+ encode(max, op.indata);
+ encode(filter_prefix, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_get_keys(cls_method_context_t hctx, const string &start_obj,
+ uint64_t max_to_get, set<string> *keys,
+ bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETKEYS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*keys, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return keys->size();
+}
+
+int cls_cxx_map_get_vals(cls_method_context_t hctx, const string &start_obj,
+ const string &filter_prefix, uint64_t max_to_get,
+ map<string, bufferlist> *vals, bool *more)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(start_obj, op.indata);
+ encode(max_to_get, op.indata);
+ encode(filter_prefix, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALS;
+
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*vals, iter);
+ decode(*more, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return vals->size();
+}
+
+int cls_cxx_map_read_header(cls_method_context_t hctx, bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+ op.op.op = CEPH_OSD_OP_OMAPGETHEADER;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ *outbl = std::move(op.outdata);
+
+ return 0;
+}
+
+int cls_cxx_map_get_val(cls_method_context_t hctx, const string &key,
+ bufferlist *outbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ set<string> k;
+ k.insert(key);
+ encode(k, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ map<string, bufferlist> m;
+
+ decode(m, iter);
+    map<string, bufferlist>::iterator miter = m.begin();
+    if (miter == m.end())
+      return -ENOENT;
+
+    *outbl = miter->second;
+ } catch (ceph::buffer::error& e) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_get_vals_by_keys(cls_method_context_t hctx,
+ const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ int ret;
+
+ encode(keys, op.indata);
+
+ op.op.op = CEPH_OSD_OP_OMAPGETVALSBYKEYS;
+ ret = (*pctx)->pg->do_osd_ops(*pctx, ops);
+ if (ret < 0)
+ return ret;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*map, iter);
+ } catch (buffer::error& e) {
+ return -EIO;
+ }
+ return 0;
+}
+
+int cls_cxx_map_set_val(cls_method_context_t hctx, const string &key,
+ bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ map<string, bufferlist> m;
+ m[key] = *inbl;
+ encode(m, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETVALS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_set_vals(cls_method_context_t hctx,
+ const std::map<string, bufferlist> *map)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ encode(*map, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETVALS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_clear(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+
+ op.op.op = CEPH_OSD_OP_OMAPCLEAR;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_write_header(cls_method_context_t hctx, bufferlist *inbl)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ op.indata = std::move(*inbl);
+
+ op.op.op = CEPH_OSD_OP_OMAPSETHEADER;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_remove_range(cls_method_context_t hctx,
+ const std::string& key_begin,
+ const std::string& key_end)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+
+ ::encode(key_begin, update_bl);
+ ::encode(key_end, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPRMKEYRANGE;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_map_remove_key(cls_method_context_t hctx, const string &key)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> ops(1);
+ OSDOp& op = ops[0];
+ bufferlist& update_bl = op.indata;
+ set<string> to_rm;
+ to_rm.insert(key);
+
+ encode(to_rm, update_bl);
+
+ op.op.op = CEPH_OSD_OP_OMAPRMKEYS;
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_cxx_list_watchers(cls_method_context_t hctx,
+ obj_list_watch_response_t *watchers)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ vector<OSDOp> nops(1);
+ OSDOp& op = nops[0];
+ int r;
+
+ op.op.op = CEPH_OSD_OP_LIST_WATCHERS;
+ r = (*pctx)->pg->do_osd_ops(*pctx, nops);
+ if (r < 0)
+ return r;
+
+ auto iter = op.outdata.cbegin();
+ try {
+ decode(*watchers, iter);
+ } catch (ceph::buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
+uint64_t cls_current_version(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_last_user_version();
+}
+
+
+int cls_current_subop_num(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->processed_subop_count;
+}
+
+uint64_t cls_get_features(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->get_up_osd_features();
+}
+
+uint64_t cls_get_client_features(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->op->get_req()->get_connection()->get_features();
+}
+
+ceph_release_t cls_get_required_osd_release(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->require_osd_release;
+}
+
+ceph_release_t cls_get_min_compatible_client(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_osdmap()->get_require_min_compat_client();
+}
+
+int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) {
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ if (!ctx->new_obs.exists || (ctx->new_obs.oi.is_whiteout() &&
+ ctx->obc->ssc->snapset.clones.empty())) {
+ return -ENOENT;
+ }
+ *snap_seq = ctx->obc->ssc->snapset.seq;
+ return 0;
+}
+
+int cls_cxx_chunk_write_and_set(cls_method_context_t hctx, int ofs, int len,
+ bufferlist *write_inbl, uint32_t op_flags,
+ bufferlist *set_inbl, int set_len)
+{
+ PrimaryLogPG::OpContext **pctx = (PrimaryLogPG::OpContext **)hctx;
+ char cname[] = "cas";
+ char method[] = "chunk_set";
+
+ vector<OSDOp> ops(2);
+ ops[0].op.op = CEPH_OSD_OP_WRITE;
+ ops[0].op.extent.offset = ofs;
+ ops[0].op.extent.length = len;
+ ops[0].op.flags = op_flags;
+ ops[0].indata = *write_inbl;
+
+ ops[1].op.op = CEPH_OSD_OP_CALL;
+ ops[1].op.cls.class_len = strlen(cname);
+ ops[1].op.cls.method_len = strlen(method);
+ ops[1].op.cls.indata_len = set_len;
+ ops[1].indata.append(cname, ops[1].op.cls.class_len);
+ ops[1].indata.append(method, ops[1].op.cls.method_len);
+ ops[1].indata.append(*set_inbl);
+
+ return (*pctx)->pg->do_osd_ops(*pctx, ops);
+}
+
+int cls_get_manifest_ref_count(cls_method_context_t hctx, string fp_oid)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+ return ctx->pg->get_manifest_ref_count(ctx->obc, fp_oid, ctx->op);
+}
+
+uint64_t cls_get_osd_min_alloc_size(cls_method_context_t hctx) {
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_min_alloc_size();
+}
+
+uint64_t cls_get_pool_stripe_width(cls_method_context_t hctx)
+{
+ PrimaryLogPG::OpContext *ctx = *(PrimaryLogPG::OpContext **)hctx;
+
+ return ctx->pg->get_pool().stripe_width;
+}
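+
+// Sketch of how an object-class method consumes these hooks (the method and
+// handle names are illustrative; only the cls_cxx_* calls above are real):
+//
+//   static int example_method(cls_method_context_t hctx,
+//                             bufferlist *in, bufferlist *out) {
+//     bufferlist hdr;
+//     int r = cls_cxx_map_read_header(hctx, &hdr);
+//     if (r < 0)
+//       return r;
+//     return cls_cxx_read2(hctx, 0, 4096, out, 0);  // returns bytes read
+//   }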
diff --git a/src/osd/object_state.h b/src/osd/object_state.h
new file mode 100644
index 000000000..31987d2a4
--- /dev/null
+++ b/src/osd/object_state.h
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "osd_types.h"
+
+struct ObjectState {
+ object_info_t oi;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
+
+ ObjectState() : exists(false) {}
+
+ ObjectState(const object_info_t &oi_, bool exists_)
+ : oi(oi_), exists(exists_) {}
+ ObjectState(object_info_t &&oi_, bool exists_)
+ : oi(std::move(oi_)), exists(exists_) {}
+ ObjectState(const hobject_t &obj) : oi(obj), exists(false) {}
+};
+
+struct RWState {
+ enum State {
+ RWNONE,
+ RWREAD,
+ RWWRITE,
+ RWEXCL,
+ };
+ static const char *get_state_name(State s) {
+ switch (s) {
+ case RWNONE: return "none";
+ case RWREAD: return "read";
+ case RWWRITE: return "write";
+ case RWEXCL: return "excl";
+ default: return "???";
+ }
+ }
+ const char *get_state_name() const {
+ return get_state_name(state);
+ }
+
+ int count; ///< number of readers or writers
+ int waiters = 0; ///< number waiting
+
+ State state:4; ///< rw state
+ /// if set, restart backfill when we can get a read lock
+ bool recovery_read_marker:1;
+ /// if set, requeue snaptrim on lock release
+ bool snaptrimmer_write_marker:1;
+
+ RWState()
+ : count(0),
+ state(RWNONE),
+ recovery_read_marker(false),
+ snaptrimmer_write_marker(false)
+ {}
+
+  /// try to take a read lock; bumps the holder count on success
+ bool get_read_lock() {
+ // don't starve anybody!
+ if (waiters > 0) {
+ return false;
+ }
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWREAD;
+ // fall through
+ case RWREAD:
+ count++;
+ return true;
+ case RWWRITE:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+
+ bool get_write_lock(bool greedy=false) {
+ if (!greedy) {
+ // don't starve anybody!
+ if (waiters > 0 ||
+ recovery_read_marker) {
+ return false;
+ }
+ }
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWWRITE;
+ // fall through
+ case RWWRITE:
+ count++;
+ return true;
+ case RWREAD:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+ bool get_excl_lock() {
+ switch (state) {
+ case RWNONE:
+ ceph_assert(count == 0);
+ state = RWEXCL;
+ count = 1;
+ return true;
+ case RWWRITE:
+ return false;
+ case RWREAD:
+ return false;
+ case RWEXCL:
+ return false;
+ default:
+ ceph_abort_msg("unhandled case");
+ return false;
+ }
+ }
+ /// same as get_write_lock, but ignore starvation
+ bool take_write_lock() {
+ if (state == RWWRITE) {
+ count++;
+ return true;
+ }
+ return get_write_lock();
+ }
+ bool dec() {
+ ceph_assert(count > 0);
+ count--;
+ if (count == 0) {
+ state = RWNONE;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ bool put_read() {
+ ceph_assert(state == RWREAD);
+ return dec();
+ }
+ bool put_write() {
+ ceph_assert(state == RWWRITE);
+ return dec();
+ }
+ bool put_excl() {
+ ceph_assert(state == RWEXCL);
+ return dec();
+ }
+ void inc_waiters() {
+ ++waiters;
+ }
+ void release_waiters() {
+ waiters = 0;
+ }
+ void dec_waiters(int count) {
+ ceph_assert(waiters >= count);
+ waiters -= count;
+ }
+ bool empty() const { return state == RWNONE; }
+
+ bool get_snaptrimmer_write(bool mark_if_unsuccessful) {
+ if (get_write_lock()) {
+ return true;
+ } else {
+ if (mark_if_unsuccessful)
+ snaptrimmer_write_marker = true;
+ return false;
+ }
+ }
+ bool get_recovery_read() {
+ recovery_read_marker = true;
+ if (get_read_lock()) {
+ return true;
+ }
+ return false;
+ }
+};
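+
+// A small sketch of the intended discipline (standalone caller shown only
+// for illustration; real users go through ObjectContext/ObcLockManager):
+//
+//   RWState rw;
+//   if (!rw.get_write_lock()) {   // refused while readers hold it or waiters queue
+//     rw.inc_waiters();           // park the op; it is requeued on wake
+//   } else {
+//     // ... perform the write ...
+//     if (rw.put_write())         // true once the last holder drops out
+//       rw.release_waiters();     // then everything parked gets requeued
+//   }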
+
+inline std::ostream& operator<<(std::ostream& out, const RWState& rw)
+{
+ return out << "rwstate(" << rw.get_state_name()
+ << " n=" << rw.count
+ << " w=" << rw.waiters
+ << ")";
+}
diff --git a/src/osd/osd_internal_types.h b/src/osd/osd_internal_types.h
new file mode 100644
index 000000000..17f4f3146
--- /dev/null
+++ b/src/osd/osd_internal_types.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OSD_INTERNAL_TYPES_H
+#define CEPH_OSD_INTERNAL_TYPES_H
+
+#include "osd_types.h"
+#include "OpRequest.h"
+#include "object_state.h"
+
+/*
+ * keep tabs on object modifications that are in flight.
+ * we need to know the projected existence, size, snapset,
+ * etc., because we don't send writes down to disk until after
+ * replicas ack.
+ */
+
+struct SnapSetContext {
+ hobject_t oid;
+ SnapSet snapset;
+ int ref;
+ bool registered : 1;
+ bool exists : 1;
+
+ explicit SnapSetContext(const hobject_t& o) :
+ oid(o), ref(0), registered(false), exists(true) { }
+};
+struct ObjectContext;
+typedef std::shared_ptr<ObjectContext> ObjectContextRef;
+
+struct ObjectContext {
+ ObjectState obs;
+
+ SnapSetContext *ssc; // may be null
+
+ Context *destructor_callback;
+
+public:
+
+ // any entity in obs.oi.watchers MUST be in either watchers or unconnected_watchers.
+ std::map<std::pair<uint64_t, entity_name_t>, WatchRef> watchers;
+
+ // attr cache
+ std::map<std::string, ceph::buffer::list> attr_cache;
+
+ RWState rwstate;
+ std::list<OpRequestRef> waiters; ///< ops waiting on state change
+ bool get_read(OpRequestRef& op) {
+ if (rwstate.get_read_lock()) {
+ return true;
+ } // else
+ // Now we really need to bump up the ref-counter.
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ return false;
+ }
+ bool get_write(OpRequestRef& op, bool greedy=false) {
+ if (rwstate.get_write_lock(greedy)) {
+ return true;
+ } // else
+ if (op) {
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ }
+ return false;
+ }
+ bool get_excl(OpRequestRef& op) {
+ if (rwstate.get_excl_lock()) {
+ return true;
+ } // else
+ if (op) {
+ waiters.emplace_back(op);
+ rwstate.inc_waiters();
+ }
+ return false;
+ }
+ void wake(std::list<OpRequestRef> *requeue) {
+ rwstate.release_waiters();
+ requeue->splice(requeue->end(), waiters);
+ }
+ void put_read(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_read()) {
+ wake(requeue);
+ }
+ }
+ void put_write(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_write()) {
+ wake(requeue);
+ }
+ }
+ void put_excl(std::list<OpRequestRef> *requeue) {
+ if (rwstate.put_excl()) {
+ wake(requeue);
+ }
+ }
+ bool empty() const { return rwstate.empty(); }
+
+ bool get_lock_type(OpRequestRef& op, RWState::State type) {
+ switch (type) {
+ case RWState::RWWRITE:
+ return get_write(op);
+ case RWState::RWREAD:
+ return get_read(op);
+ case RWState::RWEXCL:
+ return get_excl(op);
+ default:
+ ceph_abort_msg("invalid lock type");
+ return true;
+ }
+ }
+ bool get_write_greedy(OpRequestRef& op) {
+ return get_write(op, true);
+ }
+ bool get_snaptrimmer_write(bool mark_if_unsuccessful) {
+ return rwstate.get_snaptrimmer_write(mark_if_unsuccessful);
+ }
+ bool get_recovery_read() {
+ return rwstate.get_recovery_read();
+ }
+ bool try_get_read_lock() {
+ return rwstate.get_read_lock();
+ }
+ void drop_recovery_read(std::list<OpRequestRef> *ls) {
+ ceph_assert(rwstate.recovery_read_marker);
+ put_read(ls);
+ rwstate.recovery_read_marker = false;
+ }
+ void put_lock_type(
+ RWState::State type,
+ std::list<OpRequestRef> *to_wake,
+ bool *requeue_recovery,
+ bool *requeue_snaptrimmer) {
+ switch (type) {
+ case RWState::RWWRITE:
+ put_write(to_wake);
+ break;
+ case RWState::RWREAD:
+ put_read(to_wake);
+ break;
+ case RWState::RWEXCL:
+ put_excl(to_wake);
+ break;
+ default:
+ ceph_abort_msg("invalid lock type");
+ }
+ if (rwstate.empty() && rwstate.recovery_read_marker) {
+ rwstate.recovery_read_marker = false;
+ *requeue_recovery = true;
+ }
+ if (rwstate.empty() && rwstate.snaptrimmer_write_marker) {
+ rwstate.snaptrimmer_write_marker = false;
+ *requeue_snaptrimmer = true;
+ }
+ }
+ bool is_request_pending() {
+ return !rwstate.empty();
+ }
+
+ ObjectContext()
+ : ssc(NULL),
+ destructor_callback(0),
+ blocked(false), requeue_scrub_on_unblock(false) {}
+
+ ~ObjectContext() {
+ ceph_assert(rwstate.empty());
+ if (destructor_callback)
+ destructor_callback->complete(0);
+ }
+
+ void start_block() {
+ ceph_assert(!blocked);
+ blocked = true;
+ }
+ void stop_block() {
+ ceph_assert(blocked);
+ blocked = false;
+ }
+ bool is_blocked() const {
+ return blocked;
+ }
+
+  /// true while an in-progress copy-from blocks ops on this object
+ bool blocked:1;
+ bool requeue_scrub_on_unblock:1; // true if we need to requeue scrub on unblock
+
+};
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectState& obs)
+{
+ out << obs.oi.soid;
+ if (!obs.exists)
+ out << "(dne)";
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectContext& obc)
+{
+ return out << "obc(" << obc.obs << " " << obc.rwstate << ")";
+}
+
+class ObcLockManager {
+ struct ObjectLockState {
+ ObjectContextRef obc;
+ RWState::State type;
+ ObjectLockState(
+ ObjectContextRef obc,
+ RWState::State type)
+ : obc(std::move(obc)), type(type) {}
+ };
+ std::map<hobject_t, ObjectLockState> locks;
+public:
+ ObcLockManager() = default;
+ ObcLockManager(ObcLockManager &&) = default;
+ ObcLockManager(const ObcLockManager &) = delete;
+ ObcLockManager &operator=(ObcLockManager &&) = default;
+ bool empty() const {
+ return locks.empty();
+ }
+ bool get_lock_type(
+ RWState::State type,
+ const hobject_t &hoid,
+ ObjectContextRef& obc,
+ OpRequestRef& op) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_lock_type(op, type)) {
+ locks.insert(std::make_pair(hoid, ObjectLockState(obc, type)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock, ignore starvation
+ bool take_write_lock(
+ const hobject_t &hoid,
+ ObjectContextRef obc) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->rwstate.take_write_lock()) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock for snap trim
+ bool get_snaptrimmer_write(
+ const hobject_t &hoid,
+ ObjectContextRef obc,
+ bool mark_if_unsuccessful) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_snaptrimmer_write(mark_if_unsuccessful)) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Get write lock greedy
+ bool get_write_greedy(
+ const hobject_t &hoid,
+ ObjectContextRef obc,
+ OpRequestRef op) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->get_write_greedy(op)) {
+ locks.insert(
+ std::make_pair(
+ hoid, ObjectLockState(obc, RWState::RWWRITE)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /// try get read lock
+ bool try_get_read_lock(
+ const hobject_t &hoid,
+ ObjectContextRef obc) {
+ ceph_assert(locks.find(hoid) == locks.end());
+ if (obc->try_get_read_lock()) {
+ locks.insert(
+ std::make_pair(
+ hoid,
+ ObjectLockState(obc, RWState::RWREAD)));
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ void put_locks(
+ std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > *to_requeue,
+ bool *requeue_recovery,
+ bool *requeue_snaptrimmer) {
+ for (auto& p: locks) {
+ std::list<OpRequestRef> _to_requeue;
+ p.second.obc->put_lock_type(
+ p.second.type,
+ &_to_requeue,
+ requeue_recovery,
+ requeue_snaptrimmer);
+ if (to_requeue) {
+ // We can safely std::move here as the whole `locks` is going
+ // to die just after the loop.
+ to_requeue->emplace_back(std::move(p.second.obc),
+ std::move(_to_requeue));
+ }
+ }
+ locks.clear();
+ }
+ ~ObcLockManager() {
+ ceph_assert(locks.empty());
+ }
+};
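+
+// Expected pattern (illustrative caller; `hoid`, `obc` and `op` are assumed
+// to come from the surrounding PG code): take locks through the manager and
+// drain it with put_locks() before it is destroyed.
+//
+//   ObcLockManager mgr;
+//   if (!mgr.get_lock_type(RWState::RWWRITE, hoid, obc, op))
+//     return;  // op was queued on the obc's waiter list
+//   // ... mutate the object ...
+//   std::list<std::pair<ObjectContextRef, std::list<OpRequestRef>>> requeue;
+//   bool requeue_recovery = false, requeue_snaptrimmer = false;
+//   mgr.put_locks(&requeue, &requeue_recovery, &requeue_snaptrimmer);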
+
+
+
+#endif
diff --git a/src/osd/osd_op_util.cc b/src/osd/osd_op_util.cc
new file mode 100644
index 000000000..54c590ee2
--- /dev/null
+++ b/src/osd/osd_op_util.cc
@@ -0,0 +1,263 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd/osd_op_util.h"
+
+#include "osd/ClassHandler.h"
+#include "messages/MOSDOp.h"
+
+using std::ostream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+bool OpInfo::check_rmw(int flag) const {
+ ceph_assert(rmw_flags != 0);
+ return rmw_flags & flag;
+}
+bool OpInfo::may_read() const {
+ return need_read_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_READ);
+}
+bool OpInfo::may_write() const {
+ return need_write_cap() || check_rmw(CEPH_OSD_RMW_FLAG_CLASS_WRITE);
+}
+bool OpInfo::may_cache() const { return check_rmw(CEPH_OSD_RMW_FLAG_CACHE); }
+bool OpInfo::rwordered_forced() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_RWORDERED);
+}
+bool OpInfo::rwordered() const {
+ return may_write() || may_cache() || rwordered_forced();
+}
+
+bool OpInfo::includes_pg_op() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_PGOP);
+}
+bool OpInfo::need_read_cap() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_READ);
+}
+bool OpInfo::need_write_cap() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_WRITE);
+}
+bool OpInfo::need_promote() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE);
+}
+bool OpInfo::need_skip_handle_cache() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE);
+}
+bool OpInfo::need_skip_promote() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE);
+}
+bool OpInfo::allows_returnvec() const {
+ return check_rmw(CEPH_OSD_RMW_FLAG_RETURNVEC);
+}
+
+void OpInfo::set_rmw_flags(int flags) {
+ rmw_flags |= flags;
+}
+
+void OpInfo::set_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_READ); }
+void OpInfo::set_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_WRITE); }
+void OpInfo::set_class_read() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_READ); }
+void OpInfo::set_class_write() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CLASS_WRITE); }
+void OpInfo::set_pg_op() { set_rmw_flags(CEPH_OSD_RMW_FLAG_PGOP); }
+void OpInfo::set_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_CACHE); }
+void OpInfo::set_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_FORCE_PROMOTE); }
+void OpInfo::set_skip_handle_cache() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE); }
+void OpInfo::set_skip_promote() { set_rmw_flags(CEPH_OSD_RMW_FLAG_SKIP_PROMOTE); }
+void OpInfo::set_force_rwordered() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RWORDERED); }
+void OpInfo::set_returnvec() { set_rmw_flags(CEPH_OSD_RMW_FLAG_RETURNVEC); }
+
+
+int OpInfo::set_from_op(
+ const MOSDOp *m,
+ const OSDMap &osdmap)
+{
+ vector<OSDOp>::const_iterator iter;
+
+ // client flags have no bearing on whether an op is a read, write, etc.
+ clear();
+
+ if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
+ set_force_rwordered();
+ }
+ if (m->has_flag(CEPH_OSD_FLAG_RETURNVEC)) {
+ set_returnvec();
+ }
+
+ // set bits based on op codes, called methods.
+ for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
+ if ((iter->op.op == CEPH_OSD_OP_WATCH &&
+ iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
+      /* This is a bit odd.  PING isn't actually a write.  It can't
+ * result in an update to the object_info. PINGs also aren't
+ * resent, so there's no reason to write out a log entry.
+ *
+ * However, we pipeline them behind writes, so let's force
+ * the write_ordered flag.
+ */
+ set_force_rwordered();
+ } else {
+ if (ceph_osd_op_mode_modify(iter->op.op))
+ set_write();
+ }
+ if (ceph_osd_op_mode_read(iter->op.op))
+ set_read();
+
+ // set READ flag if there are src_oids
+ if (iter->soid.oid.name.length())
+ set_read();
+
+ // set PGOP flag if there are PG ops
+ if (ceph_osd_op_type_pg(iter->op.op))
+ set_pg_op();
+
+ if (ceph_osd_op_mode_cache(iter->op.op))
+ set_cache();
+
+ // check for ec base pool
+ int64_t poolid = m->get_pg().pool();
+ const pg_pool_t *pool = osdmap.get_pg_pool(poolid);
+ if (pool && pool->is_tier()) {
+ const pg_pool_t *base_pool = osdmap.get_pg_pool(pool->tier_of);
+ if (base_pool && base_pool->require_rollback()) {
+ if ((iter->op.op != CEPH_OSD_OP_READ) &&
+ (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
+ (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
+ (iter->op.op != CEPH_OSD_OP_STAT) &&
+ (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
+ (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
+ (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
+ (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
+ (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
+ (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
+ (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
+ (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
+ (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
+ (iter->op.op != CEPH_OSD_OP_CREATE) &&
+ (iter->op.op != CEPH_OSD_OP_DELETE) &&
+ (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
+ (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_FROM) &&
+ (iter->op.op != CEPH_OSD_OP_COPY_FROM2)) {
+ set_promote();
+ }
+ }
+ }
+
+ switch (iter->op.op) {
+ case CEPH_OSD_OP_CALL:
+ {
+ bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
+ int is_write, is_read;
+ string cname, mname;
+ bp.copy(iter->op.cls.class_len, cname);
+ bp.copy(iter->op.cls.method_len, mname);
+
+ ClassHandler::ClassData *cls;
+ int r = ClassHandler::get_instance().open_class(cname, &cls);
+ if (r) {
+ if (r == -ENOENT)
+ r = -EOPNOTSUPP;
+ else if (r != -EPERM) // propagate permission errors
+ r = -EIO;
+ return r;
+ }
+ int flags = cls->get_method_flags(mname);
+ if (flags < 0) {
+ if (flags == -ENOENT)
+ r = -EOPNOTSUPP;
+ else
+ r = flags;
+ return r;
+ }
+ is_read = flags & CLS_METHOD_RD;
+ is_write = flags & CLS_METHOD_WR;
+ bool is_promote = flags & CLS_METHOD_PROMOTE;
+
+ if (is_read)
+ set_class_read();
+ if (is_write)
+ set_class_write();
+ if (is_promote)
+ set_promote();
+ add_class(std::move(cname), std::move(mname), is_read, is_write,
+ cls->allowed);
+ break;
+ }
+
+ case CEPH_OSD_OP_WATCH:
+      // force the read bit for watch since it depends on previous
+ // watch state (and may return early if the watch exists) or, in
+ // the case of ping, is simply a read op.
+ set_read();
+ // fall through
+ case CEPH_OSD_OP_NOTIFY:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ {
+ set_promote();
+ break;
+ }
+
+ case CEPH_OSD_OP_DELETE:
+      // if we get a delete with FAILOK we can skip handle cache. Without
+      // FAILOK we still need to promote (or do something smarter) to
+      // determine whether to return ENOENT or 0.
+ if (iter == m->ops.begin() &&
+ iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
+ set_skip_handle_cache();
+ }
+ // skip promotion when proxying a delete op
+ if (m->ops.size() == 1) {
+ set_skip_promote();
+ }
+ break;
+
+ case CEPH_OSD_OP_CACHE_TRY_FLUSH:
+ case CEPH_OSD_OP_CACHE_FLUSH:
+ case CEPH_OSD_OP_CACHE_EVICT:
+ // If try_flush/flush/evict is the only op, can skip handle cache.
+ if (m->ops.size() == 1) {
+ set_skip_handle_cache();
+ }
+ break;
+
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_CHECKSUM:
+ case CEPH_OSD_OP_WRITEFULL:
+ if (m->ops.size() == 1 &&
+ (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
+ iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
+ set_skip_promote();
+ }
+ break;
+
+    // force promotion when pinning an object in the cache tier
+ case CEPH_OSD_OP_CACHE_PIN:
+ set_promote();
+ break;
+
+ default:
+ break;
+ }
+ }
+
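+  // no rmw flags were set by any op; treat the request as invalid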
+ if (rmw_flags == 0)
+ return -EINVAL;
+
+ return 0;
+
+}
+
+ostream& operator<<(ostream& out, const OpInfo::ClassInfo& i)
+{
+ out << "class " << i.class_name << " method " << i.method_name
+ << " rd " << i.read << " wr " << i.write << " allowed " << i.allowed;
+ return out;
+}
diff --git a/src/osd/osd_op_util.h b/src/osd/osd_op_util.h
new file mode 100644
index 000000000..5fb568e40
--- /dev/null
+++ b/src/osd/osd_op_util.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "osd/OSDMap.h"
+
+#include "messages/MOSDOp.h"
+
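+// OpInfo caches the rmw flags and invoked cls class/method info derived from
+// an MOSDOp, so later stages can query may_read()/may_write()/need_promote()
+// without re-scanning the ops (see set_from_op() in osd_op_util.cc).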
+class OpInfo {
+public:
+ struct ClassInfo {
+ ClassInfo(std::string&& class_name, std::string&& method_name,
+ bool read, bool write, bool allowed) :
+ class_name(std::move(class_name)), method_name(std::move(method_name)),
+ read(read), write(write), allowed(allowed)
+ {}
+ const std::string class_name;
+ const std::string method_name;
+ const bool read, write, allowed;
+ };
+
+private:
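+  // accumulated rmw flag bits; populated by the set_*() helpers below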
+ uint64_t rmw_flags = 0;
+ std::vector<ClassInfo> classes;
+
+ void set_rmw_flags(int flags);
+
+ void add_class(std::string&& class_name, std::string&& method_name,
+ bool read, bool write, bool allowed) {
+ classes.emplace_back(std::move(class_name), std::move(method_name),
+ read, write, allowed);
+ }
+
+public:
+
+ void clear() {
+ rmw_flags = 0;
+ }
+
+ uint64_t get_flags() const {
+ return rmw_flags;
+ }
+
+  bool check_rmw(int flag) const;
+ bool may_read() const;
+ bool may_write() const;
+ bool may_cache() const;
+ bool rwordered_forced() const;
+ bool rwordered() const;
+ bool includes_pg_op() const;
+ bool need_read_cap() const;
+ bool need_write_cap() const;
+ bool need_promote() const;
+ bool need_skip_handle_cache() const;
+ bool need_skip_promote() const;
+ bool allows_returnvec() const;
+
+ void set_read();
+ void set_write();
+ void set_cache();
+ void set_class_read();
+ void set_class_write();
+ void set_pg_op();
+ void set_promote();
+ void set_skip_handle_cache();
+ void set_skip_promote();
+ void set_force_rwordered();
+ void set_returnvec();
+
+ int set_from_op(
+ const MOSDOp *m,
+ const OSDMap &osdmap);
+
+ std::vector<ClassInfo> get_classes() const {
+ return classes;
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const OpInfo::ClassInfo& i);
diff --git a/src/osd/osd_perf_counters.cc b/src/osd/osd_perf_counters.cc
new file mode 100644
index 000000000..ed63b4d3f
--- /dev/null
+++ b/src/osd/osd_perf_counters.cc
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "osd_perf_counters.h"
+#include "include/common_fwd.h"
+
+
+PerfCounters *build_osd_logger(CephContext *cct) {
+ PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
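+  // counter indices l_osd_first..l_osd_last come from the enum in osd_perf_counters.h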
+
+  // Latency axis configuration for op histograms; values are in nanoseconds
+ PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
+ "Latency (usec)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
+ 0, ///< Start at 0
+ 100000, ///< Quantization unit is 100usec
+ 32, ///< Enough to cover much longer than slow requests
+ };
+
+  // Op size axis configuration for op histograms; values are in bytes
+ PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
+ "Request size (bytes)",
+ PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+ 0, ///< Start at 0
+ 512, ///< Quantization unit is 512 bytes
+ 32, ///< Enough to cover requests larger than GB
+ };
+
+
+ // All the basic OSD operation stats are to be considered useful
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+ osd_plb.add_u64(
+ l_osd_op_wip, "op_wip",
+ "Replication operations currently being processed (primary)");
+ osd_plb.add_u64_counter(
+ l_osd_op, "op",
+ "Client operations",
+ "ops", PerfCountersBuilder::PRIO_CRITICAL);
+ osd_plb.add_u64_counter(
+ l_osd_op_inb, "op_in_bytes",
+ "Client operations total write size",
+ "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ osd_plb.add_u64_counter(
+ l_osd_op_outb, "op_out_bytes",
+ "Client operations total read size",
+ "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_lat, "op_latency",
+ "Latency of client operations (including queue time)",
+ "l", 9);
+ osd_plb.add_time_avg(
+ l_osd_op_process_lat, "op_process_latency",
+ "Latency of client operations (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_prepare_lat, "op_prepare_latency",
+ "Latency of client operations (excluding queue time and wait for finished)");
+
+ osd_plb.add_u64_counter(
+ l_osd_op_r, "op_r", "Client read operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_r_lat, "op_r_latency",
+ "Latency of read operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_r_process_lat, "op_r_process_latency",
+ "Latency of read operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_r_prepare_lat, "op_r_prepare_latency",
+ "Latency of read operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_w, "op_w", "Client write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_lat, "op_w_latency",
+ "Latency of write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of operation latency (including queue time) + data written");
+ osd_plb.add_time_avg(
+ l_osd_op_w_process_lat, "op_w_process_latency",
+ "Latency of write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_w_prepare_lat, "op_w_prepare_latency",
+ "Latency of write operations (excluding queue time and wait for finished)");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw, "op_rw",
+ "Client read-modify-write operations");
+ osd_plb.add_u64_counter(
+ l_osd_op_rw_inb, "op_rw_in_bytes",
+ "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64_counter(
+    l_osd_op_rw_outb, "op_rw_out_bytes",
+    "Client read-modify-write operations read out", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_op_rw_lat, "op_rw_latency",
+ "Latency of read-modify-write operation (including queue time)");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data written");
+ osd_plb.add_u64_counter_histogram(
+ l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
+ op_hist_x_axis_config, op_hist_y_axis_config,
+ "Histogram of rw operation latency (including queue time) + data read");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_process_lat, "op_rw_process_latency",
+ "Latency of read-modify-write operation (excluding queue time)");
+ osd_plb.add_time_avg(
+ l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
+ "Latency of read-modify-write operations (excluding queue time and wait for finished)");
+
+  // Now we move on to some more obscure stats; revert to assuming things
+  // are low priority unless otherwise specified.
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+
+ osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
+		       "Latency of IO before calling queue (before actually queueing into ShardedOpWq)"); // client io before queue op_wq latency
+ osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
+		       "Latency of IO before calling dequeue_op (already dequeued and holding the PG lock)"); // client io before dequeue_op latency
+
+ osd_plb.add_u64_counter(
+ l_osd_sop, "subop", "Suboperations");
+ osd_plb.add_u64_counter(
+ l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
+
+ osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
+ osd_plb.add_u64_counter(
+ l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
+ osd_plb.add_time_avg(
+ l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push, "subop_push", "Suboperations push messages");
+ osd_plb.add_u64_counter(
+ l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
+ osd_plb.add_time_avg(
+ l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
+
+ osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
+ osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
+ osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
+
+ osd_plb.add_u64_counter(
+ l_osd_rop, "recovery_ops",
+ "Started recovery operations",
+ "rop", PerfCountersBuilder::PRIO_INTERESTING);
+
+ osd_plb.add_u64_counter(
+ l_osd_rbytes, "recovery_bytes",
+ "recovery bytes",
+ "rbt", PerfCountersBuilder::PRIO_INTERESTING);
+
+ osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
+ osd_plb.add_u64(
+ l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
+ osd_plb.add_u64(
+ l_osd_cached_crc_adjusted, "cached_crc_adjusted",
+ "Total number getting crc from crc_cache with adjusting");
+ osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
+ "Total number of crc cache misses");
+
+ osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
+ "pgs", PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_pg_primary, "numpg_primary",
+ "Placement groups for which this osd is primary");
+ osd_plb.add_u64(
+ l_osd_pg_replica, "numpg_replica",
+ "Placement groups for which this osd is replica");
+ osd_plb.add_u64(
+ l_osd_pg_stray, "numpg_stray",
+ "Placement groups ready to be deleted from this osd");
+ osd_plb.add_u64(
+ l_osd_pg_removing, "numpg_removing",
+ "Placement groups queued for local deletion", "pgsr",
+ PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
+ osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
+ osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
+ osd_plb.add_u64_counter(
+ l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
+ osd_plb.add_u64_counter(
+ l_osd_waiting_for_map, "messages_delayed_for_map",
+ "Operations waiting for OSD map");
+
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
+ osd_plb.add_u64_counter(
+ l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
+ "osdmap cache miss below cache lower bound");
+ osd_plb.add_u64_avg(
+ l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
+ "osdmap cache miss, avg distance below cache lower bound");
+ osd_plb.add_u64_counter(
+ l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
+ "OSDMap buffer cache hits");
+ osd_plb.add_u64_counter(
+ l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
+ "OSDMap buffer cache misses");
+
+ osd_plb.add_u64(
+ l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64(
+ l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
+ PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
+
+ osd_plb.add_u64_counter(
+ l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
+
+ osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
+ osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
+ osd_plb.add_u64_counter(
+ l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_try_flush_fail, "tier_try_flush_fail",
+ "Failed tier flush attempts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_evict, "tier_evict", "Tier evictions");
+ osd_plb.add_u64_counter(
+ l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
+ osd_plb.add_u64_counter(
+ l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
+ osd_plb.add_u64_counter(
+ l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
+ osd_plb.add_u64_counter(
+ l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
+ osd_plb.add_u64_counter(
+ l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
+
+ osd_plb.add_u64_counter(
+ l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
+ osd_plb.add_u64_counter(
+ l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
+ osd_plb.add_u64_counter(
+ l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
+ osd_plb.add_u64_counter(
+ l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
+
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
+ osd_plb.add_u64_counter(
+ l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
+
+ osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
+ osd_plb.add_time_avg(
+ l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
+ osd_plb.add_time_avg(
+ l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
+
+ osd_plb.add_u64_counter(
+ l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
+ osd_plb.add_u64_counter(
+ l_osd_pg_fastinfo, "osd_pg_fastinfo",
+ "PG updated its info using fastinfo attr");
+ osd_plb.add_u64_counter(
+ l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
+
+ return osd_plb.create_perf_counters();
+}
+
+
+PerfCounters *build_recoverystate_perf(CephContext *cct) {
+ PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
+
+ rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
+ rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
+ rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
+ rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
+ rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
+ rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
+ rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
+ rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
+ rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
+ rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
+ rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
+ rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
+ rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
+ rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
+ rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
+ rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
+ rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
+ rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
+ rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
+ rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
+ rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
+ rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
+ rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
+ rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
+ rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
+ rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
+ rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
+
+ return rs_perf.create_perf_counters();
+}
diff --git a/src/osd/osd_perf_counters.h b/src/osd/osd_perf_counters.h
new file mode 100644
index 000000000..9966a7f7d
--- /dev/null
+++ b/src/osd/osd_perf_counters.h
@@ -0,0 +1,163 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/common_fwd.h"
+#include "common/perf_counters.h"
+
+enum {
+ l_osd_first = 10000,
+ l_osd_op_wip,
+ l_osd_op,
+ l_osd_op_inb,
+ l_osd_op_outb,
+ l_osd_op_lat,
+ l_osd_op_process_lat,
+ l_osd_op_prepare_lat,
+ l_osd_op_r,
+ l_osd_op_r_outb,
+ l_osd_op_r_lat,
+ l_osd_op_r_lat_outb_hist,
+ l_osd_op_r_process_lat,
+ l_osd_op_r_prepare_lat,
+ l_osd_op_w,
+ l_osd_op_w_inb,
+ l_osd_op_w_lat,
+ l_osd_op_w_lat_inb_hist,
+ l_osd_op_w_process_lat,
+ l_osd_op_w_prepare_lat,
+ l_osd_op_rw,
+ l_osd_op_rw_inb,
+ l_osd_op_rw_outb,
+ l_osd_op_rw_lat,
+ l_osd_op_rw_lat_inb_hist,
+ l_osd_op_rw_lat_outb_hist,
+ l_osd_op_rw_process_lat,
+ l_osd_op_rw_prepare_lat,
+
+ l_osd_op_before_queue_op_lat,
+ l_osd_op_before_dequeue_op_lat,
+
+ l_osd_sop,
+ l_osd_sop_inb,
+ l_osd_sop_lat,
+ l_osd_sop_w,
+ l_osd_sop_w_inb,
+ l_osd_sop_w_lat,
+ l_osd_sop_pull,
+ l_osd_sop_pull_lat,
+ l_osd_sop_push,
+ l_osd_sop_push_inb,
+ l_osd_sop_push_lat,
+
+ l_osd_pull,
+ l_osd_push,
+ l_osd_push_outb,
+
+ l_osd_rop,
+ l_osd_rbytes,
+
+ l_osd_loadavg,
+ l_osd_cached_crc,
+ l_osd_cached_crc_adjusted,
+ l_osd_missed_crc,
+
+ l_osd_pg,
+ l_osd_pg_primary,
+ l_osd_pg_replica,
+ l_osd_pg_stray,
+ l_osd_pg_removing,
+ l_osd_hb_to,
+ l_osd_map,
+ l_osd_mape,
+ l_osd_mape_dup,
+
+ l_osd_waiting_for_map,
+
+ l_osd_map_cache_hit,
+ l_osd_map_cache_miss,
+ l_osd_map_cache_miss_low,
+ l_osd_map_cache_miss_low_avg,
+ l_osd_map_bl_cache_hit,
+ l_osd_map_bl_cache_miss,
+
+ l_osd_stat_bytes,
+ l_osd_stat_bytes_used,
+ l_osd_stat_bytes_avail,
+
+ l_osd_copyfrom,
+
+ l_osd_tier_promote,
+ l_osd_tier_flush,
+ l_osd_tier_flush_fail,
+ l_osd_tier_try_flush,
+ l_osd_tier_try_flush_fail,
+ l_osd_tier_evict,
+ l_osd_tier_whiteout,
+ l_osd_tier_dirty,
+ l_osd_tier_clean,
+ l_osd_tier_delay,
+ l_osd_tier_proxy_read,
+ l_osd_tier_proxy_write,
+
+ l_osd_agent_wake,
+ l_osd_agent_skip,
+ l_osd_agent_flush,
+ l_osd_agent_evict,
+
+ l_osd_object_ctx_cache_hit,
+ l_osd_object_ctx_cache_total,
+
+ l_osd_op_cache_hit,
+ l_osd_tier_flush_lat,
+ l_osd_tier_promote_lat,
+ l_osd_tier_r_lat,
+
+ l_osd_pg_info,
+ l_osd_pg_fastinfo,
+ l_osd_pg_biginfo,
+
+ l_osd_last,
+};
+
+PerfCounters *build_osd_logger(CephContext *cct);
+
+// PeeringState perf counters
+enum {
+ rs_first = 20000,
+ rs_initial_latency,
+ rs_started_latency,
+ rs_reset_latency,
+ rs_start_latency,
+ rs_primary_latency,
+ rs_peering_latency,
+ rs_backfilling_latency,
+ rs_waitremotebackfillreserved_latency,
+ rs_waitlocalbackfillreserved_latency,
+ rs_notbackfilling_latency,
+ rs_repnotrecovering_latency,
+ rs_repwaitrecoveryreserved_latency,
+ rs_repwaitbackfillreserved_latency,
+ rs_reprecovering_latency,
+ rs_activating_latency,
+ rs_waitlocalrecoveryreserved_latency,
+ rs_waitremoterecoveryreserved_latency,
+ rs_recovering_latency,
+ rs_recovered_latency,
+ rs_clean_latency,
+ rs_active_latency,
+ rs_replicaactive_latency,
+ rs_stray_latency,
+ rs_getinfo_latency,
+ rs_getlog_latency,
+ rs_waitactingchange_latency,
+ rs_incomplete_latency,
+ rs_down_latency,
+ rs_getmissing_latency,
+ rs_waitupthru_latency,
+ rs_notrecovering_latency,
+ rs_last,
+};
+
+PerfCounters *build_recoverystate_perf(CephContext *cct);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
new file mode 100644
index 000000000..13358560f
--- /dev/null
+++ b/src/osd/osd_types.cc
@@ -0,0 +1,7212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <list>
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+
+#include <boost/assign/list_of.hpp>
+
+#include "include/ceph_features.h"
+#include "include/encoding.h"
+#include "include/stringify.h"
+extern "C" {
+#include "crush/hash.h"
+}
+
+#include "common/Formatter.h"
+#include "common/StackStringStream.h"
+#include "OSDMap.h"
+#include "osd_types.h"
+#include "os/Transaction.h"
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::decode_nohead;
+using ceph::encode;
+using ceph::encode_nohead;
+using ceph::Formatter;
+using ceph::make_timespan;
+using ceph::JSONFormatter;
+
+using namespace std::literals;
+
+const char *ceph_osd_flag_name(unsigned flag)
+{
+ switch (flag) {
+ case CEPH_OSD_FLAG_ACK: return "ack";
+ case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
+ case CEPH_OSD_FLAG_ONDISK: return "ondisk";
+ case CEPH_OSD_FLAG_RETRY: return "retry";
+ case CEPH_OSD_FLAG_READ: return "read";
+ case CEPH_OSD_FLAG_WRITE: return "write";
+ case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
+ case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
+ case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
+ case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
+ case CEPH_OSD_FLAG_PGOP: return "pgop";
+ case CEPH_OSD_FLAG_EXEC: return "exec";
+ case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
+ case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
+ case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
+ case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
+ case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
+ case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
+ case CEPH_OSD_FLAG_FLUSH: return "flush";
+ case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
+ case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
+ case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
+ case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
+ case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
+ case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
+ case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
+ case CEPH_OSD_FLAG_RETURNVEC: return "returnvec";
+ default: return "???";
+ }
+}
+
+string ceph_osd_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+const char * ceph_osd_op_flag_name(unsigned flag)
+{
+ const char *name;
+
+ switch(flag) {
+ case CEPH_OSD_OP_FLAG_EXCL:
+ name = "excl";
+ break;
+ case CEPH_OSD_OP_FLAG_FAILOK:
+ name = "failok";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
+ name = "fadvise_random";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
+ name = "fadvise_sequential";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
+    name = "fadvise_willneed";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
+ name = "fadvise_dontneed";
+ break;
+ case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
+ name = "fadvise_nocache";
+ break;
+ case CEPH_OSD_OP_FLAG_WITH_REFERENCE:
+ name = "with_reference";
+ break;
+ case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
+ name = "bypass_clean_cache";
+ break;
+ default:
+ name = "???";
+ };
+
+ return name;
+}
+
+string ceph_osd_op_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_op_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+string ceph_osd_alloc_hint_flag_string(unsigned flags)
+{
+ string s;
+ for (unsigned i=0; i<32; ++i) {
+ if (flags & (1u<<i)) {
+ if (s.length())
+ s += "+";
+ s += ceph_osd_alloc_hint_flag_name(1u << i);
+ }
+ }
+ if (s.length())
+ return s;
+ return string("-");
+}
+
+void pg_shard_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(osd, bl);
+ encode(shard, bl);
+ ENCODE_FINISH(bl);
+}
+void pg_shard_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(osd, bl);
+ decode(shard, bl);
+ DECODE_FINISH(bl);
+}
+
+ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
+{
+ if (rhs.is_undefined())
+ return lhs << "?";
+ if (rhs.shard == shard_id_t::NO_SHARD)
+ return lhs << rhs.get_osd();
+ return lhs << rhs.get_osd() << '(' << (unsigned)(rhs.shard) << ')';
+}
+
+void dump(Formatter* f, const osd_alerts_t& alerts)
+{
+ for (auto& a : alerts) {
+ string s0 = " osd: ";
+ s0 += stringify(a.first);
+ string s;
+ for (auto& aa : a.second) {
+ s = s0;
+ s += " ";
+ s += aa.first;
+ s += ":";
+ s += aa.second;
+ f->dump_string("alert", s);
+ }
+ }
+}
+
+// -- osd_reqid_t --
+void osd_reqid_t::dump(Formatter *f) const
+{
+ f->dump_stream("name") << name;
+ f->dump_int("inc", inc);
+ f->dump_unsigned("tid", tid);
+}
+
+void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
+{
+ o.push_back(new osd_reqid_t);
+ o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
+}
+
+// -- object_locator_t --
+
+void object_locator_t::encode(ceph::buffer::list& bl) const
+{
+ // verify that nobody's corrupted the locator
+ ceph_assert(hash == -1 || key.empty());
+ __u8 encode_compat = 3;
+ ENCODE_START(6, encode_compat, bl);
+ encode(pool, bl);
+ int32_t preferred = -1; // tell old code there is no preferred osd (-1).
+ encode(preferred, bl);
+ encode(key, bl);
+ encode(nspace, bl);
+ encode(hash, bl);
+ if (hash != -1)
+ encode_compat = std::max<std::uint8_t>(encode_compat, 6); // need to interpret the hash
+ ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
+}
+
+void object_locator_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
+ if (struct_v < 2) {
+ int32_t op;
+ decode(op, p);
+ pool = op;
+ int16_t pref;
+ decode(pref, p);
+ } else {
+ decode(pool, p);
+ int32_t preferred;
+ decode(preferred, p);
+ }
+ decode(key, p);
+ if (struct_v >= 5)
+ decode(nspace, p);
+ if (struct_v >= 6)
+ decode(hash, p);
+ else
+ hash = -1;
+ DECODE_FINISH(p);
+ // verify that nobody's corrupted the locator
+ ceph_assert(hash == -1 || key.empty());
+}
+
+void object_locator_t::dump(Formatter *f) const
+{
+ f->dump_int("pool", pool);
+ f->dump_string("key", key);
+ f->dump_string("namespace", nspace);
+ f->dump_int("hash", hash);
+}
+
+void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
+{
+ o.push_back(new object_locator_t);
+ o.push_back(new object_locator_t(123));
+ o.push_back(new object_locator_t(123, 876));
+ o.push_back(new object_locator_t(1, "n2"));
+ o.push_back(new object_locator_t(1234, "", "key"));
+ o.push_back(new object_locator_t(12, "n1", "key2"));
+}
+
+// -- request_redirect_t --
+void request_redirect_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(redirect_locator, bl);
+ encode(redirect_object, bl);
+ // legacy of the removed osd_instructions member
+ encode((uint32_t)0, bl);
+ ENCODE_FINISH(bl);
+}
+
+void request_redirect_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ uint32_t legacy_osd_instructions_len;
+ decode(redirect_locator, bl);
+ decode(redirect_object, bl);
+ decode(legacy_osd_instructions_len, bl);
+ if (legacy_osd_instructions_len) {
+ bl += legacy_osd_instructions_len;
+ }
+ DECODE_FINISH(bl);
+}
+
+void request_redirect_t::dump(Formatter *f) const
+{
+ f->dump_string("object", redirect_object);
+ f->open_object_section("locator");
+ redirect_locator.dump(f);
+ f->close_section(); // locator
+}
+
+void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
+{
+ object_locator_t loc(1, "redir_obj");
+ o.push_back(new request_redirect_t());
+ o.push_back(new request_redirect_t(loc, 0));
+ o.push_back(new request_redirect_t(loc, "redir_obj"));
+ o.push_back(new request_redirect_t(loc));
+}
+
+void objectstore_perf_stat_t::dump(Formatter *f) const
+{
+ // *_ms values just for compatibility.
+ f->dump_float("commit_latency_ms", os_commit_latency_ns / 1000000.0);
+ f->dump_float("apply_latency_ms", os_apply_latency_ns / 1000000.0);
+ f->dump_unsigned("commit_latency_ns", os_commit_latency_ns);
+ f->dump_unsigned("apply_latency_ns", os_apply_latency_ns);
+}
+
+void objectstore_perf_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ uint8_t target_v = 2;
+ if (!HAVE_FEATURE(features, OS_PERF_STAT_NS)) {
+ target_v = 1;
+ }
+ ENCODE_START(target_v, target_v, bl);
+ if (target_v >= 2) {
+ encode(os_commit_latency_ns, bl);
+ encode(os_apply_latency_ns, bl);
+ } else {
+ constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
+ uint32_t commit_latency_ms = os_commit_latency_ns / NS_PER_MS;
+ uint32_t apply_latency_ms = os_apply_latency_ns / NS_PER_MS;
+ encode(commit_latency_ms, bl); // for compatibility with older monitor.
+ encode(apply_latency_ms, bl); // for compatibility with older monitor.
+ }
+ ENCODE_FINISH(bl);
+}
+
+void objectstore_perf_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ if (struct_v >= 2) {
+ decode(os_commit_latency_ns, bl);
+ decode(os_apply_latency_ns, bl);
+ } else {
+ uint32_t commit_latency_ms;
+ uint32_t apply_latency_ms;
+ decode(commit_latency_ms, bl);
+ decode(apply_latency_ms, bl);
+ constexpr auto NS_PER_MS = std::chrono::nanoseconds(1ms).count();
+ os_commit_latency_ns = commit_latency_ms * NS_PER_MS;
+ os_apply_latency_ns = apply_latency_ms * NS_PER_MS;
+ }
+ DECODE_FINISH(bl);
+}
+
+void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
+{
+ o.push_back(new objectstore_perf_stat_t());
+ o.push_back(new objectstore_perf_stat_t());
+ o.back()->os_commit_latency_ns = 20000000;
+ o.back()->os_apply_latency_ns = 30000000;
+}
+
+// -- osd_stat_t --
+void osd_stat_t::dump(Formatter *f, bool with_net) const
+{
+ f->dump_unsigned("up_from", up_from);
+ f->dump_unsigned("seq", seq);
+ f->dump_unsigned("num_pgs", num_pgs);
+ f->dump_unsigned("num_osds", num_osds);
+ f->dump_unsigned("num_per_pool_osds", num_per_pool_osds);
+ f->dump_unsigned("num_per_pool_omap_osds", num_per_pool_omap_osds);
+
+ /// dump legacy stats fields to ensure backward compatibility.
+ f->dump_unsigned("kb", statfs.kb());
+ f->dump_unsigned("kb_used", statfs.kb_used_raw());
+ f->dump_unsigned("kb_used_data", statfs.kb_used_data());
+ f->dump_unsigned("kb_used_omap", statfs.kb_used_omap());
+ f->dump_unsigned("kb_used_meta", statfs.kb_used_internal_metadata());
+ f->dump_unsigned("kb_avail", statfs.kb_avail());
+ ////////////////////
+
+ f->open_object_section("statfs");
+ statfs.dump(f);
+ f->close_section();
+ f->open_array_section("hb_peers");
+ for (auto p : hb_peers)
+ f->dump_int("osd", p);
+ f->close_section();
+ f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
+ f->dump_int("num_snap_trimming", num_snap_trimming);
+ f->dump_int("num_shards_repaired", num_shards_repaired);
+ f->open_object_section("op_queue_age_hist");
+ op_queue_age_hist.dump(f);
+ f->close_section();
+ f->open_object_section("perf_stat");
+ os_perf_stat.dump(f);
+ f->close_section();
+ f->open_array_section("alerts");
+ ::dump(f, os_alerts);
+ f->close_section();
+ if (with_net) {
+ dump_ping_time(f);
+ }
+}
+
+void osd_stat_t::dump_ping_time(Formatter *f) const
+{
+ f->open_array_section("network_ping_times");
+ for (auto &i : hb_pingtime) {
+ f->open_object_section("entry");
+ f->dump_int("osd", i.first);
+ const time_t lu(i.second.last_update);
+ char buffer[26];
+ string lustr(ctime_r(&lu, buffer));
+ lustr.pop_back(); // Remove trailing \n
+ f->dump_string("last update", lustr);
+ f->open_array_section("interfaces");
+ f->open_object_section("interface");
+ f->dump_string("interface", "back");
+ f->open_object_section("average");
+ f->dump_float("1min", i.second.back_pingtime[0]/1000.0);
+ f->dump_float("5min", i.second.back_pingtime[1]/1000.0);
+ f->dump_float("15min", i.second.back_pingtime[2]/1000.0);
+ f->close_section(); // average
+ f->open_object_section("min");
+ f->dump_float("1min", i.second.back_min[0]/1000.0);
+ f->dump_float("5min", i.second.back_min[1]/1000.0);
+ f->dump_float("15min", i.second.back_min[2]/1000.0);
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_float("1min", i.second.back_max[0]/1000.0);
+ f->dump_float("5min", i.second.back_max[1]/1000.0);
+ f->dump_float("15min", i.second.back_max[2]/1000.0);
+ f->close_section(); // max
+ f->dump_float("last", i.second.back_last/1000.0);
+ f->close_section(); // interface
+
+ if (i.second.front_pingtime[0] != 0) {
+ f->open_object_section("interface");
+ f->dump_string("interface", "front");
+ f->open_object_section("average");
+ f->dump_float("1min", i.second.front_pingtime[0]/1000.0);
+ f->dump_float("5min", i.second.front_pingtime[1]/1000.0);
+ f->dump_float("15min", i.second.front_pingtime[2]/1000.0);
+ f->close_section(); // average
+ f->open_object_section("min");
+ f->dump_float("1min", i.second.front_min[0]/1000.0);
+ f->dump_float("5min", i.second.front_min[1]/1000.0);
+ f->dump_float("15min", i.second.front_min[2]/1000.0);
+ f->close_section(); // min
+ f->open_object_section("max");
+ f->dump_float("1min", i.second.front_max[0]/1000.0);
+ f->dump_float("5min", i.second.front_max[1]/1000.0);
+ f->dump_float("15min", i.second.front_max[2]/1000.0);
+ f->close_section(); // max
+ f->dump_float("last", i.second.front_last/1000.0);
+ f->close_section(); // interface
+ }
+ f->close_section(); // interfaces
+ f->close_section(); // entry
+ }
+  f->close_section(); // network_ping_times
+}
+
+void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(14, 2, bl);
+
+ //////// for compatibility ////////
+ int64_t kb = statfs.kb();
+ int64_t kb_used = statfs.kb_used_raw();
+ int64_t kb_avail = statfs.kb_avail();
+ encode(kb, bl);
+ encode(kb_used, bl);
+ encode(kb_avail, bl);
+ ///////////////////////////////////
+
+ encode(snap_trim_queue_len, bl);
+ encode(num_snap_trimming, bl);
+ encode(hb_peers, bl);
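+  // encode an empty vector where the legacy hb_out peer list used to be
+  // (decoded below as num_hb_out)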
+ encode((uint32_t)0, bl);
+ encode(op_queue_age_hist, bl);
+ encode(os_perf_stat, bl, features);
+ encode(up_from, bl);
+ encode(seq, bl);
+ encode(num_pgs, bl);
+
+ //////// for compatibility ////////
+ int64_t kb_used_data = statfs.kb_used_data();
+ int64_t kb_used_omap = statfs.kb_used_omap();
+ int64_t kb_used_meta = statfs.kb_used_internal_metadata();
+ encode(kb_used_data, bl);
+ encode(kb_used_omap, bl);
+ encode(kb_used_meta, bl);
+ encode(statfs, bl);
+ ///////////////////////////////////
+ encode(os_alerts, bl);
+ encode(num_shards_repaired, bl);
+ encode(num_osds, bl);
+ encode(num_per_pool_osds, bl);
+ encode(num_per_pool_omap_osds, bl);
+
+ // hb_pingtime map
+ encode((int)hb_pingtime.size(), bl);
+ for (auto i : hb_pingtime) {
+ encode(i.first, bl); // osd
+ encode(i.second.last_update, bl);
+ encode(i.second.back_pingtime[0], bl);
+ encode(i.second.back_pingtime[1], bl);
+ encode(i.second.back_pingtime[2], bl);
+ encode(i.second.back_min[0], bl);
+ encode(i.second.back_min[1], bl);
+ encode(i.second.back_min[2], bl);
+ encode(i.second.back_max[0], bl);
+ encode(i.second.back_max[1], bl);
+ encode(i.second.back_max[2], bl);
+ encode(i.second.back_last, bl);
+ encode(i.second.front_pingtime[0], bl);
+ encode(i.second.front_pingtime[1], bl);
+ encode(i.second.front_pingtime[2], bl);
+ encode(i.second.front_min[0], bl);
+ encode(i.second.front_min[1], bl);
+ encode(i.second.front_min[2], bl);
+ encode(i.second.front_max[0], bl);
+ encode(i.second.front_max[1], bl);
+ encode(i.second.front_max[2], bl);
+ encode(i.second.front_last, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+  int64_t kb, kb_used, kb_avail;
+ int64_t kb_used_data, kb_used_omap, kb_used_meta;
+ DECODE_START_LEGACY_COMPAT_LEN(14, 2, 2, bl);
+ decode(kb, bl);
+ decode(kb_used, bl);
+ decode(kb_avail, bl);
+ decode(snap_trim_queue_len, bl);
+ decode(num_snap_trimming, bl);
+ decode(hb_peers, bl);
+ vector<int> num_hb_out;
+ decode(num_hb_out, bl);
+ if (struct_v >= 3)
+ decode(op_queue_age_hist, bl);
+ if (struct_v >= 4)
+ decode(os_perf_stat, bl);
+ if (struct_v >= 6) {
+ decode(up_from, bl);
+ decode(seq, bl);
+ }
+ if (struct_v >= 7) {
+ decode(num_pgs, bl);
+ }
+ if (struct_v >= 8) {
+ decode(kb_used_data, bl);
+ decode(kb_used_omap, bl);
+ decode(kb_used_meta, bl);
+ } else {
+ kb_used_data = kb_used;
+ kb_used_omap = 0;
+ kb_used_meta = 0;
+ }
+ if (struct_v >= 9) {
+ decode(statfs, bl);
+ } else {
+ statfs.reset();
+ statfs.total = kb << 10;
+ statfs.available = kb_avail << 10;
+    // it's actually unexpected to have statfs.total < statfs.available
+    // here, but unfortunately legacy generate_test_instances produced such a
+    // case, hence we insert some handling rather than an assert
+ statfs.internally_reserved =
+ statfs.total > statfs.available ? statfs.total - statfs.available : 0;
+ kb_used <<= 10;
+ if ((int64_t)statfs.internally_reserved > kb_used) {
+ statfs.internally_reserved -= kb_used;
+ } else {
+ statfs.internally_reserved = 0;
+ }
+ statfs.allocated = kb_used_data << 10;
+ statfs.omap_allocated = kb_used_omap << 10;
+ statfs.internal_metadata = kb_used_meta << 10;
+ }
+ if (struct_v >= 10) {
+ decode(os_alerts, bl);
+ } else {
+ os_alerts.clear();
+ }
+ if (struct_v >= 11) {
+ decode(num_shards_repaired, bl);
+ } else {
+ num_shards_repaired = 0;
+ }
+ if (struct_v >= 12) {
+ decode(num_osds, bl);
+ decode(num_per_pool_osds, bl);
+ } else {
+ num_osds = 0;
+ num_per_pool_osds = 0;
+ }
+ if (struct_v >= 13) {
+ decode(num_per_pool_omap_osds, bl);
+ } else {
+ num_per_pool_omap_osds = 0;
+ }
+ hb_pingtime.clear();
+ if (struct_v >= 14) {
+ int count;
+ decode(count, bl);
+ for (int i = 0 ; i < count ; i++) {
+ int osd;
+ decode(osd, bl);
+ struct Interfaces ifs;
+ decode(ifs.last_update, bl);
+      decode(ifs.back_pingtime[0], bl);
+      decode(ifs.back_pingtime[1], bl);
+      decode(ifs.back_pingtime[2], bl);
+      decode(ifs.back_min[0], bl);
+      decode(ifs.back_min[1], bl);
+      decode(ifs.back_min[2], bl);
+      decode(ifs.back_max[0], bl);
+ decode(ifs.back_max[1], bl);
+ decode(ifs.back_max[2], bl);
+ decode(ifs.back_last, bl);
+ decode(ifs.front_pingtime[0], bl);
+ decode(ifs.front_pingtime[1], bl);
+ decode(ifs.front_pingtime[2], bl);
+ decode(ifs.front_min[0], bl);
+ decode(ifs.front_min[1], bl);
+ decode(ifs.front_min[2], bl);
+ decode(ifs.front_max[0], bl);
+ decode(ifs.front_max[1], bl);
+ decode(ifs.front_max[2], bl);
+ decode(ifs.front_last, bl);
+ hb_pingtime[osd] = ifs;
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
+{
+ o.push_back(new osd_stat_t);
+
+ o.push_back(new osd_stat_t);
+ list<store_statfs_t*> ll;
+ store_statfs_t::generate_test_instances(ll);
+ o.back()->statfs = *ll.back();
+ o.back()->hb_peers.push_back(7);
+ o.back()->snap_trim_queue_len = 8;
+ o.back()->num_snap_trimming = 99;
+ o.back()->num_shards_repaired = 101;
+ o.back()->os_alerts[0].emplace(
+ "some alert", "some alert details");
+ o.back()->os_alerts[1].emplace(
+ "some alert2", "some alert2 details");
+ struct Interfaces gen_interfaces = {
+ 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001,
+ { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 };
+ o.back()->hb_pingtime[20] = gen_interfaces;
+ gen_interfaces = {
+ 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 };
+ o.back()->hb_pingtime[30] = gen_interfaces;
+}
+
+// -- pg_t --
+
+int pg_t::print(char *o, int maxlen) const
+{
+ return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
+}
+
+bool pg_t::parse(const char *s)
+{
+ uint64_t ppool;
+ uint32_t pseed;
+ int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
+ if (r < 2)
+ return false;
+ m_pool = ppool;
+ m_seed = pseed;
+ return true;
+}
+
+bool spg_t::parse(const char *s)
+{
+ shard = shard_id_t::NO_SHARD;
+ uint64_t ppool;
+ uint32_t pseed;
+ uint32_t pshard;
+ int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
+ if (r < 2)
+ return false;
+ pgid.set_pool(ppool);
+ pgid.set_ps(pseed);
+
+ const char *p = strchr(s, 's');
+ if (p) {
+ r = sscanf(p, "s%u", &pshard);
+ if (r == 1) {
+ shard = shard_id_t(pshard);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
+{
+ while (*suffix_backwords)
+ *--buf = *suffix_backwords++;
+
+ if (!is_no_shard()) {
+ buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
+ *--buf = 's';
+ }
+
+ return pgid.calc_name(buf, "");
+}
+
+ostream& operator<<(ostream& out, const spg_t &pg)
+{
+ char buf[spg_t::calc_name_buf_size];
+ buf[spg_t::calc_name_buf_size - 1] = '\0';
+ out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
+ return out;
+}
+
+pg_t pg_t::get_ancestor(unsigned old_pg_num) const
+{
+ int old_bits = cbits(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ pg_t ret = *this;
+ ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
+ return ret;
+}
+
+bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
+{
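+  // returns true if this pg splits when pg_num grows from old_pg_num to
+  // new_pg_num, optionally collecting the resulting child pgids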
+ //ceph_assert(m_seed < old_pg_num);
+ if (m_seed >= old_pg_num) {
+ // degenerate case
+ return false;
+ }
+ if (new_pg_num <= old_pg_num)
+ return false;
+
+ bool split = false;
+ if (true) {
+ unsigned old_bits = cbits(old_pg_num);
+ unsigned old_mask = (1 << old_bits) - 1;
+ for (unsigned n = 1; ; n++) {
+ unsigned next_bit = (n << (old_bits-1));
+ unsigned s = next_bit | m_seed;
+
+ if (s < old_pg_num || s == m_seed)
+ continue;
+ if (s >= new_pg_num)
+ break;
+ if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
+ split = true;
+ if (children)
+ children->insert(pg_t(s, m_pool));
+ }
+ }
+ }
+ if (false) {
+ // brute force
+ int old_bits = cbits(old_pg_num);
+ int old_mask = (1 << old_bits) - 1;
+ for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
+ unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
+ if (o == m_seed) {
+ split = true;
+ children->insert(pg_t(x, m_pool));
+ }
+ }
+ }
+ return split;
+}
+
+unsigned pg_t::get_split_bits(unsigned pg_num) const {
+ if (pg_num == 1)
+ return 0;
+ ceph_assert(pg_num > 1);
+
+ // Find unique p such that pg_num \in [2^(p-1), 2^p)
+ unsigned p = cbits(pg_num);
+ ceph_assert(p); // silence coverity #751330
+
+ if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
+ return p;
+ else
+ return p - 1;
+}
+
+bool pg_t::is_merge_source(
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ pg_t *parent) const
+{
+ if (m_seed < old_pg_num &&
+ m_seed >= new_pg_num) {
+ if (parent) {
+ pg_t t = *this;
+ while (t.m_seed >= new_pg_num) {
+ t = t.get_parent();
+ }
+ *parent = t;
+ }
+ return true;
+ }
+ return false;
+}
+
+pg_t pg_t::get_parent() const
+{
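+  // the parent pg is obtained by clearing the most significant set bit
+  // of the placement seed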
+ unsigned bits = cbits(m_seed);
+ ceph_assert(bits);
+ pg_t retval = *this;
+ retval.m_seed &= ~((~0)<<(bits - 1));
+ return retval;
+}
+
+hobject_t pg_t::get_hobj_start() const
+{
+ return hobject_t(object_t(), string(), 0, m_seed, m_pool,
+ string());
+}
+
+hobject_t pg_t::get_hobj_end(unsigned pg_num) const
+{
+ // note: this assumes a bitwise sort; with the legacy nibblewise
+ // sort a PG did not always cover a single contiguous range of the
+ // (bit-reversed) hash range.
+ unsigned bits = get_split_bits(pg_num);
+ uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
+ uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
+ if (rev_end >= 0x100000000) {
+ ceph_assert(rev_end == 0x100000000);
+ return hobject_t::get_max();
+ } else {
+ return hobject_t(object_t(), string(), CEPH_NOSNAP,
+ hobject_t::_reverse_bits(rev_end), m_pool,
+ string());
+ }
+}
+
+void pg_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("pool", m_pool);
+ f->dump_unsigned("seed", m_seed);
+}
+
+void pg_t::generate_test_instances(list<pg_t*>& o)
+{
+ o.push_back(new pg_t);
+ o.push_back(new pg_t(1, 2));
+ o.push_back(new pg_t(13123, 3));
+ o.push_back(new pg_t(131223, 4));
+}
+
+char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
+{
+ while (*suffix_backwords)
+ *--buf = *suffix_backwords++;
+
+ buf = ritoa<uint32_t, 16>(m_seed, buf);
+
+ *--buf = '.';
+
+ return ritoa<uint64_t, 10>(m_pool, buf);
+}
+
+ostream& operator<<(ostream& out, const pg_t &pg)
+{
+ char buf[pg_t::calc_name_buf_size];
+ buf[pg_t::calc_name_buf_size - 1] = '\0';
+ out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
+ return out;
+}
+
+
+// -- coll_t --
+
+void coll_t::calc_str()
+{
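+  // spg_t::calc_name() writes into the buffer backwards, so suffixes are
+  // passed reversed ("daeh_" -> "_head", "PMET_" -> "_TEMP")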
+ switch (type) {
+ case TYPE_META:
+ strcpy(_str_buff, "meta");
+ _str = _str_buff;
+ break;
+ case TYPE_PG:
+ _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
+ _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
+ break;
+ case TYPE_PG_TEMP:
+ _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
+ _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
+ break;
+ default:
+ ceph_abort_msg("unknown collection type");
+ }
+}
+
+bool coll_t::parse(const std::string& s)
+{
+ if (s == "meta") {
+ type = TYPE_META;
+ pgid = spg_t();
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ if (s.find("_head") == s.length() - 5 &&
+ pgid.parse(s.substr(0, s.length() - 5))) {
+ type = TYPE_PG;
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ if (s.find("_TEMP") == s.length() - 5 &&
+ pgid.parse(s.substr(0, s.length() - 5))) {
+ type = TYPE_PG_TEMP;
+ removal_seq = 0;
+ calc_str();
+ ceph_assert(s == _str);
+ return true;
+ }
+ return false;
+}
+
+void coll_t::encode(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ // when changing this, remember to update encoded_size() too.
+ if (is_temp()) {
+ // can't express this as v2...
+ __u8 struct_v = 3;
+ encode(struct_v, bl);
+ encode(to_str(), bl);
+ } else {
+ __u8 struct_v = 2;
+ encode(struct_v, bl);
+ encode((__u8)type, bl);
+ encode(pgid, bl);
+ snapid_t snap = CEPH_NOSNAP;
+ encode(snap, bl);
+ }
+}
+
+size_t coll_t::encoded_size() const
+{
+ size_t r = sizeof(__u8);
+ if (is_temp()) {
+ // v3
+ r += sizeof(__u32);
+ if (_str) {
+ r += strlen(_str);
+ }
+ } else {
+ // v2
+ // 1. type
+ r += sizeof(__u8);
+ // 2. pgid
+ // - encoding header
+ r += sizeof(ceph_le32) + 2 * sizeof(__u8);
+ // - pg_t
+ r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
+ // - shard_id_t
+ r += sizeof(int8_t);
+ // 3. snapid_t
+ r += sizeof(uint64_t);
+ }
+
+ return r;
+}
+
+void coll_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ using ceph::decode;
+ __u8 struct_v;
+ decode(struct_v, bl);
+ switch (struct_v) {
+ case 1:
+ {
+ snapid_t snap;
+ decode(pgid, bl);
+ decode(snap, bl);
+
+ // infer the type
+ if (pgid == spg_t() && snap == 0) {
+ type = TYPE_META;
+ } else {
+ type = TYPE_PG;
+ }
+ removal_seq = 0;
+ }
+ break;
+
+ case 2:
+ {
+ __u8 _type;
+ snapid_t snap;
+ decode(_type, bl);
+ decode(pgid, bl);
+ decode(snap, bl);
+ type = (type_t)_type;
+ removal_seq = 0;
+ }
+ break;
+
+ case 3:
+ {
+ string str;
+ decode(str, bl);
+ bool ok = parse(str);
+ if (!ok)
+ throw std::domain_error(std::string("unable to parse pg ") + str);
+ }
+ break;
+
+ default:
+ {
+ CachedStackStringStream css;
+ *css << "coll_t::decode(): don't know how to decode version "
+ << struct_v;
+ throw std::domain_error(css->str());
+ }
+ }
+}
+
+void coll_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type_id", (unsigned)type);
+ if (type != TYPE_META)
+ f->dump_stream("pgid") << pgid;
+ f->dump_string("name", to_str());
+}
+
+void coll_t::generate_test_instances(list<coll_t*>& o)
+{
+ o.push_back(new coll_t());
+ o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
+ o.push_back(new coll_t(o.back()->get_temp()));
+ o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
+ o.push_back(new coll_t(o.back()->get_temp()));
+ o.push_back(new coll_t());
+}
+
+// ---
+
+std::string pg_vector_string(const vector<int32_t> &a)
+{
+ CachedStackStringStream css;
+ *css << "[";
+ for (auto i = a.cbegin(); i != a.cend(); ++i) {
+ if (i != a.begin())
+ *css << ",";
+ if (*i != CRUSH_ITEM_NONE)
+ *css << *i;
+ else
+ *css << "NONE";
+ }
+ *css << "]";
+ return css->str();
+}
+
+std::string pg_state_string(uint64_t state)
+{
+ CachedStackStringStream css;
+ if (state & PG_STATE_STALE)
+ *css << "stale+";
+ if (state & PG_STATE_CREATING)
+ *css << "creating+";
+ if (state & PG_STATE_ACTIVE)
+ *css << "active+";
+ if (state & PG_STATE_ACTIVATING)
+ *css << "activating+";
+ if (state & PG_STATE_CLEAN)
+ *css << "clean+";
+ if (state & PG_STATE_RECOVERY_WAIT)
+ *css << "recovery_wait+";
+ if (state & PG_STATE_RECOVERY_TOOFULL)
+ *css << "recovery_toofull+";
+ if (state & PG_STATE_RECOVERING)
+ *css << "recovering+";
+ if (state & PG_STATE_FORCED_RECOVERY)
+ *css << "forced_recovery+";
+ if (state & PG_STATE_DOWN)
+ *css << "down+";
+ if (state & PG_STATE_RECOVERY_UNFOUND)
+ *css << "recovery_unfound+";
+ if (state & PG_STATE_BACKFILL_UNFOUND)
+ *css << "backfill_unfound+";
+ if (state & PG_STATE_UNDERSIZED)
+ *css << "undersized+";
+ if (state & PG_STATE_DEGRADED)
+ *css << "degraded+";
+ if (state & PG_STATE_REMAPPED)
+ *css << "remapped+";
+ if (state & PG_STATE_PREMERGE)
+ *css << "premerge+";
+ if (state & PG_STATE_SCRUBBING)
+ *css << "scrubbing+";
+ if (state & PG_STATE_DEEP_SCRUB)
+ *css << "deep+";
+ if (state & PG_STATE_INCONSISTENT)
+ *css << "inconsistent+";
+ if (state & PG_STATE_PEERING)
+ *css << "peering+";
+ if (state & PG_STATE_REPAIR)
+ *css << "repair+";
+ if (state & PG_STATE_BACKFILL_WAIT)
+ *css << "backfill_wait+";
+ if (state & PG_STATE_BACKFILLING)
+ *css << "backfilling+";
+ if (state & PG_STATE_FORCED_BACKFILL)
+ *css << "forced_backfill+";
+ if (state & PG_STATE_BACKFILL_TOOFULL)
+ *css << "backfill_toofull+";
+ if (state & PG_STATE_INCOMPLETE)
+ *css << "incomplete+";
+ if (state & PG_STATE_PEERED)
+ *css << "peered+";
+ if (state & PG_STATE_SNAPTRIM)
+ *css << "snaptrim+";
+ if (state & PG_STATE_SNAPTRIM_WAIT)
+ *css << "snaptrim_wait+";
+ if (state & PG_STATE_SNAPTRIM_ERROR)
+ *css << "snaptrim_error+";
+ if (state & PG_STATE_FAILED_REPAIR)
+ *css << "failed_repair+";
+ if (state & PG_STATE_LAGGY)
+ *css << "laggy+";
+ if (state & PG_STATE_WAIT)
+ *css << "wait+";
+ auto ret = css->str();
+ if (ret.length() > 0)
+ ret.resize(ret.length() - 1);
+ else
+ ret = "unknown";
+ return ret;
+}
+
+std::optional<uint64_t> pg_string_state(const std::string& state)
+{
+ std::optional<uint64_t> type;
+ if (state == "active")
+ type = PG_STATE_ACTIVE;
+ else if (state == "clean")
+ type = PG_STATE_CLEAN;
+ else if (state == "down")
+ type = PG_STATE_DOWN;
+ else if (state == "recovery_unfound")
+ type = PG_STATE_RECOVERY_UNFOUND;
+ else if (state == "backfill_unfound")
+ type = PG_STATE_BACKFILL_UNFOUND;
+ else if (state == "premerge")
+ type = PG_STATE_PREMERGE;
+ else if (state == "scrubbing")
+ type = PG_STATE_SCRUBBING;
+ else if (state == "degraded")
+ type = PG_STATE_DEGRADED;
+ else if (state == "inconsistent")
+ type = PG_STATE_INCONSISTENT;
+ else if (state == "peering")
+ type = PG_STATE_PEERING;
+ else if (state == "repair")
+ type = PG_STATE_REPAIR;
+ else if (state == "recovering")
+ type = PG_STATE_RECOVERING;
+ else if (state == "forced_recovery")
+ type = PG_STATE_FORCED_RECOVERY;
+ else if (state == "backfill_wait")
+ type = PG_STATE_BACKFILL_WAIT;
+ else if (state == "incomplete")
+ type = PG_STATE_INCOMPLETE;
+ else if (state == "stale")
+ type = PG_STATE_STALE;
+ else if (state == "remapped")
+ type = PG_STATE_REMAPPED;
+ else if (state == "deep")
+ type = PG_STATE_DEEP_SCRUB;
+ else if (state == "backfilling")
+ type = PG_STATE_BACKFILLING;
+ else if (state == "forced_backfill")
+ type = PG_STATE_FORCED_BACKFILL;
+ else if (state == "backfill_toofull")
+ type = PG_STATE_BACKFILL_TOOFULL;
+ else if (state == "recovery_wait")
+ type = PG_STATE_RECOVERY_WAIT;
+ else if (state == "recovery_toofull")
+ type = PG_STATE_RECOVERY_TOOFULL;
+ else if (state == "undersized")
+ type = PG_STATE_UNDERSIZED;
+ else if (state == "activating")
+ type = PG_STATE_ACTIVATING;
+ else if (state == "peered")
+ type = PG_STATE_PEERED;
+ else if (state == "snaptrim")
+ type = PG_STATE_SNAPTRIM;
+ else if (state == "snaptrim_wait")
+ type = PG_STATE_SNAPTRIM_WAIT;
+ else if (state == "snaptrim_error")
+ type = PG_STATE_SNAPTRIM_ERROR;
+ else if (state == "creating")
+ type = PG_STATE_CREATING;
+ else if (state == "failed_repair")
+ type = PG_STATE_FAILED_REPAIR;
+ else if (state == "laggy")
+ type = PG_STATE_LAGGY;
+ else if (state == "wait")
+ type = PG_STATE_WAIT;
+ else if (state == "unknown")
+ type = 0;
+ else
+ type = std::nullopt;
+ return type;
+}
+
+// -- eversion_t --
+string eversion_t::get_key_name() const
+{
+ std::string key(32, ' ');
+ get_key_name(&key[0]);
+ key.resize(31); // remove the null terminator
+ return key;
+}
+
+// -- pool_snap_info_t --
+void pool_snap_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("snapid", snapid);
+ f->dump_stream("stamp") << stamp;
+ f->dump_string("name", name);
+}
+
+void pool_snap_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
+ __u8 struct_v = 1;
+ encode(struct_v, bl);
+ encode(snapid, bl);
+ encode(stamp, bl);
+ encode(name, bl);
+ return;
+ }
+ ENCODE_START(2, 2, bl);
+ encode(snapid, bl);
+ encode(stamp, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pool_snap_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(snapid, bl);
+ decode(stamp, bl);
+ decode(name, bl);
+ DECODE_FINISH(bl);
+}
+
+void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
+{
+ o.push_back(new pool_snap_info_t);
+ o.push_back(new pool_snap_info_t);
+ o.back()->snapid = 1;
+ o.back()->stamp = utime_t(1, 2);
+ o.back()->name = "foo";
+}
+
+// -- pool_opts_t --
+
+// The order of items in the list is important; therefore, you should
+// always add new options at the end of the list.
+
+typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
+static opt_mapping_t opt_mapping = boost::assign::map_list_of
+ ("scrub_min_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
+ ("scrub_max_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
+ ("deep_scrub_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
+ ("recovery_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
+ ("recovery_op_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
+ ("scrub_priority", pool_opts_t::opt_desc_t(
+ pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
+ ("compression_mode", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
+ ("compression_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
+ ("compression_required_ratio", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
+ ("compression_max_blob_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
+ ("compression_min_blob_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
+ ("csum_type", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
+ ("csum_max_block", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
+ ("csum_min_block", pool_opts_t::opt_desc_t(
+ pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT))
+ ("fingerprint_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::FINGERPRINT_ALGORITHM, pool_opts_t::STR))
+ ("pg_num_min", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_NUM_MIN, pool_opts_t::INT))
+ ("target_size_bytes", pool_opts_t::opt_desc_t(
+ pool_opts_t::TARGET_SIZE_BYTES, pool_opts_t::INT))
+ ("target_size_ratio", pool_opts_t::opt_desc_t(
+ pool_opts_t::TARGET_SIZE_RATIO, pool_opts_t::DOUBLE))
+ ("pg_autoscale_bias", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_AUTOSCALE_BIAS, pool_opts_t::DOUBLE))
+ ("read_lease_interval", pool_opts_t::opt_desc_t(
+ pool_opts_t::READ_LEASE_INTERVAL, pool_opts_t::DOUBLE))
+ ("dedup_tier", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_TIER, pool_opts_t::INT))
+ ("dedup_chunk_algorithm", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_CHUNK_ALGORITHM, pool_opts_t::STR))
+ ("dedup_cdc_chunk_size", pool_opts_t::opt_desc_t(
+ pool_opts_t::DEDUP_CDC_CHUNK_SIZE, pool_opts_t::INT))
+ ("pg_num_max", pool_opts_t::opt_desc_t(
+ pool_opts_t::PG_NUM_MAX, pool_opts_t::INT));
+
+bool pool_opts_t::is_opt_name(const std::string& name)
+{
+ return opt_mapping.count(name);
+}
+
+pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name)
+{
+ auto i = opt_mapping.find(name);
+ ceph_assert(i != opt_mapping.end());
+ return i->second;
+}
+
+bool pool_opts_t::is_set(pool_opts_t::key_t key) const
+{
+ return opts.count(key);
+}
+
+const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const
+{
+ auto i = opts.find(key);
+ ceph_assert(i != opts.end());
+ return i->second;
+}
+
+bool pool_opts_t::unset(pool_opts_t::key_t key) {
+ return opts.erase(key) > 0;
+}
+
+class pool_opts_dumper_t : public boost::static_visitor<> {
+public:
+ pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
+ name(name_.c_str()), f(f_) {}
+
+ void operator()(std::string s) const {
+ f->dump_string(name, s);
+ }
+ void operator()(int64_t i) const {
+ f->dump_int(name, i);
+ }
+ void operator()(double d) const {
+ f->dump_float(name, d);
+ }
+
+private:
+ const char* name;
+ Formatter* f;
+};
+
+void pool_opts_t::dump(const std::string& name, Formatter* f) const
+{
+ const opt_desc_t& desc = get_opt_desc(name);
+ auto i = opts.find(desc.key);
+ if (i == opts.end()) {
+ return;
+ }
+ boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
+}
+
+void pool_opts_t::dump(Formatter* f) const
+{
+ for (auto i = opt_mapping.cbegin(); i != opt_mapping.cend(); ++i) {
+ const std::string& name = i->first;
+ const opt_desc_t& desc = i->second;
+ auto j = opts.find(desc.key);
+ if (j == opts.end()) {
+ continue;
+ }
+ boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
+ }
+}
+
+class pool_opts_encoder_t : public boost::static_visitor<> {
+public:
+ explicit pool_opts_encoder_t(ceph::buffer::list& bl_, uint64_t features)
+ : bl(bl_),
+ features(features) {}
+
+ void operator()(const std::string &s) const {
+ encode(static_cast<int32_t>(pool_opts_t::STR), bl);
+ encode(s, bl);
+ }
+ void operator()(int64_t i) const {
+ encode(static_cast<int32_t>(pool_opts_t::INT), bl);
+ if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ encode(i, bl);
+ } else {
+ encode(static_cast<int32_t>(i), bl);
+ }
+ }
+ void operator()(double d) const {
+ encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
+ encode(d, bl);
+ }
+
+private:
+ ceph::buffer::list& bl;
+ uint64_t features;
+};
+
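+// Wire format used by the encoder below and read back in pool_opts_t::decode():
+// a u32 option count followed by (int32 key, int32 type, value) triples.
+// INT values are encoded as 64-bit integers from struct_v 2 (nautilus) on,
+// and as 32-bit integers for pre-nautilus (struct_v 1) peers.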
+void pool_opts_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ unsigned v = 2;
+ if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 1;
+ }
+ ENCODE_START(v, 1, bl);
+ uint32_t n = static_cast<uint32_t>(opts.size());
+ encode(n, bl);
+ for (auto i = opts.cbegin(); i != opts.cend(); ++i) {
+ encode(static_cast<int32_t>(i->first), bl);
+ boost::apply_visitor(pool_opts_encoder_t(bl, features), i->second);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void pool_opts_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ __u32 n;
+ decode(n, bl);
+ opts.clear();
+ while (n--) {
+ int32_t k, t;
+ decode(k, bl);
+ decode(t, bl);
+ if (t == STR) {
+ std::string s;
+ decode(s, bl);
+ opts[static_cast<key_t>(k)] = s;
+ } else if (t == INT) {
+ int64_t i;
+ if (struct_v >= 2) {
+ decode(i, bl);
+ } else {
+ int ii;
+ decode(ii, bl);
+ i = ii;
+ }
+ opts[static_cast<key_t>(k)] = i;
+ } else if (t == DOUBLE) {
+ double d;
+ decode(d, bl);
+ opts[static_cast<key_t>(k)] = d;
+ } else {
+ ceph_assert(!"invalid type");
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+ostream& operator<<(ostream& out, const pool_opts_t& opts)
+{
+ for (auto i = opt_mapping.begin(); i != opt_mapping.end(); ++i) {
+ const std::string& name = i->first;
+ const pool_opts_t::opt_desc_t& desc = i->second;
+ auto j = opts.opts.find(desc.key);
+ if (j == opts.opts.end()) {
+ continue;
+ }
+ out << " " << name << " " << j->second;
+ }
+ return out;
+}
+
+// -- pg_pool_t --
+
+const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
+const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
+const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
+
+void pg_pool_t::dump(Formatter *f) const
+{
+ f->dump_stream("create_time") << get_create_time();
+ f->dump_unsigned("flags", get_flags());
+ f->dump_string("flags_names", get_flags_string());
+ f->dump_int("type", get_type());
+ f->dump_int("size", get_size());
+ f->dump_int("min_size", get_min_size());
+ f->dump_int("crush_rule", get_crush_rule());
+ f->dump_int("peering_crush_bucket_count", peering_crush_bucket_count);
+ f->dump_int("peering_crush_bucket_target", peering_crush_bucket_target);
+ f->dump_int("peering_crush_bucket_barrier", peering_crush_bucket_barrier);
+ f->dump_int("peering_crush_bucket_mandatory_member", peering_crush_mandatory_member);
+ f->dump_int("object_hash", get_object_hash());
+ f->dump_string("pg_autoscale_mode",
+ get_pg_autoscale_mode_name(pg_autoscale_mode));
+ f->dump_unsigned("pg_num", get_pg_num());
+ f->dump_unsigned("pg_placement_num", get_pgp_num());
+ f->dump_unsigned("pg_placement_num_target", get_pgp_num_target());
+ f->dump_unsigned("pg_num_target", get_pg_num_target());
+ f->dump_unsigned("pg_num_pending", get_pg_num_pending());
+ f->dump_object("last_pg_merge_meta", last_pg_merge_meta);
+ f->dump_stream("last_change") << get_last_change();
+ f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
+ f->dump_stream("last_force_op_resend_prenautilus")
+ << get_last_force_op_resend_prenautilus();
+ f->dump_stream("last_force_op_resend_preluminous")
+ << get_last_force_op_resend_preluminous();
+ f->dump_unsigned("auid", get_auid());
+ f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
+ f->dump_unsigned("snap_seq", get_snap_seq());
+ f->dump_unsigned("snap_epoch", get_snap_epoch());
+ f->open_array_section("pool_snaps");
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p) {
+ f->open_object_section("pool_snap_info");
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_stream("removed_snaps") << removed_snaps;
+ f->dump_unsigned("quota_max_bytes", quota_max_bytes);
+ f->dump_unsigned("quota_max_objects", quota_max_objects);
+ f->open_array_section("tiers");
+ for (auto p = tiers.cbegin(); p != tiers.cend(); ++p)
+ f->dump_unsigned("pool_id", *p);
+ f->close_section();
+ f->dump_int("tier_of", tier_of);
+ f->dump_int("read_tier", read_tier);
+ f->dump_int("write_tier", write_tier);
+ f->dump_string("cache_mode", get_cache_mode_name());
+ f->dump_unsigned("target_max_bytes", target_max_bytes);
+ f->dump_unsigned("target_max_objects", target_max_objects);
+ f->dump_unsigned("cache_target_dirty_ratio_micro",
+ cache_target_dirty_ratio_micro);
+ f->dump_unsigned("cache_target_dirty_high_ratio_micro",
+ cache_target_dirty_high_ratio_micro);
+ f->dump_unsigned("cache_target_full_ratio_micro",
+ cache_target_full_ratio_micro);
+ f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
+ f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
+ f->dump_string("erasure_code_profile", erasure_code_profile);
+ f->open_object_section("hit_set_params");
+ hit_set_params.dump(f);
+ f->close_section(); // hit_set_params
+ f->dump_unsigned("hit_set_period", hit_set_period);
+ f->dump_unsigned("hit_set_count", hit_set_count);
+ f->dump_bool("use_gmt_hitset", use_gmt_hitset);
+ f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
+ f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
+ f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
+ f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
+ f->open_array_section("grade_table");
+ for (unsigned i = 0; i < hit_set_count; ++i)
+ f->dump_unsigned("value", get_grade(i));
+ f->close_section();
+ f->dump_unsigned("stripe_width", get_stripe_width());
+ f->dump_unsigned("expected_num_objects", expected_num_objects);
+ f->dump_bool("fast_read", fast_read);
+ f->open_object_section("options");
+ opts.dump(f);
+ f->close_section(); // options
+ f->open_object_section("application_metadata");
+ for (auto &app_pair : application_metadata) {
+ f->open_object_section(app_pair.first.c_str());
+ for (auto &kv_pair : app_pair.second) {
+ f->dump_string(kv_pair.first.c_str(), kv_pair.second);
+ }
+ f->close_section(); // application
+ }
+ f->close_section(); // application_metadata
+}
+
+void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
+ for (size_t i = 0; i < from.size(); ++i) {
+ if (from[i] != CRUSH_ITEM_NONE) {
+ to->insert(
+ pg_shard_t(
+ from[i],
+ is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ }
+}
+
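+// pg_num_mask/pgp_num_mask are one less than the next power of two >= pg_num
+// (assuming cbits(x) returns the number of bits needed to represent x), e.g.
+// pg_num=6 -> cbits(5)=3 -> pg_num_mask=7. These masks are what
+// ceph_stable_mod() uses below to fold a raw hash into [0, pg_num).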
+void pg_pool_t::calc_pg_masks()
+{
+ pg_num_mask = (1 << cbits(pg_num-1)) - 1;
+ pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
+}
+
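+// Returns the effective number of hash bins for this pg. Worked example
+// (derived from the logic below): with pg_num=6 and pg_num_mask=7, ps values
+// 0, 1, 4 and 5 satisfy (ps & 3) < (6 & 3) and live in the smaller,
+// already-split bins (divisor 8), while ps 2 and 3 still use the larger
+// pre-split bins (divisor 4).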
+unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
+{
+ if (pg_num == pg_num_mask + 1)
+ return pg_num; // power-of-2 split
+ unsigned mask = pg_num_mask >> 1;
+ if ((pgid.ps() & mask) < (pg_num & mask))
+ return pg_num_mask + 1; // smaller bin size (already split)
+ else
+ return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
+}
+
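+// A pg participates in a pending merge if pg_num is shrinking past it (it is
+// a merge source, *target=false) or if one of the pgs being removed folds
+// into it as its parent (it is the merge target, *target=true).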
+bool pg_pool_t::is_pending_merge(pg_t pgid, bool *target) const
+{
+ if (pg_num_pending >= pg_num) {
+ return false;
+ }
+ if (pgid.ps() >= pg_num_pending && pgid.ps() < pg_num) {
+ if (target) {
+ *target = false;
+ }
+ return true;
+ }
+ for (unsigned ps = pg_num_pending; ps < pg_num; ++ps) {
+ if (pg_t(ps, pgid.pool()).get_parent() == pgid) {
+ if (target) {
+ *target = true;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/*
+ * we have two snap modes:
+ * - pool snaps
+ * - snap existence/non-existence defined by snaps[] and snap_seq
+ * - user managed snaps
+ * - existence tracked by librados user
+ */
+bool pg_pool_t::is_pool_snaps_mode() const
+{
+ return has_flag(FLAG_POOL_SNAPS);
+}
+
+bool pg_pool_t::is_unmanaged_snaps_mode() const
+{
+ return has_flag(FLAG_SELFMANAGED_SNAPS);
+}
+
+bool pg_pool_t::is_removed_snap(snapid_t s) const
+{
+ if (is_pool_snaps_mode())
+ return s <= get_snap_seq() && snaps.count(s) == 0;
+ else
+ return removed_snaps.contains(s);
+}
+
+snapid_t pg_pool_t::snap_exists(std::string_view s) const
+{
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
+ if (p->second.name == s)
+ return p->second.snapid;
+ return 0;
+}
+
+void pg_pool_t::add_snap(const char *n, utime_t stamp)
+{
+ ceph_assert(!is_unmanaged_snaps_mode());
+ flags |= FLAG_POOL_SNAPS;
+ snapid_t s = get_snap_seq() + 1;
+ snap_seq = s;
+ snaps[s].snapid = s;
+ snaps[s].name = n;
+ snaps[s].stamp = stamp;
+}
+
+uint64_t pg_pool_t::add_unmanaged_snap(bool preoctopus_compat)
+{
+ ceph_assert(!is_pool_snaps_mode());
+ if (snap_seq == 0) {
+ if (preoctopus_compat) {
+ // kludge for pre-mimic tracking of pool vs selfmanaged snaps. after
+ // mimic this field is not decoded but our flag is set; pre-mimic, we
+ // have a non-empty removed_snaps to signify a non-pool-snaps pool.
+ removed_snaps.insert(snapid_t(1));
+ }
+ snap_seq = 1;
+ }
+ flags |= FLAG_SELFMANAGED_SNAPS;
+ snap_seq = snap_seq + 1;
+ return snap_seq;
+}
+
+void pg_pool_t::remove_snap(snapid_t s)
+{
+ ceph_assert(snaps.count(s));
+ snaps.erase(s);
+ snap_seq = snap_seq + 1;
+}
+
+void pg_pool_t::remove_unmanaged_snap(snapid_t s, bool preoctopus_compat)
+{
+ ceph_assert(is_unmanaged_snaps_mode());
+ ++snap_seq;
+ if (preoctopus_compat) {
+ removed_snaps.insert(s);
+ // also insert the new seq, to try to keep the interval_set contiguous
+ if (!removed_snaps.contains(get_snap_seq())) {
+ removed_snaps.insert(get_snap_seq());
+ }
+ }
+}
+
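+// Build a SnapContext from the pool snaps; the ids are listed newest-first
+// (reverse map iteration), the descending order a SnapContext expects.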
+SnapContext pg_pool_t::get_snap_context() const
+{
+ vector<snapid_t> s(snaps.size());
+ unsigned i = 0;
+ for (auto p = snaps.crbegin(); p != snaps.crend(); ++p)
+ s[i++] = p->first;
+ return SnapContext(get_snap_seq(), s);
+}
+
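+// When a namespace is given, the hash input is "<ns>\037<key>" (ASCII unit
+// separator between the two), so equally named objects in different
+// namespaces hash to different placement seeds.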
+uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
+{
+ if (ns.empty())
+ return ceph_str_hash(object_hash, key.data(), key.length());
+ int nsl = ns.length();
+ int len = key.length() + nsl + 1;
+ char buf[len];
+ memcpy(&buf[0], ns.data(), nsl);
+ buf[nsl] = '\037';
+ memcpy(&buf[nsl+1], key.data(), key.length());
+ return ceph_str_hash(object_hash, &buf[0], len);
+}
+
+uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
+{
+ return ceph_stable_mod(v, pg_num, pg_num_mask);
+}
+
+/*
+ * map a raw pg (with full precision ps) into an actual pg, for storage
+ */
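+// Example, assuming ceph_stable_mod(x, b, bmask) returns (x & bmask) when
+// that value is < b and (x & (bmask >> 1)) otherwise: with pg_num=6 and
+// pg_num_mask=7, a raw ps of 5 stays 5 while a raw ps of 7 folds down to 3.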
+pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
+{
+ pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
+ return pg;
+}
+
+/*
+ * map raw pg (full precision ps) into a placement seed. include
+ * pool id in that value so that different pools don't use the same
+ * seeds.
+ */
+ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
+{
+ if (flags & FLAG_HASHPSPOOL) {
+ // Hash the pool id so that pool PGs do not overlap.
+ return
+ crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
+ pg.pool());
+ } else {
+ // Legacy behavior; add ps and pool together. This is not a great
+ // idea because the PGs from each pool will essentially overlap on
+ // top of each other: 0.5 == 1.4 == 2.3 == ...
+ return
+ ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
+ pg.pool();
+ }
+}
+
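+// Pick a pseudo-random hash position that still maps back to pg.ps() under
+// ceph_stable_mod(): only the bits above the pg's effective mask are
+// randomized; the low bits are forced to ps.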
+uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
+{
+ uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
+ if (pg_num == pg_num_mask + 1) {
+ r &= ~pg_num_mask;
+ } else {
+ unsigned smaller_mask = pg_num_mask >> 1;
+ if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
+ r &= ~pg_num_mask;
+ } else {
+ r &= ~smaller_mask;
+ }
+ }
+ r |= pg.ps();
+ return r;
+}
+
+void pg_pool_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
+ // this encoding matches the old struct ceph_pg_pool
+ __u8 struct_v = 2;
+ encode(struct_v, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+
+ __u32 n = snaps.size();
+ encode(n, bl);
+ n = removed_snaps.num_intervals();
+ encode(n, bl);
+
+ encode(auid, bl);
+
+ encode_nohead(snaps, bl, features);
+ encode_nohead(removed_snaps, bl);
+ return;
+ }
+
+ if ((features & CEPH_FEATURE_OSDENC) == 0) {
+ __u8 struct_v = 4;
+ encode(struct_v, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ encode(flags, bl);
+ encode((uint32_t)0, bl); // crash_replay_interval
+ return;
+ }
+
+ if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
+ // we simply added last_force_op_resend here, which is a fully
+ // backward compatible change. however, encoding the same map
+ // differently between monitors triggers scrub noise (even though
+ // they are decodable without the feature), so let's be pedantic
+ // about it.
+ ENCODE_START(14, 5, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ encode(flags, bl);
+ encode((uint32_t)0, bl); // crash_replay_interval
+ encode(min_size, bl);
+ encode(quota_max_bytes, bl);
+ encode(quota_max_objects, bl);
+ encode(tiers, bl);
+ encode(tier_of, bl);
+ __u8 c = cache_mode;
+ encode(c, bl);
+ encode(read_tier, bl);
+ encode(write_tier, bl);
+ encode(properties, bl);
+ encode(hit_set_params, bl);
+ encode(hit_set_period, bl);
+ encode(hit_set_count, bl);
+ encode(stripe_width, bl);
+ encode(target_max_bytes, bl);
+ encode(target_max_objects, bl);
+ encode(cache_target_dirty_ratio_micro, bl);
+ encode(cache_target_full_ratio_micro, bl);
+ encode(cache_min_flush_age, bl);
+ encode(cache_min_evict_age, bl);
+ encode(erasure_code_profile, bl);
+ ENCODE_FINISH(bl);
+ return;
+ }
+
+ uint8_t v = 30;
+ // NOTE: any new encoding dependencies must be reflected by
+ // SIGNIFICANT_FEATURES
+ if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
+ // this was the first post-hammer thing we added; if it's missing, encode
+ // like hammer.
+ v = 21;
+ } else if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
+ v = 24;
+ } else if (!HAVE_FEATURE(features, SERVER_MIMIC)) {
+ v = 26;
+ } else if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+ v = 27;
+ } else if (!is_stretch_pool()) {
+ v = 29;
+ }
+
+ ENCODE_START(v, 5, bl);
+ encode(type, bl);
+ encode(size, bl);
+ encode(crush_rule, bl);
+ encode(object_hash, bl);
+ encode(pg_num, bl);
+ encode(pgp_num, bl);
+ __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
+ encode(lpg_num, bl);
+ encode(lpgp_num, bl);
+ encode(last_change, bl);
+ encode(snap_seq, bl);
+ encode(snap_epoch, bl);
+ encode(snaps, bl, features);
+ encode(removed_snaps, bl);
+ encode(auid, bl);
+ if (v >= 27) {
+ encode(flags, bl);
+ } else {
+ auto tmp = flags;
+ tmp &= ~(FLAG_SELFMANAGED_SNAPS | FLAG_POOL_SNAPS | FLAG_CREATING);
+ encode(tmp, bl);
+ }
+ encode((uint32_t)0, bl); // crash_replay_interval
+ encode(min_size, bl);
+ encode(quota_max_bytes, bl);
+ encode(quota_max_objects, bl);
+ encode(tiers, bl);
+ encode(tier_of, bl);
+ __u8 c = cache_mode;
+ encode(c, bl);
+ encode(read_tier, bl);
+ encode(write_tier, bl);
+ encode(properties, bl);
+ encode(hit_set_params, bl);
+ encode(hit_set_period, bl);
+ encode(hit_set_count, bl);
+ encode(stripe_width, bl);
+ encode(target_max_bytes, bl);
+ encode(target_max_objects, bl);
+ encode(cache_target_dirty_ratio_micro, bl);
+ encode(cache_target_full_ratio_micro, bl);
+ encode(cache_min_flush_age, bl);
+ encode(cache_min_evict_age, bl);
+ encode(erasure_code_profile, bl);
+ encode(last_force_op_resend_preluminous, bl);
+ encode(min_read_recency_for_promote, bl);
+ encode(expected_num_objects, bl);
+ if (v >= 19) {
+ encode(cache_target_dirty_high_ratio_micro, bl);
+ }
+ if (v >= 20) {
+ encode(min_write_recency_for_promote, bl);
+ }
+ if (v >= 21) {
+ encode(use_gmt_hitset, bl);
+ }
+ if (v >= 22) {
+ encode(fast_read, bl);
+ }
+ if (v >= 23) {
+ encode(hit_set_grade_decay_rate, bl);
+ encode(hit_set_search_last_n, bl);
+ }
+ if (v >= 24) {
+ encode(opts, bl, features);
+ }
+ if (v >= 25) {
+ encode(last_force_op_resend_prenautilus, bl);
+ }
+ if (v >= 26) {
+ encode(application_metadata, bl);
+ }
+ if (v >= 27) {
+ encode(create_time, bl);
+ }
+ if (v >= 28) {
+ encode(pg_num_target, bl);
+ encode(pgp_num_target, bl);
+ encode(pg_num_pending, bl);
+ encode((epoch_t)0, bl); // pg_num_dec_last_epoch_started from 14.1.[01]
+ encode((epoch_t)0, bl); // pg_num_dec_last_epoch_clean from 14.1.[01]
+ encode(last_force_op_resend, bl);
+ encode(pg_autoscale_mode, bl);
+ }
+ if (v >= 29) {
+ encode(last_pg_merge_meta, bl);
+ }
+ if (v >= 30) {
+ encode(peering_crush_bucket_count, bl);
+ encode(peering_crush_bucket_target, bl);
+ encode(peering_crush_bucket_barrier, bl);
+ encode(peering_crush_mandatory_member, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void pg_pool_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(30, 5, 5, bl);
+ decode(type, bl);
+ decode(size, bl);
+ decode(crush_rule, bl);
+ decode(object_hash, bl);
+ decode(pg_num, bl);
+ decode(pgp_num, bl);
+ {
+ __u32 lpg_num, lpgp_num;
+ decode(lpg_num, bl);
+ decode(lpgp_num, bl);
+ }
+ decode(last_change, bl);
+ decode(snap_seq, bl);
+ decode(snap_epoch, bl);
+
+ if (struct_v >= 3) {
+ decode(snaps, bl);
+ decode(removed_snaps, bl);
+ decode(auid, bl);
+ } else {
+ __u32 n, m;
+ decode(n, bl);
+ decode(m, bl);
+ decode(auid, bl);
+ decode_nohead(n, snaps, bl);
+ decode_nohead(m, removed_snaps, bl);
+ }
+
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ uint32_t crash_replay_interval;
+ decode(crash_replay_interval, bl);
+ } else {
+ flags = 0;
+ }
+ // upgrade path for selfmanaged vs pool snaps
+ if (snap_seq > 0 && (flags & (FLAG_SELFMANAGED_SNAPS|FLAG_POOL_SNAPS)) == 0) {
+ if (!removed_snaps.empty()) {
+ flags |= FLAG_SELFMANAGED_SNAPS;
+ } else {
+ flags |= FLAG_POOL_SNAPS;
+ }
+ }
+ if (struct_v >= 7) {
+ decode(min_size, bl);
+ } else {
+ min_size = size - size/2;
+ }
+ if (struct_v >= 8) {
+ decode(quota_max_bytes, bl);
+ decode(quota_max_objects, bl);
+ }
+ if (struct_v >= 9) {
+ decode(tiers, bl);
+ decode(tier_of, bl);
+ __u8 v;
+ decode(v, bl);
+ cache_mode = (cache_mode_t)v;
+ decode(read_tier, bl);
+ decode(write_tier, bl);
+ }
+ if (struct_v >= 10) {
+ decode(properties, bl);
+ }
+ if (struct_v >= 11) {
+ decode(hit_set_params, bl);
+ decode(hit_set_period, bl);
+ decode(hit_set_count, bl);
+ } else {
+ pg_pool_t def;
+ hit_set_period = def.hit_set_period;
+ hit_set_count = def.hit_set_count;
+ }
+ if (struct_v >= 12) {
+ decode(stripe_width, bl);
+ } else {
+ set_stripe_width(0);
+ }
+ if (struct_v >= 13) {
+ decode(target_max_bytes, bl);
+ decode(target_max_objects, bl);
+ decode(cache_target_dirty_ratio_micro, bl);
+ decode(cache_target_full_ratio_micro, bl);
+ decode(cache_min_flush_age, bl);
+ decode(cache_min_evict_age, bl);
+ } else {
+ target_max_bytes = 0;
+ target_max_objects = 0;
+ cache_target_dirty_ratio_micro = 0;
+ cache_target_full_ratio_micro = 0;
+ cache_min_flush_age = 0;
+ cache_min_evict_age = 0;
+ }
+ if (struct_v >= 14) {
+ decode(erasure_code_profile, bl);
+ }
+ if (struct_v >= 15) {
+ decode(last_force_op_resend_preluminous, bl);
+ } else {
+ last_force_op_resend_preluminous = 0;
+ }
+ if (struct_v >= 16) {
+ decode(min_read_recency_for_promote, bl);
+ } else {
+ min_read_recency_for_promote = 1;
+ }
+ if (struct_v >= 17) {
+ decode(expected_num_objects, bl);
+ } else {
+ expected_num_objects = 0;
+ }
+ if (struct_v >= 19) {
+ decode(cache_target_dirty_high_ratio_micro, bl);
+ } else {
+ cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
+ }
+ if (struct_v >= 20) {
+ decode(min_write_recency_for_promote, bl);
+ } else {
+ min_write_recency_for_promote = 1;
+ }
+ if (struct_v >= 21) {
+ decode(use_gmt_hitset, bl);
+ } else {
+ use_gmt_hitset = false;
+ }
+ if (struct_v >= 22) {
+ decode(fast_read, bl);
+ } else {
+ fast_read = false;
+ }
+ if (struct_v >= 23) {
+ decode(hit_set_grade_decay_rate, bl);
+ decode(hit_set_search_last_n, bl);
+ } else {
+ hit_set_grade_decay_rate = 0;
+ hit_set_search_last_n = 1;
+ }
+ if (struct_v >= 24) {
+ decode(opts, bl);
+ }
+ if (struct_v >= 25) {
+ decode(last_force_op_resend_prenautilus, bl);
+ } else {
+ last_force_op_resend_prenautilus = last_force_op_resend_preluminous;
+ }
+ if (struct_v >= 26) {
+ decode(application_metadata, bl);
+ }
+ if (struct_v >= 27) {
+ decode(create_time, bl);
+ }
+ if (struct_v >= 28) {
+ decode(pg_num_target, bl);
+ decode(pgp_num_target, bl);
+ decode(pg_num_pending, bl);
+ epoch_t old_merge_last_epoch_clean, old_merge_last_epoch_started;
+ decode(old_merge_last_epoch_started, bl);
+ decode(old_merge_last_epoch_clean, bl);
+ decode(last_force_op_resend, bl);
+ decode(pg_autoscale_mode, bl);
+ if (struct_v >= 29) {
+ decode(last_pg_merge_meta, bl);
+ } else {
+ last_pg_merge_meta.last_epoch_clean = old_merge_last_epoch_clean;
+ last_pg_merge_meta.last_epoch_started = old_merge_last_epoch_started;
+ }
+ } else {
+ pg_num_target = pg_num;
+ pgp_num_target = pgp_num;
+ pg_num_pending = pg_num;
+ last_force_op_resend = last_force_op_resend_prenautilus;
+ pg_autoscale_mode = pg_autoscale_mode_t::WARN; // default to warn on upgrade
+ }
+ if (struct_v >= 30) {
+ decode(peering_crush_bucket_count, bl);
+ decode(peering_crush_bucket_target, bl);
+ decode(peering_crush_bucket_barrier, bl);
+ decode(peering_crush_mandatory_member, bl);
+ }
+ DECODE_FINISH(bl);
+ calc_pg_masks();
+ calc_grade_table();
+}
+
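+// A stretch pool may peer only if the proposed OSD set spans at least
+// peering_crush_bucket_count distinct crush buckets of the barrier type,
+// and includes the mandatory member when one is configured.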
+bool pg_pool_t::stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+ std::ostream * out) const
+{
+ if (!is_stretch_pool()) return true;
+ const uint32_t barrier_id = peering_crush_bucket_barrier;
+ const uint32_t barrier_count = peering_crush_bucket_count;
+ set<int> ancestors;
+ const shared_ptr<CrushWrapper>& crush = osdmap.crush;
+ for (int osdid : want) {
+ int ancestor = crush->get_parent_of_type(osdid, barrier_id,
+ crush_rule);
+ ancestors.insert(ancestor);
+ }
+ if (ancestors.size() < barrier_count) {
+ if (out) {
+ *out << __func__ << ": not enough crush buckets with OSDs in want set "
+ << want;
+ }
+ return false;
+ } else if (peering_crush_mandatory_member != CRUSH_ITEM_NONE &&
+ !ancestors.count(peering_crush_mandatory_member)) {
+ if (out) {
+ *out << __func__ << ": missing mandatory crush bucket member "
+ << peering_crush_mandatory_member;
+ }
+ return false;
+ }
+ return true;
+}
+
+void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
+{
+ pg_pool_t a;
+ o.push_back(new pg_pool_t(a));
+
+ a.create_time = utime_t(4,5);
+ a.type = TYPE_REPLICATED;
+ a.size = 2;
+ a.crush_rule = 3;
+ a.object_hash = 4;
+ a.pg_num = 6;
+ a.pgp_num = 4;
+ a.pgp_num_target = 4;
+ a.pg_num_target = 5;
+ a.pg_num_pending = 5;
+ a.last_pg_merge_meta.last_epoch_started = 2;
+ a.last_pg_merge_meta.last_epoch_clean = 2;
+ a.last_change = 9;
+ a.last_force_op_resend = 123823;
+ a.last_force_op_resend_preluminous = 123824;
+ a.snap_seq = 10;
+ a.snap_epoch = 11;
+ a.flags = FLAG_POOL_SNAPS;
+ a.auid = 12;
+ a.quota_max_bytes = 473;
+ a.quota_max_objects = 474;
+ o.push_back(new pg_pool_t(a));
+
+ a.snaps[3].name = "asdf";
+ a.snaps[3].snapid = 3;
+ a.snaps[3].stamp = utime_t(123, 4);
+ a.snaps[6].name = "qwer";
+ a.snaps[6].snapid = 6;
+ a.snaps[6].stamp = utime_t(23423, 4);
+ o.push_back(new pg_pool_t(a));
+
+ a.flags = FLAG_SELFMANAGED_SNAPS;
+ a.snaps.clear();
+ a.removed_snaps.insert(2);
+ a.quota_max_bytes = 2473;
+ a.quota_max_objects = 4374;
+ a.tiers.insert(0);
+ a.tiers.insert(1);
+ a.tier_of = 2;
+ a.cache_mode = CACHEMODE_WRITEBACK;
+ a.read_tier = 1;
+ a.write_tier = 1;
+ a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
+ a.hit_set_period = 3600;
+ a.hit_set_count = 8;
+ a.min_read_recency_for_promote = 1;
+ a.min_write_recency_for_promote = 1;
+ a.hit_set_grade_decay_rate = 50;
+ a.hit_set_search_last_n = 1;
+ a.calc_grade_table();
+ a.set_stripe_width(12345);
+ a.target_max_bytes = 1238132132;
+ a.target_max_objects = 1232132;
+ a.cache_target_dirty_ratio_micro = 187232;
+ a.cache_target_dirty_high_ratio_micro = 309856;
+ a.cache_target_full_ratio_micro = 987222;
+ a.cache_min_flush_age = 231;
+ a.cache_min_evict_age = 2321;
+ a.erasure_code_profile = "profile in osdmap";
+ a.expected_num_objects = 123456;
+ a.fast_read = false;
+ a.application_metadata = {{"rbd", {{"key", "value"}}}};
+ o.push_back(new pg_pool_t(a));
+}
+
+ostream& operator<<(ostream& out, const pg_pool_t& p)
+{
+ out << p.get_type_name();
+ if (p.get_type_name() == "erasure") {
+ out << " profile " << p.erasure_code_profile;
+ }
+ out << " size " << p.get_size()
+ << " min_size " << p.get_min_size()
+ << " crush_rule " << p.get_crush_rule()
+ << " object_hash " << p.get_object_hash_name()
+ << " pg_num " << p.get_pg_num()
+ << " pgp_num " << p.get_pgp_num();
+ if (p.get_pg_num_target() != p.get_pg_num()) {
+ out << " pg_num_target " << p.get_pg_num_target();
+ }
+ if (p.get_pgp_num_target() != p.get_pgp_num()) {
+ out << " pgp_num_target " << p.get_pgp_num_target();
+ }
+ if (p.get_pg_num_pending() != p.get_pg_num()) {
+ out << " pg_num_pending " << p.get_pg_num_pending();
+ }
+ if (p.pg_autoscale_mode != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
+ out << " autoscale_mode " << p.get_pg_autoscale_mode_name(p.pg_autoscale_mode);
+ }
+ out << " last_change " << p.get_last_change();
+ if (p.get_last_force_op_resend() ||
+ p.get_last_force_op_resend_prenautilus() ||
+ p.get_last_force_op_resend_preluminous())
+ out << " lfor " << p.get_last_force_op_resend() << "/"
+ << p.get_last_force_op_resend_prenautilus() << "/"
+ << p.get_last_force_op_resend_preluminous();
+ if (p.get_auid())
+ out << " owner " << p.get_auid();
+ if (p.flags)
+ out << " flags " << p.get_flags_string();
+ if (p.quota_max_bytes)
+ out << " max_bytes " << p.quota_max_bytes;
+ if (p.quota_max_objects)
+ out << " max_objects " << p.quota_max_objects;
+ if (!p.tiers.empty())
+ out << " tiers " << p.tiers;
+ if (p.is_tier())
+ out << " tier_of " << p.tier_of;
+ if (p.has_read_tier())
+ out << " read_tier " << p.read_tier;
+ if (p.has_write_tier())
+ out << " write_tier " << p.write_tier;
+ if (p.cache_mode)
+ out << " cache_mode " << p.get_cache_mode_name();
+ if (p.target_max_bytes)
+ out << " target_bytes " << p.target_max_bytes;
+ if (p.target_max_objects)
+ out << " target_objects " << p.target_max_objects;
+ if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
+ out << " hit_set " << p.hit_set_params
+ << " " << p.hit_set_period << "s"
+ << " x" << p.hit_set_count << " decay_rate "
+ << p.hit_set_grade_decay_rate
+ << " search_last_n " << p.hit_set_search_last_n;
+ }
+ if (p.min_read_recency_for_promote)
+ out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
+ if (p.min_write_recency_for_promote)
+ out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
+ out << " stripe_width " << p.get_stripe_width();
+ if (p.expected_num_objects)
+ out << " expected_num_objects " << p.expected_num_objects;
+ if (p.fast_read)
+ out << " fast_read " << p.fast_read;
+ out << p.opts;
+ if (!p.application_metadata.empty()) {
+ out << " application ";
+ for (auto it = p.application_metadata.begin();
+ it != p.application_metadata.end(); ++it) {
+ if (it != p.application_metadata.begin())
+ out << ",";
+ out << it->first;
+ }
+ }
+ return out;
+}
+
+
+// -- object_stat_sum_t --
+
+void object_stat_sum_t::dump(Formatter *f) const
+{
+ f->dump_int("num_bytes", num_bytes);
+ f->dump_int("num_objects", num_objects);
+ f->dump_int("num_object_clones", num_object_clones);
+ f->dump_int("num_object_copies", num_object_copies);
+ f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
+ f->dump_int("num_objects_missing", num_objects_missing);
+ f->dump_int("num_objects_degraded", num_objects_degraded);
+ f->dump_int("num_objects_misplaced", num_objects_misplaced);
+ f->dump_int("num_objects_unfound", num_objects_unfound);
+ f->dump_int("num_objects_dirty", num_objects_dirty);
+ f->dump_int("num_whiteouts", num_whiteouts);
+ f->dump_int("num_read", num_rd);
+ f->dump_int("num_read_kb", num_rd_kb);
+ f->dump_int("num_write", num_wr);
+ f->dump_int("num_write_kb", num_wr_kb);
+ f->dump_int("num_scrub_errors", num_scrub_errors);
+ f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
+ f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
+ f->dump_int("num_objects_recovered", num_objects_recovered);
+ f->dump_int("num_bytes_recovered", num_bytes_recovered);
+ f->dump_int("num_keys_recovered", num_keys_recovered);
+ f->dump_int("num_objects_omap", num_objects_omap);
+ f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
+ f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
+ f->dump_int("num_flush", num_flush);
+ f->dump_int("num_flush_kb", num_flush_kb);
+ f->dump_int("num_evict", num_evict);
+ f->dump_int("num_evict_kb", num_evict_kb);
+ f->dump_int("num_promote", num_promote);
+ f->dump_int("num_flush_mode_high", num_flush_mode_high);
+ f->dump_int("num_flush_mode_low", num_flush_mode_low);
+ f->dump_int("num_evict_mode_some", num_evict_mode_some);
+ f->dump_int("num_evict_mode_full", num_evict_mode_full);
+ f->dump_int("num_objects_pinned", num_objects_pinned);
+ f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
+ f->dump_int("num_large_omap_objects", num_large_omap_objects);
+ f->dump_int("num_objects_manifest", num_objects_manifest);
+ f->dump_int("num_omap_bytes", num_omap_bytes);
+ f->dump_int("num_omap_keys", num_omap_keys);
+ f->dump_int("num_objects_repaired", num_objects_repaired);
+}
+
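+// On little-endian builds the whole POD struct is appended in one memcpy
+// (so the member order must match the field-by-field encoding below);
+// big-endian builds fall back to encoding each counter individually.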
+void object_stat_sum_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(20, 14, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
+#else
+ encode(num_bytes, bl);
+ encode(num_objects, bl);
+ encode(num_object_clones, bl);
+ encode(num_object_copies, bl);
+ encode(num_objects_missing_on_primary, bl);
+ encode(num_objects_degraded, bl);
+ encode(num_objects_unfound, bl);
+ encode(num_rd, bl);
+ encode(num_rd_kb, bl);
+ encode(num_wr, bl);
+ encode(num_wr_kb, bl);
+ encode(num_scrub_errors, bl);
+ encode(num_objects_recovered, bl);
+ encode(num_bytes_recovered, bl);
+ encode(num_keys_recovered, bl);
+ encode(num_shallow_scrub_errors, bl);
+ encode(num_deep_scrub_errors, bl);
+ encode(num_objects_dirty, bl);
+ encode(num_whiteouts, bl);
+ encode(num_objects_omap, bl);
+ encode(num_objects_hit_set_archive, bl);
+ encode(num_objects_misplaced, bl);
+ encode(num_bytes_hit_set_archive, bl);
+ encode(num_flush, bl);
+ encode(num_flush_kb, bl);
+ encode(num_evict, bl);
+ encode(num_evict_kb, bl);
+ encode(num_promote, bl);
+ encode(num_flush_mode_high, bl);
+ encode(num_flush_mode_low, bl);
+ encode(num_evict_mode_some, bl);
+ encode(num_evict_mode_full, bl);
+ encode(num_objects_pinned, bl);
+ encode(num_objects_missing, bl);
+ encode(num_legacy_snapsets, bl);
+ encode(num_large_omap_objects, bl);
+ encode(num_objects_manifest, bl);
+ encode(num_omap_bytes, bl);
+ encode(num_omap_keys, bl);
+ encode(num_objects_repaired, bl);
+#endif
+ ENCODE_FINISH(bl);
+}
+
+void object_stat_sum_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ bool decode_finish = false;
+ static const int STAT_SUM_DECODE_VERSION = 20;
+ DECODE_START(STAT_SUM_DECODE_VERSION, bl);
+#if defined(CEPH_LITTLE_ENDIAN)
+ if (struct_v == STAT_SUM_DECODE_VERSION) {
+ bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
+ decode_finish = true;
+ }
+#endif
+ if (!decode_finish) {
+ decode(num_bytes, bl);
+ decode(num_objects, bl);
+ decode(num_object_clones, bl);
+ decode(num_object_copies, bl);
+ decode(num_objects_missing_on_primary, bl);
+ decode(num_objects_degraded, bl);
+ decode(num_objects_unfound, bl);
+ decode(num_rd, bl);
+ decode(num_rd_kb, bl);
+ decode(num_wr, bl);
+ decode(num_wr_kb, bl);
+ decode(num_scrub_errors, bl);
+ decode(num_objects_recovered, bl);
+ decode(num_bytes_recovered, bl);
+ decode(num_keys_recovered, bl);
+ decode(num_shallow_scrub_errors, bl);
+ decode(num_deep_scrub_errors, bl);
+ decode(num_objects_dirty, bl);
+ decode(num_whiteouts, bl);
+ decode(num_objects_omap, bl);
+ decode(num_objects_hit_set_archive, bl);
+ decode(num_objects_misplaced, bl);
+ decode(num_bytes_hit_set_archive, bl);
+ decode(num_flush, bl);
+ decode(num_flush_kb, bl);
+ decode(num_evict, bl);
+ decode(num_evict_kb, bl);
+ decode(num_promote, bl);
+ decode(num_flush_mode_high, bl);
+ decode(num_flush_mode_low, bl);
+ decode(num_evict_mode_some, bl);
+ decode(num_evict_mode_full, bl);
+ decode(num_objects_pinned, bl);
+ decode(num_objects_missing, bl);
+ if (struct_v >= 16) {
+ decode(num_legacy_snapsets, bl);
+ } else {
+ num_legacy_snapsets = num_object_clones; // upper bound
+ }
+ if (struct_v >= 17) {
+ decode(num_large_omap_objects, bl);
+ }
+ if (struct_v >= 18) {
+ decode(num_objects_manifest, bl);
+ }
+ if (struct_v >= 19) {
+ decode(num_omap_bytes, bl);
+ decode(num_omap_keys, bl);
+ }
+ if (struct_v >= 20) {
+ decode(num_objects_repaired, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
+{
+ object_stat_sum_t a;
+
+ a.num_bytes = 1;
+ a.num_objects = 3;
+ a.num_object_clones = 4;
+ a.num_object_copies = 5;
+ a.num_objects_missing_on_primary = 6;
+ a.num_objects_missing = 123;
+ a.num_objects_degraded = 7;
+ a.num_objects_unfound = 8;
+ a.num_rd = 9; a.num_rd_kb = 10;
+ a.num_wr = 11; a.num_wr_kb = 12;
+ a.num_objects_recovered = 14;
+ a.num_bytes_recovered = 15;
+ a.num_keys_recovered = 16;
+ a.num_deep_scrub_errors = 17;
+ a.num_shallow_scrub_errors = 18;
+ a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
+ a.num_objects_dirty = 21;
+ a.num_whiteouts = 22;
+ a.num_objects_misplaced = 1232;
+ a.num_objects_hit_set_archive = 2;
+ a.num_bytes_hit_set_archive = 27;
+ a.num_flush = 5;
+ a.num_flush_kb = 6;
+ a.num_evict = 7;
+ a.num_evict_kb = 8;
+ a.num_promote = 9;
+ a.num_flush_mode_high = 0;
+ a.num_flush_mode_low = 1;
+ a.num_evict_mode_some = 1;
+ a.num_evict_mode_full = 0;
+ a.num_objects_pinned = 20;
+ a.num_large_omap_objects = 5;
+ a.num_objects_manifest = 2;
+ a.num_omap_bytes = 20000;
+ a.num_omap_keys = 200;
+ a.num_objects_repaired = 300;
+ o.push_back(new object_stat_sum_t(a));
+}
+
+void object_stat_sum_t::add(const object_stat_sum_t& o)
+{
+ num_bytes += o.num_bytes;
+ num_objects += o.num_objects;
+ num_object_clones += o.num_object_clones;
+ num_object_copies += o.num_object_copies;
+ num_objects_missing_on_primary += o.num_objects_missing_on_primary;
+ num_objects_missing += o.num_objects_missing;
+ num_objects_degraded += o.num_objects_degraded;
+ num_objects_misplaced += o.num_objects_misplaced;
+ num_rd += o.num_rd;
+ num_rd_kb += o.num_rd_kb;
+ num_wr += o.num_wr;
+ num_wr_kb += o.num_wr_kb;
+ num_objects_unfound += o.num_objects_unfound;
+ num_scrub_errors += o.num_scrub_errors;
+ num_shallow_scrub_errors += o.num_shallow_scrub_errors;
+ num_deep_scrub_errors += o.num_deep_scrub_errors;
+ num_objects_recovered += o.num_objects_recovered;
+ num_bytes_recovered += o.num_bytes_recovered;
+ num_keys_recovered += o.num_keys_recovered;
+ num_objects_dirty += o.num_objects_dirty;
+ num_whiteouts += o.num_whiteouts;
+ num_objects_omap += o.num_objects_omap;
+ num_objects_hit_set_archive += o.num_objects_hit_set_archive;
+ num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
+ num_flush += o.num_flush;
+ num_flush_kb += o.num_flush_kb;
+ num_evict += o.num_evict;
+ num_evict_kb += o.num_evict_kb;
+ num_promote += o.num_promote;
+ num_flush_mode_high += o.num_flush_mode_high;
+ num_flush_mode_low += o.num_flush_mode_low;
+ num_evict_mode_some += o.num_evict_mode_some;
+ num_evict_mode_full += o.num_evict_mode_full;
+ num_objects_pinned += o.num_objects_pinned;
+ num_legacy_snapsets += o.num_legacy_snapsets;
+ num_large_omap_objects += o.num_large_omap_objects;
+ num_objects_manifest += o.num_objects_manifest;
+ num_omap_bytes += o.num_omap_bytes;
+ num_omap_keys += o.num_omap_keys;
+ num_objects_repaired += o.num_objects_repaired;
+}
+
+void object_stat_sum_t::sub(const object_stat_sum_t& o)
+{
+ num_bytes -= o.num_bytes;
+ num_objects -= o.num_objects;
+ num_object_clones -= o.num_object_clones;
+ num_object_copies -= o.num_object_copies;
+ num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
+ num_objects_missing -= o.num_objects_missing;
+ num_objects_degraded -= o.num_objects_degraded;
+ num_objects_misplaced -= o.num_objects_misplaced;
+ num_rd -= o.num_rd;
+ num_rd_kb -= o.num_rd_kb;
+ num_wr -= o.num_wr;
+ num_wr_kb -= o.num_wr_kb;
+ num_objects_unfound -= o.num_objects_unfound;
+ num_scrub_errors -= o.num_scrub_errors;
+ num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
+ num_deep_scrub_errors -= o.num_deep_scrub_errors;
+ num_objects_recovered -= o.num_objects_recovered;
+ num_bytes_recovered -= o.num_bytes_recovered;
+ num_keys_recovered -= o.num_keys_recovered;
+ num_objects_dirty -= o.num_objects_dirty;
+ num_whiteouts -= o.num_whiteouts;
+ num_objects_omap -= o.num_objects_omap;
+ num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
+ num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
+ num_flush -= o.num_flush;
+ num_flush_kb -= o.num_flush_kb;
+ num_evict -= o.num_evict;
+ num_evict_kb -= o.num_evict_kb;
+ num_promote -= o.num_promote;
+ num_flush_mode_high -= o.num_flush_mode_high;
+ num_flush_mode_low -= o.num_flush_mode_low;
+ num_evict_mode_some -= o.num_evict_mode_some;
+ num_evict_mode_full -= o.num_evict_mode_full;
+ num_objects_pinned -= o.num_objects_pinned;
+ num_legacy_snapsets -= o.num_legacy_snapsets;
+ num_large_omap_objects -= o.num_large_omap_objects;
+ num_objects_manifest -= o.num_objects_manifest;
+ num_omap_bytes -= o.num_omap_bytes;
+ num_omap_keys -= o.num_omap_keys;
+ num_objects_repaired -= o.num_objects_repaired;
+}
+
+bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
+{
+ return
+ l.num_bytes == r.num_bytes &&
+ l.num_objects == r.num_objects &&
+ l.num_object_clones == r.num_object_clones &&
+ l.num_object_copies == r.num_object_copies &&
+ l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
+ l.num_objects_missing == r.num_objects_missing &&
+ l.num_objects_degraded == r.num_objects_degraded &&
+ l.num_objects_misplaced == r.num_objects_misplaced &&
+ l.num_objects_unfound == r.num_objects_unfound &&
+ l.num_rd == r.num_rd &&
+ l.num_rd_kb == r.num_rd_kb &&
+ l.num_wr == r.num_wr &&
+ l.num_wr_kb == r.num_wr_kb &&
+ l.num_scrub_errors == r.num_scrub_errors &&
+ l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
+ l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
+ l.num_objects_recovered == r.num_objects_recovered &&
+ l.num_bytes_recovered == r.num_bytes_recovered &&
+ l.num_keys_recovered == r.num_keys_recovered &&
+ l.num_objects_dirty == r.num_objects_dirty &&
+ l.num_whiteouts == r.num_whiteouts &&
+ l.num_objects_omap == r.num_objects_omap &&
+ l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
+ l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
+ l.num_flush == r.num_flush &&
+ l.num_flush_kb == r.num_flush_kb &&
+ l.num_evict == r.num_evict &&
+ l.num_evict_kb == r.num_evict_kb &&
+ l.num_promote == r.num_promote &&
+ l.num_flush_mode_high == r.num_flush_mode_high &&
+ l.num_flush_mode_low == r.num_flush_mode_low &&
+ l.num_evict_mode_some == r.num_evict_mode_some &&
+ l.num_evict_mode_full == r.num_evict_mode_full &&
+ l.num_objects_pinned == r.num_objects_pinned &&
+ l.num_legacy_snapsets == r.num_legacy_snapsets &&
+ l.num_large_omap_objects == r.num_large_omap_objects &&
+ l.num_objects_manifest == r.num_objects_manifest &&
+ l.num_omap_bytes == r.num_omap_bytes &&
+ l.num_omap_keys == r.num_omap_keys &&
+ l.num_objects_repaired == r.num_objects_repaired;
+}
+
+// -- object_stat_collection_t --
+
+void object_stat_collection_t::dump(Formatter *f) const
+{
+ f->open_object_section("stat_sum");
+ sum.dump(f);
+ f->close_section();
+}
+
+void object_stat_collection_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(2, 2, bl);
+ encode(sum, bl);
+ encode((__u32)0, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_stat_collection_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(sum, bl);
+ {
+ map<string,object_stat_sum_t> cat_sum;
+ decode(cat_sum, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
+{
+ object_stat_collection_t a;
+ o.push_back(new object_stat_collection_t(a));
+ list<object_stat_sum_t*> l;
+ object_stat_sum_t::generate_test_instances(l);
+ for (auto p = l.begin(); p != l.end(); ++p) {
+ a.add(**p);
+ o.push_back(new object_stat_collection_t(a));
+ }
+}
+
+
+// -- pg_stat_t --
+
+bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
+{
+ if (primary && osd == acting_primary) {
+ return true;
+ } else if (!primary) {
+ for(auto it = acting.cbegin(); it != acting.cend(); ++it)
+ {
+ if (*it == osd)
+ return true;
+ }
+ }
+ return false;
+}
+
+void pg_stat_t::dump(Formatter *f) const
+{
+ f->dump_stream("version") << version;
+ f->dump_unsigned("reported_seq", reported_seq);
+ f->dump_unsigned("reported_epoch", reported_epoch);
+ f->dump_string("state", pg_state_string(state));
+ f->dump_stream("last_fresh") << last_fresh;
+ f->dump_stream("last_change") << last_change;
+ f->dump_stream("last_active") << last_active;
+ f->dump_stream("last_peered") << last_peered;
+ f->dump_stream("last_clean") << last_clean;
+ f->dump_stream("last_became_active") << last_became_active;
+ f->dump_stream("last_became_peered") << last_became_peered;
+ f->dump_stream("last_unstale") << last_unstale;
+ f->dump_stream("last_undegraded") << last_undegraded;
+ f->dump_stream("last_fullsized") << last_fullsized;
+ f->dump_unsigned("mapping_epoch", mapping_epoch);
+ f->dump_stream("log_start") << log_start;
+ f->dump_stream("ondisk_log_start") << ondisk_log_start;
+ f->dump_unsigned("created", created);
+ f->dump_unsigned("last_epoch_clean", last_epoch_clean);
+ f->dump_stream("parent") << parent;
+ f->dump_unsigned("parent_split_bits", parent_split_bits);
+ f->dump_stream("last_scrub") << last_scrub;
+ f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+ f->dump_stream("last_deep_scrub") << last_deep_scrub;
+ f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
+ f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
+ f->dump_int("log_size", log_size);
+ f->dump_int("ondisk_log_size", ondisk_log_size);
+ f->dump_bool("stats_invalid", stats_invalid);
+ f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
+ f->dump_bool("omap_stats_invalid", omap_stats_invalid);
+ f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
+ f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
+ f->dump_bool("pin_stats_invalid", pin_stats_invalid);
+ f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
+ f->dump_unsigned("snaptrimq_len", snaptrimq_len);
+ stats.dump(f);
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("avail_no_missing");
+ for (auto p = avail_no_missing.cbegin(); p != avail_no_missing.cend(); ++p)
+ f->dump_stream("shard") << *p;
+ f->close_section();
+ f->open_array_section("object_location_counts");
+ for (auto p = object_location_counts.cbegin(); p != object_location_counts.cend(); ++p) {
+ f->open_object_section("entry");
+ f->dump_stream("shards") << p->first;
+ f->dump_int("objects", p->second);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("blocked_by");
+ for (auto p = blocked_by.cbegin(); p != blocked_by.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
+ f->open_array_section("purged_snaps");
+ for (auto i = purged_snaps.begin(); i != purged_snaps.end(); ++i) {
+ f->open_object_section("interval");
+ f->dump_stream("start") << i.get_start();
+ f->dump_stream("length") << i.get_len();
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_stat_t::dump_brief(Formatter *f) const
+{
+ f->dump_string("state", pg_state_string(state));
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("up_primary", up_primary);
+ f->dump_int("acting_primary", acting_primary);
+}
+
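+// The 64-bit pg state is split across the encoding: the low 32 bits are
+// written early, where older peers expect a 32-bit state, and the high
+// 32 bits are appended near the end for struct_v >= 24 decoders.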
+void pg_stat_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(26, 22, bl);
+ encode(version, bl);
+ encode(reported_seq, bl);
+ encode(reported_epoch, bl);
+ encode((__u32)state, bl); // for older peers
+ encode(log_start, bl);
+ encode(ondisk_log_start, bl);
+ encode(created, bl);
+ encode(last_epoch_clean, bl);
+ encode(parent, bl);
+ encode(parent_split_bits, bl);
+ encode(last_scrub, bl);
+ encode(last_scrub_stamp, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(last_fresh, bl);
+ encode(last_change, bl);
+ encode(last_active, bl);
+ encode(last_clean, bl);
+ encode(last_unstale, bl);
+ encode(mapping_epoch, bl);
+ encode(last_deep_scrub, bl);
+ encode(last_deep_scrub_stamp, bl);
+ encode(stats_invalid, bl);
+ encode(last_clean_scrub_stamp, bl);
+ encode(last_became_active, bl);
+ encode(dirty_stats_invalid, bl);
+ encode(up_primary, bl);
+ encode(acting_primary, bl);
+ encode(omap_stats_invalid, bl);
+ encode(hitset_stats_invalid, bl);
+ encode(blocked_by, bl);
+ encode(last_undegraded, bl);
+ encode(last_fullsized, bl);
+ encode(hitset_bytes_stats_invalid, bl);
+ encode(last_peered, bl);
+ encode(last_became_peered, bl);
+ encode(pin_stats_invalid, bl);
+ encode(snaptrimq_len, bl);
+ __u32 top_state = (state >> 32);
+ encode(top_state, bl);
+ encode(purged_snaps, bl);
+ encode(manifest_stats_invalid, bl);
+ encode(avail_no_missing, bl);
+ encode(object_location_counts, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ bool tmp;
+ uint32_t old_state;
+ DECODE_START(26, bl);
+ decode(version, bl);
+ decode(reported_seq, bl);
+ decode(reported_epoch, bl);
+ decode(old_state, bl);
+ decode(log_start, bl);
+ decode(ondisk_log_start, bl);
+ decode(created, bl);
+ decode(last_epoch_clean, bl);
+ decode(parent, bl);
+ decode(parent_split_bits, bl);
+ decode(last_scrub, bl);
+ decode(last_scrub_stamp, bl);
+ decode(stats, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ decode(up, bl);
+ decode(acting, bl);
+ decode(last_fresh, bl);
+ decode(last_change, bl);
+ decode(last_active, bl);
+ decode(last_clean, bl);
+ decode(last_unstale, bl);
+ decode(mapping_epoch, bl);
+ decode(last_deep_scrub, bl);
+ decode(last_deep_scrub_stamp, bl);
+ decode(tmp, bl);
+ stats_invalid = tmp;
+ decode(last_clean_scrub_stamp, bl);
+ decode(last_became_active, bl);
+ decode(tmp, bl);
+ dirty_stats_invalid = tmp;
+ decode(up_primary, bl);
+ decode(acting_primary, bl);
+ decode(tmp, bl);
+ omap_stats_invalid = tmp;
+ decode(tmp, bl);
+ hitset_stats_invalid = tmp;
+ decode(blocked_by, bl);
+ decode(last_undegraded, bl);
+ decode(last_fullsized, bl);
+ decode(tmp, bl);
+ hitset_bytes_stats_invalid = tmp;
+ decode(last_peered, bl);
+ decode(last_became_peered, bl);
+ decode(tmp, bl);
+ pin_stats_invalid = tmp;
+ if (struct_v >= 23) {
+ decode(snaptrimq_len, bl);
+ if (struct_v >= 24) {
+ __u32 top_state;
+ decode(top_state, bl);
+ state = (uint64_t)old_state | ((uint64_t)top_state << 32);
+ decode(purged_snaps, bl);
+ } else {
+ state = old_state;
+ }
+ if (struct_v >= 25) {
+ decode(tmp, bl);
+ manifest_stats_invalid = tmp;
+ } else {
+ manifest_stats_invalid = true;
+ }
+ if (struct_v >= 26) {
+ decode(avail_no_missing, bl);
+ decode(object_location_counts, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
+{
+ pg_stat_t a;
+ o.push_back(new pg_stat_t(a));
+
+ a.version = eversion_t(1, 3);
+ a.reported_epoch = 1;
+ a.reported_seq = 2;
+ a.state = 123;
+ a.mapping_epoch = 998;
+ a.last_fresh = utime_t(1002, 1);
+ a.last_change = utime_t(1002, 2);
+ a.last_active = utime_t(1002, 3);
+ a.last_clean = utime_t(1002, 4);
+ a.last_unstale = utime_t(1002, 5);
+ a.last_undegraded = utime_t(1002, 7);
+ a.last_fullsized = utime_t(1002, 8);
+ a.log_start = eversion_t(1, 4);
+ a.ondisk_log_start = eversion_t(1, 5);
+ a.created = 6;
+ a.last_epoch_clean = 7;
+ a.parent = pg_t(1, 2);
+ a.parent_split_bits = 12;
+ a.last_scrub = eversion_t(9, 10);
+ a.last_scrub_stamp = utime_t(11, 12);
+ a.last_deep_scrub = eversion_t(13, 14);
+ a.last_deep_scrub_stamp = utime_t(15, 16);
+ a.last_clean_scrub_stamp = utime_t(17, 18);
+ a.snaptrimq_len = 1048576;
+ list<object_stat_collection_t*> l;
+ object_stat_collection_t::generate_test_instances(l);
+ a.stats = *l.back();
+ a.log_size = 99;
+ a.ondisk_log_size = 88;
+ a.up.push_back(123);
+ a.up_primary = 123;
+ a.acting.push_back(456);
+ a.avail_no_missing.push_back(pg_shard_t(456, shard_id_t::NO_SHARD));
+ set<pg_shard_t> sset = { pg_shard_t(0), pg_shard_t(1) };
+ a.object_location_counts.insert(make_pair(sset, 10));
+ sset.insert(pg_shard_t(2));
+ a.object_location_counts.insert(make_pair(sset, 5));
+ a.acting_primary = 456;
+ o.push_back(new pg_stat_t(a));
+
+ a.up.push_back(124);
+ a.up_primary = 124;
+ a.acting.push_back(124);
+ a.acting_primary = 124;
+ a.blocked_by.push_back(155);
+ a.blocked_by.push_back(156);
+ o.push_back(new pg_stat_t(a));
+}
+
+bool operator==(const pg_stat_t& l, const pg_stat_t& r)
+{
+ return
+ l.version == r.version &&
+ l.reported_seq == r.reported_seq &&
+ l.reported_epoch == r.reported_epoch &&
+ l.state == r.state &&
+ l.last_fresh == r.last_fresh &&
+ l.last_change == r.last_change &&
+ l.last_active == r.last_active &&
+ l.last_peered == r.last_peered &&
+ l.last_clean == r.last_clean &&
+ l.last_unstale == r.last_unstale &&
+ l.last_undegraded == r.last_undegraded &&
+ l.last_fullsized == r.last_fullsized &&
+ l.log_start == r.log_start &&
+ l.ondisk_log_start == r.ondisk_log_start &&
+ l.created == r.created &&
+ l.last_epoch_clean == r.last_epoch_clean &&
+ l.parent == r.parent &&
+ l.parent_split_bits == r.parent_split_bits &&
+ l.last_scrub == r.last_scrub &&
+ l.last_deep_scrub == r.last_deep_scrub &&
+ l.last_scrub_stamp == r.last_scrub_stamp &&
+ l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+ l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+ l.stats == r.stats &&
+ l.stats_invalid == r.stats_invalid &&
+ l.log_size == r.log_size &&
+ l.ondisk_log_size == r.ondisk_log_size &&
+ l.up == r.up &&
+ l.acting == r.acting &&
+ l.avail_no_missing == r.avail_no_missing &&
+ l.object_location_counts == r.object_location_counts &&
+ l.mapping_epoch == r.mapping_epoch &&
+ l.blocked_by == r.blocked_by &&
+ l.last_became_active == r.last_became_active &&
+ l.last_became_peered == r.last_became_peered &&
+ l.dirty_stats_invalid == r.dirty_stats_invalid &&
+ l.omap_stats_invalid == r.omap_stats_invalid &&
+ l.hitset_stats_invalid == r.hitset_stats_invalid &&
+ l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
+ l.up_primary == r.up_primary &&
+ l.acting_primary == r.acting_primary &&
+ l.pin_stats_invalid == r.pin_stats_invalid &&
+ l.manifest_stats_invalid == r.manifest_stats_invalid &&
+ l.purged_snaps == r.purged_snaps &&
+ l.snaptrimq_len == r.snaptrimq_len;
+}
+
+// -- store_statfs_t --
+
+bool store_statfs_t::operator==(const store_statfs_t& other) const
+{
+ return total == other.total
+ && available == other.available
+ && allocated == other.allocated
+ && internally_reserved == other.internally_reserved
+ && data_stored == other.data_stored
+ && data_compressed == other.data_compressed
+ && data_compressed_allocated == other.data_compressed_allocated
+ && data_compressed_original == other.data_compressed_original
+ && omap_allocated == other.omap_allocated
+ && internal_metadata == other.internal_metadata;
+}
+
+void store_statfs_t::dump(Formatter *f) const
+{
+ f->dump_int("total", total);
+ f->dump_int("available", available);
+ f->dump_int("internally_reserved", internally_reserved);
+ f->dump_int("allocated", allocated);
+ f->dump_int("data_stored", data_stored);
+ f->dump_int("data_compressed", data_compressed);
+ f->dump_int("data_compressed_allocated", data_compressed_allocated);
+ f->dump_int("data_compressed_original", data_compressed_original);
+ f->dump_int("omap_allocated", omap_allocated);
+ f->dump_int("internal_metadata", internal_metadata);
+}
+
+ostream& operator<<(ostream& out, const store_statfs_t &s)
+{
+ out << std::hex
+ << "store_statfs(0x" << s.available
+ << "/0x" << s.internally_reserved
+ << "/0x" << s.total
+ << ", data 0x" << s.data_stored
+ << "/0x" << s.allocated
+ << ", compress 0x" << s.data_compressed
+ << "/0x" << s.data_compressed_allocated
+ << "/0x" << s.data_compressed_original
+ << ", omap 0x" << s.omap_allocated
+ << ", meta 0x" << s.internal_metadata
+ << std::dec
+ << ")";
+ return out;
+}
+
+void store_statfs_t::generate_test_instances(list<store_statfs_t*>& o)
+{
+ store_statfs_t a;
+ o.push_back(new store_statfs_t(a));
+ a.total = 234;
+ a.available = 123;
+ a.internally_reserved = 33;
+ a.allocated = 32;
+ a.data_stored = 44;
+ a.data_compressed = 21;
+ a.data_compressed_allocated = 12;
+ a.data_compressed_original = 13;
+ a.omap_allocated = 14;
+ a.internal_metadata = 15;
+ o.push_back(new store_statfs_t(a));
+}
+
+// -- pool_stat_t --
+
+void pool_stat_t::dump(Formatter *f) const
+{
+ stats.dump(f);
+ f->open_object_section("store_stats");
+ store_stats.dump(f);
+ f->close_section();
+ f->dump_int("log_size", log_size);
+ f->dump_int("ondisk_log_size", ondisk_log_size);
+ f->dump_int("up", up);
+ f->dump_int("acting", acting);
+ f->dump_int("num_store_stats", num_store_stats);
+}
+
+void pool_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ using ceph::encode;
+ if ((features & CEPH_FEATURE_OSDENC) == 0) {
+ __u8 v = 4;
+ encode(v, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ return;
+ }
+
+ ENCODE_START(7, 5, bl);
+ encode(stats, bl);
+ encode(log_size, bl);
+ encode(ondisk_log_size, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(store_stats, bl);
+ encode(num_store_stats, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pool_stat_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
+ if (struct_v >= 4) {
+ decode(stats, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ if (struct_v >= 6) {
+ decode(up, bl);
+ decode(acting, bl);
+ } else {
+ up = 0;
+ acting = 0;
+ }
+ if (struct_v >= 7) {
+ decode(store_stats, bl);
+ decode(num_store_stats, bl);
+ } else {
+ store_stats.reset();
+ num_store_stats = 0;
+ }
+
+ } else {
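+ // legacy (struct_v < 4) encoding carried individual sum fields rather than an object_stat_collection_t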
+ decode(stats.sum.num_bytes, bl);
+ uint64_t num_kb;
+ decode(num_kb, bl);
+ decode(stats.sum.num_objects, bl);
+ decode(stats.sum.num_object_clones, bl);
+ decode(stats.sum.num_object_copies, bl);
+ decode(stats.sum.num_objects_missing_on_primary, bl);
+ decode(stats.sum.num_objects_degraded, bl);
+ decode(log_size, bl);
+ decode(ondisk_log_size, bl);
+ if (struct_v >= 2) {
+ decode(stats.sum.num_rd, bl);
+ decode(stats.sum.num_rd_kb, bl);
+ decode(stats.sum.num_wr, bl);
+ decode(stats.sum.num_wr_kb, bl);
+ }
+ if (struct_v >= 3) {
+ decode(stats.sum.num_objects_unfound, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
+{
+ pool_stat_t a;
+ o.push_back(new pool_stat_t(a));
+
+ list<object_stat_collection_t*> l;
+ object_stat_collection_t::generate_test_instances(l);
+ list<store_statfs_t*> ll;
+ store_statfs_t::generate_test_instances(ll);
+ a.stats = *l.back();
+ a.store_stats = *ll.back();
+ a.log_size = 123;
+ a.ondisk_log_size = 456;
+ a.acting = 3;
+ a.up = 4;
+ a.num_store_stats = 1;
+ o.push_back(new pool_stat_t(a));
+}
+
+
+// -- pg_history_t --
+
+void pg_history_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(10, 4, bl);
+ encode(epoch_created, bl);
+ encode(last_epoch_started, bl);
+ encode(last_epoch_clean, bl);
+ encode(last_epoch_split, bl);
+ encode(same_interval_since, bl);
+ encode(same_up_since, bl);
+ encode(same_primary_since, bl);
+ encode(last_scrub, bl);
+ encode(last_scrub_stamp, bl);
+ encode(last_deep_scrub, bl);
+ encode(last_deep_scrub_stamp, bl);
+ encode(last_clean_scrub_stamp, bl);
+ encode(last_epoch_marked_full, bl);
+ encode(last_interval_started, bl);
+ encode(last_interval_clean, bl);
+ encode(epoch_pool_created, bl);
+ encode(prior_readable_until_ub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_history_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl);
+ decode(epoch_created, bl);
+ decode(last_epoch_started, bl);
+ if (struct_v >= 3)
+ decode(last_epoch_clean, bl);
+ else
+ last_epoch_clean = last_epoch_started; // careful, it's a lie!
+ decode(last_epoch_split, bl);
+ decode(same_interval_since, bl);
+ decode(same_up_since, bl);
+ decode(same_primary_since, bl);
+ if (struct_v >= 2) {
+ decode(last_scrub, bl);
+ decode(last_scrub_stamp, bl);
+ }
+ if (struct_v >= 5) {
+ decode(last_deep_scrub, bl);
+ decode(last_deep_scrub_stamp, bl);
+ }
+ if (struct_v >= 6) {
+ decode(last_clean_scrub_stamp, bl);
+ }
+ if (struct_v >= 7) {
+ decode(last_epoch_marked_full, bl);
+ }
+ if (struct_v >= 8) {
+ decode(last_interval_started, bl);
+ decode(last_interval_clean, bl);
+ } else {
+ if (last_epoch_started >= same_interval_since) {
+ last_interval_started = same_interval_since;
+ } else {
+ last_interval_started = last_epoch_started; // best guess
+ }
+ if (last_epoch_clean >= same_interval_since) {
+ last_interval_clean = same_interval_since;
+ } else {
+ last_interval_clean = last_epoch_clean; // best guess
+ }
+ }
+ if (struct_v >= 9) {
+ decode(epoch_pool_created, bl);
+ } else {
+ epoch_pool_created = epoch_created;
+ }
+ if (struct_v >= 10) {
+ decode(prior_readable_until_ub, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_history_t::dump(Formatter *f) const
+{
+ f->dump_int("epoch_created", epoch_created);
+ f->dump_int("epoch_pool_created", epoch_pool_created);
+ f->dump_int("last_epoch_started", last_epoch_started);
+ f->dump_int("last_interval_started", last_interval_started);
+ f->dump_int("last_epoch_clean", last_epoch_clean);
+ f->dump_int("last_interval_clean", last_interval_clean);
+ f->dump_int("last_epoch_split", last_epoch_split);
+ f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
+ f->dump_int("same_up_since", same_up_since);
+ f->dump_int("same_interval_since", same_interval_since);
+ f->dump_int("same_primary_since", same_primary_since);
+ f->dump_stream("last_scrub") << last_scrub;
+ f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
+ f->dump_stream("last_deep_scrub") << last_deep_scrub;
+ f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
+ f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
+ f->dump_float(
+ "prior_readable_until_ub",
+ std::chrono::duration<double>(prior_readable_until_ub).count());
+}
+
+void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
+{
+ o.push_back(new pg_history_t);
+ o.push_back(new pg_history_t);
+ o.back()->epoch_created = 1;
+ o.back()->epoch_pool_created = 1;
+ o.back()->last_epoch_started = 2;
+ o.back()->last_interval_started = 2;
+ o.back()->last_epoch_clean = 3;
+ o.back()->last_interval_clean = 2;
+ o.back()->last_epoch_split = 4;
+ o.back()->prior_readable_until_ub = make_timespan(3.1415);
+ o.back()->same_up_since = 5;
+ o.back()->same_interval_since = 6;
+ o.back()->same_primary_since = 7;
+ o.back()->last_scrub = eversion_t(8, 9);
+ o.back()->last_scrub_stamp = utime_t(10, 11);
+ o.back()->last_deep_scrub = eversion_t(12, 13);
+ o.back()->last_deep_scrub_stamp = utime_t(14, 15);
+ o.back()->last_clean_scrub_stamp = utime_t(16, 17);
+ o.back()->last_epoch_marked_full = 18;
+}
+
+
+// -- pg_info_t --
+
+void pg_info_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(32, 26, bl);
+ encode(pgid.pgid, bl);
+ encode(last_update, bl);
+ encode(last_complete, bl);
+ encode(log_tail, bl);
+ encode(hobject_t(), bl); // old (nibblewise) last_backfill
+ encode(stats, bl);
+ history.encode(bl);
+ encode(purged_snaps, bl);
+ encode(last_epoch_started, bl);
+ encode(last_user_version, bl);
+ encode(hit_set, bl);
+ encode(pgid.shard, bl);
+ encode(last_backfill, bl);
+ encode(true, bl); // was last_backfill_bitwise
+ encode(last_interval_started, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_info_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(32, bl);
+ decode(pgid.pgid, bl);
+ decode(last_update, bl);
+ decode(last_complete, bl);
+ decode(log_tail, bl);
+ {
+ hobject_t old_last_backfill;
+ decode(old_last_backfill, bl);
+ }
+ decode(stats, bl);
+ history.decode(bl);
+ decode(purged_snaps, bl);
+ decode(last_epoch_started, bl);
+ decode(last_user_version, bl);
+ decode(hit_set, bl);
+ decode(pgid.shard, bl);
+ decode(last_backfill, bl);
+ {
+ bool last_backfill_bitwise;
+ decode(last_backfill_bitwise, bl);
+ // note: we may see a false value here since the default value for
+ // the member was false, so it often didn't get set to true until
+ // peering progressed.
+ }
+ if (struct_v >= 32) {
+ decode(last_interval_started, bl);
+ } else {
+ last_interval_started = last_epoch_started;
+ }
+ DECODE_FINISH(bl);
+}
+
+// -- pg_info_t --
+
+void pg_info_t::dump(Formatter *f) const
+{
+ f->dump_stream("pgid") << pgid;
+ f->dump_stream("last_update") << last_update;
+ f->dump_stream("last_complete") << last_complete;
+ f->dump_stream("log_tail") << log_tail;
+ f->dump_int("last_user_version", last_user_version);
+ f->dump_stream("last_backfill") << last_backfill;
+ f->open_array_section("purged_snaps");
+ for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
+ i != purged_snaps.end();
+ ++i) {
+ f->open_object_section("purged_snap_interval");
+ f->dump_stream("start") << i.get_start();
+ f->dump_stream("length") << i.get_len();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_object_section("history");
+ history.dump(f);
+ f->close_section();
+ f->open_object_section("stats");
+ stats.dump(f);
+ f->close_section();
+
+ f->dump_int("empty", is_empty());
+ f->dump_int("dne", dne());
+ f->dump_int("incomplete", is_incomplete());
+ f->dump_int("last_epoch_started", last_epoch_started);
+
+ f->open_object_section("hit_set_history");
+ hit_set.dump(f);
+ f->close_section();
+}
+
+void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
+{
+ o.push_back(new pg_info_t);
+ o.push_back(new pg_info_t);
+ list<pg_history_t*> h;
+ pg_history_t::generate_test_instances(h);
+ o.back()->history = *h.back();
+ o.back()->pgid = spg_t(pg_t(1, 2), shard_id_t::NO_SHARD);
+ o.back()->last_update = eversion_t(3, 4);
+ o.back()->last_complete = eversion_t(5, 6);
+ o.back()->last_user_version = 2;
+ o.back()->log_tail = eversion_t(7, 8);
+ o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
+ {
+ list<pg_stat_t*> s;
+ pg_stat_t::generate_test_instances(s);
+ o.back()->stats = *s.back();
+ }
+ {
+ list<pg_hit_set_history_t*> s;
+ pg_hit_set_history_t::generate_test_instances(s);
+ o.back()->hit_set = *s.back();
+ }
+}
+
+// -- pg_notify_t --
+void pg_notify_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(query_epoch, bl);
+ encode(epoch_sent, bl);
+ encode(info, bl);
+ encode(to, bl);
+ encode(from, bl);
+ encode(past_intervals, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_notify_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(3, bl);
+ decode(query_epoch, bl);
+ decode(epoch_sent, bl);
+ decode(info, bl);
+ decode(to, bl);
+ decode(from, bl);
+ if (struct_v >= 3) {
+ decode(past_intervals, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_notify_t::dump(Formatter *f) const
+{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+ f->dump_unsigned("query_epoch", query_epoch);
+ f->dump_unsigned("epoch_sent", epoch_sent);
+ {
+ f->open_object_section("info");
+ info.dump(f);
+ f->close_section();
+ }
+ f->dump_object("past_intervals", past_intervals);
+}
+
+void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
+{
+ o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1,
+ pg_info_t(), PastIntervals()));
+ o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10,
+ pg_info_t(), PastIntervals()));
+}
+
+ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
+{
+ lhs << "(query:" << notify.query_epoch
+ << " sent:" << notify.epoch_sent
+ << " " << notify.info;
+ if (notify.from != shard_id_t::NO_SHARD ||
+ notify.to != shard_id_t::NO_SHARD)
+ lhs << " " << (unsigned)notify.from
+ << "->" << (unsigned)notify.to;
+ lhs << " " << notify.past_intervals;
+ return lhs << ")";
+}
+
+// -- pg_interval_t --
+
+void PastIntervals::pg_interval_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(4, 2, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(up, bl);
+ encode(acting, bl);
+ encode(maybe_went_rw, bl);
+ encode(primary, bl);
+ encode(up_primary, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PastIntervals::pg_interval_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(up, bl);
+ decode(acting, bl);
+ decode(maybe_went_rw, bl);
+ if (struct_v >= 3) {
+ decode(primary, bl);
+ } else {
+ if (acting.size())
+ primary = acting[0];
+ }
+ if (struct_v >= 4) {
+ decode(up_primary, bl);
+ } else {
+ if (up.size())
+ up_primary = up[0];
+ }
+ DECODE_FINISH(bl);
+}
+
+void PastIntervals::pg_interval_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("first", first);
+ f->dump_unsigned("last", last);
+ f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
+ f->open_array_section("up");
+ for (auto p = up.cbegin(); p != up.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->open_array_section("acting");
+ for (auto p = acting.cbegin(); p != acting.cend(); ++p)
+ f->dump_int("osd", *p);
+ f->close_section();
+ f->dump_int("primary", primary);
+ f->dump_int("up_primary", up_primary);
+}
+
+void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
+{
+ o.push_back(new pg_interval_t);
+ o.push_back(new pg_interval_t);
+ o.back()->up.push_back(1);
+ o.back()->acting.push_back(2);
+ o.back()->acting.push_back(3);
+ o.back()->first = 4;
+ o.back()->last = 5;
+ o.back()->maybe_went_rw = true;
+}
+
+WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
+
+
+/**
+ * pi_compact_rep
+ *
+ * PastIntervals only needs to be able to answer two questions:
+ * 1) Where should the primary look for unfound objects?
+ * 2) List a set of subsets of the OSDs such that contacting at least
+ * one from each subset guarantees we speak to at least one witness
+ * of any completed write.
+ *
+ * Crucially, 2) does not require keeping *all* past intervals. Certainly,
+ * we don't need to keep any where maybe_went_rw would be false. We also
+ * needn't keep two intervals where the acting set in one is a subset
+ * of the other (only need to keep the smaller of the two sets). In order
+ * to accurately trim the set of intervals as last_epoch_started changes
+ * without rebuilding the set from scratch, we'll retain the larger set
+ * if it is in an older interval.
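+ *
+ * For example, if one interval had acting {0, 1, 2} and a later interval
+ * had acting {1, 2}, any OSD contacted from {1, 2} is also a member of
+ * {0, 1, 2}, so retaining only the interval with the smaller set still
+ * satisfies 2).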
+ */
+struct compact_interval_t {
+ epoch_t first;
+ epoch_t last;
+ set<pg_shard_t> acting;
+ bool supersedes(const compact_interval_t &other) const {
+ for (auto &&i: acting) {
+ if (!other.acting.count(i))
+ return false;
+ }
+ return true;
+ }
+ void dump(Formatter *f) const {
+ f->open_object_section("compact_interval_t");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->dump_stream("acting") << acting;
+ f->close_section();
+ }
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(acting, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(acting, bl);
+ DECODE_FINISH(bl);
+ }
+ static void generate_test_instances(list<compact_interval_t*> & o) {
+ /* Not going to be used, we'll generate pi_compact_rep directly */
+ }
+};
+ostream &operator<<(ostream &o, const compact_interval_t &rhs)
+{
+ return o << "([" << rhs.first << "," << rhs.last
+ << "] acting " << rhs.acting << ")";
+}
+WRITE_CLASS_ENCODER(compact_interval_t)
+
+class pi_compact_rep : public PastIntervals::interval_rep {
+ epoch_t first = 0;
+ epoch_t last = 0; // inclusive
+ set<pg_shard_t> all_participants;
+ list<compact_interval_t> intervals;
+ pi_compact_rep(
+ bool ec_pool,
+ std::list<PastIntervals::pg_interval_t> &&intervals) {
+ for (auto &&i: intervals)
+ add_interval(ec_pool, i);
+ }
+public:
+ pi_compact_rep() = default;
+ pi_compact_rep(const pi_compact_rep &) = default;
+ pi_compact_rep(pi_compact_rep &&) = default;
+ pi_compact_rep &operator=(const pi_compact_rep &) = default;
+ pi_compact_rep &operator=(pi_compact_rep &&) = default;
+
+ size_t size() const override { return intervals.size(); }
+ bool empty() const override {
+ return first > last || (first == 0 && last == 0);
+ }
+ void clear() override {
+ *this = pi_compact_rep();
+ }
+ pair<epoch_t, epoch_t> get_bounds() const override {
+ return make_pair(first, last + 1);
+ }
+ void adjust_start_backwards(epoch_t last_epoch_clean) override {
+ first = last_epoch_clean;
+ }
+
+ set<pg_shard_t> get_all_participants(
+ bool ec_pool) const override {
+ return all_participants;
+ }
+ void add_interval(
+ bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
+ if (first == 0)
+ first = interval.first;
+ ceph_assert(interval.last > last);
+ last = interval.last;
+ set<pg_shard_t> acting;
+ for (unsigned i = 0; i < interval.acting.size(); ++i) {
+ if (interval.acting[i] == CRUSH_ITEM_NONE)
+ continue;
+ acting.insert(
+ pg_shard_t(
+ interval.acting[i],
+ ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ all_participants.insert(acting.begin(), acting.end());
+ if (!interval.maybe_went_rw)
+ return;
+ intervals.push_back(
+ compact_interval_t{interval.first, interval.last, acting});
+ auto plast = intervals.end();
+ --plast;
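+ // drop any older interval whose acting set is a superset of the new one; the smaller, newer set already covers it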
+ for (auto cur = intervals.begin(); cur != plast; ) {
+ if (plast->supersedes(*cur)) {
+ intervals.erase(cur++);
+ } else {
+ ++cur;
+ }
+ }
+ }
+ unique_ptr<PastIntervals::interval_rep> clone() const override {
+ return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
+ }
+ ostream &print(ostream &out) const override {
+ return out << "([" << first << "," << last
+ << "] all_participants=" << all_participants
+ << " intervals=" << intervals << ")";
+ }
+ void encode(ceph::buffer::list &bl) const override {
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(last, bl);
+ encode(all_participants, bl);
+ encode(intervals, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) override {
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(last, bl);
+ decode(all_participants, bl);
+ decode(intervals, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const override {
+ f->open_object_section("PastIntervals::compact_rep");
+ f->dump_stream("first") << first;
+ f->dump_stream("last") << last;
+ f->open_array_section("all_participants");
+ for (auto& i : all_participants) {
+ f->dump_object("pg_shard", i);
+ }
+ f->close_section();
+ f->open_array_section("intervals");
+ for (auto &&i: intervals) {
+ i.dump(f);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ static void generate_test_instances(list<pi_compact_rep*> &o) {
+ using ival = PastIntervals::pg_interval_t;
+ using ivallst = std::list<ival>;
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ false, ivallst
+ { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
+ , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
+ , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
+ , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
+ }));
+ o.push_back(
+ new pi_compact_rep(
+ true, ivallst
+ { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
+ , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
+ , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
+ , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
+ }));
+ }
+ void iterate_mayberw_back_to(
+ epoch_t les,
+ std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
+ for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
+ if (i->last < les)
+ break;
+ f(i->first, i->acting);
+ }
+ }
+ virtual ~pi_compact_rep() override {}
+};
+WRITE_CLASS_ENCODER(pi_compact_rep)
+
+PastIntervals::PastIntervals()
+{
+ past_intervals.reset(new pi_compact_rep);
+}
+
+PastIntervals::PastIntervals(const PastIntervals &rhs)
+ : past_intervals(rhs.past_intervals ?
+ rhs.past_intervals->clone() :
+ nullptr) {}
+
+PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
+{
+ PastIntervals other(rhs);
+ swap(other);
+ return *this;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals &i)
+{
+ if (i.past_intervals) {
+ return i.past_intervals->print(out);
+ } else {
+ return out << "(empty)";
+ }
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
+{
+ return out << "PriorSet("
+ << "ec_pool: " << i.ec_pool
+ << ", probe: " << i.probe
+ << ", down: " << i.down
+ << ", blocked_by: " << i.blocked_by
+ << ", pg_down: " << i.pg_down
+ << ")";
+}
+
+void PastIntervals::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ __u8 type = 0;
+ decode(type, bl);
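+ // type 0: nothing encoded; type 1: legacy pi_simple_rep (removed post-luminous); type 2: pi_compact_rep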
+ switch (type) {
+ case 0:
+ break;
+ case 1:
+ ceph_abort_msg("pi_simple_rep support removed post-luminous");
+ break;
+ case 2:
+ past_intervals.reset(new pi_compact_rep);
+ past_intervals->decode(bl);
+ break;
+ }
+ DECODE_FINISH(bl);
+}
+
+void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
+{
+ {
+ list<pi_compact_rep *> compact;
+ pi_compact_rep::generate_test_instances(compact);
+ for (auto &&i: compact) {
+ // takes ownership of contents
+ o.push_back(new PastIntervals(i));
+ }
+ }
+ return;
+}
+
+bool PastIntervals::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ unsigned old_pg_num_pending,
+ unsigned new_pg_num_pending,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ uint32_t old_crush_count,
+ uint32_t new_crush_count,
+ uint32_t old_crush_target,
+ uint32_t new_crush_target,
+ uint32_t old_crush_barrier,
+ uint32_t new_crush_barrier,
+ int32_t old_crush_member,
+ int32_t new_crush_member,
+ pg_t pgid) {
+ return old_acting_primary != new_acting_primary ||
+ new_acting != old_acting ||
+ old_up_primary != new_up_primary ||
+ new_up != old_up ||
+ old_min_size != new_min_size ||
+ old_size != new_size ||
+ pgid.is_split(old_pg_num, new_pg_num, 0) ||
+ // (is or was) pre-merge source
+ pgid.is_merge_source(old_pg_num_pending, new_pg_num_pending, 0) ||
+ pgid.is_merge_source(new_pg_num_pending, old_pg_num_pending, 0) ||
+ // merge source
+ pgid.is_merge_source(old_pg_num, new_pg_num, 0) ||
+ // (is or was) pre-merge target
+ pgid.is_merge_target(old_pg_num_pending, new_pg_num_pending) ||
+ pgid.is_merge_target(new_pg_num_pending, old_pg_num_pending) ||
+ // merge target
+ pgid.is_merge_target(old_pg_num, new_pg_num) ||
+ old_sort_bitwise != new_sort_bitwise ||
+ old_recovery_deletes != new_recovery_deletes ||
+ old_crush_count != new_crush_count ||
+ old_crush_target != new_crush_target ||
+ old_crush_barrier != new_crush_barrier ||
+ old_crush_member != new_crush_member;
+}
+
+bool PastIntervals::is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ const OSDMap *osdmap,
+ const OSDMap *lastmap,
+ pg_t pgid)
+{
+ const pg_pool_t *plast = lastmap->get_pg_pool(pgid.pool());
+ if (!plast) {
+ return false; // after pool is deleted there are no more interval changes
+ }
+ const pg_pool_t *pi = osdmap->get_pg_pool(pgid.pool());
+ if (!pi) {
+ return true; // pool was deleted this epoch -> (final!) interval change
+ }
+ return
+ is_new_interval(old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ plast->size,
+ pi->size,
+ plast->min_size,
+ pi->min_size,
+ plast->get_pg_num(),
+ pi->get_pg_num(),
+ plast->get_pg_num_pending(),
+ pi->get_pg_num_pending(),
+ lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+ osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
+ lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+ osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
+ plast->peering_crush_bucket_count, pi->peering_crush_bucket_count,
+ plast->peering_crush_bucket_target, pi->peering_crush_bucket_target,
+ plast->peering_crush_bucket_barrier, pi->peering_crush_bucket_barrier,
+ plast->peering_crush_mandatory_member, pi->peering_crush_mandatory_member,
+ pgid);
+}
+
+bool PastIntervals::check_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const vector<int> &old_acting,
+ const vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const vector<int> &old_up,
+ const vector<int> &new_up,
+ epoch_t same_interval_since,
+ epoch_t last_epoch_clean,
+ const OSDMap *osdmap,
+ const OSDMap *lastmap,
+ pg_t pgid,
+ const IsPGRecoverablePredicate &could_have_gone_active,
+ PastIntervals *past_intervals,
+ std::ostream *out)
+{
+ /*
+ * We have to be careful to gracefully deal with situations like the
+ * following. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * _any_ OSDs in the prior set are down until we send an MOSDAlive
+ * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3:
+ * 4: A
+ *
+ * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3: B up_thru[B]=2
+ * 4:
+ * 5: A
+ *
+ * -> we must wait for B, bc it was alive through 2, and could have
+ * written to the pg.
+ *
+ * If B is really dead, then an administrator will need to manually
+ * intervene by marking the OSD as "lost."
+ */
+
+ // remember past interval
+ // NOTE: a change in the up set primary triggers an interval
+ // change, even though the interval members in the pg_interval_t
+ // do not change.
+ ceph_assert(past_intervals);
+ ceph_assert(past_intervals->past_intervals);
+ if (is_new_interval(
+ old_acting_primary,
+ new_acting_primary,
+ old_acting,
+ new_acting,
+ old_up_primary,
+ new_up_primary,
+ old_up,
+ new_up,
+ osdmap,
+ lastmap,
+ pgid)) {
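+ // record the interval that just ended: it spans from same_interval_since up to the epoch before this map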
+ pg_interval_t i;
+ i.first = same_interval_since;
+ i.last = osdmap->get_epoch() - 1;
+ ceph_assert(i.first <= i.last);
+ i.acting = old_acting;
+ i.up = old_up;
+ i.primary = old_acting_primary;
+ i.up_primary = old_up_primary;
+
+ unsigned num_acting = 0;
+ for (auto p = i.acting.cbegin(); p != i.acting.cend(); ++p)
+ if (*p != CRUSH_ITEM_NONE)
+ ++num_acting;
+
+ ceph_assert(lastmap->get_pools().count(pgid.pool()));
+ const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
+ set<pg_shard_t> old_acting_shards;
+ old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
+
+ if (num_acting &&
+ i.primary != -1 &&
+ num_acting >= old_pg_pool.min_size &&
+ (!old_pg_pool.is_stretch_pool() ||
+ old_pg_pool.stretch_set_can_peer(old_acting, *lastmap, out)) &&
+ could_have_gone_active(old_acting_shards)) {
+ if (out)
+ *out << __func__ << " " << i
+ << " up_thru " << lastmap->get_up_thru(i.primary)
+ << " up_from " << lastmap->get_up_from(i.primary)
+ << " last_epoch_clean " << last_epoch_clean;
+ if (lastmap->get_up_thru(i.primary) >= i.first &&
+ lastmap->get_up_from(i.primary) <= i.first) {
+ i.maybe_went_rw = true;
+ if (out)
+ *out << " " << i
+ << " : primary up " << lastmap->get_up_from(i.primary)
+ << "-" << lastmap->get_up_thru(i.primary)
+ << " includes interval"
+ << std::endl;
+ } else if (last_epoch_clean >= i.first &&
+ last_epoch_clean <= i.last) {
+ // If the last_epoch_clean is included in this interval, then
+ // the pg must have been rw (for recovery to have completed).
+ // This is important because we won't know the _real_
+ // first_epoch because we stop at last_epoch_clean, and we
+ // don't want the oldest interval to randomly have
+ // maybe_went_rw false depending on the relative up_thru vs
+ // last_epoch_clean timing.
+ i.maybe_went_rw = true;
+ if (out)
+ *out << " " << i
+ << " : includes last_epoch_clean " << last_epoch_clean
+ << " and presumed to have been rw"
+ << std::endl;
+ } else {
+ i.maybe_went_rw = false;
+ if (out)
+ *out << " " << i
+ << " : primary up " << lastmap->get_up_from(i.primary)
+ << "-" << lastmap->get_up_thru(i.primary)
+ << " does not include interval"
+ << std::endl;
+ }
+ } else {
+ i.maybe_went_rw = false;
+ if (out)
+ *out << __func__ << " " << i << " : acting set is too small" << std::endl;
+ }
+ past_intervals->past_intervals->add_interval(old_pg_pool.is_erasure(), i);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// true if the given map affects the prior set
+bool PastIntervals::PriorSet::affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const
+{
+ for (auto p = probe.begin(); p != probe.end(); ++p) {
+ int o = p->osd;
+
+ // did someone in the prior set go down?
+ if (osdmap.is_down(o) && down.count(o) == 0) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
+ return true;
+ }
+
+ // did a down osd in cur get (re)marked as lost?
+ auto r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ // did someone in the prior down set go up?
+ for (auto p = down.cbegin(); p != down.cend(); ++p) {
+ int o = *p;
+
+ if (osdmap.is_up(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
+ return true;
+ }
+
+ // did someone in the prior set get lost or destroyed?
+ if (!osdmap.exists(o)) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
+ return true;
+ }
+ // did a down osd in down get (re)marked as lost?
+ auto r = blocked_by.find(o);
+ if (r != blocked_by.end()) {
+ if (osdmap.get_info(o).lost_at != r->second) {
+ ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
+{
+ out << "interval(" << i.first << "-" << i.last
+ << " up " << i.up << "(" << i.up_primary << ")"
+ << " acting " << i.acting << "(" << i.primary << ")";
+ if (i.maybe_went_rw)
+ out << " maybe_went_rw";
+ out << ")";
+ return out;
+}
+
+
+
+// -- pg_query_t --
+
+void pg_query_t::encode(ceph::buffer::list &bl, uint64_t features) const {
+ ENCODE_START(3, 3, bl);
+ encode(type, bl);
+ encode(since, bl);
+ history.encode(bl);
+ encode(epoch_sent, bl);
+ encode(to, bl);
+ encode(from, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_query_t::decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(3, bl);
+ decode(type, bl);
+ decode(since, bl);
+ history.decode(bl);
+ decode(epoch_sent, bl);
+ decode(to, bl);
+ decode(from, bl);
+ DECODE_FINISH(bl);
+}
+
+void pg_query_t::dump(Formatter *f) const
+{
+ f->dump_int("from", from);
+ f->dump_int("to", to);
+ f->dump_string("type", get_type_name());
+ f->dump_stream("since") << since;
+ f->dump_stream("epoch_sent") << epoch_sent;
+ f->open_object_section("history");
+ history.dump(f);
+ f->close_section();
+}
+void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
+{
+ o.push_back(new pg_query_t());
+ list<pg_history_t*> h;
+ pg_history_t::generate_test_instances(h);
+ o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
+ eversion_t(4, 5), *h.back(), 4));
+ o.push_back(new pg_query_t(pg_query_t::FULLLOG,
+ shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
+ *h.back(), 5));
+}
+
+// -- pg_lease_t --
+
+void pg_lease_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(readable_until, bl);
+ encode(readable_until_ub, bl);
+ encode(interval, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_lease_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(readable_until, p);
+ decode(readable_until_ub, p);
+ decode(interval, p);
+ DECODE_FINISH(p);
+}
+
+void pg_lease_t::dump(Formatter *f) const
+{
+ f->dump_stream("readable_until") << readable_until;
+ f->dump_stream("readable_until_ub") << readable_until_ub;
+ f->dump_stream("interval") << interval;
+}
+
+void pg_lease_t::generate_test_instances(std::list<pg_lease_t*>& o)
+{
+ o.push_back(new pg_lease_t());
+ o.push_back(new pg_lease_t());
+ o.back()->readable_until = make_timespan(1.5);
+ o.back()->readable_until_ub = make_timespan(3.4);
+ o.back()->interval = make_timespan(1.0);
+}
+
+// -- pg_lease_ack_t --
+
+void pg_lease_ack_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(readable_until_ub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_lease_ack_t::decode(bufferlist::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(readable_until_ub, p);
+ DECODE_FINISH(p);
+}
+
+void pg_lease_ack_t::dump(Formatter *f) const
+{
+ f->dump_stream("readable_until_ub") << readable_until_ub;
+}
+
+void pg_lease_ack_t::generate_test_instances(std::list<pg_lease_ack_t*>& o)
+{
+ o.push_back(new pg_lease_ack_t());
+ o.push_back(new pg_lease_ack_t());
+ o.back()->readable_until_ub = make_timespan(3.4);
+}
+
+
+// -- ObjectModDesc --
+void ObjectModDesc::visit(Visitor *visitor) const
+{
+ auto bp = bl.cbegin();
+ try {
+ while (!bp.end()) {
+ DECODE_START(max_required_version, bp);
+ uint8_t code;
+ decode(code, bp);
+ switch (code) {
+ case APPEND: {
+ uint64_t size;
+ decode(size, bp);
+ visitor->append(size);
+ break;
+ }
+ case SETATTRS: {
+ map<string, std::optional<ceph::buffer::list> > attrs;
+ decode(attrs, bp);
+ visitor->setattrs(attrs);
+ break;
+ }
+ case DELETE: {
+ version_t old_version;
+ decode(old_version, bp);
+ visitor->rmobject(old_version);
+ break;
+ }
+ case CREATE: {
+ visitor->create();
+ break;
+ }
+ case UPDATE_SNAPS: {
+ set<snapid_t> snaps;
+ decode(snaps, bp);
+ visitor->update_snaps(snaps);
+ break;
+ }
+ case TRY_DELETE: {
+ version_t old_version;
+ decode(old_version, bp);
+ visitor->try_rmobject(old_version);
+ break;
+ }
+ case ROLLBACK_EXTENTS: {
+ vector<pair<uint64_t, uint64_t> > extents;
+ version_t gen;
+ decode(gen, bp);
+ decode(extents, bp);
+ visitor->rollback_extents(gen,extents);
+ break;
+ }
+ default:
+ ceph_abort_msg("Invalid rollback code");
+ }
+ DECODE_FINISH(bp);
+ }
+ } catch (...) {
+ ceph_abort_msg("Invalid encoding");
+ }
+}
+
+struct DumpVisitor : public ObjectModDesc::Visitor {
+ Formatter *f;
+ explicit DumpVisitor(Formatter *f) : f(f) {}
+ void append(uint64_t old_size) override {
+ f->open_object_section("op");
+ f->dump_string("code", "APPEND");
+ f->dump_unsigned("old_size", old_size);
+ f->close_section();
+ }
+ void setattrs(map<string, std::optional<ceph::buffer::list> > &attrs) override {
+ f->open_object_section("op");
+ f->dump_string("code", "SETATTRS");
+ f->open_array_section("attrs");
+ for (auto i = attrs.begin(); i != attrs.end(); ++i) {
+ f->dump_string("attr_name", i->first);
+ }
+ f->close_section();
+ f->close_section();
+ }
+ void rmobject(version_t old_version) override {
+ f->open_object_section("op");
+ f->dump_string("code", "RMOBJECT");
+ f->dump_unsigned("old_version", old_version);
+ f->close_section();
+ }
+ void try_rmobject(version_t old_version) override {
+ f->open_object_section("op");
+ f->dump_string("code", "TRY_RMOBJECT");
+ f->dump_unsigned("old_version", old_version);
+ f->close_section();
+ }
+ void create() override {
+ f->open_object_section("op");
+ f->dump_string("code", "CREATE");
+ f->close_section();
+ }
+ void update_snaps(const set<snapid_t> &snaps) override {
+ f->open_object_section("op");
+ f->dump_string("code", "UPDATE_SNAPS");
+ f->dump_stream("snaps") << snaps;
+ f->close_section();
+ }
+ void rollback_extents(
+ version_t gen,
+ const vector<pair<uint64_t, uint64_t> > &extents) override {
+ f->open_object_section("op");
+ f->dump_string("code", "ROLLBACK_EXTENTS");
+ f->dump_unsigned("gen", gen);
+ f->dump_stream("snaps") << extents;
+ f->close_section();
+ }
+};
+
+void ObjectModDesc::dump(Formatter *f) const
+{
+ f->open_object_section("object_mod_desc");
+ f->dump_bool("can_local_rollback", can_local_rollback);
+ f->dump_bool("rollback_info_completed", rollback_info_completed);
+ {
+ f->open_array_section("ops");
+ DumpVisitor vis(f);
+ visit(&vis);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
+{
+ map<string, std::optional<ceph::buffer::list> > attrs;
+ attrs[OI_ATTR];
+ attrs[SS_ATTR];
+ attrs["asdf"];
+ o.push_back(new ObjectModDesc());
+ o.back()->append(100);
+ o.back()->setattrs(attrs);
+ o.push_back(new ObjectModDesc());
+ o.back()->rmobject(1001);
+ o.push_back(new ObjectModDesc());
+ o.back()->create();
+ o.back()->setattrs(attrs);
+ o.push_back(new ObjectModDesc());
+ o.back()->create();
+ o.back()->setattrs(attrs);
+ o.back()->mark_unrollbackable();
+ o.back()->append(1000);
+}
+
+void ObjectModDesc::encode(ceph::buffer::list &_bl) const
+{
+ ENCODE_START(max_required_version, max_required_version, _bl);
+ encode(can_local_rollback, _bl);
+ encode(rollback_info_completed, _bl);
+ encode(bl, _bl);
+ ENCODE_FINISH(_bl);
+}
+void ObjectModDesc::decode(ceph::buffer::list::const_iterator &_bl)
+{
+ DECODE_START(2, _bl);
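+ // remember the version we decoded so a subsequent re-encode keeps the same struct version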
+ max_required_version = struct_v;
+ decode(can_local_rollback, _bl);
+ decode(rollback_info_completed, _bl);
+ decode(bl, _bl);
+ // ensure bl does not pin a larger ceph::buffer in memory
+ bl.rebuild();
+ bl.reassign_to_mempool(mempool::mempool_osd_pglog);
+ DECODE_FINISH(_bl);
+}
+
+std::atomic<uint32_t> ObjectCleanRegions::max_num_intervals = {10};
+
+void ObjectCleanRegions::set_max_num_intervals(uint32_t num)
+{
+ max_num_intervals = num;
+}
+
+void ObjectCleanRegions::trim()
+{
+ while (clean_offsets.num_intervals() > max_num_intervals) {
+ auto shortest_interval = clean_offsets.begin();
+ if (shortest_interval == clean_offsets.end())
+ break;
+ for (auto it = clean_offsets.begin();
+ it != clean_offsets.end();
+ ++it) {
+ if (it.get_len() < shortest_interval.get_len())
+ shortest_interval = it;
+ }
+ clean_offsets.erase(shortest_interval);
+ }
+}
+
+void ObjectCleanRegions::merge(const ObjectCleanRegions &other)
+{
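+ // a region remains clean only if it is clean on both sides; likewise for the omap flag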
+ clean_offsets.intersection_of(other.clean_offsets);
+ clean_omap = clean_omap && other.clean_omap;
+ trim();
+}
+
+void ObjectCleanRegions::mark_data_region_dirty(uint64_t offset, uint64_t len)
+{
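+ // remove [offset, offset+len) from the clean set by intersecting with its complement over the full range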
+ interval_set<uint64_t> clean_region;
+ clean_region.insert(0, (uint64_t)-1);
+ clean_region.erase(offset, len);
+ clean_offsets.intersection_of(clean_region);
+ trim();
+}
+
+bool ObjectCleanRegions::is_clean_region(uint64_t offset, uint64_t len) const
+{
+ return clean_offsets.contains(offset, len);
+}
+
+void ObjectCleanRegions::mark_omap_dirty()
+{
+ clean_omap = false;
+}
+
+void ObjectCleanRegions::mark_object_new()
+{
+ new_object = true;
+}
+
+void ObjectCleanRegions::mark_fully_dirty()
+{
+ mark_data_region_dirty(0, (uint64_t)-1);
+ mark_omap_dirty();
+ mark_object_new();
+}
+
+interval_set<uint64_t> ObjectCleanRegions::get_dirty_regions() const
+{
+ interval_set<uint64_t> dirty_region;
+ dirty_region.insert(0, (uint64_t)-1);
+ dirty_region.subtract(clean_offsets);
+ return dirty_region;
+}
+
+bool ObjectCleanRegions::omap_is_dirty() const
+{
+ return !clean_omap;
+}
+
+bool ObjectCleanRegions::object_is_exist() const
+{
+ return !new_object;
+}
+
+void ObjectCleanRegions::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ using ceph::encode;
+ encode(clean_offsets, bl);
+ encode(clean_omap, bl);
+ encode(new_object, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectCleanRegions::decode(bufferlist::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ using ceph::decode;
+ decode(clean_offsets, bl);
+ decode(clean_omap, bl);
+ decode(new_object, bl);
+ DECODE_FINISH(bl);
+}
+
+void ObjectCleanRegions::dump(Formatter *f) const
+{
+ f->open_object_section("object_clean_regions");
+ f->dump_stream("clean_offsets") << clean_offsets;
+ f->dump_bool("clean_omap", clean_omap);
+ f->dump_bool("new_object", new_object);
+ f->close_section();
+}
+
+void ObjectCleanRegions::generate_test_instances(list<ObjectCleanRegions*>& o)
+{
+ o.push_back(new ObjectCleanRegions());
+ o.push_back(new ObjectCleanRegions());
+ o.back()->mark_data_region_dirty(4096, 40960);
+ o.back()->mark_omap_dirty();
+ o.back()->mark_object_new();
+}
+
+ostream& operator<<(ostream& out, const ObjectCleanRegions& ocr)
+{
+ return out << "clean_offsets: " << ocr.clean_offsets
+ << ", clean_omap: " << ocr.clean_omap
+ << ", new_object: " << ocr.new_object;
+}
+
+// -- pg_log_entry_t --
+
+string pg_log_entry_t::get_key_name() const
+{
+ return version.get_key_name();
+}
+
+void pg_log_entry_t::encode_with_checksum(ceph::buffer::list& bl) const
+{
+ using ceph::encode;
+ ceph::buffer::list ebl(sizeof(*this)*2);
+ this->encode(ebl);
+ __u32 crc = ebl.crc32c(0);
+ encode(ebl, bl);
+ encode(crc, bl);
+}
+
+void pg_log_entry_t::decode_with_checksum(ceph::buffer::list::const_iterator& p)
+{
+ using ceph::decode;
+ ceph::buffer::list bl;
+ decode(bl, p);
+ __u32 crc;
+ decode(crc, p);
+ if (crc != bl.crc32c(0))
+ throw ceph::buffer::malformed_input("bad checksum on pg_log_entry_t");
+ auto q = bl.cbegin();
+ this->decode(q);
+}
+
+void pg_log_entry_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(14, 4, bl);
+ encode(op, bl);
+ encode(soid, bl);
+ encode(version, bl);
+
+ /**
+ * Added with reverting_to:
+ * Previous code used prior_version to encode
+ * what we now call reverting_to. This will
+ * allow older code to decode reverting_to
+ * into prior_version as expected.
+ */
+ if (op == LOST_REVERT)
+ encode(reverting_to, bl);
+ else
+ encode(prior_version, bl);
+
+ encode(reqid, bl);
+ encode(mtime, bl);
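+ // for LOST_REVERT the real prior_version follows mtime; struct_v >= 6 decoders read it here, while the first slot carried reverting_to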
+ if (op == LOST_REVERT)
+ encode(prior_version, bl);
+ encode(snaps, bl);
+ encode(user_version, bl);
+ encode(mod_desc, bl);
+ encode(extra_reqids, bl);
+ if (op == ERROR)
+ encode(return_code, bl);
+ if (!extra_reqids.empty())
+ encode(extra_reqid_return_codes, bl);
+ encode(clean_regions, bl);
+ if (op != ERROR)
+ encode(return_code, bl);
+ encode(op_returns, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_entry_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(14, 4, 4, bl);
+ decode(op, bl);
+ if (struct_v < 2) {
+ sobject_t old_soid;
+ decode(old_soid, bl);
+ soid.oid = old_soid.oid;
+ soid.snap = old_soid.snap;
+ invalid_hash = true;
+ } else {
+ decode(soid, bl);
+ }
+ if (struct_v < 3)
+ invalid_hash = true;
+ decode(version, bl);
+
+ if (struct_v >= 6 && op == LOST_REVERT)
+ decode(reverting_to, bl);
+ else
+ decode(prior_version, bl);
+
+ decode(reqid, bl);
+
+ decode(mtime, bl);
+ if (struct_v < 5)
+ invalid_pool = true;
+
+ if (op == LOST_REVERT) {
+ if (struct_v >= 6) {
+ decode(prior_version, bl);
+ } else {
+ reverting_to = prior_version;
+ }
+ }
+ if (struct_v >= 7 || // for v >= 7, this is for all ops.
+ op == CLONE) { // for v < 7, it's only present for CLONE.
+ decode(snaps, bl);
+ // ensure snaps does not pin a larger ceph::buffer in memory
+ snaps.rebuild();
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ if (struct_v >= 8)
+ decode(user_version, bl);
+ else
+ user_version = version.version;
+
+ if (struct_v >= 9)
+ decode(mod_desc, bl);
+ else
+ mod_desc.mark_unrollbackable();
+ if (struct_v >= 10)
+ decode(extra_reqids, bl);
+ if (struct_v >= 11 && op == ERROR)
+ decode(return_code, bl);
+ if (struct_v >= 12 && !extra_reqids.empty())
+ decode(extra_reqid_return_codes, bl);
+ if (struct_v >= 13)
+ decode(clean_regions, bl);
+ else
+ clean_regions.mark_fully_dirty();
+ if (struct_v >= 14) {
+ if (op != ERROR) {
+ decode(return_code, bl);
+ }
+ decode(op_returns, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_log_entry_t::dump(Formatter *f) const
+{
+ f->dump_string("op", get_op_name());
+ f->dump_stream("object") << soid;
+ f->dump_stream("version") << version;
+ f->dump_stream("prior_version") << prior_version;
+ f->dump_stream("reqid") << reqid;
+ f->open_array_section("extra_reqids");
+ uint32_t idx = 0;
+ for (auto p = extra_reqids.begin();
+ p != extra_reqids.end();
+ ++idx, ++p) {
+ f->open_object_section("extra_reqid");
+ f->dump_stream("reqid") << p->first;
+ f->dump_stream("user_version") << p->second;
+ auto it = extra_reqid_return_codes.find(idx);
+ if (it != extra_reqid_return_codes.end()) {
+ f->dump_int("return_code", it->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_stream("mtime") << mtime;
+ f->dump_int("return_code", return_code);
+ if (!op_returns.empty()) {
+ f->open_array_section("op_returns");
+ for (auto& i : op_returns) {
+ f->dump_object("op", i);
+ }
+ f->close_section();
+ }
+ if (snaps.length() > 0) {
+ vector<snapid_t> v;
+ ceph::buffer::list c = snaps;
+ auto p = c.cbegin();
+ try {
+ using ceph::decode;
+ decode(v, p);
+ } catch (...) {
+ v.clear();
+ }
+ f->open_object_section("snaps");
+ for (auto p = v.begin(); p != v.end(); ++p)
+ f->dump_unsigned("snap", *p);
+ f->close_section();
+ }
+ {
+ f->open_object_section("mod_desc");
+ mod_desc.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("clean_regions");
+ clean_regions.dump(f);
+ f->close_section();
+ }
+}
+
+void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
+{
+ o.push_back(new pg_log_entry_t());
+ hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
+ o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
+ 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9), 0));
+ o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
+ 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ utime_t(8,9), -ENOENT));
+}
+
+ostream& operator<<(ostream& out, const pg_log_entry_t& e)
+{
+ out << e.version << " (" << e.prior_version << ") "
+ << std::left << std::setw(8) << e.get_op_name() << ' '
+ << e.soid << " by " << e.reqid << " " << e.mtime
+ << " " << e.return_code;
+ if (!e.op_returns.empty()) {
+ out << " " << e.op_returns;
+ }
+ if (e.snaps.length()) {
+ vector<snapid_t> snaps;
+ ceph::buffer::list c = e.snaps;
+ auto p = c.cbegin();
+ try {
+ decode(snaps, p);
+ } catch (...) {
+ snaps.clear();
+ }
+ out << " snaps " << snaps;
+ }
+ out << " ObjectCleanRegions " << e.clean_regions;
+ return out;
+}
+
+// -- pg_log_dup_t --
+
+std::string pg_log_dup_t::get_key_name() const
+{
+ static const char prefix[] = "dup_";
+ std::string key(36, ' ');
+ memcpy(&key[0], prefix, 4);
+ version.get_key_name(&key[4]);
+ key.resize(35); // remove the null terminator
+ return key;
+}
+
+void pg_log_dup_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(reqid, bl);
+ encode(version, bl);
+ encode(user_version, bl);
+ encode(return_code, bl);
+ encode(op_returns, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_dup_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(2, bl);
+ decode(reqid, bl);
+ decode(version, bl);
+ decode(user_version, bl);
+ decode(return_code, bl);
+ if (struct_v >= 2) {
+ decode(op_returns, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void pg_log_dup_t::dump(Formatter *f) const
+{
+ f->dump_stream("reqid") << reqid;
+ f->dump_stream("version") << version;
+ f->dump_stream("user_version") << user_version;
+ f->dump_stream("return_code") << return_code;
+ if (!op_returns.empty()) {
+ f->open_array_section("op_returns");
+ for (auto& i : op_returns) {
+ f->dump_object("op", i);
+ }
+ f->close_section();
+ }
+}
+
+void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
+{
+ o.push_back(new pg_log_dup_t());
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 1,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ 0));
+ o.push_back(new pg_log_dup_t(eversion_t(1,2),
+ 2,
+ osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+ -ENOENT));
+}
+
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
+ out << "log_dup(reqid=" << e.reqid <<
+ " v=" << e.version << " uv=" << e.user_version <<
+ " rc=" << e.return_code;
+ if (!e.op_returns.empty()) {
+ out << " " << e.op_returns;
+ }
+ return out << ")";
+}
+
+
+// -- pg_log_t --
+
+// out: pg_log_t that only has entries that apply to import_pgid using curmap
+// reject: entries rejected from "in" are placed in reject.log; the other fields of reject are not set.
+void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
+ const string &hit_set_namespace, const pg_log_t &in,
+ pg_log_t &out, pg_log_t &reject)
+{
+ out = in;
+ out.log.clear();
+ reject.log.clear();
+
+ for (auto i = in.log.cbegin(); i != in.log.cend(); ++i) {
+
+ // Reject pg log entries for temporary objects
+ if (i->soid.is_temp()) {
+ reject.log.push_back(*i);
+ continue;
+ }
+
+ if (i->soid.nspace != hit_set_namespace) {
+ object_t oid = i->soid.oid;
+ object_locator_t loc(i->soid);
+ pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
+ pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
+
+ if (import_pgid.pgid == pgid) {
+ out.log.push_back(*i);
+ } else {
+ reject.log.push_back(*i);
+ }
+ } else {
+ out.log.push_back(*i);
+ }
+ }
+}
+
+void pg_log_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(7, 3, bl);
+ encode(head, bl);
+ encode(tail, bl);
+ encode(log, bl);
+ encode(can_rollback_to, bl);
+ encode(rollback_info_trimmed_to, bl);
+ encode(dups, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_log_t::decode(ceph::buffer::list::const_iterator &bl, int64_t pool)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
+ decode(head, bl);
+ decode(tail, bl);
+ if (struct_v < 2) {
+ bool backlog;
+ decode(backlog, bl);
+ }
+ decode(log, bl);
+ if (struct_v >= 5)
+ decode(can_rollback_to, bl);
+
+ if (struct_v >= 6)
+ decode(rollback_info_trimmed_to, bl);
+ else
+ rollback_info_trimmed_to = tail;
+
+ if (struct_v >= 7)
+ decode(dups, bl);
+
+ DECODE_FINISH(bl);
+
+ // handle hobject_t format change
+ if (struct_v < 4) {
+ for (auto i = log.begin(); i != log.end(); ++i) {
+ if (!i->soid.is_max() && i->soid.pool == -1)
+ i->soid.pool = pool;
+ }
+ }
+}
+
+void pg_log_t::dump(Formatter *f) const
+{
+ f->dump_stream("head") << head;
+ f->dump_stream("tail") << tail;
+ f->open_array_section("log");
+ for (auto p = log.cbegin(); p != log.cend(); ++p) {
+ f->open_object_section("entry");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("dups");
+ for (const auto& entry : dups) {
+ f->open_object_section("entry");
+ entry.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
+{
+ o.push_back(new pg_log_t);
+
+ // this is nonsensical:
+ o.push_back(new pg_log_t);
+ o.back()->head = eversion_t(1,2);
+ o.back()->tail = eversion_t(3,4);
+ list<pg_log_entry_t*> e;
+ pg_log_entry_t::generate_test_instances(e);
+ for (auto p = e.begin(); p != e.end(); ++p)
+ o.back()->log.push_back(**p);
+}
+
+static void _handle_dups(CephContext* cct, pg_log_t &target, const pg_log_t &other, unsigned maxdups)
+{
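+ // only dup entries within the most recent maxdups versions (counting back from the target head) are kept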
+ auto earliest_dup_version =
+ target.head.version < maxdups ? 0u : target.head.version - maxdups + 1;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " earliest_dup_version "
+ << earliest_dup_version << dendl;
+
+ for (auto d = other.dups.cbegin(); d != other.dups.cend(); ++d) {
+ if (d->version.version >= earliest_dup_version) {
+ lgeneric_subdout(cct, osd, 20)
+ << "copy_up_to/copy_after copy dup version "
+ << d->version << dendl;
+ target.dups.push_back(pg_log_dup_t(*d));
+ }
+ }
+
+ for (auto i = other.log.cbegin(); i != other.log.cend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (i->version > target.tail)
+ break;
+ if (i->version.version >= earliest_dup_version) {
+ lgeneric_subdout(cct, osd, 20)
+ << "copy_up_to/copy_after copy dup from log version "
+ << i->version << dendl;
+ target.dups.push_back(pg_log_dup_t(*i));
+ }
+ }
+}
+
+
+void pg_log_t::copy_after(CephContext* cct, const pg_log_t &other, eversion_t v)
+{
+ can_rollback_to = other.can_rollback_to;
+ head = other.head;
+ tail = other.tail;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " v " << v
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+ for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (i->version <= v) {
+ // make tail accurate.
+ tail = i->version;
+ break;
+ }
+ lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
+ log.push_front(*i);
+ }
+ _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
+ lgeneric_subdout(cct, osd, 20) << __func__ << " END v " << v
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+}
+
+void pg_log_t::copy_up_to(CephContext* cct, const pg_log_t &other, int max)
+{
+ can_rollback_to = other.can_rollback_to;
+ int n = 0;
+ head = other.head;
+ tail = other.tail;
+ lgeneric_subdout(cct, osd, 20) << __func__ << " max " << max
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+ for (auto i = other.log.crbegin(); i != other.log.crend(); ++i) {
+ ceph_assert(i->version > other.tail);
+ if (n++ >= max) {
+ tail = i->version;
+ break;
+ }
+ lgeneric_subdout(cct, osd, 20) << __func__ << " copy log version " << i->version << dendl;
+ log.push_front(*i);
+ }
+ _handle_dups(cct, *this, other, cct->_conf->osd_pg_log_dups_tracked);
+ lgeneric_subdout(cct, osd, 20) << __func__ << " END max " << max
+ << " dups.size()=" << dups.size()
+ << " other.dups.size()=" << other.dups.size() << dendl;
+}
+
+ostream& pg_log_t::print(ostream& out) const
+{
+ out << *this << std::endl;
+ for (auto p = log.cbegin(); p != log.cend(); ++p)
+ out << *p << std::endl;
+ for (const auto& entry : dups) {
+ out << " dup entry: " << entry << std::endl;
+ }
+ return out;
+}
+
+// -- pg_missing_t --
+
+ostream& operator<<(ostream& out, const pg_missing_item& i)
+{
+ out << i.need;
+ if (i.have != eversion_t())
+ out << "(" << i.have << ")";
+ out << " flags = " << i.flag_str()
+ << " " << i.clean_regions;
+ return out;
+}
+
+// -- object_copy_cursor_t --
+
+void object_copy_cursor_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(attr_complete, bl);
+ encode(data_offset, bl);
+ encode(data_complete, bl);
+ encode(omap_offset, bl);
+ encode(omap_complete, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_copy_cursor_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(attr_complete, bl);
+ decode(data_offset, bl);
+ decode(data_complete, bl);
+ decode(omap_offset, bl);
+ decode(omap_complete, bl);
+ DECODE_FINISH(bl);
+}
+
+void object_copy_cursor_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("attr_complete", (int)attr_complete);
+ f->dump_unsigned("data_offset", data_offset);
+ f->dump_unsigned("data_complete", (int)data_complete);
+ f->dump_string("omap_offset", omap_offset);
+ f->dump_unsigned("omap_complete", (int)omap_complete);
+}
+
+void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
+{
+ o.push_back(new object_copy_cursor_t);
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_offset = 123;
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_complete = true;
+ o.back()->omap_offset = "foo";
+ o.push_back(new object_copy_cursor_t);
+ o.back()->attr_complete = true;
+ o.back()->data_complete = true;
+ o.back()->omap_complete = true;
+}
+
+// -- object_copy_data_t --
+
+void object_copy_data_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ ENCODE_START(8, 5, bl);
+ encode(size, bl);
+ encode(mtime, bl);
+ encode(attrs, bl);
+ encode(data, bl);
+ encode(omap_data, bl);
+ encode(cursor, bl);
+ encode(omap_header, bl);
+ encode(snaps, bl);
+ encode(snap_seq, bl);
+ encode(flags, bl);
+ encode(data_digest, bl);
+ encode(omap_digest, bl);
+ encode(reqids, bl);
+ encode(truncate_seq, bl);
+ encode(truncate_size, bl);
+ encode(reqid_return_codes, bl);
+ ENCODE_FINISH(bl);
+}
+
+void object_copy_data_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(8, bl);
+ if (struct_v < 5) {
+ // old
+ decode(size, bl);
+ decode(mtime, bl);
+ {
+ string category;
+ decode(category, bl); // no longer used
+ }
+ decode(attrs, bl);
+ decode(data, bl);
+ {
+ map<string,ceph::buffer::list> omap;
+ decode(omap, bl);
+ omap_data.clear();
+ if (!omap.empty()) {
+ using ceph::encode;
+ encode(omap, omap_data);
+ }
+ }
+ decode(cursor, bl);
+ if (struct_v >= 2)
+ decode(omap_header, bl);
+ if (struct_v >= 3) {
+ decode(snaps, bl);
+ decode(snap_seq, bl);
+ } else {
+ snaps.clear();
+ snap_seq = 0;
+ }
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ }
+ } else {
+ // current
+ decode(size, bl);
+ decode(mtime, bl);
+ decode(attrs, bl);
+ decode(data, bl);
+ decode(omap_data, bl);
+ decode(cursor, bl);
+ decode(omap_header, bl);
+ decode(snaps, bl);
+ decode(snap_seq, bl);
+ if (struct_v >= 4) {
+ decode(flags, bl);
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ }
+ if (struct_v >= 6) {
+ decode(reqids, bl);
+ }
+ if (struct_v >= 7) {
+ decode(truncate_seq, bl);
+ decode(truncate_size, bl);
+ }
+ if (struct_v >= 8) {
+ decode(reqid_return_codes, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
+{
+ o.push_back(new object_copy_data_t());
+
+ list<object_copy_cursor_t*> cursors;
+ object_copy_cursor_t::generate_test_instances(cursors);
+ auto ci = cursors.begin();
+ o.back()->cursor = **(ci++);
+
+ o.push_back(new object_copy_data_t());
+ o.back()->cursor = **(ci++);
+
+ o.push_back(new object_copy_data_t());
+ o.back()->size = 1234;
+ o.back()->mtime.set_from_double(1234);
+ ceph::buffer::ptr bp("there", 5);
+ ceph::buffer::list bl;
+ bl.push_back(bp);
+ o.back()->attrs["hello"] = bl;
+ ceph::buffer::ptr bp2("not", 3);
+ ceph::buffer::list bl2;
+ bl2.push_back(bp2);
+ map<string,ceph::buffer::list> omap;
+ omap["why"] = bl2;
+ using ceph::encode;
+ encode(omap, o.back()->omap_data);
+ ceph::buffer::ptr databp("iamsomedatatocontain", 20);
+ o.back()->data.push_back(databp);
+ o.back()->omap_header.append("this is an omap header");
+ o.back()->snaps.push_back(123);
+ o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
+}
+
+void object_copy_data_t::dump(Formatter *f) const
+{
+ f->open_object_section("cursor");
+ cursor.dump(f);
+ f->close_section(); // cursor
+ f->dump_int("size", size);
+ f->dump_stream("mtime") << mtime;
+ /* we should really print out the attrs here, but ceph::buffer::list
+ const-correctness prevents that */
+ f->dump_int("attrs_size", attrs.size());
+ f->dump_int("flags", flags);
+ f->dump_unsigned("data_digest", data_digest);
+ f->dump_unsigned("omap_digest", omap_digest);
+ f->dump_int("omap_data_length", omap_data.length());
+ f->dump_int("omap_header_length", omap_header.length());
+ f->dump_int("data_length", data.length());
+ f->open_array_section("snaps");
+ for (auto p = snaps.cbegin(); p != snaps.cend(); ++p)
+ f->dump_unsigned("snap", *p);
+ f->close_section();
+ f->open_array_section("reqids");
+ uint32_t idx = 0;
+ for (auto p = reqids.begin();
+ p != reqids.end();
+ ++idx, ++p) {
+ f->open_object_section("extra_reqid");
+ f->dump_stream("reqid") << p->first;
+ f->dump_stream("user_version") << p->second;
+ auto it = reqid_return_codes.find(idx);
+ if (it != reqid_return_codes.end()) {
+ f->dump_int("return_code", it->second);
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// -- pg_create_t --
+
+void pg_create_t::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(created, bl);
+ encode(parent, bl);
+ encode(split_bits, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_create_t::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(created, bl);
+ decode(parent, bl);
+ decode(split_bits, bl);
+ DECODE_FINISH(bl);
+}
+
+void pg_create_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("created", created);
+ f->dump_stream("parent") << parent;
+ f->dump_int("split_bits", split_bits);
+}
+
+void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
+{
+ o.push_back(new pg_create_t);
+ o.push_back(new pg_create_t(1, pg_t(3, 4), 2));
+}
+
+
+// -- pg_hit_set_info_t --
+
+void pg_hit_set_info_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(2, 1, bl);
+ encode(begin, bl);
+ encode(end, bl);
+ encode(version, bl);
+ encode(using_gmt, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_hit_set_info_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START(2, p);
+ decode(begin, p);
+ decode(end, p);
+ decode(version, p);
+ if (struct_v >= 2) {
+ decode(using_gmt, p);
+ } else {
+ using_gmt = false;
+ }
+ DECODE_FINISH(p);
+}
+
+void pg_hit_set_info_t::dump(Formatter *f) const
+{
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->dump_stream("version") << version;
+ f->dump_stream("using_gmt") << using_gmt;
+}
+
+void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
+{
+ ls.push_back(new pg_hit_set_info_t);
+ ls.push_back(new pg_hit_set_info_t);
+ ls.back()->begin = utime_t(1, 2);
+ ls.back()->end = utime_t(3, 4);
+}
+
+
+// -- pg_hit_set_history_t --
+
+void pg_hit_set_history_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(current_last_update, bl);
+ {
+ utime_t dummy_stamp;
+ encode(dummy_stamp, bl);
+ }
+ {
+ pg_hit_set_info_t dummy_info;
+ encode(dummy_info, bl);
+ }
+ encode(history, bl);
+ ENCODE_FINISH(bl);
+}
+
+void pg_hit_set_history_t::decode(ceph::buffer::list::const_iterator& p)
+{
+ DECODE_START(1, p);
+ decode(current_last_update, p);
+ {
+ utime_t dummy_stamp;
+ decode(dummy_stamp, p);
+ }
+ {
+ pg_hit_set_info_t dummy_info;
+ decode(dummy_info, p);
+ }
+ decode(history, p);
+ DECODE_FINISH(p);
+}
+
+void pg_hit_set_history_t::dump(Formatter *f) const
+{
+ f->dump_stream("current_last_update") << current_last_update;
+ f->open_array_section("history");
+ for (auto p = history.cbegin(); p != history.cend(); ++p) {
+ f->open_object_section("info");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
+{
+ ls.push_back(new pg_hit_set_history_t);
+ ls.push_back(new pg_hit_set_history_t);
+ ls.back()->current_last_update = eversion_t(1, 2);
+ ls.back()->history.push_back(pg_hit_set_info_t());
+}
+
+// -- OSDSuperblock --
+
+void OSDSuperblock::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(9, 5, bl);
+ encode(cluster_fsid, bl);
+ encode(whoami, bl);
+ encode(current_epoch, bl);
+ encode(oldest_map, bl);
+ encode(newest_map, bl);
+ encode(weight, bl);
+ compat_features.encode(bl);
+ encode(clean_thru, bl);
+ encode(mounted, bl);
+ encode(osd_fsid, bl);
+ encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
+ encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
+ encode(purged_snaps_last, bl);
+ encode(last_purged_snaps_scrub, bl);
+ ENCODE_FINISH(bl);
+}
+
+void OSDSuperblock::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(9, 5, 5, bl);
+ if (struct_v < 3) {
+ string magic;
+ decode(magic, bl);
+ }
+ decode(cluster_fsid, bl);
+ decode(whoami, bl);
+ decode(current_epoch, bl);
+ decode(oldest_map, bl);
+ decode(newest_map, bl);
+ decode(weight, bl);
+ if (struct_v >= 2) {
+ compat_features.decode(bl);
+ } else { //upgrade it!
+ compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
+ }
+ decode(clean_thru, bl);
+ decode(mounted, bl);
+ if (struct_v >= 4)
+ decode(osd_fsid, bl);
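+ // last_epoch_marked_full and pool_last_epoch_marked_full are decoded only
+ // to advance past them; they are no longer used.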
+ if (struct_v >= 6) {
+ epoch_t last_map_marked_full;
+ decode(last_map_marked_full, bl);
+ }
+ if (struct_v >= 7) {
+ map<int64_t,epoch_t> pool_last_map_marked_full;
+ decode(pool_last_map_marked_full, bl);
+ }
+ if (struct_v >= 9) {
+ decode(purged_snaps_last, bl);
+ decode(last_purged_snaps_scrub, bl);
+ } else {
+ purged_snaps_last = 0;
+ }
+ DECODE_FINISH(bl);
+}
+
+void OSDSuperblock::dump(Formatter *f) const
+{
+ f->dump_stream("cluster_fsid") << cluster_fsid;
+ f->dump_stream("osd_fsid") << osd_fsid;
+ f->dump_int("whoami", whoami);
+ f->dump_int("current_epoch", current_epoch);
+ f->dump_int("oldest_map", oldest_map);
+ f->dump_int("newest_map", newest_map);
+ f->dump_float("weight", weight);
+ f->open_object_section("compat");
+ compat_features.dump(f);
+ f->close_section();
+ f->dump_int("clean_thru", clean_thru);
+ f->dump_int("last_epoch_mounted", mounted);
+ f->dump_unsigned("purged_snaps_last", purged_snaps_last);
+ f->dump_stream("last_purged_snaps_scrub") << last_purged_snaps_scrub;
+}
+
+void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
+{
+ OSDSuperblock z;
+ o.push_back(new OSDSuperblock(z));
+ z.cluster_fsid.parse("01010101-0101-0101-0101-010101010101");
+ z.osd_fsid.parse("02020202-0202-0202-0202-020202020202");
+ z.whoami = 3;
+ z.current_epoch = 4;
+ z.oldest_map = 5;
+ z.newest_map = 9;
+ z.mounted = 8;
+ z.clean_thru = 7;
+ o.push_back(new OSDSuperblock(z));
+ o.push_back(new OSDSuperblock(z));
+}
+
+// -- SnapSet --
+
+void SnapSet::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(seq, bl);
+ encode(true, bl); // head_exists
+ encode(snaps, bl);
+ encode(clones, bl);
+ encode(clone_overlap, bl);
+ encode(clone_size, bl);
+ encode(clone_snaps, bl);
+ ENCODE_FINISH(bl);
+}
+
+void SnapSet::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(seq, bl);
+ bl += 1u; // skip legacy head_exists (always true)
+ decode(snaps, bl);
+ decode(clones, bl);
+ decode(clone_overlap, bl);
+ decode(clone_size, bl);
+ if (struct_v >= 3) {
+ decode(clone_snaps, bl);
+ } else {
+ clone_snaps.clear();
+ }
+ DECODE_FINISH(bl);
+}
+
+void SnapSet::dump(Formatter *f) const
+{
+ f->dump_unsigned("seq", seq);
+ f->open_array_section("clones");
+ for (auto p = clones.cbegin(); p != clones.cend(); ++p) {
+ f->open_object_section("clone");
+ f->dump_unsigned("snap", *p);
+ auto cs = clone_size.find(*p);
+ if (cs != clone_size.end())
+ f->dump_unsigned("size", cs->second);
+ else
+ f->dump_string("size", "????");
+ auto co = clone_overlap.find(*p);
+ if (co != clone_overlap.end())
+ f->dump_stream("overlap") << co->second;
+ else
+ f->dump_stream("overlap") << "????";
+ auto q = clone_snaps.find(*p);
+ if (q != clone_snaps.end()) {
+ f->open_array_section("snaps");
+ for (auto s : q->second) {
+ f->dump_unsigned("snap", s);
+ }
+ f->close_section();
+ }
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void SnapSet::generate_test_instances(list<SnapSet*>& o)
+{
+ o.push_back(new SnapSet);
+ o.push_back(new SnapSet);
+ o.back()->seq = 123;
+ o.back()->snaps.push_back(123);
+ o.back()->snaps.push_back(12);
+ o.push_back(new SnapSet);
+ o.back()->seq = 123;
+ o.back()->snaps.push_back(123);
+ o.back()->snaps.push_back(12);
+ o.back()->clones.push_back(12);
+ o.back()->clone_size[12] = 12345;
+ o.back()->clone_overlap[12];
+ o.back()->clone_snaps[12] = {12, 10, 8};
+}
+
+ostream& operator<<(ostream& out, const SnapSet& cs)
+{
+ return out << cs.seq << "=" << cs.snaps << ":"
+ << cs.clone_snaps;
+}
+
+void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
+{
+ // NOTE: our reconstruction of snaps (and the snapc) is not strictly
+ // correct: it will not include snaps that still logically exist
+ // but for which no clone was defined. For all
+ // practical purposes this doesn't matter, since we only use that
+ // information to clone on the OSD, and we have already moved
+ // forward past that part of the object history.
+
+ seq = ss.seq;
+ set<snapid_t> _snaps;
+ set<snapid_t> _clones;
+ for (auto p = ss.clones.cbegin(); p != ss.clones.cend(); ++p) {
+ if (p->cloneid != librados::SNAP_HEAD) {
+ _clones.insert(p->cloneid);
+ _snaps.insert(p->snaps.begin(), p->snaps.end());
+ clone_size[p->cloneid] = p->size;
+ clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
+ for (auto q = p->overlap.cbegin(); q != p->overlap.cend(); ++q)
+ clone_overlap[p->cloneid].insert(q->first, q->second);
+ if (!legacy) {
+ // p->snaps is ascending; clone_snaps is descending
+ vector<snapid_t>& v = clone_snaps[p->cloneid];
+ for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
+ v.push_back(*q);
+ }
+ }
+ }
+ }
+
+ // ascending
+ clones.clear();
+ clones.reserve(_clones.size());
+ for (auto p = _clones.begin(); p != _clones.end(); ++p)
+ clones.push_back(*p);
+
+ // descending
+ snaps.clear();
+ snaps.reserve(_snaps.size());
+ for (auto p = _snaps.rbegin();
+ p != _snaps.rend(); ++p)
+ snaps.push_back(*p);
+}
+
+uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
+{
+ ceph_assert(clone_size.count(clone));
+ uint64_t size = clone_size.find(clone)->second;
+ ceph_assert(clone_overlap.count(clone));
+ const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
+ ceph_assert(size >= (uint64_t)overlap.size());
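+ // unique bytes for this clone: its full size minus the bytes recorded as
+ // overlapping in clone_overlap.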
+ return size - overlap.size();
+}
+
+void SnapSet::filter(const pg_pool_t &pinfo)
+{
+ vector<snapid_t> oldsnaps;
+ oldsnaps.swap(snaps);
+ for (auto i = oldsnaps.cbegin(); i != oldsnaps.cend(); ++i) {
+ if (!pinfo.is_removed_snap(*i))
+ snaps.push_back(*i);
+ }
+}
+
+SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
+{
+ SnapSet ss = *this;
+ ss.filter(pinfo);
+ return ss;
+}
+
+// -- watch_info_t --
+
+void watch_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ ENCODE_START(4, 3, bl);
+ encode(cookie, bl);
+ encode(timeout_seconds, bl);
+ encode(addr, bl, features);
+ ENCODE_FINISH(bl);
+}
+
+void watch_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+ decode(cookie, bl);
+ if (struct_v < 2) {
+ uint64_t ver;
+ decode(ver, bl);
+ }
+ decode(timeout_seconds, bl);
+ if (struct_v >= 4) {
+ decode(addr, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void watch_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("cookie", cookie);
+ f->dump_unsigned("timeout_seconds", timeout_seconds);
+ f->open_object_section("addr");
+ addr.dump(f);
+ f->close_section();
+}
+
+void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
+{
+ o.push_back(new watch_info_t);
+ o.push_back(new watch_info_t);
+ o.back()->cookie = 123;
+ o.back()->timeout_seconds = 99;
+ entity_addr_t ea;
+ ea.set_type(entity_addr_t::TYPE_LEGACY);
+ ea.set_nonce(1);
+ ea.set_family(AF_INET);
+ ea.set_in4_quad(0, 127);
+ ea.set_in4_quad(1, 0);
+ ea.set_in4_quad(2, 1);
+ ea.set_in4_quad(3, 2);
+ ea.set_port(2);
+ o.back()->addr = ea;
+}
+
+// -- chunk_info_t --
+
+void chunk_info_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(offset, bl);
+ encode(length, bl);
+ encode(oid, bl);
+ __u32 _flags = flags;
+ encode(_flags, bl);
+ ENCODE_FINISH(bl);
+}
+
+void chunk_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(offset, bl);
+ decode(length, bl);
+ decode(oid, bl);
+ __u32 _flags;
+ decode(_flags, bl);
+ flags = (cflag_t)_flags;
+ DECODE_FINISH(bl);
+}
+
+void chunk_info_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("length", length);
+ f->open_object_section("oid");
+ oid.dump(f);
+ f->close_section();
+ f->dump_unsigned("flags", flags);
+}
+
+
+bool chunk_info_t::operator==(const chunk_info_t& cit) const
+{
+ if (has_fingerprint()) {
+ if (oid.oid.name == cit.oid.oid.name) {
+ return true;
+ }
+ } else {
+ if (offset == cit.offset && length == cit.length &&
+ oid.oid.name == cit.oid.oid.name) {
+ return true;
+ }
+
+ }
+ return false;
+}
+
+bool operator==(const std::pair<const long unsigned int, chunk_info_t> & l,
+ const std::pair<const long unsigned int, chunk_info_t> & r)
+{
+ return l.first == r.first &&
+ l.second == r.second;
+}
+
+ostream& operator<<(ostream& out, const chunk_info_t& ci)
+{
+ return out << "(len: " << ci.length << " oid: " << ci.oid
+ << " offset: " << ci.offset
+ << " flags: " << ci.get_flag_string(ci.flags) << ")";
+}
+
+// -- object_manifest_t --
+
+std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci)
+{
+ return out << ci.ref_delta << std::endl;
+}
+
+void object_manifest_t::calc_refs_to_inc_on_set(
+ const object_manifest_t* _g,
+ const object_manifest_t* _l,
+ object_ref_delta_t &refs) const
+{
+ /* avoid incrementing the same reference on adjacent clones */
+ auto iter = chunk_map.begin();
+ auto find_chunk = [](decltype(iter) &i, const object_manifest_t* cur)
+ -> bool {
+ if (cur) {
+ auto c = cur->chunk_map.find(i->first);
+ if (c != cur->chunk_map.end() && c->second == i->second) {
+ return true;
+
+ }
+ }
+ return false;
+ };
+
+ /* If the same chunk exists in either _g or _l, do not increment
+ * the reference
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) aaa, <- set_chunk
+ * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
+ * --> increment the reference
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) ccc, <- set_chunk
+ * 30: [0, 2) abc, [6, 2) bbb, [8, 2) ccc
+ * --> do not need to increment
+ *
+ * head: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) ccc, <- set_chunk
+ * 30: [0, 2) ccc, [6, 2) bbb, [8, 2) ccc
+ * --> decrement the reference of ccc
+ *
+ */
+ for (; iter != chunk_map.end(); ++iter) {
+ auto found_g = find_chunk(iter, _g);
+ auto found_l = find_chunk(iter, _l);
+ if (!found_g && !found_l) {
+ refs.inc_ref(iter->second.oid);
+ } else if (found_g && found_l) {
+ refs.dec_ref(iter->second.oid);
+ }
+ }
+}
+
+void object_manifest_t::calc_refs_to_drop_on_modify(
+ const object_manifest_t* _l,
+ const ObjectCleanRegions& clean_regions,
+ object_ref_delta_t &refs) const
+{
+ for (auto &p : chunk_map) {
+ if (!clean_regions.is_clean_region(p.first, p.second.length)) {
+ // has previous snapshot
+ if (_l) {
+ /*
+ * Suppose there is a snapshotted manifest object with three chunks:
+ * head: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
+ * 20: [0, 2) aaa, [6, 2) bbb, [8, 2) ccc
+ *
+ * If we modify [6, 2) at head, we shouldn't decrement bbb's refcount because
+ * 20 has the reference for bbb. Therefore, we only drop the reference if two chunks
+ * (head: [6, 2) and 20: [6, 2)) are different.
+ *
+ */
+ auto c = _l->chunk_map.find(p.first);
+ if (c != _l->chunk_map.end()) {
+ if (p.second == c->second) {
+ continue;
+ }
+ }
+ refs.dec_ref(p.second.oid);
+ } else {
+ // decrement the reference of the updated chunks if the manifest object has no snapshot
+ refs.dec_ref(p.second.oid);
+ }
+ }
+ }
+}
+
+void object_manifest_t::calc_refs_to_drop_on_removal(
+ const object_manifest_t* _g,
+ const object_manifest_t* _l,
+ object_ref_delta_t &refs) const
+{
+ /* At a high level, the rule is that consecutive clones with the same reference
+ * at the same offset share a reference. As such, removing *this may result
+ * in removing references in two cases:
+ * 1) *this has a reference which it shares with neither _g nor _l
+ * 2) _g and _l have a reference which they share with each other but not
+ * *this.
+ *
+ * For a particular offset, both 1 and 2 can happen.
+ *
+ * Notably, this means that to evaluate the reference change from removing
+ * the object with *this, we only need to look at the two adjacent clones.
+ */
+
+ // Paper over a possibly missing _g or _l -- a null pointer argument is
+ // semantically the same as an empty chunk_map
+ static const object_manifest_t empty;
+ const object_manifest_t &g = _g ? *_g : empty;
+ const object_manifest_t &l = _l ? *_l : empty;
+
+ auto giter = g.chunk_map.begin();
+ auto iter = chunk_map.begin();
+ auto liter = l.chunk_map.begin();
+
+ // Translate iter, map pair to the current offset, end() -> max
+ auto get_offset = [](decltype(iter) &i, const object_manifest_t &manifest)
+ -> uint64_t {
+ return i == manifest.chunk_map.end() ?
+ std::numeric_limits<uint64_t>::max() : i->first;
+ };
+
+ /* If current matches the offset at iter, returns the chunk at *iter
+ * and increments iter. Otherwise, returns nullptr.
+ *
+ * current will always be derived from the min of *giter, *iter, and
+ * *liter on each cycle, so the result will be that each loop iteration
+ * will pick up all chunks at the offset being considered, each offset
+ * will be considered once, and all offsets will be considered.
+ */
+ auto get_chunk = [](
+ uint64_t current, decltype(iter) &i, const object_manifest_t &manifest)
+ -> const chunk_info_t * {
+ if (i == manifest.chunk_map.end() || current != i->first) {
+ return nullptr;
+ } else {
+ return &(i++)->second;
+ }
+ };
+
+ while (giter != g.chunk_map.end() ||
+ iter != chunk_map.end() ||
+ liter != l.chunk_map.end()) {
+ auto current = std::min(
+ std::min(get_offset(giter, g), get_offset(iter, *this)),
+ get_offset(liter, l));
+
+ auto gchunk = get_chunk(current, giter, g);
+ auto chunk = get_chunk(current, iter, *this);
+ auto lchunk = get_chunk(current, liter, l);
+
+ if (gchunk && lchunk && *gchunk == *lchunk &&
+ (!chunk || *gchunk != *chunk)) {
+ // case 2 from above: g and l match each other, *this does not
+ refs.dec_ref(gchunk->oid);
+ }
+
+ if (chunk &&
+ (!gchunk || chunk->oid != gchunk->oid) &&
+ (!lchunk || chunk->oid != lchunk->oid)) {
+ // case 1 from above: *this matches neither g nor l
+ refs.dec_ref(chunk->oid);
+ }
+ }
+}
+
+void object_manifest_t::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(type, bl);
+ switch (type) {
+ case TYPE_NONE: break;
+ case TYPE_REDIRECT:
+ encode(redirect_target, bl);
+ break;
+ case TYPE_CHUNKED:
+ encode(chunk_map, bl);
+ break;
+ default:
+ ceph_abort();
+ }
+ ENCODE_FINISH(bl);
+}
+
+void object_manifest_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(1, bl);
+ decode(type, bl);
+ switch (type) {
+ case TYPE_NONE: break;
+ case TYPE_REDIRECT:
+ decode(redirect_target, bl);
+ break;
+ case TYPE_CHUNKED:
+ decode(chunk_map, bl);
+ break;
+ default:
+ ceph_abort();
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_manifest_t::dump(Formatter *f) const
+{
+ f->dump_unsigned("type", type);
+ if (type == TYPE_REDIRECT) {
+ f->open_object_section("redirect_target");
+ redirect_target.dump(f);
+ f->close_section();
+ } else if (type == TYPE_CHUNKED) {
+ f->open_array_section("chunk_map");
+ for (auto& p : chunk_map) {
+ f->open_object_section("chunk");
+ f->dump_unsigned("offset", p.first);
+ p.second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+}
+
+void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
+{
+ o.push_back(new object_manifest_t());
+ o.back()->type = TYPE_REDIRECT;
+}
+
+ostream& operator<<(ostream& out, const object_manifest_t& om)
+{
+ out << "manifest(" << om.get_type_name();
+ if (om.is_redirect()) {
+ out << " " << om.redirect_target;
+ } else if (om.is_chunked()) {
+ out << " " << om.chunk_map;
+ }
+ out << ")";
+ return out;
+}
+
+// -- object_info_t --
+
+void object_info_t::copy_user_bits(const object_info_t& other)
+{
+ // these bits are copied from head->clone.
+ size = other.size;
+ mtime = other.mtime;
+ local_mtime = other.local_mtime;
+ last_reqid = other.last_reqid;
+ truncate_seq = other.truncate_seq;
+ truncate_size = other.truncate_size;
+ flags = other.flags;
+ user_version = other.user_version;
+ data_digest = other.data_digest;
+ omap_digest = other.omap_digest;
+}
+
+void object_info_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+ object_locator_t myoloc(soid);
+ map<entity_name_t, watch_info_t> old_watchers;
+ for (auto i = watchers.cbegin(); i != watchers.cend(); ++i) {
+ old_watchers.insert(make_pair(i->first.second, i->second));
+ }
+ ENCODE_START(17, 8, bl);
+ encode(soid, bl);
+ encode(myoloc, bl); // retained for compatibility
+ encode((__u32)0, bl); // was category, no longer used
+ encode(version, bl);
+ encode(prior_version, bl);
+ encode(last_reqid, bl);
+ encode(size, bl);
+ encode(mtime, bl);
+ if (soid.snap == CEPH_NOSNAP)
+ encode(osd_reqid_t(), bl); // used to be wrlock_by
+ else
+ encode((uint32_t)0, bl); // was legacy_snaps
+ encode(truncate_seq, bl);
+ encode(truncate_size, bl);
+ encode(is_lost(), bl);
+ encode(old_watchers, bl, features);
+ /* shenanigans to avoid breaking backwards compatibility in the disk format.
+ * When we can, switch this out for simply putting the version_t on disk. */
+ eversion_t user_eversion(0, user_version);
+ encode(user_eversion, bl);
+ encode(test_flag(FLAG_USES_TMAP), bl);
+ encode(watchers, bl, features);
+ __u32 _flags = flags;
+ encode(_flags, bl);
+ encode(local_mtime, bl);
+ encode(data_digest, bl);
+ encode(omap_digest, bl);
+ encode(expected_object_size, bl);
+ encode(expected_write_size, bl);
+ encode(alloc_hint_flags, bl);
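+ // the manifest is encoded only when has_manifest() is true; decode makes
+ // the same check against the flags encoded above.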
+ if (has_manifest()) {
+ encode(manifest, bl);
+ }
+ ENCODE_FINISH(bl);
+}
+
+void object_info_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+ object_locator_t myoloc;
+ DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
+ map<entity_name_t, watch_info_t> old_watchers;
+ decode(soid, bl);
+ decode(myoloc, bl);
+ {
+ string category;
+ decode(category, bl); // no longer used
+ }
+ decode(version, bl);
+ decode(prior_version, bl);
+ decode(last_reqid, bl);
+ decode(size, bl);
+ decode(mtime, bl);
+ if (soid.snap == CEPH_NOSNAP) {
+ osd_reqid_t wrlock_by;
+ decode(wrlock_by, bl);
+ } else {
+ vector<snapid_t> legacy_snaps;
+ decode(legacy_snaps, bl);
+ }
+ decode(truncate_seq, bl);
+ decode(truncate_size, bl);
+
+ // if struct_v >= 13 we will overwrite this below, since the
+ // field is only here for backwards compatibility
+ __u8 lo;
+ decode(lo, bl);
+ flags = (flag_t)lo;
+
+ decode(old_watchers, bl);
+ eversion_t user_eversion;
+ decode(user_eversion, bl);
+ user_version = user_eversion.version;
+
+ if (struct_v >= 9) {
+ bool uses_tmap = false;
+ decode(uses_tmap, bl);
+ if (uses_tmap)
+ set_flag(FLAG_USES_TMAP);
+ } else {
+ set_flag(FLAG_USES_TMAP);
+ }
+ if (struct_v < 10)
+ soid.pool = myoloc.pool;
+ if (struct_v >= 11) {
+ decode(watchers, bl);
+ } else {
+ for (auto i = old_watchers.begin(); i != old_watchers.end(); ++i) {
+ watchers.insert(
+ make_pair(
+ make_pair(i->second.cookie, i->first), i->second));
+ }
+ }
+ if (struct_v >= 13) {
+ __u32 _flags;
+ decode(_flags, bl);
+ flags = (flag_t)_flags;
+ }
+ if (struct_v >= 14) {
+ decode(local_mtime, bl);
+ } else {
+ local_mtime = utime_t();
+ }
+ if (struct_v >= 15) {
+ decode(data_digest, bl);
+ decode(omap_digest, bl);
+ } else {
+ data_digest = omap_digest = -1;
+ clear_flag(FLAG_DATA_DIGEST);
+ clear_flag(FLAG_OMAP_DIGEST);
+ }
+ if (struct_v >= 16) {
+ decode(expected_object_size, bl);
+ decode(expected_write_size, bl);
+ decode(alloc_hint_flags, bl);
+ } else {
+ expected_object_size = 0;
+ expected_write_size = 0;
+ alloc_hint_flags = 0;
+ }
+ if (struct_v >= 17) {
+ if (has_manifest()) {
+ decode(manifest, bl);
+ }
+ }
+ DECODE_FINISH(bl);
+}
+
+void object_info_t::dump(Formatter *f) const
+{
+ f->open_object_section("oid");
+ soid.dump(f);
+ f->close_section();
+ f->dump_stream("version") << version;
+ f->dump_stream("prior_version") << prior_version;
+ f->dump_stream("last_reqid") << last_reqid;
+ f->dump_unsigned("user_version", user_version);
+ f->dump_unsigned("size", size);
+ f->dump_stream("mtime") << mtime;
+ f->dump_stream("local_mtime") << local_mtime;
+ f->dump_unsigned("lost", (int)is_lost());
+ vector<string> sv = get_flag_vector(flags);
+ f->open_array_section("flags");
+ for (auto str: sv)
+ f->dump_string("flags", str);
+ f->close_section();
+ f->dump_unsigned("truncate_seq", truncate_seq);
+ f->dump_unsigned("truncate_size", truncate_size);
+ f->dump_format("data_digest", "0x%08x", data_digest);
+ f->dump_format("omap_digest", "0x%08x", omap_digest);
+ f->dump_unsigned("expected_object_size", expected_object_size);
+ f->dump_unsigned("expected_write_size", expected_write_size);
+ f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+ f->dump_object("manifest", manifest);
+ f->open_object_section("watchers");
+ for (auto p = watchers.cbegin(); p != watchers.cend(); ++p) {
+ CachedStackStringStream css;
+ *css << p->first.second;
+ f->open_object_section(css->strv());
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void object_info_t::generate_test_instances(list<object_info_t*>& o)
+{
+ o.push_back(new object_info_t());
+
+ // fixme
+}
+
+
+ostream& operator<<(ostream& out, const object_info_t& oi)
+{
+ out << oi.soid << "(" << oi.version
+ << " " << oi.last_reqid;
+ if (oi.flags)
+ out << " " << oi.get_flag_string();
+ out << " s " << oi.size;
+ out << " uv " << oi.user_version;
+ if (oi.is_data_digest())
+ out << " dd " << std::hex << oi.data_digest << std::dec;
+ if (oi.is_omap_digest())
+ out << " od " << std::hex << oi.omap_digest << std::dec;
+ out << " alloc_hint [" << oi.expected_object_size
+ << " " << oi.expected_write_size
+ << " " << oi.alloc_hint_flags << "]";
+ if (oi.has_manifest())
+ out << " " << oi.manifest;
+ out << ")";
+ return out;
+}
+
+// -- ObjectRecovery --
+void ObjectRecoveryProgress::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(first, bl);
+ encode(data_complete, bl);
+ encode(data_recovered_to, bl);
+ encode(omap_recovered_to, bl);
+ encode(omap_complete, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectRecoveryProgress::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(first, bl);
+ decode(data_complete, bl);
+ decode(data_recovered_to, bl);
+ decode(omap_recovered_to, bl);
+ decode(omap_complete, bl);
+ DECODE_FINISH(bl);
+}
+
+ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
+{
+ return prog.print(out);
+}
+
+void ObjectRecoveryProgress::generate_test_instances(
+ list<ObjectRecoveryProgress*>& o)
+{
+ o.push_back(new ObjectRecoveryProgress);
+ o.back()->first = false;
+ o.back()->data_complete = true;
+ o.back()->omap_complete = true;
+ o.back()->data_recovered_to = 100;
+
+ o.push_back(new ObjectRecoveryProgress);
+ o.back()->first = true;
+ o.back()->data_complete = false;
+ o.back()->omap_complete = false;
+ o.back()->data_recovered_to = 0;
+}
+
+ostream &ObjectRecoveryProgress::print(ostream &out) const
+{
+ return out << "ObjectRecoveryProgress("
+ << ( first ? "" : "!" ) << "first, "
+ << "data_recovered_to:" << data_recovered_to
+ << ", data_complete:" << ( data_complete ? "true" : "false" )
+ << ", omap_recovered_to:" << omap_recovered_to
+ << ", omap_complete:" << ( omap_complete ? "true" : "false" )
+ << ", error:" << ( error ? "true" : "false" )
+ << ")";
+}
+
+void ObjectRecoveryProgress::dump(Formatter *f) const
+{
+ f->dump_int("first?", first);
+ f->dump_int("data_complete?", data_complete);
+ f->dump_unsigned("data_recovered_to", data_recovered_to);
+ f->dump_int("omap_complete?", omap_complete);
+ f->dump_string("omap_recovered_to", omap_recovered_to);
+}
+
+void ObjectRecoveryInfo::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(3, 1, bl);
+ encode(soid, bl);
+ encode(version, bl);
+ encode(size, bl);
+ encode(oi, bl, features);
+ encode(ss, bl);
+ encode(copy_subset, bl);
+ encode(clone_subset, bl);
+ encode(object_exist, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ObjectRecoveryInfo::decode(ceph::buffer::list::const_iterator &bl,
+ int64_t pool)
+{
+ DECODE_START(3, bl);
+ decode(soid, bl);
+ decode(version, bl);
+ decode(size, bl);
+ decode(oi, bl);
+ decode(ss, bl);
+ decode(copy_subset, bl);
+ decode(clone_subset, bl);
+ if (struct_v > 2)
+ decode(object_exist, bl);
+ else
+ object_exist = false;
+ DECODE_FINISH(bl);
+ if (struct_v < 2) {
+ if (!soid.is_max() && soid.pool == -1)
+ soid.pool = pool;
+ map<hobject_t, interval_set<uint64_t>> tmp;
+ tmp.swap(clone_subset);
+ for (auto i = tmp.begin(); i != tmp.end(); ++i) {
+ hobject_t first(i->first);
+ if (!first.is_max() && first.pool == -1)
+ first.pool = pool;
+ clone_subset[first].swap(i->second);
+ }
+ }
+}
+
+void ObjectRecoveryInfo::generate_test_instances(
+ list<ObjectRecoveryInfo*>& o)
+{
+ o.push_back(new ObjectRecoveryInfo);
+ o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
+ o.back()->version = eversion_t(0,0);
+ o.back()->size = 100;
+ o.back()->object_exist = false;
+}
+
+
+void ObjectRecoveryInfo::dump(Formatter *f) const
+{
+ f->dump_stream("object") << soid;
+ f->dump_stream("at_version") << version;
+ f->dump_stream("size") << size;
+ {
+ f->open_object_section("object_info");
+ oi.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("snapset");
+ ss.dump(f);
+ f->close_section();
+ }
+ f->dump_stream("copy_subset") << copy_subset;
+ f->dump_stream("clone_subset") << clone_subset;
+ f->dump_stream("object_exist") << object_exist;
+}
+
+ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
+{
+ return inf.print(out);
+}
+
+ostream &ObjectRecoveryInfo::print(ostream &out) const
+{
+ return out << "ObjectRecoveryInfo("
+ << soid << "@" << version
+ << ", size: " << size
+ << ", copy_subset: " << copy_subset
+ << ", clone_subset: " << clone_subset
+ << ", snapset: " << ss
+ << ", object_exist: " << object_exist
+ << ")";
+}
+
+// -- PushReplyOp --
+void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
+{
+ o.push_back(new PushReplyOp);
+ o.push_back(new PushReplyOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.push_back(new PushReplyOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+}
+
+void PushReplyOp::encode(ceph::buffer::list &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PushReplyOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ DECODE_FINISH(bl);
+}
+
+void PushReplyOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+}
+
+ostream &PushReplyOp::print(ostream &out) const
+{
+ return out
+ << "PushReplyOp(" << soid
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PushReplyOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PushReplyOp::cost(CephContext *cct) const
+{
+
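+ // a fixed per-object overhead plus one maximum-sized recovery chunk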
+ return cct->_conf->osd_push_per_object_cost +
+ cct->_conf->osd_recovery_max_chunk;
+}
+
+// -- PullOp --
+void PullOp::generate_test_instances(list<PullOp*> &o)
+{
+ o.push_back(new PullOp);
+ o.push_back(new PullOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.back()->recovery_info.version = eversion_t(3, 10);
+ o.push_back(new PullOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+ o.back()->recovery_info.version = eversion_t(0, 0);
+}
+
+void PullOp::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ encode(recovery_info, bl, features);
+ encode(recovery_progress, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PullOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ decode(recovery_info, bl);
+ decode(recovery_progress, bl);
+ DECODE_FINISH(bl);
+}
+
+void PullOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("recovery_progress");
+ recovery_progress.dump(f);
+ f->close_section();
+ }
+}
+
+ostream &PullOp::print(ostream &out) const
+{
+ return out
+ << "PullOp(" << soid
+ << ", recovery_info: " << recovery_info
+ << ", recovery_progress: " << recovery_progress
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PullOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PullOp::cost(CephContext *cct) const
+{
+ return cct->_conf->osd_push_per_object_cost +
+ cct->_conf->osd_recovery_max_chunk;
+}
+
+// -- PushOp --
+void PushOp::generate_test_instances(list<PushOp*> &o)
+{
+ o.push_back(new PushOp);
+ o.push_back(new PushOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", 2));
+ o.back()->version = eversion_t(3, 10);
+ o.push_back(new PushOp);
+ o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
+ o.back()->version = eversion_t(0, 0);
+}
+
+void PushOp::encode(ceph::buffer::list &bl, uint64_t features) const
+{
+ ENCODE_START(1, 1, bl);
+ encode(soid, bl);
+ encode(version, bl);
+ encode(data, bl);
+ encode(data_included, bl);
+ encode(omap_header, bl);
+ encode(omap_entries, bl);
+ encode(attrset, bl);
+ encode(recovery_info, bl, features);
+ encode(after_progress, bl);
+ encode(before_progress, bl);
+ ENCODE_FINISH(bl);
+}
+
+void PushOp::decode(ceph::buffer::list::const_iterator &bl)
+{
+ DECODE_START(1, bl);
+ decode(soid, bl);
+ decode(version, bl);
+ decode(data, bl);
+ decode(data_included, bl);
+ decode(omap_header, bl);
+ decode(omap_entries, bl);
+ decode(attrset, bl);
+ decode(recovery_info, bl);
+ decode(after_progress, bl);
+ decode(before_progress, bl);
+ DECODE_FINISH(bl);
+}
+
+void PushOp::dump(Formatter *f) const
+{
+ f->dump_stream("soid") << soid;
+ f->dump_stream("version") << version;
+ f->dump_int("data_len", data.length());
+ f->dump_stream("data_included") << data_included;
+ f->dump_int("omap_header_len", omap_header.length());
+ f->dump_int("omap_entries_len", omap_entries.size());
+ f->dump_int("attrset_len", attrset.size());
+ {
+ f->open_object_section("recovery_info");
+ recovery_info.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("after_progress");
+ after_progress.dump(f);
+ f->close_section();
+ }
+ {
+ f->open_object_section("before_progress");
+ before_progress.dump(f);
+ f->close_section();
+ }
+}
+
+ostream &PushOp::print(ostream &out) const
+{
+ return out
+ << "PushOp(" << soid
+ << ", version: " << version
+ << ", data_included: " << data_included
+ << ", data_size: " << data.length()
+ << ", omap_header_size: " << omap_header.length()
+ << ", omap_entries_size: " << omap_entries.size()
+ << ", attrset_size: " << attrset.size()
+ << ", recovery_info: " << recovery_info
+ << ", after_progress: " << after_progress
+ << ", before_progress: " << before_progress
+ << ")";
+}
+
+ostream& operator<<(ostream& out, const PushOp &op)
+{
+ return op.print(out);
+}
+
+uint64_t PushOp::cost(CephContext *cct) const
+{
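+ // cost is roughly the payload size: bytes spanned by data_included plus
+ // omap value bytes, plus a fixed per-object overhead.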
+ uint64_t cost = data_included.size();
+ for (auto i = omap_entries.cbegin(); i != omap_entries.cend(); ++i) {
+ cost += i->second.length();
+ }
+ cost += cct->_conf->osd_push_per_object_cost;
+ return cost;
+}
+
+// -- ScrubMap --
+
+void ScrubMap::merge_incr(const ScrubMap &l)
+{
+ ceph_assert(valid_through == l.incr_since);
+ valid_through = l.valid_through;
+
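+ // fold the incremental map in: a negative entry deletes the object,
+ // anything else overwrites it.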
+ for (auto p = l.objects.cbegin(); p != l.objects.cend(); ++p){
+ if (p->second.negative) {
+ auto q = objects.find(p->first);
+ if (q != objects.end()) {
+ objects.erase(q);
+ }
+ } else {
+ objects[p->first] = p->second;
+ }
+ }
+}
+
+void ScrubMap::encode(ceph::buffer::list& bl) const
+{
+ ENCODE_START(3, 2, bl);
+ encode(objects, bl);
+ encode((__u32)0, bl); // used to be attrs; now deprecated
+ ceph::buffer::list old_logbl; // not used
+ encode(old_logbl, bl);
+ encode(valid_through, bl);
+ encode(incr_since, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ScrubMap::decode(ceph::buffer::list::const_iterator& bl, int64_t pool)
+{
+ DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+ decode(objects, bl);
+ {
+ map<string,string> attrs; // deprecated
+ decode(attrs, bl);
+ }
+ ceph::buffer::list old_logbl; // not used
+ decode(old_logbl, bl);
+ decode(valid_through, bl);
+ decode(incr_since, bl);
+ DECODE_FINISH(bl);
+
+ // handle hobject_t upgrade
+ if (struct_v < 3) {
+ map<hobject_t, object> tmp;
+ tmp.swap(objects);
+ for (auto i = tmp.begin(); i != tmp.end(); ++i) {
+ hobject_t first(i->first);
+ if (!first.is_max() && first.pool == -1)
+ first.pool = pool;
+ objects[first] = i->second;
+ }
+ }
+}
+
+void ScrubMap::dump(Formatter *f) const
+{
+ f->dump_stream("valid_through") << valid_through;
+ f->dump_stream("incremental_since") << incr_since;
+ f->open_array_section("objects");
+ for (auto p = objects.cbegin(); p != objects.cend(); ++p) {
+ f->open_object_section("object");
+ f->dump_string("name", p->first.oid.name);
+ f->dump_unsigned("hash", p->first.get_hash());
+ f->dump_string("key", p->first.get_key());
+ f->dump_int("snapid", p->first.snap);
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
+{
+ o.push_back(new ScrubMap);
+ o.push_back(new ScrubMap);
+ o.back()->valid_through = eversion_t(1, 2);
+ o.back()->incr_since = eversion_t(3, 4);
+ list<object*> obj;
+ object::generate_test_instances(obj);
+ o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
+ obj.pop_back();
+ o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
+}
+
+// -- ScrubMap::object --
+
+void ScrubMap::object::encode(ceph::buffer::list& bl) const
+{
+ bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
+ ENCODE_START(10, 7, bl);
+ encode(size, bl);
+ encode(negative, bl);
+ encode(attrs, bl);
+ encode(digest, bl);
+ encode(digest_present, bl);
+ encode((uint32_t)0, bl); // obsolete nlinks
+ encode((uint32_t)0, bl); // snapcolls
+ encode(omap_digest, bl);
+ encode(omap_digest_present, bl);
+ encode(compat_read_error, bl);
+ encode(stat_error, bl);
+ encode(read_error, bl);
+ encode(ec_hash_mismatch, bl);
+ encode(ec_size_mismatch, bl);
+ encode(large_omap_object_found, bl);
+ encode(large_omap_object_key_count, bl);
+ encode(large_omap_object_value_size, bl);
+ encode(object_omap_bytes, bl);
+ encode(object_omap_keys, bl);
+ ENCODE_FINISH(bl);
+}
+
+void ScrubMap::object::decode(ceph::buffer::list::const_iterator& bl)
+{
+ DECODE_START(10, bl);
+ decode(size, bl);
+ bool tmp, compat_read_error = false;
+ decode(tmp, bl);
+ negative = tmp;
+ decode(attrs, bl);
+ decode(digest, bl);
+ decode(tmp, bl);
+ digest_present = tmp;
+ {
+ uint32_t nlinks;
+ decode(nlinks, bl);
+ set<snapid_t> snapcolls;
+ decode(snapcolls, bl);
+ }
+ decode(omap_digest, bl);
+ decode(tmp, bl);
+ omap_digest_present = tmp;
+ decode(compat_read_error, bl);
+ decode(tmp, bl);
+ stat_error = tmp;
+ if (struct_v >= 8) {
+ decode(tmp, bl);
+ read_error = tmp;
+ decode(tmp, bl);
+ ec_hash_mismatch = tmp;
+ decode(tmp, bl);
+ ec_size_mismatch = tmp;
+ }
+ // If older encoder found a read_error, set read_error
+ if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
+ read_error = true;
+ if (struct_v >= 9) {
+ decode(tmp, bl);
+ large_omap_object_found = tmp;
+ decode(large_omap_object_key_count, bl);
+ decode(large_omap_object_value_size, bl);
+ }
+ if (struct_v >= 10) {
+ decode(object_omap_bytes, bl);
+ decode(object_omap_keys, bl);
+ }
+ DECODE_FINISH(bl);
+}
+
+void ScrubMap::object::dump(Formatter *f) const
+{
+ f->dump_int("size", size);
+ f->dump_int("negative", negative);
+ f->open_array_section("attrs");
+ for (auto p = attrs.cbegin(); p != attrs.cend(); ++p) {
+ f->open_object_section("attr");
+ f->dump_string("name", p->first);
+ f->dump_int("length", p->second.length());
+ f->close_section();
+ }
+ f->close_section();
+}
+
+void ScrubMap::object::generate_test_instances(list<object*>& o)
+{
+ o.push_back(new object);
+ o.push_back(new object);
+ o.back()->negative = true;
+ o.push_back(new object);
+ o.back()->size = 123;
+ o.back()->attrs["foo"] = ceph::buffer::copy("foo", 3);
+ o.back()->attrs["bar"] = ceph::buffer::copy("barval", 6);
+}
+
+// -- OSDOp --
+
+ostream& operator<<(ostream& out, const OSDOp& op)
+{
+ out << ceph_osd_op_name(op.op.op);
+ if (ceph_osd_op_type_data(op.op.op)) {
+ // data extent
+ switch (op.op.op) {
+ case CEPH_OSD_OP_ASSERT_VER:
+ out << " v" << op.op.assert_ver.ver;
+ break;
+ case CEPH_OSD_OP_TRUNCATE:
+ out << " " << op.op.extent.offset;
+ break;
+ case CEPH_OSD_OP_MASKTRUNC:
+ case CEPH_OSD_OP_TRIMTRUNC:
+ out << " " << op.op.extent.truncate_seq << "@"
+ << (int64_t)op.op.extent.truncate_size;
+ break;
+ case CEPH_OSD_OP_ROLLBACK:
+ out << " " << snapid_t(op.op.snap.snapid);
+ break;
+ case CEPH_OSD_OP_WATCH:
+ out << " " << ceph_osd_watch_op_name(op.op.watch.op)
+ << " cookie " << op.op.watch.cookie;
+ if (op.op.watch.gen)
+ out << " gen " << op.op.watch.gen;
+ break;
+ case CEPH_OSD_OP_NOTIFY:
+ out << " cookie " << op.op.notify.cookie;
+ break;
+ case CEPH_OSD_OP_COPY_GET:
+ out << " max " << op.op.copy_get.max;
+ break;
+ case CEPH_OSD_OP_COPY_FROM:
+ out << " ver " << op.op.copy_from.src_version;
+ break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ out << " object_size " << op.op.alloc_hint.expected_object_size
+ << " write_size " << op.op.alloc_hint.expected_write_size;
+ break;
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_APPEND:
+ case CEPH_OSD_OP_MAPEXT:
+ case CEPH_OSD_OP_CMPEXT:
+ out << " " << op.op.extent.offset << "~" << op.op.extent.length;
+ if (op.op.extent.truncate_seq)
+ out << " [" << op.op.extent.truncate_seq << "@"
+ << (int64_t)op.op.extent.truncate_size << "]";
+ if (op.op.flags)
+ out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
+ default:
+ // don't show any arg info
+ break;
+ }
+ } else if (ceph_osd_op_type_attr(op.op.op)) {
+ // xattr name
+ if (op.op.xattr.name_len && op.indata.length()) {
+ out << " ";
+ op.indata.write(0, op.op.xattr.name_len, out);
+ }
+ if (op.op.xattr.value_len)
+ out << " (" << op.op.xattr.value_len << ")";
+ if (op.op.op == CEPH_OSD_OP_CMPXATTR)
+ out << " op " << (int)op.op.xattr.cmp_op
+ << " mode " << (int)op.op.xattr.cmp_mode;
+ } else if (ceph_osd_op_type_exec(op.op.op)) {
+ // class.method
+ if (op.op.cls.class_len && op.indata.length()) {
+ out << " ";
+ op.indata.write(0, op.op.cls.class_len, out);
+ out << ".";
+ op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
+ }
+ } else if (ceph_osd_op_type_pg(op.op.op)) {
+ switch (op.op.op) {
+ case CEPH_OSD_OP_PGLS:
+ case CEPH_OSD_OP_PGLS_FILTER:
+ case CEPH_OSD_OP_PGNLS:
+ case CEPH_OSD_OP_PGNLS_FILTER:
+ out << " start_epoch " << op.op.pgls.start_epoch;
+ break;
+ case CEPH_OSD_OP_PG_HITSET_LS:
+ break;
+ case CEPH_OSD_OP_PG_HITSET_GET:
+ out << " " << utime_t(op.op.hit_set_get.stamp);
+ break;
+ case CEPH_OSD_OP_SCRUBLS:
+ break;
+ }
+ }
+ if (op.indata.length()) {
+ out << " in=" << op.indata.length() << "b";
+ }
+ if (op.outdata.length()) {
+ out << " out=" << op.outdata.length() << "b";
+ }
+ return out;
+}
+
+
+void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& in)
+{
+ auto datap = in.begin();
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].op.payload_len) {
+ datap.copy(ops[i].op.payload_len, ops[i].outdata);
+ }
+ }
+}
+
+void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, ceph::buffer::list& out)
+{
+ for (unsigned i = 0; i < ops.size(); i++) {
+ ops[i].op.payload_len = ops[i].outdata.length();
+ if (ops[i].outdata.length()) {
+ out.append(ops[i].outdata);
+ }
+ }
+}
+
+int prepare_info_keymap(
+ CephContext* cct,
+ map<string,bufferlist> *km,
+ string *key_to_remove,
+ epoch_t epoch,
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ bool dirty_big_info,
+ bool dirty_epoch,
+ bool try_fast_info,
+ PerfCounters *logger,
+ DoutPrefixProvider *dpp)
+{
+ if (dirty_epoch) {
+ encode(epoch, (*km)[string(epoch_key)]);
+ }
+
+ if (logger)
+ logger->inc(l_osd_pg_info);
+
+ // try to do info efficiently?
+ if (!dirty_big_info && try_fast_info &&
+ info.last_update > last_written_info.last_update) {
+ pg_fast_info_t fast;
+ fast.populate_from(info);
+ bool did = fast.try_apply_to(&last_written_info);
+ ceph_assert(did); // we verified last_update increased above
+ if (info == last_written_info) {
+ encode(fast, (*km)[string(fastinfo_key)]);
+ if (logger)
+ logger->inc(l_osd_pg_fastinfo);
+ return 0;
+ }
+ if (dpp) {
+ ldpp_dout(dpp, 30) << __func__ << " fastinfo failed, info:\n";
+ {
+ JSONFormatter jf(true);
+ jf.dump_object("info", info);
+ jf.flush(*_dout);
+ }
+ {
+ *_dout << "\nlast_written_info:\n";
+ JSONFormatter jf(true);
+ jf.dump_object("last_written_info", last_written_info);
+ jf.flush(*_dout);
+ }
+ *_dout << dendl;
+ }
+ } else if (info.last_update <= last_written_info.last_update) {
+ // clean up any potentially stale fastinfo key resulting from last_update
+ // not moving forwards (e.g., a backwards jump during peering)
+ *key_to_remove = fastinfo_key;
+ }
+
+ last_written_info = info;
+
+ // encode info without purged_snaps; purged_snaps is stored separately below.
+ interval_set<snapid_t> purged_snaps;
+ purged_snaps.swap(info.purged_snaps);
+ encode(info, (*km)[string(info_key)]);
+ purged_snaps.swap(info.purged_snaps);
+
+ if (dirty_big_info) {
+ // potentially big stuff
+ bufferlist& bigbl = (*km)[string(biginfo_key)];
+ encode(past_intervals, bigbl);
+ encode(info.purged_snaps, bigbl);
+ //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
+ if (logger)
+ logger->inc(l_osd_pg_biginfo);
+ }
+
+ return 0;
+}
+
+void create_pg_collection(
+ ceph::os::Transaction& t, spg_t pgid, int bits)
+{
+ coll_t coll(pgid);
+ t.create_collection(coll, bits);
+}
+
+void init_pg_ondisk(
+ ceph::os::Transaction& t,
+ spg_t pgid,
+ const pg_pool_t *pool)
+{
+ coll_t coll(pgid);
+ if (pool) {
+ // Give a hint to the PG collection
+ bufferlist hint;
+ uint32_t pg_num = pool->get_pg_num();
+ uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
+ encode(pg_num, hint);
+ encode(expected_num_objects_pg, hint);
+ uint32_t hint_type = ceph::os::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
+ t.collection_hint(coll, hint_type, hint);
+ }
+
+ ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
+ t.touch(coll, pgmeta_oid);
+ map<string,bufferlist> values;
+ __u8 struct_v = pg_latest_struct_v;
+ encode(struct_v, values[string(infover_key)]);
+ t.omap_setkeys(coll, pgmeta_oid, values);
+}
+
+PGLSFilter::PGLSFilter() : cct(nullptr)
+{
+}
+
+PGLSFilter::~PGLSFilter()
+{
+}
+
+int PGLSPlainFilter::init(ceph::bufferlist::const_iterator &params)
+{
+ try {
+ decode(xattr, params);
+ decode(val, params);
+ } catch (ceph::buffer::error &e) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
+bool PGLSPlainFilter::filter(const hobject_t& obj,
+ const ceph::bufferlist& xattr_data) const
+{
+ return xattr_data.contents_equal(val.c_str(), val.size());
+}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
new file mode 100644
index 000000000..93645c5f2
--- /dev/null
+++ b/src/osd/osd_types.h
@@ -0,0 +1,6568 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_TYPES_H
+#define CEPH_OSD_TYPES_H
+
+#include <atomic>
+#include <sstream>
+#include <cstdio>
+#include <memory>
+#include <string_view>
+
+#include <boost/scoped_ptr.hpp>
+#include <boost/optional/optional_io.hpp>
+#include <boost/variant.hpp>
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+
+#include "include/rados/rados_types.hpp"
+#include "include/mempool.h"
+
+#include "msg/msg_types.h"
+#include "include/compat.h"
+#include "include/types.h"
+#include "include/utime.h"
+#include "include/CompatSet.h"
+#include "common/ceph_context.h"
+#include "common/histogram.h"
+#include "include/interval_set.h"
+#include "include/inline_memory.h"
+#include "common/Formatter.h"
+#include "common/bloom_filter.hpp"
+#include "common/hobject.h"
+#include "common/snap_types.h"
+#include "HitSet.h"
+#include "Watch.h"
+#include "include/cmp.h"
+#include "librados/ListObjectImpl.h"
+#include "compressor/Compressor.h"
+#include "osd_perf_counters.h"
+
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v026"
+
+#define CEPH_OSD_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set(~v.18)")
+#define CEPH_OSD_FEATURE_INCOMPAT_PGINFO CompatSet::Feature(2, "pginfo object")
+#define CEPH_OSD_FEATURE_INCOMPAT_OLOC CompatSet::Feature(3, "object locator")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEC CompatSet::Feature(4, "last_epoch_clean")
+#define CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES CompatSet::Feature(5, "categories")
+#define CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL CompatSet::Feature(6, "hobjectpool")
+#define CEPH_OSD_FEATURE_INCOMPAT_BIGINFO CompatSet::Feature(7, "biginfo")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO CompatSet::Feature(8, "leveldbinfo")
+#define CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG CompatSet::Feature(9, "leveldblog")
+#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
+#define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
+#define CEPH_OSD_FEATURE_INCOMPAT_HINTS CompatSet::Feature(12, "transaction hints")
+#define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
+#define CEPH_OSD_FEATURE_INCOMPAT_MISSING CompatSet::Feature(14, "explicit missing set")
+#define CEPH_OSD_FEATURE_INCOMPAT_FASTINFO CompatSet::Feature(15, "fastinfo pg attr")
+#define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set")
+#define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2 CompatSet::Feature(17, "new snapmapper key structure")
+
+
+/// pool priority range set by user
+#define OSD_POOL_PRIORITY_MAX 10
+#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX
+
+/// min recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MIN 0
+
+/// base backfill priority for MBackfillReserve
+#define OSD_BACKFILL_PRIORITY_BASE 100
+
+/// base backfill priority for MBackfillReserve (degraded PG)
+#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
+
+/// base recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_BASE 180
+
+/// base backfill priority for MBackfillReserve (inactive PG)
+#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
+
+/// base recovery priority for MRecoveryReserve (inactive PG)
+#define OSD_RECOVERY_INACTIVE_PRIORITY_BASE 220
+
+/// max manually/automatically set recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MAX 253
+
+/// backfill priority for MBackfillReserve, when forced manually
+#define OSD_BACKFILL_PRIORITY_FORCED 254
+
+/// recovery priority for MRecoveryReserve, when forced manually
+#define OSD_RECOVERY_PRIORITY_FORCED 255
+
+/// priority for pg deletion when osd is not fullish
+#define OSD_DELETE_PRIORITY_NORMAL 179
+
+/// priority for pg deletion when osd is approaching full
+#define OSD_DELETE_PRIORITY_FULLISH 219
+
+/// priority for pg deletion when osd is even more full
+#define OSD_DELETE_PRIORITY_FULL 255
+
+static std::map<int, int> max_prio_map = {
+ {OSD_BACKFILL_PRIORITY_BASE, OSD_BACKFILL_DEGRADED_PRIORITY_BASE - 1},
+ {OSD_BACKFILL_DEGRADED_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_BASE - 1},
+ {OSD_RECOVERY_PRIORITY_BASE, OSD_BACKFILL_INACTIVE_PRIORITY_BASE - 1},
+ {OSD_RECOVERY_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX},
+ {OSD_BACKFILL_INACTIVE_PRIORITY_BASE, OSD_RECOVERY_PRIORITY_MAX}
+};
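+
+// Illustrative sketch (not a definitive helper): a caller can clamp a
+// pool-adjusted priority into its band using max_prio_map, so e.g. a degraded
+// backfill never spills into the recovery band. The helper name and the
+// pool_prio value are hypothetical:
+//
+//   inline int clamp_to_band(int base, int pool_prio) {
+//     int prio = base + pool_prio;            // e.g. 140 + 3
+//     auto it = max_prio_map.find(base);
+//     if (it != max_prio_map.end())
+//       prio = std::min(prio, it->second);    // stay below the next band's base
+//     return std::max(prio, OSD_RECOVERY_PRIORITY_MIN);
+//   }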
+
+typedef hobject_t collection_list_handle_t;
+
+/// convert a single CEPH_OSD_FLAG_* to a std::string
+const char *ceph_osd_flag_name(unsigned flag);
+/// convert a single CEPH_OSD_OP_FLAG_* to a std::string
+const char *ceph_osd_op_flag_name(unsigned flag);
+
+/// convert CEPH_OSD_FLAG_* op flags to a std::string
+std::string ceph_osd_flag_string(unsigned flags);
+/// convert CEPH_OSD_OP_FLAG_* op flags to a std::string
+std::string ceph_osd_op_flag_string(unsigned flags);
+/// convert CEPH_OSD_ALLOC_HINT_FLAG_* op flags to a std::string
+std::string ceph_osd_alloc_hint_flag_string(unsigned flags);
+
+typedef std::map<std::string,std::string> osd_alert_list_t;
+/// map osd id -> alert_list_t
+typedef std::map<int, osd_alert_list_t> osd_alerts_t;
+void dump(ceph::Formatter* f, const osd_alerts_t& alerts);
+
+
+typedef interval_set<
+ snapid_t,
+ mempool::osdmap::flat_map> snap_interval_set_t;
+
+
+/**
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid uniquely identify this request.
+ */
+struct osd_reqid_t {
+ entity_name_t name; // who
+ ceph_tid_t tid;
+ int32_t inc; // incarnation
+
+ osd_reqid_t()
+ : tid(0), inc(0)
+ {}
+ osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
+ : name(a), tid(t), inc(i)
+ {}
+
+ DENC(osd_reqid_t, v, p) {
+ DENC_START(2, 2, p);
+ denc(v.name, p);
+ denc(v.tid, p);
+ denc(v.inc, p);
+ DENC_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<osd_reqid_t*>& o);
+};
+WRITE_CLASS_DENC(osd_reqid_t)
+
+
+
+struct pg_shard_t {
+ static const int32_t NO_OSD = 0x7fffffff;
+ int32_t osd;
+ shard_id_t shard;
+ pg_shard_t() : osd(-1), shard(shard_id_t::NO_SHARD) {}
+ explicit pg_shard_t(int osd) : osd(osd), shard(shard_id_t::NO_SHARD) {}
+ pg_shard_t(int osd, shard_id_t shard) : osd(osd), shard(shard) {}
+ bool is_undefined() const {
+ return osd == -1;
+ }
+ std::string get_osd() const { return (osd == NO_OSD ? "NONE" : std::to_string(osd)); }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("osd", osd);
+ if (shard != shard_id_t::NO_SHARD) {
+ f->dump_unsigned("shard", shard);
+ }
+ }
+};
+WRITE_CLASS_ENCODER(pg_shard_t)
+WRITE_EQ_OPERATORS_2(pg_shard_t, osd, shard)
+WRITE_CMP_OPERATORS_2(pg_shard_t, osd, shard)
+std::ostream& operator<<(std::ostream &lhs, const pg_shard_t &rhs);
+
+using HobjToShardSetMapping = std::map<hobject_t, std::set<pg_shard_t>>;
+
+class IsPGRecoverablePredicate {
+public:
+ /**
+ * have encodes the shards available
+ */
+ virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGRecoverablePredicate() {}
+};
+
+class IsPGReadablePredicate {
+public:
+ /**
+ * have encodes the shards available
+ */
+ virtual bool operator()(const std::set<pg_shard_t> &have) const = 0;
+ virtual ~IsPGReadablePredicate() {}
+};
+
+inline std::ostream& operator<<(std::ostream& out, const osd_reqid_t& r) {
+ return out << r.name << "." << r.inc << ":" << r.tid;
+}
+
+inline bool operator==(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid);
+}
+inline bool operator!=(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid);
+}
+inline bool operator<(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name < r.name) ||
+ (l.name == r.name && (l.inc < r.inc ||
+ (l.inc == r.inc && l.tid < r.tid)));
+}
+inline bool operator<=(const osd_reqid_t& l, const osd_reqid_t& r) {
+ return (l.name < r.name) ||
+ (l.name == r.name && (l.inc < r.inc ||
+ (l.inc == r.inc && l.tid <= r.tid)));
+}
+inline bool operator>(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l <= r); }
+inline bool operator>=(const osd_reqid_t& l, const osd_reqid_t& r) { return !(l < r); }
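+
+// Illustrative sketch (arbitrary values, and assuming the entity_name_t::CLIENT()
+// helper from msg/msg_types.h): an osd_reqid_t combines the caller name, its
+// incarnation and a per-caller tid, and streams as "name.inc:tid":
+//
+//   osd_reqid_t rid(entity_name_t::CLIENT(4123), /*inc=*/0, /*tid=*/42);
+//   std::cout << rid;   // expected to print something like "client.4123.0:42"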
+
+namespace std {
+ template<> struct hash<osd_reqid_t> {
+ size_t operator()(const osd_reqid_t &r) const {
+ static hash<uint64_t> H;
+ return H(r.name.num() ^ r.tid ^ r.inc);
+ }
+ };
+} // namespace std
+
+
+// -----
+
+// a locator constrains the placement of an object: mainly, which pool
+// it goes in.
+struct object_locator_t {
+ // You specify either the hash or the key -- not both
+ std::int64_t pool; ///< pool id
+ std::string key; ///< key string (if non-empty)
+ std::string nspace; ///< namespace
+ std::int64_t hash; ///< hash position (if >= 0)
+
+ explicit object_locator_t()
+ : pool(-1), hash(-1) {}
+ explicit object_locator_t(int64_t po)
+ : pool(po), hash(-1) {}
+ explicit object_locator_t(int64_t po, int64_t ps)
+ : pool(po), hash(ps) {}
+ explicit object_locator_t(int64_t po, std::string_view ns)
+ : pool(po), nspace(ns), hash(-1) {}
+ explicit object_locator_t(int64_t po, std::string_view ns, int64_t ps)
+ : pool(po), nspace(ns), hash(ps) {}
+ explicit object_locator_t(int64_t po, std::string_view ns, std::string_view s)
+ : pool(po), key(s), nspace(ns), hash(-1) {}
+ explicit object_locator_t(const hobject_t& soid)
+ : pool(soid.pool), key(soid.get_key()), nspace(soid.nspace), hash(-1) {}
+
+ int64_t get_pool() const {
+ return pool;
+ }
+
+ void clear() {
+ pool = -1;
+ key = "";
+ nspace = "";
+ hash = -1;
+ }
+
+ bool empty() const {
+ return pool == -1;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object_locator_t*>& o);
+};
+WRITE_CLASS_ENCODER(object_locator_t)
+
+inline bool operator==(const object_locator_t& l, const object_locator_t& r) {
+ return l.pool == r.pool && l.key == r.key && l.nspace == r.nspace && l.hash == r.hash;
+}
+inline bool operator!=(const object_locator_t& l, const object_locator_t& r) {
+ return !(l == r);
+}
+
+inline std::ostream& operator<<(std::ostream& out, const object_locator_t& loc)
+{
+ out << "@" << loc.pool;
+ if (loc.nspace.length())
+ out << ";" << loc.nspace;
+ if (loc.key.length())
+ out << ":" << loc.key;
+ return out;
+}
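+
+// Illustrative sketch (arbitrary pool/namespace/key): per the operator above,
+// a locator streams as "@pool[;namespace][:key]":
+//
+//   object_locator_t loc(2, "myns", "mykey");
+//   std::cout << loc;   // "@2;myns:mykey"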
+
+struct request_redirect_t {
+private:
+ object_locator_t redirect_locator; ///< this is authoritative
+ std::string redirect_object; ///< If non-empty, the request goes to this object name
+
+ friend std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir);
+public:
+
+ request_redirect_t() {}
+ explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
+ redirect_locator(orig) { redirect_locator.pool = rpool; }
+ explicit request_redirect_t(const object_locator_t& rloc) :
+ redirect_locator(rloc) {}
+ explicit request_redirect_t(const object_locator_t& orig,
+ const std::string& robj) :
+ redirect_locator(orig), redirect_object(robj) {}
+
+ bool empty() const { return redirect_locator.empty() &&
+ redirect_object.empty(); }
+
+ void combine_with_locator(object_locator_t& orig, std::string& obj) const {
+ orig = redirect_locator;
+ if (!redirect_object.empty())
+ obj = redirect_object;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<request_redirect_t*>& o);
+};
+WRITE_CLASS_ENCODER(request_redirect_t)
+
+inline std::ostream& operator<<(std::ostream& out, const request_redirect_t& redir) {
+ out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
+ return out;
+}
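+
+// Illustrative sketch (arbitrary values): a redirect rewrites the locator, and
+// optionally the object name, that a request should be re-sent with:
+//
+//   object_locator_t oloc(1);
+//   std::string oid = "foo";
+//   request_redirect_t redir(object_locator_t(5), std::string("bar"));
+//   redir.combine_with_locator(oloc, oid);  // oloc now targets pool 5, oid == "bar"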
+
+// Internal OSD op flags - set by the OSD based on the op types
+enum {
+ CEPH_OSD_RMW_FLAG_READ = (1 << 1),
+ CEPH_OSD_RMW_FLAG_WRITE = (1 << 2),
+ CEPH_OSD_RMW_FLAG_CLASS_READ = (1 << 3),
+ CEPH_OSD_RMW_FLAG_CLASS_WRITE = (1 << 4),
+ CEPH_OSD_RMW_FLAG_PGOP = (1 << 5),
+ CEPH_OSD_RMW_FLAG_CACHE = (1 << 6),
+ CEPH_OSD_RMW_FLAG_FORCE_PROMOTE = (1 << 7),
+ CEPH_OSD_RMW_FLAG_SKIP_HANDLE_CACHE = (1 << 8),
+ CEPH_OSD_RMW_FLAG_SKIP_PROMOTE = (1 << 9),
+ CEPH_OSD_RMW_FLAG_RWORDERED = (1 << 10),
+ CEPH_OSD_RMW_FLAG_RETURNVEC = (1 << 11),
+};
+
+
+// pg stuff
+
+#define OSD_SUPERBLOCK_GOBJECT ghobject_t(hobject_t(sobject_t(object_t("osd_superblock"), 0)))
+
+// placement seed (a hash value)
+typedef uint32_t ps_t;
+
+// old (v1) pg_t encoding (wrap old struct ceph_pg)
+struct old_pg_t {
+ ceph_pg v;
+ void encode(ceph::buffer::list& bl) const {
+ ceph::encode_raw(v, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ ceph::decode_raw(v, bl);
+ }
+};
+WRITE_CLASS_ENCODER(old_pg_t)
+
+// placement group id
+struct pg_t {
+ uint64_t m_pool;
+ uint32_t m_seed;
+
+ pg_t() : m_pool(0), m_seed(0) {}
+ pg_t(ps_t seed, uint64_t pool) :
+ m_pool(pool), m_seed(seed) {}
+ // cppcheck-suppress noExplicitConstructor
+ pg_t(const ceph_pg& cpg) :
+ m_pool(cpg.pool), m_seed(cpg.ps) {}
+
+ // cppcheck-suppress noExplicitConstructor
+ pg_t(const old_pg_t& opg) {
+ *this = opg.v;
+ }
+
+ old_pg_t get_old_pg() const {
+ old_pg_t o;
+ ceph_assert(m_pool < 0xffffffffull);
+ o.v.pool = m_pool;
+ o.v.ps = m_seed;
+ o.v.preferred = (__s16)-1;
+ return o;
+ }
+
+ ps_t ps() const {
+ return m_seed;
+ }
+ int64_t pool() const {
+ return m_pool;
+ }
+
+ static const uint8_t calc_name_buf_size = 36; // max length for max values len("18446744073709551615.ffffffff") + future suffix len("_head") + '\0'
+ char *calc_name(char *buf, const char *suffix_backwords) const;
+
+ void set_ps(ps_t p) {
+ m_seed = p;
+ }
+ void set_pool(uint64_t p) {
+ m_pool = p;
+ }
+
+ pg_t get_parent() const;
+ pg_t get_ancestor(unsigned old_pg_num) const;
+
+ int print(char *o, int maxlen) const;
+ bool parse(const char *s);
+
+ bool is_split(unsigned old_pg_num, unsigned new_pg_num, std::set<pg_t> *pchildren) const;
+
+ bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num, pg_t *parent) const;
+ bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
+ return ps() < new_pg_num && is_split(new_pg_num, old_pg_num, nullptr);
+ }
+
+ /**
+ * Returns b such that for all object o:
+ * (~((~0)<<b) & o.hash) == 0 iff o is in the pg for *this
+ */
+ unsigned get_split_bits(unsigned pg_num) const;
+
+ bool contains(int bits, const ghobject_t& oid) const {
+ return
+ (int64_t)m_pool == oid.hobj.get_logical_pool() &&
+ oid.match(bits, ps());
+ }
+ bool contains(int bits, const hobject_t& oid) const {
+ return
+ (int64_t)m_pool == oid.get_logical_pool() &&
+ oid.match(bits, ps());
+ }
+
+ hobject_t get_hobj_start() const;
+ hobject_t get_hobj_end(unsigned pg_num) const;
+
+ // strong ordering is supported
+ inline int compare(const pg_t& p) const noexcept {
+ if (auto delta = pool() - p.pool(); delta != 0) {
+ return delta;
+ } else if (ps() < p.ps()) {
+ return -1;
+ } else if (ps() > p.ps()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(m_pool, bl);
+ encode(m_seed, bl);
+ encode((int32_t)-1, bl); // was preferred
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ decode(m_pool, bl);
+ decode(m_seed, bl);
+ bl += sizeof(int32_t); // was preferred
+ }
+ void decode_old(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ old_pg_t opg;
+ decode(opg, bl);
+ *this = opg;
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_t)
+
+inline bool operator<(const pg_t& l, const pg_t& r) {
+ return l.compare(r) < 0;
+}
+inline bool operator<=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) <= 0;
+}
+inline bool operator==(const pg_t& l, const pg_t& r) {
+ return l.compare(r) == 0;
+}
+inline bool operator!=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) != 0;
+}
+inline bool operator>(const pg_t& l, const pg_t& r) {
+ return l.compare(r) > 0;
+}
+inline bool operator>=(const pg_t& l, const pg_t& r) {
+ return l.compare(r) >= 0;
+}
+
+std::ostream& operator<<(std::ostream& out, const pg_t &pg);
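+
+// Illustrative sketch (arbitrary pgid): a pg_t is conventionally rendered as
+// "<pool>.<hex seed>", and is_split() enumerates the children gained when
+// pg_num grows; with power-of-two pg_num values each child seed just gains one
+// high bit:
+//
+//   pg_t pgid(0x1a, 3);                               // "3.1a"
+//   std::set<pg_t> children;
+//   if (pgid.is_split(/*old_pg_num=*/32, /*new_pg_num=*/64, &children)) {
+//     // children is expected to contain 3.3a (seed 0x1a | 0x20)
+//   }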
+
+namespace std {
+ template<> struct hash< pg_t >
+ {
+ size_t operator()( const pg_t& x ) const
+ {
+ static hash<uint32_t> H;
+ // xor (s32)-1 in there to preserve original m_preferred result (paranoia!)
+ return H((x.pool() & 0xffffffff) ^ (x.pool() >> 32) ^ x.ps() ^ (int32_t)(-1));
+ }
+ };
+} // namespace std
+
+struct spg_t {
+ pg_t pgid;
+ shard_id_t shard;
+ spg_t() : shard(shard_id_t::NO_SHARD) {}
+ spg_t(pg_t pgid, shard_id_t shard) : pgid(pgid), shard(shard) {}
+ explicit spg_t(pg_t pgid) : pgid(pgid), shard(shard_id_t::NO_SHARD) {}
+ unsigned get_split_bits(unsigned pg_num) const {
+ return pgid.get_split_bits(pg_num);
+ }
+ spg_t get_parent() const {
+ return spg_t(pgid.get_parent(), shard);
+ }
+ ps_t ps() const {
+ return pgid.ps();
+ }
+ uint64_t pool() const {
+ return pgid.pool();
+ }
+ void reset_shard(shard_id_t s) {
+ shard = s;
+ }
+
+ static const uint8_t calc_name_buf_size = pg_t::calc_name_buf_size + 4; // 36 + len('s') + len("255");
+ char *calc_name(char *buf, const char *suffix_backwords) const;
+
+ bool parse(const char *s);
+ bool parse(const std::string& s) {
+ return parse(s.c_str());
+ }
+
+ spg_t get_ancestor(unsigned old_pg_num) const {
+ return spg_t(pgid.get_ancestor(old_pg_num), shard);
+ }
+
+ bool is_split(unsigned old_pg_num, unsigned new_pg_num,
+ std::set<spg_t> *pchildren) const {
+ std::set<pg_t> _children;
+ std::set<pg_t> *children = pchildren ? &_children : NULL;
+ bool is_split = pgid.is_split(old_pg_num, new_pg_num, children);
+ if (pchildren && is_split) {
+ for (std::set<pg_t>::iterator i = _children.begin();
+ i != _children.end();
+ ++i) {
+ pchildren->insert(spg_t(*i, shard));
+ }
+ }
+ return is_split;
+ }
+ bool is_merge_target(unsigned old_pg_num, unsigned new_pg_num) const {
+ return pgid.is_merge_target(old_pg_num, new_pg_num);
+ }
+ bool is_merge_source(unsigned old_pg_num, unsigned new_pg_num,
+ spg_t *parent) const {
+ spg_t out = *this;
+ bool r = pgid.is_merge_source(old_pg_num, new_pg_num, &out.pgid);
+ if (r && parent) {
+ *parent = out;
+ }
+ return r;
+ }
+
+ bool is_no_shard() const {
+ return shard == shard_id_t::NO_SHARD;
+ }
+
+ ghobject_t make_pgmeta_oid() const {
+ return ghobject_t::make_pgmeta(pgid.pool(), pgid.ps(), shard);
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(pgid, bl);
+ encode(shard, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(pgid, bl);
+ decode(shard, bl);
+ DECODE_FINISH(bl);
+ }
+
+ ghobject_t make_temp_ghobject(const std::string& name) const {
+ return ghobject_t(
+ hobject_t(object_t(name), "", CEPH_NOSNAP,
+ pgid.ps(),
+ hobject_t::get_temp_pool(pgid.pool()),
+ ""),
+ ghobject_t::NO_GEN,
+ shard);
+ }
+
+ unsigned hash_to_shard(unsigned num_shards) const {
+ return ps() % num_shards;
+ }
+};
+WRITE_CLASS_ENCODER(spg_t)
+WRITE_EQ_OPERATORS_2(spg_t, pgid, shard)
+WRITE_CMP_OPERATORS_2(spg_t, pgid, shard)
+
+namespace std {
+ template<> struct hash< spg_t >
+ {
+ size_t operator()( const spg_t& x ) const
+ {
+ static hash<uint32_t> H;
+ return H(hash<pg_t>()(x.pgid) ^ x.shard);
+ }
+ };
+} // namespace std
+
+std::ostream& operator<<(std::ostream& out, const spg_t &pg);
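+
+// Illustrative sketch (arbitrary values): an spg_t is a pg_t plus a shard id;
+// for erasure-coded pools it is conventionally rendered with an "s<shard>"
+// suffix, while replicated pools use NO_SHARD:
+//
+//   spg_t sharded(pg_t(0x1a, 3), shard_id_t(2));  // expected to render as "3.1as2"
+//   spg_t plain(pg_t(0x1a, 3));                   // NO_SHARD, renders as "3.1a"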
+
+// ----------------------
+
+class coll_t {
+ enum type_t {
+ TYPE_META = 0,
+ TYPE_LEGACY_TEMP = 1, /* no longer used */
+ TYPE_PG = 2,
+ TYPE_PG_TEMP = 3,
+ };
+ type_t type;
+ spg_t pgid;
+ uint64_t removal_seq; // note: deprecated, not encoded
+
+ char _str_buff[spg_t::calc_name_buf_size];
+ char *_str;
+
+ void calc_str();
+
+ coll_t(type_t t, spg_t p, uint64_t r)
+ : type(t), pgid(p), removal_seq(r) {
+ calc_str();
+ }
+
+public:
+ coll_t() : type(TYPE_META), removal_seq(0)
+ {
+ calc_str();
+ }
+
+ coll_t(const coll_t& other)
+ : type(other.type), pgid(other.pgid), removal_seq(other.removal_seq) {
+ calc_str();
+ }
+
+ explicit coll_t(spg_t pgid)
+ : type(TYPE_PG), pgid(pgid), removal_seq(0)
+ {
+ calc_str();
+ }
+
+ coll_t& operator=(const coll_t& rhs)
+ {
+ this->type = rhs.type;
+ this->pgid = rhs.pgid;
+ this->removal_seq = rhs.removal_seq;
+ this->calc_str();
+ return *this;
+ }
+
+ // named constructors
+ static coll_t meta() {
+ return coll_t();
+ }
+ static coll_t pg(spg_t p) {
+ return coll_t(p);
+ }
+
+ const std::string to_str() const {
+ return std::string(_str);
+ }
+ const char *c_str() const {
+ return _str;
+ }
+
+ bool parse(const std::string& s);
+
+ int operator<(const coll_t &rhs) const {
+ return type < rhs.type ||
+ (type == rhs.type && pgid < rhs.pgid);
+ }
+
+ bool is_meta() const {
+ return type == TYPE_META;
+ }
+ bool is_pg_prefix(spg_t *pgid_) const {
+ if (type == TYPE_PG || type == TYPE_PG_TEMP) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ bool is_pg() const {
+ return type == TYPE_PG;
+ }
+ bool is_pg(spg_t *pgid_) const {
+ if (type == TYPE_PG) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ bool is_temp() const {
+ return type == TYPE_PG_TEMP;
+ }
+ bool is_temp(spg_t *pgid_) const {
+ if (type == TYPE_PG_TEMP) {
+ *pgid_ = pgid;
+ return true;
+ }
+ return false;
+ }
+ int64_t pool() const {
+ return pgid.pool();
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ size_t encoded_size() const;
+
+ inline bool operator==(const coll_t& rhs) const {
+ // only compare type if meta
+ if (type != rhs.type)
+ return false;
+ if (type == TYPE_META)
+ return true;
+ return type == rhs.type && pgid == rhs.pgid;
+ }
+ inline bool operator!=(const coll_t& rhs) const {
+ return !(*this == rhs);
+ }
+
+ // get a TEMP collection that corresponds to the current collection,
+ // which we presume is a pg collection.
+ coll_t get_temp() const {
+ ceph_assert(type == TYPE_PG);
+ return coll_t(TYPE_PG_TEMP, pgid, 0);
+ }
+
+ ghobject_t get_min_hobj() const {
+ ghobject_t o;
+ switch (type) {
+ case TYPE_PG:
+ o.hobj.pool = pgid.pool();
+ o.set_shard(pgid.shard);
+ break;
+ case TYPE_META:
+ o.hobj.pool = -1;
+ break;
+ default:
+ break;
+ }
+ return o;
+ }
+
+ unsigned hash_to_shard(unsigned num_shards) const {
+ if (type == TYPE_PG)
+ return pgid.hash_to_shard(num_shards);
+ return 0; // whatever.
+ }
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<coll_t*>& o);
+};
+
+WRITE_CLASS_ENCODER(coll_t)
+
+inline std::ostream& operator<<(std::ostream& out, const coll_t& c) {
+ out << c.to_str();
+ return out;
+}
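+
+// Illustrative sketch: the string form comes from calc_str() (defined
+// elsewhere); collections are typically rendered as "meta" for the metadata
+// collection, "<pgid>_head" for a PG and "<pgid>_TEMP" for its temp collection:
+//
+//   coll_t mc = coll_t::meta();                  // "meta"
+//   coll_t pc{spg_t(pg_t(0x1a, 3))};             // typically "3.1a_head"
+//   coll_t tc = pc.get_temp();                   // typically "3.1a_TEMP"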
+
+namespace std {
+ template<> struct hash<coll_t> {
+ size_t operator()(const coll_t &c) const {
+ size_t h = 0;
+ std::string str(c.to_str());
+ std::string::const_iterator end(str.end());
+ for (std::string::const_iterator s = str.begin(); s != end; ++s) {
+ h += *s;
+ h += (h << 10);
+ h ^= (h >> 6);
+ }
+ h += (h << 3);
+ h ^= (h >> 11);
+ h += (h << 15);
+ return h;
+ }
+ };
+} // namespace std
+
+inline std::ostream& operator<<(std::ostream& out, const ceph_object_layout &ol)
+{
+ out << pg_t(ol.ol_pgid);
+ int su = ol.ol_stripe_unit;
+ if (su)
+ out << ".su=" << su;
+ return out;
+}
+
+
+
+// compound rados version type
+/* WARNING: If you add a member to eversion_t, please make sure the encode/decode
+ * functions still work correctly. For the little-endian fast path, make sure
+ * there is no padding on either 32-bit or 64-bit machines.
+ */
+class eversion_t {
+public:
+ version_t version;
+ epoch_t epoch;
+ __u32 __pad;
+ eversion_t() : version(0), epoch(0), __pad(0) {}
+ eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
+
+ // cppcheck-suppress noExplicitConstructor
+ eversion_t(const ceph_eversion& ce) :
+ version(ce.version),
+ epoch(ce.epoch),
+ __pad(0) { }
+
+ explicit eversion_t(ceph::buffer::list& bl) : __pad(0) { decode(bl); }
+
+ static const eversion_t& max() {
+ static const eversion_t max(-1,-1);
+ return max;
+ }
+
+ operator ceph_eversion() {
+ ceph_eversion c;
+ c.epoch = epoch;
+ c.version = version;
+ return c;
+ }
+
+ std::string get_key_name() const;
+
+ // key must point to the beginning of a block of 32 chars
+ inline void get_key_name(char* key) const {
+ // Below is the equivalent of sprintf("%010u.%020llu");
+ key[31] = 0;
+ ritoa<uint64_t, 10, 20>(version, key + 31);
+ key[10] = '.';
+ ritoa<uint32_t, 10, 10>(epoch, key + 10);
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.append((char *)this, sizeof(version_t) + sizeof(epoch_t));
+#else
+ using ceph::encode;
+ encode(version, bl);
+ encode(epoch, bl);
+#endif
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+#if defined(CEPH_LITTLE_ENDIAN)
+ bl.copy(sizeof(version_t) + sizeof(epoch_t), (char *)this);
+#else
+ using ceph::decode;
+ decode(version, bl);
+ decode(epoch, bl);
+#endif
+ }
+ void decode(ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+};
+WRITE_CLASS_ENCODER(eversion_t)
+
+inline bool operator==(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) && (l.version == r.version);
+}
+inline bool operator!=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch != r.epoch) || (l.version != r.version);
+}
+inline bool operator<(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch);
+}
+inline bool operator<=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch);
+}
+inline bool operator>(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch);
+}
+inline bool operator>=(const eversion_t& l, const eversion_t& r) {
+ return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch);
+}
+inline std::ostream& operator<<(std::ostream& out, const eversion_t& e) {
+ return out << e.epoch << "'" << e.version;
+}
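+
+// Illustrative sketch (arbitrary values): an eversion_t orders by epoch, then
+// version; it streams as "epoch'version" and get_key_name() produces the
+// fixed-width, lexicographically sortable "%010u.%020llu" form noted above:
+//
+//   eversion_t v(10, 3);
+//   std::cout << v;                    // "10'3"
+//   std::string k = v.get_key_name();  // expected: "0000000010.00000000000000000003"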
+
+/**
+ * objectstore_perf_stat_t
+ *
+ * current perf information about the osd
+ */
+struct objectstore_perf_stat_t {
+ // cur_op_latency is in ns since double add/sub are not associative
+ uint64_t os_commit_latency_ns;
+ uint64_t os_apply_latency_ns;
+
+ objectstore_perf_stat_t() :
+ os_commit_latency_ns(0), os_apply_latency_ns(0) {}
+
+ bool operator==(const objectstore_perf_stat_t &r) const {
+ return os_commit_latency_ns == r.os_commit_latency_ns &&
+ os_apply_latency_ns == r.os_apply_latency_ns;
+ }
+
+ void add(const objectstore_perf_stat_t &o) {
+ os_commit_latency_ns += o.os_commit_latency_ns;
+ os_apply_latency_ns += o.os_apply_latency_ns;
+ }
+ void sub(const objectstore_perf_stat_t &o) {
+ os_commit_latency_ns -= o.os_commit_latency_ns;
+ os_apply_latency_ns -= o.os_apply_latency_ns;
+ }
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<objectstore_perf_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(objectstore_perf_stat_t)
+
+/*
+ * pg states
+ */
+#define PG_STATE_CREATING (1ULL << 0) // creating
+#define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too)
+#define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas.
+#define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline
+#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound
+#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound
+#define PG_STATE_PREMERGE (1ULL << 7) // i am preparing to merge
+#define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing
+//#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub
+#define PG_STATE_DEGRADED (1ULL << 10) // pg contains objects with reduced redundancy
+#define PG_STATE_INCONSISTENT (1ULL << 11) // pg replicas are inconsistent (but shouldn't be)
+#define PG_STATE_PEERING (1ULL << 12) // pg is (re)peering
+#define PG_STATE_REPAIR (1ULL << 13) // pg should repair on next scrub
+#define PG_STATE_RECOVERING (1ULL << 14) // pg is recovering/migrating objects
+#define PG_STATE_BACKFILL_WAIT (1ULL << 15) // [active] reserving backfill
+#define PG_STATE_INCOMPLETE (1ULL << 16) // incomplete content, peering failed.
+#define PG_STATE_STALE (1ULL << 17) // our state for this pg is stale, unknown.
+#define PG_STATE_REMAPPED (1ULL << 18) // pg is explicitly remapped to different OSDs than CRUSH
+#define PG_STATE_DEEP_SCRUB (1ULL << 19) // deep scrub: check CRC32 on files
+#define PG_STATE_BACKFILLING (1ULL << 20) // [active] backfilling pg content
+#define PG_STATE_BACKFILL_TOOFULL (1ULL << 21) // backfill can't proceed: too full
+#define PG_STATE_RECOVERY_WAIT (1ULL << 22) // waiting for recovery reservations
+#define PG_STATE_UNDERSIZED (1ULL << 23) // pg acting < pool size
+#define PG_STATE_ACTIVATING (1ULL << 24) // pg is peered but not yet active
+#define PG_STATE_PEERED (1ULL << 25) // peered, cannot go active, can recover
+#define PG_STATE_SNAPTRIM (1ULL << 26) // trimming snaps
+#define PG_STATE_SNAPTRIM_WAIT (1ULL << 27) // queued to trim snaps
+#define PG_STATE_RECOVERY_TOOFULL (1ULL << 28) // recovery can't proceed: too full
+#define PG_STATE_SNAPTRIM_ERROR (1ULL << 29) // error stopped trimming snaps
+#define PG_STATE_FORCED_RECOVERY (1ULL << 30) // force recovery of this pg before any other
+#define PG_STATE_FORCED_BACKFILL (1ULL << 31) // force backfill of this pg before any other
+#define PG_STATE_FAILED_REPAIR (1ULL << 32) // A repair failed to fix all errors
+#define PG_STATE_LAGGY (1ULL << 33) // PG is laggy/unreadable due to slow/delayed pings
+#define PG_STATE_WAIT (1ULL << 34) // PG is waiting for prior intervals' readable period to expire
+
+std::string pg_state_string(uint64_t state);
+std::string pg_vector_string(const std::vector<int32_t> &a);
+std::optional<uint64_t> pg_string_state(const std::string& state);
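+
+// Illustrative sketch: pg_state_string() joins the set state names with '+',
+// and pg_string_state() maps a single name back to its bit (the exact strings
+// below are assumed from the usual `ceph pg stat` rendering):
+//
+//   uint64_t s = PG_STATE_ACTIVE | PG_STATE_CLEAN;
+//   std::cout << pg_state_string(s);         // expected: "active+clean"
+//   auto bit = pg_string_state("degraded");  // expected: PG_STATE_DEGRADED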
+
+
+/*
+ * pool_snap_info_t
+ *
+ * attributes for a single pool snapshot.
+ */
+struct pool_snap_info_t {
+ snapid_t snapid;
+ utime_t stamp;
+ std::string name;
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<pool_snap_info_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_snap_info_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pool_snap_info_t& si) {
+ return out << si.snapid << '(' << si.name << ' ' << si.stamp << ')';
+}
+
+
+/*
+ * pool_opts_t
+ *
+ * pool options.
+ */
+
+// The order of items in the list is important; therefore, you should
+// always add to the end of the list when adding new options.
+
+class pool_opts_t {
+public:
+ enum key_t {
+ SCRUB_MIN_INTERVAL,
+ SCRUB_MAX_INTERVAL,
+ DEEP_SCRUB_INTERVAL,
+ RECOVERY_PRIORITY,
+ RECOVERY_OP_PRIORITY,
+ SCRUB_PRIORITY,
+ COMPRESSION_MODE,
+ COMPRESSION_ALGORITHM,
+ COMPRESSION_REQUIRED_RATIO,
+ COMPRESSION_MAX_BLOB_SIZE,
+ COMPRESSION_MIN_BLOB_SIZE,
+ CSUM_TYPE,
+ CSUM_MAX_BLOCK,
+ CSUM_MIN_BLOCK,
+ FINGERPRINT_ALGORITHM,
+ PG_NUM_MIN, // min pg_num
+ TARGET_SIZE_BYTES, // total bytes in pool
+ TARGET_SIZE_RATIO, // fraction of total cluster
+ PG_AUTOSCALE_BIAS,
+ READ_LEASE_INTERVAL,
+ DEDUP_TIER,
+ DEDUP_CHUNK_ALGORITHM,
+ DEDUP_CDC_CHUNK_SIZE,
+ PG_NUM_MAX, // max pg_num
+ };
+
+ enum type_t {
+ STR,
+ INT,
+ DOUBLE,
+ };
+
+ struct opt_desc_t {
+ key_t key;
+ type_t type;
+
+ opt_desc_t(key_t k, type_t t) : key(k), type(t) {}
+
+ bool operator==(const opt_desc_t& rhs) const {
+ return key == rhs.key && type == rhs.type;
+ }
+ };
+
+ typedef boost::variant<std::string,int64_t,double> value_t;
+
+ static bool is_opt_name(const std::string& name);
+ static opt_desc_t get_opt_desc(const std::string& name);
+
+ pool_opts_t() : opts() {}
+
+ bool is_set(key_t key) const;
+
+ template<typename T>
+ void set(key_t key, const T &val) {
+ value_t value = val;
+ opts[key] = value;
+ }
+
+ template<typename T>
+ bool get(key_t key, T *val) const {
+ opts_t::const_iterator i = opts.find(key);
+ if (i == opts.end()) {
+ return false;
+ }
+ *val = boost::get<T>(i->second);
+ return true;
+ }
+
+ template<typename T>
+ T value_or(key_t key, T&& default_value) const {
+ auto i = opts.find(key);
+ if (i == opts.end()) {
+ return std::forward<T>(default_value);
+ }
+ return boost::get<T>(i->second);
+ }
+
+ const value_t& get(key_t key) const;
+
+ bool unset(key_t key);
+
+ void dump(const std::string& name, ceph::Formatter *f) const;
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+private:
+ typedef std::map<key_t, value_t> opts_t;
+ opts_t opts;
+
+ friend std::ostream& operator<<(std::ostream& out, const pool_opts_t& opts);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_opts_t)
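+
+// Illustrative sketch (arbitrary values): options are stored as typed
+// boost::variant values keyed by pool_opts_t::key_t, so getters must request
+// the type the option was stored with (int64_t here):
+//
+//   pool_opts_t opts;
+//   opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(16));
+//   int64_t v = 0;
+//   if (opts.get(pool_opts_t::PG_NUM_MIN, &v)) {
+//     // v == 16
+//   }
+//   int64_t m = opts.value_or(pool_opts_t::PG_NUM_MAX, int64_t(0));  // 0 if unset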
+
+struct pg_merge_meta_t {
+ pg_t source_pgid;
+ epoch_t ready_epoch = 0;
+ epoch_t last_epoch_started = 0;
+ epoch_t last_epoch_clean = 0;
+ eversion_t source_version;
+ eversion_t target_version;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(source_pgid, bl);
+ encode(ready_epoch, bl);
+ encode(last_epoch_started, bl);
+ encode(last_epoch_clean, bl);
+ encode(source_version, bl);
+ encode(target_version, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(source_pgid, p);
+ decode(ready_epoch, p);
+ decode(last_epoch_started, p);
+ decode(last_epoch_clean, p);
+ decode(source_version, p);
+ decode(target_version, p);
+ DECODE_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("source_pgid") << source_pgid;
+ f->dump_unsigned("ready_epoch", ready_epoch);
+ f->dump_unsigned("last_epoch_started", last_epoch_started);
+ f->dump_unsigned("last_epoch_clean", last_epoch_clean);
+ f->dump_stream("source_version") << source_version;
+ f->dump_stream("target_version") << target_version;
+ }
+};
+WRITE_CLASS_ENCODER(pg_merge_meta_t)
+
+class OSDMap;
+
+/*
+ * pg_pool
+ */
+struct pg_pool_t {
+ static const char *APPLICATION_NAME_CEPHFS;
+ static const char *APPLICATION_NAME_RBD;
+ static const char *APPLICATION_NAME_RGW;
+
+ enum {
+ TYPE_REPLICATED = 1, // replication
+ //TYPE_RAID4 = 2, // raid4 (never implemented)
+ TYPE_ERASURE = 3, // erasure-coded
+ };
+ static constexpr uint32_t pg_CRUSH_ITEM_NONE = 0x7fffffff; /* can't import crush.h here */
+ static std::string_view get_type_name(int t) {
+ switch (t) {
+ case TYPE_REPLICATED: return "replicated";
+ //case TYPE_RAID4: return "raid4";
+ case TYPE_ERASURE: return "erasure";
+ default: return "???";
+ }
+ }
+ std::string_view get_type_name() const {
+ return get_type_name(type);
+ }
+
+ enum {
+ FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
+ FLAG_FULL = 1<<1, // pool is full
+ FLAG_EC_OVERWRITES = 1<<2, // enables overwrites, once enabled, cannot be disabled
+ FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
+ FLAG_NODELETE = 1<<4, // pool can't be deleted
+ FLAG_NOPGCHANGE = 1<<5, // pool's pg and pgp num can't be changed
+ FLAG_NOSIZECHANGE = 1<<6, // pool's size and min size can't be changed
+ FLAG_WRITE_FADVISE_DONTNEED = 1<<7, // write mode with LIBRADOS_OP_FLAG_FADVISE_DONTNEED
+ FLAG_NOSCRUB = 1<<8, // block periodic scrub
+ FLAG_NODEEP_SCRUB = 1<<9, // block periodic deep-scrub
+ FLAG_FULL_QUOTA = 1<<10, // pool is currently running out of quota, will set FLAG_FULL too
+ FLAG_NEARFULL = 1<<11, // pool is nearfull
+ FLAG_BACKFILLFULL = 1<<12, // pool is backfillfull
+ FLAG_SELFMANAGED_SNAPS = 1<<13, // pool uses selfmanaged snaps
+ FLAG_POOL_SNAPS = 1<<14, // pool has pool snaps
+ FLAG_CREATING = 1<<15, // initial pool PGs are being created
+ FLAG_BULK = 1<<17, // pool is large
+ };
+
+ static const char *get_flag_name(int f) {
+ switch (f) {
+ case FLAG_HASHPSPOOL: return "hashpspool";
+ case FLAG_FULL: return "full";
+ case FLAG_EC_OVERWRITES: return "ec_overwrites";
+ case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
+ case FLAG_NODELETE: return "nodelete";
+ case FLAG_NOPGCHANGE: return "nopgchange";
+ case FLAG_NOSIZECHANGE: return "nosizechange";
+ case FLAG_WRITE_FADVISE_DONTNEED: return "write_fadvise_dontneed";
+ case FLAG_NOSCRUB: return "noscrub";
+ case FLAG_NODEEP_SCRUB: return "nodeep-scrub";
+ case FLAG_FULL_QUOTA: return "full_quota";
+ case FLAG_NEARFULL: return "nearfull";
+ case FLAG_BACKFILLFULL: return "backfillfull";
+ case FLAG_SELFMANAGED_SNAPS: return "selfmanaged_snaps";
+ case FLAG_POOL_SNAPS: return "pool_snaps";
+ case FLAG_CREATING: return "creating";
+ case FLAG_BULK: return "bulk";
+ default: return "???";
+ }
+ }
+ static std::string get_flags_string(uint64_t f) {
+ std::string s;
+ for (unsigned n=0; f && n<64; ++n) {
+ if (f & (1ull << n)) {
+ if (s.length())
+ s += ",";
+ s += get_flag_name(1ull << n);
+ }
+ }
+ return s;
+ }
+ std::string get_flags_string() const {
+ return get_flags_string(flags);
+ }
+ static uint64_t get_flag_by_name(const std::string& name) {
+ if (name == "hashpspool")
+ return FLAG_HASHPSPOOL;
+ if (name == "full")
+ return FLAG_FULL;
+ if (name == "ec_overwrites")
+ return FLAG_EC_OVERWRITES;
+ if (name == "incomplete_clones")
+ return FLAG_INCOMPLETE_CLONES;
+ if (name == "nodelete")
+ return FLAG_NODELETE;
+ if (name == "nopgchange")
+ return FLAG_NOPGCHANGE;
+ if (name == "nosizechange")
+ return FLAG_NOSIZECHANGE;
+ if (name == "write_fadvise_dontneed")
+ return FLAG_WRITE_FADVISE_DONTNEED;
+ if (name == "noscrub")
+ return FLAG_NOSCRUB;
+ if (name == "nodeep-scrub")
+ return FLAG_NODEEP_SCRUB;
+ if (name == "full_quota")
+ return FLAG_FULL_QUOTA;
+ if (name == "nearfull")
+ return FLAG_NEARFULL;
+ if (name == "backfillfull")
+ return FLAG_BACKFILLFULL;
+ if (name == "selfmanaged_snaps")
+ return FLAG_SELFMANAGED_SNAPS;
+ if (name == "pool_snaps")
+ return FLAG_POOL_SNAPS;
+ if (name == "creating")
+ return FLAG_CREATING;
+ if (name == "bulk")
+ return FLAG_BULK;
+ return 0;
+ }
+
+ /// converts the acting/up vector to a set of pg shards
+ void convert_to_pg_shards(const std::vector<int> &from, std::set<pg_shard_t>* to) const;
+
+ typedef enum {
+ CACHEMODE_NONE = 0, ///< no caching
+ CACHEMODE_WRITEBACK = 1, ///< write to cache, flush later
+ CACHEMODE_FORWARD = 2, ///< forward if not in cache
+ CACHEMODE_READONLY = 3, ///< handle reads, forward writes [not strongly consistent]
+ CACHEMODE_READFORWARD = 4, ///< forward reads, write to cache flush later
+ CACHEMODE_READPROXY = 5, ///< proxy reads, write to cache flush later
+ CACHEMODE_PROXY = 6, ///< proxy if not in cache
+ } cache_mode_t;
+ static const char *get_cache_mode_name(cache_mode_t m) {
+ switch (m) {
+ case CACHEMODE_NONE: return "none";
+ case CACHEMODE_WRITEBACK: return "writeback";
+ case CACHEMODE_FORWARD: return "forward";
+ case CACHEMODE_READONLY: return "readonly";
+ case CACHEMODE_READFORWARD: return "readforward";
+ case CACHEMODE_READPROXY: return "readproxy";
+ case CACHEMODE_PROXY: return "proxy";
+ default: return "unknown";
+ }
+ }
+ static cache_mode_t get_cache_mode_from_str(const std::string& s) {
+ if (s == "none")
+ return CACHEMODE_NONE;
+ if (s == "writeback")
+ return CACHEMODE_WRITEBACK;
+ if (s == "forward")
+ return CACHEMODE_FORWARD;
+ if (s == "readonly")
+ return CACHEMODE_READONLY;
+ if (s == "readforward")
+ return CACHEMODE_READFORWARD;
+ if (s == "readproxy")
+ return CACHEMODE_READPROXY;
+ if (s == "proxy")
+ return CACHEMODE_PROXY;
+ return (cache_mode_t)-1;
+ }
+ const char *get_cache_mode_name() const {
+ return get_cache_mode_name(cache_mode);
+ }
+ bool cache_mode_requires_hit_set() const {
+ switch (cache_mode) {
+ case CACHEMODE_NONE:
+ case CACHEMODE_FORWARD:
+ case CACHEMODE_READONLY:
+ case CACHEMODE_PROXY:
+ return false;
+ case CACHEMODE_WRITEBACK:
+ case CACHEMODE_READFORWARD:
+ case CACHEMODE_READPROXY:
+ return true;
+ default:
+ ceph_abort_msg("implement me");
+ }
+ }
+
+ enum class pg_autoscale_mode_t : uint8_t {
+ OFF = 0,
+ WARN = 1,
+ ON = 2,
+ UNKNOWN = UINT8_MAX,
+ };
+ static const char *get_pg_autoscale_mode_name(pg_autoscale_mode_t m) {
+ switch (m) {
+ case pg_autoscale_mode_t::OFF: return "off";
+ case pg_autoscale_mode_t::ON: return "on";
+ case pg_autoscale_mode_t::WARN: return "warn";
+ default: return "???";
+ }
+ }
+ static pg_autoscale_mode_t get_pg_autoscale_mode_by_name(const std::string& m) {
+ if (m == "off") {
+ return pg_autoscale_mode_t::OFF;
+ }
+ if (m == "warn") {
+ return pg_autoscale_mode_t::WARN;
+ }
+ if (m == "on") {
+ return pg_autoscale_mode_t::ON;
+ }
+ return pg_autoscale_mode_t::UNKNOWN;
+ }
+
+ utime_t create_time;
+ uint64_t flags = 0; ///< FLAG_*
+ __u8 type = 0; ///< TYPE_*
+ __u8 size = 0, min_size = 0; ///< number of osds in each pg
+ __u8 crush_rule = 0; ///< crush placement rule
+ __u8 object_hash = 0; ///< hash mapping object name to ps
+ pg_autoscale_mode_t pg_autoscale_mode = pg_autoscale_mode_t::UNKNOWN;
+
+private:
+ __u32 pg_num = 0, pgp_num = 0; ///< number of pgs
+ __u32 pg_num_pending = 0; ///< pg_num we are about to merge down to
+ __u32 pg_num_target = 0; ///< pg_num we should converge toward
+ __u32 pgp_num_target = 0; ///< pgp_num we should converge toward
+
+public:
+ std::map<std::string, std::string> properties; ///< OBSOLETE
+ std::string erasure_code_profile; ///< name of the erasure code profile in OSDMap
+ epoch_t last_change = 0; ///< most recent epoch changed, exclusing snapshot changes
+ // If non-zero, require OSDs in at least this many different instances...
+ uint32_t peering_crush_bucket_count = 0;
+ // of this bucket type...
+ uint32_t peering_crush_bucket_barrier = 0;
+ // including this one
+ int32_t peering_crush_mandatory_member = pg_CRUSH_ITEM_NONE;
+ // The per-bucket replica count is calculated with this "target"
+ // instead of the above crush_bucket_count. This means we can maintain a
+ // target size of 4 without attempting to place them all in 1 DC
+ uint32_t peering_crush_bucket_target = 0;
+ /// last epoch that forced clients to resend
+ epoch_t last_force_op_resend = 0;
+ /// last epoch that forced clients to resend (pre-nautilus clients only)
+ epoch_t last_force_op_resend_prenautilus = 0;
+ /// last epoch that forced clients to resend (pre-luminous clients only)
+ epoch_t last_force_op_resend_preluminous = 0;
+
+ /// metadata for the most recent PG merge
+ pg_merge_meta_t last_pg_merge_meta;
+
+ snapid_t snap_seq = 0; ///< seq for per-pool snapshot
+ epoch_t snap_epoch = 0; ///< osdmap epoch of last snap
+ uint64_t auid = 0; ///< who owns the pg
+
+ uint64_t quota_max_bytes = 0; ///< maximum number of bytes for this pool
+ uint64_t quota_max_objects = 0; ///< maximum number of objects for this pool
+
+ /*
+ * Pool snaps (global to this pool). These define a SnapContext for
+ * the pool, unless the client manually specifies an alternate
+ * context.
+ */
+ std::map<snapid_t, pool_snap_info_t> snaps;
+ /*
+ * Alternatively, if we are defining non-pool snaps (e.g. via the
+ * Ceph MDS), we must track @removed_snaps (since @snaps is not
+ * used). Snaps and removed_snaps are to be used exclusive of each
+ * other!
+ */
+ interval_set<snapid_t> removed_snaps;
+
+ unsigned pg_num_mask = 0, pgp_num_mask = 0;
+
+ std::set<uint64_t> tiers; ///< pools that are tiers of us
+ int64_t tier_of = -1; ///< pool for which we are a tier
+ // Note that write wins for read+write ops
+ int64_t read_tier = -1; ///< pool/tier for objecter to direct reads to
+ int64_t write_tier = -1; ///< pool/tier for objecter to direct writes to
+ cache_mode_t cache_mode = CACHEMODE_NONE; ///< cache pool mode
+
+ bool is_tier() const { return tier_of >= 0; }
+ bool has_tiers() const { return !tiers.empty(); }
+ void clear_tier() {
+ tier_of = -1;
+ clear_read_tier();
+ clear_write_tier();
+ clear_tier_tunables();
+ }
+ bool has_read_tier() const { return read_tier >= 0; }
+ void clear_read_tier() { read_tier = -1; }
+ bool has_write_tier() const { return write_tier >= 0; }
+ void clear_write_tier() { write_tier = -1; }
+ void clear_tier_tunables() {
+ if (cache_mode != CACHEMODE_NONE)
+ flags |= FLAG_INCOMPLETE_CLONES;
+ cache_mode = CACHEMODE_NONE;
+
+ target_max_bytes = 0;
+ target_max_objects = 0;
+ cache_target_dirty_ratio_micro = 0;
+ cache_target_dirty_high_ratio_micro = 0;
+ cache_target_full_ratio_micro = 0;
+ hit_set_params = HitSet::Params();
+ hit_set_period = 0;
+ hit_set_count = 0;
+ hit_set_grade_decay_rate = 0;
+ hit_set_search_last_n = 0;
+ grade_table.resize(0);
+ }
+
+ bool is_stretch_pool() const {
+ return peering_crush_bucket_count != 0;
+ }
+
+ bool stretch_set_can_peer(const set<int>& want, const OSDMap& osdmap,
+ std::ostream *out) const;
+ bool stretch_set_can_peer(const vector<int>& want, const OSDMap& osdmap,
+ std::ostream *out) const {
+ if (!is_stretch_pool()) return true;
+ set<int> swant;
+ for (auto i : want) swant.insert(i);
+ return stretch_set_can_peer(swant, osdmap, out);
+ }
+
+ uint64_t target_max_bytes = 0; ///< tiering: target max pool size
+ uint64_t target_max_objects = 0; ///< tiering: target max pool size
+
+ uint32_t cache_target_dirty_ratio_micro = 0; ///< cache: fraction of target to leave dirty
+ uint32_t cache_target_dirty_high_ratio_micro = 0; ///< cache: fraction of target to flush with high speed
+ uint32_t cache_target_full_ratio_micro = 0; ///< cache: fraction of target to fill before we evict in earnest
+
+ uint32_t cache_min_flush_age = 0; ///< minimum age (seconds) before we can flush
+ uint32_t cache_min_evict_age = 0; ///< minimum age (seconds) before we can evict
+
+ HitSet::Params hit_set_params; ///< The HitSet params to use on this pool
+ uint32_t hit_set_period = 0; ///< periodicity of HitSet segments (seconds)
+ uint32_t hit_set_count = 0; ///< number of periods to retain
+ bool use_gmt_hitset = true; ///< use gmt to name the hitset archive object
+ uint32_t min_read_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on read
+ uint32_t min_write_recency_for_promote = 0; ///< minimum number of HitSet to check before promote on write
+ uint32_t hit_set_grade_decay_rate = 0; ///< the current hit_set has the highest priority for object
+ ///< temperature counting; each subsequent hit_set's priority
+ ///< decays by this rate relative to the previous hit_set
+ uint32_t hit_set_search_last_n = 0; ///< accumulate at most N hit_sets for temperature
+
+ uint32_t stripe_width = 0; ///< erasure coded stripe size in bytes
+
+ uint64_t expected_num_objects = 0; ///< expected number of objects on this pool; a value of 0 indicates
+ ///< the user did not specify an expected value
+ bool fast_read = false; ///< whether fast read is enabled on the pool
+
+ pool_opts_t opts; ///< options
+
+ typedef enum {
+ TYPE_FINGERPRINT_NONE = 0,
+ TYPE_FINGERPRINT_SHA1 = 1,
+ TYPE_FINGERPRINT_SHA256 = 2,
+ TYPE_FINGERPRINT_SHA512 = 3,
+ } fingerprint_t;
+ static fingerprint_t get_fingerprint_from_str(const std::string& s) {
+ if (s == "none")
+ return TYPE_FINGERPRINT_NONE;
+ if (s == "sha1")
+ return TYPE_FINGERPRINT_SHA1;
+ if (s == "sha256")
+ return TYPE_FINGERPRINT_SHA256;
+ if (s == "sha512")
+ return TYPE_FINGERPRINT_SHA512;
+ return (fingerprint_t)-1;
+ }
+ const fingerprint_t get_fingerprint_type() const {
+ std::string fp_str;
+ opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
+ return get_fingerprint_from_str(fp_str);
+ }
+ const char *get_fingerprint_name() const {
+ std::string fp_str;
+ fingerprint_t fp_t;
+ opts.get(pool_opts_t::FINGERPRINT_ALGORITHM, &fp_str);
+ fp_t = get_fingerprint_from_str(fp_str);
+ return get_fingerprint_name(fp_t);
+ }
+ static const char *get_fingerprint_name(fingerprint_t m) {
+ switch (m) {
+ case TYPE_FINGERPRINT_NONE: return "none";
+ case TYPE_FINGERPRINT_SHA1: return "sha1";
+ case TYPE_FINGERPRINT_SHA256: return "sha256";
+ case TYPE_FINGERPRINT_SHA512: return "sha512";
+ default: return "unknown";
+ }
+ }
+
+ typedef enum {
+ TYPE_DEDUP_CHUNK_NONE = 0,
+ TYPE_DEDUP_CHUNK_FASTCDC = 1,
+ TYPE_DEDUP_CHUNK_FIXEDCDC = 2,
+ } dedup_chunk_algo_t;
+ static dedup_chunk_algo_t get_dedup_chunk_algorithm_from_str(const std::string& s) {
+ if (s == "none")
+ return TYPE_DEDUP_CHUNK_NONE;
+ if (s == "fastcdc")
+ return TYPE_DEDUP_CHUNK_FASTCDC;
+ if (s == "fixed")
+ return TYPE_DEDUP_CHUNK_FIXEDCDC;
+ return (dedup_chunk_algo_t)-1;
+ }
+ const dedup_chunk_algo_t get_dedup_chunk_algorithm_type() const {
+ std::string algo_str;
+ opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &algo_str);
+ return get_dedup_chunk_algorithm_from_str(algo_str);
+ }
+ const char *get_dedup_chunk_algorithm_name() const {
+ std::string dedup_chunk_algo_str;
+ opts.get(pool_opts_t::DEDUP_CHUNK_ALGORITHM, &dedup_chunk_algo_str);
+ dedup_chunk_algo_t algo = get_dedup_chunk_algorithm_from_str(dedup_chunk_algo_str);
+ return get_dedup_chunk_algorithm_name(algo);
+ }
+ static const char *get_dedup_chunk_algorithm_name(dedup_chunk_algo_t m) {
+ switch (m) {
+ case TYPE_DEDUP_CHUNK_NONE: return "none";
+ case TYPE_DEDUP_CHUNK_FASTCDC: return "fastcdc";
+ case TYPE_DEDUP_CHUNK_FIXEDCDC: return "fixed";
+ default: return "unknown";
+ }
+ }
+
+ int64_t get_dedup_tier() const {
+ int64_t tier_id = 0;
+ opts.get(pool_opts_t::DEDUP_TIER, &tier_id);
+ return tier_id;
+ }
+ int64_t get_dedup_cdc_chunk_size() const {
+ int64_t chunk_size = 0;
+ opts.get(pool_opts_t::DEDUP_CDC_CHUNK_SIZE, &chunk_size);
+ return chunk_size;
+ }
+
+ /// application -> key/value metadata
+ std::map<std::string, std::map<std::string, std::string>> application_metadata;
+
+private:
+ std::vector<uint32_t> grade_table;
+
+public:
+ uint32_t get_grade(unsigned i) const {
+ if (grade_table.size() <= i)
+ return 0;
+ return grade_table[i];
+ }
+ void calc_grade_table() {
+ unsigned v = 1000000;
+ grade_table.resize(hit_set_count);
+ for (unsigned i = 0; i < hit_set_count; i++) {
+ v = v * (1 - (hit_set_grade_decay_rate / 100.0));
+ grade_table[i] = v;
+ }
+ }
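+
+ // Worked example (illustrative): with hit_set_grade_decay_rate = 20 and
+ // hit_set_count = 3 the loop above yields
+ //   grade_table = { 1000000*0.8, 800000*0.8, 640000*0.8 }
+ //               = { 800000, 640000, 512000 },
+ // i.e. each older hit_set is weighted at 80% of the one before it.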
+
+ pg_pool_t() = default;
+
+ void dump(ceph::Formatter *f) const;
+
+ const utime_t &get_create_time() const { return create_time; }
+ uint64_t get_flags() const { return flags; }
+ bool has_flag(uint64_t f) const { return flags & f; }
+ void set_flag(uint64_t f) { flags |= f; }
+ void unset_flag(uint64_t f) { flags &= ~f; }
+
+ bool require_rollback() const {
+ return is_erasure();
+ }
+
+ /// true if incomplete clones may be present
+ bool allow_incomplete_clones() const {
+ return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+ }
+
+ unsigned get_type() const { return type; }
+ unsigned get_size() const { return size; }
+ unsigned get_min_size() const { return min_size; }
+ int get_crush_rule() const { return crush_rule; }
+ int get_object_hash() const { return object_hash; }
+ const char *get_object_hash_name() const {
+ return ceph_str_hash_name(get_object_hash());
+ }
+ epoch_t get_last_change() const { return last_change; }
+ epoch_t get_last_force_op_resend() const { return last_force_op_resend; }
+ epoch_t get_last_force_op_resend_prenautilus() const {
+ return last_force_op_resend_prenautilus;
+ }
+ epoch_t get_last_force_op_resend_preluminous() const {
+ return last_force_op_resend_preluminous;
+ }
+ epoch_t get_snap_epoch() const { return snap_epoch; }
+ snapid_t get_snap_seq() const { return snap_seq; }
+ uint64_t get_auid() const { return auid; }
+
+ void set_snap_seq(snapid_t s) { snap_seq = s; }
+ void set_snap_epoch(epoch_t e) { snap_epoch = e; }
+
+ void set_stripe_width(uint32_t s) { stripe_width = s; }
+ uint32_t get_stripe_width() const { return stripe_width; }
+
+ bool is_replicated() const { return get_type() == TYPE_REPLICATED; }
+ bool is_erasure() const { return get_type() == TYPE_ERASURE; }
+
+ bool supports_omap() const {
+ return !(get_type() == TYPE_ERASURE);
+ }
+
+ bool requires_aligned_append() const {
+ return is_erasure() && !has_flag(FLAG_EC_OVERWRITES);
+ }
+ uint64_t required_alignment() const { return stripe_width; }
+
+ bool allows_ecoverwrites() const {
+ return has_flag(FLAG_EC_OVERWRITES);
+ }
+
+ bool can_shift_osds() const {
+ switch (get_type()) {
+ case TYPE_REPLICATED:
+ return true;
+ case TYPE_ERASURE:
+ return false;
+ default:
+ ceph_abort_msg("unhandled pool type");
+ }
+ }
+
+ unsigned get_pg_num() const { return pg_num; }
+ unsigned get_pgp_num() const { return pgp_num; }
+ unsigned get_pg_num_target() const { return pg_num_target; }
+ unsigned get_pgp_num_target() const { return pgp_num_target; }
+ unsigned get_pg_num_pending() const { return pg_num_pending; }
+
+ unsigned get_pg_num_mask() const { return pg_num_mask; }
+ unsigned get_pgp_num_mask() const { return pgp_num_mask; }
+
+ // if pg_num is not a power of two, pgs are not equally sized.
+ // return, for a given pg, the fraction (denominator) of the total
+ // pool size that it represents.
+ unsigned get_pg_num_divisor(pg_t pgid) const;
+
+ bool is_pending_merge(pg_t pgid, bool *target) const;
+
+ void set_pg_num(int p) {
+ pg_num = p;
+ pg_num_pending = p;
+ calc_pg_masks();
+ }
+ void set_pgp_num(int p) {
+ pgp_num = p;
+ calc_pg_masks();
+ }
+ void set_pg_num_pending(int p) {
+ pg_num_pending = p;
+ calc_pg_masks();
+ }
+ void set_pg_num_target(int p) {
+ pg_num_target = p;
+ }
+ void set_pgp_num_target(int p) {
+ pgp_num_target = p;
+ }
+ void dec_pg_num(pg_t source_pgid,
+ epoch_t ready_epoch,
+ eversion_t source_version,
+ eversion_t target_version,
+ epoch_t last_epoch_started,
+ epoch_t last_epoch_clean) {
+ --pg_num;
+ last_pg_merge_meta.source_pgid = source_pgid;
+ last_pg_merge_meta.ready_epoch = ready_epoch;
+ last_pg_merge_meta.source_version = source_version;
+ last_pg_merge_meta.target_version = target_version;
+ last_pg_merge_meta.last_epoch_started = last_epoch_started;
+ last_pg_merge_meta.last_epoch_clean = last_epoch_clean;
+ calc_pg_masks();
+ }
+
+ void set_quota_max_bytes(uint64_t m) {
+ quota_max_bytes = m;
+ }
+ uint64_t get_quota_max_bytes() {
+ return quota_max_bytes;
+ }
+
+ void set_quota_max_objects(uint64_t m) {
+ quota_max_objects = m;
+ }
+ uint64_t get_quota_max_objects() {
+ return quota_max_objects;
+ }
+
+ void set_last_force_op_resend(uint64_t t) {
+ last_force_op_resend = t;
+ last_force_op_resend_prenautilus = t;
+ last_force_op_resend_preluminous = t;
+ }
+
+ void calc_pg_masks();
+
+ /*
+ * we have two snap modes:
+ * - pool global snaps
+ * - snap existence/non-existence defined by snaps[] and snap_seq
+ * - user managed snaps
+ * - removal governed by removed_snaps
+ *
+ * we know which mode we're using based on whether removed_snaps is empty.
+ * If nothing has been created, both functions report false.
+ */
+ bool is_pool_snaps_mode() const;
+ bool is_unmanaged_snaps_mode() const;
+ bool is_removed_snap(snapid_t s) const;
+
+ snapid_t snap_exists(std::string_view s) const;
+ void add_snap(const char *n, utime_t stamp);
+ uint64_t add_unmanaged_snap(bool preoctopus_compat);
+ void remove_snap(snapid_t s);
+ void remove_unmanaged_snap(snapid_t s, bool preoctopus_compat);
+
+ SnapContext get_snap_context() const;
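+
+ // Illustrative sketch (hypothetical pg_pool_t instance `pool`): the two modes
+ // are mutually exclusive, so callers typically branch on them first:
+ //
+ //   if (pool.is_pool_snaps_mode()) {
+ //     SnapContext sc = pool.get_snap_context();  // built from snaps/snap_seq
+ //   } else if (pool.is_unmanaged_snaps_mode()) {
+ //     // the client (e.g. RBD/CephFS) supplies its own SnapContext;
+ //     // the pool only tracks removed_snaps
+ //   }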
+
+ /// hash an object name+namespace key to a hash position
+ uint32_t hash_key(const std::string& key, const std::string& ns) const;
+
+ /// round a hash position down to a pg num
+ uint32_t raw_hash_to_pg(uint32_t v) const;
+
+ /*
+ * map a raw pg (with full precision ps) into an actual pg, for storage
+ */
+ pg_t raw_pg_to_pg(pg_t pg) const;
+
+ /*
+ * map raw pg (full precision ps) into a placement seed. include
+ * pool id in that value so that different pools don't use the same
+ * seeds.
+ */
+ ps_t raw_pg_to_pps(pg_t pg) const;
+
+ /// choose a random hash position within a pg
+ uint32_t get_random_pg_position(pg_t pgid, uint32_t seed) const;
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+
+ static void generate_test_instances(std::list<pg_pool_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_pool_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_pool_t& p);
+
+
+/**
+ * a summation of object stats
+ *
+ * This is just a container for object stats; we don't know what for.
+ *
+ * If you add members to object_stat_sum_t, you should make sure there is
+ * no padding among these members.
+ * You should also update the padding_check function.
+
+ */
+struct object_stat_sum_t {
+ /**************************************************************************
+ * WARNING: be sure to update operator==, floor, and split when
+ * adding/removing fields!
+ **************************************************************************/
+ int64_t num_bytes; // in bytes
+ int64_t num_objects;
+ int64_t num_object_clones;
+ int64_t num_object_copies; // num_objects * num_replicas
+ int64_t num_objects_missing_on_primary;
+ int64_t num_objects_degraded;
+ int64_t num_objects_unfound;
+ int64_t num_rd;
+ int64_t num_rd_kb;
+ int64_t num_wr;
+ int64_t num_wr_kb;
+ int64_t num_scrub_errors; // total deep and shallow scrub errors
+ int64_t num_objects_recovered;
+ int64_t num_bytes_recovered;
+ int64_t num_keys_recovered;
+ int64_t num_shallow_scrub_errors;
+ int64_t num_deep_scrub_errors;
+ int64_t num_objects_dirty;
+ int64_t num_whiteouts;
+ int64_t num_objects_omap;
+ int64_t num_objects_hit_set_archive;
+ int64_t num_objects_misplaced;
+ int64_t num_bytes_hit_set_archive;
+ int64_t num_flush;
+ int64_t num_flush_kb;
+ int64_t num_evict;
+ int64_t num_evict_kb;
+ int64_t num_promote;
+ int32_t num_flush_mode_high; // 1 when in high flush mode, otherwise 0
+ int32_t num_flush_mode_low; // 1 when in low flush mode, otherwise 0
+ int32_t num_evict_mode_some; // 1 when in evict some mode, otherwise 0
+ int32_t num_evict_mode_full; // 1 when in evict full mode, otherwise 0
+ int64_t num_objects_pinned;
+ int64_t num_objects_missing;
+ int64_t num_legacy_snapsets; ///< upper bound on pre-luminous-style SnapSets
+ int64_t num_large_omap_objects = 0;
+ int64_t num_objects_manifest = 0;
+ int64_t num_omap_bytes = 0;
+ int64_t num_omap_keys = 0;
+ int64_t num_objects_repaired = 0;
+
+ object_stat_sum_t()
+ : num_bytes(0),
+ num_objects(0), num_object_clones(0), num_object_copies(0),
+ num_objects_missing_on_primary(0), num_objects_degraded(0),
+ num_objects_unfound(0),
+ num_rd(0), num_rd_kb(0), num_wr(0), num_wr_kb(0),
+ num_scrub_errors(0),
+ num_objects_recovered(0),
+ num_bytes_recovered(0),
+ num_keys_recovered(0),
+ num_shallow_scrub_errors(0),
+ num_deep_scrub_errors(0),
+ num_objects_dirty(0),
+ num_whiteouts(0),
+ num_objects_omap(0),
+ num_objects_hit_set_archive(0),
+ num_objects_misplaced(0),
+ num_bytes_hit_set_archive(0),
+ num_flush(0),
+ num_flush_kb(0),
+ num_evict(0),
+ num_evict_kb(0),
+ num_promote(0),
+ num_flush_mode_high(0), num_flush_mode_low(0),
+ num_evict_mode_some(0), num_evict_mode_full(0),
+ num_objects_pinned(0),
+ num_objects_missing(0),
+ num_legacy_snapsets(0)
+ {}
+
+ void floor(int64_t f) {
+#define FLOOR(x) if (x < f) x = f
+ FLOOR(num_bytes);
+ FLOOR(num_objects);
+ FLOOR(num_object_clones);
+ FLOOR(num_object_copies);
+ FLOOR(num_objects_missing_on_primary);
+ FLOOR(num_objects_missing);
+ FLOOR(num_objects_degraded);
+ FLOOR(num_objects_misplaced);
+ FLOOR(num_objects_unfound);
+ FLOOR(num_rd);
+ FLOOR(num_rd_kb);
+ FLOOR(num_wr);
+ FLOOR(num_wr_kb);
+ FLOOR(num_large_omap_objects);
+ FLOOR(num_objects_manifest);
+ FLOOR(num_omap_bytes);
+ FLOOR(num_omap_keys);
+ FLOOR(num_shallow_scrub_errors);
+ FLOOR(num_deep_scrub_errors);
+ num_scrub_errors = num_shallow_scrub_errors + num_deep_scrub_errors;
+ FLOOR(num_objects_recovered);
+ FLOOR(num_bytes_recovered);
+ FLOOR(num_keys_recovered);
+ FLOOR(num_objects_dirty);
+ FLOOR(num_whiteouts);
+ FLOOR(num_objects_omap);
+ FLOOR(num_objects_hit_set_archive);
+ FLOOR(num_bytes_hit_set_archive);
+ FLOOR(num_flush);
+ FLOOR(num_flush_kb);
+ FLOOR(num_evict);
+ FLOOR(num_evict_kb);
+ FLOOR(num_promote);
+ FLOOR(num_flush_mode_high);
+ FLOOR(num_flush_mode_low);
+ FLOOR(num_evict_mode_some);
+ FLOOR(num_evict_mode_full);
+ FLOOR(num_objects_pinned);
+ FLOOR(num_legacy_snapsets);
+ FLOOR(num_objects_repaired);
+#undef FLOOR
+ }
+
+ void split(std::vector<object_stat_sum_t> &out) const {
+#define SPLIT(PARAM) \
+ for (unsigned i = 0; i < out.size(); ++i) { \
+ out[i].PARAM = PARAM / out.size(); \
+ if (i < (PARAM % out.size())) { \
+ out[i].PARAM++; \
+ } \
+ }
+#define SPLIT_PRESERVE_NONZERO(PARAM) \
+ for (unsigned i = 0; i < out.size(); ++i) { \
+ if (PARAM) \
+ out[i].PARAM = 1 + PARAM / out.size(); \
+ else \
+ out[i].PARAM = 0; \
+ }
+
+ SPLIT(num_bytes);
+ SPLIT(num_objects);
+ SPLIT(num_object_clones);
+ SPLIT(num_object_copies);
+ SPLIT(num_objects_missing_on_primary);
+ SPLIT(num_objects_missing);
+ SPLIT(num_objects_degraded);
+ SPLIT(num_objects_misplaced);
+ SPLIT(num_objects_unfound);
+ SPLIT(num_rd);
+ SPLIT(num_rd_kb);
+ SPLIT(num_wr);
+ SPLIT(num_wr_kb);
+ SPLIT(num_large_omap_objects);
+ SPLIT(num_objects_manifest);
+ SPLIT(num_omap_bytes);
+ SPLIT(num_omap_keys);
+ SPLIT(num_objects_repaired);
+ SPLIT_PRESERVE_NONZERO(num_shallow_scrub_errors);
+ SPLIT_PRESERVE_NONZERO(num_deep_scrub_errors);
+ for (unsigned i = 0; i < out.size(); ++i) {
+ out[i].num_scrub_errors = out[i].num_shallow_scrub_errors +
+ out[i].num_deep_scrub_errors;
+ }
+ SPLIT(num_objects_recovered);
+ SPLIT(num_bytes_recovered);
+ SPLIT(num_keys_recovered);
+ SPLIT(num_objects_dirty);
+ SPLIT(num_whiteouts);
+ SPLIT(num_objects_omap);
+ SPLIT(num_objects_hit_set_archive);
+ SPLIT(num_bytes_hit_set_archive);
+ SPLIT(num_flush);
+ SPLIT(num_flush_kb);
+ SPLIT(num_evict);
+ SPLIT(num_evict_kb);
+ SPLIT(num_promote);
+ SPLIT(num_flush_mode_high);
+ SPLIT(num_flush_mode_low);
+ SPLIT(num_evict_mode_some);
+ SPLIT(num_evict_mode_full);
+ SPLIT(num_objects_pinned);
+ SPLIT_PRESERVE_NONZERO(num_legacy_snapsets);
+#undef SPLIT
+#undef SPLIT_PRESERVE_NONZERO
+ }
+
+ void clear() {
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(this, 0, sizeof(*this));
+ }
+
+ void calc_copies(int nrep) {
+ num_object_copies = nrep * num_objects;
+ }
+
+ bool is_zero() const {
+ return mem_is_zero((char*)this, sizeof(*this));
+ }
+
+ void add(const object_stat_sum_t& o);
+ void sub(const object_stat_sum_t& o);
+
+ void dump(ceph::Formatter *f) const;
+ void padding_check() {
+ static_assert(
+ sizeof(object_stat_sum_t) ==
+ sizeof(num_bytes) +
+ sizeof(num_objects) +
+ sizeof(num_object_clones) +
+ sizeof(num_object_copies) +
+ sizeof(num_objects_missing_on_primary) +
+ sizeof(num_objects_degraded) +
+ sizeof(num_objects_unfound) +
+ sizeof(num_rd) +
+ sizeof(num_rd_kb) +
+ sizeof(num_wr) +
+ sizeof(num_wr_kb) +
+ sizeof(num_scrub_errors) +
+ sizeof(num_large_omap_objects) +
+ sizeof(num_objects_manifest) +
+ sizeof(num_omap_bytes) +
+ sizeof(num_omap_keys) +
+ sizeof(num_objects_repaired) +
+ sizeof(num_objects_recovered) +
+ sizeof(num_bytes_recovered) +
+ sizeof(num_keys_recovered) +
+ sizeof(num_shallow_scrub_errors) +
+ sizeof(num_deep_scrub_errors) +
+ sizeof(num_objects_dirty) +
+ sizeof(num_whiteouts) +
+ sizeof(num_objects_omap) +
+ sizeof(num_objects_hit_set_archive) +
+ sizeof(num_objects_misplaced) +
+ sizeof(num_bytes_hit_set_archive) +
+ sizeof(num_flush) +
+ sizeof(num_flush_kb) +
+ sizeof(num_evict) +
+ sizeof(num_evict_kb) +
+ sizeof(num_promote) +
+ sizeof(num_flush_mode_high) +
+ sizeof(num_flush_mode_low) +
+ sizeof(num_evict_mode_some) +
+ sizeof(num_evict_mode_full) +
+ sizeof(num_objects_pinned) +
+ sizeof(num_objects_missing) +
+ sizeof(num_legacy_snapsets)
+ ,
+ "object_stat_sum_t have padding");
+ }
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<object_stat_sum_t*>& o);
+};
+WRITE_CLASS_ENCODER(object_stat_sum_t)
+
+bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r);
+
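+ // Illustrative sketch (a hypothetical helper, not part of the interface
+ // above) of how split() distributes a parent's counters across child PGs:
+ // plain counters are divided evenly with the remainder going to the first
+ // children, the *_scrub_errors counters stay non-zero on every child if they
+ // were non-zero on the parent, and num_scrub_errors is recomputed from them.
+ inline void example_split_two_ways() {
+ object_stat_sum_t parent;
+ parent.num_objects = 7;
+ parent.num_deep_scrub_errors = 1;
+ std::vector<object_stat_sum_t> children(2);
+ parent.split(children);
+ // 7 objects split two ways: 4 + 3, remainder to the first child
+ ceph_assert(children[0].num_objects == 4 && children[1].num_objects == 3);
+ // the single deep scrub error is preserved on both children rather than
+ // halved away, and each child's num_scrub_errors is shallow + deep again
+ ceph_assert(children[0].num_deep_scrub_errors == 1);
+ ceph_assert(children[1].num_scrub_errors == 1);
+ }
+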
+/**
+ * a collection of object stat sums
+ *
+ * This is a collection of stat sums over different categories.
+ */
+struct object_stat_collection_t {
+ /**************************************************************************
+ * WARNING: be sure to update the operator== when adding/removing fields! *
+ **************************************************************************/
+ object_stat_sum_t sum;
+
+ void calc_copies(int nrep) {
+ sum.calc_copies(nrep);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ static void generate_test_instances(std::list<object_stat_collection_t*>& o);
+
+ bool is_zero() const {
+ return sum.is_zero();
+ }
+
+ void clear() {
+ sum.clear();
+ }
+
+ void floor(int64_t f) {
+ sum.floor(f);
+ }
+
+ void add(const object_stat_sum_t& o) {
+ sum.add(o);
+ }
+
+ void add(const object_stat_collection_t& o) {
+ sum.add(o.sum);
+ }
+ void sub(const object_stat_collection_t& o) {
+ sum.sub(o.sum);
+ }
+};
+WRITE_CLASS_ENCODER(object_stat_collection_t)
+
+inline bool operator==(const object_stat_collection_t& l,
+ const object_stat_collection_t& r) {
+ return l.sum == r.sum;
+}
+
+
+/** pg_stat
+ * aggregate stats for a single PG.
+ */
+struct pg_stat_t {
+ /**************************************************************************
+ * WARNING: be sure to update the operator== when adding/removing fields! *
+ **************************************************************************/
+ eversion_t version;
+ version_t reported_seq; // sequence number
+ epoch_t reported_epoch; // epoch of this report
+ uint64_t state;
+ utime_t last_fresh; // last reported
+ utime_t last_change; // new state != previous state
+ utime_t last_active; // state & PG_STATE_ACTIVE
+ utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
+ utime_t last_clean; // state & PG_STATE_CLEAN
+ utime_t last_unstale; // (state & PG_STATE_STALE) == 0
+ utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
+ utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
+
+ eversion_t log_start; // (log_start,version]
+ eversion_t ondisk_log_start; // there may be more on disk
+
+ epoch_t created;
+ epoch_t last_epoch_clean;
+ pg_t parent;
+ __u32 parent_split_bits;
+
+ eversion_t last_scrub;
+ eversion_t last_deep_scrub;
+ utime_t last_scrub_stamp;
+ utime_t last_deep_scrub_stamp;
+ utime_t last_clean_scrub_stamp;
+
+ object_stat_collection_t stats;
+
+ int64_t log_size;
+ int64_t ondisk_log_size; // >= active_log_size
+
+ std::vector<int32_t> up, acting;
+ std::vector<pg_shard_t> avail_no_missing;
+ std::map< std::set<pg_shard_t>, int32_t > object_location_counts;
+ epoch_t mapping_epoch;
+
+ std::vector<int32_t> blocked_by; ///< osds on which the pg is blocked
+
+ interval_set<snapid_t> purged_snaps; ///< recently removed snaps that we've purged
+
+ utime_t last_became_active;
+ utime_t last_became_peered;
+
+ /// up, acting primaries
+ int32_t up_primary;
+ int32_t acting_primary;
+
+ // snaptrimq.size() is 64bit, but let's be serious - anything over 50k is
+ // absurd already, so cap it at 2^31 and save 4 bytes at the same time
+ uint32_t snaptrimq_len;
+
+ bool stats_invalid:1;
+ /// true if num_objects_dirty is not accurate (because it was not
+ /// maintained starting from pool creation)
+ bool dirty_stats_invalid:1;
+ bool omap_stats_invalid:1;
+ bool hitset_stats_invalid:1;
+ bool hitset_bytes_stats_invalid:1;
+ bool pin_stats_invalid:1;
+ bool manifest_stats_invalid:1;
+
+ pg_stat_t()
+ : reported_seq(0),
+ reported_epoch(0),
+ state(0),
+ created(0), last_epoch_clean(0),
+ parent_split_bits(0),
+ log_size(0), ondisk_log_size(0),
+ mapping_epoch(0),
+ up_primary(-1),
+ acting_primary(-1),
+ snaptrimq_len(0),
+ stats_invalid(false),
+ dirty_stats_invalid(false),
+ omap_stats_invalid(false),
+ hitset_stats_invalid(false),
+ hitset_bytes_stats_invalid(false),
+ pin_stats_invalid(false),
+ manifest_stats_invalid(false)
+ { }
+
+ epoch_t get_effective_last_epoch_clean() const {
+ if (state & PG_STATE_CLEAN) {
+ // we are clean as of this report, and should thus take the
+ // reported epoch
+ return reported_epoch;
+ } else {
+ return last_epoch_clean;
+ }
+ }
+
+ std::pair<epoch_t, version_t> get_version_pair() const {
+ return { reported_epoch, reported_seq };
+ }
+
+ void floor(int64_t f) {
+ stats.floor(f);
+ if (log_size < f)
+ log_size = f;
+ if (ondisk_log_size < f)
+ ondisk_log_size = f;
+ if (snaptrimq_len < f)
+ snaptrimq_len = f;
+ }
+
+ void add_sub_invalid_flags(const pg_stat_t& o) {
+ // adding (or subtracting!) invalid stats renders our stats invalid too
+ stats_invalid |= o.stats_invalid;
+ dirty_stats_invalid |= o.dirty_stats_invalid;
+ omap_stats_invalid |= o.omap_stats_invalid;
+ hitset_stats_invalid |= o.hitset_stats_invalid;
+ hitset_bytes_stats_invalid |= o.hitset_bytes_stats_invalid;
+ pin_stats_invalid |= o.pin_stats_invalid;
+ manifest_stats_invalid |= o.manifest_stats_invalid;
+ }
+ void add(const pg_stat_t& o) {
+ stats.add(o.stats);
+ log_size += o.log_size;
+ ondisk_log_size += o.ondisk_log_size;
+ snaptrimq_len = std::min((uint64_t)snaptrimq_len + o.snaptrimq_len,
+ (uint64_t)(1ull << 31));
+ add_sub_invalid_flags(o);
+ }
+ void sub(const pg_stat_t& o) {
+ stats.sub(o.stats);
+ log_size -= o.log_size;
+ ondisk_log_size -= o.ondisk_log_size;
+ if (o.snaptrimq_len < snaptrimq_len) {
+ snaptrimq_len -= o.snaptrimq_len;
+ } else {
+ snaptrimq_len = 0;
+ }
+ add_sub_invalid_flags(o);
+ }
+
+ bool is_acting_osd(int32_t osd, bool primary) const;
+ void dump(ceph::Formatter *f) const;
+ void dump_brief(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<pg_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_stat_t)
+
+bool operator==(const pg_stat_t& l, const pg_stat_t& r);
+
+/** store_statfs_t
+ * ObjectStore full statfs information
+ */
+struct store_statfs_t
+{
+ uint64_t total = 0; ///< Total bytes
+ uint64_t available = 0; ///< Free bytes available
+ uint64_t internally_reserved = 0; ///< Bytes reserved for internal purposes
+
+ int64_t allocated = 0; ///< Bytes allocated by the store
+
+ int64_t data_stored = 0; ///< Bytes actually stored by the user
+ int64_t data_compressed = 0; ///< Bytes stored after compression
+ int64_t data_compressed_allocated = 0; ///< Bytes allocated for compressed data
+ int64_t data_compressed_original = 0; ///< Bytes that were compressed
+
+ int64_t omap_allocated = 0; ///< approx usage of omap data
+ int64_t internal_metadata = 0; ///< approx usage of internal metadata
+
+ void reset() {
+ *this = store_statfs_t();
+ }
+ void floor(int64_t f) {
+#define FLOOR(x) if (int64_t(x) < f) x = f
+ FLOOR(total);
+ FLOOR(available);
+ FLOOR(internally_reserved);
+ FLOOR(allocated);
+ FLOOR(data_stored);
+ FLOOR(data_compressed);
+ FLOOR(data_compressed_allocated);
+ FLOOR(data_compressed_original);
+
+ FLOOR(omap_allocated);
+ FLOOR(internal_metadata);
+#undef FLOOR
+ }
+
+ bool operator ==(const store_statfs_t& other) const;
+ bool is_zero() const {
+ return *this == store_statfs_t();
+ }
+
+ uint64_t get_used() const {
+ return total - available - internally_reserved;
+ }
+
+ // this includes both the space actually used and statfs's internally_reserved
+ uint64_t get_used_raw() const {
+ return total - available;
+ }
+
+ float get_used_raw_ratio() const {
+ if (total) {
+ return (float)get_used_raw() / (float)total;
+ } else {
+ return 0.0;
+ }
+ }
+
+ // helpers to ease legacy code porting
+ uint64_t kb_avail() const {
+ return available >> 10;
+ }
+ uint64_t kb() const {
+ return total >> 10;
+ }
+ uint64_t kb_used() const {
+ return (total - available - internally_reserved) >> 10;
+ }
+ uint64_t kb_used_raw() const {
+ return get_used_raw() >> 10;
+ }
+
+ uint64_t kb_used_data() const {
+ return allocated >> 10;
+ }
+ uint64_t kb_used_omap() const {
+ return omap_allocated >> 10;
+ }
+
+ uint64_t kb_used_internal_metadata() const {
+ return internal_metadata >> 10;
+ }
+
+ void add(const store_statfs_t& o) {
+ total += o.total;
+ available += o.available;
+ internally_reserved += o.internally_reserved;
+ allocated += o.allocated;
+ data_stored += o.data_stored;
+ data_compressed += o.data_compressed;
+ data_compressed_allocated += o.data_compressed_allocated;
+ data_compressed_original += o.data_compressed_original;
+ omap_allocated += o.omap_allocated;
+ internal_metadata += o.internal_metadata;
+ }
+ void sub(const store_statfs_t& o) {
+ total -= o.total;
+ available -= o.available;
+ internally_reserved -= o.internally_reserved;
+ allocated -= o.allocated;
+ data_stored -= o.data_stored;
+ data_compressed -= o.data_compressed;
+ data_compressed_allocated -= o.data_compressed_allocated;
+ data_compressed_original -= o.data_compressed_original;
+ omap_allocated -= o.omap_allocated;
+ internal_metadata -= o.internal_metadata;
+ }
+ void dump(ceph::Formatter *f) const;
+ DENC(store_statfs_t, v, p) {
+ DENC_START(1, 1, p);
+ denc(v.total, p);
+ denc(v.available, p);
+ denc(v.internally_reserved, p);
+ denc(v.allocated, p);
+ denc(v.data_stored, p);
+ denc(v.data_compressed, p);
+ denc(v.data_compressed_allocated, p);
+ denc(v.data_compressed_original, p);
+ denc(v.omap_allocated, p);
+ denc(v.internal_metadata, p);
+ DENC_FINISH(p);
+ }
+ static void generate_test_instances(std::list<store_statfs_t*>& o);
+};
+WRITE_CLASS_DENC(store_statfs_t)
+
+std::ostream &operator<<(std::ostream &lhs, const store_statfs_t &rhs);
+
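+ // Illustrative sketch (a hypothetical helper) of the store_statfs_t
+ // accounting above: get_used() excludes internally_reserved space while
+ // get_used_raw() does not, and the kb_*() helpers simply shift by 10.
+ // The byte counts are made up.
+ inline void example_store_statfs_usage() {
+ store_statfs_t s;
+ s.total = 100 << 20; // 100 MiB device
+ s.available = 60 << 20; // 60 MiB free
+ s.internally_reserved = 10 << 20; // 10 MiB kept back by the store
+ // used (30 MiB) is raw used (40 MiB) minus the internal reservation
+ ceph_assert(s.get_used() == s.get_used_raw() - s.internally_reserved);
+ ceph_assert(s.kb_used_raw() == s.get_used_raw() >> 10);
+ }
+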
+/** osd_stat
+ * aggregate stats for an osd
+ */
+struct osd_stat_t {
+ store_statfs_t statfs;
+ std::vector<int> hb_peers;
+ int32_t snap_trim_queue_len, num_snap_trimming;
+ uint64_t num_shards_repaired;
+
+ pow2_hist_t op_queue_age_hist;
+
+ objectstore_perf_stat_t os_perf_stat;
+ osd_alerts_t os_alerts;
+
+ epoch_t up_from = 0;
+ uint64_t seq = 0;
+
+ uint32_t num_pgs = 0;
+
+ uint32_t num_osds = 0;
+ uint32_t num_per_pool_osds = 0;
+ uint32_t num_per_pool_omap_osds = 0;
+
+ struct Interfaces {
+ uint32_t last_update; // in seconds
+ uint32_t back_pingtime[3];
+ uint32_t back_min[3];
+ uint32_t back_max[3];
+ uint32_t back_last;
+ uint32_t front_pingtime[3];
+ uint32_t front_min[3];
+ uint32_t front_max[3];
+ uint32_t front_last;
+ };
+ std::map<int, Interfaces> hb_pingtime; ///< map of osd id to Interfaces
+
+ osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
+ num_shards_repaired(0) {}
+
+ void add(const osd_stat_t& o) {
+ statfs.add(o.statfs);
+ snap_trim_queue_len += o.snap_trim_queue_len;
+ num_snap_trimming += o.num_snap_trimming;
+ num_shards_repaired += o.num_shards_repaired;
+ op_queue_age_hist.add(o.op_queue_age_hist);
+ os_perf_stat.add(o.os_perf_stat);
+ num_pgs += o.num_pgs;
+ num_osds += o.num_osds;
+ num_per_pool_osds += o.num_per_pool_osds;
+ num_per_pool_omap_osds += o.num_per_pool_omap_osds;
+ for (const auto& a : o.os_alerts) {
+ auto& target = os_alerts[a.first];
+ for (auto& i : a.second) {
+ target.emplace(i.first, i.second);
+ }
+ }
+ }
+ void sub(const osd_stat_t& o) {
+ statfs.sub(o.statfs);
+ snap_trim_queue_len -= o.snap_trim_queue_len;
+ num_snap_trimming -= o.num_snap_trimming;
+ num_shards_repaired -= o.num_shards_repaired;
+ op_queue_age_hist.sub(o.op_queue_age_hist);
+ os_perf_stat.sub(o.os_perf_stat);
+ num_pgs -= o.num_pgs;
+ num_osds -= o.num_osds;
+ num_per_pool_osds -= o.num_per_pool_osds;
+ num_per_pool_omap_osds -= o.num_per_pool_omap_osds;
+ for (const auto& a : o.os_alerts) {
+ auto& target = os_alerts[a.first];
+ for (auto& i : a.second) {
+ target.erase(i.first);
+ }
+ if (target.empty()) {
+ os_alerts.erase(a.first);
+ }
+ }
+ }
+ void dump(ceph::Formatter *f, bool with_net = true) const;
+ void dump_ping_time(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<osd_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(osd_stat_t)
+
+inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
+ return l.statfs == r.statfs &&
+ l.snap_trim_queue_len == r.snap_trim_queue_len &&
+ l.num_snap_trimming == r.num_snap_trimming &&
+ l.num_shards_repaired == r.num_shards_repaired &&
+ l.hb_peers == r.hb_peers &&
+ l.op_queue_age_hist == r.op_queue_age_hist &&
+ l.os_perf_stat == r.os_perf_stat &&
+ l.num_pgs == r.num_pgs &&
+ l.num_osds == r.num_osds &&
+ l.num_per_pool_osds == r.num_per_pool_osds &&
+ l.num_per_pool_omap_osds == r.num_per_pool_omap_osds;
+}
+inline bool operator!=(const osd_stat_t& l, const osd_stat_t& r) {
+ return !(l == r);
+}
+
+inline std::ostream& operator<<(std::ostream& out, const osd_stat_t& s) {
+ return out << "osd_stat(" << s.statfs << ", "
+ << "peers " << s.hb_peers
+ << " op hist " << s.op_queue_age_hist.h
+ << ")";
+}
+
+/*
+ * summation over an entire pool
+ */
+struct pool_stat_t {
+ object_stat_collection_t stats;
+ store_statfs_t store_stats;
+ int64_t log_size;
+ int64_t ondisk_log_size; // >= active_log_size
+ int32_t up; ///< number of up replicas or shards
+ int32_t acting; ///< number of acting replicas or shards
+ int32_t num_store_stats; ///< number of store_stats accumulated
+
+ pool_stat_t() : log_size(0), ondisk_log_size(0), up(0), acting(0),
+ num_store_stats(0)
+ { }
+
+ void floor(int64_t f) {
+ stats.floor(f);
+ store_stats.floor(f);
+ if (log_size < f)
+ log_size = f;
+ if (ondisk_log_size < f)
+ ondisk_log_size = f;
+ if (up < f)
+ up = f;
+ if (acting < f)
+ acting = f;
+ if (num_store_stats < f)
+ num_store_stats = f;
+ }
+
+ void add(const store_statfs_t& o) {
+ store_stats.add(o);
+ ++num_store_stats;
+ }
+ void sub(const store_statfs_t& o) {
+ store_stats.sub(o);
+ --num_store_stats;
+ }
+
+ void add(const pg_stat_t& o) {
+ stats.add(o.stats);
+ log_size += o.log_size;
+ ondisk_log_size += o.ondisk_log_size;
+ up += o.up.size();
+ acting += o.acting.size();
+ }
+ void sub(const pg_stat_t& o) {
+ stats.sub(o.stats);
+ log_size -= o.log_size;
+ ondisk_log_size -= o.ondisk_log_size;
+ up -= o.up.size();
+ acting -= o.acting.size();
+ }
+
+ bool is_zero() const {
+ return (stats.is_zero() &&
+ store_stats.is_zero() &&
+ log_size == 0 &&
+ ondisk_log_size == 0 &&
+ up == 0 &&
+ acting == 0 &&
+ num_store_stats == 0);
+ }
+
+ // helper accessors to retrieve used/netto bytes depending on the
+ // collection method: new per-pool objectstore report or legacy PG
+ // summation at OSD.
+ // In legacy mode the used and netto values are the same. With the new
+ // per-pool collection, 'used' is the amount of space ALLOCATED across all
+ // related OSDs and 'netto' is the amount of user data actually stored.
+ uint64_t get_allocated_data_bytes(bool per_pool) const {
+ if (per_pool) {
+ return store_stats.allocated;
+ } else {
+ // legacy mode, use numbers from 'stats'
+ return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
+ }
+ }
+ uint64_t get_allocated_omap_bytes(bool per_pool_omap) const {
+ if (per_pool_omap) {
+ return store_stats.omap_allocated;
+ } else {
+ // omap is not broken out per pool by nautilus-era bluestore; report the
+ // value gathered by scrub instead. this is imprecise in that it does not
+ // account for any storage overhead/efficiency.
+ return stats.sum.num_omap_bytes;
+ }
+ }
+ uint64_t get_user_data_bytes(float raw_used_rate, ///< space amp factor
+ bool per_pool) const {
+ // NOTE: we need the space amp factor so that we can work backwards from
+ // the raw utilization to the amount of data that the user actually stored.
+ if (per_pool) {
+ return raw_used_rate ? store_stats.data_stored / raw_used_rate : 0;
+ } else {
+ // legacy mode, use numbers from 'stats'. note that we do NOT use the
+ // raw_used_rate factor here because we are working from the PG stats
+ // directly.
+ return stats.sum.num_bytes + stats.sum.num_bytes_hit_set_archive;
+ }
+ }
+ uint64_t get_user_omap_bytes(float raw_used_rate, ///< space amp factor
+ bool per_pool_omap) const {
+ if (per_pool_omap) {
+ return raw_used_rate ? store_stats.omap_allocated / raw_used_rate : 0;
+ } else {
+ // omap usage is lazily reported during scrub; this value may lag.
+ return stats.sum.num_omap_bytes;
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ static void generate_test_instances(std::list<pool_stat_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pool_stat_t)
+
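+ // Illustrative sketch (hypothetical helper) of the per-pool vs. legacy
+ // accessors above: with per-pool stats we work backwards from raw usage via
+ // the space amplification factor, while legacy mode takes the PG-summed
+ // bytes directly. The 3.0 factor stands in for an assumed 3x replicated pool.
+ inline uint64_t example_user_bytes(const pool_stat_t& ps, bool per_pool) {
+ const float raw_used_rate = 3.0; // assumed replication factor
+ return ps.get_user_data_bytes(raw_used_rate, per_pool);
+ }
+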
+
+// -----------------------------------------
+
+/**
+ * pg_hit_set_info_t - information about a single recorded HitSet
+ *
+ * Track basic metadata about a HitSet, like the number of insertions
+ * and the time range it covers.
+ */
+struct pg_hit_set_info_t {
+ utime_t begin, end; ///< time interval
+ eversion_t version; ///< version this HitSet object was written
+ bool using_gmt; ///< use gmt for creating the hit_set archive object name
+
+ friend bool operator==(const pg_hit_set_info_t& l,
+ const pg_hit_set_info_t& r) {
+ return
+ l.begin == r.begin &&
+ l.end == r.end &&
+ l.version == r.version &&
+ l.using_gmt == r.using_gmt;
+ }
+
+ explicit pg_hit_set_info_t(bool using_gmt = true)
+ : using_gmt(using_gmt) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_hit_set_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_hit_set_info_t)
+
+/**
+ * pg_hit_set_history_t - information about a history of hitsets
+ *
+ * Include information about the currently accumulating hit set as well
+ * as archived/historical ones.
+ */
+struct pg_hit_set_history_t {
+ eversion_t current_last_update; ///< last version inserted into current set
+ std::list<pg_hit_set_info_t> history; ///< archived sets, sorted oldest -> newest
+
+ friend bool operator==(const pg_hit_set_history_t& l,
+ const pg_hit_set_history_t& r) {
+ return
+ l.current_last_update == r.current_last_update &&
+ l.history == r.history;
+ }
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_hit_set_history_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_hit_set_history_t)
+
+
+// -----------------------------------------
+
+/**
+ * pg_history_t - information about recent pg peering/mapping history
+ *
+ * This is aggressively shared between OSDs to bound the amount of past
+ * history they need to worry about.
+ */
+struct pg_history_t {
+ epoch_t epoch_created = 0; // epoch in which *pg* was created (pool or pg)
+ epoch_t epoch_pool_created = 0; // epoch in which *pool* was created
+ // (note: may be pg creation epoch for
+ // pre-luminous clusters)
+ epoch_t last_epoch_started = 0; // lower bound on last epoch started (anywhere, not necessarily locally)
+ // https://docs.ceph.com/docs/master/dev/osd_internals/last_epoch_started/
+ epoch_t last_interval_started = 0; // first epoch of last_epoch_started interval
+ epoch_t last_epoch_clean = 0; // lower bound on last epoch the PG was completely clean.
+ epoch_t last_interval_clean = 0; // first epoch of last_epoch_clean interval
+ epoch_t last_epoch_split = 0; // as parent or child
+ epoch_t last_epoch_marked_full = 0; // pool or cluster
+
+ /**
+ * In the event of a map discontinuity, same_*_since may reflect the first
+ * map the osd has seen in the new map sequence rather than the actual start
+ * of the interval. This is ok since a discontinuity at epoch e means there
+ * must have been a clean interval between e and now and that we cannot be
+ * in the active set during the interval containing e.
+ */
+ epoch_t same_up_since = 0; // same up set since
+ epoch_t same_interval_since = 0; // same acting AND up set since
+ epoch_t same_primary_since = 0; // same primary at least back through this epoch.
+
+ eversion_t last_scrub;
+ eversion_t last_deep_scrub;
+ utime_t last_scrub_stamp;
+ utime_t last_deep_scrub_stamp;
+ utime_t last_clean_scrub_stamp;
+
+ /// upper bound on how long prior interval readable (relative to encode time)
+ ceph::timespan prior_readable_until_ub = ceph::timespan::zero();
+
+ friend bool operator==(const pg_history_t& l, const pg_history_t& r) {
+ return
+ l.epoch_created == r.epoch_created &&
+ l.epoch_pool_created == r.epoch_pool_created &&
+ l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
+ l.last_epoch_clean == r.last_epoch_clean &&
+ l.last_interval_clean == r.last_interval_clean &&
+ l.last_epoch_split == r.last_epoch_split &&
+ l.last_epoch_marked_full == r.last_epoch_marked_full &&
+ l.same_up_since == r.same_up_since &&
+ l.same_interval_since == r.same_interval_since &&
+ l.same_primary_since == r.same_primary_since &&
+ l.last_scrub == r.last_scrub &&
+ l.last_deep_scrub == r.last_deep_scrub &&
+ l.last_scrub_stamp == r.last_scrub_stamp &&
+ l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
+ l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
+ l.prior_readable_until_ub == r.prior_readable_until_ub;
+ }
+
+ pg_history_t() {}
+ pg_history_t(epoch_t created, utime_t stamp)
+ : epoch_created(created),
+ epoch_pool_created(created),
+ same_up_since(created),
+ same_interval_since(created),
+ same_primary_since(created),
+ last_scrub_stamp(stamp),
+ last_deep_scrub_stamp(stamp),
+ last_clean_scrub_stamp(stamp) {}
+
+ bool merge(const pg_history_t &other) {
+ // Here, we only update the fields which cannot be calculated from the OSDmap.
+ bool modified = false;
+ if (epoch_created < other.epoch_created) {
+ epoch_created = other.epoch_created;
+ modified = true;
+ }
+ if (epoch_pool_created < other.epoch_pool_created) {
+ // FIXME: for jewel compat only; this should either be 0 or always the
+ // same value across all pg instances.
+ epoch_pool_created = other.epoch_pool_created;
+ modified = true;
+ }
+ if (last_epoch_started < other.last_epoch_started) {
+ last_epoch_started = other.last_epoch_started;
+ modified = true;
+ }
+ if (last_interval_started < other.last_interval_started) {
+ last_interval_started = other.last_interval_started;
+ // if we are learning about a newer *started* interval, our
+ // readable_until_ub is obsolete
+ prior_readable_until_ub = other.prior_readable_until_ub;
+ modified = true;
+ } else if (other.last_interval_started == last_interval_started &&
+ other.prior_readable_until_ub < prior_readable_until_ub) {
+ // if other covers the *same* interval, then pull our upper bound in
+ // if they have a tighter bound.
+ prior_readable_until_ub = other.prior_readable_until_ub;
+ modified = true;
+ }
+ if (last_epoch_clean < other.last_epoch_clean) {
+ last_epoch_clean = other.last_epoch_clean;
+ modified = true;
+ }
+ if (last_interval_clean < other.last_interval_clean) {
+ last_interval_clean = other.last_interval_clean;
+ modified = true;
+ }
+ if (last_epoch_split < other.last_epoch_split) {
+ last_epoch_split = other.last_epoch_split;
+ modified = true;
+ }
+ if (last_epoch_marked_full < other.last_epoch_marked_full) {
+ last_epoch_marked_full = other.last_epoch_marked_full;
+ modified = true;
+ }
+ if (other.last_scrub > last_scrub) {
+ last_scrub = other.last_scrub;
+ modified = true;
+ }
+ if (other.last_scrub_stamp > last_scrub_stamp) {
+ last_scrub_stamp = other.last_scrub_stamp;
+ modified = true;
+ }
+ if (other.last_deep_scrub > last_deep_scrub) {
+ last_deep_scrub = other.last_deep_scrub;
+ modified = true;
+ }
+ if (other.last_deep_scrub_stamp > last_deep_scrub_stamp) {
+ last_deep_scrub_stamp = other.last_deep_scrub_stamp;
+ modified = true;
+ }
+ if (other.last_clean_scrub_stamp > last_clean_scrub_stamp) {
+ last_clean_scrub_stamp = other.last_clean_scrub_stamp;
+ modified = true;
+ }
+ return modified;
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_history_t*>& o);
+
+ ceph::signedspan refresh_prior_readable_until_ub(
+ ceph::signedspan now, ///< now, relative to osd startup_time
+ ceph::signedspan ub) { ///< ub, relative to osd startup_time
+ if (now >= ub) {
+ // prior interval(s) are unreadable; we can zero the upper bound
+ prior_readable_until_ub = ceph::signedspan::zero();
+ return ceph::signedspan::zero();
+ } else {
+ prior_readable_until_ub = ub - now;
+ return ub;
+ }
+ }
+ ceph::signedspan get_prior_readable_until_ub(ceph::signedspan now) {
+ if (prior_readable_until_ub == ceph::signedspan::zero()) {
+ return ceph::signedspan::zero();
+ }
+ return now + prior_readable_until_ub;
+ }
+};
+WRITE_CLASS_ENCODER(pg_history_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_history_t& h) {
+ out << "ec=" << h.epoch_created << "/" << h.epoch_pool_created
+ << " lis/c=" << h.last_interval_started
+ << "/" << h.last_interval_clean
+ << " les/c/f=" << h.last_epoch_started << "/" << h.last_epoch_clean
+ << "/" << h.last_epoch_marked_full
+ << " sis=" << h.same_interval_since;
+ if (h.prior_readable_until_ub != ceph::timespan::zero()) {
+ out << " pruub=" << h.prior_readable_until_ub;
+ }
+ return out;
+}
+
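+ // Illustrative sketch (hypothetical helper) of the relative-time bookkeeping
+ // above: the upper bound is stored relative to "now" at refresh time, so it
+ // can later be re-based against a caller's own monotonic clock. Both inputs
+ // are assumed to be times relative to OSD startup, per the parameter notes.
+ inline ceph::signedspan example_prior_readable(pg_history_t& h,
+ ceph::signedspan now,
+ ceph::signedspan ub) {
+ // store ub relative to now, or zero it if it has already passed ...
+ h.refresh_prior_readable_until_ub(now, ub);
+ // ... then recover an absolute bound; zero means no prior interval is
+ // still readable
+ return h.get_prior_readable_until_ub(now);
+ }
+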
+
+/**
+ * pg_info_t - summary of PG statistics.
+ *
+ * some notes:
+ * - last_complete implies we have all objects that existed as of that
+ * stamp, OR a newer object, OR have already applied a later delete.
+ * - if last_complete >= log.tail, then we know pg contents thru log.head.
+ * otherwise, we have no idea what the pg is supposed to contain.
+ */
+struct pg_info_t {
+ spg_t pgid;
+ eversion_t last_update; ///< last object version applied to store.
+ eversion_t last_complete; ///< last version pg was complete through.
+ epoch_t last_epoch_started; ///< last epoch at which this pg started on this osd
+ epoch_t last_interval_started; ///< first epoch of last_epoch_started interval
+
+ version_t last_user_version; ///< last user object version applied to store
+
+ eversion_t log_tail; ///< oldest log entry.
+
+ hobject_t last_backfill; ///< objects >= this and < last_complete may be missing
+
+ interval_set<snapid_t> purged_snaps;
+
+ pg_stat_t stats;
+
+ pg_history_t history;
+ pg_hit_set_history_t hit_set;
+
+ friend bool operator==(const pg_info_t& l, const pg_info_t& r) {
+ return
+ l.pgid == r.pgid &&
+ l.last_update == r.last_update &&
+ l.last_complete == r.last_complete &&
+ l.last_epoch_started == r.last_epoch_started &&
+ l.last_interval_started == r.last_interval_started &&
+ l.last_user_version == r.last_user_version &&
+ l.log_tail == r.log_tail &&
+ l.last_backfill == r.last_backfill &&
+ l.purged_snaps == r.purged_snaps &&
+ l.stats == r.stats &&
+ l.history == r.history &&
+ l.hit_set == r.hit_set;
+ }
+
+ pg_info_t()
+ : last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
+ last_backfill(hobject_t::get_max())
+ { }
+ // cppcheck-suppress noExplicitConstructor
+ pg_info_t(spg_t p)
+ : pgid(p),
+ last_epoch_started(0),
+ last_interval_started(0),
+ last_user_version(0),
+ last_backfill(hobject_t::get_max())
+ { }
+
+ void set_last_backfill(hobject_t pos) {
+ last_backfill = pos;
+ }
+
+ bool is_empty() const { return last_update.version == 0; }
+ bool dne() const { return history.epoch_created == 0; }
+
+ bool has_missing() const { return last_complete != last_update; }
+ bool is_incomplete() const { return !last_backfill.is_max(); }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_info_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_info_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_info_t& pgi)
+{
+ out << pgi.pgid << "(";
+ if (pgi.dne())
+ out << " DNE";
+ if (pgi.is_empty())
+ out << " empty";
+ else {
+ out << " v " << pgi.last_update;
+ if (pgi.last_complete != pgi.last_update)
+ out << " lc " << pgi.last_complete;
+ out << " (" << pgi.log_tail << "," << pgi.last_update << "]";
+ }
+ if (pgi.is_incomplete())
+ out << " lb " << pgi.last_backfill;
+ //out << " c " << pgi.epoch_created;
+ out << " local-lis/les=" << pgi.last_interval_started
+ << "/" << pgi.last_epoch_started;
+ out << " n=" << pgi.stats.stats.sum.num_objects;
+ out << " " << pgi.history
+ << ")";
+ return out;
+}
+
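+ // Illustrative sketch (hypothetical helper) tying the notes above together:
+ // a PG copy whose last_complete has caught up with last_update has no
+ // missing objects, and one whose last_backfill is at the maximum hobject is
+ // no longer backfilling, i.e. not incomplete.
+ inline bool example_pg_copy_is_whole(const pg_info_t& info) {
+ return !info.has_missing() && !info.is_incomplete();
+ }
+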
+/**
+ * pg_fast_info_t - common pg_info_t fields
+ *
+ * These are the fields of pg_info_t (and children) that are updated for
+ * most IO operations.
+ *
+ * ** WARNING **
+ * Because we rely on these fields to be applied to the normal
+ * info struct, adding a new field here that is not also new in info
+ * means that we must set an incompat OSD feature bit!
+ */
+struct pg_fast_info_t {
+ eversion_t last_update;
+ eversion_t last_complete;
+ version_t last_user_version;
+ struct { // pg_stat_t stats
+ eversion_t version;
+ version_t reported_seq;
+ utime_t last_fresh;
+ utime_t last_active;
+ utime_t last_peered;
+ utime_t last_clean;
+ utime_t last_unstale;
+ utime_t last_undegraded;
+ utime_t last_fullsized;
+ int64_t log_size; // (also ondisk_log_size, which has the same value)
+ struct { // object_stat_collection_t stats;
+ struct { // object_stat_sum_t sum
+ int64_t num_bytes; // in bytes
+ int64_t num_objects;
+ int64_t num_object_copies;
+ int64_t num_rd;
+ int64_t num_rd_kb;
+ int64_t num_wr;
+ int64_t num_wr_kb;
+ int64_t num_objects_dirty;
+ } sum;
+ } stats;
+ } stats;
+
+ void populate_from(const pg_info_t& info) {
+ last_update = info.last_update;
+ last_complete = info.last_complete;
+ last_user_version = info.last_user_version;
+ stats.version = info.stats.version;
+ stats.reported_seq = info.stats.reported_seq;
+ stats.last_fresh = info.stats.last_fresh;
+ stats.last_active = info.stats.last_active;
+ stats.last_peered = info.stats.last_peered;
+ stats.last_clean = info.stats.last_clean;
+ stats.last_unstale = info.stats.last_unstale;
+ stats.last_undegraded = info.stats.last_undegraded;
+ stats.last_fullsized = info.stats.last_fullsized;
+ stats.log_size = info.stats.log_size;
+ stats.stats.sum.num_bytes = info.stats.stats.sum.num_bytes;
+ stats.stats.sum.num_objects = info.stats.stats.sum.num_objects;
+ stats.stats.sum.num_object_copies = info.stats.stats.sum.num_object_copies;
+ stats.stats.sum.num_rd = info.stats.stats.sum.num_rd;
+ stats.stats.sum.num_rd_kb = info.stats.stats.sum.num_rd_kb;
+ stats.stats.sum.num_wr = info.stats.stats.sum.num_wr;
+ stats.stats.sum.num_wr_kb = info.stats.stats.sum.num_wr_kb;
+ stats.stats.sum.num_objects_dirty = info.stats.stats.sum.num_objects_dirty;
+ }
+
+ bool try_apply_to(pg_info_t* info) {
+ if (last_update <= info->last_update)
+ return false;
+ info->last_update = last_update;
+ info->last_complete = last_complete;
+ info->last_user_version = last_user_version;
+ info->stats.version = stats.version;
+ info->stats.reported_seq = stats.reported_seq;
+ info->stats.last_fresh = stats.last_fresh;
+ info->stats.last_active = stats.last_active;
+ info->stats.last_peered = stats.last_peered;
+ info->stats.last_clean = stats.last_clean;
+ info->stats.last_unstale = stats.last_unstale;
+ info->stats.last_undegraded = stats.last_undegraded;
+ info->stats.last_fullsized = stats.last_fullsized;
+ info->stats.log_size = stats.log_size;
+ info->stats.ondisk_log_size = stats.log_size;
+ info->stats.stats.sum.num_bytes = stats.stats.sum.num_bytes;
+ info->stats.stats.sum.num_objects = stats.stats.sum.num_objects;
+ info->stats.stats.sum.num_object_copies = stats.stats.sum.num_object_copies;
+ info->stats.stats.sum.num_rd = stats.stats.sum.num_rd;
+ info->stats.stats.sum.num_rd_kb = stats.stats.sum.num_rd_kb;
+ info->stats.stats.sum.num_wr = stats.stats.sum.num_wr;
+ info->stats.stats.sum.num_wr_kb = stats.stats.sum.num_wr_kb;
+ info->stats.stats.sum.num_objects_dirty = stats.stats.sum.num_objects_dirty;
+ return true;
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(last_update, bl);
+ encode(last_complete, bl);
+ encode(last_user_version, bl);
+ encode(stats.version, bl);
+ encode(stats.reported_seq, bl);
+ encode(stats.last_fresh, bl);
+ encode(stats.last_active, bl);
+ encode(stats.last_peered, bl);
+ encode(stats.last_clean, bl);
+ encode(stats.last_unstale, bl);
+ encode(stats.last_undegraded, bl);
+ encode(stats.last_fullsized, bl);
+ encode(stats.log_size, bl);
+ encode(stats.stats.sum.num_bytes, bl);
+ encode(stats.stats.sum.num_objects, bl);
+ encode(stats.stats.sum.num_object_copies, bl);
+ encode(stats.stats.sum.num_rd, bl);
+ encode(stats.stats.sum.num_rd_kb, bl);
+ encode(stats.stats.sum.num_wr, bl);
+ encode(stats.stats.sum.num_wr_kb, bl);
+ encode(stats.stats.sum.num_objects_dirty, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(last_update, p);
+ decode(last_complete, p);
+ decode(last_user_version, p);
+ decode(stats.version, p);
+ decode(stats.reported_seq, p);
+ decode(stats.last_fresh, p);
+ decode(stats.last_active, p);
+ decode(stats.last_peered, p);
+ decode(stats.last_clean, p);
+ decode(stats.last_unstale, p);
+ decode(stats.last_undegraded, p);
+ decode(stats.last_fullsized, p);
+ decode(stats.log_size, p);
+ decode(stats.stats.sum.num_bytes, p);
+ decode(stats.stats.sum.num_objects, p);
+ decode(stats.stats.sum.num_object_copies, p);
+ decode(stats.stats.sum.num_rd, p);
+ decode(stats.stats.sum.num_rd_kb, p);
+ decode(stats.stats.sum.num_wr, p);
+ decode(stats.stats.sum.num_wr_kb, p);
+ decode(stats.stats.sum.num_objects_dirty, p);
+ DECODE_FINISH(p);
+ }
+};
+WRITE_CLASS_ENCODER(pg_fast_info_t)
+
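+ // Illustrative sketch (hypothetical helper) of the intended round trip: the
+ // writer captures the frequently-updated subset from a full pg_info_t, and a
+ // reader later applies it back, which only succeeds when the fast info is
+ // strictly newer than the target.
+ inline bool example_fast_info_roundtrip(const pg_info_t& current,
+ pg_info_t* on_disk) {
+ pg_fast_info_t fast;
+ fast.populate_from(current); // shrink to the per-IO subset
+ return fast.try_apply_to(on_disk); // false if on_disk is already as new
+ }
+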
+
+/**
+ * PastIntervals -- information needed to determine the PriorSet and
+ * the might_have_unfound set
+ */
+class PastIntervals {
+#ifdef WITH_SEASTAR
+ using OSDMapRef = boost::local_shared_ptr<const OSDMap>;
+#else
+ using OSDMapRef = std::shared_ptr<const OSDMap>;
+#endif
+public:
+ struct pg_interval_t {
+ std::vector<int32_t> up, acting;
+ epoch_t first, last;
+ bool maybe_went_rw;
+ int32_t primary;
+ int32_t up_primary;
+
+ pg_interval_t()
+ : first(0), last(0),
+ maybe_went_rw(false),
+ primary(-1),
+ up_primary(-1)
+ {}
+
+ pg_interval_t(
+ std::vector<int32_t> &&up,
+ std::vector<int32_t> &&acting,
+ epoch_t first,
+ epoch_t last,
+ bool maybe_went_rw,
+ int32_t primary,
+ int32_t up_primary)
+ : up(up), acting(acting), first(first), last(last),
+ maybe_went_rw(maybe_went_rw), primary(primary), up_primary(up_primary)
+ {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_interval_t*>& o);
+ };
+
+ PastIntervals();
+ PastIntervals(PastIntervals &&rhs) = default;
+ PastIntervals &operator=(PastIntervals &&rhs) = default;
+
+ PastIntervals(const PastIntervals &rhs);
+ PastIntervals &operator=(const PastIntervals &rhs);
+
+ class interval_rep {
+ public:
+ virtual size_t size() const = 0;
+ virtual bool empty() const = 0;
+ virtual void clear() = 0;
+ virtual std::pair<epoch_t, epoch_t> get_bounds() const = 0;
+ virtual std::set<pg_shard_t> get_all_participants(
+ bool ec_pool) const = 0;
+ virtual void add_interval(bool ec_pool, const pg_interval_t &interval) = 0;
+ virtual std::unique_ptr<interval_rep> clone() const = 0;
+ virtual std::ostream &print(std::ostream &out) const = 0;
+ virtual void encode(ceph::buffer::list &bl) const = 0;
+ virtual void decode(ceph::buffer::list::const_iterator &bl) = 0;
+ virtual void dump(ceph::Formatter *f) const = 0;
+ virtual void iterate_mayberw_back_to(
+ epoch_t les,
+ std::function<void(epoch_t, const std::set<pg_shard_t> &)> &&f) const = 0;
+
+ virtual bool has_full_intervals() const { return false; }
+ virtual void iterate_all_intervals(
+ std::function<void(const pg_interval_t &)> &&f) const {
+ ceph_assert(!has_full_intervals());
+ ceph_abort_msg("not valid for this implementation");
+ }
+ virtual void adjust_start_backwards(epoch_t last_epoch_clean) = 0;
+
+ virtual ~interval_rep() {}
+ };
+ friend class pi_compact_rep;
+private:
+
+ std::unique_ptr<interval_rep> past_intervals;
+
+ explicit PastIntervals(interval_rep *rep) : past_intervals(rep) {}
+
+public:
+ void add_interval(bool ec_pool, const pg_interval_t &interval) {
+ ceph_assert(past_intervals);
+ return past_intervals->add_interval(ec_pool, interval);
+ }
+
+ void encode(ceph::buffer::list &bl) const {
+ ENCODE_START(1, 1, bl);
+ if (past_intervals) {
+ __u8 type = 2;
+ encode(type, bl);
+ past_intervals->encode(bl);
+ } else {
+ encode((__u8)0, bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+ void dump(ceph::Formatter *f) const {
+ ceph_assert(past_intervals);
+ past_intervals->dump(f);
+ }
+ static void generate_test_instances(std::list<PastIntervals *> & o);
+
+ /**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary,
+ int new_acting_primary,
+ const std::vector<int> &old_acting,
+ const std::vector<int> &new_acting,
+ int old_up_primary,
+ int new_up_primary,
+ const std::vector<int> &old_up,
+ const std::vector<int> &new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ unsigned old_pg_num,
+ unsigned new_pg_num,
+ unsigned old_pg_num_pending,
+ unsigned new_pg_num_pending,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ uint32_t old_crush_count,
+ uint32_t new_crush_count,
+ uint32_t old_crush_target,
+ uint32_t new_crush_target,
+ uint32_t old_crush_barrier,
+ uint32_t new_crush_barrier,
+ int32_t old_crush_member,
+ int32_t new_crush_member,
+ pg_t pgid
+ );
+
+ /**
+ * Determines whether there is an interval change
+ */
+ static bool is_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ const OSDMap *osdmap, ///< [in] current map
+ const OSDMap *lastmap, ///< [in] last map
+ pg_t pgid ///< [in] pgid for pg
+ );
+
+ /**
+ * Integrates a new map into *past_intervals, returns true
+ * if an interval was closed out.
+ */
+ static bool check_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ epoch_t same_interval_since, ///< [in] as of osdmap
+ epoch_t last_epoch_clean, ///< [in] current
+ const OSDMap *osdmap, ///< [in] current map
+ const OSDMap *lastmap, ///< [in] last map
+ pg_t pgid, ///< [in] pgid for pg
+ const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+ PastIntervals *past_intervals, ///< [out] intervals
+ std::ostream *out = 0 ///< [out] debug ostream
+ );
+ static bool check_new_interval(
+ int old_acting_primary, ///< [in] primary as of lastmap
+ int new_acting_primary, ///< [in] primary as of osdmap
+ const std::vector<int> &old_acting, ///< [in] acting as of lastmap
+ const std::vector<int> &new_acting, ///< [in] acting as of osdmap
+ int old_up_primary, ///< [in] up primary of lastmap
+ int new_up_primary, ///< [in] up primary of osdmap
+ const std::vector<int> &old_up, ///< [in] up as of lastmap
+ const std::vector<int> &new_up, ///< [in] up as of osdmap
+ epoch_t same_interval_since, ///< [in] as of osdmap
+ epoch_t last_epoch_clean, ///< [in] current
+ OSDMapRef osdmap, ///< [in] current map
+ OSDMapRef lastmap, ///< [in] last map
+ pg_t pgid, ///< [in] pgid for pg
+ const IsPGRecoverablePredicate &could_have_gone_active, ///< [in] predicate whether the pg can be active
+ PastIntervals *past_intervals, ///< [out] intervals
+ std::ostream *out = 0 ///< [out] debug ostream
+ ) {
+ return check_new_interval(
+ old_acting_primary, new_acting_primary,
+ old_acting, new_acting,
+ old_up_primary, new_up_primary,
+ old_up, new_up,
+ same_interval_since, last_epoch_clean,
+ osdmap.get(), lastmap.get(),
+ pgid,
+ could_have_gone_active,
+ past_intervals,
+ out);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
+
+ template <typename F>
+ void iterate_mayberw_back_to(
+ epoch_t les,
+ F &&f) const {
+ ceph_assert(past_intervals);
+ past_intervals->iterate_mayberw_back_to(les, std::forward<F>(f));
+ }
+ void clear() {
+ ceph_assert(past_intervals);
+ past_intervals->clear();
+ }
+
+ /**
+ * Should return a value which gives an indication of the amount
+ * of state contained
+ */
+ size_t size() const {
+ ceph_assert(past_intervals);
+ return past_intervals->size();
+ }
+
+ bool empty() const {
+ ceph_assert(past_intervals);
+ return past_intervals->empty();
+ }
+
+ void swap(PastIntervals &other) {
+ using std::swap;
+ swap(other.past_intervals, past_intervals);
+ }
+
+ /**
+ * Return all shards which have been in the acting set back to the
+ * latest epoch to which we have trimmed except for pg_whoami
+ */
+ std::set<pg_shard_t> get_might_have_unfound(
+ pg_shard_t pg_whoami,
+ bool ec_pool) const {
+ ceph_assert(past_intervals);
+ auto ret = past_intervals->get_all_participants(ec_pool);
+ ret.erase(pg_whoami);
+ return ret;
+ }
+
+ /**
+ * Return all shards which we might want to talk to for peering
+ */
+ std::set<pg_shard_t> get_all_probe(
+ bool ec_pool) const {
+ ceph_assert(past_intervals);
+ return past_intervals->get_all_participants(ec_pool);
+ }
+
+ /* Return the set of epochs [start, end) represented by the
+ * past_interval set.
+ */
+ std::pair<epoch_t, epoch_t> get_bounds() const {
+ ceph_assert(past_intervals);
+ return past_intervals->get_bounds();
+ }
+
+ void adjust_start_backwards(epoch_t last_epoch_clean) {
+ ceph_assert(past_intervals);
+ past_intervals->adjust_start_backwards(last_epoch_clean);
+ }
+
+ enum osd_state_t {
+ UP,
+ DOWN,
+ DNE,
+ LOST
+ };
+ struct PriorSet {
+ bool ec_pool = false;
+ std::set<pg_shard_t> probe; ///< current+prior OSDs we need to probe.
+ std::set<int> down; ///< down osds that would normally be in @a probe and might be interesting.
+ std::map<int, epoch_t> blocked_by; ///< current lost_at values for any OSDs in cur set for which (re)marking them lost would affect cur set
+
+ bool pg_down = false; ///< some down osds are included in @a cur; the DOWN pg state bit should be set.
+ const IsPGRecoverablePredicate* pcontdec = nullptr;
+
+ PriorSet() = default;
+ PriorSet(PriorSet &&) = default;
+ PriorSet &operator=(PriorSet &&) = default;
+
+ PriorSet &operator=(const PriorSet &) = delete;
+ PriorSet(const PriorSet &) = delete;
+
+ bool operator==(const PriorSet &rhs) const {
+ return (ec_pool == rhs.ec_pool) &&
+ (probe == rhs.probe) &&
+ (down == rhs.down) &&
+ (blocked_by == rhs.blocked_by) &&
+ (pg_down == rhs.pg_down);
+ }
+
+ bool affected_by_map(
+ const OSDMap &osdmap,
+ const DoutPrefixProvider *dpp) const;
+
+ // For verifying tests
+ PriorSet(
+ bool ec_pool,
+ std::set<pg_shard_t> probe,
+ std::set<int> down,
+ std::map<int, epoch_t> blocked_by,
+ bool pg_down,
+ const IsPGRecoverablePredicate *pcontdec)
+ : ec_pool(ec_pool), probe(probe), down(down), blocked_by(blocked_by),
+ pg_down(pg_down), pcontdec(pcontdec) {}
+
+ private:
+ template <typename F>
+ PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ const IsPGRecoverablePredicate *c,
+ F f,
+ const std::vector<int> &up,
+ const std::vector<int> &acting,
+ const DoutPrefixProvider *dpp);
+
+ friend class PastIntervals;
+ };
+
+ template <typename... Args>
+ PriorSet get_prior_set(Args&&... args) const {
+ return PriorSet(*this, std::forward<Args>(args)...);
+ }
+};
+WRITE_CLASS_ENCODER(PastIntervals)
+
+std::ostream& operator<<(std::ostream& out, const PastIntervals::pg_interval_t& i);
+std::ostream& operator<<(std::ostream& out, const PastIntervals &i);
+std::ostream& operator<<(std::ostream& out, const PastIntervals::PriorSet &i);
+
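+ // Illustrative sketch (hypothetical helper, e.g. for a unit test) of feeding
+ // PastIntervals by hand: each closed interval records who was up/acting, its
+ // epoch range, and whether it may have gone read-write; peering later
+ // consults this via get_all_probe()/get_might_have_unfound(). The osd ids
+ // and epochs below are made up.
+ inline void example_record_interval(PastIntervals& pi) {
+ PastIntervals::pg_interval_t interval(
+ {0, 1, 2}, // up
+ {0, 1, 2}, // acting
+ 10, 19, // first and last epoch of the interval
+ true, // maybe_went_rw
+ 0, // primary
+ 0); // up_primary
+ pi.add_interval(false /* not an EC pool */, interval);
+ }
+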
+template <typename F>
+PastIntervals::PriorSet::PriorSet(
+ const PastIntervals &past_intervals,
+ bool ec_pool,
+ epoch_t last_epoch_started,
+ const IsPGRecoverablePredicate *c,
+ F f,
+ const std::vector<int> &up,
+ const std::vector<int> &acting,
+ const DoutPrefixProvider *dpp)
+ : ec_pool(ec_pool), pg_down(false), pcontdec(c)
+{
+ /*
+ * We have to be careful to deal gracefully with situations like the
+ * following. Say we have a power outage or something that takes out both
+ * OSDs, but the monitor doesn't mark them down in the same epoch.
+ * The history may look like
+ *
+ * 1: A B
+ * 2: B
+ * 3: let's say B dies for good, too (say, from the power spike)
+ * 4: A
+ *
+ * which makes it look like B may have applied updates to the PG
+ * that we need in order to proceed. This sucks...
+ *
+ * To minimize the risk of this happening, we CANNOT go active if
+ * _any_ OSDs in the prior set are down until we send an MOSDAlive
+ * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
+ * Then, we have something like
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3:
+ * 4: A
+ *
+ * -> we can ignore B, because it couldn't have gone active (up_thru
+ * still 0).
+ *
+ * or,
+ *
+ * 1: A B
+ * 2: B up_thru[B]=0
+ * 3: B up_thru[B]=2
+ * 4:
+ * 5: A
+ *
+ * -> we must wait for B, because it was alive through 2, and could have
+ * written to the pg.
+ *
+ * If B is really dead, then an administrator will need to manually
+ * intervene by marking the OSD as "lost."
+ */
+
+ // Include current acting and up nodes... not because they may
+ // contain old data (this interval hasn't gone active, obviously),
+ // but because we want their pg_info to inform choose_acting(), and
+ // so that we know what they do/do not have explicitly before
+ // sending them any new info/logs/whatever.
+ for (unsigned i = 0; i < acting.size(); i++) {
+ if (acting[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
+ probe.insert(pg_shard_t(acting[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+ // It may be possible to exclude the up nodes, but let's keep them in
+ // there for now.
+ for (unsigned i = 0; i < up.size(); i++) {
+ if (up[i] != pg_pool_t::pg_CRUSH_ITEM_NONE)
+ probe.insert(pg_shard_t(up[i], ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
+ }
+
+ std::set<pg_shard_t> all_probe = past_intervals.get_all_probe(ec_pool);
+ ldpp_dout(dpp, 10) << "build_prior all_probe " << all_probe << dendl;
+ for (auto &&i: all_probe) {
+ switch (f(0, i.osd, nullptr)) {
+ case UP: {
+ probe.insert(i);
+ break;
+ }
+ case DNE:
+ case LOST:
+ case DOWN: {
+ down.insert(i.osd);
+ break;
+ }
+ }
+ }
+
+ past_intervals.iterate_mayberw_back_to(
+ last_epoch_started,
+ [&](epoch_t start, const std::set<pg_shard_t> &acting) {
+ ldpp_dout(dpp, 10) << "build_prior maybe_rw interval:" << start
+ << ", acting: " << acting << dendl;
+
+ // look at candidate osds during this interval. each falls into
+ // one of three categories: up, down (but potentially
+ // interesting), or lost (down, but we won't wait for it).
+ std::set<pg_shard_t> up_now;
+ std::map<int, epoch_t> candidate_blocked_by;
+ // any candidates down now (that might have useful data)
+ bool any_down_now = false;
+
+ // consider ACTING osds
+ for (auto &&so: acting) {
+ epoch_t lost_at = 0;
+ switch (f(start, so.osd, &lost_at)) {
+ case UP: {
+ // include past acting osds if they are up.
+ up_now.insert(so);
+ break;
+ }
+ case DNE: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " no longer exists" << dendl;
+ break;
+ }
+ case LOST: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down, but lost_at " << lost_at << dendl;
+ up_now.insert(so);
+ break;
+ }
+ case DOWN: {
+ ldpp_dout(dpp, 10) << "build_prior prior osd." << so.osd
+ << " is down" << dendl;
+ candidate_blocked_by[so.osd] = lost_at;
+ any_down_now = true;
+ break;
+ }
+ }
+ }
+
+ // if not enough osds survived this interval, and we may have gone rw,
+ // then we need to wait for one of those osds to recover to
+ // ensure that we haven't lost any information.
+ if (!(*pcontdec)(up_now) && any_down_now) {
+ // fixme: how do we identify a "clean" shutdown anyway?
+ ldpp_dout(dpp, 10) << "build_prior possibly went active+rw,"
+ << " insufficient up; including down osds" << dendl;
+ ceph_assert(!candidate_blocked_by.empty());
+ pg_down = true;
+ blocked_by.insert(
+ candidate_blocked_by.begin(),
+ candidate_blocked_by.end());
+ }
+ });
+
+ ldpp_dout(dpp, 10) << "build_prior final: probe " << probe
+ << " down " << down
+ << " blocked_by " << blocked_by
+ << (pg_down ? " pg_down":"")
+ << dendl;
+}
+
+struct pg_notify_t {
+ epoch_t query_epoch;
+ epoch_t epoch_sent;
+ pg_info_t info;
+ shard_id_t to;
+ shard_id_t from;
+ PastIntervals past_intervals;
+ pg_notify_t() :
+ query_epoch(0), epoch_sent(0), to(shard_id_t::NO_SHARD),
+ from(shard_id_t::NO_SHARD) {}
+ pg_notify_t(
+ shard_id_t to,
+ shard_id_t from,
+ epoch_t query_epoch,
+ epoch_t epoch_sent,
+ const pg_info_t &info,
+ const PastIntervals& pi)
+ : query_epoch(query_epoch),
+ epoch_sent(epoch_sent),
+ info(info), to(to), from(from),
+ past_intervals(pi) {
+ ceph_assert(from == info.pgid.shard);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &p);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_notify_t*> &o);
+};
+WRITE_CLASS_ENCODER(pg_notify_t)
+std::ostream &operator<<(std::ostream &lhs, const pg_notify_t &notify);
+
+
+/**
+ * pg_query_t - used to ask a peer for information about a pg.
+ *
+ * note: if version=0, type=LOG, then we just provide our full log.
+ */
+struct pg_query_t {
+ enum {
+ INFO = 0,
+ LOG = 1,
+ MISSING = 4,
+ FULLLOG = 5,
+ };
+ std::string_view get_type_name() const {
+ switch (type) {
+ case INFO: return "info";
+ case LOG: return "log";
+ case MISSING: return "missing";
+ case FULLLOG: return "fulllog";
+ default: return "???";
+ }
+ }
+
+ __s32 type;
+ eversion_t since;
+ pg_history_t history;
+ epoch_t epoch_sent;
+ shard_id_t to;
+ shard_id_t from;
+
+ pg_query_t() : type(-1), epoch_sent(0), to(shard_id_t::NO_SHARD),
+ from(shard_id_t::NO_SHARD) {}
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
+ : type(t),
+ history(h),
+ epoch_sent(epoch_sent),
+ to(to), from(from) {
+ ceph_assert(t != LOG);
+ }
+ pg_query_t(
+ int t,
+ shard_id_t to,
+ shard_id_t from,
+ eversion_t s,
+ const pg_history_t& h,
+ epoch_t epoch_sent)
+ : type(t), since(s), history(h),
+ epoch_sent(epoch_sent), to(to), from(from) {
+ ceph_assert(t == LOG);
+ }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_query_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_query_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_query_t& q) {
+ out << "query(" << q.get_type_name() << " " << q.since;
+ if (q.type == pg_query_t::LOG)
+ out << " " << q.history;
+ out << " epoch_sent " << q.epoch_sent;
+ out << ")";
+ return out;
+}
+
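+ // Illustrative sketch (hypothetical helpers) of building the two query
+ // flavours above: non-LOG queries use the history-only constructor, while a
+ // LOG query must also carry the 'since' version (the asserts enforce this).
+ // NO_SHARD is used here on the assumption of a replicated pool; EC pools
+ // would name real shards.
+ inline pg_query_t example_info_query(const pg_history_t& h, epoch_t sent) {
+ return pg_query_t(pg_query_t::INFO, shard_id_t::NO_SHARD,
+ shard_id_t::NO_SHARD, h, sent);
+ }
+ inline pg_query_t example_log_query(eversion_t since, const pg_history_t& h,
+ epoch_t sent) {
+ return pg_query_t(pg_query_t::LOG, shard_id_t::NO_SHARD,
+ shard_id_t::NO_SHARD, since, h, sent);
+ }
+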
+/**
+ * pg_lease_t - readable lease metadata, from primary -> non-primary
+ *
+ * This metadata serves to increase either or both of the lease expiration
+ * and upper bound on the non-primary.
+ */
+struct pg_lease_t {
+ /// pg readable_until value; replicas must not be readable beyond this
+ ceph::signedspan readable_until = ceph::signedspan::zero();
+
+ /// upper bound on any acting osd's readable_until
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ /// duration of the lease (in case clock deltas aren't available)
+ ceph::signedspan interval = ceph::signedspan::zero();
+
+ pg_lease_t() {}
+ pg_lease_t(ceph::signedspan ru, ceph::signedspan ruub,
+ ceph::signedspan i)
+ : readable_until(ru),
+ readable_until_ub(ruub),
+ interval(i) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_lease_t*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_lease_t& l) {
+ return out << "pg_lease(ru " << l.readable_until
+ << " ub " << l.readable_until_ub
+ << " int " << l.interval << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pg_lease_t)
+
+/**
+ * pg_lease_ack_t - lease ack, from non-primary -> primary
+ *
+ * This metadata acknowledges to the primary what a non-primary's noted
+ * upper bound is.
+ */
+struct pg_lease_ack_t {
+ /// highest upper bound non-primary has recorded (primary's clock)
+ ceph::signedspan readable_until_ub = ceph::signedspan::zero();
+
+ pg_lease_ack_t() {}
+ pg_lease_ack_t(ceph::signedspan ub)
+ : readable_until_ub(ub) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_lease_ack_t*>& o);
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_lease_ack_t& l) {
+ return out << "pg_lease_ack(ruub " << l.readable_until_ub << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pg_lease_ack_t)
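+
+// Illustrative sketch (assumed values, not upstream code): the primary hands
+// out a lease covering `lease_interval` from `now`, and a non-primary replies
+// with the upper bound it recorded, as a pg_lease_ack_t.
+inline pg_lease_ack_t example_lease_roundtrip(ceph::signedspan now,
+                                              ceph::signedspan lease_interval) {
+  pg_lease_t lease(now + lease_interval,  // readable_until
+                   now + lease_interval,  // readable_until_ub
+                   lease_interval);       // interval, for peers without clock deltas
+  // the non-primary notes lease.readable_until_ub and acknowledges it
+  return pg_lease_ack_t(lease.readable_until_ub);
+}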
+
+
+
+class PGBackend;
+class ObjectModDesc {
+ bool can_local_rollback;
+ bool rollback_info_completed;
+
+ // version required to decode, reflected in encode/decode version
+ __u8 max_required_version = 1;
+public:
+ class Visitor {
+ public:
+ virtual void append(uint64_t old_offset) {}
+ virtual void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) {}
+ virtual void rmobject(version_t old_version) {}
+ /**
+     * Used to support the unfound_lost_delete log event: if the stashed
+     * version exists, we unstash it; otherwise, we do nothing. This way
+     * each replica rolls back to whatever state it had prior to the
+     * mark_unfound_lost delete attempt.
+     */
+ virtual void try_rmobject(version_t old_version) {
+ rmobject(old_version);
+ }
+ virtual void create() {}
+ virtual void update_snaps(const std::set<snapid_t> &old_snaps) {}
+ virtual void rollback_extents(
+ version_t gen,
+ const std::vector<std::pair<uint64_t, uint64_t> > &extents) {}
+ virtual ~Visitor() {}
+ };
+ void visit(Visitor *visitor) const;
+ mutable ceph::buffer::list bl;
+ enum ModID {
+ APPEND = 1,
+ SETATTRS = 2,
+ DELETE = 3,
+ CREATE = 4,
+ UPDATE_SNAPS = 5,
+ TRY_DELETE = 6,
+ ROLLBACK_EXTENTS = 7
+ };
+ ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {
+ bl.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ void claim(ObjectModDesc &other) {
+ bl = std::move(other.bl);
+ can_local_rollback = other.can_local_rollback;
+ rollback_info_completed = other.rollback_info_completed;
+ }
+ void claim_append(ObjectModDesc &other) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ if (!other.can_local_rollback) {
+ mark_unrollbackable();
+ return;
+ }
+ bl.claim_append(other.bl);
+ rollback_info_completed = other.rollback_info_completed;
+ }
+ void swap(ObjectModDesc &other) {
+ bl.swap(other.bl);
+
+ using std::swap;
+ swap(other.can_local_rollback, can_local_rollback);
+ swap(other.rollback_info_completed, rollback_info_completed);
+ swap(other.max_required_version, max_required_version);
+ }
+ void append_id(ModID id) {
+ using ceph::encode;
+ uint8_t _id(id);
+ encode(_id, bl);
+ }
+ void append(uint64_t old_size) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(APPEND);
+ encode(old_size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &old_attrs) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(SETATTRS);
+ encode(old_attrs, bl);
+ ENCODE_FINISH(bl);
+ }
+ bool rmobject(version_t deletion_version) {
+ if (!can_local_rollback || rollback_info_completed)
+ return false;
+ ENCODE_START(1, 1, bl);
+ append_id(DELETE);
+ encode(deletion_version, bl);
+ ENCODE_FINISH(bl);
+ rollback_info_completed = true;
+ return true;
+ }
+ bool try_rmobject(version_t deletion_version) {
+ if (!can_local_rollback || rollback_info_completed)
+ return false;
+ ENCODE_START(1, 1, bl);
+ append_id(TRY_DELETE);
+ encode(deletion_version, bl);
+ ENCODE_FINISH(bl);
+ rollback_info_completed = true;
+ return true;
+ }
+ void create() {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ rollback_info_completed = true;
+ ENCODE_START(1, 1, bl);
+ append_id(CREATE);
+ ENCODE_FINISH(bl);
+ }
+ void update_snaps(const std::set<snapid_t> &old_snaps) {
+ if (!can_local_rollback || rollback_info_completed)
+ return;
+ ENCODE_START(1, 1, bl);
+ append_id(UPDATE_SNAPS);
+ encode(old_snaps, bl);
+ ENCODE_FINISH(bl);
+ }
+ void rollback_extents(
+ version_t gen, const std::vector<std::pair<uint64_t, uint64_t> > &extents) {
+ ceph_assert(can_local_rollback);
+ ceph_assert(!rollback_info_completed);
+ if (max_required_version < 2)
+ max_required_version = 2;
+ ENCODE_START(2, 2, bl);
+ append_id(ROLLBACK_EXTENTS);
+ encode(gen, bl);
+ encode(extents, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ // cannot be rolled back
+ void mark_unrollbackable() {
+ can_local_rollback = false;
+ bl.clear();
+ }
+ bool can_rollback() const {
+ return can_local_rollback;
+ }
+ bool empty() const {
+ return can_local_rollback && (bl.length() == 0);
+ }
+
+ bool requires_kraken() const {
+ return max_required_version >= 2;
+ }
+
+ /**
+ * Create fresh copy of bl bytes to avoid keeping large buffers around
+ * in the case that bl contains ptrs which point into a much larger
+ * message buffer
+ */
+ void trim_bl() const {
+ if (bl.length() > 0)
+ bl.rebuild();
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ObjectModDesc*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectModDesc)
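+
+// Illustrative sketch (not upstream code; the xattr name is hypothetical):
+// record rollback metadata for a write that appended past old_size and
+// replaced one xattr, then replay it through a Visitor. Only members declared
+// above are used.
+struct ExampleRollbackVisitor : public ObjectModDesc::Visitor {
+  void append(uint64_t old_offset) override {
+    // a rollback would re-truncate the object to old_offset
+  }
+  void setattrs(std::map<std::string, std::optional<ceph::buffer::list>> &attrs) override {
+    // a rollback would restore these prior attr values (nullopt == attr absent)
+  }
+};
+inline void example_record_and_visit(uint64_t old_size) {
+  ObjectModDesc desc;
+  desc.append(old_size);                      // object grew past old_size
+  std::map<std::string, std::optional<ceph::buffer::list>> old_attrs;
+  old_attrs["example_xattr"] = std::nullopt;  // attr did not exist before
+  desc.setattrs(old_attrs);
+  ExampleRollbackVisitor v;
+  desc.visit(&v);                             // replays APPEND then SETATTRS
+}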
+
+class ObjectCleanRegions {
+private:
+ bool new_object;
+ bool clean_omap;
+ interval_set<uint64_t> clean_offsets;
+ static std::atomic<uint32_t> max_num_intervals;
+
+ /**
+   * trim the number of intervals if clean_offsets.num_intervals()
+   * exceeds the given upper bound max_num_intervals.
+   * e.g. with max_num_intervals=2 and clean_offsets:{[5~10], [20~5]},
+   * a new interval [30~10] will evict the shortest one, [20~5];
+   * clean_offsets then becomes {[5~10], [30~10]}
+ */
+ void trim();
+ friend std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
+public:
+ ObjectCleanRegions() : new_object(false), clean_omap(true) {
+ clean_offsets.insert(0, (uint64_t)-1);
+ }
+ ObjectCleanRegions(uint64_t offset, uint64_t len, bool co)
+ : new_object(false), clean_omap(co) {
+ clean_offsets.insert(offset, len);
+ }
+ bool operator==(const ObjectCleanRegions &orc) const {
+ return new_object == orc.new_object && clean_omap == orc.clean_omap && clean_offsets == orc.clean_offsets;
+ }
+ static void set_max_num_intervals(uint32_t num);
+ void merge(const ObjectCleanRegions &other);
+ void mark_data_region_dirty(uint64_t offset, uint64_t len);
+ void mark_omap_dirty();
+ void mark_object_new();
+ void mark_fully_dirty();
+ interval_set<uint64_t> get_dirty_regions() const;
+ bool omap_is_dirty() const;
+ bool object_is_exist() const;
+ bool is_clean_region(uint64_t offset, uint64_t len) const;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ObjectCleanRegions*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectCleanRegions)
+std::ostream& operator<<(std::ostream& out, const ObjectCleanRegions& ocr);
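+
+// Illustrative sketch (hypothetical offsets): a default-constructed
+// ObjectCleanRegions treats the whole object and its omap as clean; writes and
+// omap updates carve out dirty ranges that recovery can copy selectively.
+inline interval_set<uint64_t> example_track_dirty() {
+  ObjectCleanRegions cr;                  // everything clean, omap clean
+  cr.mark_data_region_dirty(4096, 8192);  // an 8 KiB overwrite at offset 4 KiB
+  cr.mark_omap_dirty();                   // any omap mutation dirties all omap
+  return cr.get_dirty_regions();          // ranges a replica must re-fetch
+}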
+
+
+struct OSDOp {
+ ceph_osd_op op;
+ sobject_t soid;
+
+ ceph::buffer::list indata, outdata;
+ errorcode32_t rval = 0;
+
+ OSDOp() {
+ // FIPS zeroization audit 20191115: this memset clean for security
+ memset(&op, 0, sizeof(ceph_osd_op));
+ }
+
+ OSDOp(const int op_code) {
+ // FIPS zeroization audit 20191115: this memset clean for security
+ memset(&op, 0, sizeof(ceph_osd_op));
+ op.op = op_code;
+ }
+
+ /**
+ * split a ceph::buffer::list into constituent indata members of a vector of OSDOps
+ *
+ * @param ops [out] vector of OSDOps
+ * @param in [in] combined data buffer
+ */
+ template<typename V>
+ static void split_osd_op_vector_in_data(V& ops,
+ ceph::buffer::list& in) {
+ ceph::buffer::list::iterator datap = in.begin();
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].op.payload_len) {
+ datap.copy(ops[i].op.payload_len, ops[i].indata);
+ }
+ }
+ }
+
+ /**
+ * merge indata members of a vector of OSDOp into a single ceph::buffer::list
+ *
+ * Notably this also encodes certain other OSDOp data into the data
+ * buffer, including the sobject_t soid.
+ *
+ * @param ops [in] vector of OSDOps
+ * @param out [out] combined data buffer
+ */
+ template<typename V>
+ static void merge_osd_op_vector_in_data(V& ops, ceph::buffer::list& out) {
+ for (unsigned i = 0; i < ops.size(); i++) {
+ if (ops[i].indata.length()) {
+ ops[i].op.payload_len = ops[i].indata.length();
+ out.append(ops[i].indata);
+ }
+ }
+ }
+
+ /**
+ * split a ceph::buffer::list into constituent outdata members of a vector of OSDOps
+ *
+ * @param ops [out] vector of OSDOps
+ * @param in [in] combined data buffer
+ */
+ static void split_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& in);
+
+ /**
+ * merge outdata members of a vector of OSDOps into a single ceph::buffer::list
+ *
+ * @param ops [in] vector of OSDOps
+ * @param out [out] combined data buffer
+ */
+ static void merge_osd_op_vector_out_data(std::vector<OSDOp>& ops, ceph::buffer::list& out);
+
+ /**
+ * Clear data as much as possible, leave minimal data for historical op dump
+ *
+ * @param ops [in] vector of OSDOps
+ */
+ template<typename V>
+ static void clear_data(V& ops) {
+ for (unsigned i = 0; i < ops.size(); i++) {
+ OSDOp& op = ops[i];
+ op.outdata.clear();
+ if (ceph_osd_op_type_attr(op.op.op) &&
+ op.op.xattr.name_len &&
+ op.indata.length() >= op.op.xattr.name_len) {
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(op.op.xattr.name_len));
+ bl.begin().copy_in(op.op.xattr.name_len, op.indata);
+ op.indata = std::move(bl);
+ } else if (ceph_osd_op_type_exec(op.op.op) &&
+ op.op.cls.class_len &&
+ op.indata.length() >
+ (op.op.cls.class_len + op.op.cls.method_len)) {
+ __u8 len = op.op.cls.class_len + op.op.cls.method_len;
+ ceph::buffer::list bl;
+ bl.push_back(ceph::buffer::ptr_node::create(len));
+ bl.begin().copy_in(len, op.indata);
+ op.indata = std::move(bl);
+ } else {
+ op.indata.clear();
+ }
+ }
+ }
+};
+std::ostream& operator<<(std::ostream& out, const OSDOp& op);
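+
+// Illustrative sketch (not upstream code): op input payloads travel as one
+// merged buffer; merge_osd_op_vector_in_data() records each op.payload_len and
+// concatenates the indata, and split_osd_op_vector_in_data() re-slices them on
+// the receiving side.
+inline void example_indata_roundtrip(std::vector<OSDOp>& ops) {
+  ceph::buffer::list wire;
+  OSDOp::merge_osd_op_vector_in_data(ops, wire);  // sender: indata -> wire
+  OSDOp::split_osd_op_vector_in_data(ops, wire);  // receiver: wire -> indata
+}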
+
+struct pg_log_op_return_item_t {
+ int32_t rval;
+ ceph::buffer::list bl;
+ void encode(ceph::buffer::list& p) const {
+ using ceph::encode;
+ encode(rval, p);
+ encode(bl, p);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ using ceph::decode;
+ decode(rval, p);
+ decode(bl, p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_int("rval", rval);
+ f->dump_unsigned("bl_length", bl.length());
+ }
+ friend bool operator==(const pg_log_op_return_item_t& lhs,
+ const pg_log_op_return_item_t& rhs) {
+ return lhs.rval == rhs.rval &&
+ lhs.bl.contents_equal(rhs.bl);
+ }
+ friend bool operator!=(const pg_log_op_return_item_t& lhs,
+ const pg_log_op_return_item_t& rhs) {
+ return !(lhs == rhs);
+ }
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_op_return_item_t& i) {
+ return out << "r=" << i.rval << "+" << i.bl.length() << "b";
+ }
+};
+WRITE_CLASS_ENCODER(pg_log_op_return_item_t)
+
+/**
+ * pg_log_entry_t - single entry/event in pg log
+ *
+ */
+struct pg_log_entry_t {
+ enum {
+ MODIFY = 1, // some unspecified modification (but not *all* modifications)
+ CLONE = 2, // cloned object from head
+ DELETE = 3, // deleted object
+ //BACKLOG = 4, // event invented by generate_backlog [obsolete]
+ LOST_REVERT = 5, // lost new version, revert to an older version.
+ LOST_DELETE = 6, // lost new version, revert to no object (deleted).
+ LOST_MARK = 7, // lost new version, now EIO
+ PROMOTE = 8, // promoted object from another tier
+ CLEAN = 9, // mark an object clean
+ ERROR = 10, // write that returned an error
+ };
+ static const char *get_op_name(int op) {
+ switch (op) {
+ case MODIFY:
+ return "modify";
+ case PROMOTE:
+ return "promote";
+ case CLONE:
+ return "clone";
+ case DELETE:
+ return "delete";
+ case LOST_REVERT:
+ return "l_revert";
+ case LOST_DELETE:
+ return "l_delete";
+ case LOST_MARK:
+ return "l_mark";
+ case CLEAN:
+ return "clean";
+ case ERROR:
+ return "error";
+ default:
+ return "unknown";
+ }
+ }
+ const char *get_op_name() const {
+ return get_op_name(op);
+ }
+
+ // describes state for a locally-rollbackable entry
+ ObjectModDesc mod_desc;
+ ceph::buffer::list snaps; // only for clone entries
+ hobject_t soid;
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > extra_reqids;
+
+ /// map extra_reqids by index to error return code (if any)
+ mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
+
+ eversion_t version, prior_version, reverting_to;
+ version_t user_version; // the user version for this entry
+ utime_t mtime; // this is the _user_ mtime, mind you
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ __s32 op;
+ bool invalid_hash; // only when decoding sobject_t based entries
+ bool invalid_pool; // only when decoding pool-less hobject based entries
+ ObjectCleanRegions clean_regions;
+
+ pg_log_entry_t()
+ : user_version(0), return_code(0), op(0),
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+ pg_log_entry_t(int _op, const hobject_t& _soid,
+ const eversion_t& v, const eversion_t& pv,
+ version_t uv,
+ const osd_reqid_t& rid, const utime_t& mt,
+ int return_code)
+ : soid(_soid), reqid(rid), version(v), prior_version(pv), user_version(uv),
+ mtime(mt), return_code(return_code), op(_op),
+ invalid_hash(false), invalid_pool(false) {
+ snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
+ }
+
+ bool is_clone() const { return op == CLONE; }
+ bool is_modify() const { return op == MODIFY; }
+ bool is_promote() const { return op == PROMOTE; }
+ bool is_clean() const { return op == CLEAN; }
+ bool is_lost_revert() const { return op == LOST_REVERT; }
+ bool is_lost_delete() const { return op == LOST_DELETE; }
+ bool is_lost_mark() const { return op == LOST_MARK; }
+ bool is_error() const { return op == ERROR; }
+
+ bool is_update() const {
+ return
+ is_clone() || is_modify() || is_promote() || is_clean() ||
+ is_lost_revert() || is_lost_mark();
+ }
+ bool is_delete() const {
+ return op == DELETE || op == LOST_DELETE;
+ }
+
+ bool can_rollback() const {
+ return mod_desc.can_rollback();
+ }
+
+ void mark_unrollbackable() {
+ mod_desc.mark_unrollbackable();
+ }
+
+ bool requires_kraken() const {
+ return mod_desc.requires_kraken();
+ }
+
+ // Errors are only used for dup detection, whereas
+ // the index by objects is used by recovery, copy_get,
+ // and other facilities that don't expect or need to
+ // be aware of error entries.
+ bool object_is_indexed() const {
+ return !is_error();
+ }
+
+ bool reqid_is_indexed() const {
+ return reqid != osd_reqid_t() &&
+ (op == MODIFY || op == DELETE || op == ERROR);
+ }
+
+ void set_op_returns(const std::vector<OSDOp>& ops) {
+ op_returns.resize(ops.size());
+ for (unsigned i = 0; i < ops.size(); ++i) {
+ op_returns[i].rval = ops[i].rval;
+ op_returns[i].bl = ops[i].outdata;
+ }
+ }
+
+ std::string get_key_name() const;
+ void encode_with_checksum(ceph::buffer::list& bl) const;
+ void decode_with_checksum(ceph::buffer::list::const_iterator& p);
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_entry_t*>& o);
+
+};
+WRITE_CLASS_ENCODER(pg_log_entry_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_entry_t& e);
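+
+// Illustrative sketch (hypothetical versions and reqid): a MODIFY entry counts
+// as an "update" for recovery purposes, is indexed by object, and carries the
+// caller-visible mtime and user_version.
+inline pg_log_entry_t example_modify_entry(const hobject_t& oid,
+                                           eversion_t v, eversion_t prior,
+                                           version_t user_v,
+                                           const osd_reqid_t& reqid,
+                                           const utime_t& mtime) {
+  pg_log_entry_t e(pg_log_entry_t::MODIFY, oid, v, prior, user_v, reqid, mtime,
+                   0 /* return_code */);
+  ceph_assert(e.is_update());          // MODIFY is an update, not a delete
+  ceph_assert(e.object_is_indexed());  // only ERROR entries are unindexed
+  return e;
+}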
+
+struct pg_log_dup_t {
+ osd_reqid_t reqid; // caller+tid to uniquely identify request
+ eversion_t version;
+ version_t user_version; // the user version for this entry
+ int32_t return_code; // only stored for ERRORs for dup detection
+
+ std::vector<pg_log_op_return_item_t> op_returns;
+
+ pg_log_dup_t()
+ : user_version(0), return_code(0)
+ {}
+ explicit pg_log_dup_t(const pg_log_entry_t& entry)
+ : reqid(entry.reqid), version(entry.version),
+ user_version(entry.user_version),
+ return_code(entry.return_code),
+ op_returns(entry.op_returns)
+ {}
+ pg_log_dup_t(const eversion_t& v, version_t uv,
+ const osd_reqid_t& rid, int return_code)
+ : reqid(rid), version(v), user_version(uv),
+ return_code(return_code)
+ {}
+
+ std::string get_key_name() const;
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_dup_t*>& o);
+
+ bool operator==(const pg_log_dup_t &rhs) const {
+ return reqid == rhs.reqid &&
+ version == rhs.version &&
+ user_version == rhs.user_version &&
+ return_code == rhs.return_code &&
+ op_returns == rhs.op_returns;
+ }
+ bool operator!=(const pg_log_dup_t &rhs) const {
+ return !(*this == rhs);
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+};
+WRITE_CLASS_ENCODER(pg_log_dup_t)
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+
+/**
+ * pg_log_t - incremental log of recent pg changes.
+ *
+ * serves as a recovery queue for recent changes.
+ */
+struct pg_log_t {
+ /*
+ * head - newest entry (update|delete)
+ * tail - entry previous to oldest (update|delete) for which we have
+ * complete negative information.
+ * i.e. we can infer pg contents for any store whose last_update >= tail.
+ */
+ eversion_t head; // newest entry
+ eversion_t tail; // version prior to oldest
+
+protected:
+  // We can roll back rollback-able entries with version > can_rollback_to
+ eversion_t can_rollback_to;
+
+ // always <= can_rollback_to, indicates how far stashed rollback
+ // data can be found
+ eversion_t rollback_info_trimmed_to;
+
+public:
+ // the actual log
+ mempool::osd_pglog::list<pg_log_entry_t> log;
+
+ // entries just for dup op detection ordered oldest to newest
+ mempool::osd_pglog::list<pg_log_dup_t> dups;
+
+ pg_log_t() = default;
+ pg_log_t(const eversion_t &last_update,
+ const eversion_t &log_tail,
+ const eversion_t &can_rollback_to,
+ const eversion_t &rollback_info_trimmed_to,
+ mempool::osd_pglog::list<pg_log_entry_t> &&entries,
+ mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
+ : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
+ rollback_info_trimmed_to(rollback_info_trimmed_to),
+ log(std::move(entries)), dups(std::move(dup_entries)) {}
+ pg_log_t(const eversion_t &last_update,
+ const eversion_t &log_tail,
+ const eversion_t &can_rollback_to,
+ const eversion_t &rollback_info_trimmed_to,
+ const std::list<pg_log_entry_t> &entries,
+ const std::list<pg_log_dup_t> &dup_entries)
+ : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
+ rollback_info_trimmed_to(rollback_info_trimmed_to) {
+ for (auto &&entry: entries) {
+ log.push_back(entry);
+ }
+ for (auto &&entry: dup_entries) {
+ dups.push_back(entry);
+ }
+ }
+
+ void clear() {
+ eversion_t z;
+ rollback_info_trimmed_to = can_rollback_to = head = tail = z;
+ log.clear();
+ dups.clear();
+ }
+
+ eversion_t get_rollback_info_trimmed_to() const {
+ return rollback_info_trimmed_to;
+ }
+ eversion_t get_can_rollback_to() const {
+ return can_rollback_to;
+ }
+
+
+ pg_log_t split_out_child(pg_t child_pgid, unsigned split_bits) {
+ mempool::osd_pglog::list<pg_log_entry_t> oldlog, childlog;
+ oldlog.swap(log);
+
+ eversion_t old_tail;
+ unsigned mask = ~((~0)<<split_bits);
+ for (auto i = oldlog.begin();
+ i != oldlog.end();
+ ) {
+ if ((i->soid.get_hash() & mask) == child_pgid.m_seed) {
+ childlog.push_back(*i);
+ } else {
+ log.push_back(*i);
+ }
+ oldlog.erase(i++);
+ }
+
+ // osd_reqid is unique, so it doesn't matter if there are extra
+ // dup entries in each pg. To avoid storing oid with the dup
+ // entries, just copy the whole list.
+ auto childdups(dups);
+
+ return pg_log_t(
+ head,
+ tail,
+ can_rollback_to,
+ rollback_info_trimmed_to,
+ std::move(childlog),
+ std::move(childdups));
+ }
+
+ mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
+ ceph_assert(newhead >= tail);
+
+ mempool::osd_pglog::list<pg_log_entry_t>::iterator p = log.end();
+ mempool::osd_pglog::list<pg_log_entry_t> divergent;
+ while (true) {
+ if (p == log.begin()) {
+ // yikes, the whole thing is divergent!
+ using std::swap;
+ swap(divergent, log);
+ break;
+ }
+ --p;
+ if (p->version.version <= newhead.version) {
+ /*
+ * look at eversion.version here. we want to avoid a situation like:
+ * our log: 100'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
+ * new log: 122'10 (0'0) m 10000004d3a.00000000/head by client4225.1:18529
+ * lower_bound = 100'9
+	 * i.e., same request, different version.  If the eversion.version is > the
+	 * lower_bound, it is divergent.
+ */
+ ++p;
+ divergent.splice(divergent.begin(), log, p, log.end());
+ break;
+ }
+ ceph_assert(p->version > newhead);
+ }
+ head = newhead;
+
+ if (can_rollback_to > newhead)
+ can_rollback_to = newhead;
+
+ if (rollback_info_trimmed_to > newhead)
+ rollback_info_trimmed_to = newhead;
+
+ return divergent;
+ }
+
+ void merge_from(const std::vector<pg_log_t*>& slogs, eversion_t last_update) {
+ log.clear();
+
+ // sort and merge dups
+ std::multimap<eversion_t,pg_log_dup_t> sorted;
+ for (auto& d : dups) {
+ sorted.emplace(d.version, d);
+ }
+ for (auto l : slogs) {
+ for (auto& d : l->dups) {
+ sorted.emplace(d.version, d);
+ }
+ }
+ dups.clear();
+ for (auto& i : sorted) {
+ dups.push_back(i.second);
+ }
+
+ head = last_update;
+ tail = last_update;
+ can_rollback_to = last_update;
+ rollback_info_trimmed_to = last_update;
+ }
+
+ bool empty() const {
+ return log.empty();
+ }
+
+ bool null() const {
+ return head.version == 0 && head.epoch == 0;
+ }
+
+ uint64_t approx_size() const {
+ return head.version - tail.version;
+ }
+
+ static void filter_log(spg_t import_pgid, const OSDMap &curmap,
+ const std::string &hit_set_namespace, const pg_log_t &in,
+ pg_log_t &out, pg_log_t &reject);
+
+ /**
+ * copy entries from the tail of another pg_log_t
+ *
+ * @param other pg_log_t to copy from
+ * @param from copy entries after this version
+ */
+ void copy_after(CephContext* cct, const pg_log_t &other, eversion_t from);
+
+ /**
+ * copy up to N entries
+ *
+ * @param other source log
+ * @param max max number of entries to copy
+ */
+ void copy_up_to(CephContext* cct, const pg_log_t &other, int max);
+
+ std::ostream& print(std::ostream& out) const;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_log_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_log_t)
+
+inline std::ostream& operator<<(std::ostream& out, const pg_log_t& log)
+{
+ out << "log((" << log.tail << "," << log.head << "], crt="
+ << log.get_can_rollback_to() << ")";
+ return out;
+}
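+
+// Illustrative sketch (not upstream code): when a replica learns of an
+// authoritative head older than its own, rewind_from_head() returns the
+// divergent tail entries and clamps can_rollback_to / rollback_info_trimmed_to
+// to the new head.
+inline void example_rewind(pg_log_t& log, eversion_t auth_head) {
+  auto divergent = log.rewind_from_head(auth_head);
+  // `divergent` now holds every entry with version > auth_head; the caller is
+  // responsible for rolling those objects back or re-recovering them.
+  ceph_assert(log.head == auth_head);
+  (void)divergent;
+}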
+
+
+/**
+ * pg_missing_t - summary of missing objects.
+ *
+ * kept in memory, as a supplement to pg_log_t
+ * also used to pass missing info in messages.
+ */
+struct pg_missing_item {
+ eversion_t need, have;
+ ObjectCleanRegions clean_regions;
+ enum missing_flags_t {
+ FLAG_NONE = 0,
+ FLAG_DELETE = 1,
+ } flags;
+ pg_missing_item() : flags(FLAG_NONE) {}
+ explicit pg_missing_item(eversion_t n) : need(n), flags(FLAG_NONE) {} // have no old version
+ pg_missing_item(eversion_t n, eversion_t h, bool is_delete=false, bool old_style = false) :
+ need(n), have(h) {
+ set_delete(is_delete);
+ if (old_style)
+ clean_regions.mark_fully_dirty();
+ }
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const {
+ using ceph::encode;
+ if (HAVE_FEATURE(features, SERVER_OCTOPUS)) {
+      // encoding a zeroed eversion_t to differentiate between OSD_RECOVERY_DELETES,
+      // SERVER_OCTOPUS and legacy unversioned encoding - a need value of 0'0 is not
+      // possible. This can be replaced with the legacy encoding.
+ encode(eversion_t(), bl);
+ encode(eversion_t(-1, -1), bl);
+ encode(need, bl);
+ encode(have, bl);
+ encode(static_cast<uint8_t>(flags), bl);
+ encode(clean_regions, bl);
+ } else {
+ encode(eversion_t(), bl);
+ encode(need, bl);
+ encode(have, bl);
+ encode(static_cast<uint8_t>(flags), bl);
+ }
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ eversion_t e, l;
+ decode(e, bl);
+ decode(l, bl);
+    if (l == eversion_t(-1, -1)) {
+ // support all
+ decode(need, bl);
+ decode(have, bl);
+ uint8_t f;
+ decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ decode(clean_regions, bl);
+ } else {
+ // support OSD_RECOVERY_DELETES
+ need = l;
+ decode(have, bl);
+ uint8_t f;
+ decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ clean_regions.mark_fully_dirty();
+ }
+ }
+
+ void set_delete(bool is_delete) {
+ flags = is_delete ? FLAG_DELETE : FLAG_NONE;
+ }
+
+ bool is_delete() const {
+ return (flags & FLAG_DELETE) == FLAG_DELETE;
+ }
+
+ std::string flag_str() const {
+ if (flags == FLAG_NONE) {
+ return "none";
+ } else {
+ return "delete";
+ }
+ }
+
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("need") << need;
+ f->dump_stream("have") << have;
+ f->dump_stream("flags") << flag_str();
+ f->dump_stream("clean_regions") << clean_regions;
+ }
+ static void generate_test_instances(std::list<pg_missing_item*>& o) {
+ o.push_back(new pg_missing_item);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(1, 2);
+ o.back()->have = eversion_t(1, 1);
+ o.push_back(new pg_missing_item);
+ o.back()->need = eversion_t(3, 5);
+ o.back()->have = eversion_t(3, 4);
+ o.back()->clean_regions.mark_data_region_dirty(4096, 8192);
+ o.back()->clean_regions.mark_omap_dirty();
+ o.back()->flags = FLAG_DELETE;
+ }
+ bool operator==(const pg_missing_item &rhs) const {
+ return need == rhs.need && have == rhs.have && flags == rhs.flags;
+ }
+ bool operator!=(const pg_missing_item &rhs) const {
+ return !(*this == rhs);
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
+std::ostream& operator<<(std::ostream& out, const pg_missing_item &item);
+
+class pg_missing_const_i {
+public:
+ virtual const std::map<hobject_t, pg_missing_item> &
+ get_items() const = 0;
+ virtual const std::map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual bool get_may_include_deletes() const = 0;
+ virtual unsigned int num_missing() const = 0;
+ virtual bool have_missing() const = 0;
+ virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
+ virtual bool is_missing(const hobject_t& oid, eversion_t v) const = 0;
+ virtual ~pg_missing_const_i() {}
+};
+
+
+template <bool Track>
+class ChangeTracker {
+public:
+ void changed(const hobject_t &obj) {}
+ template <typename F>
+ void get_changed(F &&f) const {}
+ void flush() {}
+ bool is_clean() const {
+ return true;
+ }
+};
+template <>
+class ChangeTracker<true> {
+ std::set<hobject_t> _changed;
+public:
+ void changed(const hobject_t &obj) {
+ _changed.insert(obj);
+ }
+ template <typename F>
+ void get_changed(F &&f) const {
+ for (auto const &i: _changed) {
+ f(i);
+ }
+ }
+ void flush() {
+ _changed.clear();
+ }
+ bool is_clean() const {
+ return _changed.empty();
+ }
+};
+
+template <bool TrackChanges>
+class pg_missing_set : public pg_missing_const_i {
+ using item = pg_missing_item;
+ std::map<hobject_t, item> missing; // oid -> (need v, have v)
+ std::map<version_t, hobject_t> rmissing; // v -> oid
+ ChangeTracker<TrackChanges> tracker;
+
+public:
+ pg_missing_set() = default;
+
+ template <typename missing_type>
+ pg_missing_set(const missing_type &m) {
+ missing = m.get_items();
+ rmissing = m.get_rmissing();
+ may_include_deletes = m.get_may_include_deletes();
+ for (auto &&i: missing)
+ tracker.changed(i.first);
+ }
+
+ bool may_include_deletes = false;
+
+ const std::map<hobject_t, item> &get_items() const override {
+ return missing;
+ }
+ const std::map<version_t, hobject_t> &get_rmissing() const override {
+ return rmissing;
+ }
+ bool get_may_include_deletes() const override {
+ return may_include_deletes;
+ }
+ unsigned int num_missing() const override {
+ return missing.size();
+ }
+ bool have_missing() const override {
+ return !missing.empty();
+ }
+ void merge(const pg_log_entry_t& e) {
+ auto miter = missing.find(e.soid);
+ if (miter != missing.end() && miter->second.have != eversion_t() && e.version > miter->second.have)
+ miter->second.clean_regions.merge(e.clean_regions);
+ }
+ bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const override {
+ auto iter = missing.find(oid);
+ if (iter == missing.end())
+ return false;
+ if (out)
+ *out = iter->second;
+ return true;
+ }
+ bool is_missing(const hobject_t& oid, eversion_t v) const override {
+ std::map<hobject_t, item>::const_iterator m =
+ missing.find(oid);
+ if (m == missing.end())
+ return false;
+ const item &item(m->second);
+ if (item.need > v)
+ return false;
+ return true;
+ }
+ eversion_t get_oldest_need() const {
+ if (missing.empty()) {
+ return eversion_t();
+ }
+ auto it = missing.find(rmissing.begin()->second);
+ ceph_assert(it != missing.end());
+ return it->second.need;
+ }
+
+ void claim(pg_missing_set&& o) {
+ static_assert(!TrackChanges, "Can't use claim with TrackChanges");
+ missing = std::move(o.missing);
+ rmissing = std::move(o.rmissing);
+ }
+
+ /*
+ * this needs to be called in log order as we extend the log. it
+ * assumes missing is accurate up through the previous log entry.
+ */
+ void add_next_event(const pg_log_entry_t& e) {
+ std::map<hobject_t, item>::iterator missing_it;
+ missing_it = missing.find(e.soid);
+ bool is_missing_divergent_item = missing_it != missing.end();
+ if (e.prior_version == eversion_t() || e.is_clone()) {
+ // new object.
+ if (is_missing_divergent_item) { // use iterator
+ rmissing.erase(missing_it->second.need.version);
+ // .have = nil
+ missing_it->second = item(e.version, eversion_t(), e.is_delete());
+ missing_it->second.clean_regions.mark_fully_dirty();
+ } else {
+ // create new element in missing map
+ // .have = nil
+ missing[e.soid] = item(e.version, eversion_t(), e.is_delete());
+ missing[e.soid].clean_regions.mark_fully_dirty();
+ }
+ } else if (is_missing_divergent_item) {
+ // already missing (prior).
+ rmissing.erase((missing_it->second).need.version);
+ missing_it->second.need = e.version; // leave .have unchanged.
+ missing_it->second.set_delete(e.is_delete());
+ if (e.is_lost_revert())
+ missing_it->second.clean_regions.mark_fully_dirty();
+ else
+ missing_it->second.clean_regions.merge(e.clean_regions);
+ } else {
+ // not missing, we must have prior_version (if any)
+ ceph_assert(!is_missing_divergent_item);
+ missing[e.soid] = item(e.version, e.prior_version, e.is_delete());
+ if (e.is_lost_revert())
+ missing[e.soid].clean_regions.mark_fully_dirty();
+ else
+ missing[e.soid].clean_regions = e.clean_regions;
+ }
+ rmissing[e.version.version] = e.soid;
+ tracker.changed(e.soid);
+ }
+
+ void revise_need(hobject_t oid, eversion_t need, bool is_delete) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ rmissing.erase((p->second).need.version);
+ p->second.need = need; // do not adjust .have
+ p->second.set_delete(is_delete);
+ p->second.clean_regions.mark_fully_dirty();
+ } else {
+ missing[oid] = item(need, eversion_t(), is_delete);
+ missing[oid].clean_regions.mark_fully_dirty();
+ }
+ rmissing[need.version] = oid;
+
+ tracker.changed(oid);
+ }
+
+ void revise_have(hobject_t oid, eversion_t have) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ tracker.changed(oid);
+ (p->second).have = have;
+ }
+ }
+
+ void mark_fully_dirty(const hobject_t& oid) {
+ auto p = missing.find(oid);
+ if (p != missing.end()) {
+ tracker.changed(oid);
+ (p->second).clean_regions.mark_fully_dirty();
+ }
+ }
+
+ void add(const hobject_t& oid, eversion_t need, eversion_t have,
+ bool is_delete) {
+ missing[oid] = item(need, have, is_delete, true);
+ rmissing[need.version] = oid;
+ tracker.changed(oid);
+ }
+
+ void add(const hobject_t& oid, pg_missing_item&& item) {
+ rmissing[item.need.version] = oid;
+ missing.insert({oid, std::move(item)});
+ tracker.changed(oid);
+ }
+
+ void rm(const hobject_t& oid, eversion_t v) {
+ std::map<hobject_t, item>::iterator p = missing.find(oid);
+ if (p != missing.end() && p->second.need <= v)
+ rm(p);
+ }
+
+ void rm(std::map<hobject_t, item>::const_iterator m) {
+ tracker.changed(m->first);
+ rmissing.erase(m->second.need.version);
+ missing.erase(m);
+ }
+
+ void got(const hobject_t& oid, eversion_t v) {
+ std::map<hobject_t, item>::iterator p = missing.find(oid);
+ ceph_assert(p != missing.end());
+ ceph_assert(p->second.need <= v || p->second.is_delete());
+ got(p);
+ }
+
+ void got(std::map<hobject_t, item>::const_iterator m) {
+ tracker.changed(m->first);
+ rmissing.erase(m->second.need.version);
+ missing.erase(m);
+ }
+
+ void split_into(
+ pg_t child_pgid,
+ unsigned split_bits,
+ pg_missing_set *omissing) {
+ omissing->may_include_deletes = may_include_deletes;
+ unsigned mask = ~((~0)<<split_bits);
+ for (std::map<hobject_t, item>::iterator i = missing.begin();
+ i != missing.end();
+ ) {
+ if ((i->first.get_hash() & mask) == child_pgid.m_seed) {
+ omissing->add(i->first, i->second.need, i->second.have,
+ i->second.is_delete());
+ rm(i++);
+ } else {
+ ++i;
+ }
+ }
+ }
+
+ void clear() {
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ missing.clear();
+ rmissing.clear();
+ }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const {
+    ENCODE_START(5, 2, bl);
+ encode(missing, bl, features);
+ encode(may_include_deletes, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1) {
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
+ decode(missing, bl);
+ if (struct_v >= 4) {
+ decode(may_include_deletes, bl);
+ }
+ DECODE_FINISH(bl);
+
+ if (struct_v < 3) {
+ // Handle hobject_t upgrade
+ std::map<hobject_t, item> tmp;
+ for (std::map<hobject_t, item>::iterator i =
+ missing.begin();
+ i != missing.end();
+ ) {
+ if (!i->first.is_max() && i->first.pool == -1) {
+ hobject_t to_insert(i->first);
+ to_insert.pool = pool;
+ tmp[to_insert] = i->second;
+ missing.erase(i++);
+ } else {
+ ++i;
+ }
+ }
+ missing.insert(tmp.begin(), tmp.end());
+ }
+
+ for (std::map<hobject_t,item>::iterator it =
+ missing.begin();
+ it != missing.end();
+ ++it)
+ rmissing[it->second.need.version] = it->first;
+ for (auto const &i: missing)
+ tracker.changed(i.first);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("missing");
+ for (std::map<hobject_t,item>::const_iterator p =
+ missing.begin(); p != missing.end(); ++p) {
+ f->open_object_section("item");
+ f->dump_stream("object") << p->first;
+ p->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_bool("may_include_deletes", may_include_deletes);
+ }
+ template <typename F>
+ void filter_objects(F &&f) {
+ for (auto i = missing.begin(); i != missing.end();) {
+ if (f(i->first)) {
+ rm(i++);
+ } else {
+ ++i;
+ }
+ }
+ }
+ static void generate_test_instances(std::list<pg_missing_set*>& o) {
+ o.push_back(new pg_missing_set);
+ o.back()->may_include_deletes = true;
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), false);
+ o.back()->may_include_deletes = true;
+ o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), true);
+ o.back()->may_include_deletes = true;
+ }
+ template <typename F>
+ void get_changed(F &&f) const {
+ tracker.get_changed(f);
+ }
+ void flush() {
+ tracker.flush();
+ }
+ bool is_clean() const {
+ return tracker.is_clean();
+ }
+ template <typename missing_t>
+ bool debug_verify_from_init(
+ const missing_t &init_missing,
+ std::ostream *oss) const {
+ if (!TrackChanges)
+ return true;
+ auto check_missing(init_missing.get_items());
+ tracker.get_changed([&](const hobject_t &hoid) {
+ check_missing.erase(hoid);
+ if (missing.count(hoid)) {
+ check_missing.insert(*(missing.find(hoid)));
+ }
+ });
+ bool ok = true;
+ if (check_missing.size() != missing.size()) {
+ if (oss) {
+ *oss << "Size mismatch, check: " << check_missing.size()
+ << ", actual: " << missing.size() << "\n";
+ }
+ ok = false;
+ }
+ for (auto &i: missing) {
+ if (!check_missing.count(i.first)) {
+ if (oss)
+ *oss << "check_missing missing " << i.first << "\n";
+ ok = false;
+ } else if (check_missing[i.first] != i.second) {
+ if (oss)
+ *oss << "check_missing missing item mismatch on " << i.first
+ << ", check: " << check_missing[i.first]
+ << ", actual: " << i.second << "\n";
+ ok = false;
+ }
+ }
+ if (oss && !ok) {
+ *oss << "check_missing: " << check_missing << "\n";
+ std::set<hobject_t> changed;
+ tracker.get_changed([&](const hobject_t &hoid) { changed.insert(hoid); });
+ *oss << "changed: " << changed << "\n";
+ }
+ return ok;
+ }
+};
+template <bool TrackChanges>
+void encode(
+ const pg_missing_set<TrackChanges> &c, ceph::buffer::list &bl, uint64_t features=0) {
+ ENCODE_DUMP_PRE();
+ c.encode(bl, features);
+ ENCODE_DUMP_POST(cl);
+}
+template <bool TrackChanges>
+void decode(pg_missing_set<TrackChanges> &c, ceph::buffer::list::const_iterator &p) {
+ c.decode(p);
+}
+template <bool TrackChanges>
+std::ostream& operator<<(std::ostream& out, const pg_missing_set<TrackChanges> &missing)
+{
+ out << "missing(" << missing.num_missing()
+ << " may_include_deletes = " << missing.may_include_deletes;
+ //if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
+ out << ")";
+ return out;
+}
+
+using pg_missing_t = pg_missing_set<false>;
+using pg_missing_tracker_t = pg_missing_set<true>;
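+
+// Illustrative sketch (not upstream code): the primary's pg_missing_tracker_t
+// is updated as the log is extended; add_next_event() registers the object as
+// missing at the entry's version, got() clears it once recovery completes, and
+// get_changed()/flush() expose which oids changed since the last flush so the
+// persistence layer knows what to rewrite.
+inline void example_missing_tracking(pg_missing_tracker_t& missing,
+                                     const pg_log_entry_t& e) {
+  missing.add_next_event(e);            // e.soid is now missing at e.version
+  ceph_assert(missing.is_missing(e.soid));
+  missing.get_changed([](const hobject_t& oid) {
+    (void)oid;                          // would be queued for a missing-set write
+  });
+  missing.flush();                      // start a new change epoch
+  missing.got(e.soid, e.version);       // recovery finished for this object
+}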
+
+
+
+
+/**
+ * pg list objects response format
+ *
+ */
+
+template<typename T>
+struct pg_nls_response_template {
+ collection_list_handle_t handle;
+ std::vector<T> entries;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(handle, bl);
+ __u32 n = (__u32)entries.size();
+ encode(n, bl);
+ for (auto i = entries.begin(); i != entries.end(); ++i) {
+ encode(i->nspace, bl);
+ encode(i->oid, bl);
+ encode(i->locator, bl);
+ }
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(handle, bl);
+ __u32 n;
+ decode(n, bl);
+ entries.clear();
+ while (n--) {
+ T i;
+ decode(i.nspace, bl);
+ decode(i.oid, bl);
+ decode(i.locator, bl);
+ entries.push_back(i);
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("handle") << handle;
+ f->open_array_section("entries");
+ for (auto p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("object");
+ f->dump_string("namespace", p->nspace);
+ f->dump_string("object", p->oid);
+ f->dump_string("key", p->locator);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pg_nls_response_template<T>*>& o) {
+ o.push_back(new pg_nls_response_template<T>);
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 3, 4, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
+ o.push_back(new pg_nls_response_template<T>);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 5, 6, -1, "");
+ o.back()->entries.push_back(librados::ListObjectImpl("", "one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "two", "twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("", "three", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1one", ""));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1two", "n1twokey"));
+ o.back()->entries.push_back(librados::ListObjectImpl("n1", "n1three", ""));
+ }
+};
+
+using pg_nls_response_t = pg_nls_response_template<librados::ListObjectImpl>;
+
+WRITE_CLASS_ENCODER(pg_nls_response_t)
+
+// For backwards compatibility with older OSD requests
+struct pg_ls_response_t {
+ collection_list_handle_t handle;
+ std::list<std::pair<object_t, std::string> > entries;
+
+ void encode(ceph::buffer::list& bl) const {
+ using ceph::encode;
+ __u8 v = 1;
+ encode(v, bl);
+ encode(handle, bl);
+ encode(entries, bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ using ceph::decode;
+ __u8 v;
+ decode(v, bl);
+ ceph_assert(v == 1);
+ decode(handle, bl);
+ decode(entries, bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("handle") << handle;
+ f->open_array_section("entries");
+ for (std::list<std::pair<object_t, std::string> >::const_iterator p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("object");
+ f->dump_stream("object") << p->first;
+ f->dump_string("key", p->second);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pg_ls_response_t*>& o) {
+ o.push_back(new pg_ls_response_t);
+ o.push_back(new pg_ls_response_t);
+ o.back()->handle = hobject_t(object_t("hi"), "key", 1, 2, -1, "");
+ o.back()->entries.push_back(std::make_pair(object_t("one"), std::string()));
+ o.back()->entries.push_back(std::make_pair(object_t("two"), std::string("twokey")));
+ }
+};
+
+WRITE_CLASS_ENCODER(pg_ls_response_t)
+
+/**
+ * object_copy_cursor_t
+ */
+struct object_copy_cursor_t {
+ uint64_t data_offset;
+ std::string omap_offset;
+ bool attr_complete;
+ bool data_complete;
+ bool omap_complete;
+
+ object_copy_cursor_t()
+ : data_offset(0),
+ attr_complete(false),
+ data_complete(false),
+ omap_complete(false)
+ {}
+
+ bool is_initial() const {
+ return !attr_complete && data_offset == 0 && omap_offset.empty();
+ }
+ bool is_complete() const {
+ return attr_complete && data_complete && omap_complete;
+ }
+
+ static void generate_test_instances(std::list<object_copy_cursor_t*>& o);
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(object_copy_cursor_t)
+
+/**
+ * object_copy_data_t
+ *
+ * Return data from a copy request. The semantics are a little strange
+ * as a result of the encoding's heritage.
+ *
+ * In particular, the sender unconditionally fills in the cursor (from what
+ * it receives and sends), the size, and the mtime, but is responsible for
+ * figuring out whether it should put any data in the attrs, data, or
+ * omap members (corresponding to xattrs, object data, and the omap entries)
+ * based on external data (the client includes a max amount to return with
+ * the copy request). The client then looks into the attrs, data, and/or omap
+ * based on the contents of the cursor.
+ */
+struct object_copy_data_t {
+ enum {
+ FLAG_DATA_DIGEST = 1<<0,
+ FLAG_OMAP_DIGEST = 1<<1,
+ };
+ object_copy_cursor_t cursor;
+ uint64_t size;
+ utime_t mtime;
+ uint32_t data_digest, omap_digest;
+ uint32_t flags;
+ std::map<std::string, ceph::buffer::list> attrs;
+ ceph::buffer::list data;
+ ceph::buffer::list omap_header;
+ ceph::buffer::list omap_data;
+
+ /// which snaps we are defined for (if a snap and not the head)
+ std::vector<snapid_t> snaps;
+ /// latest snap seq for the object (if head)
+ snapid_t snap_seq;
+
+ /// recent reqids on this object
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > reqids;
+
+ /// map reqids by index to error return code (if any)
+ mempool::osd_pglog::map<uint32_t, int> reqid_return_codes;
+
+ uint64_t truncate_seq;
+ uint64_t truncate_size;
+
+public:
+ object_copy_data_t() :
+ size((uint64_t)-1), data_digest(-1),
+ omap_digest(-1), flags(0),
+ truncate_seq(0),
+ truncate_size(0) {}
+
+ static void generate_test_instances(std::list<object_copy_data_t*>& o);
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(object_copy_data_t)
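+
+// Illustrative sketch (not upstream code): copy-get is driven by the cursor.
+// The client keeps re-issuing requests, feeding back the cursor from the
+// previous reply, until is_complete() - attrs come first, then data (advancing
+// data_offset), then omap (advancing omap_offset).
+inline bool example_copy_done(const object_copy_data_t& reply) {
+  const object_copy_cursor_t& c = reply.cursor;
+  if (c.is_initial()) {
+    // nothing consumed yet; the first reply normally carries the attrs
+  }
+  return c.is_complete();  // true once attr, data and omap phases all finished
+}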
+
+/**
+ * pg creation info
+ */
+struct pg_create_t {
+ epoch_t created; // epoch pg created
+ pg_t parent; // split from parent (if != pg_t())
+ __s32 split_bits;
+
+ pg_create_t()
+ : created(0), split_bits(0) {}
+ pg_create_t(unsigned c, pg_t p, int s)
+ : created(c), parent(p), split_bits(s) {}
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<pg_create_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_create_t)
+
+// -----------------------------------------
+
+class ObjectExtent {
+ /**
+ * ObjectExtents are used for specifying IO behavior against RADOS
+ * objects when one is using the ObjectCacher.
+ *
+ * To use this in a real system, *every member* must be filled
+ * out correctly. In particular, make sure to initialize the
+ * oloc correctly, as its default values are deliberate poison
+ * and will cause internal ObjectCacher asserts.
+ *
+ * Similarly, your buffer_extents vector *must* specify a total
+ * size equal to your length. If the buffer_extents inadvertently
+ * contain less space than the length member specifies, you
+ * will get unintelligible asserts deep in the ObjectCacher.
+ *
+ * If you are trying to do testing and don't care about actual
+ * RADOS function, the simplest thing to do is to initialize
+ * the ObjectExtent (truncate_size can be 0), create a single entry
+ * in buffer_extents matching the length, and set oloc.pool to 0.
+ */
+ public:
+ object_t oid; // object id
+ uint64_t objectno;
+ uint64_t offset; // in object
+ uint64_t length; // in object
+ uint64_t truncate_size; // in object
+
+ object_locator_t oloc; // object locator (pool etc)
+
+ std::vector<std::pair<uint64_t,uint64_t> > buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!)
+
+ ObjectExtent() : objectno(0), offset(0), length(0), truncate_size(0) {}
+ ObjectExtent(object_t o, uint64_t ono, uint64_t off, uint64_t l, uint64_t ts) :
+ oid(o), objectno(ono), offset(off), length(l), truncate_size(ts) { }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const ObjectExtent &ex)
+{
+ return out << "extent("
+ << ex.oid << " (" << ex.objectno << ") in " << ex.oloc
+ << " " << ex.offset << "~" << ex.length
+ << " -> " << ex.buffer_extents
+ << ")";
+}
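+
+// Illustrative sketch following the class comment above: a minimal
+// ObjectExtent suitable for testing, with buffer_extents covering exactly
+// `len` bytes and oloc.pool pointing at a real pool (0 here).
+inline ObjectExtent example_test_extent(object_t oid, uint64_t off, uint64_t len) {
+  ObjectExtent ex(oid, 0 /* objectno */, off, len, 0 /* truncate_size */);
+  ex.oloc.pool = 0;                       // default oloc is deliberate poison
+  ex.buffer_extents.push_back({0, len});  // must sum to exactly `len`
+  return ex;
+}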
+
+
+// ---------------------------------------
+
+class OSDSuperblock {
+public:
+ uuid_d cluster_fsid, osd_fsid;
+ int32_t whoami = -1; // my role in this fs.
+ epoch_t current_epoch = 0; // most recent epoch
+ epoch_t oldest_map = 0, newest_map = 0; // oldest/newest maps we have.
+ double weight = 0.0;
+
+ CompatSet compat_features;
+
+ // last interval over which i mounted and was then active
+ epoch_t mounted = 0; // last epoch i mounted
+ epoch_t clean_thru = 0; // epoch i was active and clean thru
+
+ epoch_t purged_snaps_last = 0;
+ utime_t last_purged_snaps_scrub;
+
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<OSDSuperblock*>& o);
+};
+WRITE_CLASS_ENCODER(OSDSuperblock)
+
+inline std::ostream& operator<<(std::ostream& out, const OSDSuperblock& sb)
+{
+ return out << "sb(" << sb.cluster_fsid
+ << " osd." << sb.whoami
+ << " " << sb.osd_fsid
+ << " e" << sb.current_epoch
+ << " [" << sb.oldest_map << "," << sb.newest_map << "]"
+ << " lci=[" << sb.mounted << "," << sb.clean_thru << "]"
+ << ")";
+}
+
+
+// -------
+
+
+
+
+
+
+/*
+ * attached to object head. describes most recent snap context, and
+ * set of existing clones.
+ */
+struct SnapSet {
+ snapid_t seq;
+ // NOTE: this is for pre-octopus compatibility only! remove in Q release
+ std::vector<snapid_t> snaps; // descending
+ std::vector<snapid_t> clones; // ascending
+ std::map<snapid_t, interval_set<uint64_t> > clone_overlap; // overlap w/ next newest
+ std::map<snapid_t, uint64_t> clone_size;
+ std::map<snapid_t, std::vector<snapid_t>> clone_snaps; // descending
+
+ SnapSet() : seq(0) {}
+ explicit SnapSet(ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+
+ /// populate SnapSet from a librados::snap_set_t
+ void from_snap_set(const librados::snap_set_t& ss, bool legacy);
+
+ /// get space accounted to clone
+ uint64_t get_clone_bytes(snapid_t clone) const;
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<SnapSet*>& o);
+
+ SnapContext get_ssc_as_of(snapid_t as_of) const {
+ SnapContext out;
+ out.seq = as_of;
+ for (auto p = clone_snaps.rbegin();
+ p != clone_snaps.rend();
+ ++p) {
+ for (auto snap : p->second) {
+ if (snap <= as_of) {
+ out.snaps.push_back(snap);
+ }
+ }
+ }
+ return out;
+ }
+
+
+ SnapSet get_filtered(const pg_pool_t &pinfo) const;
+ void filter(const pg_pool_t &pinfo);
+};
+WRITE_CLASS_ENCODER(SnapSet)
+
+std::ostream& operator<<(std::ostream& out, const SnapSet& cs);
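+
+// Illustrative sketch (hypothetical snap id): get_ssc_as_of() rebuilds the
+// SnapContext as of a given snap - seq is the requested snap, and the snaps
+// list keeps only the clone snaps <= that snap, per the loop above.
+inline SnapContext example_ssc(const SnapSet& ss, snapid_t as_of) {
+  SnapContext ctx = ss.get_ssc_as_of(as_of);
+  ceph_assert(ctx.seq == as_of);
+  return ctx;
+}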
+
+
+
+#define OI_ATTR "_"
+#define SS_ATTR "snapset"
+
+struct watch_info_t {
+ uint64_t cookie;
+ uint32_t timeout_seconds;
+ entity_addr_t addr;
+
+ watch_info_t() : cookie(0), timeout_seconds(0) { }
+ watch_info_t(uint64_t c, uint32_t t, const entity_addr_t& a) : cookie(c), timeout_seconds(t), addr(a) {}
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<watch_info_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(watch_info_t)
+
+static inline bool operator==(const watch_info_t& l, const watch_info_t& r) {
+ return l.cookie == r.cookie && l.timeout_seconds == r.timeout_seconds
+ && l.addr == r.addr;
+}
+
+static inline std::ostream& operator<<(std::ostream& out, const watch_info_t& w) {
+ return out << "watch(cookie " << w.cookie << " " << w.timeout_seconds << "s"
+ << " " << w.addr << ")";
+}
+
+struct notify_info_t {
+ uint64_t cookie;
+ uint64_t notify_id;
+ uint32_t timeout;
+ ceph::buffer::list bl;
+};
+
+static inline std::ostream& operator<<(std::ostream& out, const notify_info_t& n) {
+ return out << "notify(cookie " << n.cookie
+ << " notify" << n.notify_id
+ << " " << n.timeout << "s)";
+}
+
+class object_ref_delta_t {
+ std::map<hobject_t, int> ref_delta;
+
+public:
+ object_ref_delta_t() = default;
+ object_ref_delta_t(const object_ref_delta_t &) = default;
+ object_ref_delta_t(object_ref_delta_t &&) = default;
+
+ object_ref_delta_t(decltype(ref_delta) &&ref_delta)
+ : ref_delta(std::move(ref_delta)) {}
+ object_ref_delta_t(const decltype(ref_delta) &ref_delta)
+ : ref_delta(ref_delta) {}
+
+ object_ref_delta_t &operator=(const object_ref_delta_t &) = default;
+ object_ref_delta_t &operator=(object_ref_delta_t &&) = default;
+
+ void dec_ref(const hobject_t &hoid, unsigned num=1) {
+ mut_ref(hoid, -num);
+ }
+ void inc_ref(const hobject_t &hoid, unsigned num=1) {
+ mut_ref(hoid, num);
+ }
+ void mut_ref(const hobject_t &hoid, int num) {
+ [[maybe_unused]] auto [iter, _] = ref_delta.try_emplace(hoid, 0);
+ iter->second += num;
+ if (iter->second == 0)
+ ref_delta.erase(iter);
+ }
+
+ auto begin() const { return ref_delta.begin(); }
+ auto end() const { return ref_delta.end(); }
+ auto find(hobject_t &key) const { return ref_delta.find(key); }
+
+ bool operator==(const object_ref_delta_t &rhs) const {
+ return ref_delta == rhs.ref_delta;
+ }
+ bool operator!=(const object_ref_delta_t &rhs) const {
+ return !(*this == rhs);
+ }
+ bool is_empty() {
+ return ref_delta.empty();
+ }
+ uint64_t size() {
+ return ref_delta.size();
+ }
+ friend std::ostream& operator<<(std::ostream& out, const object_ref_delta_t & ci);
+};
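+
+// Illustrative sketch (not upstream code): deltas that net out to zero are
+// erased, so an inc_ref() followed by a matching dec_ref() leaves the delta
+// empty - only net reference changes ever reach the refcounting machinery.
+inline bool example_delta_cancels(const hobject_t& target) {
+  object_ref_delta_t delta;
+  delta.inc_ref(target);
+  delta.dec_ref(target);    // entry for `target` nets to 0 and is removed
+  return delta.is_empty();  // true
+}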
+
+struct chunk_info_t {
+ typedef enum {
+ FLAG_DIRTY = 1,
+ FLAG_MISSING = 2,
+ FLAG_HAS_REFERENCE = 4,
+ FLAG_HAS_FINGERPRINT = 8,
+ } cflag_t;
+ uint32_t offset;
+ uint32_t length;
+ hobject_t oid;
+ cflag_t flags; // FLAG_*
+
+ chunk_info_t() : offset(0), length(0), flags((cflag_t)0) { }
+ chunk_info_t(uint32_t offset, uint32_t length, hobject_t oid) :
+ offset(offset), length(length), oid(oid), flags((cflag_t)0) { }
+
+ static std::string get_flag_string(uint64_t flags) {
+ std::string r;
+ if (flags & FLAG_DIRTY) {
+ r += "|dirty";
+ }
+ if (flags & FLAG_MISSING) {
+ r += "|missing";
+ }
+ if (flags & FLAG_HAS_REFERENCE) {
+ r += "|has_reference";
+ }
+ if (flags & FLAG_HAS_FINGERPRINT) {
+ r += "|has_fingerprint";
+ }
+ if (r.length())
+ return r.substr(1);
+ return r;
+ }
+ bool test_flag(cflag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(cflag_t f) {
+ flags = (cflag_t)(flags | f);
+ }
+ void set_flags(cflag_t f) {
+ flags = f;
+ }
+ void clear_flag(cflag_t f) {
+ flags = (cflag_t)(flags & ~f);
+ }
+ void clear_flags() {
+ flags = (cflag_t)0;
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+ bool is_missing() const {
+ return test_flag(FLAG_MISSING);
+ }
+ bool has_reference() const {
+ return test_flag(FLAG_HAS_REFERENCE);
+ }
+ bool has_fingerprint() const {
+ return test_flag(FLAG_HAS_FINGERPRINT);
+ }
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ friend std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
+ bool operator==(const chunk_info_t& cit) const;
+ bool operator!=(const chunk_info_t& cit) const {
+ return !(cit == *this);
+ }
+};
+WRITE_CLASS_ENCODER(chunk_info_t)
+std::ostream& operator<<(std::ostream& out, const chunk_info_t& ci);
+
+struct object_info_t;
+struct object_manifest_t {
+ enum {
+ TYPE_NONE = 0,
+ TYPE_REDIRECT = 1,
+ TYPE_CHUNKED = 2,
+ };
+ uint8_t type; // redirect, chunked, ...
+ hobject_t redirect_target;
+ std::map<uint64_t, chunk_info_t> chunk_map;
+
+ object_manifest_t() : type(0) { }
+ object_manifest_t(uint8_t type, const hobject_t& redirect_target)
+ : type(type), redirect_target(redirect_target) { }
+
+ bool is_empty() const {
+ return type == TYPE_NONE;
+ }
+ bool is_redirect() const {
+ return type == TYPE_REDIRECT;
+ }
+ bool is_chunked() const {
+ return type == TYPE_CHUNKED;
+ }
+ static std::string_view get_type_name(uint8_t m) {
+ switch (m) {
+ case TYPE_NONE: return "none";
+ case TYPE_REDIRECT: return "redirect";
+ case TYPE_CHUNKED: return "chunked";
+ default: return "unknown";
+ }
+ }
+ std::string_view get_type_name() const {
+ return get_type_name(type);
+ }
+ void clear() {
+ type = 0;
+ redirect_target = hobject_t();
+ chunk_map.clear();
+ }
+
+ /**
+ * calc_refs_to_inc_on_set
+ *
+ * Takes a manifest and returns the set of refs to
+ * increment upon set-chunk
+ *
+ * l should be nullptr if there are no clones, or
+ * l and g may each be null if the corresponding clone does not exist.
+ * *this contains the set of new references to set
+ *
+ */
+ void calc_refs_to_inc_on_set(
+ const object_manifest_t* g, ///< [in] manifest for clone > *this
+ const object_manifest_t* l, ///< [in] manifest for clone < *this
+    object_ref_delta_t &delta   ///< [out] set of refs to increment
+ ) const;
+
+ /**
+ * calc_refs_to_drop_on_modify
+ *
+ * Takes a manifest and returns the set of refs to
+ * drop upon modification
+ *
+ * l should be nullptr if there are no clones, or
+ * l may be null if the corresponding clone does not exist.
+ *
+ */
+ void calc_refs_to_drop_on_modify(
+ const object_manifest_t* l, ///< [in] manifest for previous clone
+ const ObjectCleanRegions& clean_regions, ///< [in] clean regions
+ object_ref_delta_t &delta ///< [out] set of refs to drop
+ ) const;
+
+ /**
+ * calc_refs_to_drop_on_removal
+ *
+ * Takes the two adjacent manifests and returns the set of refs to
+ * drop upon removal of the clone containing *this.
+ *
+ * g should be nullptr if *this is on HEAD, l should be nullptr if
+ * *this is on the oldest clone (or head if there are no clones).
+ */
+ void calc_refs_to_drop_on_removal(
+ const object_manifest_t* g, ///< [in] manifest for clone > *this
+ const object_manifest_t* l, ///< [in] manifest for clone < *this
+ object_ref_delta_t &delta ///< [out] set of refs to drop
+ ) const;
+
+ static void generate_test_instances(std::list<object_manifest_t*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ void dump(ceph::Formatter *f) const;
+ friend std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
+};
+WRITE_CLASS_ENCODER(object_manifest_t)
+std::ostream& operator<<(std::ostream& out, const object_manifest_t& oi);
+
+struct object_info_t {
+ hobject_t soid;
+ eversion_t version, prior_version;
+ version_t user_version;
+ osd_reqid_t last_reqid;
+
+ uint64_t size;
+ utime_t mtime;
+ utime_t local_mtime; // local mtime
+
+ // note: these are currently encoded into a total 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ FLAG_DIRTY = 1<<2, // object has been modified since last flushed or undirtied
+ FLAG_OMAP = 1<<3, // has (or may have) some/any omap data
+ FLAG_DATA_DIGEST = 1<<4, // has data crc
+ FLAG_OMAP_DIGEST = 1<<5, // has omap crc
+ FLAG_CACHE_PIN = 1<<6, // pin the object in cache tier
+ FLAG_MANIFEST = 1<<7, // has manifest
+ FLAG_USES_TMAP = 1<<8, // deprecated; no longer used
+ FLAG_REDIRECT_HAS_REFERENCE = 1<<9, // has reference
+ } flag_t;
+
+ flag_t flags;
+
+ static std::string get_flag_string(flag_t flags) {
+ std::string s;
+ for (const auto& ss : get_flag_vector(flags)) {
+ s += "|" + ss;
+ }
+ if (!s.empty())
+ return s.substr(1);
+ return s;
+ }
+ static std::vector<std::string> get_flag_vector(flag_t flags) {
+ std::vector<std::string> sv;
+ if (flags & FLAG_LOST)
+ sv.insert(sv.end(), "lost");
+ if (flags & FLAG_WHITEOUT)
+ sv.insert(sv.end(), "whiteout");
+ if (flags & FLAG_DIRTY)
+ sv.insert(sv.end(), "dirty");
+ if (flags & FLAG_USES_TMAP)
+ sv.insert(sv.end(), "uses_tmap");
+ if (flags & FLAG_OMAP)
+ sv.insert(sv.end(), "omap");
+ if (flags & FLAG_DATA_DIGEST)
+ sv.insert(sv.end(), "data_digest");
+ if (flags & FLAG_OMAP_DIGEST)
+ sv.insert(sv.end(), "omap_digest");
+ if (flags & FLAG_CACHE_PIN)
+ sv.insert(sv.end(), "cache_pin");
+ if (flags & FLAG_MANIFEST)
+ sv.insert(sv.end(), "manifest");
+ if (flags & FLAG_REDIRECT_HAS_REFERENCE)
+ sv.insert(sv.end(), "redirect_has_reference");
+ return sv;
+ }
+ std::string get_flag_string() const {
+ return get_flag_string(flags);
+ }
+
+ uint64_t truncate_seq, truncate_size;
+
+ std::map<std::pair<uint64_t, entity_name_t>, watch_info_t> watchers;
+
+ // opportunistic checksums; may or may not be present
+ __u32 data_digest; ///< data crc32c
+ __u32 omap_digest; ///< omap crc32c
+
+ // alloc hint attribute
+ uint64_t expected_object_size, expected_write_size;
+ uint32_t alloc_hint_flags;
+
+ struct object_manifest_t manifest;
+
+ void copy_user_bits(const object_info_t& other);
+
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+ bool is_dirty() const {
+ return test_flag(FLAG_DIRTY);
+ }
+ bool is_omap() const {
+ return test_flag(FLAG_OMAP);
+ }
+ bool is_data_digest() const {
+ return test_flag(FLAG_DATA_DIGEST);
+ }
+ bool is_omap_digest() const {
+ return test_flag(FLAG_OMAP_DIGEST);
+ }
+ bool is_cache_pinned() const {
+ return test_flag(FLAG_CACHE_PIN);
+ }
+ bool has_manifest() const {
+ return test_flag(FLAG_MANIFEST);
+ }
+ void set_data_digest(__u32 d) {
+ set_flag(FLAG_DATA_DIGEST);
+ data_digest = d;
+ }
+ void set_omap_digest(__u32 d) {
+ set_flag(FLAG_OMAP_DIGEST);
+ omap_digest = d;
+ }
+ void clear_data_digest() {
+ clear_flag(FLAG_DATA_DIGEST);
+ data_digest = -1;
+ }
+ void clear_omap_digest() {
+ clear_flag(FLAG_OMAP_DIGEST);
+ omap_digest = -1;
+ }
+ void new_object() {
+ clear_data_digest();
+ clear_omap_digest();
+ }
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void decode(const ceph::buffer::list& bl) {
+ auto p = std::cbegin(bl);
+ decode(p);
+ }
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object_info_t*>& o);
+
+ explicit object_info_t()
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0),
+ data_digest(-1), omap_digest(-1),
+ expected_object_size(0), expected_write_size(0),
+ alloc_hint_flags(0)
+ {}
+
+ explicit object_info_t(const hobject_t& s)
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0),
+ data_digest(-1), omap_digest(-1),
+ expected_object_size(0), expected_write_size(0),
+ alloc_hint_flags(0)
+ {}
+
+ explicit object_info_t(ceph::buffer::list& bl) {
+ decode(bl);
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(object_info_t)
+
+std::ostream& operator<<(std::ostream& out, const object_info_t& oi);
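+
+// Illustrative use of the flag and digest helpers above (a hypothetical caller;
+// `soid` and the crc value are made up for the example):
+//
+//   object_info_t oi(soid);
+//   oi.set_flag(object_info_t::FLAG_DIRTY);
+//   oi.set_data_digest(0x2d2b5e6b);    // also sets FLAG_DATA_DIGEST
+//   ceph_assert(oi.is_dirty() && oi.is_data_digest());
+//   oi.get_flag_string();              // "dirty|data_digest"
+//   oi.clear_data_digest();            // digest back to -1, flag cleared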
+
+
+
+// Object recovery
+struct ObjectRecoveryInfo {
+ hobject_t soid;
+ eversion_t version;
+ uint64_t size;
+ object_info_t oi;
+ SnapSet ss; // only populated if soid is_snap()
+ interval_set<uint64_t> copy_subset;
+ std::map<hobject_t, interval_set<uint64_t>> clone_subset;
+ bool object_exist;
+
+ ObjectRecoveryInfo() : size(0), object_exist(true) { }
+
+ static void generate_test_instances(std::list<ObjectRecoveryInfo*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl, int64_t pool = -1);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(ObjectRecoveryInfo)
+std::ostream& operator<<(std::ostream& out, const ObjectRecoveryInfo &inf);
+
+struct ObjectRecoveryProgress {
+ uint64_t data_recovered_to;
+ std::string omap_recovered_to;
+ bool first;
+ bool data_complete;
+ bool omap_complete;
+ bool error = false;
+
+ ObjectRecoveryProgress()
+ : data_recovered_to(0),
+ first(true),
+ data_complete(false), omap_complete(false) { }
+
+ bool is_complete(const ObjectRecoveryInfo& info) const {
+ return (data_recovered_to >= (
+ info.copy_subset.empty() ?
+ 0 : info.copy_subset.range_end())) &&
+ omap_complete;
+ }
+
+ static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(ObjectRecoveryProgress)
+std::ostream& operator<<(std::ostream& out, const ObjectRecoveryProgress &prog);
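+
+// A minimal sketch of how is_complete() ties the two structures together
+// (hypothetical values; interval_set::insert(offset, length) is assumed here,
+// while range_end() is the call already used by is_complete() above):
+//
+//   ObjectRecoveryInfo info;
+//   info.copy_subset.insert(0, 4096);    // recover bytes [0, 4096)
+//   ObjectRecoveryProgress prog;
+//   prog.data_recovered_to = 4096;       // reached copy_subset.range_end()
+//   prog.omap_complete = true;
+//   ceph_assert(prog.is_complete(info));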
+
+struct PushReplyOp {
+ hobject_t soid;
+
+ static void generate_test_instances(std::list<PushReplyOp*>& o);
+ void encode(ceph::buffer::list &bl) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER(PushReplyOp)
+std::ostream& operator<<(std::ostream& out, const PushReplyOp &op);
+
+struct PullOp {
+ hobject_t soid;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress recovery_progress;
+
+ static void generate_test_instances(std::list<PullOp*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PullOp)
+std::ostream& operator<<(std::ostream& out, const PullOp &op);
+
+struct PushOp {
+ hobject_t soid;
+ eversion_t version;
+ ceph::buffer::list data;
+ interval_set<uint64_t> data_included;
+ ceph::buffer::list omap_header;
+ std::map<std::string, ceph::buffer::list> omap_entries;
+ std::map<std::string, ceph::buffer::list> attrset;
+
+ ObjectRecoveryInfo recovery_info;
+ ObjectRecoveryProgress before_progress;
+ ObjectRecoveryProgress after_progress;
+
+ static void generate_test_instances(std::list<PushOp*>& o);
+ void encode(ceph::buffer::list &bl, uint64_t features) const;
+ void decode(ceph::buffer::list::const_iterator &bl);
+ std::ostream &print(std::ostream &out) const;
+ void dump(ceph::Formatter *f) const;
+
+ uint64_t cost(CephContext *cct) const;
+};
+WRITE_CLASS_ENCODER_FEATURES(PushOp)
+std::ostream& operator<<(std::ostream& out, const PushOp &op);
+
+enum class scrub_level_t : bool { shallow = false, deep = true };
+enum class scrub_type_t : bool { not_repair = false, do_repair = true };
+
+/*
+ * summarize pg contents for purposes of a scrub
+ */
+struct ScrubMap {
+ struct object {
+ std::map<std::string, ceph::buffer::ptr> attrs;
+ uint64_t size;
+ __u32 omap_digest; ///< omap crc32c
+ __u32 digest; ///< data crc32c
+ bool negative:1;
+ bool digest_present:1;
+ bool omap_digest_present:1;
+ bool read_error:1;
+ bool stat_error:1;
+ bool ec_hash_mismatch:1;
+ bool ec_size_mismatch:1;
+ bool large_omap_object_found:1;
+ uint64_t large_omap_object_key_count = 0;
+ uint64_t large_omap_object_value_size = 0;
+ uint64_t object_omap_bytes = 0;
+ uint64_t object_omap_keys = 0;
+
+ object() :
+ // Init invalid size so it won't match if we get a stat EIO error
+ size(-1), omap_digest(0), digest(0),
+ negative(false), digest_present(false), omap_digest_present(false),
+ read_error(false), stat_error(false), ec_hash_mismatch(false),
+ ec_size_mismatch(false), large_omap_object_found(false) {}
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<object*>& o);
+ };
+ WRITE_CLASS_ENCODER(object)
+
+ std::map<hobject_t,object> objects;
+ eversion_t valid_through;
+ eversion_t incr_since;
+ bool has_large_omap_object_errors:1;
+ bool has_omap_keys:1;
+
+ void merge_incr(const ScrubMap &l);
+ void clear_from(const hobject_t& start) {
+ objects.erase(objects.lower_bound(start), objects.end());
+ }
+ void insert(const ScrubMap &r) {
+ objects.insert(r.objects.begin(), r.objects.end());
+ }
+ void swap(ScrubMap &r) {
+ using std::swap;
+ swap(objects, r.objects);
+ swap(valid_through, r.valid_through);
+ swap(incr_since, r.incr_since);
+ }
+
+ void encode(ceph::buffer::list& bl) const;
+ void decode(ceph::buffer::list::const_iterator& bl, int64_t pool=-1);
+ void dump(ceph::Formatter *f) const;
+ static void generate_test_instances(std::list<ScrubMap*>& o);
+};
+WRITE_CLASS_ENCODER(ScrubMap::object)
+WRITE_CLASS_ENCODER(ScrubMap)
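+
+// Sketch of the accumulation pattern used later in this change (see
+// build_replica_map_chunk() in pg_scrubber.cc): stale entries from the chunk
+// start onwards are dropped, then the freshly built chunk map is folded in:
+//
+//   accumulated.clear_from(chunk_start);   // `accumulated`, `chunk_start` and
+//   accumulated.insert(chunk_map);         // `chunk_map` are hypothetical names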
+
+struct ScrubMapBuilder {
+ bool deep = false;
+ std::vector<hobject_t> ls;
+ size_t pos = 0;
+ int64_t data_pos = 0;
+ std::string omap_pos;
+ int ret = 0;
+ ceph::buffer::hash data_hash, omap_hash; ///< accumulating hash value
+ uint64_t omap_keys = 0;
+ uint64_t omap_bytes = 0;
+
+ bool empty() {
+ return ls.empty();
+ }
+ bool done() {
+ return pos >= ls.size();
+ }
+ void reset() {
+ *this = ScrubMapBuilder();
+ }
+
+ bool data_done() {
+ return data_pos < 0;
+ }
+
+ void next_object() {
+ ++pos;
+ data_pos = 0;
+ omap_pos.clear();
+ omap_keys = 0;
+ omap_bytes = 0;
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const ScrubMapBuilder& pos) {
+ out << "(" << pos.pos << "/" << pos.ls.size();
+ if (pos.pos < pos.ls.size()) {
+ out << " " << pos.ls[pos.pos];
+ }
+ if (pos.data_pos < 0) {
+ out << " byte " << pos.data_pos;
+ }
+ if (!pos.omap_pos.empty()) {
+ out << " key " << pos.omap_pos;
+ }
+ if (pos.deep) {
+ out << " deep";
+ }
+ if (pos.ret) {
+ out << " ret " << pos.ret;
+ }
+ return out << ")";
+ }
+};
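+
+// A sketch of the intended driving loop (it mirrors build_scrub_map_chunk() in
+// pg_scrubber.cc, added in this change; `list_chunk_objects` and
+// `backend_scan_one` are hypothetical stand-ins for the backend calls):
+//
+//   ScrubMap map;
+//   ScrubMapBuilder pos;
+//   pos.deep = true;
+//   pos.ls = list_chunk_objects();          // fill the object list once
+//   while (!pos.done()) {
+//     if (backend_scan_one(map, pos) == -EINPROGRESS)
+//       return -EINPROGRESS;                // resume later from the same pos
+//   }
+//   pos.reset();                            // ready for the next chunk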
+
+struct watch_item_t {
+ entity_name_t name;
+ uint64_t cookie;
+ uint32_t timeout_seconds;
+ entity_addr_t addr;
+
+ watch_item_t() : cookie(0), timeout_seconds(0) { }
+ watch_item_t(entity_name_t name, uint64_t cookie, uint32_t timeout,
+ const entity_addr_t& addr)
+ : name(name), cookie(cookie), timeout_seconds(timeout),
+ addr(addr) { }
+
+ void encode(ceph::buffer::list &bl, uint64_t features) const {
+ ENCODE_START(2, 1, bl);
+ encode(name, bl);
+ encode(cookie, bl);
+ encode(timeout_seconds, bl);
+ encode(addr, bl, features);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator &bl) {
+ DECODE_START(2, bl);
+ decode(name, bl);
+ decode(cookie, bl);
+ decode(timeout_seconds, bl);
+ if (struct_v >= 2) {
+ decode(addr, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("watcher") << name;
+ f->dump_int("cookie", cookie);
+ f->dump_int("timeout", timeout_seconds);
+ f->open_object_section("addr");
+ addr.dump(f);
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<watch_item_t*>& o) {
+ entity_addr_t ea;
+ ea.set_type(entity_addr_t::TYPE_LEGACY);
+ ea.set_nonce(1000);
+ ea.set_family(AF_INET);
+ ea.set_in4_quad(0, 127);
+ ea.set_in4_quad(1, 0);
+ ea.set_in4_quad(2, 0);
+ ea.set_in4_quad(3, 1);
+ ea.set_port(1024);
+ o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 1), 10, 30, ea));
+ ea.set_nonce(1001);
+ ea.set_in4_quad(3, 2);
+ ea.set_port(1025);
+ o.push_back(new watch_item_t(entity_name_t(entity_name_t::TYPE_CLIENT, 2), 20, 60, ea));
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(watch_item_t)
+
+struct obj_watch_item_t {
+ hobject_t obj;
+ watch_item_t wi;
+};
+
+/**
+ * obj list watch response format
+ *
+ */
+struct obj_list_watch_response_t {
+ std::list<watch_item_t> entries;
+
+ void encode(ceph::buffer::list& bl, uint64_t features) const {
+ ENCODE_START(1, 1, bl);
+ encode(entries, bl, features);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(entries, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("entries");
+ for (std::list<watch_item_t>::const_iterator p = entries.begin(); p != entries.end(); ++p) {
+ f->open_object_section("watch");
+ p->dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<obj_list_watch_response_t*>& o) {
+ entity_addr_t ea;
+ o.push_back(new obj_list_watch_response_t);
+ o.push_back(new obj_list_watch_response_t);
+ std::list<watch_item_t*> test_watchers;
+ watch_item_t::generate_test_instances(test_watchers);
+ for (auto &e : test_watchers) {
+ o.back()->entries.push_back(*e);
+ delete e;
+ }
+ }
+};
+WRITE_CLASS_ENCODER_FEATURES(obj_list_watch_response_t)
+
+struct clone_info {
+ snapid_t cloneid;
+ std::vector<snapid_t> snaps; // ascending
+ std::vector< std::pair<uint64_t,uint64_t> > overlap;
+ uint64_t size;
+
+ clone_info() : cloneid(CEPH_NOSNAP), size(0) {}
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(cloneid, bl);
+ encode(snaps, bl);
+ encode(overlap, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(cloneid, bl);
+ decode(snaps, bl);
+ decode(overlap, bl);
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ if (cloneid == CEPH_NOSNAP)
+ f->dump_string("cloneid", "HEAD");
+ else
+ f->dump_unsigned("cloneid", cloneid.val);
+ f->open_array_section("snapshots");
+ for (std::vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
+ f->open_object_section("snap");
+ f->dump_unsigned("id", p->val);
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("overlaps");
+ for (std::vector< std::pair<uint64_t,uint64_t> >::const_iterator q = overlap.begin();
+ q != overlap.end(); ++q) {
+ f->open_object_section("overlap");
+ f->dump_unsigned("offset", q->first);
+ f->dump_unsigned("length", q->second);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("size", size);
+ }
+ static void generate_test_instances(std::list<clone_info*>& o) {
+ o.push_back(new clone_info);
+ o.push_back(new clone_info);
+ o.back()->cloneid = 1;
+ o.back()->snaps.push_back(1);
+ o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
+ o.back()->overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
+ o.back()->size = 16384;
+ o.push_back(new clone_info);
+ o.back()->cloneid = CEPH_NOSNAP;
+ o.back()->size = 32768;
+ }
+};
+WRITE_CLASS_ENCODER(clone_info)
+
+/**
+ * obj list snaps response format
+ *
+ */
+struct obj_list_snap_response_t {
+ std::vector<clone_info> clones; // ascending
+ snapid_t seq;
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(clones, bl);
+ encode(seq, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(clones, bl);
+ if (struct_v >= 2)
+ decode(seq, bl);
+ else
+ seq = CEPH_NOSNAP;
+ DECODE_FINISH(bl);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->open_array_section("clones");
+ for (std::vector<clone_info>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
+ f->open_object_section("clone");
+ p->dump(f);
+ f->close_section();
+ }
+ f->dump_unsigned("seq", seq);
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<obj_list_snap_response_t*>& o) {
+ o.push_back(new obj_list_snap_response_t);
+ o.push_back(new obj_list_snap_response_t);
+ clone_info cl;
+ cl.cloneid = 1;
+ cl.snaps.push_back(1);
+ cl.overlap.push_back(std::pair<uint64_t,uint64_t>(0,4096));
+ cl.overlap.push_back(std::pair<uint64_t,uint64_t>(8192,4096));
+ cl.size = 16384;
+ o.back()->clones.push_back(cl);
+ cl.cloneid = CEPH_NOSNAP;
+ cl.snaps.clear();
+ cl.overlap.clear();
+ cl.size = 32768;
+ o.back()->clones.push_back(cl);
+ o.back()->seq = 123;
+ }
+};
+
+WRITE_CLASS_ENCODER(obj_list_snap_response_t)
+
+// PromoteCounter
+
+struct PromoteCounter {
+ std::atomic<unsigned long long> attempts{0};
+ std::atomic<unsigned long long> objects{0};
+ std::atomic<unsigned long long> bytes{0};
+
+ void attempt() {
+ attempts++;
+ }
+
+ void finish(uint64_t size) {
+ objects++;
+ bytes += size;
+ }
+
+ void sample_and_attenuate(uint64_t *a, uint64_t *o, uint64_t *b) {
+ *a = attempts;
+ *o = objects;
+ *b = bytes;
+ attempts = *a / 2;
+ objects = *o / 2;
+ bytes = *b / 2;
+ }
+};
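+
+// Usage sketch (a hypothetical tiering-agent caller): record attempts and
+// completed promotions, then periodically sample-and-halve so the counters
+// track recent activity rather than all-time totals:
+//
+//   PromoteCounter pc;
+//   pc.attempt();
+//   pc.finish(4096);                        // one object, 4096 bytes promoted
+//   uint64_t a, o, b;
+//   pc.sample_and_attenuate(&a, &o, &b);    // a=1, o=1, b=4096; counters halved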
+
+struct pool_pg_num_history_t {
+ /// last epoch updated
+ epoch_t epoch = 0;
+ /// poolid -> epoch -> pg_num
+ std::map<int64_t, std::map<epoch_t,uint32_t>> pg_nums;
+ /// pair(epoch, poolid)
+ std::set<std::pair<epoch_t,int64_t>> deleted_pools;
+
+ void log_pg_num_change(epoch_t epoch, int64_t pool, uint32_t pg_num) {
+ pg_nums[pool][epoch] = pg_num;
+ }
+ void log_pool_delete(epoch_t epoch, int64_t pool) {
+ deleted_pools.insert(std::make_pair(epoch, pool));
+ }
+
+ /// prune history based on oldest osdmap epoch in the cluster
+ void prune(epoch_t oldest_epoch) {
+ auto i = deleted_pools.begin();
+ while (i != deleted_pools.end()) {
+ if (i->first >= oldest_epoch) {
+ break;
+ }
+ pg_nums.erase(i->second);
+ i = deleted_pools.erase(i);
+ }
+ for (auto& j : pg_nums) {
+ auto k = j.second.lower_bound(oldest_epoch);
+ // keep this and the entry before it (just to be paranoid)
+ if (k != j.second.begin()) {
+ --k;
+ j.second.erase(j.second.begin(), k);
+ }
+ }
+ }
+
+ void encode(ceph::buffer::list& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(epoch, bl);
+ encode(pg_nums, bl);
+ encode(deleted_pools, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(ceph::buffer::list::const_iterator& p) {
+ DECODE_START(1, p);
+ decode(epoch, p);
+ decode(pg_nums, p);
+ decode(deleted_pools, p);
+ DECODE_FINISH(p);
+ }
+ void dump(ceph::Formatter *f) const {
+ f->dump_unsigned("epoch", epoch);
+ f->open_object_section("pools");
+ for (auto& i : pg_nums) {
+ f->open_object_section("pool");
+ f->dump_unsigned("pool_id", i.first);
+ f->open_array_section("changes");
+ for (auto& j : i.second) {
+ f->open_object_section("change");
+ f->dump_unsigned("epoch", j.first);
+ f->dump_unsigned("pg_num", j.second);
+ f->close_section();
+ }
+ f->close_section();
+ f->close_section();
+ }
+ f->close_section();
+ f->open_array_section("deleted_pools");
+ for (auto& i : deleted_pools) {
+ f->open_object_section("deletion");
+ f->dump_unsigned("pool_id", i.second);
+ f->dump_unsigned("epoch", i.first);
+ f->close_section();
+ }
+ f->close_section();
+ }
+ static void generate_test_instances(std::list<pool_pg_num_history_t*>& ls) {
+ ls.push_back(new pool_pg_num_history_t);
+ }
+ friend std::ostream& operator<<(std::ostream& out, const pool_pg_num_history_t& h) {
+ return out << "pg_num_history(e" << h.epoch
+ << " pg_nums " << h.pg_nums
+ << " deleted_pools " << h.deleted_pools
+ << ")";
+ }
+};
+WRITE_CLASS_ENCODER(pool_pg_num_history_t)
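+
+// Usage sketch (hypothetical epochs and pool ids): record pg_num changes and
+// pool deletions as they are observed, then prune once every OSD has caught up
+// to a given epoch:
+//
+//   pool_pg_num_history_t h;
+//   h.log_pg_num_change(100, 1, 64);    // pool 1: pg_num 64 as of epoch 100
+//   h.log_pg_num_change(120, 1, 128);
+//   h.log_pg_num_change(140, 1, 256);
+//   h.log_pool_delete(110, 2);          // pool 2 deleted in epoch 110
+//   h.epoch = 140;
+//   h.prune(125);  // drops the pool-2 record and the epoch-100 entry;
+//                  // keeps epochs 120 and 140 for pool 1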
+
+// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
+// easily skip them
+static const std::string_view infover_key = "_infover";
+static const std::string_view info_key = "_info";
+static const std::string_view biginfo_key = "_biginfo";
+static const std::string_view epoch_key = "_epoch";
+static const std::string_view fastinfo_key = "_fastinfo";
+
+static const __u8 pg_latest_struct_v = 10;
+// v10 is the new past_intervals encoding
+// v9 was fastinfo_key addition
+// v8 was the move to a per-pg pgmeta object
+// v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
+// (first appeared in cuttlefish).
+static const __u8 pg_compat_struct_v = 10;
+
+int prepare_info_keymap(
+ CephContext* cct,
+ std::map<std::string,ceph::buffer::list> *km,
+ std::string *key_to_remove,
+ epoch_t epoch,
+ pg_info_t &info,
+ pg_info_t &last_written_info,
+ PastIntervals &past_intervals,
+ bool dirty_big_info,
+ bool dirty_epoch,
+ bool try_fast_info,
+ PerfCounters *logger = nullptr,
+ DoutPrefixProvider *dpp = nullptr);
+
+namespace ceph::os {
+ class Transaction;
+};
+
+void create_pg_collection(
+ ceph::os::Transaction& t, spg_t pgid, int bits);
+
+void init_pg_ondisk(
+ ceph::os::Transaction& t, spg_t pgid, const pg_pool_t *pool);
+
+// omap specific stats
+struct omap_stat_t {
+ int large_omap_objects;
+ int64_t omap_bytes;
+ int64_t omap_keys;
+};
+
+// filter for pg listings
+class PGLSFilter {
+ CephContext* cct;
+protected:
+ std::string xattr;
+public:
+ PGLSFilter();
+ virtual ~PGLSFilter();
+ virtual bool filter(const hobject_t &obj,
+ const ceph::buffer::list& xattr_data) const = 0;
+
+ /**
+ * Arguments passed from the RADOS client. Implementations must
+ * handle any encoding errors, and return an appropriate error code,
+ * or 0 on valid input.
+ */
+ virtual int init(ceph::buffer::list::const_iterator &params) = 0;
+
+ /**
+ * xattr key, or empty string. If non-empty, this xattr will be fetched
+ * and the value passed into ::filter
+ */
+ virtual const std::string& get_xattr() const { return xattr; }
+
+ /**
+ * If true, objects without the named xattr (if xattr name is not empty)
+ * will be rejected without calling ::filter
+ */
+ virtual bool reject_empty_xattr() const { return true; }
+};
+
+class PGLSPlainFilter : public PGLSFilter {
+ std::string val;
+public:
+ int init(ceph::buffer::list::const_iterator &params) override;
+ ~PGLSPlainFilter() override {}
+ bool filter(const hobject_t& obj,
+ const ceph::buffer::list& xattr_data) const override;
+};
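+
+// A minimal sketch of what a concrete filter() might look like (hypothetical
+// implementation; the real one lives in the .cc file): match objects whose
+// xattr payload equals the client-supplied value.
+//
+//   bool PGLSPlainFilter::filter(const hobject_t&,
+//                                const ceph::buffer::list& xattr_data) const
+//   {
+//     return xattr_data.contents_equal(val.c_str(), val.size());
+//   }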
+
+// alias for the scrubber's map of objects to their (data digest, omap digest) pairs:
+using missing_map_t = std::map<hobject_t,
+ std::pair<std::optional<uint32_t>,
+ std::optional<uint32_t>>>;
+
+#endif
diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc
new file mode 100644
index 000000000..20ab0a1aa
--- /dev/null
+++ b/src/osd/pg_scrubber.cc
@@ -0,0 +1,2384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 smarttab
+
+#include "./pg_scrubber.h" // the '.' notation used to affect clang-format order
+
+#include <iostream>
+#include <vector>
+
+#include "debug.h"
+
+#include "common/errno.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrubReserve.h"
+
+#include "OSD.h"
+#include "ScrubStore.h"
+#include "scrub_machine.h"
+
+using namespace Scrub;
+using namespace std::chrono;
+using namespace std::chrono_literals;
+
+#define dout_context (m_osds->cct)
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T* t)
+{
+ return t->gen_prefix(*_dout);
+}
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf)
+{
+ if (sf.auto_repair)
+ out << " AUTO_REPAIR";
+ if (sf.check_repair)
+ out << " CHECK_REPAIR";
+ if (sf.deep_scrub_on_error)
+ out << " DEEP_SCRUB_ON_ERROR";
+ if (sf.required)
+ out << " REQ_SCRUB";
+
+ return out;
+}
+
+ostream& operator<<(ostream& out, const requested_scrub_t& sf)
+{
+ if (sf.must_repair)
+ out << " MUST_REPAIR";
+ if (sf.auto_repair)
+ out << " planned AUTO_REPAIR";
+ if (sf.check_repair)
+ out << " planned CHECK_REPAIR";
+ if (sf.deep_scrub_on_error)
+ out << " planned DEEP_SCRUB_ON_ERROR";
+ if (sf.must_deep_scrub)
+ out << " MUST_DEEP_SCRUB";
+ if (sf.must_scrub)
+ out << " MUST_SCRUB";
+ if (sf.time_for_deep)
+ out << " TIME_FOR_DEEP";
+ if (sf.need_auto)
+ out << " NEED_AUTO";
+ if (sf.req_scrub)
+ out << " planned REQ_SCRUB";
+
+ return out;
+}
+
+/*
+ * if the incoming message is from a previous interval, it must mean
+ * PrimaryLogPG::on_change() was called when that interval ended. We can safely discard
+ * the stale message.
+ */
+bool PgScrubber::check_interval(epoch_t epoch_to_verify)
+{
+ return epoch_to_verify >= m_pg->get_same_interval_since();
+}
+
+bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
+{
+ if (!m_active) {
+ // not scrubbing. We can assume that the scrub was already terminated, and we
+ // can silently discard the incoming event.
+ return false;
+ }
+
+ // is this a message from before we started this scrub?
+ if (epoch_to_verify < m_epoch_start) {
+ return false;
+ }
+
+ // has a new interval started?
+ if (!check_interval(epoch_to_verify)) {
+ // if this is a new interval, on_change() has already terminated that
+ // old scrub.
+ return false;
+ }
+
+ ceph_assert(is_primary());
+
+ // were we instructed to abort?
+ return verify_against_abort(epoch_to_verify);
+}
+
+bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify)
+{
+ if (!should_abort()) {
+ return true;
+ }
+
+ dout(10) << __func__ << " aborting. incoming epoch: " << epoch_to_verify
+ << " vs last-aborted: " << m_last_aborted << dendl;
+
+ // if we were not aware of the abort before - kill the scrub.
+ if (epoch_to_verify >= m_last_aborted) {
+ scrub_clear_state();
+ m_last_aborted = std::max(epoch_to_verify, m_epoch_start);
+ }
+ return false;
+}
+
+bool PgScrubber::should_abort() const
+{
+ if (m_flags.required) {
+ return false; // not stopping 'required' scrubs for configuration changes
+ }
+
+ if (m_is_deep) {
+ if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
+ dout(10) << "nodeep_scrub set, aborting" << dendl;
+ return true;
+ }
+ } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+ m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) {
+ dout(10) << "noscrub set, aborting" << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+// initiating state-machine events --------------------------------
+
+/*
+ * a note re the checks performed before sending scrub-initiating messages:
+ *
+ * For those scrub-initiation messages ('StartScrub', 'AfterRepairScrub') that may
+ * have been sitting in the queue while the PG changed state and became unavailable
+ * for scrubbing:
+ *
+ * check_interval() catches all major changes to the PG. As for the other conditions
+ * we might check (see is_message_relevant() above):
+ *
+ * - we are not 'active' yet, so we must not check against is_active(); and
+ *
+ * - the 'abort' flags were verified just now (when the triggering message was
+ *   queued). As those flags change only at human speed, they need not be queried
+ *   again.
+ *
+ * Some of the considerations above are also relevant to the replica-side initiation
+ * ('StartReplica' & 'StartReplicaNoWait').
+ */
+
+void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
+{
+ dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+ // we may have lost our Primary status while the message languished in the queue
+ if (check_interval(epoch_queued)) {
+ dout(10) << "scrubber event -->> StartScrub epoch: " << epoch_queued << dendl;
+ reset_epoch(epoch_queued);
+ m_fsm->process_event(StartScrub{});
+ dout(10) << "scrubber event --<< StartScrub" << dendl;
+ } else {
+ // and just in case snap trimming was blocked by the aborted scrub
+ m_pg->snap_trimmer_scrub_complete();
+ clear_queued_or_active();
+ }
+}
+
+void PgScrubber::initiate_scrub_after_repair(epoch_t epoch_queued)
+{
+ dout(15) << __func__ << " epoch: " << epoch_queued << dendl;
+ // we may have lost our Primary status while the message languished in the queue
+ if (check_interval(epoch_queued)) {
+ dout(10) << "scrubber event -->> AfterRepairScrub epoch: " << epoch_queued << dendl;
+ reset_epoch(epoch_queued);
+ m_fsm->process_event(AfterRepairScrub{});
+ dout(10) << "scrubber event --<< AfterRepairScrub" << dendl;
+ } else {
+ m_pg->snap_trimmer_scrub_complete();
+ clear_queued_or_active();
+ }
+}
+void PgScrubber::send_scrub_unblock(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Unblocked{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(InternalSchedScrub{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+ << " token: " << token << dendl;
+ if (is_primary()) {
+ // shouldn't happen. Ignore
+ dout(1) << "got a replica scrub request while Primary!" << dendl;
+ return;
+ }
+
+ if (check_interval(epoch_queued) && is_token_current(token)) {
+ // save us some time by not waiting for updates if there are none
+ // to wait for. Affects the transition from NotActive into either
+ // ReplicaWaitUpdates or ActiveReplica.
+ if (pending_active_pushes())
+ m_fsm->process_event(StartReplica{});
+ else
+ m_fsm->process_event(StartReplicaNoWait{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+ << " token: " << token << dendl;
+ if (check_interval(epoch_queued) && is_token_current(token)) {
+ m_fsm->process_event(SchedReplica{}); // retest for map availability
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::active_pushes_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(ActivePushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::update_applied_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(UpdatesApplied{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::digest_update_notification(epoch_t epoch_queued)
+{
+ // note: Primary only
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(DigestUpdate{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_local_map_done(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Scrub::IntLocalMapDone{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(GotReplicas{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(ReplicaPushesUpd{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_remotes_reserved(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ // note: scrub is not active yet
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(RemotesReserved{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_reservation_failure(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) { // do not check for 'active'!
+ m_fsm->process_event(ReservationFailure{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_full_reset(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ m_fsm->process_event(Scrub::FullReset{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_chunk_free(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(Scrub::SelectedChunkFree{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_chunk_busy(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (check_interval(epoch_queued)) {
+ m_fsm->process_event(Scrub::ChunkIsBusy{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_get_next_chunk(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+ if (is_message_relevant(epoch_queued)) {
+ m_fsm->process_event(Scrub::NextChunk{});
+ }
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_scrub_is_finished(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ // can't check for "active"
+
+ m_fsm->process_event(Scrub::ScrubFinished{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+void PgScrubber::send_maps_compared(epoch_t epoch_queued)
+{
+ dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+
+ m_fsm->process_event(Scrub::MapsCompared{});
+
+ dout(10) << "scrubber event --<< " << __func__ << dendl;
+}
+
+// -----------------
+
+bool PgScrubber::is_reserving() const
+{
+ return m_fsm->is_reserving();
+}
+
+void PgScrubber::reset_epoch(epoch_t epoch_queued)
+{
+ dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl;
+ m_fsm->assert_not_active();
+
+ m_epoch_start = epoch_queued;
+ m_needs_sleep = true;
+ m_is_deep = state_test(PG_STATE_DEEP_SCRUB);
+ update_op_mode_text();
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const
+{
+ unsigned int qu_priority = m_flags.priority;
+
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ qu_priority =
+ std::max(qu_priority, (unsigned int)m_pg->get_cct()->_conf->osd_client_op_priority);
+ }
+ return qu_priority;
+}
+
+unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const
+{
+ if (with_priority == Scrub::scrub_prio_t::high_priority) {
+ suggested_priority = std::max(suggested_priority,
+ (unsigned int)m_pg->cct->_conf->osd_client_op_priority);
+ }
+ return suggested_priority;
+}
+
+// ///////////////////////////////////////////////////////////////////// //
+// scrub-op registration handling
+
+bool PgScrubber::is_scrub_registered() const
+{
+ return !m_scrub_reg_stamp.is_zero();
+}
+
+void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
+{
+ if (!is_primary()) {
+ // normal. No warning is required.
+ return;
+ }
+
+ dout(10) << __func__ << " planned: must? " << request_flags.must_scrub << " need-auto? "
+ << request_flags.need_auto << " stamp: " << m_pg->info.history.last_scrub_stamp
+ << dendl;
+
+ ceph_assert(!is_scrub_registered());
+
+ utime_t reg_stamp;
+ bool must = false;
+
+ if (request_flags.must_scrub || request_flags.need_auto) {
+ // Set the smallest time that isn't utime_t()
+ reg_stamp = PgScrubber::scrub_must_stamp();
+ must = true;
+ } else if (m_pg->info.stats.stats_invalid &&
+ m_pg->cct->_conf->osd_scrub_invalid_stats) {
+ reg_stamp = ceph_clock_now();
+ must = true;
+ } else {
+ reg_stamp = m_pg->info.history.last_scrub_stamp;
+ }
+
+ dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must
+ << " required:" << m_flags.required << " flags: " << request_flags
+ << " stamp: " << reg_stamp << dendl;
+
+ const double scrub_min_interval =
+ m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MIN_INTERVAL, 0.0);
+ const double scrub_max_interval =
+ m_pg->pool.info.opts.value_or(pool_opts_t::SCRUB_MAX_INTERVAL, 0.0);
+
+ // note the sched_time, so we can locate this scrub, and remove it later
+ m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval,
+ scrub_max_interval, must);
+ dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time "
+ << m_scrub_reg_stamp << ", must = " << (int)must << dendl;
+}
+
+void PgScrubber::unreg_next_scrub()
+{
+ if (is_scrub_registered()) {
+ dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
+ m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
+ m_scrub_reg_stamp = utime_t{};
+ }
+}
+
+void PgScrubber::scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ")
+ << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ")
+ << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered()
+ << dendl;
+
+ unreg_next_scrub();
+
+ req_flags.must_scrub = true;
+ req_flags.must_deep_scrub =
+ (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair);
+ req_flags.must_repair = (scrub_type == scrub_type_t::do_repair);
+ // User might intervene, so clear this
+ req_flags.need_auto = false;
+ req_flags.req_scrub = true;
+
+ dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl;
+
+ reg_next_scrub(req_flags);
+}
+
+void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags)
+{
+ dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
+ << is_scrub_registered() << dendl;
+
+ unreg_next_scrub();
+ req_flags.need_auto = true;
+ reg_next_scrub(req_flags);
+}
+
+bool PgScrubber::reserve_local()
+{
+ // try to create the reservation object (which translates into asking the
+ // OSD for the local scrub resource). If that fails, undo it immediately.
+
+ m_local_osd_resource.emplace(m_osds);
+ if (m_local_osd_resource->is_reserved()) {
+ dout(15) << __func__ << ": local resources reserved" << dendl;
+ return true;
+ }
+
+ dout(10) << __func__ << ": failed to reserve local scrub resources" << dendl;
+ m_local_osd_resource.reset();
+ return false;
+}
+
+// ----------------------------------------------------------------------------
+
+bool PgScrubber::has_pg_marked_new_updates() const
+{
+ auto last_applied = m_pg->recovery_state.get_last_update_applied();
+ dout(10) << __func__ << " recovery last: " << last_applied
+ << " vs. scrub's: " << m_subset_last_update << dendl;
+
+ return last_applied >= m_subset_last_update;
+}
+
+void PgScrubber::set_subset_last_update(eversion_t e)
+{
+ m_subset_last_update = e;
+ dout(15) << __func__ << " last-update: " << e << dendl;
+}
+
+void PgScrubber::on_applied_when_primary(const eversion_t& applied_version)
+{
+ // we are only interested in updates if we are the Primary, and in state
+ // WaitLastUpdate
+ if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) {
+ m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops());
+ dout(15) << __func__ << " update: " << applied_version
+ << " vs. required: " << m_subset_last_update << dendl;
+ }
+}
+
+/*
+ * The selected range is set directly into 'm_start' and 'm_end'
+ * setting:
+ * - m_subset_last_update
+ * - m_max_end
+ * - end
+ * - start
+ */
+bool PgScrubber::select_range()
+{
+ m_primary_scrubmap = ScrubMap{};
+ m_received_maps.clear();
+
+ /* get the start and end of our scrub chunk
+ *
+ * Our scrub chunk has an important restriction we're going to need to
+ * respect. We can't let head be start or end.
+ * Using a half-open interval means that if end == head,
+ * we'd scrub/lock head and the clone right next to head in different
+ * chunks which would allow us to miss clones created between
+ * scrubbing that chunk and scrubbing the chunk including head.
+ * This isn't true for any of the other clones since clones can
+ * only be created "just to the left of" head. There is one exception
+ * to this: promotion of clones which always happens to the left of the
+ * left-most clone, but promote_object checks the scrubber in that
+ * case, so it should be ok. Also, it's ok to "miss" clones at the
+ * left end of the range if we are a tier because they may legitimately
+ * not exist (see _scrub).
+ */
+ int min_idx = std::max<int64_t>(
+ 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor());
+
+ int max_idx = std::max<int64_t>(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max /
+ preemption_data.chunk_divisor());
+
+ dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx
+ << " Div: " << preemption_data.chunk_divisor() << dendl;
+
+ hobject_t start = m_start;
+ hobject_t candidate_end;
+ std::vector<hobject_t> objects;
+ int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects,
+ &candidate_end);
+ ceph_assert(ret >= 0);
+
+ if (!objects.empty()) {
+
+ hobject_t back = objects.back();
+ while (candidate_end.is_head() && candidate_end == back.get_head()) {
+ candidate_end = back;
+ objects.pop_back();
+ if (objects.empty()) {
+ ceph_assert(0 ==
+ "Somehow we got more than 2 objects which"
+ "have the same head but are not clones");
+ }
+ back = objects.back();
+ }
+
+ if (candidate_end.is_head()) {
+ ceph_assert(candidate_end != back.get_head());
+ candidate_end = candidate_end.get_object_boundary();
+ }
+
+ } else {
+ ceph_assert(candidate_end.is_max());
+ }
+
+ // is that range free for us? if not - we will be rescheduled later by whoever
+ // triggered us this time
+
+ if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) {
+ // we'll be requeued by whatever made us unavailable for scrub
+ dout(10) << __func__ << ": scrub blocked somewhere in range "
+ << "[" << m_start << ", " << candidate_end << ")" << dendl;
+ return false;
+ }
+
+ m_end = candidate_end;
+ if (m_end > m_max_end)
+ m_max_end = m_end;
+
+ dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// "
+ << m_max_end << dendl;
+ return true;
+}
+
+void PgScrubber::select_range_n_notify()
+{
+ if (select_range()) {
+ // the next chunk to handle is not blocked
+ dout(20) << __func__ << ": selection OK" << dendl;
+ m_osds->queue_scrub_chunk_free(m_pg, Scrub::scrub_prio_t::low_priority);
+
+ } else {
+ // we will wait for the objects range to become available for scrubbing
+ dout(10) << __func__ << ": selected chunk is busy" << dendl;
+ m_osds->queue_scrub_chunk_busy(m_pg, Scrub::scrub_prio_t::low_priority);
+ }
+}
+
+bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid)
+{
+ if (soid < m_start || soid >= m_end) {
+ return false;
+ }
+
+ dout(20) << __func__ << " " << soid << " can preempt? "
+ << preemption_data.is_preemptable() << " already preempted? "
+ << preemption_data.was_preempted() << dendl;
+
+ if (preemption_data.was_preempted()) {
+ // let the write proceed: otherwise, requests arriving while 'already
+ // preempted' is set but 'preemptable' is not would be blocked, and would
+ // not be requeued in time.
+ return false;
+ }
+
+ if (preemption_data.is_preemptable()) {
+
+ dout(10) << __func__ << " " << soid << " preempted" << dendl;
+
+ // signal the preemption
+ preemption_data.do_preempt();
+ m_end = m_start; // free the range we were scrubbing
+
+ return false;
+ }
+ return true;
+}
+
+bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end)
+{
+ // does [start, end] intersect [scrubber.start, scrubber.m_max_end)
+ return (start < m_max_end && end >= m_start);
+}
+
+/**
+ * If we are required to sleep:
+ * arrange a callback some time later, and
+ * make sure we can identify a stale callback.
+ * Otherwise: perform a requeue (i.e. reschedule through the OSD queue)
+ * anyway.
+ */
+void PgScrubber::add_delayed_scheduling()
+{
+ m_end = m_start; // not blocking any range now
+
+ milliseconds sleep_time{0ms};
+ if (m_needs_sleep) {
+ double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required);
+ sleep_time = milliseconds{long(scrub_sleep)};
+ }
+ dout(15) << __func__ << " sleep: " << sleep_time.count() << "ms. needed? "
+ << m_needs_sleep << dendl;
+
+ if (sleep_time.count()) {
+ // schedule a transition for some 'sleep_time' ms in the future
+
+ m_needs_sleep = false;
+ m_sleep_started_at = ceph_clock_now();
+
+ // the following log line is used by osd-scrub-test.sh
+ dout(20) << __func__ << " scrub state is PendingTimer, sleeping" << dendl;
+
+ // the 'delayer' for crimson is different. Will be factored out.
+
+ spg_t pgid = m_pg->get_pgid();
+ auto callbk = new LambdaContext([osds = m_osds, pgid,
+ scrbr = this]([[maybe_unused]] int r) mutable {
+ PGRef pg = osds->osd->lookup_lock_pg(pgid);
+ if (!pg) {
+ lgeneric_subdout(g_ceph_context, osd, 10)
+ << "scrub_requeue_callback: Could not find "
+ << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl;
+ return;
+ }
+ scrbr->m_needs_sleep = true;
+ lgeneric_dout(scrbr->get_pg_cct(), 7)
+ << "scrub_requeue_callback: slept for "
+ << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl;
+
+ scrbr->m_sleep_started_at = utime_t{};
+ osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority);
+ pg->unlock();
+ });
+
+ std::lock_guard l(m_osds->sleep_lock);
+ m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk);
+
+ } else {
+ // just a requeue
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority);
+ }
+}
+
+eversion_t PgScrubber::search_log_for_updates() const
+{
+ auto& projected = m_pg->projected_log.log;
+ auto pi = find_if(
+ projected.crbegin(), projected.crend(),
+ [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; });
+
+ if (pi != projected.crend())
+ return pi->version;
+
+ // there was no relevant update entry in the log
+
+ auto& log = m_pg->recovery_state.get_pg_log().get_log().log;
+ auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool {
+ return e.soid >= m_start && e.soid < m_end;
+ });
+
+ if (p == log.crend())
+ return eversion_t{};
+ else
+ return p->version;
+}
+
+void PgScrubber::get_replicas_maps(bool replica_can_preempt)
+{
+ dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/"
+ << m_interval_start
+ << " pg same_interval_since: " << m_pg->info.history.same_interval_since
+ << dendl;
+
+ m_primary_scrubmap_pos.reset();
+
+ // ask replicas to scan and send maps
+ for (const auto& i : m_pg->get_actingset()) {
+
+ if (i == m_pg_whoami)
+ continue;
+
+ m_maps_status.mark_replica_map_request(i);
+ _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep,
+ replica_can_preempt);
+ }
+
+ dout(10) << __func__ << " awaiting" << m_maps_status << dendl;
+}
+
+bool PgScrubber::was_epoch_changed() const
+{
+ // for crimson we have m_pg->get_info().history.same_interval_since
+ dout(10) << __func__ << " epoch_start: " << m_interval_start
+ << " from pg: " << m_pg->get_history().same_interval_since << dendl;
+
+ return m_interval_start < m_pg->get_history().same_interval_since;
+}
+
+void PgScrubber::mark_local_map_ready()
+{
+ m_maps_status.mark_local_map_ready();
+}
+
+bool PgScrubber::are_all_maps_available() const
+{
+ return m_maps_status.are_all_maps_available();
+}
+
+std::string PgScrubber::dump_awaited_maps() const
+{
+ return m_maps_status.dump();
+}
+
+void PgScrubber::update_op_mode_text()
+{
+ auto visible_repair = state_test(PG_STATE_REPAIR);
+ m_mode_desc = (visible_repair ? "repair"sv : (m_is_deep ? "deep-scrub"sv : "scrub"sv));
+
+ dout(10) << __func__ << ": repair: visible: " << (visible_repair ? "true" : "false")
+ << ", internal: " << (m_is_repair ? "true" : "false")
+ << ". Displayed: " << m_mode_desc << dendl;
+}
+
+void PgScrubber::_request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption)
+{
+ ceph_assert(replica != m_pg_whoami);
+ dout(10) << __func__ << " scrubmap from osd." << replica
+ << (deep ? " deep" : " shallow") << dendl;
+
+ auto repscrubop =
+ new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version,
+ get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep,
+ allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub());
+
+ // default priority. We want the replica-scrub processed prior to any recovery
+ // or client io messages (we are holding a lock!)
+ m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch());
+}
+
+void PgScrubber::cleanup_store(ObjectStore::Transaction* t)
+{
+ if (!m_store)
+ return;
+
+ struct OnComplete : Context {
+ std::unique_ptr<Scrub::Store> store;
+ explicit OnComplete(std::unique_ptr<Scrub::Store>&& store) : store(std::move(store))
+ {}
+ void finish(int) override {}
+ };
+ m_store->cleanup(t);
+ t->register_on_complete(new OnComplete(std::move(m_store)));
+ ceph_assert(!m_store);
+}
+
+void PgScrubber::on_init()
+{
+ // going upwards from 'inactive'
+ ceph_assert(!is_scrub_active());
+
+ preemption_data.reset();
+ m_pg->publish_stats_to_osd();
+ m_interval_start = m_pg->get_history().same_interval_since;
+
+ dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl;
+
+ // create a new store
+ {
+ ObjectStore::Transaction t;
+ cleanup_store(&t);
+ m_store.reset(
+ Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll));
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+
+ m_start = m_pg->info.pgid.pgid.get_hobj_start();
+ m_active = true;
+}
+
+void PgScrubber::on_replica_init()
+{
+ m_active = true;
+}
+
+void PgScrubber::_scan_snaps(ScrubMap& smap)
+{
+ hobject_t head;
+ SnapSet snapset;
+
+ // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings
+ // in this function
+ dout(15) << "_scan_snaps starts" << dendl;
+
+ for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) {
+
+ const hobject_t& hoid = i->first;
+ ScrubMap::object& o = i->second;
+
+ dout(20) << __func__ << " " << hoid << dendl;
+
+ ceph_assert(!hoid.is_snapdir());
+ if (hoid.is_head()) {
+ // parse the SnapSet
+ bufferlist bl;
+ if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[SS_ATTR]);
+ auto p = bl.cbegin();
+ try {
+ decode(snapset, p);
+ } catch (...) {
+ continue;
+ }
+ head = hoid.get_head();
+ continue;
+ }
+
+ if (hoid.snap < CEPH_MAXSNAP) {
+ // check and if necessary fix snap_mapper
+ if (hoid.get_head() != head) {
+ derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl;
+ continue;
+ }
+ set<snapid_t> obj_snaps;
+ auto p = snapset.clone_snaps.find(hoid.snap);
+ if (p == snapset.clone_snaps.end()) {
+ derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl;
+ continue;
+ }
+ obj_snaps.insert(p->second.begin(), p->second.end());
+ set<snapid_t> cur_snaps;
+ int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps);
+ if (r != 0 && r != -ENOENT) {
+ derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ if (r == -ENOENT || cur_snaps != obj_snaps) {
+ ObjectStore::Transaction t;
+ OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t));
+ if (r == 0) {
+ r = m_pg->snap_mapper.remove_oid(hoid, &_t);
+ if (r != 0) {
+ derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
+ ceph_abort();
+ }
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps
+ << ", oi: " << obj_snaps << "...repaired";
+ } else {
+ m_pg->osd->clog->error()
+ << "osd." << m_pg->osd->whoami << " found snap mapper error on pg "
+ << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper"
+ << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r
+ << "...repaired";
+ }
+ m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t);
+
+ // wait for repair to apply to avoid confusing other bits of the system.
+ {
+ dout(15) << __func__ << " wait on repair!" << dendl;
+
+ ceph::condition_variable my_cond;
+ ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock");
+ int e = 0;
+ bool done;
+
+ t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e));
+
+ e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t));
+ if (e != 0) {
+ derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl;
+ } else {
+ std::unique_lock l{my_lock};
+ my_cond.wait(l, [&done] { return done; });
+ }
+ }
+ }
+ }
+ }
+}
+
+int PgScrubber::build_primary_map_chunk()
+{
+ epoch_t map_building_since = m_pg->get_osdmap_epoch();
+ dout(20) << __func__ << ": initiated at epoch " << map_building_since << dendl;
+
+ auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start,
+ m_end, m_is_deep);
+
+ if (ret == -EINPROGRESS) {
+ // reschedule another round of asking the backend to collect the scrub data
+ m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority);
+ }
+ return ret;
+}
+
+int PgScrubber::build_replica_map_chunk()
+{
+ dout(10) << __func__ << " interval start: " << m_interval_start
+ << " current token: " << m_current_token << " epoch: " << m_epoch_start
+ << " deep: " << m_is_deep << dendl;
+
+ auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
+ m_is_deep);
+
+ switch (ret) {
+
+ case -EINPROGRESS:
+ // must wait for the backend to finish. No external event source.
+ // (note: previous version used low priority here. Now switched to using the
+ // priority of the original message)
+ m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
+ m_flags.priority, m_current_token);
+ break;
+
+ case 0: {
+ // finished!
+ m_cleaned_meta_map.clear_from(m_start);
+ m_cleaned_meta_map.insert(replica_scrubmap);
+ auto for_meta_scrub = clean_meta_map();
+ _scan_snaps(for_meta_scrub);
+
+ // the local map has been created. Send it to the primary.
+ // Note: once the message reaches the Primary, it may ask us for another
+ // chunk - and we better be done with the current scrub. Thus - the preparation of
+ // the reply message is separate, and we clear the scrub state before actually
+ // sending it.
+
+ auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
+ replica_handling_done();
+ dout(15) << __func__ << " chunk map sent " << dendl;
+ send_replica_map(reply);
+ } break;
+
+ default:
+ // negative retval: build_scrub_map_chunk() signalled an error
+ // Pre-Pacific code ignored this option, treating it as a success.
+ // \todo Add an error flag in the returning message.
+ dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret
+ << dendl;
+ replica_handling_done();
+ // only in debug mode for now:
+ assert(false && "backend error");
+ break;
+ };
+
+ return ret;
+}
+
+int PgScrubber::build_scrub_map_chunk(
+ ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep)
+{
+ dout(10) << __func__ << " [" << start << "," << end << ") "
+ << " pos " << pos << " Deep: " << deep << dendl;
+
+ // start
+ while (pos.empty()) {
+
+ pos.deep = deep;
+ map.valid_through = m_pg->info.last_update;
+
+ // objects
+ vector<ghobject_t> rollback_obs;
+ pos.ret =
+ m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs);
+ dout(10) << __func__ << " while pos empty " << pos.ret << dendl;
+ if (pos.ret < 0) {
+ dout(5) << "objects_list_range error: " << pos.ret << dendl;
+ return pos.ret;
+ }
+ dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl;
+ if (pos.ls.empty()) {
+ break;
+ }
+ m_pg->_scan_rollback_obs(rollback_obs);
+ pos.pos = 0;
+ return -EINPROGRESS;
+ }
+
+ // scan objects
+ while (!pos.done()) {
+
+ int r = m_pg->get_pgbackend()->be_scan_list(map, pos);
+ if (r == -EINPROGRESS) {
+ dout(20) << __func__ << " in progress" << dendl;
+ return r;
+ }
+ }
+
+ // finish
+ dout(20) << __func__ << " finishing" << dendl;
+ ceph_assert(pos.done());
+ m_pg->_repair_oinfo_oid(map);
+
+ dout(20) << __func__ << " done, got " << map.objects.size() << " items" << dendl;
+ return 0;
+}
+
+/*
+ * Process:
+ *  Build a map of objects that are ready for snapshot validation.
+ *  Whatever is left in m_cleaned_meta_map afterwards is the leftover set of partial
+ *  items that must be completed (by the following chunks) before it can be processed.
+ *
+ *  Snapshots in the map precede their head object, which is why we scan backwards.
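+ *
+ *  An illustrative (purely hypothetical) example - objects 'A' and 'B' are not from
+ *  the code: if the map holds [A:snap1, A:head, B:snap1, B:snap2] and B:head was not
+ *  reached by this chunk, the returned map receives A:snap1 and A:head, while B:snap1
+ *  and B:snap2 stay in m_cleaned_meta_map until B:head is scanned.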
+ */
+ScrubMap PgScrubber::clean_meta_map()
+{
+ ScrubMap for_meta_scrub;
+
+ if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) {
+ m_cleaned_meta_map.swap(for_meta_scrub);
+ } else {
+ auto iter = m_cleaned_meta_map.objects.end();
+ --iter; // not empty, see 'if' clause
+ auto begin = m_cleaned_meta_map.objects.begin();
+ if (iter->first.has_snapset()) {
+ ++iter;
+ } else {
+ while (iter != begin) {
+ auto next = iter--;
+ if (next->first.get_head() != iter->first.get_head()) {
+ ++iter;
+ break;
+ }
+ }
+ }
+ for_meta_scrub.objects.insert(begin, iter);
+ m_cleaned_meta_map.objects.erase(begin, iter);
+ }
+
+ return for_meta_scrub;
+}
+
+void PgScrubber::run_callbacks()
+{
+ std::list<Context*> to_run;
+ to_run.swap(m_callbacks);
+
+ for (auto& tr : to_run) {
+ tr->complete(0);
+ }
+}
+
+void PgScrubber::maps_compare_n_cleanup()
+{
+ scrub_compare_maps();
+ m_start = m_end;
+ run_callbacks();
+ requeue_waiting();
+ m_osds->queue_scrub_maps_compared(m_pg, Scrub::scrub_prio_t::low_priority);
+}
+
+Scrub::preemption_t& PgScrubber::get_preemptor()
+{
+ return preemption_data;
+}
+
+/*
+ * Process note: called for the arriving "give me your map, replica!" request. Unlike
+ * the original implementation, we do not requeue the Op waiting for
+ * updates. Instead - we trigger the FSM.
+ */
+void PgScrubber::replica_scrub_op(OpRequestRef op)
+{
+ op->mark_started();
+ auto msg = op->get_req<MOSDRepScrub>();
+ dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch
+ << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl;
+
+  // Are we still processing a previous scrub-map request, without having noticed that
+  // the interval changed? We will not detect that here, but rather at the reservation
+  // stage.
+
+ if (msg->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch
+ << " < " << m_pg->info.history.same_interval_since << dendl;
+
+ // is there a general sync issue? are we holding a stale reservation?
+ // not checking now - assuming we will actively react to interval change.
+
+ return;
+ }
+
+ if (is_queued_or_active()) {
+    // this is a bug!
+    // Somehow, we have received a new scrub request from our Primary before
+    // having finished with the previous one. Did we go through an interval
+    // change without resetting the FSM? Possible responses:
+ // - crashing (the original assert_not_active() implemented that one), or
+ // - trying to recover:
+ // - (logging enough information to debug this scenario)
+ // - reset the FSM.
+ m_osds->clog->warn()
+ << __func__
+ << ": error: a second scrub-op received while handling the previous one";
+
+ scrub_clear_state();
+ m_osds->clog->warn() << __func__
+ << ": after a reset. Now handling the new OP";
+ }
+ // make sure the FSM is at NotActive
+ m_fsm->assert_not_active();
+
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos = ScrubMapBuilder{};
+
+ m_replica_min_epoch = msg->min_epoch;
+ m_start = msg->start;
+ m_end = msg->end;
+ m_max_end = msg->end;
+ m_is_deep = msg->deep;
+ m_interval_start = m_pg->info.history.same_interval_since;
+ m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority
+ : Scrub::scrub_prio_t::low_priority;
+ m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority();
+
+ preemption_data.reset();
+ preemption_data.force_preemptability(msg->allow_preemption);
+
+ replica_scrubmap_pos.reset();
+
+ set_queued_or_active();
+ m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority,
+ m_flags.priority, m_current_token);
+}
+
+void PgScrubber::set_op_parameters(requested_scrub_t& request)
+{
+ dout(10) << __func__ << " input: " << request << dendl;
+
+ // write down the epoch of starting a new scrub. Will be used
+ // to discard stale messages from previous aborted scrubs.
+ m_epoch_start = m_pg->get_osdmap_epoch();
+
+ m_flags.check_repair = request.check_repair;
+ m_flags.auto_repair = request.auto_repair || request.need_auto;
+ m_flags.required = request.req_scrub || request.must_scrub;
+
+ m_flags.priority = (request.must_scrub || request.need_auto)
+ ? get_pg_cct()->_conf->osd_requested_scrub_priority
+ : m_pg->get_scrub_priority();
+
+ state_set(PG_STATE_SCRUBBING);
+
+ // will we be deep-scrubbing?
+ if (request.must_deep_scrub || request.need_auto || request.time_for_deep) {
+ state_set(PG_STATE_DEEP_SCRUB);
+ }
+
+  // m_is_repair is set for either 'must_repair' or 'repair-on-the-go' (i.e.
+  // deep-scrub with the auto_repair configuration flag set). The m_is_repair value
+  // determines the scrubber's behavior.
+ // PG_STATE_REPAIR, on the other hand, is only used for status reports (inc. the
+ // PG status as appearing in the logs).
+ m_is_repair = request.must_repair || m_flags.auto_repair;
+ if (request.must_repair) {
+ state_set(PG_STATE_REPAIR);
+ // not calling update_op_mode_text() yet, as m_is_deep not set yet
+ }
+
+  // the publishing here seems to be required for test synchronization
+ m_pg->publish_stats_to_osd();
+ m_flags.deep_scrub_on_error = request.deep_scrub_on_error;
+}
+
+void PgScrubber::scrub_compare_maps()
+{
+ dout(10) << __func__ << " has maps, analyzing" << dendl;
+
+ // construct authoritative scrub map for type-specific scrubbing
+ m_cleaned_meta_map.insert(m_primary_scrubmap);
+ map<hobject_t, pair<std::optional<uint32_t>, std::optional<uint32_t>>> missing_digest;
+
+ map<pg_shard_t, ScrubMap*> maps;
+ maps[m_pg_whoami] = &m_primary_scrubmap;
+
+ for (const auto& i : m_pg->get_actingset()) {
+ if (i == m_pg_whoami)
+ continue;
+ dout(2) << __func__ << " replica " << i << " has "
+ << m_received_maps[i].objects.size() << " items" << dendl;
+ maps[i] = &m_received_maps[i];
+ }
+
+ set<hobject_t> master_set;
+
+ // Construct master set
+ for (const auto& map : maps) {
+ for (const auto& i : map.second->objects) {
+ master_set.insert(i.first);
+ }
+ }
+
+ stringstream ss;
+ m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->warn(ss);
+ }
+
+ if (m_pg->recovery_state.get_actingset().size() > 1) {
+
+ dout(10) << __func__ << " comparing replica scrub maps" << dendl;
+
+ // Map from object with errors to good peer
+ map<hobject_t, list<pg_shard_t>> authoritative;
+
+ dout(2) << __func__ << ": primary (" << m_pg->get_primary() << ") has "
+ << m_primary_scrubmap.objects.size() << " items" << dendl;
+
+ ss.str("");
+ ss.clear();
+
+ m_pg->get_pgbackend()->be_compare_scrubmaps(
+ maps, master_set, m_is_repair, m_missing, m_inconsistent,
+ authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(),
+ m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss);
+
+ if (!ss.str().empty()) {
+ m_osds->clog->error(ss);
+ }
+
+ for (auto& i : authoritative) {
+ list<pair<ScrubMap::object, pg_shard_t>> good_peers;
+ for (list<pg_shard_t>::const_iterator j = i.second.begin(); j != i.second.end();
+ ++j) {
+ good_peers.emplace_back(maps[*j]->objects[i.first], *j);
+ }
+ m_authoritative.emplace(i.first, good_peers);
+ }
+
+ for (auto i = authoritative.begin(); i != authoritative.end(); ++i) {
+ m_cleaned_meta_map.objects.erase(i->first);
+ m_cleaned_meta_map.objects.insert(
+ *(maps[i->second.back()]->objects.find(i->first)));
+ }
+ }
+
+ auto for_meta_scrub = clean_meta_map();
+
+ // ok, do the pg-type specific scrubbing
+
+ // (Validates consistency of the object info and snap sets)
+ scrub_snapshot_metadata(for_meta_scrub, missing_digest);
+
+  // _scan_snaps() is called here for the Primary; when called for a node that isn't
+  // the primary, it can make use of an authoritative map.
+ _scan_snaps(for_meta_scrub);
+
+ if (!m_store->empty()) {
+
+ if (m_is_repair) {
+ dout(10) << __func__ << ": discarding scrub results" << dendl;
+ m_store->flush(nullptr);
+ } else {
+ dout(10) << __func__ << ": updating scrub object" << dendl;
+ ObjectStore::Transaction t;
+ m_store->flush(&t);
+ m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ }
+ }
+}
+
+ScrubMachineListener::MsgAndEpoch PgScrubber::prep_replica_map_msg(
+ PreemptionNoted was_preempted)
+{
+ dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl;
+
+ auto reply =
+ make_message<MOSDRepScrubMap>(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
+ m_replica_min_epoch, m_pg_whoami);
+
+ reply->preempted = (was_preempted == PreemptionNoted::preempted);
+ ::encode(replica_scrubmap, reply->get_data());
+
+ return ScrubMachineListener::MsgAndEpoch{reply, m_replica_min_epoch};
+}
+
+void PgScrubber::send_replica_map(const MsgAndEpoch& preprepared)
+{
+ m_pg->send_cluster_message(m_pg->get_primary().osd, preprepared.m_msg,
+ preprepared.m_epoch, false);
+}
+
+void PgScrubber::send_preempted_replica()
+{
+ auto reply =
+ make_message<MOSDRepScrubMap>(spg_t{m_pg->info.pgid.pgid, m_pg->get_primary().shard},
+ m_replica_min_epoch, m_pg_whoami);
+
+ reply->preempted = true;
+ ::encode(replica_scrubmap, reply->get_data()); // must not skip this
+ m_pg->send_cluster_message(m_pg->get_primary().osd, reply, m_replica_min_epoch, false);
+}
+
+/*
+ * - if the replica lets us know it was interrupted, we mark the chunk as interrupted.
+ * The state-machine will react to that when all replica maps are received.
+ * - when all maps are received, we signal the FSM with the GotReplicas event (see
+ * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the
+ * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to
+ * handle.
+ */
+void PgScrubber::map_from_replica(OpRequestRef op)
+{
+ auto m = op->get_req<MOSDRepScrubMap>();
+ dout(15) << __func__ << " " << *m << dendl;
+
+ if (m->map_epoch < m_pg->info.history.same_interval_since) {
+ dout(10) << __func__ << " discarding old from " << m->map_epoch << " < "
+ << m_pg->info.history.same_interval_since << dendl;
+ return;
+ }
+
+ auto p = const_cast<bufferlist&>(m->get_data()).cbegin();
+
+ m_received_maps[m->from].decode(p, m_pg->info.pgid.pool());
+ dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl;
+
+ auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from);
+ if (!is_ok) {
+    // Previously, an unexpected map triggered an assert. Now that scrubs can be
+    // aborted at any time, the chances of this happening have increased, and aborting
+    // is no longer justified.
+ dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl;
+ return;
+ }
+
+ if (m->preempted) {
+ dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+ preemption_data.do_preempt();
+ }
+
+ if (m_maps_status.are_all_maps_available()) {
+ dout(15) << __func__ << " all repl-maps available" << dendl;
+ m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+ auto request_ep = op->get_req<MOSDScrubReserve>()->get_map_epoch();
+
+ /*
+ * if we are currently holding a reservation, then:
+ * either (1) we, the scrubber, did not yet notice an interval change. The remembered
+ * reservation epoch is from before our interval, and we can silently discard the
+ * reservation (no message is required).
+ * or:
+ * (2) the interval hasn't changed, but the same Primary that (we think) holds the
+ * lock just sent us a new request. Note that we know it's the same Primary, as
+ * otherwise the interval would have changed.
+ * Ostensibly we can discard & redo the reservation. But then we
+ * will be temporarily releasing the OSD resource - and might not be able to grab it
+ * again. Thus, we simply treat this as a successful new request
+ * (but mark the fact that if there is a previous request from the primary to
+ * scrub a specific chunk - that request is now defunct).
+ */
+
+ if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
+ // we are holding a stale reservation from a past epoch
+ m_remote_osd_resource.reset();
+ dout(10) << __func__ << " cleared existing stale reservation" << dendl;
+ }
+
+ if (request_ep < m_pg->get_same_interval_since()) {
+ // will not ack stale requests
+ return;
+ }
+
+ bool granted{false};
+ if (m_remote_osd_resource.has_value()) {
+
+ dout(10) << __func__ << " already reserved." << dendl;
+
+ /*
+ * it might well be that we did not yet finish handling the latest scrub-op from
+ * our primary. This happens, for example, if 'noscrub' was set via a command, then
+ * reset. The primary in this scenario will remain in the same interval, but we do need
+ * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+ * from the primary will see us in active state, crashing the OSD).
+ */
+ advance_token();
+ granted = true;
+
+ } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
+ !m_osds->is_recovery_active()) {
+ m_remote_osd_resource.emplace(this, m_pg, m_osds, request_ep);
+ // OSD resources allocated?
+ granted = m_remote_osd_resource->is_reserved();
+ if (!granted) {
+ // just forget it
+ m_remote_osd_resource.reset();
+ dout(20) << __func__ << ": failed to reserve remotely" << dendl;
+ }
+ }
+
+ dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
+
+ Message* reply = new MOSDScrubReserve(
+ spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep,
+ granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami);
+
+ m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
+}
+
+void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+ m_reservations->handle_reserve_grant(op, from);
+ } else {
+ dout(20) << __func__ << ": late/unsolicited reservation grant from osd "
+ << from << " (" << op << ")" << dendl;
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ if (m_reservations.has_value()) {
+    // there is an active reservation process; otherwise, no action is required.
+ m_reservations->handle_reserve_reject(op, from);
+ }
+}
+
+void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
+{
+ dout(10) << __func__ << " " << *op->get_req() << dendl;
+ op->mark_started();
+
+ /*
+ * this specific scrub session has terminated. All incoming events carrying the old
+ * tag will be discarded.
+ */
+ advance_token();
+ m_remote_osd_resource.reset();
+}
+
+void PgScrubber::discard_replica_reservations()
+{
+ dout(10) << __func__ << dendl;
+ if (m_reservations.has_value()) {
+ m_reservations->discard_all();
+ }
+}
+
+void PgScrubber::clear_scrub_reservations()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset(); // the remote reservations
+ m_local_osd_resource.reset(); // the local reservation
+ m_remote_osd_resource.reset(); // we as replica reserved for a Primary
+}
+
+void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text)
+{
+ ceph_assert(m_pg->recovery_state.get_backfill_targets().empty());
+
+ std::vector<pair<int, Message*>> messages;
+ messages.reserve(m_pg->get_actingset().size());
+
+ epoch_t epch = get_osdmap_epoch();
+
+ for (auto& p : m_pg->get_actingset()) {
+
+ if (p == m_pg_whoami)
+ continue;
+
+ dout(10) << "scrub requesting " << op_text << " from osd." << p << " Epoch: " << epch
+ << dendl;
+ Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode,
+ m_pg_whoami);
+ messages.push_back(std::make_pair(p.osd, m));
+ }
+
+ if (!messages.empty()) {
+ m_osds->send_message_osd_cluster(messages, epch);
+ }
+}
+
+void PgScrubber::unreserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.reset();
+}
+
+void PgScrubber::set_queued_or_active()
+{
+ m_queued_or_active = true;
+}
+
+void PgScrubber::clear_queued_or_active()
+{
+ m_queued_or_active = false;
+}
+
+bool PgScrubber::is_queued_or_active() const
+{
+ return m_queued_or_active;
+}
+
+[[nodiscard]] bool PgScrubber::scrub_process_inconsistent()
+{
+ dout(10) << __func__ << ": checking authoritative (mode="
+ << m_mode_desc << ", auth remaining #: " << m_authoritative.size()
+ << ")" << dendl;
+
+  // m_authoritative only stores objects which are missing or inconsistent.
+ if (!m_authoritative.empty()) {
+
+ stringstream ss;
+ ss << m_pg->info.pgid << " " << m_mode_desc << " " << m_missing.size() << " missing, "
+ << m_inconsistent.size() << " inconsistent objects";
+ dout(2) << ss.str() << dendl;
+ m_osds->clog->error(ss);
+
+ if (m_is_repair) {
+ state_clear(PG_STATE_CLEAN);
+ // we know we have a problem, so it's OK to set the user-visible flag
+ // even if we only reached here via auto-repair
+ state_set(PG_STATE_REPAIR);
+ update_op_mode_text();
+
+ for (const auto& [hobj, shrd_list] : m_authoritative) {
+
+ auto missing_entry = m_missing.find(hobj);
+
+ if (missing_entry != m_missing.end()) {
+ m_pg->repair_object(hobj, shrd_list, missing_entry->second);
+ m_fixed_count += missing_entry->second.size();
+ }
+
+ if (m_inconsistent.count(hobj)) {
+ m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]);
+ m_fixed_count += m_inconsistent[hobj].size();
+ }
+ }
+ }
+ }
+ return (!m_authoritative.empty() && m_is_repair);
+}
+
+/*
+ * note: only called for the Primary.
+ */
+void PgScrubber::scrub_finish()
+{
+ dout(10) << __func__ << " before flags: " << m_flags
+ << ". repair state: " << (state_test(PG_STATE_REPAIR) ? "repair" : "no-repair")
+ << ". deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl;
+
+ ceph_assert(m_pg->is_locked());
+ ceph_assert(is_queued_or_active());
+
+ m_pg->m_planned_scrub = requested_scrub_t{};
+
+  // if the repair request comes from auto-repair and a large number of errors were
+  // found, we cancel the auto-repair
+ if (m_is_repair && m_flags.auto_repair &&
+ m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+
+ dout(10) << __func__ << " undoing the repair" << dendl;
+ state_clear(PG_STATE_REPAIR); // not expected to be set, anyway
+ m_is_repair = false;
+ update_op_mode_text();
+ }
+
+ bool do_auto_scrub = false;
+
+ // if a regular scrub had errors within the limit, do a deep scrub to auto repair
+ if (m_flags.deep_scrub_on_error && !m_authoritative.empty() &&
+ m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) {
+ ceph_assert(!m_is_deep);
+ do_auto_scrub = true;
+ dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl;
+ }
+
+ m_flags.deep_scrub_on_error = false;
+
+ // type-specific finish (can tally more errors)
+ _scrub_finish();
+
+ bool has_error = scrub_process_inconsistent();
+
+ {
+ stringstream oss;
+ oss << m_pg->info.pgid.pgid << " " << m_mode_desc << " ";
+ int total_errors = m_shallow_errors + m_deep_errors;
+ if (total_errors)
+ oss << total_errors << " errors";
+ else
+ oss << "ok";
+ if (!m_is_deep && m_pg->info.stats.stats.sum.num_deep_scrub_errors)
+ oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors
+ << " remaining deep scrub error details lost)";
+ if (m_is_repair)
+ oss << ", " << m_fixed_count << " fixed";
+ if (total_errors)
+ m_osds->clog->error(oss);
+ else
+ m_osds->clog->debug(oss);
+ }
+
+ // Since we don't know which errors were fixed, we can only clear them
+ // when every one has been fixed.
+ if (m_is_repair) {
+ if (m_fixed_count == m_shallow_errors + m_deep_errors) {
+
+ ceph_assert(m_is_deep);
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ dout(20) << __func__ << " All may be fixed" << dendl;
+
+ } else if (has_error) {
+
+ // Deep scrub in order to get corrected error counts
+ m_pg->scrub_after_recovery = true;
+ m_pg->m_planned_scrub.req_scrub =
+ m_pg->m_planned_scrub.req_scrub || m_flags.required;
+
+ dout(20) << __func__ << " Current 'required': " << m_flags.required
+ << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl;
+
+ } else if (m_shallow_errors || m_deep_errors) {
+
+ // We have errors but nothing can be fixed, so there is no repair
+ // possible.
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors)
+ << " error(s) present with no repair possible" << dendl;
+ }
+ }
+
+ {
+ // finish up
+ ObjectStore::Transaction t;
+ m_pg->recovery_state.update_stats(
+ [this](auto& history, auto& stats) {
+ dout(10) << "m_pg->recovery_state.update_stats()" << dendl;
+ utime_t now = ceph_clock_now();
+ history.last_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_scrub_stamp = now;
+ if (m_is_deep) {
+ history.last_deep_scrub = m_pg->recovery_state.get_info().last_update;
+ history.last_deep_scrub_stamp = now;
+ }
+
+ if (m_is_deep) {
+ if ((m_shallow_errors == 0) && (m_deep_errors == 0))
+ history.last_clean_scrub_stamp = now;
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ stats.stats.sum.num_deep_scrub_errors = m_deep_errors;
+ stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects;
+ stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes;
+ stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys;
+ dout(25) << "scrub_finish shard " << m_pg_whoami
+ << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes
+ << " num_omap_keys = " << stats.stats.sum.num_omap_keys << dendl;
+ } else {
+ stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors;
+ // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
+ // because of deep-scrub errors
+ if (m_shallow_errors == 0)
+ history.last_clean_scrub_stamp = now;
+ }
+ stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors +
+ stats.stats.sum.num_deep_scrub_errors;
+ if (m_flags.check_repair) {
+ m_flags.check_repair = false;
+ if (m_pg->info.stats.stats.sum.num_scrub_errors) {
+ state_set(PG_STATE_FAILED_REPAIR);
+ dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors
+ << " error(s) still present after re-scrub" << dendl;
+ }
+ }
+ return true;
+ },
+ &t);
+ int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr);
+ ceph_assert(tr == 0);
+ }
+
+ if (has_error) {
+ m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared<PGPeeringEvent>(
+ get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery())));
+ } else {
+ m_is_repair = false;
+ state_clear(PG_STATE_REPAIR);
+ update_op_mode_text();
+ }
+
+ cleanup_on_finish();
+ if (do_auto_scrub) {
+ request_rescrubbing(m_pg->m_planned_scrub);
+ }
+
+ if (m_pg->is_active() && m_pg->is_primary()) {
+ m_pg->recovery_state.share_pg_info();
+ }
+
+ // we may have blocked the snap trimmer
+ m_pg->snap_trimmer_scrub_complete();
+}
+
+void PgScrubber::on_digest_updates()
+{
+ dout(10) << __func__ << " #pending: " << num_digest_updates_pending
+ << (m_end.is_max() ? " <last chunk>" : " <mid chunk>")
+ << (is_queued_or_active() ? "" : " ** not marked as scrubbing **")
+ << dendl;
+
+ if (num_digest_updates_pending > 0) {
+ // do nothing for now. We will be called again when new updates arrive
+ return;
+ }
+
+ // got all updates, and finished with this chunk. Any more?
+ if (m_end.is_max()) {
+ m_osds->queue_scrub_is_finished(m_pg);
+ } else {
+ // go get a new chunk (via "requeue")
+ preemption_data.reset();
+ m_osds->queue_scrub_next_chunk(m_pg, m_pg->is_scrub_blocking_ops());
+ }
+}
+
+
+/*
+ * note that the flags-set fetched from the PG (m_pg->m_planned_scrub)
+ * is cleared once scrubbing starts; some of the values dumped here are
+ * thus transitory.
+ */
+void PgScrubber::dump(ceph::Formatter* f) const
+{
+ f->open_object_section("scrubber");
+ f->dump_stream("epoch_start") << m_interval_start;
+ f->dump_bool("active", m_active);
+ if (m_active) {
+ f->dump_stream("start") << m_start;
+ f->dump_stream("end") << m_end;
+ f->dump_stream("m_max_end") << m_max_end;
+ f->dump_stream("subset_last_update") << m_subset_last_update;
+ f->dump_bool("deep", m_is_deep);
+ f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required));
+ f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub);
+ f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair);
+ f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto);
+ f->dump_bool("req_scrub", m_flags.required);
+ f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep);
+ f->dump_bool("auto_repair", m_flags.auto_repair);
+ f->dump_bool("check_repair", m_flags.check_repair);
+ f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error);
+ f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t
+ f->dump_unsigned("priority", m_flags.priority);
+ f->dump_int("shallow_errors", m_shallow_errors);
+ f->dump_int("deep_errors", m_deep_errors);
+ f->dump_int("fixed", m_fixed_count);
+ {
+ f->open_array_section("waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+ }
+ f->close_section();
+}
+
+
+void PgScrubber::handle_query_state(ceph::Formatter* f)
+{
+ dout(10) << __func__ << dendl;
+
+ f->open_object_section("scrub");
+ f->dump_stream("scrubber.epoch_start") << m_interval_start;
+ f->dump_bool("scrubber.active", m_active);
+ f->dump_stream("scrubber.start") << m_start;
+ f->dump_stream("scrubber.end") << m_end;
+ f->dump_stream("scrubber.m_max_end") << m_max_end;
+ f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update;
+ f->dump_bool("scrubber.deep", m_is_deep);
+ {
+ f->open_array_section("scrubber.waiting_on_whom");
+ for (const auto& p : m_maps_status.get_awaited()) {
+ f->dump_stream("shard") << p;
+ }
+ f->close_section();
+ }
+
+ f->dump_string("comment", "DEPRECATED - may be removed in the next release");
+
+ f->close_section();
+}
+
+PgScrubber::~PgScrubber() = default;
+
+PgScrubber::PgScrubber(PG* pg)
+ : m_pg{pg}
+ , m_pg_id{pg->pg_id}
+ , m_osds{m_pg->osd}
+ , m_pg_whoami{pg->pg_whoami}
+ , preemption_data{pg}
+{
+ m_fsm = std::make_unique<ScrubMachine>(m_pg, this);
+ m_fsm->initiate();
+}
+
+void PgScrubber::scrub_begin()
+{
+ stringstream ss;
+ ss << m_pg->info.pgid.pgid << " " << m_mode_desc << " starts";
+ dout(2) << ss.str() << dendl;
+ m_osds->clog->debug(ss);
+}
+
+void PgScrubber::reserve_replicas()
+{
+ dout(10) << __func__ << dendl;
+ m_reservations.emplace(m_pg, m_pg_whoami);
+}
+
+void PgScrubber::cleanup_on_finish()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+ m_pg->publish_stats_to_osd();
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+// uses process_event(), so must be invoked externally
+void PgScrubber::scrub_clear_state()
+{
+ dout(10) << __func__ << dendl;
+
+ clear_pgscrub_state();
+ m_fsm->process_event(FullReset{});
+}
+
+/*
+ * note: does not access the state-machine
+ */
+void PgScrubber::clear_pgscrub_state()
+{
+ dout(10) << __func__ << dendl;
+ ceph_assert(m_pg->is_locked());
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ state_clear(PG_STATE_REPAIR);
+
+ clear_scrub_reservations();
+ m_pg->publish_stats_to_osd();
+
+ requeue_waiting();
+
+ reset_internal_state();
+ m_flags = scrub_flags_t{};
+
+ // type-specific state clear
+ _scrub_clear_state();
+}
+
+void PgScrubber::replica_handling_done()
+{
+ dout(10) << __func__ << dendl;
+
+ state_clear(PG_STATE_SCRUBBING);
+ state_clear(PG_STATE_DEEP_SCRUB);
+
+ reset_internal_state();
+
+ m_pg->publish_stats_to_osd();
+}
+
+/*
+ * note: performs run_callbacks()
+ * note: reservations-related variables are not reset here
+ */
+void PgScrubber::reset_internal_state()
+{
+ dout(10) << __func__ << dendl;
+
+ preemption_data.reset();
+ m_maps_status.reset();
+ m_received_maps.clear();
+
+ m_start = hobject_t{};
+ m_end = hobject_t{};
+ m_max_end = hobject_t{};
+ m_subset_last_update = eversion_t{};
+ m_shallow_errors = 0;
+ m_deep_errors = 0;
+ m_fixed_count = 0;
+ m_omap_stats = (const struct omap_stat_t){0};
+
+ run_callbacks();
+
+ m_inconsistent.clear();
+ m_missing.clear();
+ m_authoritative.clear();
+ num_digest_updates_pending = 0;
+ m_primary_scrubmap = ScrubMap{};
+ m_primary_scrubmap_pos.reset();
+ replica_scrubmap = ScrubMap{};
+ replica_scrubmap_pos.reset();
+ m_cleaned_meta_map = ScrubMap{};
+ m_needs_sleep = true;
+ m_sleep_started_at = utime_t{};
+
+ m_active = false;
+ clear_queued_or_active();
+}
+
+// note that only applicable to the Replica:
+void PgScrubber::advance_token()
+{
+ dout(10) << __func__ << " was: " << m_current_token << dendl;
+ m_current_token++;
+
+  // when advance_token() is called, it is assumed that no scrubbing is taking place.
+  // We verify that, though: if we are actually still handling a stale request,
+  // both our internal state and the FSM state will be cleared.
+ replica_handling_done();
+ m_fsm->process_event(FullReset{});
+}
+
+bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+{
+ if (received_token == 0 || received_token == m_current_token) {
+ return true;
+ }
+  dout(5) << __func__ << " obsolete token (" << received_token << " vs current "
+          << m_current_token << ")" << dendl;
+
+ return false;
+}
+
+const OSDMapRef& PgScrubber::get_osdmap() const
+{
+ return m_pg->get_osdmap();
+}
+
+ostream& operator<<(ostream& out, const PgScrubber& scrubber)
+{
+ return out << scrubber.m_flags;
+}
+
+std::ostream& PgScrubber::gen_prefix(std::ostream& out) const
+{
+ const auto fsm_state = m_fsm ? m_fsm->current_states_desc() : "- :";
+ if (m_pg) {
+ return m_pg->gen_prefix(out) << "scrubber " << fsm_state << ": ";
+ } else {
+ return out << " scrubber [~] " << fsm_state << ": ";
+ }
+}
+
+ostream& PgScrubber::show(ostream& out) const
+{
+ return out << " [ " << m_pg_id << ": " << m_flags << " ] ";
+}
+
+// ///////////////////// preemption_data_t //////////////////////////////////
+
+PgScrubber::preemption_data_t::preemption_data_t(PG* pg) : m_pg{pg}
+{
+ m_left = static_cast<int>(
+ m_pg->get_cct()->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+}
+
+void PgScrubber::preemption_data_t::reset()
+{
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ m_preemptable = false;
+ m_preempted = false;
+ m_left =
+ static_cast<int>(m_pg->cct->_conf.get_val<uint64_t>("osd_scrub_max_preemptions"));
+ m_size_divisor = 1;
+}
+
+
+// ///////////////////// ReplicaReservations //////////////////////////////////
+namespace Scrub {
+
+void ReplicaReservations::release_replica(pg_shard_t peer, epoch_t epoch)
+{
+ auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, peer.shard), epoch,
+ MOSDScrubReserve::RELEASE, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(peer.osd, m, epoch);
+}
+
+ReplicaReservations::ReplicaReservations(PG* pg, pg_shard_t whoami)
+ : m_pg{pg}
+ , m_acting_set{pg->get_actingset()}
+ , m_osds{m_pg->osd}
+ , m_pending{static_cast<int>(m_acting_set.size()) - 1}
+{
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ {
+ std::stringstream prefix;
+ prefix << "osd." << m_osds->whoami << " ep: " << epoch
+ << " scrubber::ReplicaReservations pg[" << pg->pg_id << "]: ";
+ m_log_msg_prefix = prefix.str();
+ }
+
+ // handle the special case of no replicas
+ if (m_pending <= 0) {
+ // just signal the scrub state-machine to continue
+ send_all_done();
+
+ } else {
+
+ for (auto p : m_acting_set) {
+ if (p == whoami)
+ continue;
+ auto m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epoch,
+ MOSDScrubReserve::REQUEST, m_pg->pg_whoami);
+ m_osds->send_message_osd_cluster(p.osd, m, epoch);
+ m_waited_for_peers.push_back(p);
+ dout(10) << __func__ << ": reserve " << p.osd << dendl;
+ }
+ }
+}
+
+void ReplicaReservations::send_all_done()
+{
+ m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
+}
+
+void ReplicaReservations::send_reject()
+{
+ m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
+}
+
+void ReplicaReservations::discard_all()
+{
+ dout(10) << __func__ << ": " << m_reserved_peers << dendl;
+
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+ m_reserved_peers.clear();
+ m_waited_for_peers.clear();
+}
+
+ReplicaReservations::~ReplicaReservations()
+{
+ m_had_rejections = true; // preventing late-coming responses from triggering events
+
+  // send un-reserve messages to all reserved replicas. We do not wait for an answer
+  // (there wouldn't be one). Other incoming messages will be discarded along the way
+  // by our owner.
+ epoch_t epoch = m_pg->get_osdmap_epoch();
+
+ for (auto& p : m_reserved_peers) {
+ release_replica(p, epoch);
+ }
+ m_reserved_peers.clear();
+
+ // note: the release will follow on the heels of the request. When tried otherwise,
+ // grants that followed a reject arrived after the whole scrub machine-state was
+ // reset, causing leaked reservations.
+ for (auto& p : m_waited_for_peers) {
+ release_replica(p, epoch);
+ }
+ m_waited_for_peers.clear();
+}
+
+/**
+ * @ATTN we would not have reached this point had the ReplicaReservations object
+ * managed by the scrubber been reset.
+ */
+void ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << ": granted by " << from << dendl;
+ op->mark_started();
+
+ {
+    // reduce the number of extra release messages. Not a must, but the log is cleaner
+ auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+ if (w != m_waited_for_peers.end())
+ m_waited_for_peers.erase(w);
+ }
+
+ // are we forced to reject the reservation?
+ if (m_had_rejections) {
+
+ dout(10) << __func__ << ": rejecting late-coming reservation from "
+ << from << dendl;
+ release_replica(from, m_pg->get_osdmap_epoch());
+
+ } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+ m_reserved_peers.end()) {
+
+ dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl;
+
+ } else {
+
+ dout(10) << __func__ << ": osd." << from << " scrub reserve = success"
+ << dendl;
+ m_reserved_peers.push_back(from);
+ if (--m_pending == 0) {
+ send_all_done();
+ }
+ }
+}
+
+void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from)
+{
+ dout(10) << __func__ << ": rejected by " << from << dendl;
+ dout(15) << __func__ << ": " << *op->get_req() << dendl;
+ op->mark_started();
+
+ {
+    // reduce the number of extra release messages. Not a must, but the log is cleaner
+ auto w = find(m_waited_for_peers.begin(), m_waited_for_peers.end(), from);
+ if (w != m_waited_for_peers.end())
+ m_waited_for_peers.erase(w);
+ }
+
+ if (m_had_rejections) {
+
+ // our failure was already handled when the first rejection arrived
+ dout(15) << __func__ << ": ignoring late-coming rejection from "
+ << from << dendl;
+
+ } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) !=
+ m_reserved_peers.end()) {
+
+ dout(10) << __func__ << ": already had osd." << from << " reserved" << dendl;
+
+ } else {
+
+ dout(10) << __func__ << ": osd." << from << " scrub reserve = fail" << dendl;
+ m_had_rejections = true; // preventing any additional notifications
+ send_reject();
+ }
+}
+
+std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
+{
+ return out << m_log_msg_prefix;
+}
+
+// ///////////////////// LocalReservation //////////////////////////////////
+
+// note: no dout()s in LocalReservation functions. Client logs interactions.
+LocalReservation::LocalReservation(OSDService* osds)
+ : m_osds{osds}
+{
+ if (m_osds->inc_scrubs_local()) {
+ // the failure is signalled by not having m_holding_local_reservation set
+ m_holding_local_reservation = true;
+ }
+}
+
+LocalReservation::~LocalReservation()
+{
+ if (m_holding_local_reservation) {
+ m_holding_local_reservation = false;
+ m_osds->dec_scrubs_local();
+ }
+}
+
+// ///////////////////// ReservedByRemotePrimary ///////////////////////////////
+
+ReservedByRemotePrimary::ReservedByRemotePrimary(const PgScrubber* scrubber,
+ PG* pg,
+ OSDService* osds,
+ epoch_t epoch)
+ : m_scrubber{scrubber}
+ , m_pg{pg}
+ , m_osds{osds}
+ , m_reserved_at{epoch}
+{
+ if (!m_osds->inc_scrubs_remote()) {
+ dout(10) << __func__ << ": failed to reserve at Primary request" << dendl;
+ // the failure is signalled by not having m_reserved_by_remote_primary set
+ return;
+ }
+
+ dout(20) << __func__ << ": scrub resources reserved at Primary request" << dendl;
+ m_reserved_by_remote_primary = true;
+}
+
+bool ReservedByRemotePrimary::is_stale() const
+{
+ return m_reserved_at < m_pg->get_same_interval_since();
+}
+
+ReservedByRemotePrimary::~ReservedByRemotePrimary()
+{
+ if (m_reserved_by_remote_primary) {
+ m_reserved_by_remote_primary = false;
+ m_osds->dec_scrubs_remote();
+ }
+}
+
+std::ostream& ReservedByRemotePrimary::gen_prefix(std::ostream& out) const
+{
+ return m_scrubber->gen_prefix(out);
+}
+
+// ///////////////////// MapsCollectionStatus ////////////////////////////////
+
+auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from)
+ -> std::tuple<bool, std::string_view>
+{
+ auto fe = std::find(m_maps_awaited_for.begin(), m_maps_awaited_for.end(), from);
+ if (fe != m_maps_awaited_for.end()) {
+ // we are indeed waiting for a map from this replica
+ m_maps_awaited_for.erase(fe);
+ return std::tuple{true, ""sv};
+ } else {
+ return std::tuple{false, " unsolicited scrub-map"sv};
+ }
+}
+
+void MapsCollectionStatus::reset()
+{
+ *this = MapsCollectionStatus{};
+}
+
+std::string MapsCollectionStatus::dump() const
+{
+ std::string all;
+ for (const auto& rp : m_maps_awaited_for) {
+ all.append(rp.get_osd() + " "s);
+ }
+ return all;
+}
+
+ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
+{
+ out << " [ ";
+ for (const auto& rp : sf.m_maps_awaited_for) {
+ out << rp.get_osd() << " ";
+ }
+ if (!sf.m_local_map_ready) {
+ out << " local ";
+ }
+ return out << " ] ";
+}
+
+} // namespace Scrub
diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h
new file mode 100644
index 000000000..392a4a588
--- /dev/null
+++ b/src/osd/pg_scrubber.h
@@ -0,0 +1,821 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "PG.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+#include "scrubber_common.h"
+
+class Callback;
+
+namespace Scrub {
+class ScrubMachine;
+struct BuildMap;
+
+/**
+ * Reserving/freeing scrub resources at the replicas.
+ *
+ * When constructed - sends reservation requests to the acting_set.
+ * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
+ * All previous requests, whether already granted or not, are explicitly released.
+ *
+ * A note re performance: I've measured a few container alternatives for
+ * m_reserved_peers, with its specific usage pattern. std::set is extremely slow, as
+ * expected. flat_set is only slightly better. Surprisingly, std::vector (with no
+ * sorting) is better than boost::small_vec. And for std::vector: there is no need to
+ * pre-reserve.
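+ *
+ * A rough usage sketch (illustrative only - the actual call sites are in PgScrubber):
+ * @code
+ *   // on scrub start, on the Primary:
+ *   m_reservations.emplace(m_pg, m_pg_whoami);
+ *   // each incoming MOSDScrubReserve reply is then forwarded to either
+ *   //   m_reservations->handle_reserve_grant(op, from);  or
+ *   //   m_reservations->handle_reserve_reject(op, from);
+ *   // once all grants have arrived, send_all_done() queues the 'granted' event;
+ *   // destroying the object releases whatever was reserved.
+ * @endcode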
+ */
+class ReplicaReservations {
+ using OrigSet = decltype(std::declval<PG>().get_actingset());
+
+ PG* m_pg;
+ OrigSet m_acting_set;
+ OSDService* m_osds;
+ std::vector<pg_shard_t> m_waited_for_peers;
+ std::vector<pg_shard_t> m_reserved_peers;
+ bool m_had_rejections{false};
+ int m_pending{-1};
+
+ void release_replica(pg_shard_t peer, epoch_t epoch);
+
+ void send_all_done(); ///< all reservations are granted
+
+ /// notify the scrubber that we have failed to reserve replicas' resources
+ void send_reject();
+
+ public:
+ std::string m_log_msg_prefix;
+
+ /**
+ * quietly discard all knowledge about existing reservations. No messages
+ * are sent to peers.
+   * To be used upon interval change, as we know that the running scrub is no longer
+   * relevant, and that the replicas have reset the reservations on their side.
+ */
+ void discard_all();
+
+ ReplicaReservations(PG* pg, pg_shard_t whoami);
+
+ ~ReplicaReservations();
+
+ void handle_reserve_grant(OpRequestRef op, pg_shard_t from);
+
+ void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ * wraps the local OSD scrub resource reservation in an RAII wrapper
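+ *
+ * A minimal sketch of the intended RAII usage (illustrative only; 'osds' stands for a
+ * valid OSDService pointer):
+ * @code
+ *   Scrub::LocalReservation budget{osds};
+ *   if (!budget.is_reserved()) {
+ *     // the OSD-wide local-scrubs budget is exhausted - do not start scrubbing
+ *   }
+ *   // the local reservation is returned automatically when 'budget' goes out of scope
+ * @endcode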
+ */
+class LocalReservation {
+ OSDService* m_osds;
+ bool m_holding_local_reservation{false};
+
+ public:
+ LocalReservation(OSDService* osds);
+ ~LocalReservation();
+ bool is_reserved() const { return m_holding_local_reservation; }
+};
+
+/**
+ * wraps the OSD resource we are using when reserved as a replica by a scrubbing Primary.
+ */
+class ReservedByRemotePrimary {
+ const PgScrubber* m_scrubber; ///< we will be using its gen_prefix()
+ PG* m_pg;
+ OSDService* m_osds;
+ bool m_reserved_by_remote_primary{false};
+ const epoch_t m_reserved_at;
+
+ public:
+ ReservedByRemotePrimary(const PgScrubber* scrubber, PG* pg, OSDService* osds, epoch_t epoch);
+ ~ReservedByRemotePrimary();
+ [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }
+
+ /// compare the remembered reserved-at epoch to the current interval
+ [[nodiscard]] bool is_stale() const;
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+};
+
+/**
+ * Once all replicas' scrub maps are received, we go on to compare the maps. That is,
+ * unless we have not yet completed building our own scrub map. MapsCollectionStatus
+ * combines the status of waiting for both the local map and the replicas, without
+ * resorting to adding dummy entries into a list.
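+ *
+ * A usage sketch (illustrative only; 'replica' stands for a pg_shard_t):
+ * @code
+ *   MapsCollectionStatus status;
+ *   status.mark_replica_map_request(replica);  // for every replica asked for a map
+ *   status.mark_local_map_ready();             // once our own map-building is done
+ *   auto [ok, err] = status.mark_arriving_map(replica);
+ *   if (ok && status.are_all_maps_available()) {
+ *     // ready to compare the maps
+ *   }
+ * @endcode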
+ */
+class MapsCollectionStatus {
+
+ bool m_local_map_ready{false};
+ std::vector<pg_shard_t> m_maps_awaited_for;
+
+ public:
+ [[nodiscard]] bool are_all_maps_available() const
+ {
+ return m_local_map_ready && m_maps_awaited_for.empty();
+ }
+
+ void mark_local_map_ready() { m_local_map_ready = true; }
+
+ void mark_replica_map_request(pg_shard_t from_whom)
+ {
+ m_maps_awaited_for.push_back(from_whom);
+ }
+
+ /// @returns true if indeed waiting for this one. Otherwise: an error string
+ auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;
+
+ std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }
+
+ void reset();
+
+ std::string dump() const;
+
+ friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
+};
+
+} // namespace Scrub
+
+
+/**
+ * the scrub operation flags. Primary only.
+ * Set at scrub start. Checked in multiple locations - mostly
+ * at finish.
+ */
+struct scrub_flags_t {
+
+ unsigned int priority{0};
+
+ /**
+ * set by queue_scrub() if either planned_scrub.auto_repair or
+ * need_auto were set.
+ * Tested at scrub end.
+ */
+ bool auto_repair{false};
+
+ /// this flag indicates that we are scrubbing post repair to verify everything is fixed
+ bool check_repair{false};
+
+ /// checked at the end of the scrub, to possibly initiate a deep-scrub
+ bool deep_scrub_on_error{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+ * process with the 'repair' flag set (in the RequestScrub event).
+ */
+ bool required{false};
+};
+
+ostream& operator<<(ostream& out, const scrub_flags_t& sf);
+
+
+/**
+ * The part of PG-scrubbing code that isn't state-machine wiring.
+ *
+ * Why the separation? I wish to move to a different FSM implementation. Thus I
+ * am forced to strongly decouple the state-machine implementation details from
+ * the actual scrubbing code.
+ */
+class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
+
+ public:
+ explicit PgScrubber(PG* pg);
+
+ // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------
+
+  /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] bool is_reserving() const final;
+
+ void initiate_regular_scrub(epoch_t epoch_queued) final;
+
+ void initiate_scrub_after_repair(epoch_t epoch_queued) final;
+
+ void send_scrub_resched(epoch_t epoch_queued) final;
+
+ void active_pushes_notification(epoch_t epoch_queued) final;
+
+ void update_applied_notification(epoch_t epoch_queued) final;
+
+ void send_scrub_unblock(epoch_t epoch_queued) final;
+
+ void digest_update_notification(epoch_t epoch_queued) final;
+
+ void send_replica_maps_ready(epoch_t epoch_queued) final;
+
+ void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
+
+ void send_replica_pushes_upd(epoch_t epoch_queued) final;
+ /**
+ * The PG has updated its 'applied version'. It might be that we are waiting for this
+ * information: after selecting a range of objects to scrub, we've marked the latest
+ * version of these objects in m_subset_last_update. We will not start the map building
+ * before we know that the PG has reached this version.
+ */
+ void on_applied_when_primary(const eversion_t& applied_version) final;
+
+ void send_full_reset(epoch_t epoch_queued) final;
+
+ void send_chunk_free(epoch_t epoch_queued) final;
+
+ void send_chunk_busy(epoch_t epoch_queued) final;
+
+ void send_local_map_done(epoch_t epoch_queued) final;
+
+ void send_maps_compared(epoch_t epoch_queued) final;
+
+ void send_get_next_chunk(epoch_t epoch_queued) final;
+
+ void send_scrub_is_finished(epoch_t epoch_queued) final;
+
+ /**
+   * we allow some number of preemptions of the scrub, during which we do
+   * not block. After that we start to block, and once we start blocking we do
+   * not stop until the scrub of the current range is completed.
+ */
+ bool write_blocked_by_scrub(const hobject_t& soid) final;
+
+ /// true if the given range intersects the scrub interval in any way
+ bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
+
+ /**
+ * we are a replica being asked by the Primary to reserve OSD resources for
+ * scrubbing
+ */
+ void handle_scrub_reserve_request(OpRequestRef op) final;
+
+ void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
+ void handle_scrub_reserve_release(OpRequestRef op) final;
+ void discard_replica_reservations() final;
+ void clear_scrub_reservations() final; // PG::clear... fwds to here
+ void unreserve_replicas() final;
+
+ // managing scrub op registration
+
+ void reg_next_scrub(const requested_scrub_t& request_flags) final;
+
+ void unreg_next_scrub() final;
+
+ void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) final;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ bool reserve_local() final;
+
+ void handle_query_state(ceph::Formatter* f) final;
+
+ void dump(ceph::Formatter* f) const override;
+
+ // used if we are a replica
+
+ void replica_scrub_op(OpRequestRef op) final;
+
+ /// the op priority, taken from the primary's request message
+ Scrub::scrub_prio_t replica_op_priority() const final
+ {
+ return m_replica_request_priority;
+ };
+
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const final;
+ /// the version that refers to m_flags.priority
+ unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;
+
+ void add_callback(Context* context) final { m_callbacks.push_back(context); }
+
+ [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc
+ {
+ return !m_callbacks.empty();
+ }
+
+ /// handle a message carrying a replica map
+ void map_from_replica(OpRequestRef op) final;
+
+ void scrub_clear_state() final;
+
+ bool is_queued_or_active() const final;
+
+ /**
+ * add to scrub statistics, but only if the soid is below the scrub start
+ */
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) override
+ {
+ ceph_assert(false);
+ }
+
+ /**
+ * finalize the parameters of the initiated scrubbing session:
+ *
+ * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set;
+ * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set.
+ */
+ void set_op_parameters(requested_scrub_t& request) final;
+
+ void cleanup_store(ObjectStore::Transaction* t) final;
+
+ bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const override
+ {
+ return false;
+ }
+
+ // -------------------------------------------------------------------------------------------
+ // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener)
+
+ [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); }
+
+ void select_range_n_notify() final;
+
+ /// walk the log to find the latest update that affects our chunk
+ eversion_t search_log_for_updates() const final;
+
+ eversion_t get_last_update_applied() const final
+ {
+ return m_pg->recovery_state.get_last_update_applied();
+ }
+
+ int pending_active_pushes() const final { return m_pg->active_pushes; }
+
+ void on_init() final;
+ void on_replica_init() final;
+ void replica_handling_done() final;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ void clear_pgscrub_state() final;
+
+ /*
+   * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_needs_sleep'
+ * is asserted - after a configuration-dependent timeout.
+ */
+ void add_delayed_scheduling() final;
+
+ void get_replicas_maps(bool replica_can_preempt) final;
+
+ void on_digest_updates() final;
+
+ void scrub_begin() final;
+
+ void scrub_finish() final;
+
+ ScrubMachineListener::MsgAndEpoch
+ prep_replica_map_msg(Scrub::PreemptionNoted was_preempted) final;
+
+ void send_replica_map(const ScrubMachineListener::MsgAndEpoch& preprepared) final;
+
+ void send_preempted_replica() final;
+
+ void send_remotes_reserved(epoch_t epoch_queued) final;
+ void send_reservation_failure(epoch_t epoch_queued) final;
+
+ /**
+ * does the PG have newer updates than what we (the scrubber) know?
+ */
+ [[nodiscard]] bool has_pg_marked_new_updates() const final;
+
+ void set_subset_last_update(eversion_t e) final;
+
+ void maps_compare_n_cleanup() final;
+
+ Scrub::preemption_t& get_preemptor() final;
+
+ int build_primary_map_chunk() final;
+
+ int build_replica_map_chunk() final;
+
+ void reserve_replicas() final;
+
+ [[nodiscard]] bool was_epoch_changed() const final;
+
+ void set_queued_or_active() final;
+ void clear_queued_or_active() final;
+
+ void mark_local_map_ready() final;
+
+ [[nodiscard]] bool are_all_maps_available() const final;
+
+ std::string dump_awaited_maps() const final;
+
+ std::ostream& gen_prefix(std::ostream& out) const final;
+
+ protected:
+ bool state_test(uint64_t m) const { return m_pg->state_test(m); }
+ void state_set(uint64_t m) { m_pg->state_set(m); }
+ void state_clear(uint64_t m) { m_pg->state_clear(m); }
+
+ [[nodiscard]] bool is_scrub_registered() const;
+
+ virtual void _scrub_clear_state() {}
+
+ utime_t m_scrub_reg_stamp; ///< stamp we registered for
+
+ ostream& show(ostream& out) const override;
+
+ public:
+ // -------------------------------------------------------------------------------------------
+
+ friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);
+
+ static utime_t scrub_must_stamp() { return utime_t(1, 1); }
+
+ virtual ~PgScrubber(); // must be defined separately, in the .cc file
+
+ [[nodiscard]] bool is_scrub_active() const final { return m_active; }
+
+ private:
+ void reset_internal_state();
+
+ /**
+ * the current scrubbing operation is done. We should mark that fact, so that
+ * all events related to the previous operation can be discarded.
+ */
+ void advance_token();
+
+ bool is_token_current(Scrub::act_token_t received_token);
+
+ void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
+
+ void _scan_snaps(ScrubMap& smap);
+
+ ScrubMap clean_meta_map();
+
+ /**
+ * mark down some parameters of the initiated scrub:
+ * - the epoch when started;
+ * - the depth of the scrub requested (from the PG_STATE variable)
+ */
+ void reset_epoch(epoch_t epoch_queued);
+
+ void run_callbacks();
+
+ // ----- methods used to verify the relevance of incoming events:
+
+ /**
+ * is the incoming event still relevant, and should be processed?
+ *
+ * It isn't if:
+ * - (1) we are no longer 'actively scrubbing'; or
+ * - (2) the message is from an epoch prior to when we started the current scrub
+ * session; or
+ * - (3) the message epoch is from a previous interval; or
+ * - (4) the 'abort' configuration flags were set.
+ *
+   * For (1) & (2) - the incoming message is discarded, without further action.
+ *
+ * For (3): (see check_interval() for a full description) if we have not reacted yet
+ * to this specific new interval, we do now:
+ * - replica reservations are silently discarded (we count on the replicas to notice
+ * the interval change and un-reserve themselves);
+ * - the scrubbing is halted.
+ *
+ * For (4): the message will be discarded, but also:
+ * if this is the first time we've noticed the 'abort' request, we perform the abort.
+ *
+ * \returns should the incoming event be processed?
+ */
+ bool is_message_relevant(epoch_t epoch_to_verify);
+
+ /**
+ * check the 'no scrub' configuration options.
+ */
+ [[nodiscard]] bool should_abort() const;
+
+ /**
+ * Check the 'no scrub' configuration flags.
+ *
+ * Reset everything if the abort was not handled before.
+ * @returns false if the message was discarded due to abort flag.
+ */
+ [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);
+
+ [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);
+
+ epoch_t m_last_aborted{}; // last time we've noticed a request to abort
+
+ /**
+ * return true if any inconsistency/missing is repaired, false otherwise
+ */
+ [[nodiscard]] bool scrub_process_inconsistent();
+
+ void scrub_compare_maps();
+
+ bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always
+ ///< 'true', unless we just got out of a sleep period
+
+ utime_t m_sleep_started_at;
+
+
+ // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed'
+ // to guarantee un-reserving when deleted.
+ std::optional<Scrub::ReplicaReservations> m_reservations;
+ std::optional<Scrub::LocalReservation> m_local_osd_resource;
+
+ /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing
+ std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;
+
+ void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when
+ // Active->NotActive
+
+ protected:
+ PG* const m_pg;
+
+ /**
+ * the derivative-specific scrub-finishing touches:
+ */
+ virtual void _scrub_finish() {}
+
+ /**
+ * Validate consistency of the object info and snap sets.
+ */
+ virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest)
+ {}
+
+ // common code used by build_primary_map_chunk() and build_replica_map_chunk():
+ int build_scrub_map_chunk(ScrubMap& map, // primary or replica?
+ ScrubMapBuilder& pos,
+ hobject_t start,
+ hobject_t end,
+ bool deep);
+
+ std::unique_ptr<Scrub::ScrubMachine> m_fsm;
+ const spg_t m_pg_id; ///< a local copy of m_pg->pg_id
+ OSDService* const m_osds;
+ const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami;
+
+  epoch_t m_interval_start{0}; ///< the 'from' of the interval in which scrubbing was first scheduled
+ /*
+   * the exact epoch when the scrubbing actually started (i.e. once the no-scrub
+   * configuration checks were cleared). Incoming events are verified against this
+   * epoch, with stale events discarded.
+ */
+ epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started
+
+ /**
+ * (replica) a tag identifying a specific scrub "session". Incremented whenever the
+ * Primary releases the replica scrub resources.
+ * When the scrub session is terminated (even if the interval remains unchanged, as
+ * might happen following an asok no-scrub command), stale scrub-resched messages
+ * triggered by the backend will be discarded.
+ */
+ Scrub::act_token_t m_current_token{1};
+
+ scrub_flags_t m_flags;
+
+ bool m_active{false};
+
+ /**
+   * a flag that prevents the initiation of a second scrub on a PG for which
+   * scrubbing has already been initiated.
+ *
+ * set once scrubbing was initiated (i.e. - even before the FSM event that
+ * will trigger a state-change out of Inactive was handled), and only reset
+ * once the FSM is back in Inactive.
+ * In other words - its ON period encompasses:
+ * - the time period covered today by 'queued', and
+ * - the time when m_active is set, and
+ * - all the time from scrub_finish() calling update_stats() till the
+ * FSM handles the 'finished' event
+ *
+ * Compared with 'm_active', this flag is asserted earlier and remains ON for longer.
+ */
+ bool m_queued_or_active{false};
+
+ eversion_t m_subset_last_update{};
+
+ std::unique_ptr<Scrub::Store> m_store;
+
+ int num_digest_updates_pending{0};
+ hobject_t m_start, m_end; ///< note: half-closed: [start,end)
+
+ /// Returns reference to current osdmap
+ const OSDMapRef& get_osdmap() const;
+
+ /// Returns epoch of current osdmap
+ epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }
+
+ CephContext* get_pg_cct() const { return m_pg->cct; }
+
+ // collected statistics
+ int m_shallow_errors{0};
+ int m_deep_errors{0};
+ int m_fixed_count{0};
+
+ /// Maps from objects with errors to missing peers
+ HobjToShardSetMapping m_missing;
+
+ protected:
+ /**
+ * 'm_is_deep' - is the running scrub a deep one?
+ *
+ * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
+ * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is
+ * meaningful both for the primary and the replicas, and is used as a parameter when
+ * building the scrub maps.
+ */
+ bool m_is_deep{false};
+
+ /**
+ * If set: affects the backend & scrubber-backend functions called after all
+ * scrub maps are available.
+ *
+ * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
+ * a "user facing" status display only).
+ */
+ bool m_is_repair{false};
+
+ /**
+ * User-readable summary of the scrubber's current mode of operation. Used for
+ * both osd.*.log and the cluster log.
+   * One of:
+   *  "repair",
+   *  "deep-scrub",
+   *  "scrub"
+   *
+   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. an auto_repair
+   * scrub will show as "deep-scrub" and not as "repair" (until the first error
+   * is detected).
+ */
+ std::string_view m_mode_desc;
+
+ void update_op_mode_text();
+
+private:
+
+ /**
+ * initiate a deep-scrub after the current scrub ended with errors.
+ */
+ void request_rescrubbing(requested_scrub_t& req_flags);
+
+ /*
+ * Select a range of objects to scrub.
+ *
+ * By:
+ * - setting tentative range based on conf and divisor
+ * - requesting a partial list of elements from the backend;
+ * - handling some head/clones issues
+ *
+ * The selected range is set directly into 'm_start' and 'm_end'
+ */
+ bool select_range();
+
+ std::list<Context*> m_callbacks;
+
+ /**
+ * send a replica (un)reservation request to the acting set
+ *
+ * @param opcode - one of MOSDScrubReserve::REQUEST
+ * or MOSDScrubReserve::RELEASE
+ */
+ void message_all_replicas(int32_t opcode, std::string_view op_text);
+
+ hobject_t m_max_end; ///< Largest end that may have been sent to replicas
+ ScrubMap m_primary_scrubmap;
+ ScrubMapBuilder m_primary_scrubmap_pos;
+
+ std::map<pg_shard_t, ScrubMap> m_received_maps;
+
+  /// Cleaned map, pending snap metadata scrub
+ ScrubMap m_cleaned_meta_map;
+
+ void _request_scrub_map(pg_shard_t replica,
+ eversion_t version,
+ hobject_t start,
+ hobject_t end,
+ bool deep,
+ bool allow_preemption);
+
+
+ Scrub::MapsCollectionStatus m_maps_status;
+
+ omap_stat_t m_omap_stats = (const struct omap_stat_t){0};
+
+ /// Maps from objects with errors to inconsistent peers
+ HobjToShardSetMapping m_inconsistent;
+
+ /// Maps from object with errors to good peers
+ std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>> m_authoritative;
+
+ // ------------ members used if we are a replica
+
+ epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message
+
+ ScrubMapBuilder replica_scrubmap_pos;
+ ScrubMap replica_scrubmap;
+
+ /**
+ * we mark the request priority as it arrived. It influences the queuing priority
+ * when we wait for local updates
+ */
+ Scrub::scrub_prio_t m_replica_request_priority;
+
+ /**
+ * the 'preemption' "state-machine".
+ * Note: I was considering an orthogonal sub-machine implementation, but as
+ * the state diagram is extremely simple, the added complexity wasn't justified.
+ */
+ class preemption_data_t : public Scrub::preemption_t {
+ public:
+ preemption_data_t(PG* pg); // the PG access is used for conf access (and logs)
+
+ [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }
+
+ bool do_preempt() final
+ {
+ if (m_preempted || !m_preemptable)
+ return false;
+
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (!m_preemptable)
+ return false;
+
+ m_preempted = true;
+ return true;
+ }
+
+ /// same as 'do_preempt()' but w/o checks (as once a replica
+ /// was preempted, we cannot continue)
+ void replica_preempted() { m_preempted = true; }
+
+ void enable_preemption()
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ if (are_preemptions_left() && !m_preempted) {
+ m_preemptable = true;
+ }
+ }
+
+ /// used by a replica to set preemptability state according to the Primary's request
+ void force_preemptability(bool is_allowed)
+ {
+ // note: no need to lock for a replica
+ m_preempted = false;
+ m_preemptable = is_allowed;
+ }
+
+ bool disable_and_test() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+ m_preemptable = false;
+ return m_preempted;
+ }
+
+ [[nodiscard]] bool was_preempted() const { return m_preempted; }
+
+ [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }
+
+ void reset();
+
+ void adjust_parameters() final
+ {
+ std::lock_guard<std::mutex> lk{m_preemption_lock};
+
+ if (m_preempted) {
+ m_preempted = false;
+ m_preemptable = adjust_left();
+ } else {
+ m_preemptable = are_preemptions_left();
+ }
+ }
+
+ private:
+ PG* m_pg;
+ mutable std::mutex m_preemption_lock;
+ bool m_preemptable{false};
+ bool m_preempted{false};
+ int m_left;
+ size_t m_size_divisor{1};
+ bool are_preemptions_left() const { return m_left > 0; }
+
+ bool adjust_left()
+ {
+ if (m_left > 0) {
+ --m_left;
+ m_size_divisor *= 2;
+ }
+ return m_left > 0;
+ }
+ };
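+  // Illustrative flow (a sketch only; the call sites and their ordering are
+  // assumptions based on the interface above, not copied from the scrub code):
+  //   preemption_data.enable_preemption();        // open the preemption window
+  //   ... build the scrub map for the current chunk ...
+  //   if (preemption_data.disable_and_test()) {   // atomically close the window
+  //     // a higher-priority op preempted us: shrink the next chunk and retry
+  //     preemption_data.adjust_parameters();
+  //   }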
+
+ preemption_data_t preemption_data;
+};
diff --git a/src/osd/recovery_types.cc b/src/osd/recovery_types.cc
new file mode 100644
index 000000000..3dd49a82d
--- /dev/null
+++ b/src/osd/recovery_types.cc
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "recovery_types.h"
+
+std::ostream& operator<<(std::ostream& out, const BackfillInterval& bi)
+{
+ out << "BackfillInfo(" << bi.begin << "-" << bi.end
+ << " " << bi.objects.size() << " objects";
+ if (!bi.objects.empty())
+ out << " " << bi.objects;
+ out << ")";
+ return out;
+}
+
+
diff --git a/src/osd/recovery_types.h b/src/osd/recovery_types.h
new file mode 100644
index 000000000..73a621882
--- /dev/null
+++ b/src/osd/recovery_types.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+
+#include "osd_types.h"
+
+/**
+ * BackfillInterval
+ *
+ * Represents the objects in a range [begin, end)
+ *
+ * Possible states:
+ * 1) begin == end == hobject_t() indicates that the interval is unpopulated
+ * 2) Else, objects contains all objects in [begin, end)
+ */
+struct BackfillInterval {
+ // info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
+ std::map<hobject_t,eversion_t> objects;
+ hobject_t begin;
+ hobject_t end;
+
+ /// clear content
+ void clear() {
+ *this = BackfillInterval();
+ }
+
+  /// clear the objects list only
+ void clear_objects() {
+ objects.clear();
+ }
+
+ /// reinstantiate with a new start+end position and sort order
+ void reset(hobject_t start) {
+ clear();
+ begin = end = start;
+ }
+
+ /// true if there are no objects in this interval
+ bool empty() const {
+ return objects.empty();
+ }
+
+ /// true if interval extends to the end of the range
+ bool extends_to_end() const {
+ return end.is_max();
+ }
+
+ /// removes items <= soid and adjusts begin to the first object
+ void trim_to(const hobject_t &soid) {
+ trim();
+ while (!objects.empty() &&
+ objects.begin()->first <= soid) {
+ pop_front();
+ }
+ }
+
+ /// Adjusts begin to the first object
+ void trim() {
+ if (!objects.empty())
+ begin = objects.begin()->first;
+ else
+ begin = end;
+ }
+
+ /// drop first entry, and adjust @begin accordingly
+ void pop_front() {
+ ceph_assert(!objects.empty());
+ objects.erase(objects.begin());
+ trim();
+ }
+
+ /// dump
+ void dump(ceph::Formatter *f) const {
+ f->dump_stream("begin") << begin;
+ f->dump_stream("end") << end;
+ f->open_array_section("objects");
+ for (std::map<hobject_t, eversion_t>::const_iterator i =
+ objects.begin();
+ i != objects.end();
+ ++i) {
+ f->open_object_section("object");
+ f->dump_stream("object") << i->first;
+ f->dump_stream("version") << i->second;
+ f->close_section();
+ }
+ f->close_section();
+ }
+};
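+// A small worked example of the trimming semantics (the object names and versions
+// are hypothetical): with objects == {a:v1, b:v2, c:v3} and begin == a,
+// trim_to(b) erases a and b and leaves begin == c; once every entry has been
+// erased, trim() collapses begin to end.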
+
+std::ostream &operator<<(std::ostream &out, const BackfillInterval &bi);
+
diff --git a/src/osd/scheduler/OpScheduler.cc b/src/osd/scheduler/OpScheduler.cc
new file mode 100644
index 000000000..3ce6fdb55
--- /dev/null
+++ b/src/osd/scheduler/OpScheduler.cc
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <ostream>
+
+#include "osd/scheduler/OpScheduler.h"
+
+#include "common/WeightedPriorityQueue.h"
+#include "osd/scheduler/mClockScheduler.h"
+
+namespace ceph::osd::scheduler {
+
+OpSchedulerRef make_scheduler(
+ CephContext *cct, uint32_t num_shards, bool is_rotational)
+{
+ const std::string *type = &cct->_conf->osd_op_queue;
+ if (*type == "debug_random") {
+ static const std::string index_lookup[] = { "mclock_scheduler",
+ "wpq" };
+ srand(time(NULL));
+ unsigned which = rand() % (sizeof(index_lookup) / sizeof(index_lookup[0]));
+ type = &index_lookup[which];
+ }
+
+ if (*type == "wpq" ) {
+ // default is 'wpq'
+ return std::make_unique<
+ ClassedOpQueueScheduler<WeightedPriorityQueue<OpSchedulerItem, client>>>(
+ cct,
+ cct->_conf->osd_op_pq_max_tokens_per_priority,
+ cct->_conf->osd_op_pq_min_cost
+ );
+ } else if (*type == "mclock_scheduler") {
+ return std::make_unique<mClockScheduler>(cct, num_shards, is_rotational);
+ } else {
+ ceph_assert("Invalid choice of wq" == 0);
+ }
+}
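+// Minimal usage sketch (the surrounding OSD shard setup is assumed and omitted):
+//   OpSchedulerRef sched = make_scheduler(cct, num_shards, is_rotational);
+//   sched->enqueue(std::move(item));
+//   WorkItem next = sched->dequeue();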
+
+std::ostream &operator<<(std::ostream &lhs, const OpScheduler &rhs) {
+ rhs.print(lhs);
+ return lhs;
+}
+
+}
diff --git a/src/osd/scheduler/OpScheduler.h b/src/osd/scheduler/OpScheduler.h
new file mode 100644
index 000000000..6e2bb5abd
--- /dev/null
+++ b/src/osd/scheduler/OpScheduler.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <ostream>
+#include <variant>
+
+#include "common/ceph_context.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+namespace ceph::osd::scheduler {
+
+using client = uint64_t;
+using WorkItem = std::variant<std::monostate, OpSchedulerItem, double>;
+
+/**
+ * Base interface for classes responsible for choosing
+ * op processing order in the OSD.
+ */
+class OpScheduler {
+public:
+ // Enqueue op for scheduling
+ virtual void enqueue(OpSchedulerItem &&item) = 0;
+
+ // Enqueue op for processing as though it were enqueued prior
+ // to other items already scheduled.
+ virtual void enqueue_front(OpSchedulerItem &&item) = 0;
+
+ // Returns true iff there are no ops scheduled
+ virtual bool empty() const = 0;
+
+ // Return next op to be processed
+ virtual WorkItem dequeue() = 0;
+
+ // Dump formatted representation for the queue
+ virtual void dump(ceph::Formatter &f) const = 0;
+
+ // Print human readable brief description with relevant parameters
+ virtual void print(std::ostream &out) const = 0;
+
+ // Apply config changes to the scheduler (if any)
+ virtual void update_configuration() = 0;
+
+ // Destructor
+ virtual ~OpScheduler() {};
+};
+
+std::ostream &operator<<(std::ostream &lhs, const OpScheduler &);
+using OpSchedulerRef = std::unique_ptr<OpScheduler>;
+
+OpSchedulerRef make_scheduler(
+ CephContext *cct, uint32_t num_shards, bool is_rotational);
+
+/**
+ * Implements OpScheduler in terms of OpQueue
+ *
+ * Templated on queue type to avoid dynamic dispatch, T should implement
+ * OpQueue<OpSchedulerItem, client>. This adapter is mainly responsible for the
+ * boilerplate priority-cutoff / strict-queue handling required by OpQueue-based
+ * implementations.
+ */
+template <typename T>
+class ClassedOpQueueScheduler final : public OpScheduler {
+ unsigned cutoff;
+ T queue;
+
+ static unsigned int get_io_prio_cut(CephContext *cct) {
+ if (cct->_conf->osd_op_queue_cut_off == "debug_random") {
+ srand(time(NULL));
+ return (rand() % 2 < 1) ? CEPH_MSG_PRIO_HIGH : CEPH_MSG_PRIO_LOW;
+ } else if (cct->_conf->osd_op_queue_cut_off == "high") {
+ return CEPH_MSG_PRIO_HIGH;
+ } else {
+ // default / catch-all is 'low'
+ return CEPH_MSG_PRIO_LOW;
+ }
+ }
+public:
+ template <typename... Args>
+ ClassedOpQueueScheduler(CephContext *cct, Args&&... args) :
+ cutoff(get_io_prio_cut(cct)),
+ queue(std::forward<Args>(args)...)
+ {}
+
+ void enqueue(OpSchedulerItem &&item) final {
+ unsigned priority = item.get_priority();
+ unsigned cost = item.get_cost();
+
+ if (priority >= cutoff)
+ queue.enqueue_strict(
+ item.get_owner(), priority, std::move(item));
+ else
+ queue.enqueue(
+ item.get_owner(), priority, cost, std::move(item));
+ }
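+  // Illustrative behaviour (a description, not additional logic): with cutoff set
+  // to CEPH_MSG_PRIO_HIGH, an item queued at or above that priority bypasses the
+  // weighted queue via enqueue_strict(), while ordinary client ops take the
+  // cost-weighted enqueue() path.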
+
+ void enqueue_front(OpSchedulerItem &&item) final {
+ unsigned priority = item.get_priority();
+ unsigned cost = item.get_cost();
+ if (priority >= cutoff)
+ queue.enqueue_strict_front(
+ item.get_owner(),
+ priority, std::move(item));
+ else
+ queue.enqueue_front(
+ item.get_owner(),
+ priority, cost, std::move(item));
+ }
+
+ bool empty() const final {
+ return queue.empty();
+ }
+
+ WorkItem dequeue() final {
+ return queue.dequeue();
+ }
+
+ void dump(ceph::Formatter &f) const final {
+ return queue.dump(&f);
+ }
+
+ void print(std::ostream &out) const final {
+ out << "ClassedOpQueueScheduler(queue=";
+ queue.print(out);
+ out << ", cutoff=" << cutoff << ")";
+ }
+
+ void update_configuration() final {
+ // no-op
+ }
+
+ ~ClassedOpQueueScheduler() final {};
+};
+
+}
diff --git a/src/osd/scheduler/OpSchedulerItem.cc b/src/osd/scheduler/OpSchedulerItem.cc
new file mode 100644
index 000000000..27db1dfa3
--- /dev/null
+++ b/src/osd/scheduler/OpSchedulerItem.cc
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "osd/scheduler/OpSchedulerItem.h"
+#include "osd/OSD.h"
+#ifdef HAVE_JAEGER
+#include "common/tracer.h"
+#endif
+
+namespace ceph::osd::scheduler {
+
+void PGOpItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+#ifdef HAVE_JAEGER
+ auto PGOpItem_span = jaeger_tracing::child_span("PGOpItem::run", op->osd_parent_span);
+#endif
+ osd->dequeue_op(pg, op, handle);
+ pg->unlock();
+}
+
+void PGPeeringItem::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_peering_evt(sdata, pg.get(), evt, handle);
+}
+
+void PGSnapTrim::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ pg->snap_trimmer(epoch_queued);
+ pg->unlock();
+}
+
+void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubAfterRepair::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->recovery_scrub(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_scrub_resched(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubResourcesOK::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_resources_granted(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubDenied::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_resources_denied(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubPushesUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_pushes_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubAppliedUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_applied_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubUnblocked::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_unblocking(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubDigestUpdate::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_digest_update(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGotLocalMap::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_local_map_ready(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGotReplMaps::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replmaps_ready(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubMapsCompared::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_maps_compared(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub(epoch_queued, activation_index, handle);
+ pg->unlock();
+}
+
+void PGRepScrubResched::run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->replica_scrub_resched(epoch_queued, activation_index, handle);
+ pg->unlock();
+}
+
+void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_replica_pushes(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubScrubFinished::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_scrub_is_finished(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubGetNextChunk::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_get_next_chunk(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubChunkIsBusy::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_chunk_busy(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGScrubChunkIsFree::run([[maybe_unused]] OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle)
+{
+ pg->scrub_send_chunk_free(epoch_queued, handle);
+ pg->unlock();
+}
+
+void PGRecovery::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle);
+ pg->unlock();
+}
+
+void PGRecoveryContext::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ c.release()->complete(handle);
+ pg->unlock();
+}
+
+void PGDelete::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_delete(sdata, pg.get(), epoch_queued, handle);
+}
+
+void PGRecoveryMsg::run(
+ OSD *osd,
+ OSDShard *sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle &handle)
+{
+ osd->dequeue_op(pg, op, handle);
+ pg->unlock();
+}
+
+}
diff --git a/src/osd/scheduler/OpSchedulerItem.h b/src/osd/scheduler/OpSchedulerItem.h
new file mode 100644
index 000000000..7ba59838e
--- /dev/null
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "include/types.h"
+#include "include/utime.h"
+#include "osd/OpRequest.h"
+#include "osd/PG.h"
+#include "osd/PGPeeringEvent.h"
+#include "messages/MOSDOp.h"
+
+
+class OSD;
+class OSDShard;
+
+namespace ceph::osd::scheduler {
+
+enum class op_scheduler_class : uint8_t {
+ background_recovery = 0,
+ background_best_effort,
+ immediate,
+ client,
+};
+
+class OpSchedulerItem {
+public:
+ class OrderLocker {
+ public:
+ using Ref = std::unique_ptr<OrderLocker>;
+ virtual void lock() = 0;
+ virtual void unlock() = 0;
+ virtual ~OrderLocker() {}
+ };
+
+ // Abstraction for operations queueable in the op queue
+ class OpQueueable {
+ public:
+ enum class op_type_t {
+ client_op,
+ peering_event,
+ bg_snaptrim,
+ bg_recovery,
+ bg_scrub,
+ bg_pg_delete
+ };
+ using Ref = std::unique_ptr<OpQueueable>;
+
+ /// Items with the same queue token will end up in the same shard
+ virtual uint32_t get_queue_token() const = 0;
+
+ /* Items will be dequeued and locked atomically w.r.t. other items with the
+ * same ordering token */
+ virtual const spg_t& get_ordering_token() const = 0;
+ virtual OrderLocker::Ref get_order_locker(PGRef pg) = 0;
+ virtual op_type_t get_op_type() const = 0;
+ virtual std::optional<OpRequestRef> maybe_get_op() const {
+ return std::nullopt;
+ }
+
+ virtual uint64_t get_reserved_pushes() const {
+ return 0;
+ }
+
+ virtual bool is_peering() const {
+ return false;
+ }
+ virtual bool peering_requires_pg() const {
+ ceph_abort();
+ }
+ virtual const PGCreateInfo *creates_pg() const {
+ return nullptr;
+ }
+
+ virtual std::ostream &print(std::ostream &rhs) const = 0;
+
+ virtual void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) = 0;
+ virtual op_scheduler_class get_scheduler_class() const = 0;
+
+ virtual ~OpQueueable() {}
+ friend std::ostream& operator<<(std::ostream& out, const OpQueueable& q) {
+ return q.print(out);
+ }
+
+ };
+
+private:
+ OpQueueable::Ref qitem;
+ int cost;
+ unsigned priority;
+ utime_t start_time;
+ uint64_t owner; ///< global id (e.g., client.XXX)
+ epoch_t map_epoch; ///< an epoch we expect the PG to exist in
+
+public:
+ OpSchedulerItem(
+ OpQueueable::Ref &&item,
+ int cost,
+ unsigned priority,
+ utime_t start_time,
+ uint64_t owner,
+ epoch_t e)
+ : qitem(std::move(item)),
+ cost(cost),
+ priority(priority),
+ start_time(start_time),
+ owner(owner),
+ map_epoch(e)
+ {}
+ OpSchedulerItem(OpSchedulerItem &&) = default;
+ OpSchedulerItem(const OpSchedulerItem &) = delete;
+ OpSchedulerItem &operator=(OpSchedulerItem &&) = default;
+ OpSchedulerItem &operator=(const OpSchedulerItem &) = delete;
+
+ OrderLocker::Ref get_order_locker(PGRef pg) {
+ return qitem->get_order_locker(pg);
+ }
+ uint32_t get_queue_token() const {
+ return qitem->get_queue_token();
+ }
+ const spg_t& get_ordering_token() const {
+ return qitem->get_ordering_token();
+ }
+ using op_type_t = OpQueueable::op_type_t;
+ OpQueueable::op_type_t get_op_type() const {
+ return qitem->get_op_type();
+ }
+ std::optional<OpRequestRef> maybe_get_op() const {
+ return qitem->maybe_get_op();
+ }
+ uint64_t get_reserved_pushes() const {
+ return qitem->get_reserved_pushes();
+ }
+ void run(OSD *osd, OSDShard *sdata,PGRef& pg, ThreadPool::TPHandle &handle) {
+ qitem->run(osd, sdata, pg, handle);
+ }
+ unsigned get_priority() const { return priority; }
+ int get_cost() const { return cost; }
+ utime_t get_start_time() const { return start_time; }
+ uint64_t get_owner() const { return owner; }
+ epoch_t get_map_epoch() const { return map_epoch; }
+
+ bool is_peering() const {
+ return qitem->is_peering();
+ }
+
+ const PGCreateInfo *creates_pg() const {
+ return qitem->creates_pg();
+ }
+
+ bool peering_requires_pg() const {
+ return qitem->peering_requires_pg();
+ }
+
+ op_scheduler_class get_scheduler_class() const {
+ return qitem->get_scheduler_class();
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const OpSchedulerItem& item) {
+ out << "OpSchedulerItem("
+ << item.get_ordering_token() << " " << *item.qitem
+ << " prio " << item.get_priority()
+ << " cost " << item.get_cost()
+ << " e" << item.get_map_epoch();
+ if (item.get_reserved_pushes()) {
+ out << " reserved_pushes " << item.get_reserved_pushes();
+ }
+ return out << ")";
+ }
+}; // class OpSchedulerItem
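+// Illustrative construction (the cost/priority/owner values below are hypothetical,
+// chosen only to show the argument order):
+//   OpSchedulerItem item(
+//     std::make_unique<PGOpItem>(pgid, std::move(op)),  // the queueable payload
+//     /*cost=*/4096, /*priority=*/63,
+//     ceph_clock_now(), /*owner=*/client_gid, osdmap_epoch);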
+
+/// Implements boilerplate for operations queued for the pg lock
+class PGOpQueueable : public OpSchedulerItem::OpQueueable {
+ spg_t pgid;
+protected:
+ const spg_t& get_pgid() const {
+ return pgid;
+ }
+public:
+ explicit PGOpQueueable(spg_t pg) : pgid(pg) {}
+ uint32_t get_queue_token() const final {
+ return get_pgid().ps();
+ }
+
+ const spg_t& get_ordering_token() const final {
+ return get_pgid();
+ }
+
+ OpSchedulerItem::OrderLocker::Ref get_order_locker(PGRef pg) final {
+ class Locker : public OpSchedulerItem::OrderLocker {
+ PGRef pg;
+ public:
+ explicit Locker(PGRef pg) : pg(pg) {}
+ void lock() final {
+ pg->lock();
+ }
+ void unlock() final {
+ pg->unlock();
+ }
+ };
+ return OpSchedulerItem::OrderLocker::Ref(
+ new Locker(pg));
+ }
+};
+
+class PGOpItem : public PGOpQueueable {
+ OpRequestRef op;
+
+ const MOSDOp *maybe_get_mosd_op() const {
+ auto req = op->get_req();
+ if (req->get_type() == CEPH_MSG_OSD_OP) {
+ return op->get_req<MOSDOp>();
+ } else {
+ return nullptr;
+ }
+ }
+
+public:
+ PGOpItem(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+ op_type_t get_op_type() const final {
+
+ return op_type_t::client_op;
+ }
+
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGOpItem(op=" << *(op->get_req()) << ")";
+ }
+
+ std::optional<OpRequestRef> maybe_get_op() const final {
+ return op;
+ }
+
+ op_scheduler_class get_scheduler_class() const final {
+ auto type = op->get_req()->get_type();
+ if (type == CEPH_MSG_OSD_OP ||
+ type == CEPH_MSG_OSD_BACKOFF) {
+ return op_scheduler_class::client;
+ } else {
+ return op_scheduler_class::immediate;
+ }
+ }
+
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+};
+
+class PGPeeringItem : public PGOpQueueable {
+ PGPeeringEventRef evt;
+public:
+ PGPeeringItem(spg_t pg, PGPeeringEventRef e) : PGOpQueueable(pg), evt(e) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::peering_event;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGPeeringEvent(" << evt->get_desc() << ")";
+ }
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ bool is_peering() const override {
+ return true;
+ }
+ bool peering_requires_pg() const override {
+ return evt->requires_pg;
+ }
+ const PGCreateInfo *creates_pg() const override {
+ return evt->create_info.get();
+ }
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::immediate;
+ }
+};
+
+class PGSnapTrim : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGSnapTrim(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg), epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_snaptrim;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGSnapTrim(pgid=" << get_pgid()
+ << " epoch_queued=" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrub : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGScrub(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg), epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_scrub;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGScrub(pgid=" << get_pgid()
+	       << " epoch_queued=" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrubItem : public PGOpQueueable {
+ protected:
+ epoch_t epoch_queued;
+ Scrub::act_token_t activation_index;
+ std::string_view message_name;
+ PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name)
+ : PGOpQueueable{pg}
+ , epoch_queued{epoch_queued}
+ , activation_index{0}
+ , message_name{derivative_name}
+ {}
+ PGScrubItem(spg_t pg,
+ epoch_t epoch_queued,
+ Scrub::act_token_t op_index,
+ std::string_view derivative_name)
+ : PGOpQueueable{pg}
+ , epoch_queued{epoch_queued}
+ , activation_index{op_index}
+ , message_name{derivative_name}
+ {}
+ op_type_t get_op_type() const final { return op_type_t::bg_scrub; }
+ std::ostream& print(std::ostream& rhs) const final
+ {
+ return rhs << message_name << "(pgid=" << get_pgid()
+	       << " epoch_queued=" << epoch_queued
+ << " scrub-token=" << activation_index << ")";
+ }
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ ThreadPool::TPHandle& handle) override = 0;
+ op_scheduler_class get_scheduler_class() const final
+ {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGScrubResched : public PGScrubItem {
+ public:
+ PGScrubResched(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * all replicas have granted our scrub resources request
+ */
+class PGScrubResourcesOK : public PGScrubItem {
+ public:
+ PGScrubResourcesOK(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubResourcesOK"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * scrub resources requests denied by replica(s)
+ */
+class PGScrubDenied : public PGScrubItem {
+ public:
+ PGScrubDenied(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubDenied"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+/**
+ * called when a repair process completes, to initiate scrubbing. No local/remote
+ * resources are allocated.
+ */
+class PGScrubAfterRepair : public PGScrubItem {
+ public:
+ PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubPushesUpdate : public PGScrubItem {
+ public:
+ PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubAppliedUpdate : public PGScrubItem {
+ public:
+ PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubUnblocked : public PGScrubItem {
+ public:
+ PGScrubUnblocked(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"}
+ {}
+ void run(OSD* osd,
+ OSDShard* sdata,
+ PGRef& pg,
+ [[maybe_unused]] ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubDigestUpdate : public PGScrubItem {
+ public:
+ PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotLocalMap : public PGScrubItem {
+ public:
+ PGScrubGotLocalMap(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGotLocalMap"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGotReplMaps : public PGScrubItem {
+ public:
+ PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubMapsCompared : public PGScrubItem {
+ public:
+ PGScrubMapsCompared(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubMapsCompared"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrub : public PGScrubItem {
+ public:
+ PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+ : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRepScrubResched : public PGScrubItem {
+ public:
+ PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+ : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubReplicaPushes : public PGScrubItem {
+ public:
+ PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubScrubFinished : public PGScrubItem {
+ public:
+ PGScrubScrubFinished(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubScrubFinished"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubGetNextChunk : public PGScrubItem {
+ public:
+ PGScrubGetNextChunk(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubGetNextChunk"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubChunkIsBusy : public PGScrubItem {
+ public:
+ PGScrubChunkIsBusy(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsBusy"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGScrubChunkIsFree : public PGScrubItem {
+ public:
+ PGScrubChunkIsFree(spg_t pg, epoch_t epoch_queued)
+ : PGScrubItem{pg, epoch_queued, "PGScrubChunkIsFree"}
+ {}
+ void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
+};
+
+class PGRecovery : public PGOpQueueable {
+ epoch_t epoch_queued;
+ uint64_t reserved_pushes;
+public:
+ PGRecovery(
+ spg_t pg,
+ epoch_t epoch_queued,
+ uint64_t reserved_pushes)
+ : PGOpQueueable(pg),
+ epoch_queued(epoch_queued),
+ reserved_pushes(reserved_pushes) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecovery(pgid=" << get_pgid()
+ << " epoch_queued=" << epoch_queued
+ << " reserved_pushes=" << reserved_pushes
+ << ")";
+ }
+ uint64_t get_reserved_pushes() const final {
+ return reserved_pushes;
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_recovery;
+ }
+};
+
+class PGRecoveryContext : public PGOpQueueable {
+ std::unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+ epoch_t epoch;
+public:
+ PGRecoveryContext(spg_t pgid,
+ GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch)
+ : PGOpQueueable(pgid),
+ c(c), epoch(epoch) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecoveryContext(pgid=" << get_pgid()
+ << " c=" << c.get() << " epoch=" << epoch
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_recovery;
+ }
+};
+
+class PGDelete : public PGOpQueueable {
+ epoch_t epoch_queued;
+public:
+ PGDelete(
+ spg_t pg,
+ epoch_t epoch_queued)
+ : PGOpQueueable(pg),
+ epoch_queued(epoch_queued) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_pg_delete;
+ }
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGDelete(" << get_pgid()
+ << " e" << epoch_queued
+ << ")";
+ }
+ void run(
+ OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+ op_scheduler_class get_scheduler_class() const final {
+ return op_scheduler_class::background_best_effort;
+ }
+};
+
+class PGRecoveryMsg : public PGOpQueueable {
+ OpRequestRef op;
+
+public:
+ PGRecoveryMsg(spg_t pg, OpRequestRef op) : PGOpQueueable(pg), op(std::move(op)) {}
+ op_type_t get_op_type() const final {
+ return op_type_t::bg_recovery;
+ }
+
+ std::ostream &print(std::ostream &rhs) const final {
+ return rhs << "PGRecoveryMsg(op=" << *(op->get_req()) << ")";
+ }
+
+ std::optional<OpRequestRef> maybe_get_op() const final {
+ return op;
+ }
+
+ op_scheduler_class get_scheduler_class() const final {
+ auto priority = op->get_req()->get_priority();
+ if (priority >= CEPH_MSG_PRIO_HIGH) {
+ return op_scheduler_class::immediate;
+ }
+ return op_scheduler_class::background_recovery;
+ }
+
+ void run(OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
+};
+
+}
diff --git a/src/osd/scheduler/mClockScheduler.cc b/src/osd/scheduler/mClockScheduler.cc
new file mode 100644
index 000000000..f2f0ffc3d
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.cc
@@ -0,0 +1,514 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <memory>
+#include <functional>
+
+#include "osd/scheduler/mClockScheduler.h"
+#include "common/dout.h"
+
+namespace dmc = crimson::dmclock;
+using namespace std::placeholders;
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << "mClockScheduler: "
+
+
+namespace ceph::osd::scheduler {
+
+mClockScheduler::mClockScheduler(CephContext *cct,
+ uint32_t num_shards,
+ bool is_rotational)
+ : cct(cct),
+ num_shards(num_shards),
+ is_rotational(is_rotational),
+ scheduler(
+ std::bind(&mClockScheduler::ClientRegistry::get_info,
+ &client_registry,
+ _1),
+ dmc::AtLimit::Wait,
+ cct->_conf.get_val<double>("osd_mclock_scheduler_anticipation_timeout"))
+{
+ cct->_conf.add_observer(this);
+ ceph_assert(num_shards > 0);
+ set_max_osd_capacity();
+ set_osd_mclock_cost_per_io();
+ set_osd_mclock_cost_per_byte();
+ set_mclock_profile();
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(cct->_conf);
+}
+
+void mClockScheduler::ClientRegistry::update_from_config(const ConfigProxy &conf)
+{
+ default_external_client_info.update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_client_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_recovery_lim"));
+
+ internal_client_infos[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_res"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_wgt"),
+ conf.get_val<uint64_t>("osd_mclock_scheduler_background_best_effort_lim"));
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_external_client(
+ const client_profile_id_t &client) const
+{
+ auto ret = external_client_infos.find(client);
+ if (ret == external_client_infos.end())
+ return &default_external_client_info;
+ else
+ return &(ret->second);
+}
+
+const dmc::ClientInfo *mClockScheduler::ClientRegistry::get_info(
+ const scheduler_id_t &id) const {
+ switch (id.class_id) {
+ case op_scheduler_class::immediate:
+ ceph_assert(0 == "Cannot schedule immediate");
+ return (dmc::ClientInfo*)nullptr;
+ case op_scheduler_class::client:
+ return get_external_client(id.client_profile_id);
+ default:
+ ceph_assert(static_cast<size_t>(id.class_id) < internal_client_infos.size());
+ return &internal_client_infos[static_cast<size_t>(id.class_id)];
+ }
+}
+
+void mClockScheduler::set_max_osd_capacity()
+{
+ if (is_rotational) {
+ max_osd_capacity =
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_hdd");
+ } else {
+ max_osd_capacity =
+ cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
+ }
+ // Set per op-shard iops limit
+ max_osd_capacity /= num_shards;
+ dout(1) << __func__ << " #op shards: " << num_shards
+ << std::fixed << std::setprecision(2)
+ << " max osd capacity(iops) per shard: " << max_osd_capacity
+ << dendl;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_io()
+{
+ std::chrono::seconds sec(1);
+ if (cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec")) {
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec");
+ } else {
+ if (is_rotational) {
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_hdd");
+ // For HDDs, convert value to seconds
+ osd_mclock_cost_per_io /= std::chrono::microseconds(sec).count();
+ } else {
+ // For SSDs, convert value to milliseconds
+ osd_mclock_cost_per_io =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_io_usec_ssd");
+ osd_mclock_cost_per_io /= std::chrono::milliseconds(sec).count();
+ }
+ }
+ dout(1) << __func__ << " osd_mclock_cost_per_io: "
+ << std::fixed << std::setprecision(7) << osd_mclock_cost_per_io
+ << dendl;
+}
+
+void mClockScheduler::set_osd_mclock_cost_per_byte()
+{
+ std::chrono::seconds sec(1);
+ if (cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec")) {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec");
+ } else {
+ if (is_rotational) {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_hdd");
+ // For HDDs, convert value to seconds
+ osd_mclock_cost_per_byte /= std::chrono::microseconds(sec).count();
+ } else {
+ osd_mclock_cost_per_byte =
+ cct->_conf.get_val<double>("osd_mclock_cost_per_byte_usec_ssd");
+ // For SSDs, convert value to milliseconds
+ osd_mclock_cost_per_byte /= std::chrono::milliseconds(sec).count();
+ }
+ }
+ dout(1) << __func__ << " osd_mclock_cost_per_byte: "
+ << std::fixed << std::setprecision(7) << osd_mclock_cost_per_byte
+ << dendl;
+}
+
+void mClockScheduler::set_mclock_profile()
+{
+ mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
+ dout(1) << __func__ << " mclock profile: " << mclock_profile << dendl;
+}
+
+std::string mClockScheduler::get_mclock_profile()
+{
+ return mclock_profile;
+}
+
+void mClockScheduler::set_balanced_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 40% | weight: 1 | limit: 100% |
+ // Background Recovery Allocation:
+ // reservation: 40% | weight: 1 | limit: 150% |
+ // Background Best Effort Allocation:
+ // reservation: 20% | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.40 * max_osd_capacity));
+ uint64_t client_lim = static_cast<uint64_t>(
+ std::round(max_osd_capacity));
+ uint64_t client_wgt = default_min;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.40 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(1.5 * max_osd_capacity));
+ uint64_t rec_wgt = default_min;
+
+ // Background Best Effort
+ uint64_t best_effort_res = static_cast<uint64_t>(
+ std::round(0.20 * max_osd_capacity));
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
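+// Worked example for the 'balanced' profile (the capacity figure is illustrative):
+// with max_osd_capacity == 100 IOPS per shard, the computation above yields
+//   client:      res 40, wgt 1, lim 100
+//   recovery:    res 40, wgt 1, lim 150
+//   best-effort: res 20, wgt 2, lim default_max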
+
+void mClockScheduler::set_high_recovery_ops_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 30% | weight: 1 | limit: 80% |
+ // Background Recovery Allocation:
+ // reservation: 60% | weight: 2 | limit: 200% |
+ // Background Best Effort Allocation:
+ // reservation: 1 | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.30 * max_osd_capacity));
+ uint64_t client_lim = static_cast<uint64_t>(
+ std::round(0.80 * max_osd_capacity));
+ uint64_t client_wgt = default_min;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.60 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(2.0 * max_osd_capacity));
+ uint64_t rec_wgt = 2;
+
+ // Background Best Effort
+ uint64_t best_effort_res = default_min;
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
+
+void mClockScheduler::set_high_client_ops_profile_allocations()
+{
+ // Client Allocation:
+ // reservation: 50% | weight: 2 | limit: max |
+ // Background Recovery Allocation:
+ // reservation: 25% | weight: 1 | limit: 100% |
+ // Background Best Effort Allocation:
+ // reservation: 25% | weight: 2 | limit: max |
+
+ // Client
+ uint64_t client_res = static_cast<uint64_t>(
+ std::round(0.50 * max_osd_capacity));
+ uint64_t client_wgt = 2;
+ uint64_t client_lim = default_max;
+
+ // Background Recovery
+ uint64_t rec_res = static_cast<uint64_t>(
+ std::round(0.25 * max_osd_capacity));
+ uint64_t rec_lim = static_cast<uint64_t>(
+ std::round(max_osd_capacity));
+ uint64_t rec_wgt = default_min;
+
+ // Background Best Effort
+ uint64_t best_effort_res = static_cast<uint64_t>(
+ std::round(0.25 * max_osd_capacity));
+ uint64_t best_effort_lim = default_max;
+ uint64_t best_effort_wgt = 2;
+
+ // Set the allocations for the mclock clients
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)].update(
+ client_res,
+ client_wgt,
+ client_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)].update(
+ rec_res,
+ rec_wgt,
+ rec_lim);
+ client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
+ best_effort_res,
+ best_effort_wgt,
+ best_effort_lim);
+}
+
+void mClockScheduler::enable_mclock_profile_settings()
+{
+ // Nothing to do for "custom" profile
+ if (mclock_profile == "custom") {
+ return;
+ }
+
+ // Set mclock and ceph config options for the chosen profile
+ if (mclock_profile == "balanced") {
+ set_balanced_profile_allocations();
+ } else if (mclock_profile == "high_recovery_ops") {
+ set_high_recovery_ops_profile_allocations();
+ } else if (mclock_profile == "high_client_ops") {
+ set_high_client_ops_profile_allocations();
+ } else {
+ ceph_assert("Invalid choice of mclock profile" == 0);
+ return;
+ }
+
+ // Set the mclock config parameters
+ set_profile_config();
+}
+
+void mClockScheduler::set_profile_config()
+{
+ ClientAllocs client = client_allocs[
+ static_cast<size_t>(op_scheduler_class::client)];
+ ClientAllocs rec = client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_recovery)];
+ ClientAllocs best_effort = client_allocs[
+ static_cast<size_t>(op_scheduler_class::background_best_effort)];
+
+ // Set external client params
+ cct->_conf.set_val("osd_mclock_scheduler_client_res",
+ std::to_string(client.res));
+ cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
+ std::to_string(client.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_client_lim",
+ std::to_string(client.lim));
+
+ // Set background recovery client params
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
+ std::to_string(rec.res));
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
+ std::to_string(rec.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
+ std::to_string(rec.lim));
+
+ // Set background best effort client params
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
+ std::to_string(best_effort.res));
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
+ std::to_string(best_effort.wgt));
+ cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
+ std::to_string(best_effort.lim));
+}
+
+int mClockScheduler::calc_scaled_cost(int item_cost)
+{
+ // Calculate total scaled cost in secs
+ int scaled_cost =
+ std::round(osd_mclock_cost_per_io + (osd_mclock_cost_per_byte * item_cost));
+ return std::max(scaled_cost, 1);
+}
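+// Worked example (the per-io and per-byte figures are hypothetical, not defaults):
+// with osd_mclock_cost_per_io == 0.0114 sec and osd_mclock_cost_per_byte == 2.6e-6
+// sec, a 4 MiB item costs round(0.0114 + 2.6e-6 * 4194304) == 11, while an item
+// whose sum rounds to zero is clamped to the minimum cost of 1 by std::max above.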
+
+void mClockScheduler::update_configuration()
+{
+ // Apply configuration change. The expectation is that
+ // at least one of the tracked mclock config option keys
+ // is modified before calling this method.
+ cct->_conf.apply_changes(nullptr);
+}
+
+void mClockScheduler::dump(ceph::Formatter &f) const
+{
+}
+
+void mClockScheduler::enqueue(OpSchedulerItem&& item)
+{
+ auto id = get_scheduler_id(item);
+
+ // TODO: move this check into OpSchedulerItem, handle backwards compat
+ if (op_scheduler_class::immediate == id.class_id) {
+ immediate.push_front(std::move(item));
+ } else {
+ int cost = calc_scaled_cost(item.get_cost());
+ // Add item to scheduler queue
+ scheduler.add_request(
+ std::move(item),
+ id,
+ cost);
+ }
+}
+
+void mClockScheduler::enqueue_front(OpSchedulerItem&& item)
+{
+ immediate.push_back(std::move(item));
+ // TODO: item may not be immediate, update mclock machinery to permit
+ // putting the item back in the queue
+}
+
+WorkItem mClockScheduler::dequeue()
+{
+ if (!immediate.empty()) {
+ WorkItem work_item{std::move(immediate.back())};
+ immediate.pop_back();
+ return work_item;
+ } else {
+ mclock_queue_t::PullReq result = scheduler.pull_request();
+ if (result.is_future()) {
+ return result.getTime();
+ } else if (result.is_none()) {
+ ceph_assert(
+ 0 == "Impossible, must have checked empty() first");
+ return {};
+ } else {
+ ceph_assert(result.is_retn());
+
+ auto &retn = result.get_retn();
+ return std::move(*retn.request);
+ }
+ }
+}
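+// Sketch of how a caller might unpack the returned WorkItem (an assumed call site,
+// not copied from the OSD shard code):
+//   WorkItem wi = sched->dequeue();
+//   if (auto* item = std::get_if<OpSchedulerItem>(&wi)) {
+//     // run *item
+//   } else if (auto* when = std::get_if<double>(&wi)) {
+//     // nothing eligible yet; the earliest request becomes ready at time *when
+//   }  // std::monostate: the queue was empty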
+
+const char** mClockScheduler::get_tracked_conf_keys() const
+{
+ static const char* KEYS[] = {
+ "osd_mclock_scheduler_client_res",
+ "osd_mclock_scheduler_client_wgt",
+ "osd_mclock_scheduler_client_lim",
+ "osd_mclock_scheduler_background_recovery_res",
+ "osd_mclock_scheduler_background_recovery_wgt",
+ "osd_mclock_scheduler_background_recovery_lim",
+ "osd_mclock_scheduler_background_best_effort_res",
+ "osd_mclock_scheduler_background_best_effort_wgt",
+ "osd_mclock_scheduler_background_best_effort_lim",
+ "osd_mclock_cost_per_io_usec",
+ "osd_mclock_cost_per_io_usec_hdd",
+ "osd_mclock_cost_per_io_usec_ssd",
+ "osd_mclock_cost_per_byte_usec",
+ "osd_mclock_cost_per_byte_usec_hdd",
+ "osd_mclock_cost_per_byte_usec_ssd",
+ "osd_mclock_max_capacity_iops_hdd",
+ "osd_mclock_max_capacity_iops_ssd",
+ "osd_mclock_profile",
+ NULL
+ };
+ return KEYS;
+}
+
+void mClockScheduler::handle_conf_change(
+ const ConfigProxy& conf,
+ const std::set<std::string> &changed)
+{
+ if (changed.count("osd_mclock_cost_per_io_usec") ||
+ changed.count("osd_mclock_cost_per_io_usec_hdd") ||
+ changed.count("osd_mclock_cost_per_io_usec_ssd")) {
+ set_osd_mclock_cost_per_io();
+ }
+ if (changed.count("osd_mclock_cost_per_byte_usec") ||
+ changed.count("osd_mclock_cost_per_byte_usec_hdd") ||
+ changed.count("osd_mclock_cost_per_byte_usec_ssd")) {
+ set_osd_mclock_cost_per_byte();
+ }
+ if (changed.count("osd_mclock_max_capacity_iops_hdd") ||
+ changed.count("osd_mclock_max_capacity_iops_ssd")) {
+ set_max_osd_capacity();
+ if (mclock_profile != "custom") {
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(conf);
+ }
+ }
+ if (changed.count("osd_mclock_profile")) {
+ set_mclock_profile();
+ if (mclock_profile != "custom") {
+ enable_mclock_profile_settings();
+ client_registry.update_from_config(conf);
+ }
+ }
+ if (changed.count("osd_mclock_scheduler_client_res") ||
+ changed.count("osd_mclock_scheduler_client_wgt") ||
+ changed.count("osd_mclock_scheduler_client_lim") ||
+ changed.count("osd_mclock_scheduler_background_recovery_res") ||
+ changed.count("osd_mclock_scheduler_background_recovery_wgt") ||
+ changed.count("osd_mclock_scheduler_background_recovery_lim") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_res") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_wgt") ||
+ changed.count("osd_mclock_scheduler_background_best_effort_lim")) {
+ if (mclock_profile == "custom") {
+ client_registry.update_from_config(conf);
+ }
+ }
+}
+
+mClockScheduler::~mClockScheduler()
+{
+ cct->_conf.remove_observer(this);
+}
+
+}
diff --git a/src/osd/scheduler/mClockScheduler.h b/src/osd/scheduler/mClockScheduler.h
new file mode 100644
index 000000000..32f3851ec
--- /dev/null
+++ b/src/osd/scheduler/mClockScheduler.h
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <ostream>
+#include <map>
+#include <vector>
+
+#include "boost/variant.hpp"
+
+#include "dmclock/src/dmclock_server.h"
+
+#include "osd/scheduler/OpScheduler.h"
+#include "common/config.h"
+#include "include/cmp.h"
+#include "common/ceph_context.h"
+#include "common/mClockPriorityQueue.h"
+#include "osd/scheduler/OpSchedulerItem.h"
+
+
+namespace ceph::osd::scheduler {
+
+constexpr uint64_t default_min = 1;
+constexpr uint64_t default_max = 999999;
+
+using client_id_t = uint64_t;
+using profile_id_t = uint64_t;
+
+struct client_profile_id_t {
+ client_id_t client_id;
+ profile_id_t profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+WRITE_CMP_OPERATORS_2(client_profile_id_t, client_id, profile_id)
+
+
+struct scheduler_id_t {
+ op_scheduler_class class_id;
+ client_profile_id_t client_profile_id;
+};
+
+WRITE_EQ_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+WRITE_CMP_OPERATORS_2(scheduler_id_t, class_id, client_profile_id)
+
+/**
+ * Scheduler implementation based on mclock.
+ *
+ * TODO: explain configs
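+ *
+ * In dmclock terms (broadly): each scheduling class is assigned a reservation
+ * (res - the minimum service guaranteed to that class), a limit (lim - an upper
+ * bound on the service it may receive) and a weight (wgt - its proportional
+ * share of whatever capacity remains). These are either taken from the
+ * osd_mclock_scheduler_* options (the "custom" profile) or derived from one of
+ * the built-in profiles.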
+ */
+class mClockScheduler : public OpScheduler, md_config_obs_t {
+
+ CephContext *cct;
+ const uint32_t num_shards;
+ bool is_rotational;
+ double max_osd_capacity;
+ double osd_mclock_cost_per_io;
+ double osd_mclock_cost_per_byte;
+ std::string mclock_profile = "high_client_ops";
+ struct ClientAllocs {
+ uint64_t res;
+ uint64_t wgt;
+ uint64_t lim;
+
+ ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ update(_res, _wgt, _lim);
+ }
+
+ inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
+ res = _res;
+ wgt = _wgt;
+ lim = _lim;
+ }
+ };
+ std::array<
+ ClientAllocs,
+ static_cast<size_t>(op_scheduler_class::client) + 1
+ > client_allocs = {
+    // Placeholders, get replaced with configured values
+ ClientAllocs(1, 1, 1), // background_recovery
+ ClientAllocs(1, 1, 1), // background_best_effort
+ ClientAllocs(1, 1, 1), // immediate (not used)
+ ClientAllocs(1, 1, 1) // client
+ };
+ class ClientRegistry {
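+    // Background (internal) scheduling classes use the statically-sized array
+    // below; external clients are looked up by client_profile_id_t, falling
+    // back to default_external_client_info when no dedicated entry exists.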
+ std::array<
+ crimson::dmclock::ClientInfo,
+ static_cast<size_t>(op_scheduler_class::immediate)
+ > internal_client_infos = {
+ // Placeholder, gets replaced with configured values
+ crimson::dmclock::ClientInfo(1, 1, 1),
+ crimson::dmclock::ClientInfo(1, 1, 1)
+ };
+
+ crimson::dmclock::ClientInfo default_external_client_info = {1, 1, 1};
+ std::map<client_profile_id_t,
+ crimson::dmclock::ClientInfo> external_client_infos;
+ const crimson::dmclock::ClientInfo *get_external_client(
+ const client_profile_id_t &client) const;
+ public:
+ void update_from_config(const ConfigProxy &conf);
+ const crimson::dmclock::ClientInfo *get_info(
+ const scheduler_id_t &id) const;
+ } client_registry;
+
+ using mclock_queue_t = crimson::dmclock::PullPriorityQueue<
+ scheduler_id_t,
+ OpSchedulerItem,
+ true,
+ true,
+ 2>;
+ mclock_queue_t scheduler;
+ std::list<OpSchedulerItem> immediate;
+
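+  // Derive the dmclock tag for an op: its scheduling class plus its owner.
+  // The profile_id component is currently fixed at zero.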
+ static scheduler_id_t get_scheduler_id(const OpSchedulerItem &item) {
+ return scheduler_id_t{
+ item.get_scheduler_class(),
+ client_profile_id_t{
+ item.get_owner(),
+ 0
+ }
+ };
+ }
+
+public:
+ mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational);
+ ~mClockScheduler() override;
+
+ // Set the max osd capacity in iops
+ void set_max_osd_capacity();
+
+ // Set the cost per io for the osd
+ void set_osd_mclock_cost_per_io();
+
+ // Set the cost per byte for the osd
+ void set_osd_mclock_cost_per_byte();
+
+ // Set the mclock profile type to enable
+ void set_mclock_profile();
+
+ // Get the active mclock profile
+ std::string get_mclock_profile();
+
+ // Set "balanced" profile allocations
+ void set_balanced_profile_allocations();
+
+ // Set "high_recovery_ops" profile allocations
+ void set_high_recovery_ops_profile_allocations();
+
+ // Set "high_client_ops" profile allocations
+ void set_high_client_ops_profile_allocations();
+
+ // Set the mclock related config params based on the profile
+ void enable_mclock_profile_settings();
+
+ // Set mclock config parameter based on allocations
+ void set_profile_config();
+
+ // Calculate scale cost per item
+ int calc_scaled_cost(int cost);
+
+ // Enqueue op in the back of the regular queue
+ void enqueue(OpSchedulerItem &&item) final;
+
+ // Enqueue the op in the front of the regular queue
+ void enqueue_front(OpSchedulerItem &&item) final;
+
+  // Return an op to be dispatched
+ WorkItem dequeue() final;
+
+  // Returns whether the queue is empty
+ bool empty() const final {
+ return immediate.empty() && scheduler.empty();
+ }
+
+ // Formatted output of the queue
+ void dump(ceph::Formatter &f) const final;
+
+ void print(std::ostream &ostream) const final {
+ ostream << "mClockScheduler";
+ }
+
+ // Update data associated with the modified mclock config key(s)
+ void update_configuration() final;
+
+ const char** get_tracked_conf_keys() const final;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set<std::string> &changed) final;
+};
+
+}
diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc
new file mode 100644
index 000000000..fff372081
--- /dev/null
+++ b/src/osd/scrub_machine.cc
@@ -0,0 +1,534 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "scrub_machine.h"
+
+#include <chrono>
+#include <typeinfo>
+
+#include <boost/core/demangle.hpp>
+
+#include "OSD.h"
+#include "OpRequest.h"
+#include "ScrubStore.h"
+#include "scrub_machine_lstnr.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_osd
+#undef dout_prefix
+#define dout_prefix *_dout << " scrubberFSM "
+
+using namespace std::chrono;
+using namespace std::chrono_literals;
+namespace sc = boost::statechart;
+
+#define DECLARE_LOCALS \
+ ScrubMachineListener* scrbr = context<ScrubMachine>().m_scrbr; \
+ std::ignore = scrbr; \
+ auto pg_id = context<ScrubMachine>().m_pg_id; \
+ std::ignore = pg_id;
+
+namespace Scrub {
+
+// --------- trace/debug auxiliaries -------------------------------
+
+void on_event_creation(std::string_view nm)
+{
+ dout(20) << " event: --vvvv---- " << nm << dendl;
+}
+
+void on_event_discard(std::string_view nm)
+{
+ dout(20) << " event: --^^^^---- " << nm << dendl;
+}
+
+std::string ScrubMachine::current_states_desc() const
+{
+ std::string sts{"<"};
+ for (auto si = state_begin(); si != state_end(); ++si) {
+ const auto& siw{ *si }; // prevents a warning re side-effects
+    // the '7' is the length of the 'Scrub::' namespace prefix being stripped
+ sts += boost::core::demangle(typeid(siw).name()).substr(7, std::string::npos) + "/";
+ }
+ return sts + ">";
+}
+
+void ScrubMachine::assert_not_active() const
+{
+ ceph_assert(state_cast<const NotActive*>());
+}
+
+bool ScrubMachine::is_reserving() const
+{
+ return state_cast<const ReservingReplicas*>();
+}
+
+bool ScrubMachine::is_accepting_updates() const
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ ceph_assert(scrbr->is_primary());
+
+ return state_cast<const WaitLastUpdate*>();
+}
+
+// for the rest of the code in this file - we know what PG we are dealing with:
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this->context<ScrubMachine>())
+
+template <class T>
+static ostream& _prefix(std::ostream* _dout, T& t)
+{
+ return t.gen_prefix(*_dout);
+}
+
+std::ostream& ScrubMachine::gen_prefix(std::ostream& out) const
+{
+ return m_scrbr->gen_prefix(out) << "FSM: ";
+}
+
+// ////////////// the actual actions
+
+// ----------------------- NotActive -----------------------------------------
+
+NotActive::NotActive(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> NotActive" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->clear_queued_or_active();
+}
+
+// ----------------------- ReservingReplicas ---------------------------------
+
+ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ReservingReplicas" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->scrub_begin();
+ scrbr->reserve_replicas();
+}
+
+sc::result ReservingReplicas::react(const ReservationFailure&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;
+
+ // the Scrubber must release all resources and abort the scrubbing
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+/**
+ * note: the event poster is handling the scrubber reset
+ */
+sc::result ReservingReplicas::react(const FullReset&)
+{
+ dout(10) << "ReservingReplicas::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+// ----------------------- ActiveScrubbing -----------------------------------
+
+ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ActiveScrubbing" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->on_init();
+}
+
+/**
+ * upon exiting the Active state
+ */
+ActiveScrubbing::~ActiveScrubbing()
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(15) << __func__ << dendl;
+ scrbr->unreserve_replicas();
+ scrbr->clear_queued_or_active();
+}
+
+/*
+ * The only source of an InternalError event as of now is the BuildMap state,
+ * when encountering a backend error.
+ * We kill the scrub and reset the FSM.
+ */
+sc::result ActiveScrubbing::react(const InternalError&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << __func__ << dendl;
+ scrbr->clear_pgscrub_state();
+ return transit<NotActive>();
+}
+
+sc::result ActiveScrubbing::react(const FullReset&)
+{
+ dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl;
+ // caller takes care of clearing the scrubber & FSM states
+ return transit<NotActive>();
+}
+
+// ----------------------- RangeBlocked -----------------------------------
+
+/*
+ * Blocked. Will be released by kick_object_context_blocked() (or upon
+ * an abort)
+ */
+RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/RangeBlocked" << dendl;
+}
+
+// ----------------------- PendingTimer -----------------------------------
+
+/**
+ * Sleeping till timer reactivation - or just requeuing
+ */
+PendingTimer::PendingTimer(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/PendingTimer" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ scrbr->add_delayed_scheduling();
+}
+
+// ----------------------- NewChunk -----------------------------------
+
+/**
+ * Preconditions:
+ * - preemption data was set
+ * - epoch start was updated
+ */
+NewChunk::NewChunk(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/NewChunk" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ scrbr->get_preemptor().adjust_parameters();
+
+ // choose range to work on
+ // select_range_n_notify() will signal either SelectedChunkFree or
+ // ChunkIsBusy. If 'busy', we transition to Blocked, and wait for the
+ // range to become available.
+ scrbr->select_range_n_notify();
+}
+
+sc::result NewChunk::react(const SelectedChunkFree&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "NewChunk::react(const SelectedChunkFree&)" << dendl;
+
+ scrbr->set_subset_last_update(scrbr->search_log_for_updates());
+ return transit<WaitPushes>();
+}
+
+// ----------------------- WaitPushes -----------------------------------
+
+WaitPushes::WaitPushes(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/WaitPushes" << dendl;
+ post_event(ActivePushesUpd{});
+}
+
+/*
+ * Triggered externally, by the entity that had an update re pushes
+ */
+sc::result WaitPushes::react(const ActivePushesUpd&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitPushes::react(const ActivePushesUpd&) pending_active_pushes: "
+ << scrbr->pending_active_pushes() << dendl;
+
+ if (!scrbr->pending_active_pushes()) {
+ // done waiting
+ return transit<WaitLastUpdate>();
+ }
+
+ return discard_event();
+}
+
+// ----------------------- WaitLastUpdate -----------------------------------
+
+WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl;
+ post_event(UpdatesApplied{});
+}
+
+/**
+ * Note:
+ * Updates are locally readable immediately. Thus, on the replicas we do not need
+ * to wait for the update notifications before scrubbing. For the Primary it's
+ * a bit different: on EC (and only there) rmw operations have an additional
+ * read roundtrip. That means that on the Primary we need to wait for
+ * last_update_applied (the replica side, even on EC, is still safe,
+ * since the actual transaction will already be readable by commit time).
+ */
+void WaitLastUpdate::on_new_updates(const UpdatesApplied&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitLastUpdate::on_new_updates(const UpdatesApplied&)" << dendl;
+
+ if (scrbr->has_pg_marked_new_updates()) {
+ post_event(InternalAllUpdates{});
+ } else {
+ // will be requeued by op_applied
+ dout(10) << "wait for EC read/modify/writes to queue" << dendl;
+ }
+}
+
+/*
+ * request maps from the replicas in the acting set
+ */
+sc::result WaitLastUpdate::react(const InternalAllUpdates&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl;
+
+ scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable());
+ return transit<BuildMap>();
+}
+
+// ----------------------- BuildMap -----------------------------------
+
+BuildMap::BuildMap(my_context ctx) : my_base(ctx)
+{
+ dout(10) << " -- state -->> Act/BuildMap" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+
+ // no need to check for an epoch change, as all possible flows that brought us here have
+ // a check_interval() verification of their final event.
+
+ if (scrbr->get_preemptor().was_preempted()) {
+
+ // we were preempted, either directly or by a replica
+ dout(10) << __func__ << " preempted!!!" << dendl;
+ scrbr->mark_local_map_ready();
+ post_event(IntBmPreempted{});
+
+ } else {
+
+ auto ret = scrbr->build_primary_map_chunk();
+
+ if (ret == -EINPROGRESS) {
+ // must wait for the backend to finish. No specific event provided.
+ // build_primary_map_chunk() has already requeued us.
+ dout(20) << "waiting for the backend..." << dendl;
+
+ } else if (ret < 0) {
+
+ dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
+ post_event(InternalError{});
+
+ } else {
+
+ // the local map was created
+ post_event(IntLocalMapDone{});
+ }
+ }
+}
+
+sc::result BuildMap::react(const IntLocalMapDone&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "BuildMap::react(const IntLocalMapDone&)" << dendl;
+
+ scrbr->mark_local_map_ready();
+ return transit<WaitReplicas>();
+}
+
+// ----------------------- DrainReplMaps -----------------------------------
+
+DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/DrainReplMaps" << dendl;
+ // we may have received all maps already. Send the event that will make us check.
+ post_event(GotReplicas{});
+}
+
+sc::result DrainReplMaps::react(const GotReplicas&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "DrainReplMaps::react(const GotReplicas&)" << dendl;
+
+ if (scrbr->are_all_maps_available()) {
+ // NewChunk will handle the preemption that brought us to this state
+ return transit<PendingTimer>();
+ }
+
+ dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: "
+ << scrbr->dump_awaited_maps() << dendl;
+ return discard_event();
+}
+
+// ----------------------- WaitReplicas -----------------------------------
+
+WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> Act/WaitReplicas" << dendl;
+ post_event(GotReplicas{});
+}
+
+/**
+ * note: now that maps_compare_n_cleanup() is "futurized"(*), and we remain in this state
+ * for a while even after we got all our maps, we must prevent are_all_maps_available()
+ * (actually - the code after the if()) from being called more than once.
+ * This is basically a separate state, but it's too transitory and artificial to justify
+ * the cost of a separate state.
+
+ * (*) "futurized" - in Crimson, the call to maps_compare_n_cleanup() returns immediately
+ *  after initiating the process. The actual termination of the map comparison etc. is
+ * signalled via an event. As we share the code with "classic" OSD, here too
+ * maps_compare_n_cleanup() is responsible for signalling the completion of the
+ * processing.
+ */
+sc::result WaitReplicas::react(const GotReplicas&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitReplicas::react(const GotReplicas&)" << dendl;
+
+ if (!all_maps_already_called && scrbr->are_all_maps_available()) {
+ dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl;
+
+ all_maps_already_called = true;
+
+ // were we preempted?
+ if (scrbr->get_preemptor().disable_and_test()) { // a test&set
+
+ dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl;
+ return transit<PendingTimer>();
+
+ } else {
+
+ // maps_compare_n_cleanup() will arrange for MapsCompared event to be sent:
+ scrbr->maps_compare_n_cleanup();
+ return discard_event();
+ }
+ } else {
+ return discard_event();
+ }
+}
+
+// ----------------------- WaitDigestUpdate -----------------------------------
+
+WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl;
+
+ // perform an initial check: maybe we already
+ // have all the updates we need:
+ // (note that DigestUpdate is usually an external event)
+ post_event(DigestUpdate{});
+}
+
+sc::result WaitDigestUpdate::react(const DigestUpdate&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitDigestUpdate::react(const DigestUpdate&)" << dendl;
+
+ // on_digest_updates() will either:
+ // - do nothing - if we are still waiting for updates, or
+ // - finish the scrubbing of the current chunk, and:
+ // - send NextChunk, or
+ // - send ScrubFinished
+
+ scrbr->on_digest_updates();
+ return discard_event();
+}
+
+sc::result WaitDigestUpdate::react(const ScrubFinished&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl;
+ scrbr->scrub_finish();
+ return transit<NotActive>();
+}
+
+ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
+ : m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
+{
+}
+
+ScrubMachine::~ScrubMachine() = default;
+
+// -------- for replicas -----------------------------------------------------
+
+// ----------------------- ReplicaWaitUpdates --------------------------------
+
+ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx)
+{
+ dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl;
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ scrbr->on_replica_init();
+}
+
+/*
+ * Triggered externally, by the entity that had an update re pushes
+ */
+sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): "
+ << scrbr->pending_active_pushes() << dendl;
+
+ if (scrbr->pending_active_pushes() == 0) {
+
+ // done waiting
+ return transit<ActiveReplica>();
+ }
+
+ return discard_event();
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ReplicaWaitUpdates::react(const FullReset&)
+{
+ dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+// ----------------------- ActiveReplica -----------------------------------
+
+ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "-- state -->> ActiveReplica" << dendl;
+ scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates
+ post_event(SchedReplica{});
+}
+
+sc::result ActiveReplica::react(const SchedReplica&)
+{
+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
+ dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? "
+ << scrbr->get_preemptor().is_preemptable() << dendl;
+
+ if (scrbr->get_preemptor().was_preempted()) {
+ dout(10) << "replica scrub job preempted" << dendl;
+
+ scrbr->send_preempted_replica();
+ scrbr->replica_handling_done();
+ return transit<NotActive>();
+ }
+
+ // start or check progress of build_replica_map_chunk()
+ auto ret_init = scrbr->build_replica_map_chunk();
+ if (ret_init != -EINPROGRESS) {
+ return transit<NotActive>();
+ }
+
+ return discard_event();
+}
+
+/**
+ * the event poster is handling the scrubber reset
+ */
+sc::result ActiveReplica::react(const FullReset&)
+{
+ dout(10) << "ActiveReplica::react(const FullReset&)" << dendl;
+ return transit<NotActive>();
+}
+
+} // namespace Scrub
diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h
new file mode 100644
index 000000000..7f88a675a
--- /dev/null
+++ b/src/osd/scrub_machine.h
@@ -0,0 +1,344 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include <string>
+
+#include <boost/statechart/custom_reaction.hpp>
+#include <boost/statechart/deferral.hpp>
+#include <boost/statechart/event.hpp>
+#include <boost/statechart/event_base.hpp>
+#include <boost/statechart/in_state_reaction.hpp>
+#include <boost/statechart/simple_state.hpp>
+#include <boost/statechart/state.hpp>
+#include <boost/statechart/state_machine.hpp>
+#include <boost/statechart/transition.hpp>
+
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "scrub_machine_lstnr.h"
+#include "scrubber_common.h"
+
+using namespace std::string_literals;
+
+class PG; // holding a pointer to that one - just for testing
+class PgScrubber;
+namespace Scrub {
+
+namespace sc = ::boost::statechart;
+namespace mpl = ::boost::mpl;
+
+//
+// EVENTS
+//
+
+void on_event_creation(std::string_view nm);
+void on_event_discard(std::string_view nm);
+
+#define MEV(E) \
+ struct E : sc::event<E> { \
+ inline static int actv{0}; \
+ E() \
+ { \
+ if (!actv++) \
+ on_event_creation(#E); \
+ } \
+ ~E() \
+ { \
+ if (!--actv) \
+ on_event_discard(#E); \
+ } \
+ void print(std::ostream* out) const { *out << #E; } \
+ std::string_view print() const { return #E; } \
+ };
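+// Each MEV(E) below defines a boost::statechart event type. The 'actv' counter
+// ensures that only the first live instance of each event type logs its
+// creation, and only the last one logs its discarding.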
+
+MEV(RemotesReserved) ///< all replicas have granted our reserve request
+
+MEV(ReservationFailure) ///< a reservation request has failed
+
+MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary)
+
+MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery
+ ///< completion.
+
+MEV(Unblocked) ///< triggered when the PG unblocked an object that was marked for
+ ///< scrubbing. Via the PGScrubUnblocked op
+
+MEV(InternalSchedScrub)
+
+MEV(SelectedChunkFree)
+
+MEV(ChunkIsBusy)
+
+MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
+ ///< that is in-flight to the local ObjectStore
+MEV(UpdatesApplied) ///< (Primary only) all updates are committed
+
+MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied
+
+MEV(GotReplicas) ///< got a map from a replica
+
+MEV(IntBmPreempted) ///< internal - BuildMap preempted. Required, as detected within the
+ ///< ctor
+
+MEV(InternalError)
+
+MEV(IntLocalMapDone)
+
+MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See
+ ///< scrub_snapshot_metadata()
+
+MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done
+
+MEV(StartReplica) ///< initiating replica scrub.
+
+MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates
+
+MEV(SchedReplica)
+
+MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery
+ ///< that is in-flight to the local ObjectStore
+
+MEV(FullReset) ///< guarantee that the FSM is in the quiescent state (i.e. NotActive)
+
+MEV(NextChunk) ///< finished handling this chunk. Go get the next one
+
+MEV(ScrubFinished) ///< all chunks handled
+
+
+struct NotActive; ///< the quiescent state. No active scrubbing.
+struct ReservingReplicas; ///< securing scrub resources from replicas' OSDs
+struct ActiveScrubbing; ///< the active state for a Primary. A sub-machine.
+struct ReplicaWaitUpdates; ///< an active state for a replica. Waiting for all active
+ ///< operations to finish.
+struct ActiveReplica; ///< an active state for a replica.
+
+
+class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
+ public:
+ friend class PgScrubber;
+
+ public:
+ explicit ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub);
+ ~ScrubMachine();
+
+ spg_t m_pg_id;
+ ScrubMachineListener* m_scrbr;
+ std::ostream& gen_prefix(std::ostream& out) const;
+
+ std::string current_states_desc() const;
+ void assert_not_active() const;
+ [[nodiscard]] bool is_reserving() const;
+ [[nodiscard]] bool is_accepting_updates() const;
+};
+
+/**
+ * The Scrubber's base (quiescent) state.
+ * Scrubbing is triggered by one of the following events:
+ * - (standard scenario for a Primary): 'StartScrub'. Initiates the OSDs resources
+ * reservation process. Will be issued by PG::scrub(), following a
+ * queued "PGScrub" op.
+ * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is
+ * not required to reserve resources.
+ * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming
+ * MOSDRepScrub message.
+ *
+ * note (20.8.21): originally, AfterRepairScrub triggered a scrub without waiting
+ * for replica resources to be acquired. But once replicas started using the
+ * resource-request to identify and tag the scrub session, this bypass can no
+ * longer be supported.
+ */
+struct NotActive : sc::state<NotActive, ScrubMachine> {
+ explicit NotActive(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
+ // a scrubbing that was initiated at recovery completion,
+ // and requires no resource reservations:
+ sc::transition<AfterRepairScrub, ReservingReplicas>,
+ sc::transition<StartReplica, ReplicaWaitUpdates>,
+ sc::transition<StartReplicaNoWait, ActiveReplica>>;
+};
+
+struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
+
+ explicit ReservingReplicas(my_context ctx);
+ using reactions = mpl::list<sc::custom_reaction<FullReset>,
+ // all replicas granted our resources request
+ sc::transition<RemotesReserved, ActiveScrubbing>,
+ sc::custom_reaction<ReservationFailure>>;
+
+ sc::result react(const FullReset&);
+
+ /// at least one replica denied us the scrub resources we've requested
+ sc::result react(const ReservationFailure&);
+};
+
+
+// the "active" sub-states
+
+struct RangeBlocked; ///< the objects range is blocked
+struct PendingTimer; ///< either delaying the scrub by some time and requeuing, or just
+ ///< requeue
+struct NewChunk; ///< select a chunk to scrub, and verify its availability
+struct WaitPushes;
+struct WaitLastUpdate;
+struct BuildMap;
+struct DrainReplMaps; ///< a problem during BuildMap. Wait for all replicas to report,
+ ///< then restart.
+struct WaitReplicas; ///< wait for all replicas to report
+struct WaitDigestUpdate;
+
+struct ActiveScrubbing : sc::state<ActiveScrubbing, ScrubMachine, PendingTimer> {
+
+ explicit ActiveScrubbing(my_context ctx);
+ ~ActiveScrubbing();
+
+ using reactions = mpl::list<
+ sc::custom_reaction<InternalError>,
+ sc::custom_reaction<FullReset>>;
+
+ sc::result react(const FullReset&);
+ sc::result react(const InternalError&);
+};
+
+struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing> {
+ explicit RangeBlocked(my_context ctx);
+ using reactions = mpl::list<sc::transition<Unblocked, PendingTimer>>;
+};
+
+struct PendingTimer : sc::state<PendingTimer, ActiveScrubbing> {
+
+ explicit PendingTimer(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<InternalSchedScrub, NewChunk>>;
+};
+
+struct NewChunk : sc::state<NewChunk, ActiveScrubbing> {
+
+ explicit NewChunk(my_context ctx);
+
+ using reactions = mpl::list<sc::transition<ChunkIsBusy, RangeBlocked>,
+ sc::custom_reaction<SelectedChunkFree>>;
+
+ sc::result react(const SelectedChunkFree&);
+};
+
+/**
+ * initiate the update process for this chunk
+ *
+ * Wait for 'active_pushes' to clear.
+ * 'active_pushes' represents recovery that is in-flight to the local ObjectStore, hence
+ * scrub waits until the correct data is readable (in-flight data to the ObjectStore is
+ * not readable until written to disk, termed 'applied' here).
+ */
+struct WaitPushes : sc::state<WaitPushes, ActiveScrubbing> {
+
+ explicit WaitPushes(my_context ctx);
+
+ using reactions = mpl::list<sc::custom_reaction<ActivePushesUpd>>;
+
+ sc::result react(const ActivePushesUpd&);
+};
+
+struct WaitLastUpdate : sc::state<WaitLastUpdate, ActiveScrubbing> {
+
+ explicit WaitLastUpdate(my_context ctx);
+
+ void on_new_updates(const UpdatesApplied&);
+
+ using reactions = mpl::list<sc::custom_reaction<InternalAllUpdates>,
+ sc::in_state_reaction<UpdatesApplied,
+ WaitLastUpdate,
+ &WaitLastUpdate::on_new_updates>>;
+
+ sc::result react(const InternalAllUpdates&);
+};
+
+struct BuildMap : sc::state<BuildMap, ActiveScrubbing> {
+ explicit BuildMap(my_context ctx);
+
+ // possible error scenarios:
+ // - an error reported by the backend will trigger an 'InternalError' event,
+ // handled by our parent state;
+ // - if preempted, we switch to DrainReplMaps, where we will wait for all
+ // replicas to send their maps before acknowledging the preemption;
+ // - an interval change will be handled by the relevant 'send-event' functions,
+  //   and will be translated into a 'FullReset' event.
+ using reactions =
+ mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
+ sc::transition<InternalSchedScrub, BuildMap>, // looping, waiting
+ // for the backend to
+ // finish
+ sc::custom_reaction<IntLocalMapDone>>;
+
+ sc::result react(const IntLocalMapDone&);
+};
+
+/*
+ * "drain" scrub-maps responses from replicas
+ */
+struct DrainReplMaps : sc::state<DrainReplMaps, ActiveScrubbing> {
+ explicit DrainReplMaps(my_context ctx);
+
+ using reactions =
+ mpl::list<sc::custom_reaction<GotReplicas> // all replicas are accounted for
+ >;
+
+ sc::result react(const GotReplicas&);
+};
+
+struct WaitReplicas : sc::state<WaitReplicas, ActiveScrubbing> {
+ explicit WaitReplicas(my_context ctx);
+
+ using reactions =
+ mpl::list<sc::custom_reaction<GotReplicas>, // all replicas are accounted for
+ sc::transition<MapsCompared, WaitDigestUpdate>,
+ sc::deferral<DigestUpdate> // might arrive before we've reached WDU
+ >;
+
+ sc::result react(const GotReplicas&);
+
+ bool all_maps_already_called{false}; // see comment in react code
+};
+
+struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
+ explicit WaitDigestUpdate(my_context ctx);
+
+ using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
+ sc::custom_reaction<ScrubFinished>,
+ sc::transition<NextChunk, PendingTimer>>;
+ sc::result react(const DigestUpdate&);
+ sc::result react(const ScrubFinished&);
+};
+
+// ----------------------------- the "replica active" states -----------------------
+
+/*
+ * Waiting for 'active_pushes' to complete
+ *
+ * When in this state:
+ * - the details of the Primary's request were internalized by PgScrubber;
+ * - 'active' scrubbing is set
+ */
+struct ReplicaWaitUpdates : sc::state<ReplicaWaitUpdates, ScrubMachine> {
+ explicit ReplicaWaitUpdates(my_context ctx);
+ using reactions =
+ mpl::list<sc::custom_reaction<ReplicaPushesUpd>, sc::custom_reaction<FullReset>>;
+
+ sc::result react(const ReplicaPushesUpd&);
+ sc::result react(const FullReset&);
+};
+
+
+struct ActiveReplica : sc::state<ActiveReplica, ScrubMachine> {
+ explicit ActiveReplica(my_context ctx);
+ using reactions = mpl::list<sc::custom_reaction<SchedReplica>,
+ sc::custom_reaction<FullReset>,
+ sc::transition<ScrubFinished, NotActive>>;
+
+ sc::result react(const SchedReplica&);
+ sc::result react(const FullReset&);
+};
+
+} // namespace Scrub
diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h
new file mode 100644
index 000000000..8d9622b9b
--- /dev/null
+++ b/src/osd/scrub_machine_lstnr.h
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+/**
+ * \file the PgScrubber interface used by the scrub FSM
+ */
+#include "common/version.h"
+#include "include/Context.h"
+
+#include "osd_types.h"
+
+namespace Scrub {
+
+enum class PreemptionNoted { no_preemption, preempted };
+
+/// the interface exposed by the PgScrubber into its internal
+/// preemption_data object
+struct preemption_t {
+
+ virtual ~preemption_t() = default;
+
+ [[nodiscard]] virtual bool is_preemptable() const = 0;
+
+ [[nodiscard]] virtual bool was_preempted() const = 0;
+
+ virtual void adjust_parameters() = 0;
+
+ /**
+ * Try to preempt the scrub.
+ * 'true' (i.e. - preempted) if:
+ * preemptable && not already preempted
+ */
+ virtual bool do_preempt() = 0;
+
+ /**
+ * disables preemptions.
+ * Returns 'true' if we were already preempted
+ */
+ virtual bool disable_and_test() = 0;
+};
+
+} // namespace Scrub
+
+struct ScrubMachineListener {
+
+ struct MsgAndEpoch {
+ MessageRef m_msg;
+ epoch_t m_epoch;
+ };
+
+ virtual ~ScrubMachineListener() = default;
+
+ virtual void select_range_n_notify() = 0;
+
+ [[nodiscard]] virtual bool is_primary() const = 0;
+
+ /// walk the log to find the latest update that affects our chunk
+ virtual eversion_t search_log_for_updates() const = 0;
+
+ virtual eversion_t get_last_update_applied() const = 0;
+
+ virtual int pending_active_pushes() const = 0;
+
+ virtual int build_primary_map_chunk() = 0;
+
+ virtual int build_replica_map_chunk() = 0;
+
+ virtual void on_init() = 0;
+
+ virtual void on_replica_init() = 0;
+
+ virtual void replica_handling_done() = 0;
+
+ /// the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ /// (thus can be called from FSM reactions)
+ virtual void clear_pgscrub_state() = 0;
+
+ /*
+ * Send an 'InternalSchedScrub' FSM event either immediately, or - if 'm_need_sleep'
+ * is asserted - after a configuration-dependent timeout.
+ */
+ virtual void add_delayed_scheduling() = 0;
+
+ /**
+ * Ask all replicas for their scrub maps for the current chunk.
+ */
+ virtual void get_replicas_maps(bool replica_can_preempt) = 0;
+
+ virtual void on_digest_updates() = 0;
+
+ virtual void scrub_begin() = 0;
+
+ /// the part that actually finalizes a scrub
+ virtual void scrub_finish() = 0;
+
+ /**
+ * Prepare a MOSDRepScrubMap message carrying the requested scrub map
+ * @param was_preempted - were we preempted?
+ * @return the message, and the current value of 'm_replica_min_epoch' (which is
+ * used when sending the message, but will be overwritten before that).
+ */
+ [[nodiscard]] virtual MsgAndEpoch prep_replica_map_msg(
+ Scrub::PreemptionNoted was_preempted) = 0;
+
+ /**
+ * Send to the primary the pre-prepared message containing the requested map
+ */
+ virtual void send_replica_map(const MsgAndEpoch& preprepared) = 0;
+
+ /**
+ * Let the primary know that we were preempted while trying to build the
+ * requested map.
+ */
+ virtual void send_preempted_replica() = 0;
+
+ [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0;
+
+ virtual void set_subset_last_update(eversion_t e) = 0;
+
+ [[nodiscard]] virtual bool was_epoch_changed() const = 0;
+
+ virtual Scrub::preemption_t& get_preemptor() = 0;
+
+ /**
+ * a "technical" collection of the steps performed once all
+ * rep maps are available:
+ * - the maps are compared
+ * - the scrub region markers (start_ & end_) are advanced
+ * - callbacks and ops that were pending are allowed to run
+ */
+ virtual void maps_compare_n_cleanup() = 0;
+
+ /**
+ * order the PgScrubber to initiate the process of reserving replicas' scrub
+ * resources.
+ */
+ virtual void reserve_replicas() = 0;
+
+ virtual void unreserve_replicas() = 0;
+
+ /**
+ * Manipulate the 'I am being scrubbed now' Scrubber's flag
+ */
+ virtual void set_queued_or_active() = 0;
+ virtual void clear_queued_or_active() = 0;
+
+ /**
+ * the FSM interface into the "are we waiting for maps, either our own or from
+ * replicas" state.
+ * The FSM can only:
+ * - mark the local map as available, and
+ * - query status
+ */
+ virtual void mark_local_map_ready() = 0;
+
+ [[nodiscard]] virtual bool are_all_maps_available() const = 0;
+
+ /// a log/debug interface
+ virtual std::string dump_awaited_maps() const = 0;
+
+ /// exposed to be used by the scrub_machine logger
+ virtual std::ostream& gen_prefix(std::ostream& out) const = 0;
+};
diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h
new file mode 100644
index 000000000..65014b594
--- /dev/null
+++ b/src/osd/scrubber_common.h
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "common/scrub_types.h"
+#include "include/types.h"
+#include "os/ObjectStore.h"
+
+#include "OpRequest.h"
+
+namespace ceph {
+class Formatter;
+}
+
+namespace Scrub {
+
+/// high/low OP priority
+enum class scrub_prio_t : bool { low_priority = false, high_priority = true };
+
+/// Identifies a specific scrub activation within an interval,
+/// see ScrubPgIF::m_current_token
+using act_token_t = uint32_t;
+
+} // namespace Scrub
+
+
+/**
+ * Flags affecting the scheduling and behaviour of the *next* scrub.
+ *
+ * we hold two of these flag collections: one
+ * for the next scrub, and one frozen at initiation (i.e. in pg::queue_scrub())
+ */
+struct requested_scrub_t {
+
+ // flags to indicate explicitly requested scrubs (by admin):
+ // bool must_scrub, must_deep_scrub, must_repair, need_auto;
+
+ /**
+ * 'must_scrub' is set by an admin command (or by need_auto).
+ * Affects the priority of the scrubbing, and the sleep periods
+ * during the scrub.
+ */
+ bool must_scrub{false};
+
+ /**
+ * scrub must not be aborted.
+ * Set for explicitly requested scrubs, and for scrubs originated by the pairing
+ * process with the 'repair' flag set (in the RequestScrub event).
+ *
+ * Will be copied into the 'required' scrub flag upon scrub start.
+ */
+ bool req_scrub{false};
+
+ /**
+   * Set by scrub_requested() when called with the need_auto param set, which
+   * only happens in scrub_finish() - if deep_scrub_on_error is set and we have
+   * errors.
+ *
+ * If set, will prevent the OSD from casually postponing our scrub. When scrubbing
+ * starts, will cause must_scrub, must_deep_scrub and auto_repair to be set.
+ */
+ bool need_auto{false};
+
+ /**
+ * Set for scrub-after-recovery just before we initiate the recovery deep scrub,
+   * or if scrub_requested() was called with either need_auto or repair.
+ * Affects PG_STATE_DEEP_SCRUB.
+ */
+ bool must_deep_scrub{false};
+
+ /**
+   * An intermediary flag, used by pg::sched_scrub() the first time a planned
+   * scrub has all its resources. Determines whether the next repair/scrub will
+   * be 'deep'.
+   *
+   * Note: although 'dumped' by PgScrubber::dump() and the like, it is a
+   * temporary that is set and reset by the same operation, so it will never
+   * appear externally to be set.
+ */
+ bool time_for_deep{false};
+
+ bool deep_scrub_on_error{false};
+
+ /**
+ * If set, we should see must_deep_scrub and must_repair set, too
+ *
+ * - 'must_repair' is checked by the OSD when scheduling the scrubs.
+ * - also checked & cleared at pg::queue_scrub()
+ */
+ bool must_repair{false};
+
+ /*
+   * the value of auto_repair is determined in sched_scrub() (once per scrub; the
+   * previous value is not remembered). Set if:
+ * - allowed by configuration and backend, and
+ * - must_scrub is not set (i.e. - this is a periodic scrub),
+ * - time_for_deep was just set
+ */
+ bool auto_repair{false};
+
+ /**
+ * indicating that we are scrubbing post repair to verify everything is fixed.
+ * Otherwise - PG_STATE_FAILED_REPAIR will be asserted.
+ */
+ bool check_repair{false};
+};
+
+ostream& operator<<(ostream& out, const requested_scrub_t& sf);
+
+/**
+ * The interface used by the PG when requesting scrub-related info or services
+ */
+struct ScrubPgIF {
+
+ virtual ~ScrubPgIF() = default;
+
+ friend ostream& operator<<(ostream& out, const ScrubPgIF& s) { return s.show(out); }
+
+ virtual ostream& show(ostream& out) const = 0;
+
+ // --------------- triggering state-machine events:
+
+ virtual void initiate_regular_scrub(epoch_t epoch_queued) = 0;
+
+ virtual void initiate_scrub_after_repair(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_resched(epoch_t epoch_queued) = 0;
+
+ virtual void active_pushes_notification(epoch_t epoch_queued) = 0;
+
+ virtual void update_applied_notification(epoch_t epoch_queued) = 0;
+
+ virtual void digest_update_notification(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_unblock(epoch_t epoch_queued) = 0;
+
+ virtual void send_replica_maps_ready(epoch_t epoch_queued) = 0;
+
+ virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0;
+
+ virtual void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
+
+ virtual void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
+
+ virtual void on_applied_when_primary(const eversion_t &applied_version) = 0;
+
+ virtual void send_full_reset(epoch_t epoch_queued) = 0;
+
+ virtual void send_chunk_free(epoch_t epoch_queued) = 0;
+
+ virtual void send_chunk_busy(epoch_t epoch_queued) = 0;
+
+ virtual void send_local_map_done(epoch_t epoch_queued) = 0;
+
+ virtual void send_get_next_chunk(epoch_t epoch_queued) = 0;
+
+ virtual void send_scrub_is_finished(epoch_t epoch_queued) = 0;
+
+ virtual void send_maps_compared(epoch_t epoch_queued) = 0;
+
+ // --------------------------------------------------
+
+ [[nodiscard]] virtual bool are_callbacks_pending()
+ const = 0; // currently only used for an assert
+
+ /**
+ * the scrubber is marked 'active':
+ * - for the primary: when all replica OSDs grant us the requested resources
+ * - for replicas: upon receiving the scrub request from the primary
+ */
+ [[nodiscard]] virtual bool is_scrub_active() const = 0;
+
+ /**
+ * 'true' until after the FSM processes the 'scrub-finished' event,
+ * and scrubbing is completely cleaned-up.
+ *
+ * In other words - holds longer than is_scrub_active(), thus preventing
+ * a rescrubbing of the same PG while the previous scrub has not fully
+ * terminated.
+ */
+ [[nodiscard]] virtual bool is_queued_or_active() const = 0;
+
+ /**
+ * Manipulate the 'scrubbing request has been queued, or - we are
+ * actually scrubbing' Scrubber's flag
+ */
+ virtual void set_queued_or_active() = 0;
+ virtual void clear_queued_or_active() = 0;
+
+  /// are we waiting for resource reservation grants from our replicas?
+ [[nodiscard]] virtual bool is_reserving() const = 0;
+
+ /// handle a message carrying a replica map
+ virtual void map_from_replica(OpRequestRef op) = 0;
+
+ virtual void replica_scrub_op(OpRequestRef op) = 0;
+
+ virtual void set_op_parameters(requested_scrub_t&) = 0;
+
+ virtual void scrub_clear_state() = 0;
+
+ virtual void handle_query_state(ceph::Formatter* f) = 0;
+
+ virtual void dump(ceph::Formatter* f) const = 0;
+
+ /**
+ * Return true if soid is currently being scrubbed and pending IOs should block.
+ * May have a side effect of preempting an in-progress scrub -- will return false
+ * in that case.
+ *
+ * @param soid object to check for ongoing scrub
+ * @return boolean whether a request on soid should block until scrub completion
+ */
+ virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0;
+
+  /// Returns whether any objects in the range [start, end] are being scrubbed
+ virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0;
+
+ /// the op priority, taken from the primary's request message
+ virtual Scrub::scrub_prio_t replica_op_priority() const = 0;
+
+ /// the priority of the on-going scrub (used when requeuing events)
+ virtual unsigned int scrub_requeue_priority(
+ Scrub::scrub_prio_t with_priority) const = 0;
+ virtual unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
+ unsigned int suggested_priority) const = 0;
+
+ virtual void add_callback(Context* context) = 0;
+
+ /// add to scrub statistics, but only if the soid is below the scrub start
+ virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
+ const hobject_t& soid) = 0;
+
+ /**
+ * the version of 'scrub_clear_state()' that does not try to invoke FSM services
+ * (thus can be called from FSM reactions)
+ */
+ virtual void clear_pgscrub_state() = 0;
+
+ /**
+ * triggers the 'RemotesReserved' (all replicas granted scrub resources)
+ * state-machine event
+ */
+ virtual void send_remotes_reserved(epoch_t epoch_queued) = 0;
+
+ /**
+ * triggers the 'ReservationFailure' (at least one replica denied us the requested
+ * resources) state-machine event
+ */
+ virtual void send_reservation_failure(epoch_t epoch_queued) = 0;
+
+ virtual void cleanup_store(ObjectStore::Transaction* t) = 0;
+
+ virtual bool get_store_errors(const scrub_ls_arg_t& arg,
+ scrub_ls_result_t& res_inout) const = 0;
+
+ // --------------- reservations -----------------------------------
+
+ /**
+   * message all replicas with a request to "unreserve" their scrub resources
+ */
+ virtual void unreserve_replicas() = 0;
+
+ /**
+ * "forget" all replica reservations. No messages are sent to the
+ * previously-reserved.
+ *
+ * Used upon interval change. The replicas' state is guaranteed to
+ * be reset separately by the interval-change event.
+ */
+ virtual void discard_replica_reservations() = 0;
+
+ /**
+ * clear both local and OSD-managed resource reservation flags
+ */
+ virtual void clear_scrub_reservations() = 0;
+
+ /**
+ * Reserve local scrub resources (managed by the OSD)
+ *
+ * Fails if OSD's local-scrubs budget was exhausted
+ * \returns were local resources reserved?
+ */
+ virtual bool reserve_local() = 0;
+
+ // on the replica:
+ virtual void handle_scrub_reserve_request(OpRequestRef op) = 0;
+ virtual void handle_scrub_reserve_release(OpRequestRef op) = 0;
+
+ // and on the primary:
+ virtual void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) = 0;
+ virtual void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) = 0;
+
+ virtual void reg_next_scrub(const requested_scrub_t& request_flags) = 0;
+ virtual void unreg_next_scrub() = 0;
+ virtual void scrub_requested(scrub_level_t scrub_level,
+ scrub_type_t scrub_type,
+ requested_scrub_t& req_flags) = 0;
+};
diff --git a/src/osdc/CMakeLists.txt b/src/osdc/CMakeLists.txt
new file mode 100644
index 000000000..205ad3d4f
--- /dev/null
+++ b/src/osdc/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(osdc_files
+ Filer.cc
+ ObjectCacher.cc
+ Objecter.cc
+ error_code.cc
+ Striper.cc)
+add_library(osdc STATIC ${osdc_files})
+target_link_libraries(osdc ceph-common)
+if(WITH_EVENTTRACE)
+ add_dependencies(osdc eventtrace_tp)
+endif()
diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc
new file mode 100644
index 000000000..0b12df1a4
--- /dev/null
+++ b/src/osdc/Filer.cc
@@ -0,0 +1,487 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include <mutex>
+#include <algorithm>
+#include "Filer.h"
+#include "osd/OSDMap.h"
+#include "Striper.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+
+#include "msg/Messenger.h"
+
+#include "include/Context.h"
+
+#include "common/Finisher.h"
+#include "common/config.h"
+
+#define dout_subsys ceph_subsys_filer
+#undef dout_prefix
+#define dout_prefix *_dout << objecter->messenger->get_myname() << ".filer "
+
+using std::hex;
+using std::dec;
+using std::vector;
+
+class Filer::C_Probe : public Context {
+public:
+ Filer *filer;
+ Probe *probe;
+ object_t oid;
+ uint64_t size;
+ ceph::real_time mtime;
+ C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o),
+ size(0) {}
+ void finish(int r) override {
+ if (r == -ENOENT) {
+ r = 0;
+ ceph_assert(size == 0);
+ }
+
+ bool probe_complete;
+ {
+ Probe::unique_lock pl(probe->lock);
+ if (r != 0) {
+ probe->err = r;
+ }
+
+ probe_complete = filer->_probed(probe, oid, size, mtime, pl);
+ ceph_assert(!pl.owns_lock());
+ }
+ if (probe_complete) {
+ probe->onfinish->complete(probe->err);
+ delete probe;
+ }
+ }
+};
+
+int Filer::probe(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snapid,
+ uint64_t start_from,
+ uint64_t *end, // LB, when !fwd
+ ceph::real_time *pmtime,
+ bool fwd,
+ int flags,
+ Context *onfinish)
+{
+ ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ")
+ << hex << ino << dec
+ << " starting from " << start_from
+ << dendl;
+
+ ceph_assert(snapid); // (until there is a non-NOSNAP write)
+
+ Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime,
+ flags, fwd, onfinish);
+
+ return probe_impl(probe, layout, start_from, end);
+}
+
+int Filer::probe(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snapid,
+ uint64_t start_from,
+ uint64_t *end, // LB, when !fwd
+ utime_t *pmtime,
+ bool fwd,
+ int flags,
+ Context *onfinish)
+{
+ ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ")
+ << hex << ino << dec
+ << " starting from " << start_from
+ << dendl;
+
+ ceph_assert(snapid); // (until there is a non-NOSNAP write)
+
+ Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime,
+ flags, fwd, onfinish);
+ return probe_impl(probe, layout, start_from, end);
+}
+
+int Filer::probe_impl(Probe* probe, const file_layout_t *layout,
+ uint64_t start_from, uint64_t *end) // LB, when !fwd
+{
+  // period (bytes before we jump onto a new set of objects)
+ uint64_t period = layout->get_period();
+
+ // start with 1+ periods.
+ probe->probing_len = period;
+ if (probe->fwd) {
+ if (start_from % period)
+ probe->probing_len += period - (start_from % period);
+ } else {
+ ceph_assert(start_from > *end);
+ if (start_from % period)
+ probe->probing_len -= period - (start_from % period);
+ probe->probing_off -= probe->probing_len;
+ }
+
+ Probe::unique_lock pl(probe->lock);
+ _probe(probe, pl);
+ ceph_assert(!pl.owns_lock());
+
+ return 0;
+}
+
+
+
+/**
+ * probe->lock must be held on entry; this function will release it.
+ */
+void Filer::_probe(Probe *probe, Probe::unique_lock& pl)
+{
+ ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock);
+
+ ldout(cct, 10) << "_probe " << hex << probe->ino << dec
+ << " " << probe->probing_off << "~" << probe->probing_len
+ << dendl;
+
+ // map range onto objects
+ probe->known_size.clear();
+ probe->probing.clear();
+ Striper::file_to_extents(cct, probe->ino, &probe->layout, probe->probing_off,
+ probe->probing_len, 0, probe->probing);
+
+ std::vector<ObjectExtent> stat_extents;
+ for (auto p = probe->probing.begin(); p != probe->probing.end(); ++p) {
+ ldout(cct, 10) << "_probe probing " << p->oid << dendl;
+ probe->ops.insert(p->oid);
+ stat_extents.push_back(*p);
+ }
+
+ pl.unlock();
+ for (std::vector<ObjectExtent>::iterator i = stat_extents.begin();
+ i != stat_extents.end(); ++i) {
+ C_Probe *c = new C_Probe(this, probe, i->oid);
+ objecter->stat(i->oid, i->oloc, probe->snapid, &c->size, &c->mtime,
+ probe->flags | CEPH_OSD_FLAG_RWORDERED,
+ new C_OnFinisher(c, finisher));
+ }
+}
+
+/**
+ * probe->lock must be initially held, and will be released by this function.
+ *
+ * @return true if probe is complete and Probe object may be freed.
+ */
+bool Filer::_probed(Probe *probe, const object_t& oid, uint64_t size,
+ ceph::real_time mtime, Probe::unique_lock& pl)
+{
+ ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock);
+
+ ldout(cct, 10) << "_probed " << probe->ino << " object " << oid
+ << " has size " << size << " mtime " << mtime << dendl;
+
+ probe->known_size[oid] = size;
+ if (mtime > probe->max_mtime)
+ probe->max_mtime = mtime;
+
+ ceph_assert(probe->ops.count(oid));
+ probe->ops.erase(oid);
+
+ if (!probe->ops.empty()) {
+ pl.unlock();
+ return false; // waiting for more!
+ }
+
+ if (probe->err) { // we hit an error, propagate back up
+ pl.unlock();
+ return true;
+ }
+
+ // analyze!
+ uint64_t end = 0;
+
+ if (!probe->fwd) {
+ std::reverse(probe->probing.begin(), probe->probing.end());
+ }
+
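+  // Scan the extents in probing order (reversed above when probing backwards):
+  // the first object that is not "full" (fwd) / not empty (bwd) contains the
+  // end of the file.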
+ for (auto p = probe->probing.begin(); p != probe->probing.end(); ++p) {
+ uint64_t shouldbe = p->length + p->offset;
+ ldout(cct, 10) << "_probed " << probe->ino << " object " << hex
+ << p->oid << dec << " should be " << shouldbe
+ << ", actual is " << probe->known_size[p->oid]
+ << dendl;
+
+ if (!probe->found_size) {
+ ceph_assert(probe->known_size[p->oid] <= shouldbe);
+
+ if ((probe->fwd && probe->known_size[p->oid] == shouldbe) ||
+ (!probe->fwd && probe->known_size[p->oid] == 0 &&
+ probe->probing_off > 0))
+ continue; // keep going
+
+ // aha, we found the end!
+ // calc offset into buffer_extent to get distance from probe->from.
+ uint64_t oleft = probe->known_size[p->oid] - p->offset;
+ for (auto i = p->buffer_extents.begin();
+ i != p->buffer_extents.end();
+ ++i) {
+ if (oleft <= (uint64_t)i->second) {
+ end = probe->probing_off + i->first + oleft;
+ ldout(cct, 10) << "_probed end is in buffer_extent " << i->first
+ << "~" << i->second << " off " << oleft
+ << ", from was " << probe->probing_off << ", end is "
+ << end << dendl;
+
+ probe->found_size = true;
+ ldout(cct, 10) << "_probed found size at " << end << dendl;
+ *probe->psize = end;
+
+ if (!probe->pmtime &&
+ !probe->pumtime) // stop if we don't need mtime too
+ break;
+ }
+ oleft -= i->second;
+ }
+ }
+ break;
+ }
+
+ if (!probe->found_size || (probe->probing_off && (probe->pmtime ||
+ probe->pumtime))) {
+ // keep probing!
+ ldout(cct, 10) << "_probed probing further" << dendl;
+
+ uint64_t period = probe->layout.get_period();
+ if (probe->fwd) {
+ probe->probing_off += probe->probing_len;
+ ceph_assert(probe->probing_off % period == 0);
+ probe->probing_len = period;
+ } else {
+ // previous period.
+ ceph_assert(probe->probing_off % period == 0);
+ probe->probing_len = period;
+ probe->probing_off -= period;
+ }
+ _probe(probe, pl);
+ ceph_assert(!pl.owns_lock());
+ return false;
+ } else if (probe->pmtime) {
+ ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl;
+ *probe->pmtime = probe->max_mtime;
+ } else if (probe->pumtime) {
+ ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl;
+ *probe->pumtime = ceph::real_clock::to_ceph_timespec(probe->max_mtime);
+ }
+ // done!
+ pl.unlock();
+ return true;
+}
+
+
+// -----------------------
+
+struct PurgeRange {
+ std::mutex lock;
+ typedef std::lock_guard<std::mutex> lock_guard;
+ typedef std::unique_lock<std::mutex> unique_lock;
+ inodeno_t ino;
+ file_layout_t layout;
+ SnapContext snapc;
+ uint64_t first, num;
+ ceph::real_time mtime;
+ int flags;
+ Context *oncommit;
+ int uncommitted;
+ int err = 0;
+ PurgeRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc,
+ uint64_t fo, uint64_t no, ceph::real_time t, int fl,
+ Context *fin)
+ : ino(i), layout(l), snapc(sc), first(fo), num(no), mtime(t), flags(fl),
+ oncommit(fin), uncommitted(0) {}
+};
+
+int Filer::purge_range(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t first_obj, uint64_t num_obj,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit)
+{
+ ceph_assert(num_obj > 0);
+
+ // single object? easy!
+ if (num_obj == 1) {
+ object_t oid = file_object_t(ino, first_obj);
+ object_locator_t oloc = OSDMap::file_to_object_locator(*layout);
+ ldout(cct, 10) << "purge_range removing " << oid << dendl;
+ objecter->remove(oid, oloc, snapc, mtime, flags, oncommit);
+ return 0;
+ }
+
+ PurgeRange *pr = new PurgeRange(ino, *layout, snapc, first_obj,
+ num_obj, mtime, flags, oncommit);
+
+ _do_purge_range(pr, 0, 0);
+ return 0;
+}
+
+struct C_PurgeRange : public Context {
+ Filer *filer;
+ PurgeRange *pr;
+ C_PurgeRange(Filer *f, PurgeRange *p) : filer(f), pr(p) {}
+ void finish(int r) override {
+ filer->_do_purge_range(pr, 1, r);
+ }
+};
+
+void Filer::_do_purge_range(PurgeRange *pr, int fin, int err)
+{
+ PurgeRange::unique_lock prl(pr->lock);
+ if (err && err != -ENOENT)
+ pr->err = err;
+ pr->uncommitted -= fin;
+ ldout(cct, 10) << "_do_purge_range " << pr->ino << " objects " << pr->first
+ << "~" << pr->num << " uncommitted " << pr->uncommitted
+ << dendl;
+
+ if (pr->num == 0 && pr->uncommitted == 0) {
+ pr->oncommit->complete(pr->err);
+ prl.unlock();
+ delete pr;
+ return;
+ }
+
+ std::vector<object_t> remove_oids;
+
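+  // Throttle: keep at most filer_max_purge_ops object removals in flight.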
+ int max = cct->_conf->filer_max_purge_ops - pr->uncommitted;
+ while (pr->num > 0 && max > 0) {
+ remove_oids.push_back(file_object_t(pr->ino, pr->first));
+ pr->uncommitted++;
+ pr->first++;
+ pr->num--;
+ max--;
+ }
+ prl.unlock();
+
+ // Issue objecter ops outside pr->lock to avoid lock dependency loop
+ for (const auto& oid : remove_oids) {
+ object_locator_t oloc = OSDMap::file_to_object_locator(pr->layout);
+ objecter->remove(oid, oloc, pr->snapc, pr->mtime, pr->flags,
+ new C_OnFinisher(new C_PurgeRange(this, pr), finisher));
+ }
+}
+
+// -----------------------
+struct TruncRange {
+ std::mutex lock;
+ typedef std::lock_guard<std::mutex> lock_guard;
+ typedef std::unique_lock<std::mutex> unique_lock;
+ inodeno_t ino;
+ file_layout_t layout;
+ SnapContext snapc;
+ ceph::real_time mtime;
+ int flags;
+ Context *oncommit;
+ int uncommitted;
+ uint64_t offset;
+ uint64_t length;
+ uint32_t truncate_seq;
+ TruncRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc,
+ ceph::real_time t, int fl, Context *fin,
+ uint64_t off, uint64_t len, uint32_t ts)
+ : ino(i), layout(l), snapc(sc), mtime(t), flags(fl), oncommit(fin),
+ uncommitted(0), offset(off), length(len), truncate_seq(ts) {}
+};
+
+void Filer::truncate(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ __u32 truncate_seq,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit)
+{
+ uint64_t period = layout->get_period();
+ uint64_t num_objs = Striper::get_num_objects(*layout, len + (offset % period));
+ if (num_objs == 1) {
+ vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
+ osdc_opvec ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
+ ops[0].op.extent.truncate_seq = truncate_seq;
+ ops[0].op.extent.truncate_size = extents[0].offset;
+ objecter->_modify(extents[0].oid, extents[0].oloc, ops, mtime, snapc,
+ flags, oncommit);
+ return;
+ }
+
+ if (len > 0 && (offset + len) % period)
+ len += period - ((offset + len) % period);
+
+ TruncRange *tr = new TruncRange(ino, *layout, snapc, mtime, flags, oncommit,
+ offset, len, truncate_seq);
+ _do_truncate_range(tr, 0);
+}
+
+struct C_TruncRange : public Context {
+ Filer *filer;
+ TruncRange *tr;
+ C_TruncRange(Filer *f, TruncRange *t) : filer(f), tr(t) {}
+ void finish(int r) override {
+ filer->_do_truncate_range(tr, 1);
+ }
+};
+
+void Filer::_do_truncate_range(TruncRange *tr, int fin)
+{
+ TruncRange::unique_lock trl(tr->lock);
+ tr->uncommitted -= fin;
+ ldout(cct, 10) << "_do_truncate_range " << tr->ino << " objects " << tr->offset
+ << "~" << tr->length << " uncommitted " << tr->uncommitted
+ << dendl;
+
+ if (tr->length == 0 && tr->uncommitted == 0) {
+ tr->oncommit->complete(0);
+ trl.unlock();
+ delete tr;
+ return;
+ }
+
+ vector<ObjectExtent> extents;
+
+ int max = cct->_conf->filer_max_truncate_ops - tr->uncommitted;
+ if (max > 0 && tr->length > 0) {
+ uint64_t len = tr->layout.get_period() * max;
+ if (len > tr->length)
+ len = tr->length;
+
+ uint64_t offset = tr->offset + tr->length - len;
+ Striper::file_to_extents(cct, tr->ino, &tr->layout, offset, len, 0, extents);
+ tr->uncommitted += extents.size();
+ tr->length -= len;
+ }
+
+ trl.unlock();
+
+ // Issue objecter ops outside tr->lock to avoid lock dependency loop
+ for (const auto& p : extents) {
+ osdc_opvec ops(1);
+ ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC;
+ ops[0].op.extent.truncate_size = p.offset;
+ ops[0].op.extent.truncate_seq = tr->truncate_seq;
+ objecter->_modify(p.oid, p.oloc, ops, tr->mtime, tr->snapc, tr->flags,
+ new C_OnFinisher(new C_TruncRange(this, tr), finisher));
+ }
+}
diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h
new file mode 100644
index 000000000..42cf9e998
--- /dev/null
+++ b/src/osdc/Filer.h
@@ -0,0 +1,300 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_FILER_H
+#define CEPH_FILER_H
+
+/*** Filer
+ *
+ * stripe file ranges onto objects.
+ * build list<ObjectExtent> for the objecter or objectcacher.
+ *
+ * also, provide convenience methods that call objecter for you.
+ *
+ * "files" are identified by ino.
+ */
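+
+// Illustrative usage sketch (not from the original source): how a caller
+// might stripe a read through a Filer, assuming an Objecter `objecter`, a
+// Finisher `finisher`, an inode number `ino`, a layout `lay` and a completion
+// Context `onfinish` already exist. Signatures match the class below.
+//
+//   Filer filer(objecter, finisher);
+//   ceph::buffer::list bl;
+//   filer.read(ino, &lay, CEPH_NOSNAP, 0 /*offset*/, 1 << 20 /*len*/,
+//              &bl, 0 /*flags*/, onfinish);
+//
+// The Filer splits the 1 MiB range into per-object extents via Striper and
+// hands them to Objecter::sg_read; `onfinish` fires once all pieces complete.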
+
+
+#include <mutex>
+
+#include "include/types.h"
+
+#include "common/ceph_time.h"
+
+#include "osd/OSDMap.h"
+#include "Objecter.h"
+#include "Striper.h"
+
+class Context;
+class Messenger;
+class OSDMap;
+class Finisher;
+
+
+/**** Filer interface ***/
+
+class Filer {
+ CephContext *cct;
+ Objecter *objecter;
+ Finisher *finisher;
+
+ // probes
+ struct Probe {
+ std::mutex lock;
+ typedef std::lock_guard<std::mutex> lock_guard;
+ typedef std::unique_lock<std::mutex> unique_lock;
+ inodeno_t ino;
+ file_layout_t layout;
+ snapid_t snapid;
+
+ uint64_t *psize;
+ ceph::real_time *pmtime;
+ utime_t *pumtime;
+
+ int flags;
+
+ bool fwd;
+
+ Context *onfinish;
+
+ std::vector<ObjectExtent> probing;
+ uint64_t probing_off, probing_len;
+
+ std::map<object_t, uint64_t> known_size;
+ ceph::real_time max_mtime;
+
+ std::set<object_t> ops;
+
+ int err;
+ bool found_size;
+
+ Probe(inodeno_t i, const file_layout_t &l, snapid_t sn,
+ uint64_t f, uint64_t *e, ceph::real_time *m, int fl, bool fw,
+ Context *c) :
+ ino(i), layout(l), snapid(sn),
+ psize(e), pmtime(m), pumtime(nullptr), flags(fl), fwd(fw), onfinish(c),
+ probing_off(f), probing_len(0),
+ err(0), found_size(false) {}
+
+ Probe(inodeno_t i, const file_layout_t &l, snapid_t sn,
+ uint64_t f, uint64_t *e, utime_t *m, int fl, bool fw,
+ Context *c) :
+ ino(i), layout(l), snapid(sn),
+ psize(e), pmtime(nullptr), pumtime(m), flags(fl), fwd(fw),
+ onfinish(c), probing_off(f), probing_len(0),
+ err(0), found_size(false) {}
+ };
+
+ class C_Probe;
+
+ void _probe(Probe *p, Probe::unique_lock& pl);
+ bool _probed(Probe *p, const object_t& oid, uint64_t size,
+ ceph::real_time mtime, Probe::unique_lock& pl);
+
+ public:
+ Filer(const Filer& other);
+ const Filer operator=(const Filer& other);
+
+ Filer(Objecter *o, Finisher *f) : cct(o->cct), objecter(o), finisher(f) {}
+ ~Filer() {}
+
+ bool is_active() {
+ return objecter->is_active(); // || (oc && oc->is_active());
+ }
+
+
+ /*** async file interface. scatter/gather as needed. ***/
+
+ void read(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snap,
+ uint64_t offset,
+ uint64_t len,
+ ceph::buffer::list *bl, // ptr to data
+ int flags,
+ Context *onfinish,
+ int op_flags = 0) {
+ ceph_assert(snap); // (until there is a non-NOSNAP write)
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
+ objecter->sg_read(extents, snap, bl, flags, onfinish, op_flags);
+ }
+
+ void read_trunc(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snap,
+ uint64_t offset,
+ uint64_t len,
+ ceph::buffer::list *bl, // ptr to data
+ int flags,
+ uint64_t truncate_size,
+ __u32 truncate_seq,
+ Context *onfinish,
+ int op_flags = 0) {
+ ceph_assert(snap); // (until there is a non-NOSNAP write)
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size,
+ extents);
+ objecter->sg_read_trunc(extents, snap, bl, flags,
+ truncate_size, truncate_seq, onfinish, op_flags);
+ }
+
+ void write(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ ceph::buffer::list& bl,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit,
+ int op_flags = 0) {
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
+ objecter->sg_write(extents, snapc, bl, mtime, flags, oncommit, op_flags);
+ }
+
+ void write_trunc(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ ceph::buffer::list& bl,
+ ceph::real_time mtime,
+ int flags,
+ uint64_t truncate_size,
+ __u32 truncate_seq,
+ Context *oncommit,
+ int op_flags = 0) {
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size,
+ extents);
+ objecter->sg_write_trunc(extents, snapc, bl, mtime, flags,
+ truncate_size, truncate_seq, oncommit, op_flags);
+ }
+
+ void truncate(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ __u32 truncate_seq,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit);
+ void _do_truncate_range(struct TruncRange *pr, int fin);
+
+ void zero(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ ceph::real_time mtime,
+ int flags,
+ bool keep_first,
+ Context *oncommit) {
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents);
+ if (extents.size() == 1) {
+ if (extents[0].offset == 0 && extents[0].length == layout->object_size
+ && (!keep_first || extents[0].objectno != 0))
+ objecter->remove(extents[0].oid, extents[0].oloc,
+ snapc, mtime, flags, oncommit);
+ else
+ objecter->zero(extents[0].oid, extents[0].oloc, extents[0].offset,
+ extents[0].length, snapc, mtime, flags, oncommit);
+ } else {
+ C_GatherBuilder gcom(cct, oncommit);
+ for (auto p = extents.begin(); p != extents.end(); ++p) {
+ if (p->offset == 0 && p->length == layout->object_size &&
+ (!keep_first || p->objectno != 0))
+ objecter->remove(p->oid, p->oloc,
+ snapc, mtime, flags,
+ oncommit ? gcom.new_sub():0);
+ else
+ objecter->zero(p->oid, p->oloc, p->offset, p->length,
+ snapc, mtime, flags,
+ oncommit ? gcom.new_sub():0);
+ }
+ gcom.activate();
+ }
+ }
+
+ void zero(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t offset,
+ uint64_t len,
+ ceph::real_time mtime,
+ int flags,
+ Context *oncommit) {
+ zero(ino, layout,
+ snapc, offset,
+ len, mtime,
+ flags, false,
+ oncommit);
+ }
+ // purge range of ino.### objects
+ int purge_range(inodeno_t ino,
+ const file_layout_t *layout,
+ const SnapContext& snapc,
+ uint64_t first_obj, uint64_t num_obj,
+ ceph::real_time mtime,
+ int flags, Context *oncommit);
+ void _do_purge_range(struct PurgeRange *pr, int fin, int err);
+
+ /*
+ * probe
+ * specify direction,
+ * and whether we stop when we find data, or a hole.
+ */
+ int probe(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snapid,
+ uint64_t start_from,
+ uint64_t *end,
+ ceph::real_time *mtime,
+ bool fwd,
+ int flags,
+ Context *onfinish);
+
+ int probe(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snapid,
+ uint64_t start_from,
+ uint64_t *end,
+ bool fwd,
+ int flags,
+ Context *onfinish) {
+ return probe(ino, layout, snapid, start_from, end,
+ (ceph::real_time* )0, fwd, flags, onfinish);
+ }
+
+ int probe(inodeno_t ino,
+ const file_layout_t *layout,
+ snapid_t snapid,
+ uint64_t start_from,
+ uint64_t *end,
+ utime_t *mtime,
+ bool fwd,
+ int flags,
+ Context *onfinish);
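+
+ // Illustrative call (hypothetical names): probe forward from offset 0 to
+ // discover the end of data for an inode, assuming `filer`, `ino`, `lay`
+ // and a Context `fin` already exist.
+ //
+ //   uint64_t end = 0;
+ //   ceph::real_time mtime;
+ //   filer.probe(ino, &lay, CEPH_NOSNAP, 0, &end, &mtime, true /*fwd*/, 0, fin);
+ //   // when `fin` completes with 0, `end` holds the probed end offset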
+
+private:
+ int probe_impl(Probe* probe, const file_layout_t *layout,
+ uint64_t start_from, uint64_t *end);
+};
+
+#endif // !CEPH_FILER_H
diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc
new file mode 100644
index 000000000..6e1cbd930
--- /dev/null
+++ b/src/osdc/Journaler.cc
@@ -0,0 +1,1607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+#include "common/dout.h"
+#include "include/Context.h"
+#include "msg/Messenger.h"
+#include "osdc/Journaler.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "common/Finisher.h"
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << objecter->messenger->get_myname() \
+ << ".journaler." << name << (readonly ? "(ro) ":"(rw) ")
+
+using std::chrono::seconds;
+
+
+class Journaler::C_DelayFlush : public Context {
+ Journaler *journaler;
+ public:
+ explicit C_DelayFlush(Journaler *j) : journaler(j) {}
+ void finish(int r) override {
+ journaler->_do_delayed_flush();
+ }
+};
+
+void Journaler::set_readonly()
+{
+ lock_guard l(lock);
+
+ ldout(cct, 1) << "set_readonly" << dendl;
+ readonly = true;
+}
+
+void Journaler::set_writeable()
+{
+ lock_guard l(lock);
+
+ ldout(cct, 1) << "set_writeable" << dendl;
+ readonly = false;
+}
+
+void Journaler::create(file_layout_t *l, stream_format_t const sf)
+{
+ lock_guard lk(lock);
+
+ ceph_assert(!readonly);
+ state = STATE_ACTIVE;
+
+ stream_format = sf;
+ journal_stream.set_format(sf);
+ _set_layout(l);
+
+ prezeroing_pos = prezero_pos = write_pos = flush_pos =
+ safe_pos = read_pos = requested_pos = received_pos =
+ expire_pos = trimming_pos = trimmed_pos =
+ next_safe_pos = layout.get_period();
+
+ ldout(cct, 1) << "created blank journal at inode 0x" << std::hex << ino
+ << std::dec << ", format=" << stream_format << dendl;
+}
+
+void Journaler::set_layout(file_layout_t const *l)
+{
+ lock_guard lk(lock);
+ _set_layout(l);
+}
+
+void Journaler::_set_layout(file_layout_t const *l)
+{
+ layout = *l;
+
+ if (layout.pool_id != pg_pool) {
+ // user can reset pool id through cephfs-journal-tool
+ lderr(cct) << "may have got an older pool id from the header layout" << dendl;
+ ceph_abort();
+ }
+ last_written.layout = layout;
+ last_committed.layout = layout;
+
+ // prefetch intelligently.
+ // (watch out, this is big if you use big objects or weird striping)
+ uint64_t periods = cct->_conf.get_val<uint64_t>("journaler_prefetch_periods");
+ fetch_len = layout.get_period() * periods;
+}
+
+
+/***************** HEADER *******************/
+
+ostream& operator<<(ostream &out, const Journaler::Header &h)
+{
+ return out << "loghead(trim " << h.trimmed_pos
+ << ", expire " << h.expire_pos
+ << ", write " << h.write_pos
+ << ", stream_format " << (int)(h.stream_format)
+ << ")";
+}
+
+class Journaler::C_ReadHead : public Context {
+ Journaler *ls;
+public:
+ bufferlist bl;
+ explicit C_ReadHead(Journaler *l) : ls(l) {}
+ void finish(int r) override {
+ ls->_finish_read_head(r, bl);
+ }
+};
+
+class Journaler::C_RereadHead : public Context {
+ Journaler *ls;
+ Context *onfinish;
+public:
+ bufferlist bl;
+ C_RereadHead(Journaler *l, Context *onfinish_) : ls (l),
+ onfinish(onfinish_) {}
+ void finish(int r) override {
+ ls->_finish_reread_head(r, bl, onfinish);
+ }
+};
+
+class Journaler::C_ProbeEnd : public Context {
+ Journaler *ls;
+public:
+ uint64_t end;
+ explicit C_ProbeEnd(Journaler *l) : ls(l), end(-1) {}
+ void finish(int r) override {
+ ls->_finish_probe_end(r, end);
+ }
+};
+
+class Journaler::C_ReProbe : public Context {
+ Journaler *ls;
+ C_OnFinisher *onfinish;
+public:
+ uint64_t end;
+ C_ReProbe(Journaler *l, C_OnFinisher *onfinish_) :
+ ls(l), onfinish(onfinish_), end(0) {}
+ void finish(int r) override {
+ ls->_finish_reprobe(r, end, onfinish);
+ }
+};
+
+void Journaler::recover(Context *onread)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ onread->complete(-EAGAIN);
+ return;
+ }
+
+ ldout(cct, 1) << "recover start" << dendl;
+ ceph_assert(state != STATE_ACTIVE);
+ ceph_assert(readonly);
+
+ if (onread)
+ waitfor_recover.push_back(wrap_finisher(onread));
+
+ if (state != STATE_UNDEF) {
+ ldout(cct, 1) << "recover - already recovering" << dendl;
+ return;
+ }
+
+ ldout(cct, 1) << "read_head" << dendl;
+ state = STATE_READHEAD;
+ C_ReadHead *fin = new C_ReadHead(this);
+ _read_head(fin, &fin->bl);
+}
+
+void Journaler::_read_head(Context *on_finish, bufferlist *bl)
+{
+ // lock is locked
+ ceph_assert(state == STATE_READHEAD || state == STATE_REREADHEAD);
+
+ object_t oid = file_object_t(ino, 0);
+ object_locator_t oloc(pg_pool);
+ objecter->read_full(oid, oloc, CEPH_NOSNAP, bl, 0, wrap_finisher(on_finish));
+}
+
+void Journaler::reread_head(Context *onfinish)
+{
+ lock_guard l(lock);
+ _reread_head(wrap_finisher(onfinish));
+}
+
+/**
+ * Re-read the head from disk, and set the write_pos, expire_pos, trimmed_pos
+ * from the on-disk header. This switches the state to STATE_REREADHEAD for
+ * the duration, and you shouldn't start a re-read while other operations are
+ * in-flight, nor start other operations while a re-read is in progress.
+ * Also, don't call this until the Journaler has finished its recovery and has
+ * gone STATE_ACTIVE!
+ */
+void Journaler::_reread_head(Context *onfinish)
+{
+ ldout(cct, 10) << "reread_head" << dendl;
+ ceph_assert(state == STATE_ACTIVE);
+
+ state = STATE_REREADHEAD;
+ C_RereadHead *fin = new C_RereadHead(this, onfinish);
+ _read_head(fin, &fin->bl);
+}
+
+void Journaler::_finish_reread_head(int r, bufferlist& bl, Context *finish)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ finish->complete(-EAGAIN);
+ return;
+ }
+
+ // the on-disk header was read into bl (it may be empty only if the read failed)
+ ceph_assert(bl.length() || r < 0 );
+
+ // unpack header
+ if (r == 0) {
+ Header h;
+ auto p = bl.cbegin();
+ try {
+ decode(h, p);
+ } catch (const buffer::error &e) {
+ finish->complete(-EINVAL);
+ return;
+ }
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos
+ = h.write_pos;
+ expire_pos = h.expire_pos;
+ trimmed_pos = trimming_pos = h.trimmed_pos;
+ init_headers(h);
+ state = STATE_ACTIVE;
+ }
+
+ finish->complete(r);
+}
+
+void Journaler::_finish_read_head(int r, bufferlist& bl)
+{
+ lock_guard l(lock);
+ if (is_stopping())
+ return;
+
+ ceph_assert(state == STATE_READHEAD);
+
+ if (r!=0) {
+ ldout(cct, 0) << "error getting journal off disk" << dendl;
+ list<Context*> ls;
+ ls.swap(waitfor_recover);
+ finish_contexts(cct, ls, r);
+ return;
+ }
+
+ if (bl.length() == 0) {
+ ldout(cct, 1) << "_finish_read_head r=" << r
+ << " read 0 bytes, assuming empty log" << dendl;
+ state = STATE_ACTIVE;
+ list<Context*> ls;
+ ls.swap(waitfor_recover);
+ finish_contexts(cct, ls, 0);
+ return;
+ }
+
+ // unpack header
+ bool corrupt = false;
+ Header h;
+ auto p = bl.cbegin();
+ try {
+ decode(h, p);
+
+ if (h.magic != magic) {
+ ldout(cct, 0) << "on disk magic '" << h.magic << "' != my magic '"
+ << magic << "'" << dendl;
+ corrupt = true;
+ } else if (h.write_pos < h.expire_pos || h.expire_pos < h.trimmed_pos) {
+ ldout(cct, 0) << "Corrupt header (bad offsets): " << h << dendl;
+ corrupt = true;
+ }
+ } catch (const buffer::error &e) {
+ corrupt = true;
+ }
+
+ if (corrupt) {
+ list<Context*> ls;
+ ls.swap(waitfor_recover);
+ finish_contexts(cct, ls, -EINVAL);
+ return;
+ }
+
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos
+ = h.write_pos;
+ read_pos = requested_pos = received_pos = expire_pos = h.expire_pos;
+ trimmed_pos = trimming_pos = h.trimmed_pos;
+
+ init_headers(h);
+ _set_layout(&h.layout);
+ stream_format = h.stream_format;
+ journal_stream.set_format(h.stream_format);
+
+ ldout(cct, 1) << "_finish_read_head " << h
+ << ". probing for end of log (from " << write_pos << ")..."
+ << dendl;
+ C_ProbeEnd *fin = new C_ProbeEnd(this);
+ state = STATE_PROBING;
+ _probe(fin, &fin->end);
+}
+
+void Journaler::_probe(Context *finish, uint64_t *end)
+{
+ // lock is locked
+ ldout(cct, 1) << "probing for end of the log" << dendl;
+ ceph_assert(state == STATE_PROBING || state == STATE_REPROBING);
+ // probe the log
+ filer.probe(ino, &layout, CEPH_NOSNAP,
+ write_pos, end, true, 0, wrap_finisher(finish));
+}
+
+void Journaler::_reprobe(C_OnFinisher *finish)
+{
+ ldout(cct, 10) << "reprobe" << dendl;
+ ceph_assert(state == STATE_ACTIVE);
+
+ state = STATE_REPROBING;
+ C_ReProbe *fin = new C_ReProbe(this, finish);
+ _probe(fin, &fin->end);
+}
+
+
+void Journaler::_finish_reprobe(int r, uint64_t new_end,
+ C_OnFinisher *onfinish)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ onfinish->complete(-EAGAIN);
+ return;
+ }
+
+ ceph_assert(new_end >= write_pos || r < 0);
+ ldout(cct, 1) << "_finish_reprobe new_end = " << new_end
+ << " (header had " << write_pos << ")."
+ << dendl;
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = new_end;
+ state = STATE_ACTIVE;
+ onfinish->complete(r);
+}
+
+void Journaler::_finish_probe_end(int r, uint64_t end)
+{
+ lock_guard l(lock);
+ if (is_stopping())
+ return;
+
+ ceph_assert(state == STATE_PROBING);
+ if (r < 0) { // error in probing
+ goto out;
+ }
+ if (((int64_t)end) == -1) {
+ end = write_pos;
+ ldout(cct, 1) << "_finish_probe_end write_pos = " << end << " (header had "
+ << write_pos << "). log was empty. recovered." << dendl;
+ ceph_abort(); // hrm.
+ } else {
+ ceph_assert(end >= write_pos);
+ ldout(cct, 1) << "_finish_probe_end write_pos = " << end
+ << " (header had " << write_pos << "). recovered."
+ << dendl;
+ }
+
+ state = STATE_ACTIVE;
+
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = end;
+
+out:
+ // done.
+ list<Context*> ls;
+ ls.swap(waitfor_recover);
+ finish_contexts(cct, ls, r);
+}
+
+class Journaler::C_RereadHeadProbe : public Context
+{
+ Journaler *ls;
+ C_OnFinisher *final_finish;
+public:
+ C_RereadHeadProbe(Journaler *l, C_OnFinisher *finish) :
+ ls(l), final_finish(finish) {}
+ void finish(int r) override {
+ ls->_finish_reread_head_and_probe(r, final_finish);
+ }
+};
+
+void Journaler::reread_head_and_probe(Context *onfinish)
+{
+ lock_guard l(lock);
+
+ ceph_assert(state == STATE_ACTIVE);
+ _reread_head(new C_RereadHeadProbe(this, wrap_finisher(onfinish)));
+}
+
+void Journaler::_finish_reread_head_and_probe(int r, C_OnFinisher *onfinish)
+{
+ // Expect to be called back from finish_reread_head, which already takes lock
+ // lock is locked
+ if (is_stopping()) {
+ onfinish->complete(-EAGAIN);
+ return;
+ }
+
+ // Let the caller know that the operation has failed or was intentionally
+ // failed since the caller has been blocklisted.
+ if (r == -EBLOCKLISTED) {
+ onfinish->complete(r);
+ return;
+ }
+
+ ceph_assert(!r); //if we get an error, we're boned
+ _reprobe(onfinish);
+}
+
+
+// WRITING
+
+class Journaler::C_WriteHead : public Context {
+public:
+ Journaler *ls;
+ Header h;
+ C_OnFinisher *oncommit;
+ C_WriteHead(Journaler *l, Header& h_, C_OnFinisher *c) : ls(l), h(h_),
+ oncommit(c) {}
+ void finish(int r) override {
+ ls->_finish_write_head(r, h, oncommit);
+ }
+};
+
+void Journaler::write_head(Context *oncommit)
+{
+ lock_guard l(lock);
+ _write_head(oncommit);
+}
+
+
+void Journaler::_write_head(Context *oncommit)
+{
+ ceph_assert(!readonly);
+ ceph_assert(state == STATE_ACTIVE);
+ last_written.trimmed_pos = trimmed_pos;
+ last_written.expire_pos = expire_pos;
+ last_written.unused_field = expire_pos;
+ last_written.write_pos = safe_pos;
+ last_written.stream_format = stream_format;
+ ldout(cct, 10) << "write_head " << last_written << dendl;
+
+ // Avoid persisting bad pointers in case of bugs
+ ceph_assert(last_written.write_pos >= last_written.expire_pos);
+ ceph_assert(last_written.expire_pos >= last_written.trimmed_pos);
+
+ last_wrote_head = ceph::real_clock::now();
+
+ bufferlist bl;
+ encode(last_written, bl);
+ SnapContext snapc;
+
+ object_t oid = file_object_t(ino, 0);
+ object_locator_t oloc(pg_pool);
+ objecter->write_full(oid, oloc, snapc, bl, ceph::real_clock::now(), 0,
+ wrap_finisher(new C_WriteHead(
+ this, last_written,
+ wrap_finisher(oncommit))),
+ 0, 0, write_iohint);
+}
+
+void Journaler::_finish_write_head(int r, Header &wrote,
+ C_OnFinisher *oncommit)
+{
+ lock_guard l(lock);
+
+ if (r < 0) {
+ lderr(cct) << "_finish_write_head got " << cpp_strerror(r) << dendl;
+ handle_write_error(r);
+ return;
+ }
+ ceph_assert(!readonly);
+ ldout(cct, 10) << "_finish_write_head " << wrote << dendl;
+ last_committed = wrote;
+ if (oncommit) {
+ oncommit->complete(r);
+ }
+
+ _trim(); // trim?
+}
+
+
+/***************** WRITING *******************/
+
+class Journaler::C_Flush : public Context {
+ Journaler *ls;
+ uint64_t start;
+ ceph::real_time stamp;
+public:
+ C_Flush(Journaler *l, int64_t s, ceph::real_time st)
+ : ls(l), start(s), stamp(st) {}
+ void finish(int r) override {
+ ls->_finish_flush(r, start, stamp);
+ }
+};
+
+void Journaler::_finish_flush(int r, uint64_t start, ceph::real_time stamp)
+{
+ lock_guard l(lock);
+ ceph_assert(!readonly);
+
+ if (r < 0) {
+ lderr(cct) << "_finish_flush got " << cpp_strerror(r) << dendl;
+ handle_write_error(r);
+ return;
+ }
+
+ ceph_assert(start < flush_pos);
+
+ // calc latency?
+ if (logger) {
+ ceph::timespan lat = ceph::real_clock::now() - stamp;
+ logger->tinc(logger_key_lat, lat);
+ }
+
+ // adjust safe_pos
+ auto it = pending_safe.find(start);
+ ceph_assert(it != pending_safe.end());
+ uint64_t min_next_safe_pos = pending_safe.begin()->second;
+ pending_safe.erase(it);
+ if (pending_safe.empty())
+ safe_pos = next_safe_pos;
+ else
+ safe_pos = min_next_safe_pos;
+
+ ldout(cct, 10) << "_finish_flush safe from " << start
+ << ", pending_safe " << pending_safe
+ << ", (prezeroing/prezero)/write/flush/safe positions now "
+ << "(" << prezeroing_pos << "/" << prezero_pos << ")/"
+ << write_pos << "/" << flush_pos << "/" << safe_pos
+ << dendl;
+
+ // kick waiters <= safe_pos
+ if (!waitfor_safe.empty()) {
+ list<Context*> ls;
+ while (!waitfor_safe.empty()) {
+ auto it = waitfor_safe.begin();
+ if (it->first > safe_pos)
+ break;
+ ls.splice(ls.end(), it->second);
+ waitfor_safe.erase(it);
+ }
+ finish_contexts(cct, ls);
+ }
+}
+
+
+
+uint64_t Journaler::append_entry(bufferlist& bl)
+{
+ unique_lock l(lock);
+
+ ceph_assert(!readonly);
+ uint32_t s = bl.length();
+
+ // append
+ size_t delta = bl.length() + journal_stream.get_envelope_size();
+ // write_buf space is nearly full
+ if (!write_buf_throttle.get_or_fail(delta)) {
+ l.unlock();
+ ldout(cct, 10) << "write_buf_throttle wait, delta " << delta << dendl;
+ write_buf_throttle.get(delta);
+ l.lock();
+ }
+ ldout(cct, 20) << "write_buf_throttle get, delta " << delta << dendl;
+ size_t wrote = journal_stream.write(bl, &write_buf, write_pos);
+ ldout(cct, 10) << "append_entry len " << s << " to " << write_pos << "~"
+ << wrote << dendl;
+ write_pos += wrote;
+
+ // flush previous object?
+ uint64_t su = get_layout_period();
+ ceph_assert(su > 0);
+ uint64_t write_off = write_pos % su;
+ uint64_t write_obj = write_pos / su;
+ uint64_t flush_obj = flush_pos / su;
+ if (write_obj != flush_obj) {
+ ldout(cct, 10) << " flushing completed object(s) (su " << su << " wro "
+ << write_obj << " flo " << flush_obj << ")" << dendl;
+ _do_flush(write_buf.length() - write_off);
+
+ // if _do_flush() skips flushing some data, it makes a best effort to
+ // update next_safe_pos.
+ if (write_buf.length() > 0 &&
+ write_buf.length() <= wrote) { // the unflushed data are within this entry
+ // set next_safe_pos to end of previous entry
+ next_safe_pos = write_pos - wrote;
+ }
+ }
+
+ return write_pos;
+}
+
+
+void Journaler::_do_flush(unsigned amount)
+{
+ if (is_stopping())
+ return;
+ if (write_pos == flush_pos)
+ return;
+ ceph_assert(write_pos > flush_pos);
+ ceph_assert(!readonly);
+
+ // flush
+ uint64_t len = write_pos - flush_pos;
+ ceph_assert(len == write_buf.length());
+ if (amount && amount < len)
+ len = amount;
+
+ // zero at least two full periods ahead. this ensures
+ // that the next object will not exist.
+ uint64_t period = get_layout_period();
+ if (flush_pos + len + 2*period > prezero_pos) {
+ _issue_prezero();
+
+ int64_t newlen = prezero_pos - flush_pos - period;
+ if (newlen <= 0) {
+ ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len
+ << " already too close to prezero_pos " << prezero_pos
+ << ", zeroing first" << dendl;
+ waiting_for_zero_pos = flush_pos + len;
+ return;
+ }
+ if (static_cast<uint64_t>(newlen) < len) {
+ ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len
+ << " but hit prezero_pos " << prezero_pos
+ << ", will do " << flush_pos << "~" << newlen << dendl;
+ waiting_for_zero_pos = flush_pos + len;
+ len = newlen;
+ }
+ }
+ ldout(cct, 10) << "_do_flush flushing " << flush_pos << "~" << len << dendl;
+
+ // submit write for anything pending
+ // flush _start_ pos to _finish_flush
+ ceph::real_time now = ceph::real_clock::now();
+ SnapContext snapc;
+
+ Context *onsafe = new C_Flush(this, flush_pos, now); // on COMMIT
+ pending_safe[flush_pos] = next_safe_pos;
+
+ bufferlist write_bl;
+
+ // adjust pointers
+ if (len == write_buf.length()) {
+ write_bl.swap(write_buf);
+ next_safe_pos = write_pos;
+ } else {
+ write_buf.splice(0, len, &write_bl);
+ // Keys of waitfor_safe map are journal entry boundaries.
+ // Try finding a journal entry that we are actually flushing
+ // and set next_safe_pos to the end of it. This is best effort.
+ // The one we found may not be the latest flushing entry.
+ auto p = waitfor_safe.lower_bound(flush_pos + len);
+ if (p != waitfor_safe.end()) {
+ if (p->first > flush_pos + len && p != waitfor_safe.begin())
+ --p;
+ if (p->first <= flush_pos + len && p->first > next_safe_pos)
+ next_safe_pos = p->first;
+ }
+ }
+
+ filer.write(ino, &layout, snapc,
+ flush_pos, len, write_bl, ceph::real_clock::now(),
+ 0,
+ wrap_finisher(onsafe), write_iohint);
+
+ flush_pos += len;
+ ceph_assert(write_buf.length() == write_pos - flush_pos);
+ write_buf_throttle.put(len);
+ ldout(cct, 20) << "write_buf_throttle put, len " << len << dendl;
+
+ ldout(cct, 10)
+ << "_do_flush (prezeroing/prezero)/write/flush/safe pointers now at "
+ << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos
+ << "/" << flush_pos << "/" << safe_pos << dendl;
+
+ _issue_prezero();
+}
+
+
+void Journaler::wait_for_flush(Context *onsafe)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ if (onsafe)
+ onsafe->complete(-EAGAIN);
+ return;
+ }
+ _wait_for_flush(onsafe);
+}
+
+void Journaler::_wait_for_flush(Context *onsafe)
+{
+ ceph_assert(!readonly);
+
+ // all flushed and safe?
+ if (write_pos == safe_pos) {
+ ceph_assert(write_buf.length() == 0);
+ ldout(cct, 10)
+ << "flush nothing to flush, (prezeroing/prezero)/write/flush/safe "
+ "pointers at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/"
+ << write_pos << "/" << flush_pos << "/" << safe_pos << dendl;
+ if (onsafe) {
+ finisher->queue(onsafe, 0);
+ }
+ return;
+ }
+
+ // queue waiter
+ if (onsafe) {
+ waitfor_safe[write_pos].push_back(wrap_finisher(onsafe));
+ }
+}
+
+void Journaler::flush(Context *onsafe)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ if (onsafe)
+ onsafe->complete(-EAGAIN);
+ return;
+ }
+ _flush(wrap_finisher(onsafe));
+}
+
+void Journaler::_flush(C_OnFinisher *onsafe)
+{
+ ceph_assert(!readonly);
+
+ if (write_pos == flush_pos) {
+ ceph_assert(write_buf.length() == 0);
+ ldout(cct, 10) << "flush nothing to flush, (prezeroing/prezero)/write/"
+ "flush/safe pointers at " << "(" << prezeroing_pos << "/" << prezero_pos
+ << ")/" << write_pos << "/" << flush_pos << "/" << safe_pos
+ << dendl;
+ if (onsafe) {
+ onsafe->complete(0);
+ }
+ } else {
+ _do_flush();
+ _wait_for_flush(onsafe);
+ }
+
+ // write head?
+ if (_write_head_needed()) {
+ _write_head();
+ }
+}
+
+bool Journaler::_write_head_needed()
+{
+ return last_wrote_head + seconds(cct->_conf.get_val<int64_t>("journaler_write_head_interval"))
+ < ceph::real_clock::now();
+}
+
+
+/*************** prezeroing ******************/
+
+struct C_Journaler_Prezero : public Context {
+ Journaler *journaler;
+ uint64_t from, len;
+ C_Journaler_Prezero(Journaler *j, uint64_t f, uint64_t l)
+ : journaler(j), from(f), len(l) {}
+ void finish(int r) override {
+ journaler->_finish_prezero(r, from, len);
+ }
+};
+
+void Journaler::_issue_prezero()
+{
+ ceph_assert(prezeroing_pos >= flush_pos);
+
+ uint64_t num_periods = cct->_conf.get_val<uint64_t>("journaler_prezero_periods");
+ /*
+ * issue zero requests based on write_pos, even though the invariant
+ * is that we zero ahead of flush_pos.
+ */
+ uint64_t period = get_layout_period();
+ uint64_t to = write_pos + period * num_periods + period - 1;
+ to -= to % period;
+
+ if (prezeroing_pos >= to) {
+ ldout(cct, 20) << "_issue_prezero target " << to << " <= prezeroing_pos "
+ << prezeroing_pos << dendl;
+ return;
+ }
+
+ while (prezeroing_pos < to) {
+ uint64_t len;
+ if (prezeroing_pos % period == 0) {
+ len = period;
+ ldout(cct, 10) << "_issue_prezero removing " << prezeroing_pos << "~"
+ << period << " (full period)" << dendl;
+ } else {
+ len = period - (prezeroing_pos % period);
+ ldout(cct, 10) << "_issue_prezero zeroing " << prezeroing_pos << "~"
+ << len << " (partial period)" << dendl;
+ }
+ SnapContext snapc;
+ Context *c = wrap_finisher(new C_Journaler_Prezero(this, prezeroing_pos,
+ len));
+ filer.zero(ino, &layout, snapc, prezeroing_pos, len,
+ ceph::real_clock::now(), 0, c);
+ prezeroing_pos += len;
+ }
+}
+
+// Lock cycle because we get called out of objecter callback (holding
+// objecter read lock), but there are also cases where we take the journaler
+// lock before calling into objecter to do I/O.
+void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len)
+{
+ lock_guard l(lock);
+
+ ldout(cct, 10) << "_prezeroed to " << start << "~" << len
+ << ", prezeroing/prezero was " << prezeroing_pos << "/"
+ << prezero_pos << ", pending " << pending_zero
+ << dendl;
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "_prezeroed got " << cpp_strerror(r) << dendl;
+ handle_write_error(r);
+ return;
+ }
+
+ ceph_assert(r == 0 || r == -ENOENT);
+
+ if (start == prezero_pos) {
+ prezero_pos += len;
+ while (!pending_zero.empty() &&
+ pending_zero.begin().get_start() == prezero_pos) {
+ interval_set<uint64_t>::iterator b(pending_zero.begin());
+ prezero_pos += b.get_len();
+ pending_zero.erase(b);
+ }
+
+ if (waiting_for_zero_pos > flush_pos) {
+ _do_flush(waiting_for_zero_pos - flush_pos);
+ }
+
+ if (prezero_pos == prezeroing_pos &&
+ !waitfor_prezero.empty()) {
+ list<Context*> ls;
+ ls.swap(waitfor_prezero);
+ finish_contexts(cct, ls, 0);
+ }
+ } else {
+ pending_zero.insert(start, len);
+ }
+ ldout(cct, 10) << "_prezeroed prezeroing/prezero now " << prezeroing_pos
+ << "/" << prezero_pos
+ << ", pending " << pending_zero
+ << dendl;
+}
+
+void Journaler::wait_for_prezero(Context *onfinish)
+{
+ ceph_assert(onfinish);
+ lock_guard l(lock);
+
+ if (prezero_pos == prezeroing_pos) {
+ finisher->queue(onfinish, 0);
+ return;
+ }
+ waitfor_prezero.push_back(wrap_finisher(onfinish));
+}
+
+
+/***************** READING *******************/
+
+
+class Journaler::C_Read : public Context {
+ Journaler *ls;
+ uint64_t offset;
+ uint64_t length;
+public:
+ bufferlist bl;
+ C_Read(Journaler *j, uint64_t o, uint64_t l) : ls(j), offset(o), length(l) {}
+ void finish(int r) override {
+ ls->_finish_read(r, offset, length, bl);
+ }
+};
+
+class Journaler::C_RetryRead : public Context {
+ Journaler *ls;
+public:
+ explicit C_RetryRead(Journaler *l) : ls(l) {}
+
+ void finish(int r) override {
+ // Should only be called from waitfor_safe i.e. already inside lock
+ // (ls->lock is locked)
+ ls->_prefetch();
+ }
+};
+
+void Journaler::_finish_read(int r, uint64_t offset, uint64_t length,
+ bufferlist& bl)
+{
+ lock_guard l(lock);
+
+ if (r < 0) {
+ ldout(cct, 0) << "_finish_read got error " << r << dendl;
+ error = r;
+ } else {
+ ldout(cct, 10) << "_finish_read got " << offset << "~" << bl.length()
+ << dendl;
+ if (bl.length() < length) {
+ ldout(cct, 0) << "_finish_read got less than expected (" << length << ")"
+ << dendl;
+ error = -EINVAL;
+ }
+ }
+
+ if (error) {
+ if (on_readable) {
+ C_OnFinisher *f = on_readable;
+ on_readable = 0;
+ f->complete(error);
+ }
+ return;
+ }
+
+ prefetch_buf[offset].swap(bl);
+
+ try {
+ _assimilate_prefetch();
+ } catch (const buffer::error &err) {
+ lderr(cct) << "_decode error from assimilate_prefetch" << dendl;
+ error = -EINVAL;
+ if (on_readable) {
+ C_OnFinisher *f = on_readable;
+ on_readable = 0;
+ f->complete(error);
+ }
+ return;
+ }
+ _prefetch();
+}
+
+void Journaler::_assimilate_prefetch()
+{
+ bool was_readable = readable;
+
+ bool got_any = false;
+ while (!prefetch_buf.empty()) {
+ map<uint64_t,bufferlist>::iterator p = prefetch_buf.begin();
+ if (p->first != received_pos) {
+ uint64_t gap = p->first - received_pos;
+ ldout(cct, 10) << "_assimilate_prefetch gap of " << gap
+ << " from received_pos " << received_pos
+ << " to first prefetched buffer " << p->first << dendl;
+ break;
+ }
+
+ ldout(cct, 10) << "_assimilate_prefetch " << p->first << "~"
+ << p->second.length() << dendl;
+ received_pos += p->second.length();
+ read_buf.claim_append(p->second);
+ ceph_assert(received_pos <= requested_pos);
+ prefetch_buf.erase(p);
+ got_any = true;
+ }
+
+ if (got_any) {
+ ldout(cct, 10) << "_assimilate_prefetch read_buf now " << read_pos << "~"
+ << read_buf.length() << ", read pointers read_pos=" << read_pos
+ << " received_pos=" << received_pos << " requested_pos=" << requested_pos
+ << dendl;
+
+ // Update readability (this will also hit any decode errors resulting
+ // from bad data)
+ readable = _is_readable();
+ }
+
+ if ((got_any && !was_readable && readable) || read_pos == write_pos) {
+ // readable!
+ ldout(cct, 10) << "_finish_read now readable (or at journal end) readable="
+ << readable << " read_pos=" << read_pos << " write_pos="
+ << write_pos << dendl;
+ if (on_readable) {
+ C_OnFinisher *f = on_readable;
+ on_readable = 0;
+ f->complete(0);
+ }
+ }
+}
+
+void Journaler::_issue_read(uint64_t len)
+{
+ // stuck at safe_pos? (this is needed if we are reading the tail of
+ // a journal we are also writing to)
+ ceph_assert(requested_pos <= safe_pos);
+ if (requested_pos == safe_pos) {
+ ldout(cct, 10) << "_issue_read requested_pos = safe_pos = " << safe_pos
+ << ", waiting" << dendl;
+ ceph_assert(write_pos > requested_pos);
+ if (pending_safe.empty()) {
+ _flush(NULL);
+ }
+
+ // Make sure keys of waitfor_safe map are journal entry boundaries.
+ // The key we use here is either next_safe_pos or an old value of
+ // next_safe_pos; next_safe_pos is always set to a journal entry
+ // boundary.
+ auto p = pending_safe.rbegin();
+ if (p != pending_safe.rend())
+ waitfor_safe[p->second].push_back(new C_RetryRead(this));
+ else
+ waitfor_safe[next_safe_pos].push_back(new C_RetryRead(this));
+ return;
+ }
+
+ // don't read too much
+ if (requested_pos + len > safe_pos) {
+ len = safe_pos - requested_pos;
+ ldout(cct, 10) << "_issue_read reading only up to safe_pos " << safe_pos
+ << dendl;
+ }
+
+ // go.
+ ldout(cct, 10) << "_issue_read reading " << requested_pos << "~" << len
+ << ", read pointers read_pos=" << read_pos << " received_pos=" << received_pos
+ << " requested_pos+len=" << (requested_pos+len) << dendl;
+
+ // step by period (object). _don't_ do a single big filer.read()
+ // here because it will wait for all object reads to complete before
+ // giving us back any data. this way we can process whatever bits
+ // come in that are contiguous.
+ uint64_t period = get_layout_period();
+ while (len > 0) {
+ uint64_t e = requested_pos + period;
+ e -= e % period;
+ uint64_t l = e - requested_pos;
+ if (l > len)
+ l = len;
+ C_Read *c = new C_Read(this, requested_pos, l);
+ filer.read(ino, &layout, CEPH_NOSNAP, requested_pos, l, &c->bl, 0,
+ wrap_finisher(c), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
+ requested_pos += l;
+ len -= l;
+ }
+}
+
+void Journaler::_prefetch()
+{
+ if (is_stopping())
+ return;
+
+ ldout(cct, 10) << "_prefetch" << dendl;
+ // prefetch
+ uint64_t pf;
+ if (temp_fetch_len) {
+ ldout(cct, 10) << "_prefetch temp_fetch_len " << temp_fetch_len << dendl;
+ pf = temp_fetch_len;
+ temp_fetch_len = 0;
+ } else {
+ pf = fetch_len;
+ }
+
+ uint64_t raw_target = read_pos + pf;
+
+ // read full log segments, so increase if necessary
+ uint64_t period = get_layout_period();
+ uint64_t remainder = raw_target % period;
+ uint64_t adjustment = remainder ? period - remainder : 0;
+ uint64_t target = raw_target + adjustment;
+
+ // don't read past the log tail
+ if (target > write_pos)
+ target = write_pos;
+
+ if (requested_pos < target) {
+ uint64_t len = target - requested_pos;
+ ldout(cct, 10) << "_prefetch " << pf << " requested_pos " << requested_pos
+ << " < target " << target << " (" << raw_target
+ << "), prefetching " << len << dendl;
+
+ if (pending_safe.empty() && write_pos > safe_pos) {
+ // If we are reading and writing the journal, then we may need
+ // to issue a flush if one isn't already in progress.
+ // Avoid doing a flush every time so that if we do write/read/write/read
+ // we don't end up flushing after every write.
+ ldout(cct, 10) << "_prefetch: requested_pos=" << requested_pos
+ << ", read_pos=" << read_pos
+ << ", write_pos=" << write_pos
+ << ", safe_pos=" << safe_pos << dendl;
+ _do_flush();
+ }
+
+ _issue_read(len);
+ }
+}
+
+
+/*
+ * _is_readable() - return true if next entry is ready.
+ */
+bool Journaler::_is_readable()
+{
+ // anything to read?
+ if (read_pos == write_pos)
+ return false;
+
+ // Check if the retrieve bytestream has enough for an entry
+ uint64_t need;
+ if (journal_stream.readable(read_buf, &need)) {
+ return true;
+ }
+
+ ldout (cct, 10) << "_is_readable read_buf.length() == " << read_buf.length()
+ << ", but need " << need << " for next entry; fetch_len is "
+ << fetch_len << dendl;
+
+ // partial fragment at the end?
+ if (received_pos == write_pos) {
+ ldout(cct, 10) << "is_readable() detected partial entry at tail, "
+ "adjusting write_pos to " << read_pos << dendl;
+
+ // adjust write_pos
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = read_pos;
+ ceph_assert(write_buf.length() == 0);
+ ceph_assert(waitfor_safe.empty());
+
+ // reset read state
+ requested_pos = received_pos = read_pos;
+ read_buf.clear();
+
+ // FIXME: truncate on disk?
+
+ return false;
+ }
+
+ if (need > fetch_len) {
+ temp_fetch_len = need;
+ ldout(cct, 10) << "_is_readable noting temp_fetch_len " << temp_fetch_len
+ << dendl;
+ }
+
+ ldout(cct, 10) << "_is_readable: not readable, returning false" << dendl;
+ return false;
+}
+
+/*
+ * is_readable() - kickstart prefetch, too
+ */
+bool Journaler::is_readable()
+{
+ lock_guard l(lock);
+
+ if (error != 0) {
+ return false;
+ }
+
+ bool r = readable;
+ _prefetch();
+ return r;
+}
+
+class Journaler::C_EraseFinish : public Context {
+ Journaler *journaler;
+ C_OnFinisher *completion;
+ public:
+ C_EraseFinish(Journaler *j, C_OnFinisher *c) : journaler(j), completion(c) {}
+ void finish(int r) override {
+ journaler->_finish_erase(r, completion);
+ }
+};
+
+/**
+ * Entirely erase the journal, including header. For use when you
+ * have already made a copy of the journal somewhere else.
+ */
+void Journaler::erase(Context *completion)
+{
+ lock_guard l(lock);
+
+ // Async delete the journal data
+ uint64_t first = trimmed_pos / get_layout_period();
+ uint64_t num = (write_pos - trimmed_pos) / get_layout_period() + 2;
+ filer.purge_range(ino, &layout, SnapContext(), first, num,
+ ceph::real_clock::now(), 0,
+ wrap_finisher(new C_EraseFinish(
+ this, wrap_finisher(completion))));
+
+ // We will not start the operation to delete the header until
+ // _finish_erase has seen the data deletion succeed: otherwise if
+ // there was an error deleting data we might prematurely delete the
+ // header and thereby lose our reference to the data.
+}
+
+void Journaler::_finish_erase(int data_result, C_OnFinisher *completion)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ completion->complete(-EAGAIN);
+ return;
+ }
+
+ if (data_result == 0) {
+ // Async delete the journal header
+ filer.purge_range(ino, &layout, SnapContext(), 0, 1,
+ ceph::real_clock::now(),
+ 0, wrap_finisher(completion));
+ } else {
+ lderr(cct) << "Failed to delete journal " << ino << " data: "
+ << cpp_strerror(data_result) << dendl;
+ completion->complete(data_result);
+ }
+}
+
+/* try_read_entry(bl)
+ * read entry into bl if it's ready.
+ * otherwise, do nothing.
+ */
+bool Journaler::try_read_entry(bufferlist& bl)
+{
+ lock_guard l(lock);
+
+ if (!readable) {
+ ldout(cct, 10) << "try_read_entry at " << read_pos << " not readable"
+ << dendl;
+ return false;
+ }
+
+ uint64_t start_ptr;
+ size_t consumed;
+ try {
+ consumed = journal_stream.read(read_buf, &bl, &start_ptr);
+ if (stream_format >= JOURNAL_FORMAT_RESILIENT) {
+ ceph_assert(start_ptr == read_pos);
+ }
+ } catch (const buffer::error &e) {
+ lderr(cct) << __func__ << ": decode error from journal_stream" << dendl;
+ error = -EINVAL;
+ return false;
+ }
+
+ ldout(cct, 10) << "try_read_entry at " << read_pos << " read "
+ << read_pos << "~" << consumed << " (have "
+ << read_buf.length() << ")" << dendl;
+
+ read_pos += consumed;
+ try {
+ // We were readable, we might not be any more
+ readable = _is_readable();
+ } catch (const buffer::error &e) {
+ lderr(cct) << __func__ << ": decode error from _is_readable" << dendl;
+ error = -EINVAL;
+ return false;
+ }
+
+ // prefetch?
+ _prefetch();
+
+ // If bufferlist consists of discontiguous memory, decoding types whose
+ // denc_traits needs contiguous memory is inefficient. The bufferlist may
+ // get copied to temporary memory multiple times (copy_shallow() in
+ // src/include/denc.h actually does a deep copy).
+ if (bl.get_num_buffers() > 1)
+ bl.rebuild();
+ return true;
+}
+
+void Journaler::wait_for_readable(Context *onreadable)
+{
+ lock_guard l(lock);
+ if (is_stopping()) {
+ finisher->queue(onreadable, -EAGAIN);
+ return;
+ }
+
+ ceph_assert(on_readable == 0);
+ if (!readable) {
+ ldout(cct, 10) << "wait_for_readable at " << read_pos << " onreadable "
+ << onreadable << dendl;
+ on_readable = wrap_finisher(onreadable);
+ } else {
+ // race with OSD reply
+ finisher->queue(onreadable, 0);
+ }
+}
+
+bool Journaler::have_waiter() const
+{
+ return on_readable != nullptr;
+}
+
+
+
+
+/***************** TRIMMING *******************/
+
+
+class Journaler::C_Trim : public Context {
+ Journaler *ls;
+ uint64_t to;
+public:
+ C_Trim(Journaler *l, int64_t t) : ls(l), to(t) {}
+ void finish(int r) override {
+ ls->_finish_trim(r, to);
+ }
+};
+
+void Journaler::trim()
+{
+ lock_guard l(lock);
+ _trim();
+}
+
+void Journaler::_trim()
+{
+ if (is_stopping())
+ return;
+
+ ceph_assert(!readonly);
+ uint64_t period = get_layout_period();
+ uint64_t trim_to = last_committed.expire_pos;
+ trim_to -= trim_to % period;
+ ldout(cct, 10) << "trim last_commited head was " << last_committed
+ << ", can trim to " << trim_to
+ << dendl;
+ if (trim_to == 0 || trim_to == trimming_pos) {
+ ldout(cct, 10) << "trim already trimmed/trimming to "
+ << trimmed_pos << "/" << trimming_pos << dendl;
+ return;
+ }
+
+ if (trimming_pos > trimmed_pos) {
+ ldout(cct, 10) << "trim already trimming atm, try again later. "
+ "trimmed/trimming is " << trimmed_pos << "/" << trimming_pos << dendl;
+ return;
+ }
+
+ // trim
+ ceph_assert(trim_to <= write_pos);
+ ceph_assert(trim_to <= expire_pos);
+ ceph_assert(trim_to > trimming_pos);
+ ldout(cct, 10) << "trim trimming to " << trim_to
+ << ", trimmed/trimming/expire are "
+ << trimmed_pos << "/" << trimming_pos << "/" << expire_pos
+ << dendl;
+
+ // delete range of objects
+ uint64_t first = trimming_pos / period;
+ uint64_t num = (trim_to - trimming_pos) / period;
+ SnapContext snapc;
+ filer.purge_range(ino, &layout, snapc, first, num,
+ ceph::real_clock::now(), 0,
+ wrap_finisher(new C_Trim(this, trim_to)));
+ trimming_pos = trim_to;
+}
+
+void Journaler::_finish_trim(int r, uint64_t to)
+{
+ lock_guard l(lock);
+
+ ceph_assert(!readonly);
+ ldout(cct, 10) << "_finish_trim trimmed_pos was " << trimmed_pos
+ << ", trimmed/trimming/expire now "
+ << to << "/" << trimming_pos << "/" << expire_pos
+ << dendl;
+ if (r < 0 && r != -ENOENT) {
+ lderr(cct) << "_finish_trim got " << cpp_strerror(r) << dendl;
+ handle_write_error(r);
+ return;
+ }
+
+ ceph_assert(r >= 0 || r == -ENOENT);
+
+ ceph_assert(to <= trimming_pos);
+ ceph_assert(to > trimmed_pos);
+ trimmed_pos = to;
+}
+
+void Journaler::handle_write_error(int r)
+{
+ // lock is locked
+
+ lderr(cct) << "handle_write_error " << cpp_strerror(r) << dendl;
+ if (on_write_error) {
+ on_write_error->complete(r);
+ on_write_error = NULL;
+ called_write_error = true;
+ } else if (called_write_error) {
+ /* We don't call error handler more than once, subsequent errors
+ * are dropped -- this is okay as long as the error handler does
+ * something dramatic like respawn */
+ lderr(cct) << __func__ << ": multiple write errors, handler already called"
+ << dendl;
+ } else {
+ ceph_abort_msg("unhandled write error");
+ }
+}
+
+
+/**
+ * Test whether the 'read_buf' byte stream has enough data to read
+ * an entry
+ *
+ * sets '*need' to the number of bytes needed to advance (enough
+ * to get the next header if header was unavailable, or enough to get the whole
+ * next entry if the header was available but the body wasn't).
+ */
+bool JournalStream::readable(bufferlist &read_buf, uint64_t *need) const
+{
+ ceph_assert(need != NULL);
+
+ uint32_t entry_size = 0;
+ uint64_t entry_sentinel = 0;
+ auto p = read_buf.cbegin();
+
+ // Do we have enough data to decode an entry prefix?
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ *need = sizeof(entry_size) + sizeof(entry_sentinel);
+ } else {
+ *need = sizeof(entry_size);
+ }
+ if (read_buf.length() >= *need) {
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ decode(entry_sentinel, p);
+ if (entry_sentinel != sentinel) {
+ throw buffer::malformed_input("Invalid sentinel");
+ }
+ }
+
+ decode(entry_size, p);
+ } else {
+ return false;
+ }
+
+ // Do we have enough data to decode an entry prefix, payload and suffix?
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ *need = JOURNAL_ENVELOPE_RESILIENT + entry_size;
+ } else {
+ *need = JOURNAL_ENVELOPE_LEGACY + entry_size;
+ }
+ if (read_buf.length() >= *need) {
+ return true; // No more bytes needed
+ }
+
+ return false;
+}
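+
+// Worked example of the two-stage check above (resilient format, 100-byte
+// payload; the payload length is just an example value): the first test needs
+// sizeof(entry_sentinel) + sizeof(entry_size) = 12 bytes; once that prefix
+// decodes, *need becomes JOURNAL_ENVELOPE_RESILIENT + 100 = 120 bytes
+// (prefix + payload + trailing start_ptr) before readable() can return true.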
+
+
+/**
+ * Consume one entry from a journal byte stream 'from', splicing a
+ * serialized LogEvent blob into 'entry'.
+ *
+ * 'entry' must be non null and point to an empty bufferlist.
+ *
+ * 'from' must contain sufficient valid data (i.e. readable is true).
+ *
+ * 'start_ptr' will be set to the entry's start pointer, if the collection
+ * format provides it. It may not be null.
+ *
+ * @returns The number of bytes consumed from the `from` byte stream. Note
+ * that this is not equal to the length of `entry`, which contains
+ * the inner serialized LogEvent and not the envelope.
+ */
+size_t JournalStream::read(bufferlist &from, bufferlist *entry,
+ uint64_t *start_ptr)
+{
+ ceph_assert(start_ptr != NULL);
+ ceph_assert(entry != NULL);
+ ceph_assert(entry->length() == 0);
+
+ uint32_t entry_size = 0;
+
+ // Consume envelope prefix: entry_size and entry_sentinel
+ auto from_ptr = from.cbegin();
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ uint64_t entry_sentinel = 0;
+ decode(entry_sentinel, from_ptr);
+ // Assertion instead of clean check because of precondition of this
+ // fn is that readable() already passed
+ ceph_assert(entry_sentinel == sentinel);
+ }
+ decode(entry_size, from_ptr);
+
+ // Read out the payload
+ from_ptr.copy(entry_size, *entry);
+
+ // Consume the envelope suffix (start_ptr)
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ decode(*start_ptr, from_ptr);
+ } else {
+ *start_ptr = 0;
+ }
+
+ // Trim the input buffer to discard the bytes we have consumed
+ from.splice(0, from_ptr.get_off());
+
+ return from_ptr.get_off();
+}
+
+
+/**
+ * Append one entry
+ */
+size_t JournalStream::write(bufferlist &entry, bufferlist *to,
+ uint64_t const &start_ptr)
+{
+ ceph_assert(to != NULL);
+
+ uint32_t const entry_size = entry.length();
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ encode(sentinel, *to);
+ }
+ encode(entry_size, *to);
+ to->claim_append(entry);
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ encode(start_ptr, *to);
+ }
+
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ return JOURNAL_ENVELOPE_RESILIENT + entry_size;
+ } else {
+ return JOURNAL_ENVELOPE_LEGACY + entry_size;
+ }
+}
+
+/**
+ * set write error callback
+ *
+ * Set a callback/context to trigger if we get a write error from
+ * the objecter. This may be from an explicit request (e.g., flush)
+ * or something async the journaler did on its own (e.g., journal
+ * header update).
+ *
+ * It is only used once; if the caller continues to use the
+ * Journaler and wants to hear about errors, it needs to reset the
+ * error_handler.
+ *
+ * @param c callback/context to trigger on error
+ */
+void Journaler::set_write_error_handler(Context *c) {
+ lock_guard l(lock);
+ ceph_assert(!on_write_error);
+ on_write_error = wrap_finisher(c);
+ called_write_error = false;
+}
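+
+// Illustrative usage (hypothetical; LambdaContext is assumed to be available
+// from include/Context.h, and any Context subclass works just as well):
+//
+//   journaler->set_write_error_handler(new LambdaContext([](int r) {
+//     // e.g. mark the journal damaged or respawn, as suggested in
+//     // handle_write_error(); the handler is one-shot and must be re-set.
+//   }));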
+
+
+/**
+ * Wrap a context in a C_OnFinisher, if it is non-NULL
+ *
+ * Utility function to avoid lots of error-prone and verbose
+ * NULL checking on contexts passed in.
+ */
+C_OnFinisher *Journaler::wrap_finisher(Context *c)
+{
+ if (c != NULL) {
+ return new C_OnFinisher(c, finisher);
+ } else {
+ return NULL;
+ }
+}
+
+void Journaler::shutdown()
+{
+ lock_guard l(lock);
+
+ ldout(cct, 1) << __func__ << dendl;
+
+ state = STATE_STOPPING;
+ readable = false;
+
+ // Kick out anyone reading from journal
+ error = -EAGAIN;
+ if (on_readable) {
+ C_OnFinisher *f = on_readable;
+ on_readable = 0;
+ f->complete(-EAGAIN);
+ }
+
+ list<Context*> ls;
+ ls.swap(waitfor_recover);
+ finish_contexts(cct, ls, -ESHUTDOWN);
+
+ std::map<uint64_t, std::list<Context*> >::iterator i;
+ for (i = waitfor_safe.begin(); i != waitfor_safe.end(); ++i) {
+ finish_contexts(cct, i->second, -EAGAIN);
+ }
+ waitfor_safe.clear();
+}
+
diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h
new file mode 100644
index 000000000..3e8f0f665
--- /dev/null
+++ b/src/osdc/Journaler.h
@@ -0,0 +1,542 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* Journaler
+ *
+ * This class stripes a serial log over objects on the store. Four
+ * logical pointers:
+ *
+ * write_pos - where we're writing new entries
+ * unused_field - where we're reading old entries
+ * expire_pos - what is deemed "old" by user
+ * trimmed_pos - where we're expiring old items
+ *
+ * trimmed_pos <= expire_pos <= unused_field <= write_pos.
+ *
+ * Often, unused_field <= write_pos (as with MDS log). During
+ * recovery, write_pos is undefined until the end of the log is
+ * discovered.
+ *
+ * A "head" struct at the beginning of the log is used to store
+ * metadata at regular intervals. The basic invariants include:
+ *
+ * head.unused_field <= unused_field -- the head may "lag", since
+ * it's updated lazily.
+ * head.write_pos <= write_pos
+ * head.expire_pos <= expire_pos
+ * head.trimmed_pos <= trimmed_pos
+ *
+ * More significantly,
+ *
+ * head.expire_pos >= trimmed_pos -- this ensures we can find the
+ * "beginning" of the log as last
+ * recorded, before it is trimmed.
+ * trimming will block until a
+ * sufficiently current expire_pos
+ * is committed.
+ *
+ * To recover log state, we simply start at the last write_pos in the
+ * head, and probe the object sequence sizes until we read the end.
+ *
+ * Head struct is stored in the first object. Actual journal starts
+ * after layout.period() bytes.
+ *
+ */
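+
+// A purely illustrative snapshot of the pointers above (made-up numbers,
+// assuming a 4 MB layout period), showing the required ordering:
+//
+//   trimmed_pos(8 MB) <= expire_pos(12 MB) <= unused_field(20 MB) <= write_pos(23 MB)
+//
+// while the lazily written head may lag, e.g. head.write_pos = 20 MB.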
+
+#ifndef CEPH_JOURNALER_H
+#define CEPH_JOURNALER_H
+
+#include <list>
+#include <map>
+
+#include "Objecter.h"
+#include "Filer.h"
+
+#include "common/Timer.h"
+#include "common/Throttle.h"
+#include "include/common_fwd.h"
+
+class Context;
+class Finisher;
+class C_OnFinisher;
+
+typedef __u8 stream_format_t;
+
+// Legacy envelope is leading uint32_t size
+enum StreamFormat {
+ JOURNAL_FORMAT_LEGACY = 0,
+ JOURNAL_FORMAT_RESILIENT = 1,
+ // Insert new formats here, before COUNT
+ JOURNAL_FORMAT_COUNT
+};
+
+// Highest journal format version that we support
+#define JOURNAL_FORMAT_MAX (JOURNAL_FORMAT_COUNT - 1)
+
+// Legacy envelope is leading uint32_t size
+#define JOURNAL_ENVELOPE_LEGACY (sizeof(uint32_t))
+
+// Resilient envelope is leading uint64_t sentinel, uint32_t size,
+// trailing uint64_t start_ptr
+#define JOURNAL_ENVELOPE_RESILIENT (sizeof(uint32_t) + sizeof(uint64_t) + \
+ sizeof(uint64_t))
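+
+// Byte-layout sketch of a single resilient-format entry (sizes follow from
+// the macros above; the 100-byte payload is an example value):
+//
+//   [sentinel: 8][size: 4][payload: size bytes][start_ptr: 8]
+//
+// so a 100-byte LogEvent occupies 100 + JOURNAL_ENVELOPE_RESILIENT = 120 bytes
+// in the stream, versus 100 + JOURNAL_ENVELOPE_LEGACY = 104 bytes for legacy.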
+
+/**
+ * Represents a collection of entries serialized in a byte stream.
+ *
+ * Each entry consists of:
+ * - a blob (used by the next level up as a serialized LogEvent)
+ * - a uint64_t (used by the next level up as a pointer to the start
+ * of the entry in the collection bytestream)
+ */
+class JournalStream
+{
+ stream_format_t format;
+
+ public:
+ JournalStream(stream_format_t format_) : format(format_) {}
+
+ void set_format(stream_format_t format_) {format = format_;}
+
+ bool readable(bufferlist &bl, uint64_t *need) const;
+ size_t read(bufferlist &from, bufferlist *to, uint64_t *start_ptr);
+ size_t write(bufferlist &entry, bufferlist *to, uint64_t const &start_ptr);
+ size_t get_envelope_size() const {
+ if (format >= JOURNAL_FORMAT_RESILIENT) {
+ return JOURNAL_ENVELOPE_RESILIENT;
+ } else {
+ return JOURNAL_ENVELOPE_LEGACY;
+ }
+ }
+
+ // A magic number for the start of journal entries, so that we can
+ // identify them in damaged journals.
+ static const uint64_t sentinel = 0x3141592653589793;
+};
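+
+// Minimal round-trip sketch (illustrative only; `entry_in` is an assumed
+// bufferlist holding one serialized LogEvent -- note that write() claims its
+// contents):
+//
+//   JournalStream js(JOURNAL_FORMAT_RESILIENT);
+//   bufferlist stream;
+//   size_t wrote = js.write(entry_in, &stream, 0 /*start_ptr*/);
+//
+//   uint64_t need = 0, start_ptr = 0;
+//   if (js.readable(stream, &need)) {          // whole entry present?
+//     bufferlist entry_out;
+//     size_t consumed = js.read(stream, &entry_out, &start_ptr);
+//     // consumed == wrote; entry_out holds the original payload
+//   }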
+
+
+class Journaler {
+public:
+ // this goes at the head of the log "file".
+ class Header {
+ public:
+ uint64_t trimmed_pos;
+ uint64_t expire_pos;
+ uint64_t unused_field;
+ uint64_t write_pos;
+ string magic;
+ file_layout_t layout; //< The mapping from byte stream offsets
+ // to RADOS objects
+ stream_format_t stream_format; //< The encoding of LogEvents
+ // within the journal byte stream
+
+ Header(const char *m="") :
+ trimmed_pos(0), expire_pos(0), unused_field(0), write_pos(0), magic(m),
+ stream_format(-1) {
+ }
+
+ void encode(bufferlist &bl) const {
+ ENCODE_START(2, 2, bl);
+ encode(magic, bl);
+ encode(trimmed_pos, bl);
+ encode(expire_pos, bl);
+ encode(unused_field, bl);
+ encode(write_pos, bl);
+ encode(layout, bl, 0); // encode in legacy format
+ encode(stream_format, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator &bl) {
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(magic, bl);
+ decode(trimmed_pos, bl);
+ decode(expire_pos, bl);
+ decode(unused_field, bl);
+ decode(write_pos, bl);
+ decode(layout, bl);
+ if (struct_v > 1) {
+ decode(stream_format, bl);
+ } else {
+ stream_format = JOURNAL_FORMAT_LEGACY;
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const {
+ f->open_object_section("journal_header");
+ {
+ f->dump_string("magic", magic);
+ f->dump_unsigned("write_pos", write_pos);
+ f->dump_unsigned("expire_pos", expire_pos);
+ f->dump_unsigned("trimmed_pos", trimmed_pos);
+ f->dump_unsigned("stream_format", stream_format);
+ f->dump_object("layout", layout);
+ }
+ f->close_section(); // journal_header
+ }
+
+ static void generate_test_instances(list<Header*> &ls)
+ {
+ ls.push_back(new Header());
+
+ ls.push_back(new Header());
+ ls.back()->trimmed_pos = 1;
+ ls.back()->expire_pos = 2;
+ ls.back()->unused_field = 3;
+ ls.back()->write_pos = 4;
+ ls.back()->magic = "magique";
+
+ ls.push_back(new Header());
+ ls.back()->stream_format = JOURNAL_FORMAT_RESILIENT;
+ }
+ };
+ WRITE_CLASS_ENCODER(Header)
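+
+  // A minimal round-trip sketch for some Header h using the encoder above:
+  //
+  //   bufferlist bl;
+  //   h.encode(bl);
+  //   Header h2;
+  //   auto p = bl.cbegin();
+  //   h2.decode(p);
+  //
+  // Headers encoded before version 2 decode with stream_format defaulting
+  // to JOURNAL_FORMAT_LEGACY.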
+
+ uint32_t get_stream_format() const {
+ return stream_format;
+ }
+
+ Header last_committed;
+
+private:
+ // me
+ CephContext *cct;
+ std::mutex lock;
+ const std::string name;
+ typedef std::lock_guard<std::mutex> lock_guard;
+ typedef std::unique_lock<std::mutex> unique_lock;
+ Finisher *finisher;
+ Header last_written;
+ inodeno_t ino;
+ int64_t pg_pool;
+ bool readonly;
+ file_layout_t layout;
+ uint32_t stream_format;
+ JournalStream journal_stream;
+
+ const char *magic;
+ Objecter *objecter;
+ Filer filer;
+
+ PerfCounters *logger;
+ int logger_key_lat;
+
+ class C_DelayFlush;
+ C_DelayFlush *delay_flush_event;
+ /*
+ * Do a flush as a result of a C_DelayFlush context.
+ */
+ void _do_delayed_flush()
+ {
+ ceph_assert(delay_flush_event != NULL);
+ lock_guard l(lock);
+ delay_flush_event = NULL;
+ _do_flush();
+ }
+
+ // my state
+ static const int STATE_UNDEF = 0;
+ static const int STATE_READHEAD = 1;
+ static const int STATE_PROBING = 2;
+ static const int STATE_ACTIVE = 3;
+ static const int STATE_REREADHEAD = 4;
+ static const int STATE_REPROBING = 5;
+ static const int STATE_STOPPING = 6;
+
+ int state;
+ int error;
+
+ void _write_head(Context *oncommit=NULL);
+ void _wait_for_flush(Context *onsafe);
+ void _trim();
+
+ // header
+ ceph::real_time last_wrote_head;
+ void _finish_write_head(int r, Header &wrote, C_OnFinisher *oncommit);
+ class C_WriteHead;
+ friend class C_WriteHead;
+
+ void _reread_head(Context *onfinish);
+ void _set_layout(file_layout_t const *l);
+ list<Context*> waitfor_recover;
+ void _read_head(Context *on_finish, bufferlist *bl);
+ void _finish_read_head(int r, bufferlist& bl);
+ void _finish_reread_head(int r, bufferlist& bl, Context *finish);
+ void _probe(Context *finish, uint64_t *end);
+ void _finish_probe_end(int r, uint64_t end);
+ void _reprobe(C_OnFinisher *onfinish);
+ void _finish_reprobe(int r, uint64_t end, C_OnFinisher *onfinish);
+ void _finish_reread_head_and_probe(int r, C_OnFinisher *onfinish);
+ class C_ReadHead;
+ friend class C_ReadHead;
+ class C_ProbeEnd;
+ friend class C_ProbeEnd;
+ class C_RereadHead;
+ friend class C_RereadHead;
+ class C_ReProbe;
+ friend class C_ReProbe;
+ class C_RereadHeadProbe;
+ friend class C_RereadHeadProbe;
+
+ // writer
+ uint64_t prezeroing_pos;
+ uint64_t prezero_pos; ///< we zero journal space ahead of write_pos to
+ // avoid problems with tail probing
+ uint64_t write_pos; ///< logical write position, where next entry
+ // will go
+ uint64_t flush_pos; ///< where we will flush. if
+ /// write_pos>flush_pos, we're buffering writes.
+ uint64_t safe_pos; ///< what has been committed safely to disk.
+
+ uint64_t next_safe_pos; /// start position of the first entry that isn't
+ /// being fully flushed. If we don't flush any
+ // partial entry, it's equal to flush_pos.
+
+ bufferlist write_buf; ///< write buffer. flush_pos +
+ /// write_buf.length() == write_pos.
+
+ // protect write_buf from bufferlist _len overflow
+ Throttle write_buf_throttle;
+
+ uint64_t waiting_for_zero_pos;
+ interval_set<uint64_t> pending_zero; // non-contig bits we've zeroed
+ list<Context*> waitfor_prezero;
+
+ std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos
+ // when safe through given offset
+ std::map<uint64_t, std::list<Context*> > waitfor_safe;
+
+ void _flush(C_OnFinisher *onsafe);
+ void _do_flush(unsigned amount=0);
+ void _finish_flush(int r, uint64_t start, ceph::real_time stamp);
+ class C_Flush;
+ friend class C_Flush;
+
+ // reader
+ uint64_t read_pos; // logical read position, where next entry starts.
+ uint64_t requested_pos; // what we've requested from OSD.
+ uint64_t received_pos; // what we've received from OSD.
+  // read buffer. read_pos + read_buf.length() == received_pos.
+ bufferlist read_buf;
+
+ map<uint64_t,bufferlist> prefetch_buf;
+
+ uint64_t fetch_len; // how much to read at a time
+ uint64_t temp_fetch_len;
+
+ // for wait_for_readable()
+ C_OnFinisher *on_readable;
+ C_OnFinisher *on_write_error;
+ bool called_write_error;
+
+ // read completion callback
+ void _finish_read(int r, uint64_t offset, uint64_t length, bufferlist &bl);
+ void _finish_retry_read(int r);
+ void _assimilate_prefetch();
+ void _issue_read(uint64_t len); // read some more
+ void _prefetch(); // maybe read ahead
+ class C_Read;
+ friend class C_Read;
+ class C_RetryRead;
+ friend class C_RetryRead;
+
+ // trimmer
+ uint64_t expire_pos; // what we're allowed to trim to
+ uint64_t trimming_pos; // what we've requested to trim through
+ uint64_t trimmed_pos; // what has been trimmed
+
+ bool readable;
+
+ void _finish_trim(int r, uint64_t to);
+ class C_Trim;
+ friend class C_Trim;
+
+ void _issue_prezero();
+ void _finish_prezero(int r, uint64_t from, uint64_t len);
+ friend struct C_Journaler_Prezero;
+
+ // only init_headers when following or first reading off-disk
+ void init_headers(Header& h) {
+ ceph_assert(readonly ||
+ state == STATE_READHEAD ||
+ state == STATE_REREADHEAD);
+ last_written = last_committed = h;
+ }
+
+ /**
+ * handle a write error
+ *
+ * called when we get an objecter error on a write.
+ *
+ * @param r error code
+ */
+ void handle_write_error(int r);
+
+ bool _is_readable();
+
+ void _finish_erase(int data_result, C_OnFinisher *completion);
+ class C_EraseFinish;
+ friend class C_EraseFinish;
+
+ C_OnFinisher *wrap_finisher(Context *c);
+
+ uint32_t write_iohint; // the fadvise flags for write op, see
+                         // CEPH_OSD_OP_FADVISE_*
+
+public:
+ Journaler(const std::string &name_, inodeno_t ino_, int64_t pool,
+ const char *mag, Objecter *obj, PerfCounters *l, int lkey, Finisher *f) :
+ last_committed(mag),
+ cct(obj->cct), name(name_), finisher(f), last_written(mag),
+ ino(ino_), pg_pool(pool), readonly(true),
+ stream_format(-1), journal_stream(-1),
+ magic(mag),
+ objecter(obj), filer(objecter, f), logger(l), logger_key_lat(lkey),
+ delay_flush_event(0),
+ state(STATE_UNDEF), error(0),
+ prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0),
+ safe_pos(0), next_safe_pos(0),
+ write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX >> 3)),
+ waiting_for_zero_pos(0),
+ read_pos(0), requested_pos(0), received_pos(0),
+ fetch_len(0), temp_fetch_len(0),
+ on_readable(0), on_write_error(NULL), called_write_error(false),
+ expire_pos(0), trimming_pos(0), trimmed_pos(0), readable(false),
+ write_iohint(0)
+ {
+ }
+
+ /* reset
+ *
+ * NOTE: we assume the caller knows/has ensured that any objects in
+   * our sequence do not exist, e.g. after a MKFS. This is _not_ an
+ * "erase" method.
+ */
+ void reset() {
+ lock_guard l(lock);
+ ceph_assert(state == STATE_ACTIVE);
+
+ readonly = true;
+ delay_flush_event = NULL;
+ state = STATE_UNDEF;
+ error = 0;
+ prezeroing_pos = 0;
+ prezero_pos = 0;
+ write_pos = 0;
+ flush_pos = 0;
+ safe_pos = 0;
+ next_safe_pos = 0;
+ read_pos = 0;
+ requested_pos = 0;
+ received_pos = 0;
+ fetch_len = 0;
+ ceph_assert(!on_readable);
+ expire_pos = 0;
+ trimming_pos = 0;
+ trimmed_pos = 0;
+ waiting_for_zero_pos = 0;
+ }
+
+ // Asynchronous operations
+ // =======================
+ void erase(Context *completion);
+ void create(file_layout_t *layout, stream_format_t const sf);
+ void recover(Context *onfinish);
+ void reread_head(Context *onfinish);
+ void reread_head_and_probe(Context *onfinish);
+ void write_head(Context *onsave=0);
+ void wait_for_flush(Context *onsafe = 0);
+ void flush(Context *onsafe = 0);
+ void wait_for_readable(Context *onfinish);
+ bool have_waiter() const;
+ void wait_for_prezero(Context *onfinish);
+
+ // Synchronous setters
+ // ===================
+ void set_layout(file_layout_t const *l);
+ void set_readonly();
+ void set_writeable();
+ void set_write_pos(uint64_t p) {
+ lock_guard l(lock);
+ prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = p;
+ }
+ void set_read_pos(uint64_t p) {
+ lock_guard l(lock);
+ // we can't cope w/ in-progress read right now.
+ ceph_assert(requested_pos == received_pos);
+ read_pos = requested_pos = received_pos = p;
+ read_buf.clear();
+ }
+ uint64_t append_entry(bufferlist& bl);
+ void set_expire_pos(uint64_t ep) {
+ lock_guard l(lock);
+ expire_pos = ep;
+ }
+ void set_trimmed_pos(uint64_t p) {
+ lock_guard l(lock);
+ trimming_pos = trimmed_pos = p;
+ }
+
+ bool _write_head_needed();
+ bool write_head_needed() {
+ lock_guard l(lock);
+ return _write_head_needed();
+ }
+
+
+ void trim();
+ void trim_tail() {
+ lock_guard l(lock);
+
+ ceph_assert(!readonly);
+ _issue_prezero();
+ }
+
+ void set_write_error_handler(Context *c);
+
+ void set_write_iohint(uint32_t iohint_flags) {
+ write_iohint = iohint_flags;
+ }
+ /**
+ * Cause any ongoing waits to error out with -EAGAIN, set error
+ * to -EAGAIN.
+ */
+ void shutdown();
+public:
+
+ // Synchronous getters
+ // ===================
+ // TODO: need some locks on reads for true safety
+ uint64_t get_layout_period() const {
+ return layout.get_period();
+ }
+ file_layout_t& get_layout() { return layout; }
+ bool is_active() { return state == STATE_ACTIVE; }
+ bool is_stopping() { return state == STATE_STOPPING; }
+ int get_error() { return error; }
+ bool is_readonly() { return readonly; }
+ bool is_readable();
+ bool try_read_entry(bufferlist& bl);
+ uint64_t get_write_pos() const { return write_pos; }
+ uint64_t get_write_safe_pos() const { return safe_pos; }
+ uint64_t get_read_pos() const { return read_pos; }
+ uint64_t get_expire_pos() const { return expire_pos; }
+ uint64_t get_trimmed_pos() const { return trimmed_pos; }
+ size_t get_journal_envelope_size() const {
+ return journal_stream.get_envelope_size();
+ }
+};
+WRITE_CLASS_ENCODER(Journaler::Header)
+
+#endif
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
new file mode 100644
index 000000000..f2b6d9736
--- /dev/null
+++ b/src/osdc/ObjectCacher.cc
@@ -0,0 +1,2807 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <limits.h>
+
+#include "msg/Messenger.h"
+#include "ObjectCacher.h"
+#include "WritebackHandler.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+
+#include "include/ceph_assert.h"
+
+#define MAX_FLUSH_UNDER_LOCK 20   ///< max bh's we start writeback on
+                                  ///< while holding the lock
+#define BUFFER_MEMORY_WEIGHT CEPH_PAGE_SHIFT  // memory usage of BufferHead, count in (1<<n)
+
+using std::chrono::seconds;
+using std::list;
+using std::map;
+using std::make_pair;
+using std::pair;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+
+using namespace std::literals;
+
+/*** ObjectCacher::BufferHead ***/
+
+
+/*** ObjectCacher::Object ***/
+
+#define dout_subsys ceph_subsys_objectcacher
+#undef dout_prefix
+#define dout_prefix *_dout << "objectcacher.object(" << oid << ") "
+
+
+
+class ObjectCacher::C_ReadFinish : public Context {
+ ObjectCacher *oc;
+ int64_t poolid;
+ sobject_t oid;
+ loff_t start;
+ uint64_t length;
+ xlist<C_ReadFinish*>::item set_item;
+ bool trust_enoent;
+ ceph_tid_t tid;
+ ZTracer::Trace trace;
+
+public:
+ bufferlist bl;
+ C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s,
+ uint64_t l, const ZTracer::Trace &trace) :
+ oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l),
+ set_item(this), trust_enoent(true),
+ tid(t), trace(trace) {
+ ob->reads.push_back(&set_item);
+ }
+
+ void finish(int r) override {
+ oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent);
+ trace.event("finish");
+
+ // object destructor clears the list
+ if (set_item.is_on_list())
+ set_item.remove_myself();
+ }
+
+ void distrust_enoent() {
+ trust_enoent = false;
+ }
+};
+
+class ObjectCacher::C_RetryRead : public Context {
+ ObjectCacher *oc;
+ OSDRead *rd;
+ ObjectSet *oset;
+ Context *onfinish;
+ ZTracer::Trace trace;
+public:
+ C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c,
+ const ZTracer::Trace &trace)
+ : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) {
+ }
+ void finish(int r) override {
+ if (r >= 0) {
+ r = oc->_readx(rd, oset, onfinish, false, &trace);
+ }
+
+ if (r == 0) {
+ // read is still in-progress
+ return;
+ }
+
+ trace.event("finish");
+ if (onfinish) {
+ onfinish->complete(r);
+ }
+ }
+};
+
+ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left,
+ loff_t off)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ ldout(oc->cct, 20) << "split " << *left << " at " << off << dendl;
+
+ // split off right
+ ObjectCacher::BufferHead *right = new BufferHead(this);
+
+  // inherit the dontneed/nocache hints; a later access clears them automatically
+ right->set_dontneed(left->get_dontneed());
+ right->set_nocache(left->get_nocache());
+
+ right->last_write_tid = left->last_write_tid;
+ right->last_read_tid = left->last_read_tid;
+ right->set_state(left->get_state());
+ right->set_error(left->error);
+ right->snapc = left->snapc;
+ right->set_journal_tid(left->journal_tid);
+
+ loff_t newleftlen = off - left->start();
+ right->set_start(off);
+ right->set_length(left->length() - newleftlen);
+
+ // shorten left
+ oc->bh_stat_sub(left);
+ left->set_length(newleftlen);
+ oc->bh_stat_add(left);
+
+ // add right
+ oc->bh_add(this, right);
+
+ // split buffers too
+ bufferlist bl;
+ bl = std::move(left->bl);
+ if (bl.length()) {
+ ceph_assert(bl.length() == (left->length() + right->length()));
+ right->bl.substr_of(bl, left->length(), right->length());
+ left->bl.substr_of(bl, 0, left->length());
+ }
+
+ // move read waiters
+ if (!left->waitfor_read.empty()) {
+ auto start_remove = left->waitfor_read.begin();
+ while (start_remove != left->waitfor_read.end() &&
+ start_remove->first < right->start())
+ ++start_remove;
+ for (auto p = start_remove; p != left->waitfor_read.end(); ++p) {
+ ldout(oc->cct, 20) << "split moving waiters at byte " << p->first
+ << " to right bh" << dendl;
+ right->waitfor_read[p->first].swap( p->second );
+ ceph_assert(p->second.empty());
+ }
+ left->waitfor_read.erase(start_remove, left->waitfor_read.end());
+ }
+
+ ldout(oc->cct, 20) << "split left is " << *left << dendl;
+ ldout(oc->cct, 20) << "split right is " << *right << dendl;
+ return right;
+}
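+
+// Example: split(bh, 40) on a bh covering [0, 100) shrinks the left bh to
+// [0, 40) and creates a right bh covering [40, 100); the buffered data and
+// any read waiters at offsets >= 40 move to the right half.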
+
+
+void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+
+ ldout(oc->cct, 10) << "merge_left " << *left << " + " << *right << dendl;
+ if (left->get_journal_tid() == 0) {
+ left->set_journal_tid(right->get_journal_tid());
+ }
+ right->set_journal_tid(0);
+
+ oc->bh_remove(this, right);
+ oc->bh_stat_sub(left);
+ left->set_length(left->length() + right->length());
+ oc->bh_stat_add(left);
+
+ // data
+ left->bl.claim_append(right->bl);
+
+ // version
+ // note: this is sorta busted, but should only be used for dirty buffers
+ left->last_write_tid = std::max( left->last_write_tid, right->last_write_tid );
+ left->last_write = std::max( left->last_write, right->last_write );
+
+ left->set_dontneed(right->get_dontneed() ? left->get_dontneed() : false);
+ left->set_nocache(right->get_nocache() ? left->get_nocache() : false);
+
+ // waiters
+ for (auto p = right->waitfor_read.begin();
+ p != right->waitfor_read.end();
+ ++p)
+ left->waitfor_read[p->first].splice(left->waitfor_read[p->first].begin(),
+ p->second );
+
+ // hose right
+ delete right;
+
+ ldout(oc->cct, 10) << "merge_left result " << *left << dendl;
+}
+
+bool ObjectCacher::Object::can_merge_bh(BufferHead *left, BufferHead *right)
+{
+ if (left->end() != right->start() ||
+ left->get_state() != right->get_state() ||
+ !left->can_merge_journal(right))
+ return false;
+ if (left->is_tx() && left->last_write_tid != right->last_write_tid)
+ return false;
+ return true;
+}
+
+void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ ldout(oc->cct, 10) << "try_merge_bh " << *bh << dendl;
+
+ // do not merge rx buffers; last_read_tid may not match
+ if (bh->is_rx())
+ return;
+
+ // to the left?
+ auto p = data.find(bh->start());
+ ceph_assert(p->second == bh);
+ if (p != data.begin()) {
+ --p;
+ if (can_merge_bh(p->second, bh)) {
+ merge_left(p->second, bh);
+ bh = p->second;
+ } else {
+ ++p;
+ }
+ }
+ // to the right?
+ ceph_assert(p->second == bh);
+ ++p;
+ if (p != data.end() && can_merge_bh(bh, p->second))
+ merge_left(bh, p->second);
+
+ maybe_rebuild_buffer(bh);
+}
+
+void ObjectCacher::Object::maybe_rebuild_buffer(BufferHead *bh)
+{
+ auto& bl = bh->bl;
+ if (bl.get_num_buffers() <= 1)
+ return;
+
+ auto wasted = bl.get_wasted_space();
+ if (wasted * 2 > bl.length() &&
+ wasted > (1U << BUFFER_MEMORY_WEIGHT))
+ bl.rebuild();
+}
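+
+// Example: a bh whose bufferlist is spread over several fragments is rebuilt
+// into a single contiguous buffer once the wasted space exceeds both half of
+// the data length and 1U << BUFFER_MEMORY_WEIGHT bytes (a page).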
+
+/*
+ * check whether we have the given byte range fully cached
+ */
+bool ObjectCacher::Object::is_cached(loff_t cur, loff_t left) const
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ auto p = data_lower_bound(cur);
+ while (left > 0) {
+ if (p == data.end())
+ return false;
+
+ if (p->first <= cur) {
+ // have part of it
+ loff_t lenfromcur = std::min(p->second->end() - cur, left);
+ cur += lenfromcur;
+ left -= lenfromcur;
+ ++p;
+ continue;
+ } else if (p->first > cur) {
+ // gap
+ return false;
+ } else
+ ceph_abort();
+ }
+
+ return true;
+}
+
+/*
+ * return true if all of this object's cached data lies within [off, off+len]
+ */
+bool ObjectCacher::Object::include_all_cached_data(loff_t off, loff_t len)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ if (data.empty())
+ return true;
+ auto first = data.begin();
+ auto last = data.rbegin();
+ if (first->second->start() >= off && last->second->end() <= (off + len))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * map a range of bytes into buffer_heads.
+ * - create missing buffer_heads as necessary.
+ */
+int ObjectCacher::Object::map_read(ObjectExtent &ex,
+ map<loff_t, BufferHead*>& hits,
+ map<loff_t, BufferHead*>& missing,
+ map<loff_t, BufferHead*>& rx,
+ map<loff_t, BufferHead*>& errors)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ ldout(oc->cct, 10) << "map_read " << ex.oid << " "
+ << ex.offset << "~" << ex.length << dendl;
+
+ loff_t cur = ex.offset;
+ loff_t left = ex.length;
+
+ auto p = data_lower_bound(ex.offset);
+ while (left > 0) {
+ // at end?
+ if (p == data.end()) {
+ // rest is a miss.
+ BufferHead *n = new BufferHead(this);
+ n->set_start(cur);
+ n->set_length(left);
+ oc->bh_add(this, n);
+ if (complete) {
+ oc->mark_zero(n);
+ hits[cur] = n;
+ ldout(oc->cct, 20) << "map_read miss+complete+zero " << left << " left, " << *n << dendl;
+ } else {
+ missing[cur] = n;
+ ldout(oc->cct, 20) << "map_read miss " << left << " left, " << *n << dendl;
+ }
+ cur += left;
+ ceph_assert(cur == (loff_t)ex.offset + (loff_t)ex.length);
+ break; // no more.
+ }
+
+ if (p->first <= cur) {
+ // have it (or part of it)
+ BufferHead *e = p->second;
+
+ if (e->is_clean() ||
+ e->is_dirty() ||
+ e->is_tx() ||
+ e->is_zero()) {
+ hits[cur] = e; // readable!
+ ldout(oc->cct, 20) << "map_read hit " << *e << dendl;
+ } else if (e->is_rx()) {
+ rx[cur] = e; // missing, not readable.
+ ldout(oc->cct, 20) << "map_read rx " << *e << dendl;
+ } else if (e->is_error()) {
+ errors[cur] = e;
+ ldout(oc->cct, 20) << "map_read error " << *e << dendl;
+ } else {
+ ceph_abort();
+ }
+
+ loff_t lenfromcur = std::min(e->end() - cur, left);
+ cur += lenfromcur;
+ left -= lenfromcur;
+ ++p;
+ continue; // more?
+
+ } else if (p->first > cur) {
+ // gap.. miss
+ loff_t next = p->first;
+ BufferHead *n = new BufferHead(this);
+ loff_t len = std::min(next - cur, left);
+ n->set_start(cur);
+ n->set_length(len);
+ oc->bh_add(this,n);
+ if (complete) {
+ oc->mark_zero(n);
+ hits[cur] = n;
+ ldout(oc->cct, 20) << "map_read gap+complete+zero " << *n << dendl;
+ } else {
+ missing[cur] = n;
+ ldout(oc->cct, 20) << "map_read gap " << *n << dendl;
+ }
+ cur += std::min(left, n->length());
+ left -= std::min(left, n->length());
+ continue; // more?
+ } else {
+ ceph_abort();
+ }
+ }
+ return 0;
+}
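+
+// Example: map_read of 0~100 against a cache holding only a clean bh at 20~10
+// returns that bh in hits[20] plus two newly created bhs, 0~20 and 30~70, in
+// missing; if the object is known complete, those gaps are created as zero
+// bhs and returned as hits instead.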
+
+void ObjectCacher::Object::audit_buffers()
+{
+ loff_t offset = 0;
+ for (auto it = data.begin(); it != data.end(); ++it) {
+ if (it->first != it->second->start()) {
+ lderr(oc->cct) << "AUDIT FAILURE: map position " << it->first
+ << " does not match bh start position: "
+ << *it->second << dendl;
+ ceph_assert(it->first == it->second->start());
+ }
+ if (it->first < offset) {
+ lderr(oc->cct) << "AUDIT FAILURE: " << it->first << " " << *it->second
+ << " overlaps with previous bh " << *((--it)->second)
+ << dendl;
+ ceph_assert(it->first >= offset);
+ }
+ BufferHead *bh = it->second;
+ for (auto w_it = bh->waitfor_read.begin();
+ w_it != bh->waitfor_read.end(); ++w_it) {
+ if (w_it->first < bh->start() ||
+ w_it->first >= bh->start() + bh->length()) {
+ lderr(oc->cct) << "AUDIT FAILURE: waiter at " << w_it->first
+ << " is not within bh " << *bh << dendl;
+ ceph_assert(w_it->first >= bh->start());
+ ceph_assert(w_it->first < bh->start() + bh->length());
+ }
+ }
+ offset = it->first + it->second->length();
+ }
+}
+
+/*
+ * map a range of extents on an object's buffer cache.
+ * - combine any bh's we're writing into one
+ * - break up bufferheads that don't fall completely within the range
+ * note: the returned bh covers exactly the write range; it does not pull in
+ * other dirty data to the left and/or right.
+ */
+ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
+ ceph_tid_t tid)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ BufferHead *final = 0;
+
+ ldout(oc->cct, 10) << "map_write oex " << ex.oid
+ << " " << ex.offset << "~" << ex.length << dendl;
+
+ loff_t cur = ex.offset;
+ loff_t left = ex.length;
+
+ auto p = data_lower_bound(ex.offset);
+ while (left > 0) {
+ loff_t max = left;
+
+ // at end ?
+ if (p == data.end()) {
+ if (final == NULL) {
+ final = new BufferHead(this);
+ replace_journal_tid(final, tid);
+ final->set_start( cur );
+ final->set_length( max );
+ oc->bh_add(this, final);
+ ldout(oc->cct, 10) << "map_write adding trailing bh " << *final << dendl;
+ } else {
+ oc->bh_stat_sub(final);
+ final->set_length(final->length() + max);
+ oc->bh_stat_add(final);
+ }
+ left -= max;
+ cur += max;
+ continue;
+ }
+
+ ldout(oc->cct, 10) << "cur is " << cur << ", p is " << *p->second << dendl;
+ //oc->verify_stats();
+
+ if (p->first <= cur) {
+ BufferHead *bh = p->second;
+ ldout(oc->cct, 10) << "map_write bh " << *bh << " intersected" << dendl;
+
+ if (p->first < cur) {
+ ceph_assert(final == 0);
+ if (cur + max >= bh->end()) {
+ // we want right bit (one splice)
+ final = split(bh, cur); // just split it, take right half.
+ maybe_rebuild_buffer(bh);
+ replace_journal_tid(final, tid);
+ ++p;
+ ceph_assert(p->second == final);
+ } else {
+ // we want middle bit (two splices)
+ final = split(bh, cur);
+ maybe_rebuild_buffer(bh);
+ ++p;
+ ceph_assert(p->second == final);
+ auto right = split(final, cur+max);
+ maybe_rebuild_buffer(right);
+ replace_journal_tid(final, tid);
+ }
+ } else {
+ ceph_assert(p->first == cur);
+ if (bh->length() <= max) {
+ // whole bufferhead, piece of cake.
+ } else {
+ // we want left bit (one splice)
+ auto right = split(bh, cur + max); // just split
+ maybe_rebuild_buffer(right);
+ }
+ if (final) {
+ oc->mark_dirty(bh);
+ oc->mark_dirty(final);
+ --p; // move iterator back to final
+ ceph_assert(p->second == final);
+ replace_journal_tid(bh, tid);
+ merge_left(final, bh);
+ } else {
+ final = bh;
+ replace_journal_tid(final, tid);
+ }
+ }
+
+ // keep going.
+ loff_t lenfromcur = final->end() - cur;
+ cur += lenfromcur;
+ left -= lenfromcur;
+ ++p;
+ continue;
+ } else {
+ // gap!
+ loff_t next = p->first;
+ loff_t glen = std::min(next - cur, max);
+ ldout(oc->cct, 10) << "map_write gap " << cur << "~" << glen << dendl;
+ if (final) {
+ oc->bh_stat_sub(final);
+ final->set_length(final->length() + glen);
+ oc->bh_stat_add(final);
+ } else {
+ final = new BufferHead(this);
+ replace_journal_tid(final, tid);
+ final->set_start( cur );
+ final->set_length( glen );
+ oc->bh_add(this, final);
+ }
+
+ cur += glen;
+ left -= glen;
+ continue; // more?
+ }
+ }
+
+ // set version
+ ceph_assert(final);
+ ceph_assert(final->get_journal_tid() == tid);
+ ldout(oc->cct, 10) << "map_write final is " << *final << dendl;
+
+ return final;
+}
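+
+// Example: map_write of 10~20 against a cache holding only a bh at 0~15
+// splits that bh at offset 10 and grows the resulting right half across the
+// trailing gap, handing back a single bh covering 10~20 for the caller to
+// fill and mark dirty.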
+
+void ObjectCacher::Object::replace_journal_tid(BufferHead *bh,
+ ceph_tid_t tid) {
+ ceph_tid_t bh_tid = bh->get_journal_tid();
+
+ ceph_assert(tid == 0 || bh_tid <= tid);
+ if (bh_tid != 0 && bh_tid != tid) {
+ // inform journal that it should not expect a writeback from this extent
+ oc->writeback_handler.overwrite_extent(get_oid(), bh->start(),
+ bh->length(), bh_tid, tid);
+ }
+ bh->set_journal_tid(tid);
+}
+
+void ObjectCacher::Object::truncate(loff_t s)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ ldout(oc->cct, 10) << "truncate " << *this << " to " << s << dendl;
+
+ std::list<Context*> waiting_for_read;
+ while (!data.empty()) {
+ BufferHead *bh = data.rbegin()->second;
+ if (bh->end() <= s)
+ break;
+
+ // split bh at truncation point?
+ if (bh->start() < s) {
+ split(bh, s);
+ maybe_rebuild_buffer(bh);
+ continue;
+ }
+
+ // remove bh entirely
+ ceph_assert(bh->start() >= s);
+ for ([[maybe_unused]] auto& [off, ctxs] : bh->waitfor_read) {
+ waiting_for_read.splice(waiting_for_read.end(), ctxs);
+ }
+ bh->waitfor_read.clear();
+ replace_journal_tid(bh, 0);
+ oc->bh_remove(this, bh);
+ delete bh;
+ }
+ if (!waiting_for_read.empty()) {
+ ldout(oc->cct, 10) << "restarting reads post-truncate" << dendl;
+ }
+ finish_contexts(oc->cct, waiting_for_read, 0);
+}
+
+void ObjectCacher::Object::discard(loff_t off, loff_t len,
+ C_GatherBuilder* commit_gather)
+{
+ ceph_assert(ceph_mutex_is_locked(oc->lock));
+ ldout(oc->cct, 10) << "discard " << *this << " " << off << "~" << len
+ << dendl;
+
+ if (!exists) {
+ ldout(oc->cct, 10) << " setting exists on " << *this << dendl;
+ exists = true;
+ }
+ if (complete) {
+ ldout(oc->cct, 10) << " clearing complete on " << *this << dendl;
+ complete = false;
+ }
+
+ std::list<Context*> waiting_for_read;
+ auto p = data_lower_bound(off);
+ while (p != data.end()) {
+ BufferHead *bh = p->second;
+ if (bh->start() >= off + len)
+ break;
+
+    // split bh at the start of the discard range?
+ if (bh->start() < off) {
+ split(bh, off);
+ maybe_rebuild_buffer(bh);
+ ++p;
+ continue;
+ }
+
+ ceph_assert(bh->start() >= off);
+ if (bh->end() > off + len) {
+ auto right = split(bh, off + len);
+ maybe_rebuild_buffer(right);
+ }
+
+ ++p;
+ ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
+ replace_journal_tid(bh, 0);
+
+ if (bh->is_tx() && commit_gather != nullptr) {
+ // wait for the writeback to commit
+ waitfor_commit[bh->last_write_tid].emplace_back(commit_gather->new_sub());
+ } else if (bh->is_rx()) {
+ // cannot remove bh with in-flight read, but we can ensure the
+ // read won't overwrite the discard
+ bh->last_read_tid = ++oc->last_read_tid;
+ bh->bl.clear();
+ bh->set_nocache(true);
+ oc->mark_zero(bh);
+      // rx bhs in the discard range are marked zero rather than removed
+ continue;
+ } else {
+ for ([[maybe_unused]] auto& [off, ctxs] : bh->waitfor_read) {
+ waiting_for_read.splice(waiting_for_read.end(), ctxs);
+ }
+ bh->waitfor_read.clear();
+ }
+
+ oc->bh_remove(this, bh);
+ delete bh;
+ }
+ if (!waiting_for_read.empty()) {
+ ldout(oc->cct, 10) << "restarting reads post-discard" << dendl;
+ }
+ finish_contexts(oc->cct, waiting_for_read, 0); /* restart reads */
+}
+
+
+
+/*** ObjectCacher ***/
+
+#undef dout_prefix
+#define dout_prefix *_dout << "objectcacher "
+
+
+ObjectCacher::ObjectCacher(CephContext *cct_, string name,
+ WritebackHandler& wb, ceph::mutex& l,
+ flush_set_callback_t flush_callback,
+ void *flush_callback_arg, uint64_t max_bytes,
+ uint64_t max_objects, uint64_t max_dirty,
+ uint64_t target_dirty, double max_dirty_age,
+ bool block_writes_upfront)
+ : perfcounter(NULL),
+ cct(cct_), writeback_handler(wb), name(name), lock(l),
+ max_dirty(max_dirty), target_dirty(target_dirty),
+ max_size(max_bytes), max_objects(max_objects),
+ max_dirty_age(ceph::make_timespan(max_dirty_age)),
+ block_writes_upfront(block_writes_upfront),
+ trace_endpoint("ObjectCacher"),
+ flush_set_callback(flush_callback),
+ flush_set_callback_arg(flush_callback_arg),
+ last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
+ stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0),
+ stat_missing(0), stat_error(0), stat_dirty_waiting(0),
+ stat_nr_dirty_waiters(0), reads_outstanding(0)
+{
+ perf_start();
+ finisher.start();
+ scattered_write = writeback_handler.can_scattered_write();
+}
+
+ObjectCacher::~ObjectCacher()
+{
+ finisher.stop();
+ perf_stop();
+ // we should be empty.
+ for (auto i = objects.begin(); i != objects.end(); ++i)
+ ceph_assert(i->empty());
+ ceph_assert(bh_lru_rest.lru_get_size() == 0);
+ ceph_assert(bh_lru_dirty.lru_get_size() == 0);
+ ceph_assert(ob_lru.lru_get_size() == 0);
+ ceph_assert(dirty_or_tx_bh.empty());
+}
+
+void ObjectCacher::perf_start()
+{
+ string n = "objectcacher-" + name;
+ PerfCountersBuilder plb(cct, n, l_objectcacher_first, l_objectcacher_last);
+
+ plb.add_u64_counter(l_objectcacher_cache_ops_hit,
+ "cache_ops_hit", "Hit operations");
+ plb.add_u64_counter(l_objectcacher_cache_ops_miss,
+ "cache_ops_miss", "Miss operations");
+ plb.add_u64_counter(l_objectcacher_cache_bytes_hit,
+ "cache_bytes_hit", "Hit data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_objectcacher_cache_bytes_miss,
+ "cache_bytes_miss", "Miss data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_objectcacher_data_read,
+ "data_read", "Read data");
+ plb.add_u64_counter(l_objectcacher_data_written,
+ "data_written", "Data written to cache");
+ plb.add_u64_counter(l_objectcacher_data_flushed,
+ "data_flushed", "Data flushed");
+ plb.add_u64_counter(l_objectcacher_overwritten_in_flush,
+ "data_overwritten_while_flushing",
+ "Data overwritten while flushing");
+ plb.add_u64_counter(l_objectcacher_write_ops_blocked, "write_ops_blocked",
+ "Write operations, delayed due to dirty limits");
+ plb.add_u64_counter(l_objectcacher_write_bytes_blocked,
+ "write_bytes_blocked",
+ "Write data blocked on dirty limit", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked",
+ "Time spent blocking a write due to dirty limits");
+
+ perfcounter = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perfcounter);
+}
+
+void ObjectCacher::perf_stop()
+{
+ ceph_assert(perfcounter);
+ cct->get_perfcounters_collection()->remove(perfcounter);
+ delete perfcounter;
+}
+
+/* private */
+ObjectCacher::Object *ObjectCacher::get_object(sobject_t oid,
+ uint64_t object_no,
+ ObjectSet *oset,
+ object_locator_t &l,
+ uint64_t truncate_size,
+ uint64_t truncate_seq)
+{
+ // XXX: Add handling of nspace in object_locator_t in cache
+ ceph_assert(ceph_mutex_is_locked(lock));
+ // have it?
+ if ((uint32_t)l.pool < objects.size()) {
+ if (objects[l.pool].count(oid)) {
+ Object *o = objects[l.pool][oid];
+ o->object_no = object_no;
+ o->truncate_size = truncate_size;
+ o->truncate_seq = truncate_seq;
+ return o;
+ }
+ } else {
+ objects.resize(l.pool+1);
+ }
+
+ // create it.
+ Object *o = new Object(this, oid, object_no, oset, l, truncate_size,
+ truncate_seq);
+ objects[l.pool][oid] = o;
+ ob_lru.lru_insert_top(o);
+ return o;
+}
+
+void ObjectCacher::close_object(Object *ob)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "close_object " << *ob << dendl;
+ ceph_assert(ob->can_close());
+
+ // ok!
+ ob_lru.lru_remove(ob);
+ objects[ob->oloc.pool].erase(ob->get_soid());
+ ob->set_item.remove_myself();
+ delete ob;
+}
+
+void ObjectCacher::bh_read(BufferHead *bh, int op_flags,
+ const ZTracer::Trace &parent_trace)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads "
+ << reads_outstanding << dendl;
+
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_read " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
+ mark_rx(bh);
+ bh->last_read_tid = ++last_read_tid;
+
+ // finisher
+ C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
+ bh->start(), bh->length(), trace);
+ // go
+ writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(),
+ bh->ob->get_oloc(), bh->start(), bh->length(),
+ bh->ob->get_snap(), &onfinish->bl,
+ bh->ob->truncate_size, bh->ob->truncate_seq,
+ op_flags, trace, onfinish);
+
+ ++reads_outstanding;
+}
+
+void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid,
+ ceph_tid_t tid, loff_t start,
+ uint64_t length, bufferlist &bl, int r,
+ bool trust_enoent)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 7) << "bh_read_finish "
+ << oid
+ << " tid " << tid
+ << " " << start << "~" << length
+ << " (bl is " << bl.length() << ")"
+ << " returned " << r
+ << " outstanding reads " << reads_outstanding
+ << dendl;
+
+ if (r >= 0 && bl.length() < length) {
+ ldout(cct, 7) << "bh_read_finish " << oid << " padding " << start << "~"
+ << length << " with " << length - bl.length() << " bytes of zeroes"
+ << dendl;
+ bl.append_zero(length - bl.length());
+ }
+
+ list<Context*> ls;
+ int err = 0;
+
+ if (objects[poolid].count(oid) == 0) {
+ ldout(cct, 7) << "bh_read_finish no object cache" << dendl;
+ } else {
+ Object *ob = objects[poolid][oid];
+
+ if (r == -ENOENT && !ob->complete) {
+ // wake up *all* rx waiters, or else we risk reordering
+ // identical reads. e.g.
+ // read 1~1
+ // reply to unrelated 3~1 -> !exists
+ // read 1~1 -> immediate ENOENT
+ // reply to first 1~1 -> ooo ENOENT
+ bool allzero = true;
+ for (auto p = ob->data.begin(); p != ob->data.end(); ++p) {
+ BufferHead *bh = p->second;
+ for (auto p = bh->waitfor_read.begin();
+ p != bh->waitfor_read.end();
+ ++p)
+ ls.splice(ls.end(), p->second);
+ bh->waitfor_read.clear();
+ if (!bh->is_zero() && !bh->is_rx())
+ allzero = false;
+ }
+
+ // just pass through and retry all waiters if we don't trust
+ // -ENOENT for this read
+ if (trust_enoent) {
+ ldout(cct, 7)
+ << "bh_read_finish ENOENT, marking complete and !exists on " << *ob
+ << dendl;
+ ob->complete = true;
+ ob->exists = false;
+
+ /* If all the bhs are effectively zero, get rid of them. All
+ * the waiters will be retried and get -ENOENT immediately, so
+ * it's safe to clean up the unneeded bh's now. Since we know
+ * it's safe to remove them now, do so, so they aren't hanging
+       * around waiting for more -ENOENTs from rados while the cache
+ * is being shut down.
+ *
+ * Only do this when all the bhs are rx or clean, to match the
+ * condition in _readx(). If there are any non-rx or non-clean
+ * bhs, _readx() will wait for the final result instead of
+ * returning -ENOENT immediately.
+ */
+ if (allzero) {
+ ldout(cct, 10)
+ << "bh_read_finish ENOENT and allzero, getting rid of "
+ << "bhs for " << *ob << dendl;
+ auto p = ob->data.begin();
+ while (p != ob->data.end()) {
+ BufferHead *bh = p->second;
+ // current iterator will be invalidated by bh_remove()
+ ++p;
+ bh_remove(ob, bh);
+ delete bh;
+ }
+ }
+ }
+ }
+
+ // apply to bh's!
+ loff_t opos = start;
+ while (true) {
+ auto p = ob->data_lower_bound(opos);
+ if (p == ob->data.end())
+ break;
+ if (opos >= start+(loff_t)length) {
+ ldout(cct, 20) << "break due to opos " << opos << " >= start+length "
+ << start << "+" << length << "=" << start+(loff_t)length
+ << dendl;
+ break;
+ }
+
+ BufferHead *bh = p->second;
+ ldout(cct, 20) << "checking bh " << *bh << dendl;
+
+ // finishers?
+ for (auto it = bh->waitfor_read.begin();
+ it != bh->waitfor_read.end();
+ ++it)
+ ls.splice(ls.end(), it->second);
+ bh->waitfor_read.clear();
+
+ if (bh->start() > opos) {
+ ldout(cct, 1) << "bh_read_finish skipping gap "
+ << opos << "~" << bh->start() - opos
+ << dendl;
+ opos = bh->start();
+ continue;
+ }
+
+ if (!bh->is_rx()) {
+ ldout(cct, 10) << "bh_read_finish skipping non-rx " << *bh << dendl;
+ opos = bh->end();
+ continue;
+ }
+
+ if (bh->last_read_tid != tid) {
+ ldout(cct, 10) << "bh_read_finish bh->last_read_tid "
+ << bh->last_read_tid << " != tid " << tid
+ << ", skipping" << dendl;
+ opos = bh->end();
+ continue;
+ }
+
+ ceph_assert(opos >= bh->start());
+ ceph_assert(bh->start() == opos); // we don't merge rx bh's... yet!
+ ceph_assert(bh->length() <= start+(loff_t)length-opos);
+
+ if (bh->error < 0)
+ err = bh->error;
+
+ opos = bh->end();
+
+ if (r == -ENOENT) {
+ if (trust_enoent) {
+ ldout(cct, 10) << "bh_read_finish removing " << *bh << dendl;
+ bh_remove(ob, bh);
+ delete bh;
+ } else {
+        ldout(cct, 10) << "skipping untrusted -ENOENT and will retry for "
+ << *bh << dendl;
+ }
+ continue;
+ }
+
+ if (r < 0) {
+ bh->error = r;
+ mark_error(bh);
+ } else {
+ bh->bl.substr_of(bl,
+ bh->start() - start,
+ bh->length());
+ mark_clean(bh);
+ }
+
+ ldout(cct, 10) << "bh_read_finish read " << *bh << dendl;
+
+ ob->try_merge_bh(bh);
+ }
+ }
+
+ // called with lock held.
+ ldout(cct, 20) << "finishing waiters " << ls << dendl;
+
+ finish_contexts(cct, ls, err);
+ retry_waiting_reads();
+
+ --reads_outstanding;
+ read_cond.notify_all();
+}
+
+void ObjectCacher::bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff,
+ int64_t *max_amount, int *max_count)
+{
+ list<BufferHead*> blist;
+
+ int count = 0;
+ int64_t total_len = 0;
+ set<BufferHead*, BufferHead::ptr_lt>::iterator it = dirty_or_tx_bh.find(bh);
+ ceph_assert(it != dirty_or_tx_bh.end());
+ for (set<BufferHead*, BufferHead::ptr_lt>::iterator p = it;
+ p != dirty_or_tx_bh.end();
+ ++p) {
+ BufferHead *obh = *p;
+ if (obh->ob != bh->ob)
+ break;
+ if (obh->is_dirty() && obh->last_write <= cutoff) {
+ blist.push_back(obh);
+ ++count;
+ total_len += obh->length();
+ if ((max_count && count > *max_count) ||
+ (max_amount && total_len > *max_amount))
+ break;
+ }
+ }
+
+ while (it != dirty_or_tx_bh.begin()) {
+ --it;
+ BufferHead *obh = *it;
+ if (obh->ob != bh->ob)
+ break;
+ if (obh->is_dirty() && obh->last_write <= cutoff) {
+ blist.push_front(obh);
+ ++count;
+ total_len += obh->length();
+ if ((max_count && count > *max_count) ||
+ (max_amount && total_len > *max_amount))
+ break;
+ }
+ }
+ if (max_count)
+ *max_count -= count;
+ if (max_amount)
+ *max_amount -= total_len;
+
+ bh_write_scattered(blist);
+}
+
+class ObjectCacher::C_WriteCommit : public Context {
+ ObjectCacher *oc;
+ int64_t poolid;
+ sobject_t oid;
+ vector<pair<loff_t, uint64_t> > ranges;
+ ZTracer::Trace trace;
+public:
+ ceph_tid_t tid = 0;
+ C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s,
+ uint64_t l, const ZTracer::Trace &trace) :
+ oc(c), poolid(_poolid), oid(o), trace(trace) {
+ ranges.push_back(make_pair(s, l));
+ }
+ C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o,
+ vector<pair<loff_t, uint64_t> >& _ranges) :
+ oc(c), poolid(_poolid), oid(o), tid(0) {
+ ranges.swap(_ranges);
+ }
+ void finish(int r) override {
+ oc->bh_write_commit(poolid, oid, ranges, tid, r);
+ trace.event("finish");
+ }
+};
+void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+
+ Object *ob = blist.front()->ob;
+ ob->get();
+
+ ceph::real_time last_write;
+ SnapContext snapc;
+ vector<pair<loff_t, uint64_t> > ranges;
+ vector<pair<uint64_t, bufferlist> > io_vec;
+
+ ranges.reserve(blist.size());
+ io_vec.reserve(blist.size());
+
+ uint64_t total_len = 0;
+ for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) {
+ BufferHead *bh = *p;
+ ldout(cct, 7) << "bh_write_scattered " << *bh << dendl;
+ ceph_assert(bh->ob == ob);
+ ceph_assert(bh->bl.length() == bh->length());
+ ranges.push_back(pair<loff_t, uint64_t>(bh->start(), bh->length()));
+
+ int n = io_vec.size();
+ io_vec.resize(n + 1);
+ io_vec[n].first = bh->start();
+ io_vec[n].second = bh->bl;
+
+ total_len += bh->length();
+ if (bh->snapc.seq > snapc.seq)
+ snapc = bh->snapc;
+ if (bh->last_write > last_write)
+ last_write = bh->last_write;
+ }
+
+ C_WriteCommit *oncommit = new C_WriteCommit(this, ob->oloc.pool, ob->get_soid(), ranges);
+
+ ceph_tid_t tid = writeback_handler.write(ob->get_oid(), ob->get_oloc(),
+ io_vec, snapc, last_write,
+ ob->truncate_size, ob->truncate_seq,
+ oncommit);
+ oncommit->tid = tid;
+ ob->last_write_tid = tid;
+ for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) {
+ BufferHead *bh = *p;
+ bh->last_write_tid = tid;
+ mark_tx(bh);
+ }
+
+ if (perfcounter)
+ perfcounter->inc(l_objectcacher_data_flushed, total_len);
+}
+
+void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 7) << "bh_write " << *bh << dendl;
+
+ bh->ob->get();
+
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_write " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
+ // finishers
+ C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool,
+ bh->ob->get_soid(), bh->start(),
+ bh->length(), trace);
+ // go
+ ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(),
+ bh->ob->get_oloc(),
+ bh->start(), bh->length(),
+ bh->snapc, bh->bl, bh->last_write,
+ bh->ob->truncate_size,
+ bh->ob->truncate_seq,
+ bh->journal_tid, trace, oncommit);
+ ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl;
+
+ // set bh last_write_tid
+ oncommit->tid = tid;
+ bh->ob->last_write_tid = tid;
+ bh->last_write_tid = tid;
+
+ if (perfcounter) {
+ perfcounter->inc(l_objectcacher_data_flushed, bh->length());
+ }
+
+ mark_tx(bh);
+}
+
+void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid,
+ vector<pair<loff_t, uint64_t> >& ranges,
+ ceph_tid_t tid, int r)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 7) << "bh_write_commit " << oid << " tid " << tid
+ << " ranges " << ranges << " returned " << r << dendl;
+
+ if (objects[poolid].count(oid) == 0) {
+ ldout(cct, 7) << "bh_write_commit no object cache" << dendl;
+ return;
+ }
+
+ Object *ob = objects[poolid][oid];
+ int was_dirty_or_tx = ob->oset->dirty_or_tx;
+
+ for (vector<pair<loff_t, uint64_t> >::iterator p = ranges.begin();
+ p != ranges.end();
+ ++p) {
+ loff_t start = p->first;
+ uint64_t length = p->second;
+ if (!ob->exists) {
+ ldout(cct, 10) << "bh_write_commit marking exists on " << *ob << dendl;
+ ob->exists = true;
+
+ if (writeback_handler.may_copy_on_write(ob->get_oid(), start, length,
+ ob->get_snap())) {
+ ldout(cct, 10) << "bh_write_commit may copy on write, clearing "
+ "complete on " << *ob << dendl;
+ ob->complete = false;
+ }
+ }
+
+ vector<pair<loff_t, BufferHead*>> hit;
+ // apply to bh's!
+ for (map<loff_t, BufferHead*>::const_iterator p = ob->data_lower_bound(start);
+ p != ob->data.end();
+ ++p) {
+ BufferHead *bh = p->second;
+
+ if (bh->start() >= start+(loff_t)length)
+ break;
+
+ // make sure bh is tx
+ if (!bh->is_tx()) {
+ ldout(cct, 10) << "bh_write_commit skipping non-tx " << *bh << dendl;
+ continue;
+ }
+
+ // make sure bh tid matches
+ if (bh->last_write_tid != tid) {
+ ceph_assert(bh->last_write_tid > tid);
+ ldout(cct, 10) << "bh_write_commit newer tid on " << *bh << dendl;
+ continue;
+ }
+
+ // we don't merge tx buffers. tx buffer should be within the range
+ ceph_assert(bh->start() >= start);
+ ceph_assert(bh->end() <= start+(loff_t)length);
+
+ if (r >= 0) {
+ // ok! mark bh clean and error-free
+ mark_clean(bh);
+ bh->set_journal_tid(0);
+ if (bh->get_nocache())
+ bh_lru_rest.lru_bottouch(bh);
+ hit.push_back(make_pair(bh->start(), bh));
+ ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl;
+ } else {
+ mark_dirty(bh);
+ ldout(cct, 10) << "bh_write_commit marking dirty again due to error "
+ << *bh << " r = " << r << " " << cpp_strerror(-r)
+ << dendl;
+ }
+ }
+
+      // p.second may have been merged and deleted in merge_left
+ //p.second maybe merged and deleted in merge_left
+ if (ob->data.count(p.first))
+ ob->try_merge_bh(p.second);
+ }
+ }
+
+ // update last_commit.
+ ceph_assert(ob->last_commit_tid < tid);
+ ob->last_commit_tid = tid;
+
+ // waiters?
+ list<Context*> ls;
+ if (ob->waitfor_commit.count(tid)) {
+ ls.splice(ls.begin(), ob->waitfor_commit[tid]);
+ ob->waitfor_commit.erase(tid);
+ }
+
+ // is the entire object set now clean and fully committed?
+ ObjectSet *oset = ob->oset;
+ ob->put();
+
+ if (flush_set_callback &&
+ was_dirty_or_tx > 0 &&
+ oset->dirty_or_tx == 0) { // nothing dirty/tx
+ flush_set_callback(flush_set_callback_arg, oset);
+ }
+
+ if (!ls.empty())
+ finish_contexts(cct, ls, r);
+}
+
+void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount)
+{
+ ceph_assert(trace != nullptr);
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph::real_time cutoff = ceph::real_clock::now();
+
+ ldout(cct, 10) << "flush " << amount << dendl;
+
+ /*
+ * NOTE: we aren't actually pulling things off the LRU here, just
+ * looking at the tail item. Then we call bh_write, which moves it
+ * to the other LRU, so that we can call
+ * lru_dirty.lru_get_next_expire() again.
+ */
+ int64_t left = amount;
+ while (amount == 0 || left > 0) {
+ BufferHead *bh = static_cast<BufferHead*>(
+ bh_lru_dirty.lru_get_next_expire());
+ if (!bh) break;
+ if (bh->last_write > cutoff) break;
+
+ if (scattered_write) {
+ bh_write_adjacencies(bh, cutoff, amount > 0 ? &left : NULL, NULL);
+ } else {
+ left -= bh->length();
+ bh_write(bh, *trace);
+ }
+ }
+}
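+
+// Example: flush(trace, 0) starts writeback on every dirty bh last written
+// before the cutoff (now), oldest first, while flush(trace, N) stops once
+// roughly N bytes of writeback have been started.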
+
+
+void ObjectCacher::trim()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "trim start: bytes: max " << max_size << " clean "
+ << get_stat_clean() << ", objects: max " << max_objects
+ << " current " << ob_lru.lru_get_size() << dendl;
+
+ uint64_t max_clean_bh = max_size >> BUFFER_MEMORY_WEIGHT;
+ uint64_t nr_clean_bh = bh_lru_rest.lru_get_size() - bh_lru_rest.lru_get_num_pinned();
+ while (get_stat_clean() > 0 &&
+ ((uint64_t)get_stat_clean() > max_size ||
+ nr_clean_bh > max_clean_bh)) {
+ BufferHead *bh = static_cast<BufferHead*>(bh_lru_rest.lru_expire());
+ if (!bh)
+ break;
+
+ ldout(cct, 10) << "trim trimming " << *bh << dendl;
+ ceph_assert(bh->is_clean() || bh->is_zero() || bh->is_error());
+
+ Object *ob = bh->ob;
+ bh_remove(ob, bh);
+ delete bh;
+
+ --nr_clean_bh;
+
+ if (ob->complete) {
+ ldout(cct, 10) << "trim clearing complete on " << *ob << dendl;
+ ob->complete = false;
+ }
+ }
+
+ while (ob_lru.lru_get_size() > max_objects) {
+ Object *ob = static_cast<Object*>(ob_lru.lru_expire());
+ if (!ob)
+ break;
+
+ ldout(cct, 10) << "trim trimming " << *ob << dendl;
+ close_object(ob);
+ }
+
+ ldout(cct, 10) << "trim finish: max " << max_size << " clean "
+ << get_stat_clean() << ", objects: max " << max_objects
+ << " current " << ob_lru.lru_get_size() << dendl;
+}
+
+
+
+/* public */
+
+bool ObjectCacher::is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
+ snapid_t snapid)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ for (vector<ObjectExtent>::iterator ex_it = extents.begin();
+ ex_it != extents.end();
+ ++ex_it) {
+ ldout(cct, 10) << "is_cached " << *ex_it << dendl;
+
+ // get Object cache
+ sobject_t soid(ex_it->oid, snapid);
+ Object *o = get_object_maybe(soid, ex_it->oloc);
+ if (!o)
+ return false;
+ if (!o->is_cached(ex_it->offset, ex_it->length))
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * returns # bytes read (if in cache). onfinish is untouched (caller
+ * must delete it)
+ * returns 0 if doing async read
+ */
+int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ ZTracer::Trace *parent_trace)
+{
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("read", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
+ int r =_readx(rd, oset, onfinish, true, &trace);
+ if (r < 0) {
+ trace.event("finish");
+ }
+ return r;
+}
+
+int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ bool external_call, ZTracer::Trace *trace)
+{
+ ceph_assert(trace != nullptr);
+ ceph_assert(ceph_mutex_is_locked(lock));
+ bool success = true;
+ int error = 0;
+ uint64_t bytes_in_cache = 0;
+ uint64_t bytes_not_in_cache = 0;
+ uint64_t total_bytes_read = 0;
+ map<uint64_t, bufferlist> stripe_map; // final buffer offset -> substring
+ bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ bool nocache = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+
+ /*
+ * WARNING: we can only meaningfully return ENOENT if the read request
+ * passed in a single ObjectExtent. Any caller who wants ENOENT instead of
+ * zeroed buffers needs to feed single extents into readx().
+ */
+ ceph_assert(!oset->return_enoent || rd->extents.size() == 1);
+
+ for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin();
+ ex_it != rd->extents.end();
+ ++ex_it) {
+ ldout(cct, 10) << "readx " << *ex_it << dendl;
+
+ total_bytes_read += ex_it->length;
+
+ // get Object cache
+ sobject_t soid(ex_it->oid, rd->snap);
+ Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
+ ex_it->truncate_size, oset->truncate_seq);
+ if (external_call)
+ touch_ob(o);
+
+ // does not exist and no hits?
+ if (oset->return_enoent && !o->exists) {
+ ldout(cct, 10) << "readx object !exists, 1 extent..." << dendl;
+
+ // should we worry about COW underneath us?
+ if (writeback_handler.may_copy_on_write(soid.oid, ex_it->offset,
+ ex_it->length, soid.snap)) {
+ ldout(cct, 20) << "readx may copy on write" << dendl;
+ bool wait = false;
+ list<BufferHead*> blist;
+ for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin();
+ bh_it != o->data.end();
+ ++bh_it) {
+ BufferHead *bh = bh_it->second;
+ if (bh->is_dirty() || bh->is_tx()) {
+ ldout(cct, 10) << "readx flushing " << *bh << dendl;
+ wait = true;
+ if (bh->is_dirty()) {
+ if (scattered_write)
+ blist.push_back(bh);
+ else
+ bh_write(bh, *trace);
+ }
+ }
+ }
+ if (scattered_write && !blist.empty())
+ bh_write_scattered(blist);
+ if (wait) {
+ ldout(cct, 10) << "readx waiting on tid " << o->last_write_tid
+ << " on " << *o << dendl;
+ o->waitfor_commit[o->last_write_tid].push_back(
+ new C_RetryRead(this,rd, oset, onfinish, *trace));
+ // FIXME: perfcounter!
+ return 0;
+ }
+ }
+
+ // can we return ENOENT?
+ bool allzero = true;
+ for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin();
+ bh_it != o->data.end();
+ ++bh_it) {
+ ldout(cct, 20) << "readx ob has bh " << *bh_it->second << dendl;
+ if (!bh_it->second->is_zero() && !bh_it->second->is_rx()) {
+ allzero = false;
+ break;
+ }
+ }
+ if (allzero) {
+ ldout(cct, 10) << "readx ob has all zero|rx, returning ENOENT"
+ << dendl;
+ delete rd;
+ if (dontneed)
+ bottouch_ob(o);
+ return -ENOENT;
+ }
+ }
+
+ // map extent into bufferheads
+ map<loff_t, BufferHead*> hits, missing, rx, errors;
+ o->map_read(*ex_it, hits, missing, rx, errors);
+ if (external_call) {
+ // retry reading error buffers
+ missing.insert(errors.begin(), errors.end());
+ } else {
+ // some reads had errors, fail later so completions
+ // are cleaned up properly
+ // TODO: make read path not call _readx for every completion
+ hits.insert(errors.begin(), errors.end());
+ }
+
+ if (!missing.empty() || !rx.empty()) {
+ // read missing
+ map<loff_t, BufferHead*>::iterator last = missing.end();
+ for (map<loff_t, BufferHead*>::iterator bh_it = missing.begin();
+ bh_it != missing.end();
+ ++bh_it) {
+ uint64_t rx_bytes = static_cast<uint64_t>(
+ stat_rx + bh_it->second->length());
+ bytes_not_in_cache += bh_it->second->length();
+ if (!waitfor_read.empty() || (stat_rx > 0 && rx_bytes > max_size)) {
+ // cache is full with concurrent reads -- wait for rx's to complete
+ // to constrain memory growth (especially during copy-ups)
+ if (success) {
+ ldout(cct, 10) << "readx missed, waiting on cache to complete "
+ << waitfor_read.size() << " blocked reads, "
+ << (std::max(rx_bytes, max_size) - max_size)
+ << " read bytes" << dendl;
+ waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish,
+ *trace));
+ }
+
+ bh_remove(o, bh_it->second);
+ delete bh_it->second;
+ } else {
+ bh_it->second->set_nocache(nocache);
+ bh_read(bh_it->second, rd->fadvise_flags, *trace);
+ if ((success && onfinish) || last != missing.end())
+ last = bh_it;
+ }
+ success = false;
+ }
+
+      // add the waiter to the last missing bh so we don't wake up too
+      // early; reads complete in order
+ if (last != missing.end()) {
+ ldout(cct, 10) << "readx missed, waiting on " << *last->second
+ << " off " << last->first << dendl;
+ last->second->waitfor_read[last->first].push_back(
+ new C_RetryRead(this, rd, oset, onfinish, *trace) );
+
+ }
+
+ // bump rx
+ for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin();
+ bh_it != rx.end();
+ ++bh_it) {
+ touch_bh(bh_it->second); // bump in lru, so we don't lose it.
+ if (success && onfinish) {
+ ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
+ << " off " << bh_it->first << dendl;
+ bh_it->second->waitfor_read[bh_it->first].push_back(
+ new C_RetryRead(this, rd, oset, onfinish, *trace) );
+ }
+ bytes_not_in_cache += bh_it->second->length();
+ success = false;
+ }
+
+ for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
+ bh_it != hits.end(); ++bh_it)
+        // bump in lru, so we don't lose it on a later read
+ touch_bh(bh_it->second);
+
+ } else {
+ ceph_assert(!hits.empty());
+
+ // make a plain list
+ for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
+ bh_it != hits.end();
+ ++bh_it) {
+ BufferHead *bh = bh_it->second;
+ ldout(cct, 10) << "readx hit bh " << *bh << dendl;
+ if (bh->is_error() && bh->error)
+ error = bh->error;
+ bytes_in_cache += bh->length();
+
+ if (bh->get_nocache() && bh->is_clean())
+ bh_lru_rest.lru_bottouch(bh);
+ else
+ touch_bh(bh);
+      // must come after touch_bh, because touch_bh sets dontneed to false
+ if (dontneed &&
+ ((loff_t)ex_it->offset <= bh->start() &&
+ (bh->end() <=(loff_t)(ex_it->offset + ex_it->length)))) {
+ bh->set_dontneed(true); //if dirty
+ if (bh->is_clean())
+ bh_lru_rest.lru_bottouch(bh);
+ }
+ }
+
+ if (!error) {
+ // create reverse map of buffer offset -> object for the
+ // eventual result. this is over a single ObjectExtent, so we
+ // know that
+ // - the bh's are contiguous
+ // - the buffer frags need not be (and almost certainly aren't)
+ loff_t opos = ex_it->offset;
+ map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
+ ceph_assert(bh_it->second->start() <= opos);
+ uint64_t bhoff = opos - bh_it->second->start();
+ vector<pair<uint64_t,uint64_t> >::iterator f_it
+ = ex_it->buffer_extents.begin();
+ uint64_t foff = 0;
+ while (1) {
+ BufferHead *bh = bh_it->second;
+ ceph_assert(opos == (loff_t)(bh->start() + bhoff));
+
+ uint64_t len = std::min(f_it->second - foff, bh->length() - bhoff);
+ ldout(cct, 10) << "readx rmap opos " << opos << ": " << *bh << " +"
+ << bhoff << " frag " << f_it->first << "~"
+ << f_it->second << " +" << foff << "~" << len
+ << dendl;
+
+ bufferlist bit;
+ // put substr here first, since substr_of clobbers, and we
+ // may get multiple bh's at this stripe_map position
+ if (bh->is_zero()) {
+ stripe_map[f_it->first].append_zero(len);
+ } else {
+ bit.substr_of(bh->bl,
+ opos - bh->start(),
+ len);
+ stripe_map[f_it->first].claim_append(bit);
+ }
+
+ opos += len;
+ bhoff += len;
+ foff += len;
+ if (opos == bh->end()) {
+ ++bh_it;
+ bhoff = 0;
+ }
+ if (foff == f_it->second) {
+ ++f_it;
+ foff = 0;
+ }
+ if (bh_it == hits.end()) break;
+ if (f_it == ex_it->buffer_extents.end())
+ break;
+ }
+ ceph_assert(f_it == ex_it->buffer_extents.end());
+ ceph_assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
+ }
+
+ if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length))
+ bottouch_ob(o);
+ }
+ }
+
+ if (!success) {
+ if (perfcounter && external_call) {
+ perfcounter->inc(l_objectcacher_data_read, total_bytes_read);
+ perfcounter->inc(l_objectcacher_cache_bytes_miss, bytes_not_in_cache);
+ perfcounter->inc(l_objectcacher_cache_ops_miss);
+ }
+ if (onfinish) {
+ ldout(cct, 20) << "readx defer " << rd << dendl;
+ } else {
+ ldout(cct, 20) << "readx drop " << rd << " (no complete, but no waiter)"
+ << dendl;
+ delete rd;
+ }
+ return 0; // wait!
+ }
+ if (perfcounter && external_call) {
+ perfcounter->inc(l_objectcacher_data_read, total_bytes_read);
+ perfcounter->inc(l_objectcacher_cache_bytes_hit, bytes_in_cache);
+ perfcounter->inc(l_objectcacher_cache_ops_hit);
+ }
+
+ // no misses... success! do the read.
+ ldout(cct, 10) << "readx has all buffers" << dendl;
+
+ // ok, assemble into result buffer.
+ uint64_t pos = 0;
+ if (rd->bl && !error) {
+ rd->bl->clear();
+ for (map<uint64_t,bufferlist>::iterator i = stripe_map.begin();
+ i != stripe_map.end();
+ ++i) {
+ ceph_assert(pos == i->first);
+ ldout(cct, 10) << "readx adding buffer len " << i->second.length()
+ << " at " << pos << dendl;
+ pos += i->second.length();
+ rd->bl->claim_append(i->second);
+ ceph_assert(rd->bl->length() == pos);
+ }
+ ldout(cct, 10) << "readx result is " << rd->bl->length() << dendl;
+ } else if (!error) {
+ ldout(cct, 10) << "readx no bufferlist ptr (readahead?), done." << dendl;
+ map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin();
+ pos = i->first + i->second.length();
+ }
+
+ // done with read.
+ int ret = error ? error : pos;
+ ldout(cct, 20) << "readx done " << rd << " " << ret << dendl;
+ ceph_assert(pos <= (uint64_t) INT_MAX);
+
+ delete rd;
+
+ trim();
+
+ return ret;
+}
+
+void ObjectCacher::retry_waiting_reads()
+{
+ list<Context *> ls;
+ ls.swap(waitfor_read);
+
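+  // stop early if one of the retried contexts re-queues a waiter onto
+  // waitfor_read; whatever is left in ls is spliced back below so it is
+  // not lost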
+ while (!ls.empty() && waitfor_read.empty()) {
+ Context *ctx = ls.front();
+ ls.pop_front();
+ ctx->complete(0);
+ }
+ waitfor_read.splice(waitfor_read.end(), ls);
+}
+
+int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
+ ZTracer::Trace *parent_trace)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph::real_time now = ceph::real_clock::now();
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_in_flush = 0;
+ bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("write", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
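+  // readers blocked on ranges we are about to overwrite; collected here and
+  // woken once the write has been applied and accounted for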
+ list<Context*> wait_for_reads;
+ for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
+ ex_it != wr->extents.end();
+ ++ex_it) {
+ // get object cache
+ sobject_t soid(ex_it->oid, CEPH_NOSNAP);
+ Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
+ ex_it->truncate_size, oset->truncate_seq);
+
+ // map it all into a single bufferhead.
+ BufferHead *bh = o->map_write(*ex_it, wr->journal_tid);
+ bool missing = bh->is_missing();
+ bh->snapc = wr->snapc;
+
+ // readers that need to be woken up due to an overwrite
+ for (auto& [_, wait_for_read] : bh->waitfor_read) {
+ wait_for_reads.splice(wait_for_reads.end(), wait_for_read);
+ }
+ bh->waitfor_read.clear();
+
+ bytes_written += ex_it->length;
+ if (bh->is_tx()) {
+ bytes_written_in_flush += ex_it->length;
+ }
+
+ // adjust buffer pointers (ie "copy" data into my cache)
+ // this is over a single ObjectExtent, so we know that
+ // - there is one contiguous bh
+    // - the buffer frags need not be contiguous (and almost certainly aren't)
+ // note: i assume striping is monotonic... no jumps backwards, ever!
+ loff_t opos = ex_it->offset;
+ for (vector<pair<uint64_t, uint64_t> >::iterator f_it
+ = ex_it->buffer_extents.begin();
+ f_it != ex_it->buffer_extents.end();
+ ++f_it) {
+ ldout(cct, 10) << "writex writing " << f_it->first << "~"
+ << f_it->second << " into " << *bh << " at " << opos
+ << dendl;
+ uint64_t bhoff = opos - bh->start();
+ ceph_assert(f_it->second <= bh->length() - bhoff);
+
+ // get the frag we're mapping in
+ bufferlist frag;
+ frag.substr_of(wr->bl, f_it->first, f_it->second);
+
+ // keep anything left of bhoff
+ if (!bhoff)
+ bh->bl.swap(frag);
+ else
+ bh->bl.claim_append(frag);
+
+ opos += f_it->second;
+ }
+
+ // ok, now bh is dirty.
+ mark_dirty(bh);
+ if (dontneed)
+ bh->set_dontneed(true);
+ else if (nocache && missing)
+ bh->set_nocache(true);
+ else
+ touch_bh(bh);
+
+ bh->last_write = now;
+
+ o->try_merge_bh(bh);
+ }
+
+ if (perfcounter) {
+ perfcounter->inc(l_objectcacher_data_written, bytes_written);
+ if (bytes_written_in_flush) {
+ perfcounter->inc(l_objectcacher_overwritten_in_flush,
+ bytes_written_in_flush);
+ }
+ }
+
+ int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace);
+ delete wr;
+
+ finish_contexts(cct, wait_for_reads, 0);
+
+ //verify_stats();
+ trim();
+ return r;
+}
+
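+// Queued on the finisher when writes are not blocked up front: applies the
+// writeback throttle (_maybe_wait_for_writeback) and then completes the
+// caller's onfreespace context.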
+class ObjectCacher::C_WaitForWrite : public Context {
+public:
+ C_WaitForWrite(ObjectCacher *oc, uint64_t len,
+ const ZTracer::Trace &trace, Context *onfinish) :
+ m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {}
+ void finish(int r) override;
+private:
+ ObjectCacher *m_oc;
+ uint64_t m_len;
+ ZTracer::Trace m_trace;
+ Context *m_onfinish;
+};
+
+void ObjectCacher::C_WaitForWrite::finish(int r)
+{
+ std::lock_guard l(m_oc->lock);
+ m_oc->_maybe_wait_for_writeback(m_len, &m_trace);
+ m_onfinish->complete(r);
+}
+
+void ObjectCacher::_maybe_wait_for_writeback(uint64_t len,
+ ZTracer::Trace *trace)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph::mono_time start = ceph::mono_clock::now();
+ int blocked = 0;
+ // wait for writeback?
+ // - wait for dirty and tx bytes (relative to the max_dirty threshold)
+ // - do not wait for bytes other waiters are waiting on. this means that
+ // threads do not wait for each other. this effectively allows the cache
+ // size to balloon proportional to the data that is in flight.
+
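+  // approximate bufferhead-count limit derived from the dirty byte limit
+  // (BUFFER_MEMORY_WEIGHT is presumably a log2 of the average bh footprint)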
+ uint64_t max_dirty_bh = max_dirty >> BUFFER_MEMORY_WEIGHT;
+ while (get_stat_dirty() + get_stat_tx() > 0 &&
+ (((uint64_t)(get_stat_dirty() + get_stat_tx()) >=
+ max_dirty + get_stat_dirty_waiting()) ||
+ (dirty_or_tx_bh.size() >=
+ max_dirty_bh + get_stat_nr_dirty_waiters()))) {
+
+ if (blocked == 0) {
+ trace->event("start wait for writeback");
+ }
+ ldout(cct, 10) << __func__ << " waiting for dirty|tx "
+ << (get_stat_dirty() + get_stat_tx()) << " >= max "
+ << max_dirty << " + dirty_waiting "
+ << get_stat_dirty_waiting() << dendl;
+ flusher_cond.notify_all();
+ stat_dirty_waiting += len;
+ ++stat_nr_dirty_waiters;
+ std::unique_lock l{lock, std::adopt_lock};
+ stat_cond.wait(l);
+ l.release();
+ stat_dirty_waiting -= len;
+ --stat_nr_dirty_waiters;
+ ++blocked;
+ ldout(cct, 10) << __func__ << " woke up" << dendl;
+ }
+ if (blocked > 0) {
+ trace->event("finish wait for writeback");
+ }
+ if (blocked && perfcounter) {
+ perfcounter->inc(l_objectcacher_write_ops_blocked);
+ perfcounter->inc(l_objectcacher_write_bytes_blocked, len);
+ ceph::timespan blocked = ceph::mono_clock::now() - start;
+ perfcounter->tinc(l_objectcacher_write_time_blocked, blocked);
+ }
+}
+
+// blocking wait for write.
+int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
+ ZTracer::Trace *trace, Context *onfreespace)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(trace != nullptr);
+ int ret = 0;
+
+ if (max_dirty > 0 && !(wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_FUA)) {
+ if (block_writes_upfront) {
+ _maybe_wait_for_writeback(len, trace);
+ if (onfreespace)
+ onfreespace->complete(0);
+ } else {
+ ceph_assert(onfreespace);
+ finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace));
+ }
+ } else {
+ // write-thru! flush what we just wrote.
+ ceph::condition_variable cond;
+ bool done = false;
+ Context *fin = block_writes_upfront ?
+ new C_Cond(cond, &done, &ret) : onfreespace;
+ ceph_assert(fin);
+ bool flushed = flush_set(oset, wr->extents, trace, fin);
+ ceph_assert(!flushed); // we just dirtied it, and didn't drop our lock!
+ ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len
+ << " bytes" << dendl;
+ if (block_writes_upfront) {
+ std::unique_lock l{lock, std::adopt_lock};
+ cond.wait(l, [&done] { return done; });
+ l.release();
+ ldout(cct, 10) << "wait_for_write woke up, ret " << ret << dendl;
+ if (onfreespace)
+ onfreespace->complete(ret);
+ }
+ }
+
+ // start writeback anyway?
+ if (get_stat_dirty() > 0 && (uint64_t) get_stat_dirty() > target_dirty) {
+ ldout(cct, 10) << "wait_for_write " << get_stat_dirty() << " > target "
+ << target_dirty << ", nudging flusher" << dendl;
+ flusher_cond.notify_all();
+ }
+ return ret;
+}
+
+void ObjectCacher::flusher_entry()
+{
+ ldout(cct, 10) << "flusher start" << dendl;
+ std::unique_lock l{lock};
+ while (!flusher_stop) {
+ loff_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() +
+ get_stat_dirty();
+ ldout(cct, 11) << "flusher "
+ << all << " / " << max_size << ": "
+ << get_stat_tx() << " tx, "
+ << get_stat_rx() << " rx, "
+ << get_stat_clean() << " clean, "
+ << get_stat_dirty() << " dirty ("
+ << target_dirty << " target, "
+ << max_dirty << " max)"
+ << dendl;
+ loff_t actual = get_stat_dirty() + get_stat_dirty_waiting();
+
+ ZTracer::Trace trace;
+ if (cct->_conf->osdc_blkin_trace_all) {
+ trace.init("flusher", &trace_endpoint);
+ trace.event("start");
+ }
+
+ if (actual > 0 && (uint64_t) actual > target_dirty) {
+ // flush some dirty pages
+ ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + "
+ << get_stat_dirty_waiting() << " dirty_waiting > target "
+ << target_dirty << ", flushing some dirty bhs" << dendl;
+ flush(&trace, actual - target_dirty);
+ } else {
+ // check tail of lru for old dirty items
+ ceph::real_time cutoff = ceph::real_clock::now();
+ cutoff -= max_dirty_age;
+ BufferHead *bh = 0;
+ int max = MAX_FLUSH_UNDER_LOCK;
+ while ((bh = static_cast<BufferHead*>(bh_lru_dirty.
+ lru_get_next_expire())) != 0 &&
+ bh->last_write <= cutoff &&
+ max > 0) {
+ ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl;
+ if (scattered_write) {
+ bh_write_adjacencies(bh, cutoff, NULL, &max);
+ } else {
+ bh_write(bh, trace);
+ --max;
+ }
+ }
+ if (!max) {
+ // back off the lock to avoid starving other threads
+ trace.event("backoff");
+ l.unlock();
+ l.lock();
+ continue;
+ }
+ }
+
+ trace.event("finish");
+ if (flusher_stop)
+ break;
+
+ flusher_cond.wait_for(l, 1s);
+ }
+
+ /* Wait for reads to finish. This is only possible if handling
+ * -ENOENT made some read completions finish before their rados read
+ * came back. If we don't wait for them, and destroy the cache, when
+ * the rados reads do come back their callback will try to access the
+ * no-longer-valid ObjectCacher.
+ */
+ read_cond.wait(l, [this] {
+ if (reads_outstanding > 0) {
+ ldout(cct, 10) << "Waiting for all reads to complete. Number left: "
+ << reads_outstanding << dendl;
+ return false;
+ } else {
+ return true;
+ }
+ });
+ ldout(cct, 10) << "flusher finish" << dendl;
+}
+
+
+// -------------------------------------------------
+
+bool ObjectCacher::set_is_empty(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (oset->objects.empty())
+ return true;
+
+ for (xlist<Object*>::iterator p = oset->objects.begin(); !p.end(); ++p)
+ if (!(*p)->is_empty())
+ return false;
+
+ return true;
+}
+
+bool ObjectCacher::set_is_cached(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (oset->objects.empty())
+ return false;
+
+ for (xlist<Object*>::iterator p = oset->objects.begin();
+ !p.end(); ++p) {
+ Object *ob = *p;
+ for (map<loff_t,BufferHead*>::iterator q = ob->data.begin();
+ q != ob->data.end();
+ ++q) {
+ BufferHead *bh = q->second;
+ if (!bh->is_dirty() && !bh->is_tx())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool ObjectCacher::set_is_dirty_or_committing(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (oset->objects.empty())
+ return false;
+
+ for (xlist<Object*>::iterator i = oset->objects.begin();
+ !i.end(); ++i) {
+ Object *ob = *i;
+
+ for (map<loff_t,BufferHead*>::iterator p = ob->data.begin();
+ p != ob->data.end();
+ ++p) {
+ BufferHead *bh = p->second;
+ if (bh->is_dirty() || bh->is_tx())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+
+// purge. non-blocking. violently removes dirty buffers from cache.
+void ObjectCacher::purge(Object *ob)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "purge " << *ob << dendl;
+
+ ob->truncate(0);
+}
+
+
+// flush. non-blocking. no callback.
+// true if clean, already flushed.
+// false if we wrote something.
+// be sloppy about the ranges and flush any buffer it touches
+bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length,
+ ZTracer::Trace *trace)
+{
+ ceph_assert(trace != nullptr);
+ ceph_assert(ceph_mutex_is_locked(lock));
+ list<BufferHead*> blist;
+ bool clean = true;
+ ldout(cct, 10) << "flush " << *ob << " " << offset << "~" << length << dendl;
+ for (map<loff_t,BufferHead*>::const_iterator p = ob->data_lower_bound(offset);
+ p != ob->data.end();
+ ++p) {
+ BufferHead *bh = p->second;
+ ldout(cct, 20) << "flush " << *bh << dendl;
+ if (length && bh->start() > offset+length) {
+ break;
+ }
+ if (bh->is_tx()) {
+ clean = false;
+ continue;
+ }
+ if (!bh->is_dirty()) {
+ continue;
+ }
+
+ if (scattered_write)
+ blist.push_back(bh);
+ else
+ bh_write(bh, *trace);
+ clean = false;
+ }
+ if (scattered_write && !blist.empty())
+ bh_write_scattered(blist);
+
+ return clean;
+}
+
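+// If the gather picked up any sub-contexts, arm it with onfinish and return
+// false (the caller must wait for the commits); otherwise everything was
+// already clean, so complete onfinish immediately and return true.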
+bool ObjectCacher::_flush_set_finish(C_GatherBuilder *gather,
+ Context *onfinish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (gather->has_subs()) {
+ gather->set_finisher(onfinish);
+ gather->activate();
+ return false;
+ }
+
+ ldout(cct, 10) << "flush_set has no dirty|tx bhs" << dendl;
+ onfinish->complete(0);
+ return true;
+}
+
+// flush. non-blocking, takes callback.
+// returns true if already flushed
+bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(onfinish != NULL);
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
+ onfinish->complete(0);
+ return true;
+ }
+
+ ldout(cct, 10) << "flush_set " << oset << dendl;
+
+ // we'll need to wait for all objects to flush!
+ C_GatherBuilder gather(cct);
+ set<Object*> waitfor_commit;
+
+ list<BufferHead*> blist;
+ Object *last_ob = NULL;
+ set<BufferHead*, BufferHead::ptr_lt>::const_iterator it, p, q;
+
+  // Buffer heads in dirty_or_tx_bh are sorted in ObjectSet/Object/offset
+  // order, but items in oset->objects are not sorted, so the lower_bound
+  // below may land on any buffer head belonging to this ObjectSet.
+ BufferHead key(*oset->objects.begin());
+ it = dirty_or_tx_bh.lower_bound(&key);
+ p = q = it;
+
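+  // scan forward from the lower_bound position, then (if we did not start at
+  // the beginning) scan backward from just before it, so every dirty/tx bh
+  // belonging to this oset is covered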
+ bool backwards = true;
+ if (it != dirty_or_tx_bh.begin())
+ --it;
+ else
+ backwards = false;
+
+ for (; p != dirty_or_tx_bh.end(); p = q) {
+ ++q;
+ BufferHead *bh = *p;
+ if (bh->ob->oset != oset)
+ break;
+ waitfor_commit.insert(bh->ob);
+ if (bh->is_dirty()) {
+ if (scattered_write) {
+ if (last_ob != bh->ob) {
+ if (!blist.empty()) {
+ bh_write_scattered(blist);
+ blist.clear();
+ }
+ last_ob = bh->ob;
+ }
+ blist.push_back(bh);
+ } else {
+ bh_write(bh, {});
+ }
+ }
+ }
+
+ if (backwards) {
+ for(p = q = it; true; p = q) {
+ if (q != dirty_or_tx_bh.begin())
+ --q;
+ else
+ backwards = false;
+ BufferHead *bh = *p;
+ if (bh->ob->oset != oset)
+ break;
+ waitfor_commit.insert(bh->ob);
+ if (bh->is_dirty()) {
+ if (scattered_write) {
+ if (last_ob != bh->ob) {
+ if (!blist.empty()) {
+ bh_write_scattered(blist);
+ blist.clear();
+ }
+ last_ob = bh->ob;
+ }
+ blist.push_front(bh);
+ } else {
+ bh_write(bh, {});
+ }
+ }
+ if (!backwards)
+ break;
+ }
+ }
+
+ if (scattered_write && !blist.empty())
+ bh_write_scattered(blist);
+
+ for (set<Object*>::iterator i = waitfor_commit.begin();
+ i != waitfor_commit.end(); ++i) {
+ Object *ob = *i;
+
+ // we'll need to gather...
+ ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
+ << ob->last_write_tid << " on " << *ob << dendl;
+ ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
+ }
+
+ return _flush_set_finish(&gather, onfinish);
+}
+
+// flush. non-blocking, takes callback.
+// returns true if already flushed
+bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
+ ZTracer::Trace *trace, Context *onfinish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(trace != nullptr);
+ ceph_assert(onfinish != NULL);
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
+ onfinish->complete(0);
+ return true;
+ }
+
+ ldout(cct, 10) << "flush_set " << oset << " on " << exv.size()
+ << " ObjectExtents" << dendl;
+
+ // we'll need to wait for all objects to flush!
+ C_GatherBuilder gather(cct);
+
+ for (vector<ObjectExtent>::iterator p = exv.begin();
+ p != exv.end();
+ ++p) {
+ ObjectExtent &ex = *p;
+ sobject_t soid(ex.oid, CEPH_NOSNAP);
+ if (objects[oset->poolid].count(soid) == 0)
+ continue;
+ Object *ob = objects[oset->poolid][soid];
+
+ ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid
+ << " " << ob << dendl;
+
+ if (!flush(ob, ex.offset, ex.length, trace)) {
+ // we'll need to gather...
+ ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
+ << ob->last_write_tid << " on " << *ob << dendl;
+ ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
+ }
+ }
+
+ return _flush_set_finish(&gather, onfinish);
+}
+
+// flush all dirty data. non-blocking, takes callback.
+// returns true if already flushed
+bool ObjectCacher::flush_all(Context *onfinish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(onfinish != NULL);
+
+ ldout(cct, 10) << "flush_all " << dendl;
+
+ // we'll need to wait for all objects to flush!
+ C_GatherBuilder gather(cct);
+ set<Object*> waitfor_commit;
+
+ list<BufferHead*> blist;
+ Object *last_ob = NULL;
+ set<BufferHead*, BufferHead::ptr_lt>::iterator next, it;
+ next = it = dirty_or_tx_bh.begin();
+ while (it != dirty_or_tx_bh.end()) {
+ ++next;
+ BufferHead *bh = *it;
+ waitfor_commit.insert(bh->ob);
+
+ if (bh->is_dirty()) {
+ if (scattered_write) {
+ if (last_ob != bh->ob) {
+ if (!blist.empty()) {
+ bh_write_scattered(blist);
+ blist.clear();
+ }
+ last_ob = bh->ob;
+ }
+ blist.push_back(bh);
+ } else {
+ bh_write(bh, {});
+ }
+ }
+
+ it = next;
+ }
+
+ if (scattered_write && !blist.empty())
+ bh_write_scattered(blist);
+
+ for (set<Object*>::iterator i = waitfor_commit.begin();
+ i != waitfor_commit.end();
+ ++i) {
+ Object *ob = *i;
+
+ // we'll need to gather...
+ ldout(cct, 10) << "flush_all will wait for ack tid "
+ << ob->last_write_tid << " on " << *ob << dendl;
+ ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
+ }
+
+ return _flush_set_finish(&gather, onfinish);
+}
+
+void ObjectCacher::purge_set(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << "purge_set on " << oset << " dne" << dendl;
+ return;
+ }
+
+ ldout(cct, 10) << "purge_set " << oset << dendl;
+ const bool were_dirty = oset->dirty_or_tx > 0;
+
+ for (xlist<Object*>::iterator i = oset->objects.begin();
+ !i.end(); ++i) {
+ Object *ob = *i;
+ purge(ob);
+ }
+
+  // Although we have purged rather than flushed, the caller should still
+  // drop any resources associated with the dirty data.
+ ceph_assert(oset->dirty_or_tx == 0);
+ if (flush_set_callback && were_dirty) {
+ flush_set_callback(flush_set_callback_arg, oset);
+ }
+}
+
+
+loff_t ObjectCacher::release(Object *ob)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ list<BufferHead*> clean;
+ loff_t o_unclean = 0;
+
+ for (map<loff_t,BufferHead*>::iterator p = ob->data.begin();
+ p != ob->data.end();
+ ++p) {
+ BufferHead *bh = p->second;
+ if (bh->is_clean() || bh->is_zero() || bh->is_error())
+ clean.push_back(bh);
+ else
+ o_unclean += bh->length();
+ }
+
+ for (list<BufferHead*>::iterator p = clean.begin();
+ p != clean.end();
+ ++p) {
+ bh_remove(ob, *p);
+ delete *p;
+ }
+
+ if (ob->can_close()) {
+ ldout(cct, 10) << "release trimming " << *ob << dendl;
+ close_object(ob);
+ ceph_assert(o_unclean == 0);
+ return 0;
+ }
+
+ if (ob->complete) {
+ ldout(cct, 10) << "release clearing complete on " << *ob << dendl;
+ ob->complete = false;
+ }
+ if (!ob->exists) {
+ ldout(cct, 10) << "release setting exists on " << *ob << dendl;
+ ob->exists = true;
+ }
+
+ return o_unclean;
+}
+
+loff_t ObjectCacher::release_set(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ // return # bytes not clean (and thus not released).
+ loff_t unclean = 0;
+
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << "release_set on " << oset << " dne" << dendl;
+ return 0;
+ }
+
+ ldout(cct, 10) << "release_set " << oset << dendl;
+
+ xlist<Object*>::iterator q;
+ for (xlist<Object*>::iterator p = oset->objects.begin();
+ !p.end(); ) {
+ q = p;
+ ++q;
+ Object *ob = *p;
+
+ loff_t o_unclean = release(ob);
+ unclean += o_unclean;
+
+ if (o_unclean)
+ ldout(cct, 10) << "release_set " << oset << " " << *ob
+ << " has " << o_unclean << " bytes left"
+ << dendl;
+ p = q;
+ }
+
+ if (unclean) {
+ ldout(cct, 10) << "release_set " << oset
+ << ", " << unclean << " bytes left" << dendl;
+ }
+
+ return unclean;
+}
+
+
+uint64_t ObjectCacher::release_all()
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "release_all" << dendl;
+ uint64_t unclean = 0;
+
+ vector<ceph::unordered_map<sobject_t, Object*> >::iterator i
+ = objects.begin();
+ while (i != objects.end()) {
+ ceph::unordered_map<sobject_t, Object*>::iterator p = i->begin();
+ while (p != i->end()) {
+ ceph::unordered_map<sobject_t, Object*>::iterator n = p;
+ ++n;
+
+ Object *ob = p->second;
+
+ loff_t o_unclean = release(ob);
+ unclean += o_unclean;
+
+ if (o_unclean)
+ ldout(cct, 10) << "release_all " << *ob
+ << " has " << o_unclean << " bytes left"
+ << dendl;
+ p = n;
+ }
+ ++i;
+ }
+
+ if (unclean) {
+ ldout(cct, 10) << "release_all unclean " << unclean << " bytes left"
+ << dendl;
+ }
+
+ return unclean;
+}
+
+void ObjectCacher::clear_nonexistence(ObjectSet *oset)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "clear_nonexistence() " << oset << dendl;
+
+ for (xlist<Object*>::iterator p = oset->objects.begin();
+ !p.end(); ++p) {
+ Object *ob = *p;
+ if (!ob->exists) {
+ ldout(cct, 10) << " setting exists and complete on " << *ob << dendl;
+ ob->exists = true;
+ ob->complete = false;
+ }
+ for (xlist<C_ReadFinish*>::iterator q = ob->reads.begin();
+ !q.end(); ++q) {
+ C_ReadFinish *comp = *q;
+ comp->distrust_enoent();
+ }
+ }
+}
+
+/**
+ * discard object extents from an ObjectSet by removing the extents in
+ * exls from the in-memory oset.
+ */
+void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ bool was_dirty = oset->dirty_or_tx > 0;
+
+ _discard(oset, exls, nullptr);
+ _discard_finish(oset, was_dirty, nullptr);
+}
+
+/**
+ * discard object extents from an ObjectSet by removing the extents in
+ * exls from the in-memory oset. If a bh is in TX state, the discard
+ * will wait for the write to commit prior to invoking on_finish.
+ */
+void ObjectCacher::discard_writeback(ObjectSet *oset,
+ const vector<ObjectExtent>& exls,
+ Context* on_finish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ bool was_dirty = oset->dirty_or_tx > 0;
+
+ C_GatherBuilder gather(cct);
+ _discard(oset, exls, &gather);
+
+ if (gather.has_subs()) {
+ bool flushed = was_dirty && oset->dirty_or_tx == 0;
+ gather.set_finisher(new LambdaContext(
+ [this, oset, flushed, on_finish](int) {
+ ceph_assert(ceph_mutex_is_locked(lock));
+ if (flushed && flush_set_callback)
+ flush_set_callback(flush_set_callback_arg, oset);
+ if (on_finish)
+ on_finish->complete(0);
+ }));
+ gather.activate();
+ return;
+ }
+
+ _discard_finish(oset, was_dirty, on_finish);
+}
+
+void ObjectCacher::_discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
+ C_GatherBuilder* gather)
+{
+ if (oset->objects.empty()) {
+ ldout(cct, 10) << __func__ << " on " << oset << " dne" << dendl;
+ return;
+ }
+
+ ldout(cct, 10) << __func__ << " " << oset << dendl;
+
+ for (auto& ex : exls) {
+ ldout(cct, 10) << __func__ << " " << oset << " ex " << ex << dendl;
+ sobject_t soid(ex.oid, CEPH_NOSNAP);
+ if (objects[oset->poolid].count(soid) == 0)
+ continue;
+ Object *ob = objects[oset->poolid][soid];
+
+ ob->discard(ex.offset, ex.length, gather);
+ }
+}
+
+void ObjectCacher::_discard_finish(ObjectSet *oset, bool was_dirty,
+ Context* on_finish)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+
+ // did we truncate off dirty data?
+ if (flush_set_callback && was_dirty && oset->dirty_or_tx == 0) {
+ flush_set_callback(flush_set_callback_arg, oset);
+ }
+
+ // notify that in-flight writeback has completed
+ if (on_finish != nullptr) {
+ on_finish->complete(0);
+ }
+}
+
+void ObjectCacher::verify_stats() const
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 10) << "verify_stats" << dendl;
+
+ loff_t clean = 0, zero = 0, dirty = 0, rx = 0, tx = 0, missing = 0,
+ error = 0;
+ for (vector<ceph::unordered_map<sobject_t, Object*> >::const_iterator i
+ = objects.begin();
+ i != objects.end();
+ ++i) {
+ for (ceph::unordered_map<sobject_t, Object*>::const_iterator p
+ = i->begin();
+ p != i->end();
+ ++p) {
+ Object *ob = p->second;
+ for (map<loff_t, BufferHead*>::const_iterator q = ob->data.begin();
+ q != ob->data.end();
+ ++q) {
+ BufferHead *bh = q->second;
+ switch (bh->get_state()) {
+ case BufferHead::STATE_MISSING:
+ missing += bh->length();
+ break;
+ case BufferHead::STATE_CLEAN:
+ clean += bh->length();
+ break;
+ case BufferHead::STATE_ZERO:
+ zero += bh->length();
+ break;
+ case BufferHead::STATE_DIRTY:
+ dirty += bh->length();
+ break;
+ case BufferHead::STATE_TX:
+ tx += bh->length();
+ break;
+ case BufferHead::STATE_RX:
+ rx += bh->length();
+ break;
+ case BufferHead::STATE_ERROR:
+ error += bh->length();
+ break;
+ default:
+ ceph_abort();
+ }
+ }
+ }
+ }
+
+ ldout(cct, 10) << " clean " << clean << " rx " << rx << " tx " << tx
+ << " dirty " << dirty << " missing " << missing
+ << " error " << error << dendl;
+ ceph_assert(clean == stat_clean);
+ ceph_assert(rx == stat_rx);
+ ceph_assert(tx == stat_tx);
+ ceph_assert(dirty == stat_dirty);
+ ceph_assert(missing == stat_missing);
+ ceph_assert(zero == stat_zero);
+ ceph_assert(error == stat_error);
+}
+
+void ObjectCacher::bh_stat_add(BufferHead *bh)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ switch (bh->get_state()) {
+ case BufferHead::STATE_MISSING:
+ stat_missing += bh->length();
+ break;
+ case BufferHead::STATE_CLEAN:
+ stat_clean += bh->length();
+ break;
+ case BufferHead::STATE_ZERO:
+ stat_zero += bh->length();
+ break;
+ case BufferHead::STATE_DIRTY:
+ stat_dirty += bh->length();
+ bh->ob->dirty_or_tx += bh->length();
+ bh->ob->oset->dirty_or_tx += bh->length();
+ break;
+ case BufferHead::STATE_TX:
+ stat_tx += bh->length();
+ bh->ob->dirty_or_tx += bh->length();
+ bh->ob->oset->dirty_or_tx += bh->length();
+ break;
+ case BufferHead::STATE_RX:
+ stat_rx += bh->length();
+ break;
+ case BufferHead::STATE_ERROR:
+ stat_error += bh->length();
+ break;
+ default:
+ ceph_abort_msg("bh_stat_add: invalid bufferhead state");
+ }
+ if (get_stat_dirty_waiting() > 0)
+ stat_cond.notify_all();
+}
+
+void ObjectCacher::bh_stat_sub(BufferHead *bh)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ switch (bh->get_state()) {
+ case BufferHead::STATE_MISSING:
+ stat_missing -= bh->length();
+ break;
+ case BufferHead::STATE_CLEAN:
+ stat_clean -= bh->length();
+ break;
+ case BufferHead::STATE_ZERO:
+ stat_zero -= bh->length();
+ break;
+ case BufferHead::STATE_DIRTY:
+ stat_dirty -= bh->length();
+ bh->ob->dirty_or_tx -= bh->length();
+ bh->ob->oset->dirty_or_tx -= bh->length();
+ break;
+ case BufferHead::STATE_TX:
+ stat_tx -= bh->length();
+ bh->ob->dirty_or_tx -= bh->length();
+ bh->ob->oset->dirty_or_tx -= bh->length();
+ break;
+ case BufferHead::STATE_RX:
+ stat_rx -= bh->length();
+ break;
+ case BufferHead::STATE_ERROR:
+ stat_error -= bh->length();
+ break;
+ default:
+ ceph_abort_msg("bh_stat_sub: invalid bufferhead state");
+ }
+}
+
+void ObjectCacher::bh_set_state(BufferHead *bh, int s)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ int state = bh->get_state();
+ // move between lru lists?
+ if (s == BufferHead::STATE_DIRTY && state != BufferHead::STATE_DIRTY) {
+ bh_lru_rest.lru_remove(bh);
+ bh_lru_dirty.lru_insert_top(bh);
+  } else if (s != BufferHead::STATE_DIRTY && state == BufferHead::STATE_DIRTY) {
+ bh_lru_dirty.lru_remove(bh);
+ if (bh->get_dontneed())
+ bh_lru_rest.lru_insert_bot(bh);
+ else
+ bh_lru_rest.lru_insert_top(bh);
+ }
+
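+  // track membership in dirty_or_tx_bh as the state crosses into or out of
+  // the dirty/tx set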
+ if ((s == BufferHead::STATE_TX ||
+ s == BufferHead::STATE_DIRTY) &&
+ state != BufferHead::STATE_TX &&
+ state != BufferHead::STATE_DIRTY) {
+ dirty_or_tx_bh.insert(bh);
+ } else if ((state == BufferHead::STATE_TX ||
+ state == BufferHead::STATE_DIRTY) &&
+ s != BufferHead::STATE_TX &&
+ s != BufferHead::STATE_DIRTY) {
+ dirty_or_tx_bh.erase(bh);
+ }
+
+ if (s != BufferHead::STATE_ERROR &&
+ state == BufferHead::STATE_ERROR) {
+ bh->error = 0;
+ }
+
+ // set state
+ bh_stat_sub(bh);
+ bh->set_state(s);
+ bh_stat_add(bh);
+}
+
+void ObjectCacher::bh_add(Object *ob, BufferHead *bh)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ldout(cct, 30) << "bh_add " << *ob << " " << *bh << dendl;
+ ob->add_bh(bh);
+ if (bh->is_dirty()) {
+ bh_lru_dirty.lru_insert_top(bh);
+ dirty_or_tx_bh.insert(bh);
+ } else {
+ if (bh->get_dontneed())
+ bh_lru_rest.lru_insert_bot(bh);
+ else
+ bh_lru_rest.lru_insert_top(bh);
+ }
+
+ if (bh->is_tx()) {
+ dirty_or_tx_bh.insert(bh);
+ }
+ bh_stat_add(bh);
+}
+
+void ObjectCacher::bh_remove(Object *ob, BufferHead *bh)
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ceph_assert(bh->get_journal_tid() == 0);
+ ldout(cct, 30) << "bh_remove " << *ob << " " << *bh << dendl;
+ ob->remove_bh(bh);
+ if (bh->is_dirty()) {
+ bh_lru_dirty.lru_remove(bh);
+ dirty_or_tx_bh.erase(bh);
+ } else {
+ bh_lru_rest.lru_remove(bh);
+ }
+
+ if (bh->is_tx()) {
+ dirty_or_tx_bh.erase(bh);
+ }
+ bh_stat_sub(bh);
+ if (get_stat_dirty_waiting() > 0)
+ stat_cond.notify_all();
+}
+
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
new file mode 100644
index 000000000..2101692e1
--- /dev/null
+++ b/src/osdc/ObjectCacher.h
@@ -0,0 +1,781 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OBJECTCACHER_H
+#define CEPH_OBJECTCACHER_H
+
+#include "include/types.h"
+#include "include/lru.h"
+#include "include/Context.h"
+#include "include/xlist.h"
+#include "include/common_fwd.h"
+
+#include "common/Cond.h"
+#include "common/Finisher.h"
+#include "common/Thread.h"
+#include "common/zipkin_trace.h"
+
+#include "Objecter.h"
+#include "Striper.h"
+
+class WritebackHandler;
+
+enum {
+ l_objectcacher_first = 25000,
+
+ l_objectcacher_cache_ops_hit, // ops we satisfy completely from cache
+ l_objectcacher_cache_ops_miss, // ops we don't satisfy completely from cache
+
+ l_objectcacher_cache_bytes_hit, // bytes read directly from cache
+
+  l_objectcacher_cache_bytes_miss, // bytes we couldn't read directly
+                                   // from cache
+
+ l_objectcacher_data_read, // total bytes read out
+ l_objectcacher_data_written, // bytes written to cache
+ l_objectcacher_data_flushed, // bytes flushed to WritebackHandler
+ l_objectcacher_overwritten_in_flush, // bytes overwritten while
+ // flushing is in progress
+
+ l_objectcacher_write_ops_blocked, // total write ops we delayed due
+ // to dirty limits
+ l_objectcacher_write_bytes_blocked, // total number of write bytes
+ // we delayed due to dirty
+ // limits
+ l_objectcacher_write_time_blocked, // total time in seconds spent
+ // blocking a write due to dirty
+ // limits
+
+ l_objectcacher_last,
+};
+
+class ObjectCacher {
+ PerfCounters *perfcounter;
+ public:
+ CephContext *cct;
+ class Object;
+ struct ObjectSet;
+ class C_ReadFinish;
+
+ typedef void (*flush_set_callback_t) (void *p, ObjectSet *oset);
+
+ // read scatter/gather
+ struct OSDRead {
+ std::vector<ObjectExtent> extents;
+ snapid_t snap;
+ ceph::buffer::list *bl;
+ int fadvise_flags;
+ OSDRead(snapid_t s, ceph::buffer::list *b, int f)
+ : snap(s), bl(b), fadvise_flags(f) {}
+ };
+
+ OSDRead *prepare_read(snapid_t snap, ceph::buffer::list *b, int f) const {
+ return new OSDRead(snap, b, f);
+ }
+
+ // write scatter/gather
+ struct OSDWrite {
+ std::vector<ObjectExtent> extents;
+ SnapContext snapc;
+ ceph::buffer::list bl;
+ ceph::real_time mtime;
+ int fadvise_flags;
+ ceph_tid_t journal_tid;
+ OSDWrite(const SnapContext& sc, const ceph::buffer::list& b, ceph::real_time mt,
+ int f, ceph_tid_t _journal_tid)
+ : snapc(sc), bl(b), mtime(mt), fadvise_flags(f),
+ journal_tid(_journal_tid) {}
+ };
+
+ OSDWrite *prepare_write(const SnapContext& sc,
+ const ceph::buffer::list &b,
+ ceph::real_time mt,
+ int f,
+ ceph_tid_t journal_tid) const {
+ return new OSDWrite(sc, b, mt, f, journal_tid);
+ }
+
+
+
+ // ******* BufferHead *********
+ class BufferHead : public LRUObject {
+ public:
+ // states
+ static const int STATE_MISSING = 0;
+ static const int STATE_CLEAN = 1;
+ static const int STATE_ZERO = 2; // NOTE: these are *clean* zeros
+ static const int STATE_DIRTY = 3;
+ static const int STATE_RX = 4;
+ static const int STATE_TX = 5;
+ static const int STATE_ERROR = 6; // a read error occurred
+
+ private:
+ // my fields
+ int state;
+ int ref;
+ struct {
+ loff_t start, length; // bh extent in object
+ } ex;
+    bool dontneed; // the buffer is not needed by anyone
+    bool nocache;  // the buffer is not needed by this particular caller
+
+ public:
+ Object *ob;
+ ceph::buffer::list bl;
+ ceph_tid_t last_write_tid; // version of bh (if non-zero)
+ ceph_tid_t last_read_tid; // tid of last read op (if any)
+ ceph::real_time last_write;
+ SnapContext snapc;
+ ceph_tid_t journal_tid;
+ int error; // holds return value for failed reads
+
+ std::map<loff_t, std::list<Context*> > waitfor_read;
+
+ // cons
+ explicit BufferHead(Object *o) :
+ state(STATE_MISSING),
+ ref(0),
+ dontneed(false),
+ nocache(false),
+ ob(o),
+ last_write_tid(0),
+ last_read_tid(0),
+ journal_tid(0),
+ error(0) {
+ ex.start = ex.length = 0;
+ }
+
+ // extent
+ loff_t start() const { return ex.start; }
+ void set_start(loff_t s) { ex.start = s; }
+ loff_t length() const { return ex.length; }
+ void set_length(loff_t l) { ex.length = l; }
+ loff_t end() const { return ex.start + ex.length; }
+ loff_t last() const { return end() - 1; }
+
+ // states
+ void set_state(int s) {
+ if (s == STATE_RX || s == STATE_TX) get();
+ if (state == STATE_RX || state == STATE_TX) put();
+ state = s;
+ }
+ int get_state() const { return state; }
+
+ inline int get_error() const {
+ return error;
+ }
+ inline void set_error(int _error) {
+ error = _error;
+ }
+
+ inline ceph_tid_t get_journal_tid() const {
+ return journal_tid;
+ }
+ inline void set_journal_tid(ceph_tid_t _journal_tid) {
+ journal_tid = _journal_tid;
+ }
+
+ bool is_missing() const { return state == STATE_MISSING; }
+ bool is_dirty() const { return state == STATE_DIRTY; }
+ bool is_clean() const { return state == STATE_CLEAN; }
+ bool is_zero() const { return state == STATE_ZERO; }
+ bool is_tx() const { return state == STATE_TX; }
+ bool is_rx() const { return state == STATE_RX; }
+ bool is_error() const { return state == STATE_ERROR; }
+
+ // reference counting
+ int get() {
+ ceph_assert(ref >= 0);
+ if (ref == 0) lru_pin();
+ return ++ref;
+ }
+ int put() {
+ ceph_assert(ref > 0);
+ if (ref == 1) lru_unpin();
+ --ref;
+ return ref;
+ }
+
+ void set_dontneed(bool v) {
+ dontneed = v;
+ }
+ bool get_dontneed() const {
+ return dontneed;
+ }
+
+ void set_nocache(bool v) {
+ nocache = v;
+ }
+ bool get_nocache() const {
+ return nocache;
+ }
+
+ inline bool can_merge_journal(BufferHead *bh) const {
+ return (get_journal_tid() == bh->get_journal_tid());
+ }
+
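+    // strict weak ordering by ObjectSet, then Object, then start offset
+    // (pointer as a final tiebreaker); used to keep dirty_or_tx_bh grouped
+    // per ObjectSet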
+ struct ptr_lt {
+ bool operator()(const BufferHead* l, const BufferHead* r) const {
+ const Object *lob = l->ob;
+ const Object *rob = r->ob;
+ const ObjectSet *loset = lob->oset;
+ const ObjectSet *roset = rob->oset;
+ if (loset != roset)
+ return loset < roset;
+ if (lob != rob)
+ return lob < rob;
+ if (l->start() != r->start())
+ return l->start() < r->start();
+ return l < r;
+ }
+ };
+ };
+
+ // ******* Object *********
+ class Object : public LRUObject {
+ private:
+ // ObjectCacher::Object fields
+ int ref;
+ ObjectCacher *oc;
+ sobject_t oid;
+ friend struct ObjectSet;
+
+ public:
+ uint64_t object_no;
+ ObjectSet *oset;
+ xlist<Object*>::item set_item;
+ object_locator_t oloc;
+ uint64_t truncate_size, truncate_seq;
+
+ bool complete;
+ bool exists;
+
+ std::map<loff_t, BufferHead*> data;
+
+ ceph_tid_t last_write_tid; // version of bh (if non-zero)
+ ceph_tid_t last_commit_tid; // last update committed.
+
+ int dirty_or_tx;
+
+ std::map< ceph_tid_t, std::list<Context*> > waitfor_commit;
+ xlist<C_ReadFinish*> reads;
+
+ Object(const Object&) = delete;
+ Object& operator=(const Object&) = delete;
+
+ Object(ObjectCacher *_oc, sobject_t o, uint64_t ono, ObjectSet *os,
+ object_locator_t& l, uint64_t ts, uint64_t tq) :
+ ref(0),
+ oc(_oc),
+ oid(o), object_no(ono), oset(os), set_item(this), oloc(l),
+ truncate_size(ts), truncate_seq(tq),
+ complete(false), exists(true),
+ last_write_tid(0), last_commit_tid(0),
+ dirty_or_tx(0) {
+ // add to set
+ os->objects.push_back(&set_item);
+ }
+ ~Object() {
+ reads.clear();
+ ceph_assert(ref == 0);
+ ceph_assert(data.empty());
+ ceph_assert(dirty_or_tx == 0);
+ set_item.remove_myself();
+ }
+
+ sobject_t get_soid() const { return oid; }
+ object_t get_oid() { return oid.oid; }
+ snapid_t get_snap() { return oid.snap; }
+ ObjectSet *get_object_set() const { return oset; }
+ std::string get_namespace() { return oloc.nspace; }
+ uint64_t get_object_number() const { return object_no; }
+
+ const object_locator_t& get_oloc() const { return oloc; }
+ void set_object_locator(object_locator_t& l) { oloc = l; }
+
+ bool can_close() const {
+ if (lru_is_expireable()) {
+ ceph_assert(data.empty());
+ ceph_assert(waitfor_commit.empty());
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Check buffers and waiters for consistency
+ * - no overlapping buffers
+ * - index in map matches BH
+ * - waiters fall within BH
+ */
+ void audit_buffers();
+
+ /**
+ * find first buffer that includes or follows an offset
+ *
+ * @param offset object byte offset
+ * @return iterator pointing to buffer, or data.end()
+ */
+ std::map<loff_t,BufferHead*>::const_iterator data_lower_bound(loff_t offset) const {
+ auto p = data.lower_bound(offset);
+ if (p != data.begin() &&
+ (p == data.end() || p->first > offset)) {
+ --p; // might overlap!
+ if (p->first + p->second->length() <= offset)
+ ++p; // doesn't overlap.
+ }
+ return p;
+ }
+
+ // bh
+ // add to my map
+ void add_bh(BufferHead *bh) {
+ if (data.empty())
+ get();
+ ceph_assert(data.count(bh->start()) == 0);
+ data[bh->start()] = bh;
+ }
+ void remove_bh(BufferHead *bh) {
+ ceph_assert(data.count(bh->start()));
+ data.erase(bh->start());
+ if (data.empty())
+ put();
+ }
+
+ bool is_empty() const { return data.empty(); }
+
+ // mid-level
+ BufferHead *split(BufferHead *bh, loff_t off);
+ void merge_left(BufferHead *left, BufferHead *right);
+ bool can_merge_bh(BufferHead *left, BufferHead *right);
+ void try_merge_bh(BufferHead *bh);
+ void maybe_rebuild_buffer(BufferHead *bh);
+
+ bool is_cached(loff_t off, loff_t len) const;
+ bool include_all_cached_data(loff_t off, loff_t len);
+ int map_read(ObjectExtent &ex,
+ std::map<loff_t, BufferHead*>& hits,
+ std::map<loff_t, BufferHead*>& missing,
+ std::map<loff_t, BufferHead*>& rx,
+ std::map<loff_t, BufferHead*>& errors);
+ BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid);
+
+ void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
+ void truncate(loff_t s);
+ void discard(loff_t off, loff_t len, C_GatherBuilder* commit_gather);
+
+ // reference counting
+ int get() {
+ ceph_assert(ref >= 0);
+ if (ref == 0) lru_pin();
+ return ++ref;
+ }
+ int put() {
+ ceph_assert(ref > 0);
+ if (ref == 1) lru_unpin();
+ --ref;
+ return ref;
+ }
+ };
+
+
+ struct ObjectSet {
+ void *parent;
+
+ inodeno_t ino;
+ uint64_t truncate_seq, truncate_size;
+
+ int64_t poolid;
+ xlist<Object*> objects;
+
+ int dirty_or_tx;
+ bool return_enoent;
+
+ ObjectSet(void *p, int64_t _poolid, inodeno_t i)
+ : parent(p), ino(i), truncate_seq(0),
+ truncate_size(0), poolid(_poolid), dirty_or_tx(0),
+ return_enoent(false) {}
+
+ };
+
+
+ // ******* ObjectCacher *********
+ // ObjectCacher fields
+ private:
+ WritebackHandler& writeback_handler;
+ bool scattered_write;
+
+ std::string name;
+ ceph::mutex& lock;
+
+ uint64_t max_dirty, target_dirty, max_size, max_objects;
+ ceph::timespan max_dirty_age;
+ bool block_writes_upfront;
+
+ ZTracer::Endpoint trace_endpoint;
+
+ flush_set_callback_t flush_set_callback;
+ void *flush_set_callback_arg;
+
+ // indexed by pool_id
+ std::vector<ceph::unordered_map<sobject_t, Object*> > objects;
+
+ std::list<Context*> waitfor_read;
+
+ ceph_tid_t last_read_tid;
+
+ std::set<BufferHead*, BufferHead::ptr_lt> dirty_or_tx_bh;
+ LRU bh_lru_dirty, bh_lru_rest;
+ LRU ob_lru;
+
+ ceph::condition_variable flusher_cond;
+ bool flusher_stop;
+ void flusher_entry();
+ class FlusherThread : public Thread {
+ ObjectCacher *oc;
+ public:
+ explicit FlusherThread(ObjectCacher *o) : oc(o) {}
+ void *entry() override {
+ oc->flusher_entry();
+ return 0;
+ }
+ } flusher_thread;
+
+ Finisher finisher;
+
+ // objects
+ Object *get_object_maybe(sobject_t oid, object_locator_t &l) {
+ // have it?
+ if (((uint32_t)l.pool < objects.size()) &&
+ (objects[l.pool].count(oid)))
+ return objects[l.pool][oid];
+ return NULL;
+ }
+
+ Object *get_object(sobject_t oid, uint64_t object_no, ObjectSet *oset,
+ object_locator_t &l, uint64_t truncate_size,
+ uint64_t truncate_seq);
+ void close_object(Object *ob);
+
+ // bh stats
+ ceph::condition_variable stat_cond;
+
+ loff_t stat_clean;
+ loff_t stat_zero;
+ loff_t stat_dirty;
+ loff_t stat_rx;
+ loff_t stat_tx;
+ loff_t stat_missing;
+ loff_t stat_error;
+ loff_t stat_dirty_waiting; // bytes that writers are waiting on to write
+
+ size_t stat_nr_dirty_waiters;
+
+ void verify_stats() const;
+
+ void bh_stat_add(BufferHead *bh);
+ void bh_stat_sub(BufferHead *bh);
+ loff_t get_stat_tx() const { return stat_tx; }
+ loff_t get_stat_rx() const { return stat_rx; }
+ loff_t get_stat_dirty() const { return stat_dirty; }
+ loff_t get_stat_clean() const { return stat_clean; }
+ loff_t get_stat_zero() const { return stat_zero; }
+ loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; }
+ size_t get_stat_nr_dirty_waiters() const { return stat_nr_dirty_waiters; }
+
+ void touch_bh(BufferHead *bh) {
+ if (bh->is_dirty())
+ bh_lru_dirty.lru_touch(bh);
+ else
+ bh_lru_rest.lru_touch(bh);
+
+ bh->set_dontneed(false);
+ bh->set_nocache(false);
+ touch_ob(bh->ob);
+ }
+ void touch_ob(Object *ob) {
+ ob_lru.lru_touch(ob);
+ }
+ void bottouch_ob(Object *ob) {
+ ob_lru.lru_bottouch(ob);
+ }
+
+ // bh states
+ void bh_set_state(BufferHead *bh, int s);
+ void copy_bh_state(BufferHead *bh1, BufferHead *bh2) {
+ bh_set_state(bh2, bh1->get_state());
+ }
+
+ void mark_missing(BufferHead *bh) {
+    bh_set_state(bh, BufferHead::STATE_MISSING);
+ }
+ void mark_clean(BufferHead *bh) {
+ bh_set_state(bh, BufferHead::STATE_CLEAN);
+ }
+ void mark_zero(BufferHead *bh) {
+ bh_set_state(bh, BufferHead::STATE_ZERO);
+ }
+ void mark_rx(BufferHead *bh) {
+ bh_set_state(bh, BufferHead::STATE_RX);
+ }
+  void mark_tx(BufferHead *bh) {
+    bh_set_state(bh, BufferHead::STATE_TX);
+  }
+ void mark_error(BufferHead *bh) {
+ bh_set_state(bh, BufferHead::STATE_ERROR);
+ }
+ void mark_dirty(BufferHead *bh) {
+ bh_set_state(bh, BufferHead::STATE_DIRTY);
+ bh_lru_dirty.lru_touch(bh);
+ //bh->set_dirty_stamp(ceph_clock_now());
+ }
+
+ void bh_add(Object *ob, BufferHead *bh);
+ void bh_remove(Object *ob, BufferHead *bh);
+
+ // io
+ void bh_read(BufferHead *bh, int op_flags,
+ const ZTracer::Trace &parent_trace);
+ void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace);
+ void bh_write_scattered(std::list<BufferHead*>& blist);
+ void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff,
+ int64_t *amount, int *max_count);
+
+ void trim();
+ void flush(ZTracer::Trace *trace, loff_t amount=0);
+
+ /**
+ * flush a range of buffers
+ *
+ * Flush any buffers that intersect the specified extent. If len==0,
+ * flush *all* buffers for the object.
+ *
+ * @param o object
+ * @param off start offset
+ * @param len extent length, or 0 for entire object
+ * @return true if object was already clean/flushed.
+ */
+ bool flush(Object *o, loff_t off, loff_t len,
+ ZTracer::Trace *trace);
+ loff_t release(Object *o);
+ void purge(Object *o);
+
+ int64_t reads_outstanding;
+ ceph::condition_variable read_cond;
+
+ int _readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ bool external_call, ZTracer::Trace *trace);
+ void retry_waiting_reads();
+
+ public:
+ void bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid,
+ loff_t offset, uint64_t length,
+ ceph::buffer::list &bl, int r,
+ bool trust_enoent);
+ void bh_write_commit(int64_t poolid, sobject_t oid,
+ std::vector<std::pair<loff_t, uint64_t> >& ranges,
+ ceph_tid_t t, int r);
+
+ class C_WriteCommit;
+ class C_WaitForWrite;
+
+ void perf_start();
+ void perf_stop();
+
+
+
+ ObjectCacher(CephContext *cct_, std::string name, WritebackHandler& wb, ceph::mutex& l,
+ flush_set_callback_t flush_callback,
+ void *flush_callback_arg,
+ uint64_t max_bytes, uint64_t max_objects,
+ uint64_t max_dirty, uint64_t target_dirty, double max_age,
+ bool block_writes_upfront);
+ ~ObjectCacher();
+
+ void start() {
+ flusher_thread.create("flusher");
+ }
+ void stop() {
+ ceph_assert(flusher_thread.is_started());
+ lock.lock(); // hmm.. watch out for deadlock!
+ flusher_stop = true;
+ flusher_cond.notify_all();
+ lock.unlock();
+ flusher_thread.join();
+ }
+
+
+ class C_RetryRead;
+
+
+ // non-blocking. async.
+
+ /**
+ * @note total read size must be <= INT_MAX, since
+ * the return value is total bytes read
+ */
+ int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ ZTracer::Trace *parent_trace = nullptr);
+ int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
+ ZTracer::Trace *parent_trace = nullptr);
+ bool is_cached(ObjectSet *oset, std::vector<ObjectExtent>& extents,
+ snapid_t snapid);
+
+private:
+ // write blocking
+ int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
+ ZTracer::Trace *trace, Context *onfreespace);
+ void _maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace);
+ bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
+
+ void _discard(ObjectSet *oset, const std::vector<ObjectExtent>& exls,
+ C_GatherBuilder* gather);
+ void _discard_finish(ObjectSet *oset, bool was_dirty, Context* on_finish);
+
+public:
+ bool set_is_empty(ObjectSet *oset);
+ bool set_is_cached(ObjectSet *oset);
+ bool set_is_dirty_or_committing(ObjectSet *oset);
+
+ bool flush_set(ObjectSet *oset, Context *onfinish=0);
+ bool flush_set(ObjectSet *oset, std::vector<ObjectExtent>& ex,
+ ZTracer::Trace *trace, Context *onfinish = 0);
+ bool flush_all(Context *onfinish = 0);
+
+ void purge_set(ObjectSet *oset);
+
+ // returns # of bytes not released (ie non-clean)
+ loff_t release_set(ObjectSet *oset);
+ uint64_t release_all();
+
+ void discard_set(ObjectSet *oset, const std::vector<ObjectExtent>& ex);
+ void discard_writeback(ObjectSet *oset, const std::vector<ObjectExtent>& ex,
+ Context* on_finish);
+
+ /**
+ * Retry any in-flight reads that get -ENOENT instead of marking
+ * them zero, and get rid of any cached -ENOENTs.
+ * After this is called and the cache's lock is unlocked,
+ * any new requests will treat -ENOENT normally.
+ */
+ void clear_nonexistence(ObjectSet *oset);
+
+
+ // cache sizes
+ void set_max_dirty(uint64_t v) {
+ max_dirty = v;
+ }
+ void set_target_dirty(int64_t v) {
+ target_dirty = v;
+ }
+ void set_max_size(int64_t v) {
+ max_size = v;
+ }
+ void set_max_dirty_age(double a) {
+ max_dirty_age = ceph::make_timespan(a);
+ }
+ void set_max_objects(int64_t v) {
+ max_objects = v;
+ }
+
+
+ // file functions
+
+ /*** async+caching (non-blocking) file interface ***/
+ int file_is_cached(ObjectSet *oset, file_layout_t *layout,
+ snapid_t snapid, loff_t offset, uint64_t len) {
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, oset->ino, layout, offset, len,
+ oset->truncate_size, extents);
+ return is_cached(oset, extents, snapid);
+ }
+
+ int file_read(ObjectSet *oset, file_layout_t *layout, snapid_t snapid,
+ loff_t offset, uint64_t len, ceph::buffer::list *bl, int flags,
+ Context *onfinish) {
+ OSDRead *rd = prepare_read(snapid, bl, flags);
+ Striper::file_to_extents(cct, oset->ino, layout, offset, len,
+ oset->truncate_size, rd->extents);
+ return readx(rd, oset, onfinish);
+ }
+
+ int file_write(ObjectSet *oset, file_layout_t *layout,
+ const SnapContext& snapc, loff_t offset, uint64_t len,
+ ceph::buffer::list& bl, ceph::real_time mtime, int flags) {
+ OSDWrite *wr = prepare_write(snapc, bl, mtime, flags, 0);
+ Striper::file_to_extents(cct, oset->ino, layout, offset, len,
+ oset->truncate_size, wr->extents);
+ return writex(wr, oset, nullptr);
+ }
+
+ bool file_flush(ObjectSet *oset, file_layout_t *layout,
+ const SnapContext& snapc, loff_t offset, uint64_t len,
+ Context *onfinish) {
+ std::vector<ObjectExtent> extents;
+ Striper::file_to_extents(cct, oset->ino, layout, offset, len,
+ oset->truncate_size, extents);
+ ZTracer::Trace trace;
+ return flush_set(oset, extents, &trace, onfinish);
+ }
+};
+
+
+inline std::ostream& operator<<(std::ostream &out,
+ const ObjectCacher::BufferHead &bh)
+{
+ out << "bh[ " << &bh << " "
+ << bh.start() << "~" << bh.length()
+ << " " << bh.ob
+ << " (" << bh.bl.length() << ")"
+ << " v " << bh.last_write_tid;
+ if (bh.get_journal_tid() != 0) {
+ out << " j " << bh.get_journal_tid();
+ }
+ if (bh.is_tx()) out << " tx";
+ if (bh.is_rx()) out << " rx";
+ if (bh.is_dirty()) out << " dirty";
+ if (bh.is_clean()) out << " clean";
+ if (bh.is_zero()) out << " zero";
+ if (bh.is_missing()) out << " missing";
+ if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0];
+ if (bh.error) out << " error=" << bh.error;
+ out << "]";
+ out << " waiters = {";
+ for (auto it = bh.waitfor_read.begin(); it != bh.waitfor_read.end(); ++it) {
+ out << " " << it->first << "->[";
+ for (auto lit = it->second.begin();
+ lit != it->second.end(); ++lit) {
+ out << *lit << ", ";
+ }
+ out << "]";
+ }
+ out << "}";
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream &out,
+ const ObjectCacher::ObjectSet &os)
+{
+ return out << "objectset[" << os.ino
+ << " ts " << os.truncate_seq << "/" << os.truncate_size
+ << " objects " << os.objects.size()
+ << " dirty_or_tx " << os.dirty_or_tx
+ << "]";
+}
+
+inline std::ostream& operator<<(std::ostream &out,
+ const ObjectCacher::Object &ob)
+{
+ out << "object["
+ << ob.get_soid() << " oset " << ob.oset << std::dec
+ << " wr " << ob.last_write_tid << "/" << ob.last_commit_tid;
+
+ if (ob.complete)
+ out << " COMPLETE";
+ if (!ob.exists)
+ out << " !EXISTS";
+
+ out << "]";
+ return out;
+}
+
+#endif
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
new file mode 100644
index 000000000..6fb200eb1
--- /dev/null
+++ b/src/osdc/Objecter.cc
@@ -0,0 +1,5344 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <cerrno>
+
+#include "Objecter.h"
+#include "osd/OSDMap.h"
+#include "osd/error_code.h"
+#include "Filer.h"
+
+#include "mon/MonClient.h"
+#include "mon/error_code.h"
+
+#include "msg/Messenger.h"
+#include "msg/Message.h"
+
+#include "messages/MPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDMap.h"
+
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+
+#include "messages/MGetPoolStats.h"
+#include "messages/MGetPoolStatsReply.h"
+#include "messages/MStatfs.h"
+#include "messages/MStatfsReply.h"
+
+#include "messages/MMonCommand.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+
+#include "messages/MWatchNotify.h"
+
+
+#include "common/Cond.h"
+#include "common/config.h"
+#include "common/perf_counters.h"
+#include "common/scrub_types.h"
+#include "include/str_list.h"
+#include "common/errno.h"
+#include "common/EventTrace.h"
+#include "common/async/waiter.h"
+#include "error_code.h"
+
+
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+using std::defer_lock;
+
+using ceph::real_time;
+using ceph::real_clock;
+
+using ceph::mono_clock;
+using ceph::mono_time;
+
+using ceph::timespan;
+
+using ceph::shunique_lock;
+using ceph::acquire_shared;
+using ceph::acquire_unique;
+
+namespace bc = boost::container;
+namespace bs = boost::system;
+namespace ca = ceph::async;
+namespace cb = ceph::buffer;
+
+#define dout_subsys ceph_subsys_objecter
+#undef dout_prefix
+#define dout_prefix *_dout << messenger->get_myname() << ".objecter "
+
+
+enum {
+ l_osdc_first = 123200,
+ l_osdc_op_active,
+ l_osdc_op_laggy,
+ l_osdc_op_send,
+ l_osdc_op_send_bytes,
+ l_osdc_op_resend,
+ l_osdc_op_reply,
+ l_osdc_oplen_avg,
+
+ l_osdc_op,
+ l_osdc_op_r,
+ l_osdc_op_w,
+ l_osdc_op_rmw,
+ l_osdc_op_pg,
+
+ l_osdc_osdop_stat,
+ l_osdc_osdop_create,
+ l_osdc_osdop_read,
+ l_osdc_osdop_write,
+ l_osdc_osdop_writefull,
+ l_osdc_osdop_writesame,
+ l_osdc_osdop_append,
+ l_osdc_osdop_zero,
+ l_osdc_osdop_truncate,
+ l_osdc_osdop_delete,
+ l_osdc_osdop_mapext,
+ l_osdc_osdop_sparse_read,
+ l_osdc_osdop_clonerange,
+ l_osdc_osdop_getxattr,
+ l_osdc_osdop_setxattr,
+ l_osdc_osdop_cmpxattr,
+ l_osdc_osdop_rmxattr,
+ l_osdc_osdop_resetxattrs,
+ l_osdc_osdop_call,
+ l_osdc_osdop_watch,
+ l_osdc_osdop_notify,
+ l_osdc_osdop_src_cmpxattr,
+ l_osdc_osdop_pgls,
+ l_osdc_osdop_pgls_filter,
+ l_osdc_osdop_other,
+
+ l_osdc_linger_active,
+ l_osdc_linger_send,
+ l_osdc_linger_resend,
+ l_osdc_linger_ping,
+
+ l_osdc_poolop_active,
+ l_osdc_poolop_send,
+ l_osdc_poolop_resend,
+
+ l_osdc_poolstat_active,
+ l_osdc_poolstat_send,
+ l_osdc_poolstat_resend,
+
+ l_osdc_statfs_active,
+ l_osdc_statfs_send,
+ l_osdc_statfs_resend,
+
+ l_osdc_command_active,
+ l_osdc_command_send,
+ l_osdc_command_resend,
+
+ l_osdc_map_epoch,
+ l_osdc_map_full,
+ l_osdc_map_inc,
+
+ l_osdc_osd_sessions,
+ l_osdc_osd_session_open,
+ l_osdc_osd_session_close,
+ l_osdc_osd_laggy,
+
+ l_osdc_osdop_omap_wr,
+ l_osdc_osdop_omap_rd,
+ l_osdc_osdop_omap_del,
+
+ l_osdc_last,
+};
+
+namespace {
+inline bs::error_code osdcode(int r) {
+ return (r < 0) ? bs::error_code(-r, osd_category()) : bs::error_code();
+}
+}
+
+// config obs ----------------------------
+
+class Objecter::RequestStateHook : public AdminSocketHook {
+ Objecter *m_objecter;
+public:
+ explicit RequestStateHook(Objecter *objecter);
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss,
+ cb::list& out) override;
+};
+
+std::unique_lock<std::mutex> Objecter::OSDSession::get_lock(object_t& oid)
+{
+ if (oid.name.empty())
+ return {};
+
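+  // shard completion serialization across a small pool of locks keyed by a
+  // hash of the object name; the returned lock is deferred (not yet acquired)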
+ static constexpr uint32_t HASH_PRIME = 1021;
+ uint32_t h = ceph_str_hash_linux(oid.name.c_str(), oid.name.size())
+ % HASH_PRIME;
+
+ return {completion_locks[h % num_locks], std::defer_lock};
+}
+
+const char** Objecter::get_tracked_conf_keys() const
+{
+ static const char *config_keys[] = {
+ "crush_location",
+ "rados_mon_op_timeout",
+ "rados_osd_op_timeout",
+ NULL
+ };
+ return config_keys;
+}
+
+
+void Objecter::handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed)
+{
+ if (changed.count("crush_location")) {
+ update_crush_location();
+ }
+ if (changed.count("rados_mon_op_timeout")) {
+ mon_timeout = conf.get_val<std::chrono::seconds>("rados_mon_op_timeout");
+ }
+ if (changed.count("rados_osd_op_timeout")) {
+ osd_timeout = conf.get_val<std::chrono::seconds>("rados_osd_op_timeout");
+ }
+}
+
+void Objecter::update_crush_location()
+{
+ unique_lock wl(rwlock);
+ crush_location = cct->crush_location.get_location();
+}
+
+// messages ------------------------------
+
+/*
+ * initialize only internal data structures, don't initiate cluster interaction
+ */
+void Objecter::init()
+{
+ ceph_assert(!initialized);
+
+ if (!logger) {
+ PerfCountersBuilder pcb(cct, "objecter", l_osdc_first, l_osdc_last);
+
+ pcb.add_u64(l_osdc_op_active, "op_active", "Operations active", "actv",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations");
+ pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations");
+ pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data", NULL, 0, unit_t(UNIT_BYTES));
+ pcb.add_u64_counter(l_osdc_op_resend, "op_resend", "Resent operations");
+ pcb.add_u64_counter(l_osdc_op_reply, "op_reply", "Operation reply");
+ pcb.add_u64_avg(l_osdc_oplen_avg, "oplen_avg", "Average length of operation vector");
+
+ pcb.add_u64_counter(l_osdc_op, "op", "Operations");
+ pcb.add_u64_counter(l_osdc_op_r, "op_r", "Read operations", "rd",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ pcb.add_u64_counter(l_osdc_op_w, "op_w", "Write operations", "wr",
+ PerfCountersBuilder::PRIO_CRITICAL);
+ pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw", "Read-modify-write operations",
+ "rdwr", PerfCountersBuilder::PRIO_INTERESTING);
+ pcb.add_u64_counter(l_osdc_op_pg, "op_pg", "PG operation");
+
+ pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat", "Stat operations");
+ pcb.add_u64_counter(l_osdc_osdop_create, "osdop_create",
+ "Create object operations");
+ pcb.add_u64_counter(l_osdc_osdop_read, "osdop_read", "Read operations");
+ pcb.add_u64_counter(l_osdc_osdop_write, "osdop_write", "Write operations");
+ pcb.add_u64_counter(l_osdc_osdop_writefull, "osdop_writefull",
+ "Write full object operations");
+ pcb.add_u64_counter(l_osdc_osdop_writesame, "osdop_writesame",
+ "Write same operations");
+ pcb.add_u64_counter(l_osdc_osdop_append, "osdop_append",
+ "Append operation");
+ pcb.add_u64_counter(l_osdc_osdop_zero, "osdop_zero",
+ "Set object to zero operations");
+ pcb.add_u64_counter(l_osdc_osdop_truncate, "osdop_truncate",
+ "Truncate object operations");
+ pcb.add_u64_counter(l_osdc_osdop_delete, "osdop_delete",
+ "Delete object operations");
+ pcb.add_u64_counter(l_osdc_osdop_mapext, "osdop_mapext",
+ "Map extent operations");
+ pcb.add_u64_counter(l_osdc_osdop_sparse_read, "osdop_sparse_read",
+ "Sparse read operations");
+ pcb.add_u64_counter(l_osdc_osdop_clonerange, "osdop_clonerange",
+ "Clone range operations");
+ pcb.add_u64_counter(l_osdc_osdop_getxattr, "osdop_getxattr",
+ "Get xattr operations");
+ pcb.add_u64_counter(l_osdc_osdop_setxattr, "osdop_setxattr",
+ "Set xattr operations");
+ pcb.add_u64_counter(l_osdc_osdop_cmpxattr, "osdop_cmpxattr",
+ "Xattr comparison operations");
+ pcb.add_u64_counter(l_osdc_osdop_rmxattr, "osdop_rmxattr",
+ "Remove xattr operations");
+ pcb.add_u64_counter(l_osdc_osdop_resetxattrs, "osdop_resetxattrs",
+ "Reset xattr operations");
+ pcb.add_u64_counter(l_osdc_osdop_call, "osdop_call",
+ "Call (execute) operations");
+ pcb.add_u64_counter(l_osdc_osdop_watch, "osdop_watch",
+ "Watch by object operations");
+ pcb.add_u64_counter(l_osdc_osdop_notify, "osdop_notify",
+ "Notify about object operations");
+ pcb.add_u64_counter(l_osdc_osdop_src_cmpxattr, "osdop_src_cmpxattr",
+ "Extended attribute comparison in multi operations");
+ pcb.add_u64_counter(l_osdc_osdop_pgls, "osdop_pgls");
+ pcb.add_u64_counter(l_osdc_osdop_pgls_filter, "osdop_pgls_filter");
+ pcb.add_u64_counter(l_osdc_osdop_other, "osdop_other", "Other operations");
+
+ pcb.add_u64(l_osdc_linger_active, "linger_active",
+ "Active lingering operations");
+ pcb.add_u64_counter(l_osdc_linger_send, "linger_send",
+ "Sent lingering operations");
+ pcb.add_u64_counter(l_osdc_linger_resend, "linger_resend",
+ "Resent lingering operations");
+ pcb.add_u64_counter(l_osdc_linger_ping, "linger_ping",
+ "Sent pings to lingering operations");
+
+ pcb.add_u64(l_osdc_poolop_active, "poolop_active",
+ "Active pool operations");
+ pcb.add_u64_counter(l_osdc_poolop_send, "poolop_send",
+ "Sent pool operations");
+ pcb.add_u64_counter(l_osdc_poolop_resend, "poolop_resend",
+ "Resent pool operations");
+
+ pcb.add_u64(l_osdc_poolstat_active, "poolstat_active",
+ "Active get pool stat operations");
+ pcb.add_u64_counter(l_osdc_poolstat_send, "poolstat_send",
+ "Pool stat operations sent");
+ pcb.add_u64_counter(l_osdc_poolstat_resend, "poolstat_resend",
+ "Resent pool stats");
+
+ pcb.add_u64(l_osdc_statfs_active, "statfs_active", "Statfs operations");
+ pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send", "Sent FS stats");
+ pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend",
+ "Resent FS stats");
+
+ pcb.add_u64(l_osdc_command_active, "command_active", "Active commands");
+ pcb.add_u64_counter(l_osdc_command_send, "command_send",
+ "Sent commands");
+ pcb.add_u64_counter(l_osdc_command_resend, "command_resend",
+ "Resent commands");
+
+ pcb.add_u64(l_osdc_map_epoch, "map_epoch", "OSD map epoch");
+ pcb.add_u64_counter(l_osdc_map_full, "map_full",
+ "Full OSD maps received");
+ pcb.add_u64_counter(l_osdc_map_inc, "map_inc",
+ "Incremental OSD maps received");
+
+ pcb.add_u64(l_osdc_osd_sessions, "osd_sessions",
+ "Open sessions"); // open sessions
+ pcb.add_u64_counter(l_osdc_osd_session_open, "osd_session_open",
+ "Sessions opened");
+ pcb.add_u64_counter(l_osdc_osd_session_close, "osd_session_close",
+ "Sessions closed");
+ pcb.add_u64(l_osdc_osd_laggy, "osd_laggy", "Laggy OSD sessions");
+
+ pcb.add_u64_counter(l_osdc_osdop_omap_wr, "omap_wr",
+ "OSD OMAP write operations");
+ pcb.add_u64_counter(l_osdc_osdop_omap_rd, "omap_rd",
+ "OSD OMAP read operations");
+ pcb.add_u64_counter(l_osdc_osdop_omap_del, "omap_del",
+ "OSD OMAP delete operations");
+
+ logger = pcb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(logger);
+ }
+
+ m_request_state_hook = new RequestStateHook(this);
+ auto admin_socket = cct->get_admin_socket();
+ int ret = admin_socket->register_command("objecter_requests",
+ m_request_state_hook,
+ "show in-progress osd requests");
+
+ /* Don't warn on EEXIST, happens if multiple ceph clients
+ * are instantiated from one process */
+ if (ret < 0 && ret != -EEXIST) {
+ lderr(cct) << "error registering admin socket command: "
+ << cpp_strerror(ret) << dendl;
+ }
+
+ update_crush_location();
+
+ cct->_conf.add_observer(this);
+
+ initialized = true;
+}
+
+/*
+ * ok, cluster interaction can happen
+ */
+void Objecter::start(const OSDMap* o)
+{
+ shared_lock rl(rwlock);
+
+ start_tick();
+ if (o) {
+ osdmap->deepish_copy_from(*o);
+ prune_pg_mapping(osdmap->get_pools());
+ } else if (osdmap->get_epoch() == 0) {
+ _maybe_request_map();
+ }
+}
+
+void Objecter::shutdown()
+{
+ ceph_assert(initialized);
+
+ unique_lock wl(rwlock);
+
+ initialized = false;
+
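+ // drop rwlock while removing the config observer: handle_conf_change() can
+ // take rwlock (via update_crush_location()), so holding it here risks a
+ // lock-order inversion with an in-flight observer callback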
+ wl.unlock();
+ cct->_conf.remove_observer(this);
+ wl.lock();
+
+ while (!osd_sessions.empty()) {
+ auto p = osd_sessions.begin();
+ close_session(p->second);
+ }
+
+ while (!check_latest_map_lingers.empty()) {
+ auto i = check_latest_map_lingers.begin();
+ i->second->put();
+ check_latest_map_lingers.erase(i->first);
+ }
+
+ while (!check_latest_map_ops.empty()) {
+ auto i = check_latest_map_ops.begin();
+ i->second->put();
+ check_latest_map_ops.erase(i->first);
+ }
+
+ while (!check_latest_map_commands.empty()) {
+ auto i = check_latest_map_commands.begin();
+ i->second->put();
+ check_latest_map_commands.erase(i->first);
+ }
+
+ while (!poolstat_ops.empty()) {
+ auto i = poolstat_ops.begin();
+ delete i->second;
+ poolstat_ops.erase(i->first);
+ }
+
+ while (!statfs_ops.empty()) {
+ auto i = statfs_ops.begin();
+ delete i->second;
+ statfs_ops.erase(i->first);
+ }
+
+ while (!pool_ops.empty()) {
+ auto i = pool_ops.begin();
+ delete i->second;
+ pool_ops.erase(i->first);
+ }
+
+ ldout(cct, 20) << __func__ << " clearing up homeless session..." << dendl;
+ while (!homeless_session->linger_ops.empty()) {
+ auto i = homeless_session->linger_ops.begin();
+ ldout(cct, 10) << " linger_op " << i->first << dendl;
+ LingerOp *lop = i->second;
+ {
+ std::unique_lock swl(homeless_session->lock);
+ _session_linger_op_remove(homeless_session, lop);
+ }
+ linger_ops.erase(lop->linger_id);
+ linger_ops_set.erase(lop);
+ lop->put();
+ }
+
+ while (!homeless_session->ops.empty()) {
+ auto i = homeless_session->ops.begin();
+ ldout(cct, 10) << " op " << i->first << dendl;
+ auto op = i->second;
+ {
+ std::unique_lock swl(homeless_session->lock);
+ _session_op_remove(homeless_session, op);
+ }
+ op->put();
+ }
+
+ while (!homeless_session->command_ops.empty()) {
+ auto i = homeless_session->command_ops.begin();
+ ldout(cct, 10) << " command_op " << i->first << dendl;
+ auto cop = i->second;
+ {
+ std::unique_lock swl(homeless_session->lock);
+ _session_command_op_remove(homeless_session, cop);
+ }
+ cop->put();
+ }
+
+ if (tick_event) {
+ if (timer.cancel_event(tick_event)) {
+ ldout(cct, 10) << " successfully canceled tick" << dendl;
+ }
+ tick_event = 0;
+ }
+
+ if (logger) {
+ cct->get_perfcounters_collection()->remove(logger);
+ delete logger;
+ logger = NULL;
+ }
+
+ // Let go of Objecter write lock so timer thread can shutdown
+ wl.unlock();
+
+ // Outside of lock to avoid cycle WRT calls to RequestStateHook
+ // This is safe because we guarantee no concurrent calls to
+ // shutdown() with the ::initialized check at start.
+ if (m_request_state_hook) {
+ auto admin_socket = cct->get_admin_socket();
+ admin_socket->unregister_commands(m_request_state_hook);
+ delete m_request_state_hook;
+ m_request_state_hook = NULL;
+ }
+}
+
+void Objecter::_send_linger(LingerOp *info,
+ ceph::shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);
+
+ fu2::unique_function<Op::OpSig> oncommit;
+ osdc_opvec opv;
+ std::shared_lock watchl(info->watch_lock);
+ cb::list *poutbl = nullptr;
+ if (info->registered && info->is_watch) {
+ ldout(cct, 15) << "send_linger " << info->linger_id << " reconnect"
+ << dendl;
+ opv.push_back(OSDOp());
+ opv.back().op.op = CEPH_OSD_OP_WATCH;
+ opv.back().op.watch.cookie = info->get_cookie();
+ opv.back().op.watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
+ opv.back().op.watch.gen = ++info->register_gen;
+ oncommit = CB_Linger_Reconnect(this, info);
+ } else {
+ ldout(cct, 15) << "send_linger " << info->linger_id << " register"
+ << dendl;
+ opv = info->ops;
+ // TODO Augment ca::Completion with an equivalent of
+ // target so we can handle these cases better.
+ auto c = std::make_unique<CB_Linger_Commit>(this, info);
+ if (!info->is_watch) {
+ info->notify_id = 0;
+ poutbl = &c->outbl;
+ }
+ oncommit = [c = std::move(c)](bs::error_code ec) mutable {
+ std::move(*c)(ec);
+ };
+ }
+ watchl.unlock();
+ auto o = new Op(info->target.base_oid, info->target.base_oloc,
+ std::move(opv), info->target.flags | CEPH_OSD_FLAG_READ,
+ std::move(oncommit), info->pobjver);
+ o->outbl = poutbl;
+ o->snapid = info->snap;
+ o->snapc = info->snapc;
+ o->mtime = info->mtime;
+
+ o->target = info->target;
+ o->tid = ++last_tid;
+
+ // do not resend this; we will send a new op to reregister
+ o->should_resend = false;
+ o->ctx_budgeted = true;
+
+ if (info->register_tid) {
+ // repeat send. cancel old registration op, if any.
+ std::unique_lock sl(info->session->lock);
+ if (info->session->ops.count(info->register_tid)) {
+ auto o = info->session->ops[info->register_tid];
+ _op_cancel_map_check(o);
+ _cancel_linger_op(o);
+ }
+ sl.unlock();
+ }
+
+ _op_submit_with_budget(o, sul, &info->register_tid, &info->ctx_budget);
+
+ logger->inc(l_osdc_linger_send);
+}
+
+void Objecter::_linger_commit(LingerOp *info, bs::error_code ec,
+ cb::list& outbl)
+{
+ std::unique_lock wl(info->watch_lock);
+ ldout(cct, 10) << "_linger_commit " << info->linger_id << dendl;
+ if (info->on_reg_commit) {
+ info->on_reg_commit->defer(std::move(info->on_reg_commit),
+ ec, cb::list{});
+ info->on_reg_commit.reset();
+ }
+ if (ec && info->on_notify_finish) {
+ info->on_notify_finish->defer(std::move(info->on_notify_finish),
+ ec, cb::list{});
+ info->on_notify_finish.reset();
+ }
+
+ // only tell the user the first time we do this
+ info->registered = true;
+ info->pobjver = NULL;
+
+ if (!info->is_watch) {
+ // make note of the notify_id
+ auto p = outbl.cbegin();
+ try {
+ decode(info->notify_id, p);
+ ldout(cct, 10) << "_linger_commit notify_id=" << info->notify_id
+ << dendl;
+ }
+ catch (cb::error& e) {
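+ // ignore decode errors; notify_id is simply left unchanged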
+ }
+ }
+}
+
+class CB_DoWatchError {
+ Objecter *objecter;
+ boost::intrusive_ptr<Objecter::LingerOp> info;
+ bs::error_code ec;
+public:
+ CB_DoWatchError(Objecter *o, Objecter::LingerOp *i,
+ bs::error_code ec)
+ : objecter(o), info(i), ec(ec) {
+ info->_queued_async();
+ }
+ void operator()() {
+ std::unique_lock wl(objecter->rwlock);
+ bool canceled = info->canceled;
+ wl.unlock();
+
+ if (!canceled) {
+ info->handle(ec, 0, info->get_cookie(), 0, {});
+ }
+
+ info->finished_async();
+ }
+};
+
+bs::error_code Objecter::_normalize_watch_error(bs::error_code ec)
+{
+ // translate ENOENT -> ENOTCONN so that a delete->disconnection
+ // notification and a failure to reconnect because we raced with
+ // the delete appear the same to the user.
+ if (ec == bs::errc::no_such_file_or_directory)
+ ec = bs::error_code(ENOTCONN, osd_category());
+ return ec;
+}
+
+void Objecter::_linger_reconnect(LingerOp *info, bs::error_code ec)
+{
+ ldout(cct, 10) << __func__ << " " << info->linger_id << " = " << ec
+ << " (last_error " << info->last_error << ")" << dendl;
+ std::unique_lock wl(info->watch_lock);
+ if (ec) {
+ if (!info->last_error) {
+ ec = _normalize_watch_error(ec);
+ if (info->handle) {
+ boost::asio::defer(finish_strand, CB_DoWatchError(this, info, ec));
+ }
+ }
+ }
+
+ info->last_error = ec;
+}
+
+void Objecter::_send_linger_ping(LingerOp *info)
+{
+ // rwlock is locked unique
+ // info->session->lock is locked
+
+ if (cct->_conf->objecter_inject_no_watch_ping) {
+ ldout(cct, 10) << __func__ << " " << info->linger_id << " SKIPPING"
+ << dendl;
+ return;
+ }
+ if (osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) {
+ ldout(cct, 10) << __func__ << " PAUSERD" << dendl;
+ return;
+ }
+
+ ceph::coarse_mono_time now = ceph::coarse_mono_clock::now();
+ ldout(cct, 10) << __func__ << " " << info->linger_id << " now " << now
+ << dendl;
+
+ osdc_opvec opv(1);
+ opv[0].op.op = CEPH_OSD_OP_WATCH;
+ opv[0].op.watch.cookie = info->get_cookie();
+ opv[0].op.watch.op = CEPH_OSD_WATCH_OP_PING;
+ opv[0].op.watch.gen = info->register_gen;
+
+ Op *o = new Op(info->target.base_oid, info->target.base_oloc,
+ std::move(opv), info->target.flags | CEPH_OSD_FLAG_READ,
+ CB_Linger_Ping(this, info, now),
+ nullptr, nullptr);
+ o->target = info->target;
+ o->should_resend = false;
+ _send_op_account(o);
+ o->tid = ++last_tid;
+ _session_op_assign(info->session, o);
+ _send_op(o);
+ info->ping_tid = o->tid;
+
+ logger->inc(l_osdc_linger_ping);
+}
+
+void Objecter::_linger_ping(LingerOp *info, bs::error_code ec, ceph::coarse_mono_time sent,
+ uint32_t register_gen)
+{
+ std::unique_lock l(info->watch_lock);
+ ldout(cct, 10) << __func__ << " " << info->linger_id
+ << " sent " << sent << " gen " << register_gen << " = " << ec
+ << " (last_error " << info->last_error
+ << " register_gen " << info->register_gen << ")" << dendl;
+ if (info->register_gen == register_gen) {
+ if (!ec) {
+ info->watch_valid_thru = sent;
+ } else if (ec && !info->last_error) {
+ ec = _normalize_watch_error(ec);
+ info->last_error = ec;
+ if (info->handle) {
+ boost::asio::defer(finish_strand, CB_DoWatchError(this, info, ec));
+ }
+ }
+ } else {
+ ldout(cct, 20) << " ignoring old gen" << dendl;
+ }
+}
+
+tl::expected<ceph::timespan,
+ bs::error_code> Objecter::linger_check(LingerOp *info)
+{
+ std::shared_lock l(info->watch_lock);
+
+ ceph::coarse_mono_time stamp = info->watch_valid_thru;
+ if (!info->watch_pending_async.empty())
+ stamp = std::min(info->watch_valid_thru, info->watch_pending_async.front());
+ auto age = ceph::coarse_mono_clock::now() - stamp;
+
+ ldout(cct, 10) << __func__ << " " << info->linger_id
+ << " err " << info->last_error
+ << " age " << age << dendl;
+ if (info->last_error)
+ return tl::unexpected(info->last_error);
+ // return a safe upper bound (we are truncating to ms)
+ return age;
+}
+
+void Objecter::linger_cancel(LingerOp *info)
+{
+ unique_lock wl(rwlock);
+ _linger_cancel(info);
+ info->put();
+}
+
+void Objecter::_linger_cancel(LingerOp *info)
+{
+ // rwlock is locked unique
+ ldout(cct, 20) << __func__ << " linger_id=" << info->linger_id << dendl;
+ if (!info->canceled) {
+ OSDSession *s = info->session;
+ std::unique_lock sl(s->lock);
+ _session_linger_op_remove(s, info);
+ sl.unlock();
+
+ linger_ops.erase(info->linger_id);
+ linger_ops_set.erase(info);
+ ceph_assert(linger_ops.size() == linger_ops_set.size());
+
+ info->canceled = true;
+ info->put();
+
+ logger->dec(l_osdc_linger_active);
+ }
+}
+
+
+
+Objecter::LingerOp *Objecter::linger_register(const object_t& oid,
+ const object_locator_t& oloc,
+ int flags)
+{
+ unique_lock l(rwlock);
+ // Acquire linger ID
+ auto info = new LingerOp(this, ++max_linger_id);
+ info->target.base_oid = oid;
+ info->target.base_oloc = oloc;
+ if (info->target.base_oloc.key == oid)
+ info->target.base_oloc.key.clear();
+ info->target.flags = flags;
+ info->watch_valid_thru = ceph::coarse_mono_clock::now();
+ ldout(cct, 10) << __func__ << " info " << info
+ << " linger_id " << info->linger_id
+ << " cookie " << info->get_cookie()
+ << dendl;
+ linger_ops[info->linger_id] = info;
+ linger_ops_set.insert(info);
+ ceph_assert(linger_ops.size() == linger_ops_set.size());
+
+ info->get(); // for the caller
+ return info;
+}
+
+ceph_tid_t Objecter::linger_watch(LingerOp *info,
+ ObjectOperation& op,
+ const SnapContext& snapc,
+ real_time mtime,
+ cb::list& inbl,
+ decltype(info->on_reg_commit)&& oncommit,
+ version_t *objver)
+{
+ info->is_watch = true;
+ info->snapc = snapc;
+ info->mtime = mtime;
+ info->target.flags |= CEPH_OSD_FLAG_WRITE;
+ info->ops = op.ops;
+ info->inbl = inbl;
+ info->pobjver = objver;
+ info->on_reg_commit = std::move(oncommit);
+
+ info->ctx_budget = take_linger_budget(info);
+
+ shunique_lock sul(rwlock, ceph::acquire_unique);
+ _linger_submit(info, sul);
+ logger->inc(l_osdc_linger_active);
+
+ op.clear();
+ return info->linger_id;
+}
+
+ceph_tid_t Objecter::linger_notify(LingerOp *info,
+ ObjectOperation& op,
+ snapid_t snap, cb::list& inbl,
+ decltype(LingerOp::on_reg_commit)&& onfinish,
+ version_t *objver)
+{
+ info->snap = snap;
+ info->target.flags |= CEPH_OSD_FLAG_READ;
+ info->ops = op.ops;
+ info->inbl = inbl;
+ info->pobjver = objver;
+ info->on_reg_commit = std::move(onfinish);
+ info->ctx_budget = take_linger_budget(info);
+
+ shunique_lock sul(rwlock, ceph::acquire_unique);
+ _linger_submit(info, sul);
+ logger->inc(l_osdc_linger_active);
+
+ op.clear();
+ return info->linger_id;
+}
+
+void Objecter::_linger_submit(LingerOp *info,
+ ceph::shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);
+ ceph_assert(info->linger_id);
+ ceph_assert(info->ctx_budget != -1); // caller needs to have taken budget already!
+
+ // Populate Op::target
+ OSDSession *s = NULL;
+ _calc_target(&info->target, nullptr);
+
+ // Create LingerOp<->OSDSession relation
+ int r = _get_session(info->target.osd, &s, sul);
+ ceph_assert(r == 0);
+ unique_lock sl(s->lock);
+ _session_linger_op_assign(s, info);
+ sl.unlock();
+ put_session(s);
+
+ _send_linger(info, sul);
+}
+
+struct CB_DoWatchNotify {
+ Objecter *objecter;
+ boost::intrusive_ptr<Objecter::LingerOp> info;
+ boost::intrusive_ptr<MWatchNotify> msg;
+ CB_DoWatchNotify(Objecter *o, Objecter::LingerOp *i, MWatchNotify *m)
+ : objecter(o), info(i), msg(m) {
+ info->_queued_async();
+ }
+ void operator()() {
+ objecter->_do_watch_notify(std::move(info), std::move(msg));
+ }
+};
+
+void Objecter::handle_watch_notify(MWatchNotify *m)
+{
+ shared_lock l(rwlock);
+ if (!initialized) {
+ return;
+ }
+
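+ // the watch cookie is the LingerOp pointer itself; validate it against the
+ // set of live linger ops before dereferencing, since it may already be gone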
+ LingerOp *info = reinterpret_cast<LingerOp*>(m->cookie);
+ if (linger_ops_set.count(info) == 0) {
+ ldout(cct, 7) << __func__ << " cookie " << m->cookie << " dne" << dendl;
+ return;
+ }
+ std::unique_lock wl(info->watch_lock);
+ if (m->opcode == CEPH_WATCH_EVENT_DISCONNECT) {
+ if (!info->last_error) {
+ info->last_error = bs::error_code(ENOTCONN, osd_category());
+ if (info->handle) {
+ boost::asio::defer(finish_strand, CB_DoWatchError(this, info,
+ info->last_error));
+ }
+ }
+ } else if (!info->is_watch) {
+ // we have CEPH_WATCH_EVENT_NOTIFY_COMPLETE; we can do this inline
+ // since we know the only user (librados) is safe to call in
+ // fast-dispatch context
+ if (info->notify_id &&
+ info->notify_id != m->notify_id) {
+ ldout(cct, 10) << __func__ << " reply notify " << m->notify_id
+ << " != " << info->notify_id << ", ignoring" << dendl;
+ } else if (info->on_notify_finish) {
+ info->on_notify_finish->defer(
+ std::move(info->on_notify_finish),
+ osdcode(m->return_code), std::move(m->get_data()));
+
+ // if we race with reconnect we might get a second notify; only
+ // notify the caller once!
+ info->on_notify_finish = nullptr;
+ }
+ } else {
+ boost::asio::defer(finish_strand, CB_DoWatchNotify(this, info, m));
+ }
+}
+
+void Objecter::_do_watch_notify(boost::intrusive_ptr<LingerOp> info,
+ boost::intrusive_ptr<MWatchNotify> m)
+{
+ ldout(cct, 10) << __func__ << " " << *m << dendl;
+
+ shared_lock l(rwlock);
+ ceph_assert(initialized);
+
+ if (info->canceled) {
+ l.unlock();
+ goto out;
+ }
+
+ // notify completion?
+ ceph_assert(info->is_watch);
+ ceph_assert(info->handle);
+ ceph_assert(m->opcode != CEPH_WATCH_EVENT_DISCONNECT);
+
+ l.unlock();
+
+ switch (m->opcode) {
+ case CEPH_WATCH_EVENT_NOTIFY:
+ info->handle({}, m->notify_id, m->cookie, m->notifier_gid, std::move(m->bl));
+ break;
+ }
+
+ out:
+ info->finished_async();
+}
+
+bool Objecter::ms_dispatch(Message *m)
+{
+ ldout(cct, 10) << __func__ << " " << cct << " " << *m << dendl;
+ switch (m->get_type()) {
+ // these we exclusively handle
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_osd_op_reply(static_cast<MOSDOpReply*>(m));
+ return true;
+
+ case CEPH_MSG_OSD_BACKOFF:
+ handle_osd_backoff(static_cast<MOSDBackoff*>(m));
+ return true;
+
+ case CEPH_MSG_WATCH_NOTIFY:
+ handle_watch_notify(static_cast<MWatchNotify*>(m));
+ m->put();
+ return true;
+
+ case MSG_COMMAND_REPLY:
+ if (m->get_source().type() == CEPH_ENTITY_TYPE_OSD) {
+ handle_command_reply(static_cast<MCommandReply*>(m));
+ return true;
+ } else {
+ return false;
+ }
+
+ case MSG_GETPOOLSTATSREPLY:
+ handle_get_pool_stats_reply(static_cast<MGetPoolStatsReply*>(m));
+ return true;
+
+ case CEPH_MSG_POOLOP_REPLY:
+ handle_pool_op_reply(static_cast<MPoolOpReply*>(m));
+ return true;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_fs_stats_reply(static_cast<MStatfsReply*>(m));
+ return true;
+
+ // these we give others a chance to inspect
+
+ // MDS, OSD
+ case CEPH_MSG_OSD_MAP:
+ handle_osd_map(static_cast<MOSDMap*>(m));
+ return false;
+ }
+ return false;
+}
+
+void Objecter::_scan_requests(
+ OSDSession *s,
+ bool skipped_map,
+ bool cluster_full,
+ map<int64_t, bool> *pool_full_map,
+ map<ceph_tid_t, Op*>& need_resend,
+ list<LingerOp*>& need_resend_linger,
+ map<ceph_tid_t, CommandOp*>& need_resend_command,
+ ceph::shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);
+
+ list<LingerOp*> unregister_lingers;
+
+ std::unique_lock sl(s->lock);
+
+ // check for changed linger mappings (_before_ regular ops)
+ auto lp = s->linger_ops.begin();
+ while (lp != s->linger_ops.end()) {
+ auto op = lp->second;
+ ceph_assert(op->session == s);
+ // check_linger_pool_dne() may touch linger_ops; prevent iterator
+ // invalidation
+ ++lp;
+ ldout(cct, 10) << " checking linger op " << op->linger_id << dendl;
+ bool unregister, force_resend_writes = cluster_full;
+ int r = _recalc_linger_op_target(op, sul);
+ if (pool_full_map)
+ force_resend_writes = force_resend_writes ||
+ (*pool_full_map)[op->target.base_oloc.pool];
+ switch (r) {
+ case RECALC_OP_TARGET_NO_ACTION:
+ if (!skipped_map && !force_resend_writes)
+ break;
+ // -- fall-thru --
+ case RECALC_OP_TARGET_NEED_RESEND:
+ need_resend_linger.push_back(op);
+ _linger_cancel_map_check(op);
+ break;
+ case RECALC_OP_TARGET_POOL_DNE:
+ _check_linger_pool_dne(op, &unregister);
+ if (unregister) {
+ ldout(cct, 10) << " need to unregister linger op "
+ << op->linger_id << dendl;
+ op->get();
+ unregister_lingers.push_back(op);
+ }
+ break;
+ }
+ }
+
+ // check for changed request mappings
+ auto p = s->ops.begin();
+ while (p != s->ops.end()) {
+ Op *op = p->second;
+ ++p; // check_op_pool_dne() may touch ops; prevent iterator invalidation
+ ldout(cct, 10) << " checking op " << op->tid << dendl;
+ _prune_snapc(osdmap->get_new_removed_snaps(), op);
+ bool force_resend_writes = cluster_full;
+ if (pool_full_map)
+ force_resend_writes = force_resend_writes ||
+ (*pool_full_map)[op->target.base_oloc.pool];
+ int r = _calc_target(&op->target,
+ op->session ? op->session->con.get() : nullptr);
+ switch (r) {
+ case RECALC_OP_TARGET_NO_ACTION:
+ if (!skipped_map && !(force_resend_writes && op->target.respects_full()))
+ break;
+ // -- fall-thru --
+ case RECALC_OP_TARGET_NEED_RESEND:
+ _session_op_remove(op->session, op);
+ need_resend[op->tid] = op;
+ _op_cancel_map_check(op);
+ break;
+ case RECALC_OP_TARGET_POOL_DNE:
+ _check_op_pool_dne(op, &sl);
+ break;
+ }
+ }
+
+ // commands
+ auto cp = s->command_ops.begin();
+ while (cp != s->command_ops.end()) {
+ auto c = cp->second;
+ ++cp;
+ ldout(cct, 10) << " checking command " << c->tid << dendl;
+ bool force_resend_writes = cluster_full;
+ if (pool_full_map)
+ force_resend_writes = force_resend_writes ||
+ (*pool_full_map)[c->target_pg.pool()];
+ int r = _calc_command_target(c, sul);
+ switch (r) {
+ case RECALC_OP_TARGET_NO_ACTION:
+ // resend if skipped map; otherwise do nothing.
+ if (!skipped_map && !force_resend_writes)
+ break;
+ // -- fall-thru --
+ case RECALC_OP_TARGET_NEED_RESEND:
+ need_resend_command[c->tid] = c;
+ _session_command_op_remove(c->session, c);
+ _command_cancel_map_check(c);
+ break;
+ case RECALC_OP_TARGET_POOL_DNE:
+ case RECALC_OP_TARGET_OSD_DNE:
+ case RECALC_OP_TARGET_OSD_DOWN:
+ _check_command_map_dne(c);
+ break;
+ }
+ }
+
+ sl.unlock();
+
+ for (auto iter = unregister_lingers.begin();
+ iter != unregister_lingers.end();
+ ++iter) {
+ _linger_cancel(*iter);
+ (*iter)->put();
+ }
+}
+
+void Objecter::handle_osd_map(MOSDMap *m)
+{
+ ceph::shunique_lock sul(rwlock, acquire_unique);
+ if (!initialized)
+ return;
+
+ ceph_assert(osdmap);
+
+ if (m->fsid != monc->get_fsid()) {
+ ldout(cct, 0) << "handle_osd_map fsid " << m->fsid
+ << " != " << monc->get_fsid() << dendl;
+ return;
+ }
+
+ bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
+ bool cluster_full = _osdmap_full_flag();
+ bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full ||
+ _osdmap_has_pool_full();
+ map<int64_t, bool> pool_full_map;
+ for (auto it = osdmap->get_pools().begin();
+ it != osdmap->get_pools().end(); ++it)
+ pool_full_map[it->first] = _osdmap_pool_full(it->second);
+
+
+ list<LingerOp*> need_resend_linger;
+ map<ceph_tid_t, Op*> need_resend;
+ map<ceph_tid_t, CommandOp*> need_resend_command;
+
+ if (m->get_last() <= osdmap->get_epoch()) {
+ ldout(cct, 3) << "handle_osd_map ignoring epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "] <= " << osdmap->get_epoch() << dendl;
+ } else {
+ ldout(cct, 3) << "handle_osd_map got epochs ["
+ << m->get_first() << "," << m->get_last()
+ << "] > " << osdmap->get_epoch() << dendl;
+
+ if (osdmap->get_epoch()) {
+ bool skipped_map = false;
+ // we want incrementals
+ for (epoch_t e = osdmap->get_epoch() + 1;
+ e <= m->get_last();
+ e++) {
+
+ if (osdmap->get_epoch() == e-1 &&
+ m->incremental_maps.count(e)) {
+ ldout(cct, 3) << "handle_osd_map decoding incremental epoch " << e
+ << dendl;
+ OSDMap::Incremental inc(m->incremental_maps[e]);
+ osdmap->apply_incremental(inc);
+
+ emit_blocklist_events(inc);
+
+ logger->inc(l_osdc_map_inc);
+ }
+ else if (m->maps.count(e)) {
+ ldout(cct, 3) << "handle_osd_map decoding full epoch " << e << dendl;
+ auto new_osdmap = std::make_unique<OSDMap>();
+ new_osdmap->decode(m->maps[e]);
+
+ emit_blocklist_events(*osdmap, *new_osdmap);
+ osdmap = std::move(new_osdmap);
+
+ logger->inc(l_osdc_map_full);
+ }
+ else {
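+ // no incremental or full map for this epoch: request the missing maps if
+ // the mon should still have them, otherwise jump ahead to the oldest
+ // available epoch and remember that we skipped some maps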
+ if (e >= m->get_oldest()) {
+ ldout(cct, 3) << "handle_osd_map requesting missing epoch "
+ << osdmap->get_epoch()+1 << dendl;
+ _maybe_request_map();
+ break;
+ }
+ ldout(cct, 3) << "handle_osd_map missing epoch "
+ << osdmap->get_epoch()+1
+ << ", jumping to " << m->get_oldest() << dendl;
+ e = m->get_oldest() - 1;
+ skipped_map = true;
+ continue;
+ }
+ logger->set(l_osdc_map_epoch, osdmap->get_epoch());
+
+ prune_pg_mapping(osdmap->get_pools());
+ cluster_full = cluster_full || _osdmap_full_flag();
+ update_pool_full_map(pool_full_map);
+
+ // check all outstanding requests on every epoch
+ for (auto& i : need_resend) {
+ _prune_snapc(osdmap->get_new_removed_snaps(), i.second);
+ }
+ _scan_requests(homeless_session, skipped_map, cluster_full,
+ &pool_full_map, need_resend,
+ need_resend_linger, need_resend_command, sul);
+ for (auto p = osd_sessions.begin();
+ p != osd_sessions.end(); ) {
+ auto s = p->second;
+ _scan_requests(s, skipped_map, cluster_full,
+ &pool_full_map, need_resend,
+ need_resend_linger, need_resend_command, sul);
+ ++p;
+ // osd down or addr change?
+ if (!osdmap->is_up(s->osd) ||
+ (s->con &&
+ s->con->get_peer_addrs() != osdmap->get_addrs(s->osd))) {
+ close_session(s);
+ }
+ }
+
+ ceph_assert(e == osdmap->get_epoch());
+ }
+
+ } else {
+ // first map. we want the full thing.
+ if (m->maps.count(m->get_last())) {
+ for (auto p = osd_sessions.begin();
+ p != osd_sessions.end(); ++p) {
+ OSDSession *s = p->second;
+ _scan_requests(s, false, false, NULL, need_resend,
+ need_resend_linger, need_resend_command, sul);
+ }
+ ldout(cct, 3) << "handle_osd_map decoding full epoch "
+ << m->get_last() << dendl;
+ osdmap->decode(m->maps[m->get_last()]);
+ prune_pg_mapping(osdmap->get_pools());
+
+ _scan_requests(homeless_session, false, false, NULL,
+ need_resend, need_resend_linger,
+ need_resend_command, sul);
+ } else {
+ ldout(cct, 3) << "handle_osd_map hmm, i want a full map, requesting"
+ << dendl;
+ monc->sub_want("osdmap", 0, CEPH_SUBSCRIBE_ONETIME);
+ monc->renew_subs();
+ }
+ }
+ }
+
+ // make sure need_resend targets reflect latest map
+ for (auto p = need_resend.begin(); p != need_resend.end(); ) {
+ Op *op = p->second;
+ if (op->target.epoch < osdmap->get_epoch()) {
+ ldout(cct, 10) << __func__ << " checking op " << p->first << dendl;
+ int r = _calc_target(&op->target, nullptr);
+ if (r == RECALC_OP_TARGET_POOL_DNE) {
+ p = need_resend.erase(p);
+ _check_op_pool_dne(op, nullptr);
+ } else {
+ ++p;
+ }
+ } else {
+ ++p;
+ }
+ }
+
+ bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
+ bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag()
+ || _osdmap_has_pool_full();
+
+ // was/is paused?
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ osdmap->get_epoch() < epoch_barrier) {
+ _maybe_request_map();
+ }
+
+ // resend requests
+ for (auto p = need_resend.begin();
+ p != need_resend.end(); ++p) {
+ auto op = p->second;
+ auto s = op->session;
+ bool mapped_session = false;
+ if (!s) {
+ int r = _map_session(&op->target, &s, sul);
+ ceph_assert(r == 0);
+ mapped_session = true;
+ } else {
+ get_session(s);
+ }
+ std::unique_lock sl(s->lock);
+ if (mapped_session) {
+ _session_op_assign(s, op);
+ }
+ if (op->should_resend) {
+ if (!op->session->is_homeless() && !op->target.paused) {
+ logger->inc(l_osdc_op_resend);
+ _send_op(op);
+ }
+ } else {
+ _op_cancel_map_check(op);
+ _cancel_linger_op(op);
+ }
+ sl.unlock();
+ put_session(s);
+ }
+ for (auto p = need_resend_linger.begin();
+ p != need_resend_linger.end(); ++p) {
+ LingerOp *op = *p;
+ ceph_assert(op->session);
+ if (!op->session->is_homeless()) {
+ logger->inc(l_osdc_linger_resend);
+ _send_linger(op, sul);
+ }
+ }
+ for (auto p = need_resend_command.begin();
+ p != need_resend_command.end(); ++p) {
+ auto c = p->second;
+ if (c->target.osd >= 0) {
+ _assign_command_session(c, sul);
+ if (c->session && !c->session->is_homeless()) {
+ _send_command(c);
+ }
+ }
+ }
+
+ _dump_active();
+
+ // finish any Contexts that were waiting on a map update
+ auto p = waiting_for_map.begin();
+ while (p != waiting_for_map.end() &&
+ p->first <= osdmap->get_epoch()) {
+ //go through the list and call the onfinish methods
+ for (auto& [c, ec] : p->second) {
+ ca::post(std::move(c), ec);
+ }
+ waiting_for_map.erase(p++);
+ }
+
+ monc->sub_got("osdmap", osdmap->get_epoch());
+
+ if (!waiting_for_map.empty()) {
+ _maybe_request_map();
+ }
+}
+
+void Objecter::enable_blocklist_events()
+{
+ unique_lock wl(rwlock);
+
+ blocklist_events_enabled = true;
+}
+
+void Objecter::consume_blocklist_events(std::set<entity_addr_t> *events)
+{
+ unique_lock wl(rwlock);
+
+ if (events->empty()) {
+ events->swap(blocklist_events);
+ } else {
+ for (const auto &i : blocklist_events) {
+ events->insert(i);
+ }
+ blocklist_events.clear();
+ }
+}
+
+void Objecter::emit_blocklist_events(const OSDMap::Incremental &inc)
+{
+ if (!blocklist_events_enabled) {
+ return;
+ }
+
+ for (const auto &i : inc.new_blocklist) {
+ blocklist_events.insert(i.first);
+ }
+}
+
+void Objecter::emit_blocklist_events(const OSDMap &old_osd_map,
+ const OSDMap &new_osd_map)
+{
+ if (!blocklist_events_enabled) {
+ return;
+ }
+
+ std::set<entity_addr_t> old_set;
+ std::set<entity_addr_t> new_set;
+ std::set<entity_addr_t> old_range_set;
+ std::set<entity_addr_t> new_range_set;
+
+ old_osd_map.get_blocklist(&old_set, &old_range_set);
+ new_osd_map.get_blocklist(&new_set, &new_range_set);
+
+ std::set<entity_addr_t> delta_set;
+ std::set_difference(
+ new_set.begin(), new_set.end(), old_set.begin(), old_set.end(),
+ std::inserter(delta_set, delta_set.begin()));
+ std::set_difference(
+ new_range_set.begin(), new_range_set.end(),
+ old_range_set.begin(), old_range_set.end(),
+ std::inserter(delta_set, delta_set.begin()));
+ blocklist_events.insert(delta_set.begin(), delta_set.end());
+}
+
+// op pool check
+
+void Objecter::CB_Op_Map_Latest::operator()(bs::error_code e,
+ version_t latest, version_t)
+{
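+ // EAGAIN or canceled: ignore; the pending map check stays registered and
+ // will be retried by resend_mon_ops()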
+ if (e == bs::errc::resource_unavailable_try_again ||
+ e == bs::errc::operation_canceled)
+ return;
+
+ lgeneric_subdout(objecter->cct, objecter, 10)
+ << "op_map_latest r=" << e << " tid=" << tid
+ << " latest " << latest << dendl;
+
+ unique_lock wl(objecter->rwlock);
+
+ auto iter = objecter->check_latest_map_ops.find(tid);
+ if (iter == objecter->check_latest_map_ops.end()) {
+ lgeneric_subdout(objecter->cct, objecter, 10)
+ << "op_map_latest op "<< tid << " not found" << dendl;
+ return;
+ }
+
+ Op *op = iter->second;
+ objecter->check_latest_map_ops.erase(iter);
+
+ lgeneric_subdout(objecter->cct, objecter, 20)
+ << "op_map_latest op "<< op << dendl;
+
+ if (op->map_dne_bound == 0)
+ op->map_dne_bound = latest;
+
+ unique_lock sl(op->session->lock, defer_lock);
+ objecter->_check_op_pool_dne(op, &sl);
+
+ op->put();
+}
+
+int Objecter::pool_snap_by_name(int64_t poolid, const char *snap_name,
+ snapid_t *snap) const
+{
+ shared_lock rl(rwlock);
+
+ auto& pools = osdmap->get_pools();
+ auto iter = pools.find(poolid);
+ if (iter == pools.end()) {
+ return -ENOENT;
+ }
+ const pg_pool_t& pg_pool = iter->second;
+ for (auto p = pg_pool.snaps.begin();
+ p != pg_pool.snaps.end();
+ ++p) {
+ if (p->second.name == snap_name) {
+ *snap = p->first;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int Objecter::pool_snap_get_info(int64_t poolid, snapid_t snap,
+ pool_snap_info_t *info) const
+{
+ shared_lock rl(rwlock);
+
+ auto& pools = osdmap->get_pools();
+ auto iter = pools.find(poolid);
+ if (iter == pools.end()) {
+ return -ENOENT;
+ }
+ const pg_pool_t& pg_pool = iter->second;
+ auto p = pg_pool.snaps.find(snap);
+ if (p == pg_pool.snaps.end())
+ return -ENOENT;
+ *info = p->second;
+
+ return 0;
+}
+
+int Objecter::pool_snap_list(int64_t poolid, vector<uint64_t> *snaps)
+{
+ shared_lock rl(rwlock);
+
+ const pg_pool_t *pi = osdmap->get_pg_pool(poolid);
+ if (!pi)
+ return -ENOENT;
+ for (auto p = pi->snaps.begin();
+ p != pi->snaps.end();
+ ++p) {
+ snaps->push_back(p->first);
+ }
+ return 0;
+}
+
+// sl may be unlocked.
+void Objecter::_check_op_pool_dne(Op *op, std::unique_lock<std::shared_mutex> *sl)
+{
+ // rwlock is locked unique
+
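+ // map_dne_bound is the first epoch at which we know the pool does not
+ // exist; once our map has caught up to it the op can be failed with ENOENT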
+ if (op->target.pool_ever_existed) {
+ // the pool previously existed and now it does not, which means it
+ // was deleted.
+ op->map_dne_bound = osdmap->get_epoch();
+ ldout(cct, 10) << "check_op_pool_dne tid " << op->tid
+ << " pool previously exists but now does not"
+ << dendl;
+ } else {
+ ldout(cct, 10) << "check_op_pool_dne tid " << op->tid
+ << " current " << osdmap->get_epoch()
+ << " map_dne_bound " << op->map_dne_bound
+ << dendl;
+ }
+ if (op->map_dne_bound > 0) {
+ if (osdmap->get_epoch() >= op->map_dne_bound) {
+ // we had a new enough map
+ ldout(cct, 10) << "check_op_pool_dne tid " << op->tid
+ << " concluding pool " << op->target.base_pgid.pool()
+ << " dne" << dendl;
+ if (op->has_completion()) {
+ num_in_flight--;
+ op->complete(osdc_errc::pool_dne, -ENOENT);
+ }
+
+ OSDSession *s = op->session;
+ if (s) {
+ ceph_assert(s != NULL);
+ ceph_assert(sl->mutex() == &s->lock);
+ bool session_locked = sl->owns_lock();
+ if (!session_locked) {
+ sl->lock();
+ }
+ _finish_op(op, 0);
+ if (!session_locked) {
+ sl->unlock();
+ }
+ } else {
+ _finish_op(op, 0); // no session
+ }
+ }
+ } else {
+ _send_op_map_check(op);
+ }
+}
+
+void Objecter::_send_op_map_check(Op *op)
+{
+ // rwlock is locked unique
+ // ask the monitor
+ if (check_latest_map_ops.count(op->tid) == 0) {
+ op->get();
+ check_latest_map_ops[op->tid] = op;
+ monc->get_version("osdmap", CB_Op_Map_Latest(this, op->tid));
+ }
+}
+
+void Objecter::_op_cancel_map_check(Op *op)
+{
+ // rwlock is locked unique
+ auto iter = check_latest_map_ops.find(op->tid);
+ if (iter != check_latest_map_ops.end()) {
+ Op *op = iter->second;
+ op->put();
+ check_latest_map_ops.erase(iter);
+ }
+}
+
+// linger pool check
+
+void Objecter::CB_Linger_Map_Latest::operator()(bs::error_code e,
+ version_t latest,
+ version_t)
+{
+ if (e == bs::errc::resource_unavailable_try_again ||
+ e == bs::errc::operation_canceled) {
+ // ignore callback; we will retry in resend_mon_ops()
+ return;
+ }
+
+ unique_lock wl(objecter->rwlock);
+
+ auto iter = objecter->check_latest_map_lingers.find(linger_id);
+ if (iter == objecter->check_latest_map_lingers.end()) {
+ return;
+ }
+
+ auto op = iter->second;
+ objecter->check_latest_map_lingers.erase(iter);
+
+ if (op->map_dne_bound == 0)
+ op->map_dne_bound = latest;
+
+ bool unregister;
+ objecter->_check_linger_pool_dne(op, &unregister);
+
+ if (unregister) {
+ objecter->_linger_cancel(op);
+ }
+
+ op->put();
+}
+
+void Objecter::_check_linger_pool_dne(LingerOp *op, bool *need_unregister)
+{
+ // rwlock is locked unique
+
+ *need_unregister = false;
+
+ if (op->register_gen > 0) {
+ ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id
+ << " pool previously existed but now does not"
+ << dendl;
+ op->map_dne_bound = osdmap->get_epoch();
+ } else {
+ ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id
+ << " current " << osdmap->get_epoch()
+ << " map_dne_bound " << op->map_dne_bound
+ << dendl;
+ }
+ if (op->map_dne_bound > 0) {
+ if (osdmap->get_epoch() >= op->map_dne_bound) {
+ std::unique_lock wl{op->watch_lock};
+ if (op->on_reg_commit) {
+ op->on_reg_commit->defer(std::move(op->on_reg_commit),
+ osdc_errc::pool_dne, cb::list{});
+ op->on_reg_commit = nullptr;
+ }
+ if (op->on_notify_finish) {
+ op->on_notify_finish->defer(std::move(op->on_notify_finish),
+ osdc_errc::pool_dne, cb::list{});
+ op->on_notify_finish = nullptr;
+ }
+ *need_unregister = true;
+ }
+ } else {
+ _send_linger_map_check(op);
+ }
+}
+
+void Objecter::_send_linger_map_check(LingerOp *op)
+{
+ // ask the monitor
+ if (check_latest_map_lingers.count(op->linger_id) == 0) {
+ op->get();
+ check_latest_map_lingers[op->linger_id] = op;
+ monc->get_version("osdmap", CB_Linger_Map_Latest(this, op->linger_id));
+ }
+}
+
+void Objecter::_linger_cancel_map_check(LingerOp *op)
+{
+ // rwlock is locked unique
+
+ auto iter = check_latest_map_lingers.find(op->linger_id);
+ if (iter != check_latest_map_lingers.end()) {
+ LingerOp *op = iter->second;
+ op->put();
+ check_latest_map_lingers.erase(iter);
+ }
+}
+
+// command pool check
+
+void Objecter::CB_Command_Map_Latest::operator()(bs::error_code e,
+ version_t latest, version_t)
+{
+ if (e == bs::errc::resource_unavailable_try_again ||
+ e == bs::errc::operation_canceled) {
+ // ignore callback; we will retry in resend_mon_ops()
+ return;
+ }
+
+ unique_lock wl(objecter->rwlock);
+
+ auto iter = objecter->check_latest_map_commands.find(tid);
+ if (iter == objecter->check_latest_map_commands.end()) {
+ return;
+ }
+
+ auto c = iter->second;
+ objecter->check_latest_map_commands.erase(iter);
+
+ if (c->map_dne_bound == 0)
+ c->map_dne_bound = latest;
+
+ unique_lock sul(c->session->lock);
+ objecter->_check_command_map_dne(c);
+ sul.unlock();
+
+ c->put();
+}
+
+void Objecter::_check_command_map_dne(CommandOp *c)
+{
+ // rwlock is locked unique
+ // session is locked unique
+
+ ldout(cct, 10) << "_check_command_map_dne tid " << c->tid
+ << " current " << osdmap->get_epoch()
+ << " map_dne_bound " << c->map_dne_bound
+ << dendl;
+ if (c->map_dne_bound > 0) {
+ if (osdmap->get_epoch() >= c->map_dne_bound) {
+ _finish_command(c, osdcode(c->map_check_error),
+ std::move(c->map_check_error_str), {});
+ }
+ } else {
+ _send_command_map_check(c);
+ }
+}
+
+void Objecter::_send_command_map_check(CommandOp *c)
+{
+ // rwlock is locked unique
+ // session is locked unique
+
+ // ask the monitor
+ if (check_latest_map_commands.count(c->tid) == 0) {
+ c->get();
+ check_latest_map_commands[c->tid] = c;
+ monc->get_version("osdmap", CB_Command_Map_Latest(this, c->tid));
+ }
+}
+
+void Objecter::_command_cancel_map_check(CommandOp *c)
+{
+ // rwlock is locked unique
+
+ auto iter = check_latest_map_commands.find(c->tid);
+ if (iter != check_latest_map_commands.end()) {
+ auto c = iter->second;
+ c->put();
+ check_latest_map_commands.erase(iter);
+ }
+}
+
+
+/**
+ * Look up OSDSession by OSD id.
+ *
+ * @returns 0 on success, or -EAGAIN if the lock context requires
+ * promotion to write.
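+ *
+ * Only creating a new session mutates osd_sessions and therefore needs the
+ * write lock; looking up an existing session succeeds under the shared lock.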
+ */
+int Objecter::_get_session(int osd, OSDSession **session,
+ shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul && sul.mutex() == &rwlock);
+
+ if (osd < 0) {
+ *session = homeless_session;
+ ldout(cct, 20) << __func__ << " osd=" << osd << " returning homeless"
+ << dendl;
+ return 0;
+ }
+
+ auto p = osd_sessions.find(osd);
+ if (p != osd_sessions.end()) {
+ auto s = p->second;
+ s->get();
+ *session = s;
+ ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " "
+ << s->get_nref() << dendl;
+ return 0;
+ }
+ if (!sul.owns_lock()) {
+ return -EAGAIN;
+ }
+ auto s = new OSDSession(cct, osd);
+ osd_sessions[osd] = s;
+ s->con = messenger->connect_to_osd(osdmap->get_addrs(osd));
+ s->con->set_priv(RefCountedPtr{s});
+ logger->inc(l_osdc_osd_session_open);
+ logger->set(l_osdc_osd_sessions, osd_sessions.size());
+ s->get();
+ *session = s;
+ ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " "
+ << s->get_nref() << dendl;
+ return 0;
+}
+
+void Objecter::put_session(Objecter::OSDSession *s)
+{
+ if (s && !s->is_homeless()) {
+ ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " "
+ << s->get_nref() << dendl;
+ s->put();
+ }
+}
+
+void Objecter::get_session(Objecter::OSDSession *s)
+{
+ ceph_assert(s != NULL);
+
+ if (!s->is_homeless()) {
+ ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " "
+ << s->get_nref() << dendl;
+ s->get();
+ }
+}
+
+void Objecter::_reopen_session(OSDSession *s)
+{
+ // rwlock is locked unique
+ // s->lock is locked
+
+ auto addrs = osdmap->get_addrs(s->osd);
+ ldout(cct, 10) << "reopen_session osd." << s->osd << " session, addr now "
+ << addrs << dendl;
+ if (s->con) {
+ s->con->set_priv(NULL);
+ s->con->mark_down();
+ logger->inc(l_osdc_osd_session_close);
+ }
+ s->con = messenger->connect_to_osd(addrs);
+ s->con->set_priv(RefCountedPtr{s});
+ s->incarnation++;
+ logger->inc(l_osdc_osd_session_open);
+}
+
+void Objecter::close_session(OSDSession *s)
+{
+ // rwlock is locked unique
+
+ ldout(cct, 10) << "close_session for osd." << s->osd << dendl;
+ if (s->con) {
+ s->con->set_priv(NULL);
+ s->con->mark_down();
+ logger->inc(l_osdc_osd_session_close);
+ }
+ unique_lock sl(s->lock);
+
+ std::list<LingerOp*> homeless_lingers;
+ std::list<CommandOp*> homeless_commands;
+ std::list<Op*> homeless_ops;
+
+ while (!s->linger_ops.empty()) {
+ auto i = s->linger_ops.begin();
+ ldout(cct, 10) << " linger_op " << i->first << dendl;
+ homeless_lingers.push_back(i->second);
+ _session_linger_op_remove(s, i->second);
+ }
+
+ while (!s->ops.empty()) {
+ auto i = s->ops.begin();
+ ldout(cct, 10) << " op " << i->first << dendl;
+ homeless_ops.push_back(i->second);
+ _session_op_remove(s, i->second);
+ }
+
+ while (!s->command_ops.empty()) {
+ auto i = s->command_ops.begin();
+ ldout(cct, 10) << " command_op " << i->first << dendl;
+ homeless_commands.push_back(i->second);
+ _session_command_op_remove(s, i->second);
+ }
+
+ osd_sessions.erase(s->osd);
+ sl.unlock();
+ put_session(s);
+
+ // Assign any leftover ops to the homeless session
+ {
+ unique_lock hsl(homeless_session->lock);
+ for (auto i = homeless_lingers.begin();
+ i != homeless_lingers.end(); ++i) {
+ _session_linger_op_assign(homeless_session, *i);
+ }
+ for (auto i = homeless_ops.begin();
+ i != homeless_ops.end(); ++i) {
+ _session_op_assign(homeless_session, *i);
+ }
+ for (auto i = homeless_commands.begin();
+ i != homeless_commands.end(); ++i) {
+ _session_command_op_assign(homeless_session, *i);
+ }
+ }
+
+ logger->set(l_osdc_osd_sessions, osd_sessions.size());
+}
+
+void Objecter::wait_for_osd_map(epoch_t e)
+{
+ unique_lock l(rwlock);
+ if (osdmap->get_epoch() >= e) {
+ l.unlock();
+ return;
+ }
+
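+ // otherwise park a waiter under waiting_for_map and block this thread
+ // until handle_osd_map() completes it for an epoch >= e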
+ ca::waiter<bs::error_code> w;
+ waiting_for_map[e].emplace_back(OpCompletion::create(
+ service.get_executor(),
+ w.ref()),
+ bs::error_code{});
+ l.unlock();
+ w.wait();
+}
+
+void Objecter::_get_latest_version(epoch_t oldest, epoch_t newest,
+ std::unique_ptr<OpCompletion> fin,
+ std::unique_lock<ceph::shared_mutex>&& l)
+{
+ ceph_assert(fin);
+ if (osdmap->get_epoch() >= newest) {
+ ldout(cct, 10) << __func__ << " latest " << newest << ", have it" << dendl;
+ l.unlock();
+ ca::defer(std::move(fin), bs::error_code{});
+ } else {
+ ldout(cct, 10) << __func__ << " latest " << newest << ", waiting" << dendl;
+ _wait_for_new_map(std::move(fin), newest, bs::error_code{});
+ l.unlock();
+ }
+}
+
+void Objecter::maybe_request_map()
+{
+ shared_lock rl(rwlock);
+ _maybe_request_map();
+}
+
+void Objecter::_maybe_request_map()
+{
+ // rwlock is locked
+ int flag = 0;
+ if (_osdmap_full_flag()
+ || osdmap->test_flag(CEPH_OSDMAP_PAUSERD)
+ || osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) {
+ ldout(cct, 10) << "_maybe_request_map subscribing (continuous) to next "
+ "osd map (FULL flag is set)" << dendl;
+ } else {
+ ldout(cct, 10)
+ << "_maybe_request_map subscribing (onetime) to next osd map" << dendl;
+ flag = CEPH_SUBSCRIBE_ONETIME;
+ }
+ epoch_t epoch = osdmap->get_epoch() ? osdmap->get_epoch()+1 : 0;
+ if (monc->sub_want("osdmap", epoch, flag)) {
+ monc->renew_subs();
+ }
+}
+
+void Objecter::_wait_for_new_map(std::unique_ptr<OpCompletion> c, epoch_t epoch,
+ bs::error_code ec)
+{
+ // rwlock is locked unique
+ waiting_for_map[epoch].emplace_back(std::move(c), ec);
+ _maybe_request_map();
+}
+
+
+/**
+ * Use this together with wait_for_map: this is a pre-check to avoid
+ * allocating a Context for wait_for_map if we can see that we
+ * definitely already have the epoch.
+ *
+ * This does *not* replace the need to handle the return value of
+ * wait_for_map: just because we don't have it in this pre-check
+ * doesn't mean we won't have it when calling back into wait_for_map,
+ * since the objecter lock is dropped in between.
+ */
+bool Objecter::have_map(const epoch_t epoch)
+{
+ shared_lock rl(rwlock);
+ if (osdmap->get_epoch() >= epoch) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void Objecter::_kick_requests(OSDSession *session,
+ map<uint64_t, LingerOp *>& lresend)
+{
+ // rwlock is locked unique
+
+ // clear backoffs
+ session->backoffs.clear();
+ session->backoffs_by_id.clear();
+
+ // resend ops
+ map<ceph_tid_t,Op*> resend; // resend in tid order
+ for (auto p = session->ops.begin(); p != session->ops.end();) {
+ Op *op = p->second;
+ ++p;
+ if (op->should_resend) {
+ if (!op->target.paused)
+ resend[op->tid] = op;
+ } else {
+ _op_cancel_map_check(op);
+ _cancel_linger_op(op);
+ }
+ }
+
+ logger->inc(l_osdc_op_resend, resend.size());
+ while (!resend.empty()) {
+ _send_op(resend.begin()->second);
+ resend.erase(resend.begin());
+ }
+
+ // resend lingers
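+ // (only collected into lresend here; the caller resends them afterwards,
+ // see _linger_ops_resend())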
+ logger->inc(l_osdc_linger_resend, session->linger_ops.size());
+ for (auto j = session->linger_ops.begin();
+ j != session->linger_ops.end(); ++j) {
+ LingerOp *op = j->second;
+ op->get();
+ ceph_assert(lresend.count(j->first) == 0);
+ lresend[j->first] = op;
+ }
+
+ // resend commands
+ logger->inc(l_osdc_command_resend, session->command_ops.size());
+ map<uint64_t,CommandOp*> cresend; // resend in order
+ for (auto k = session->command_ops.begin();
+ k != session->command_ops.end(); ++k) {
+ cresend[k->first] = k->second;
+ }
+ while (!cresend.empty()) {
+ _send_command(cresend.begin()->second);
+ cresend.erase(cresend.begin());
+ }
+}
+
+void Objecter::_linger_ops_resend(map<uint64_t, LingerOp *>& lresend,
+ unique_lock<ceph::shared_mutex>& ul)
+{
+ ceph_assert(ul.owns_lock());
+ shunique_lock sul(std::move(ul));
+ while (!lresend.empty()) {
+ LingerOp *op = lresend.begin()->second;
+ if (!op->canceled) {
+ _send_linger(op, sul);
+ }
+ op->put();
+ lresend.erase(lresend.begin());
+ }
+ ul = sul.release_to_unique();
+}
+
+void Objecter::start_tick()
+{
+ ceph_assert(tick_event == 0);
+ tick_event =
+ timer.add_event(ceph::make_timespan(cct->_conf->objecter_tick_interval),
+ &Objecter::tick, this);
+}
+
+void Objecter::tick()
+{
+ shared_lock rl(rwlock);
+
+ ldout(cct, 10) << "tick" << dendl;
+
+ // we are only called by the tick timer event
+ tick_event = 0;
+
+ if (!initialized) {
+ // we raced with shutdown
+ ldout(cct, 10) << __func__ << " raced with shutdown" << dendl;
+ return;
+ }
+
+ set<OSDSession*> toping;
+
+
+ // look for laggy requests
+ auto cutoff = ceph::coarse_mono_clock::now();
+ cutoff -= ceph::make_timespan(cct->_conf->objecter_timeout); // timeout
+
+ unsigned laggy_ops = 0;
+
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ auto s = siter->second;
+ scoped_lock l(s->lock);
+ bool found = false;
+ for (auto p = s->ops.begin(); p != s->ops.end(); ++p) {
+ auto op = p->second;
+ ceph_assert(op->session);
+ if (op->stamp < cutoff) {
+ ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd
+ << " is laggy" << dendl;
+ found = true;
+ ++laggy_ops;
+ }
+ }
+ for (auto p = s->linger_ops.begin();
+ p != s->linger_ops.end();
+ ++p) {
+ auto op = p->second;
+ std::unique_lock wl(op->watch_lock);
+ ceph_assert(op->session);
+ ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first
+ << " (osd." << op->session->osd << ")" << dendl;
+ found = true;
+ if (op->is_watch && op->registered && !op->last_error)
+ _send_linger_ping(op);
+ }
+ for (auto p = s->command_ops.begin();
+ p != s->command_ops.end();
+ ++p) {
+ auto op = p->second;
+ ceph_assert(op->session);
+ ldout(cct, 10) << " pinging osd that serves command tid " << p->first
+ << " (osd." << op->session->osd << ")" << dendl;
+ found = true;
+ }
+ if (found)
+ toping.insert(s);
+ }
+ if (num_homeless_ops || !toping.empty()) {
+ _maybe_request_map();
+ }
+
+ logger->set(l_osdc_op_laggy, laggy_ops);
+ logger->set(l_osdc_osd_laggy, toping.size());
+
+ if (!toping.empty()) {
+ // send a ping to these osds, to ensure we detect any session resets
+ // (osd reply message policy is lossy)
+ for (auto i = toping.begin(); i != toping.end(); ++i) {
+ (*i)->con->send_message(new MPing);
+ }
+ }
+
+ // Make sure we don't reschedule if we wake up after shutdown
+ if (initialized) {
+ tick_event = timer.reschedule_me(ceph::make_timespan(
+ cct->_conf->objecter_tick_interval));
+ }
+}
+
+void Objecter::resend_mon_ops()
+{
+ unique_lock wl(rwlock);
+
+ ldout(cct, 10) << "resend_mon_ops" << dendl;
+
+ for (auto p = poolstat_ops.begin(); p != poolstat_ops.end(); ++p) {
+ _poolstat_submit(p->second);
+ logger->inc(l_osdc_poolstat_resend);
+ }
+
+ for (auto p = statfs_ops.begin(); p != statfs_ops.end(); ++p) {
+ _fs_stats_submit(p->second);
+ logger->inc(l_osdc_statfs_resend);
+ }
+
+ for (auto p = pool_ops.begin(); p != pool_ops.end(); ++p) {
+ _pool_op_submit(p->second);
+ logger->inc(l_osdc_poolop_resend);
+ }
+
+ for (auto p = check_latest_map_ops.begin();
+ p != check_latest_map_ops.end();
+ ++p) {
+ monc->get_version("osdmap", CB_Op_Map_Latest(this, p->second->tid));
+ }
+
+ for (auto p = check_latest_map_lingers.begin();
+ p != check_latest_map_lingers.end();
+ ++p) {
+ monc->get_version("osdmap", CB_Linger_Map_Latest(this, p->second->linger_id));
+ }
+
+ for (auto p = check_latest_map_commands.begin();
+ p != check_latest_map_commands.end();
+ ++p) {
+ monc->get_version("osdmap", CB_Command_Map_Latest(this, p->second->tid));
+ }
+}
+
+// read | write ---------------------------
+
+void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget)
+{
+ shunique_lock rl(rwlock, ceph::acquire_shared);
+ ceph_tid_t tid = 0;
+ if (!ptid)
+ ptid = &tid;
+ op->trace.event("op submit");
+ _op_submit_with_budget(op, rl, ptid, ctx_budget);
+}
+
+void Objecter::_op_submit_with_budget(Op *op,
+ shunique_lock<ceph::shared_mutex>& sul,
+ ceph_tid_t *ptid,
+ int *ctx_budget)
+{
+ ceph_assert(initialized);
+
+ ceph_assert(op->ops.size() == op->out_bl.size());
+ ceph_assert(op->ops.size() == op->out_rval.size());
+ ceph_assert(op->ops.size() == op->out_handler.size());
+
+ // throttle. before we look at any state, because
+ // _take_op_budget() may drop our lock while it blocks.
+ if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {
+ int op_budget = _take_op_budget(op, sul);
+ // take and pass out the budget for the first OP
+ // in the context session
+ if (ctx_budget && (*ctx_budget == -1)) {
+ *ctx_budget = op_budget;
+ }
+ }
+
+ if (osd_timeout > timespan(0)) {
+ if (op->tid == 0)
+ op->tid = ++last_tid;
+ auto tid = op->tid;
+ op->ontimeout = timer.add_event(osd_timeout,
+ [this, tid]() {
+ op_cancel(tid, -ETIMEDOUT); });
+ }
+
+ _op_submit(op, sul, ptid);
+}
+
+void Objecter::_send_op_account(Op *op)
+{
+ inflight_ops++;
+
+ // add to gather set(s)
+ if (op->has_completion()) {
+ num_in_flight++;
+ } else {
+ ldout(cct, 20) << " note: not requesting reply" << dendl;
+ }
+
+ logger->inc(l_osdc_op_active);
+ logger->inc(l_osdc_op);
+ logger->inc(l_osdc_oplen_avg, op->ops.size());
+
+ if ((op->target.flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)) ==
+ (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE))
+ logger->inc(l_osdc_op_rmw);
+ else if (op->target.flags & CEPH_OSD_FLAG_WRITE)
+ logger->inc(l_osdc_op_w);
+ else if (op->target.flags & CEPH_OSD_FLAG_READ)
+ logger->inc(l_osdc_op_r);
+
+ if (op->target.flags & CEPH_OSD_FLAG_PGOP)
+ logger->inc(l_osdc_op_pg);
+
+ for (auto p = op->ops.begin(); p != op->ops.end(); ++p) {
+ int code = l_osdc_osdop_other;
+ switch (p->op.op) {
+ case CEPH_OSD_OP_STAT: code = l_osdc_osdop_stat; break;
+ case CEPH_OSD_OP_CREATE: code = l_osdc_osdop_create; break;
+ case CEPH_OSD_OP_READ: code = l_osdc_osdop_read; break;
+ case CEPH_OSD_OP_WRITE: code = l_osdc_osdop_write; break;
+ case CEPH_OSD_OP_WRITEFULL: code = l_osdc_osdop_writefull; break;
+ case CEPH_OSD_OP_WRITESAME: code = l_osdc_osdop_writesame; break;
+ case CEPH_OSD_OP_APPEND: code = l_osdc_osdop_append; break;
+ case CEPH_OSD_OP_ZERO: code = l_osdc_osdop_zero; break;
+ case CEPH_OSD_OP_TRUNCATE: code = l_osdc_osdop_truncate; break;
+ case CEPH_OSD_OP_DELETE: code = l_osdc_osdop_delete; break;
+ case CEPH_OSD_OP_MAPEXT: code = l_osdc_osdop_mapext; break;
+ case CEPH_OSD_OP_SPARSE_READ: code = l_osdc_osdop_sparse_read; break;
+ case CEPH_OSD_OP_GETXATTR: code = l_osdc_osdop_getxattr; break;
+ case CEPH_OSD_OP_SETXATTR: code = l_osdc_osdop_setxattr; break;
+ case CEPH_OSD_OP_CMPXATTR: code = l_osdc_osdop_cmpxattr; break;
+ case CEPH_OSD_OP_RMXATTR: code = l_osdc_osdop_rmxattr; break;
+ case CEPH_OSD_OP_RESETXATTRS: code = l_osdc_osdop_resetxattrs; break;
+
+ // OMAP read operations
+ case CEPH_OSD_OP_OMAPGETVALS:
+ case CEPH_OSD_OP_OMAPGETKEYS:
+ case CEPH_OSD_OP_OMAPGETHEADER:
+ case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
+ case CEPH_OSD_OP_OMAP_CMP: code = l_osdc_osdop_omap_rd; break;
+
+ // OMAP write operations
+ case CEPH_OSD_OP_OMAPSETVALS:
+ case CEPH_OSD_OP_OMAPSETHEADER: code = l_osdc_osdop_omap_wr; break;
+
+ // OMAP del operations
+ case CEPH_OSD_OP_OMAPCLEAR:
+ case CEPH_OSD_OP_OMAPRMKEYS: code = l_osdc_osdop_omap_del; break;
+
+ case CEPH_OSD_OP_CALL: code = l_osdc_osdop_call; break;
+ case CEPH_OSD_OP_WATCH: code = l_osdc_osdop_watch; break;
+ case CEPH_OSD_OP_NOTIFY: code = l_osdc_osdop_notify; break;
+ }
+ if (code)
+ logger->inc(code);
+ }
+}
+
+void Objecter::_op_submit(Op *op, shunique_lock<ceph::shared_mutex>& sul, ceph_tid_t *ptid)
+{
+ // rwlock is locked
+
+ ldout(cct, 10) << __func__ << " op " << op << dendl;
+
+ // pick target
+ ceph_assert(op->session == NULL);
+ OSDSession *s = NULL;
+
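+ // A POOL_DNE result may just mean our osdmap is stale; note it so we
+ // can ask the monitors for a newer map once the op is queued.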
+ bool check_for_latest_map = _calc_target(&op->target, nullptr)
+ == RECALC_OP_TARGET_POOL_DNE;
+
+ // Try to get a session, including a retry if we need to take write lock
+ int r = _get_session(op->target.osd, &s, sul);
+ if (r == -EAGAIN ||
+ (check_for_latest_map && sul.owns_lock_shared()) ||
+ cct->_conf->objecter_debug_inject_relock_delay) {
+ epoch_t orig_epoch = osdmap->get_epoch();
+ sul.unlock();
+ if (cct->_conf->objecter_debug_inject_relock_delay) {
+ sleep(1);
+ }
+ sul.lock();
+ if (orig_epoch != osdmap->get_epoch()) {
+ // map changed; recalculate mapping
+ ldout(cct, 10) << __func__ << " relock raced with osdmap, recalc target"
+ << dendl;
+ check_for_latest_map = _calc_target(&op->target, nullptr)
+ == RECALC_OP_TARGET_POOL_DNE;
+ if (s) {
+ put_session(s);
+ s = NULL;
+ r = -EAGAIN;
+ }
+ }
+ }
+ if (r == -EAGAIN) {
+ ceph_assert(s == NULL);
+ r = _get_session(op->target.osd, &s, sul);
+ }
+ ceph_assert(r == 0);
+ ceph_assert(s); // may be homeless
+
+ _send_op_account(op);
+
+ // send?
+
+ ceph_assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));
+
+ bool need_send = false;
+ if (op->target.paused) {
+ ldout(cct, 10) << " tid " << op->tid << " op " << op << " is paused"
+ << dendl;
+ _maybe_request_map();
+ } else if (!s->is_homeless()) {
+ need_send = true;
+ } else {
+ _maybe_request_map();
+ }
+
+ unique_lock sl(s->lock);
+ if (op->tid == 0)
+ op->tid = ++last_tid;
+
+ ldout(cct, 10) << "_op_submit oid " << op->target.base_oid
+ << " '" << op->target.base_oloc << "' '"
+ << op->target.target_oloc << "' " << op->ops << " tid "
+ << op->tid << " osd." << (!s->is_homeless() ? s->osd : -1)
+ << dendl;
+
+ _session_op_assign(s, op);
+
+ if (need_send) {
+ _send_op(op);
+ }
+
+ // Last chance to touch Op here, after giving up session lock it can
+ // be freed at any time by response handler.
+ ceph_tid_t tid = op->tid;
+ if (check_for_latest_map) {
+ _send_op_map_check(op);
+ }
+ if (ptid)
+ *ptid = tid;
+ op = NULL;
+
+ sl.unlock();
+ put_session(s);
+
+ ldout(cct, 5) << num_in_flight << " in flight" << dendl;
+}
+
+int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r)
+{
+ ceph_assert(initialized);
+
+ unique_lock sl(s->lock);
+
+ auto p = s->ops.find(tid);
+ if (p == s->ops.end()) {
+ ldout(cct, 10) << __func__ << " tid " << tid << " dne in session "
+ << s->osd << dendl;
+ return -ENOENT;
+ }
+
+#if 0
+ if (s->con) {
+ ldout(cct, 20) << " revoking rx ceph::buffer for " << tid
+ << " on " << s->con << dendl;
+ s->con->revoke_rx_buffer(tid);
+ }
+#endif
+
+ ldout(cct, 10) << __func__ << " tid " << tid << " in session " << s->osd
+ << dendl;
+ Op *op = p->second;
+ if (op->has_completion()) {
+ num_in_flight--;
+ op->complete(osdcode(r), r);
+ }
+ _op_cancel_map_check(op);
+ _finish_op(op, r);
+ sl.unlock();
+
+ return 0;
+}
+
+int Objecter::op_cancel(ceph_tid_t tid, int r)
+{
+ int ret = 0;
+
+ unique_lock wl(rwlock);
+ ret = _op_cancel(tid, r);
+
+ return ret;
+}
+
+int Objecter::op_cancel(const vector<ceph_tid_t>& tids, int r)
+{
+ unique_lock wl(rwlock);
+ ldout(cct,10) << __func__ << " " << tids << dendl;
+ for (auto tid : tids) {
+ _op_cancel(tid, r);
+ }
+ return 0;
+}
+
+int Objecter::_op_cancel(ceph_tid_t tid, int r)
+{
+ int ret = 0;
+
+ ldout(cct, 5) << __func__ << ": cancelling tid " << tid << " r=" << r
+ << dendl;
+
+start:
+
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ OSDSession *s = siter->second;
+ shared_lock sl(s->lock);
+ if (s->ops.find(tid) != s->ops.end()) {
+ sl.unlock();
+ ret = op_cancel(s, tid, r);
+ if (ret == -ENOENT) {
+ /* oh no! raced, maybe tid moved to another session, restarting */
+ goto start;
+ }
+ return ret;
+ }
+ }
+
+ ldout(cct, 5) << __func__ << ": tid " << tid
+ << " not found in live sessions" << dendl;
+
+ // Handle case where the op is in homeless session
+ shared_lock sl(homeless_session->lock);
+ if (homeless_session->ops.find(tid) != homeless_session->ops.end()) {
+ sl.unlock();
+ ret = op_cancel(homeless_session, tid, r);
+ if (ret == -ENOENT) {
+ /* oh no! raced, maybe tid moved to another session, restarting */
+ goto start;
+ } else {
+ return ret;
+ }
+ } else {
+ sl.unlock();
+ }
+
+ ldout(cct, 5) << __func__ << ": tid " << tid
+ << " not found in homeless session" << dendl;
+
+ return ret;
+}
+
+
+epoch_t Objecter::op_cancel_writes(int r, int64_t pool)
+{
+ unique_lock wl(rwlock);
+
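+ // Gather the tids of all in-flight writes (optionally restricted to one
+ // pool) per session, then cancel them while still holding rwlock.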
+ std::vector<ceph_tid_t> to_cancel;
+ bool found = false;
+
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ OSDSession *s = siter->second;
+ shared_lock sl(s->lock);
+ for (auto op_i = s->ops.begin();
+ op_i != s->ops.end(); ++op_i) {
+ if (op_i->second->target.flags & CEPH_OSD_FLAG_WRITE
+ && (pool == -1 || op_i->second->target.target_oloc.pool == pool)) {
+ to_cancel.push_back(op_i->first);
+ }
+ }
+ sl.unlock();
+
+ for (auto titer = to_cancel.begin(); titer != to_cancel.end(); ++titer) {
+ int cancel_result = op_cancel(s, *titer, r);
+ // We hold rwlock across search and cancellation, so cancels
+ // should always succeed
+ ceph_assert(cancel_result == 0);
+ }
+ if (!found && to_cancel.size())
+ found = true;
+ to_cancel.clear();
+ }
+
+ const epoch_t epoch = osdmap->get_epoch();
+
+ wl.unlock();
+
+ if (found) {
+ return epoch;
+ } else {
+ return -1;
+ }
+}
+
+bool Objecter::is_pg_changed(
+ int oldprimary,
+ const vector<int>& oldacting,
+ int newprimary,
+ const vector<int>& newacting,
+ bool any_change)
+{
+ if (OSDMap::primary_changed_broken( // https://tracker.ceph.com/issues/43213
+ oldprimary,
+ oldacting,
+ newprimary,
+ newacting))
+ return true;
+ if (any_change && oldacting != newacting)
+ return true;
+ return false; // same primary (though replicas may have changed)
+}
+
+bool Objecter::target_should_be_paused(op_target_t *t)
+{
+ const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
+ bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD);
+ bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) ||
+ (t->respects_full() && (_osdmap_full_flag() || _osdmap_pool_full(*pi)));
+
+ return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (t->flags & CEPH_OSD_FLAG_WRITE && pausewr) ||
+ (osdmap->get_epoch() < epoch_barrier);
+}
+
+/**
+ * Locking public accessor for _osdmap_full_flag
+ */
+bool Objecter::osdmap_full_flag() const
+{
+ shared_lock rl(rwlock);
+
+ return _osdmap_full_flag();
+}
+
+bool Objecter::osdmap_pool_full(const int64_t pool_id) const
+{
+ shared_lock rl(rwlock);
+
+ if (_osdmap_full_flag()) {
+ return true;
+ }
+
+ return _osdmap_pool_full(pool_id);
+}
+
+bool Objecter::_osdmap_pool_full(const int64_t pool_id) const
+{
+ const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
+ if (pool == NULL) {
+ ldout(cct, 4) << __func__ << ": DNE pool " << pool_id << dendl;
+ return false;
+ }
+
+ return _osdmap_pool_full(*pool);
+}
+
+bool Objecter::_osdmap_has_pool_full() const
+{
+ for (auto it = osdmap->get_pools().begin();
+ it != osdmap->get_pools().end(); ++it) {
+ if (_osdmap_pool_full(it->second))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Wrapper around osdmap->test_flag for special handling of the FULL flag.
+ */
+bool Objecter::_osdmap_full_flag() const
+{
+ // Ignore the FULL flag if the caller has not set honor_pool_full
+ return osdmap->test_flag(CEPH_OSDMAP_FULL) && honor_pool_full;
+}
+
+void Objecter::update_pool_full_map(map<int64_t, bool>& pool_full_map)
+{
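+ // Merge this osdmap's per-pool full state into the caller's map; a pool
+ // that was already marked full stays full.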
+ for (map<int64_t, pg_pool_t>::const_iterator it
+ = osdmap->get_pools().begin();
+ it != osdmap->get_pools().end(); ++it) {
+ if (pool_full_map.find(it->first) == pool_full_map.end()) {
+ pool_full_map[it->first] = _osdmap_pool_full(it->second);
+ } else {
+ pool_full_map[it->first] = _osdmap_pool_full(it->second) ||
+ pool_full_map[it->first];
+ }
+ }
+}
+
+int64_t Objecter::get_object_hash_position(int64_t pool, const string& key,
+ const string& ns)
+{
+ shared_lock rl(rwlock);
+ const pg_pool_t *p = osdmap->get_pg_pool(pool);
+ if (!p)
+ return -ENOENT;
+ return p->hash_key(key, ns);
+}
+
+int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
+ const string& ns)
+{
+ shared_lock rl(rwlock);
+ const pg_pool_t *p = osdmap->get_pg_pool(pool);
+ if (!p)
+ return -ENOENT;
+ return p->raw_hash_to_pg(p->hash_key(key, ns));
+}
+
+void Objecter::_prune_snapc(
+ const mempool::osdmap::map<int64_t,
+ snap_interval_set_t>& new_removed_snaps,
+ Op *op)
+{
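+ // Drop any snap ids from the op's snap context that the new map reports
+ // as removed for this pool.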
+ bool match = false;
+ auto i = new_removed_snaps.find(op->target.base_pgid.pool());
+ if (i != new_removed_snaps.end()) {
+ for (auto s : op->snapc.snaps) {
+ if (i->second.contains(s)) {
+ match = true;
+ break;
+ }
+ }
+ if (match) {
+ vector<snapid_t> new_snaps;
+ for (auto s : op->snapc.snaps) {
+ if (!i->second.contains(s)) {
+ new_snaps.push_back(s);
+ }
+ }
+ op->snapc.snaps.swap(new_snaps);
+ ldout(cct,10) << __func__ << " op " << op->tid << " snapc " << op->snapc
+ << " (was " << new_snaps << ")" << dendl;
+ }
+ }
+}
+
+int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
+{
+ // rwlock is locked
+ bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
+ t->epoch = osdmap->get_epoch();
+ ldout(cct,20) << __func__ << " epoch " << t->epoch
+ << " base " << t->base_oid << " " << t->base_oloc
+ << " precalc_pgid " << (int)t->precalc_pgid
+ << " pgid " << t->base_pgid
+ << (is_read ? " is_read" : "")
+ << (is_write ? " is_write" : "")
+ << dendl;
+
+ const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
+ if (!pi) {
+ t->osd = -1;
+ return RECALC_OP_TARGET_POOL_DNE;
+ }
+ ldout(cct,30) << __func__ << " base pi " << pi
+ << " pg_num " << pi->get_pg_num() << dendl;
+
+ bool force_resend = false;
+ if (osdmap->get_epoch() == pi->last_force_op_resend) {
+ if (t->last_force_resend < pi->last_force_op_resend) {
+ t->last_force_resend = pi->last_force_op_resend;
+ force_resend = true;
+ } else if (t->last_force_resend == 0) {
+ force_resend = true;
+ }
+ }
+
+ // apply tiering
+ t->target_oid = t->base_oid;
+ t->target_oloc = t->base_oloc;
+ if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ if (is_read && pi->has_read_tier())
+ t->target_oloc.pool = pi->read_tier;
+ if (is_write && pi->has_write_tier())
+ t->target_oloc.pool = pi->write_tier;
+ pi = osdmap->get_pg_pool(t->target_oloc.pool);
+ if (!pi) {
+ t->osd = -1;
+ return RECALC_OP_TARGET_POOL_DNE;
+ }
+ }
+
+ pg_t pgid;
+ if (t->precalc_pgid) {
+ ceph_assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY);
+ ceph_assert(t->base_oid.name.empty()); // make sure this is a pg op
+ ceph_assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool());
+ pgid = t->base_pgid;
+ } else {
+ int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc,
+ pgid);
+ if (ret == -ENOENT) {
+ t->osd = -1;
+ return RECALC_OP_TARGET_POOL_DNE;
+ }
+ }
+ ldout(cct,20) << __func__ << " target " << t->target_oid << " "
+ << t->target_oloc << " -> pgid " << pgid << dendl;
+ ldout(cct,30) << __func__ << " target pi " << pi
+ << " pg_num " << pi->get_pg_num() << dendl;
+ t->pool_ever_existed = true;
+
+ int size = pi->size;
+ int min_size = pi->min_size;
+ unsigned pg_num = pi->get_pg_num();
+ unsigned pg_num_mask = pi->get_pg_num_mask();
+ unsigned pg_num_pending = pi->get_pg_num_pending();
+ int up_primary, acting_primary;
+ vector<int> up, acting;
+ ps_t actual_ps = ceph_stable_mod(pgid.ps(), pg_num, pg_num_mask);
+ pg_t actual_pgid(actual_ps, pgid.pool());
+ pg_mapping_t pg_mapping;
+ pg_mapping.epoch = osdmap->get_epoch();
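+ // Consult the cached pg mapping for this epoch first; otherwise compute
+ // up/acting from the osdmap and cache the result.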
+ if (lookup_pg_mapping(actual_pgid, &pg_mapping)) {
+ up = pg_mapping.up;
+ up_primary = pg_mapping.up_primary;
+ acting = pg_mapping.acting;
+ acting_primary = pg_mapping.acting_primary;
+ } else {
+ osdmap->pg_to_up_acting_osds(actual_pgid, &up, &up_primary,
+ &acting, &acting_primary);
+ pg_mapping_t pg_mapping(osdmap->get_epoch(),
+ up, up_primary, acting, acting_primary);
+ update_pg_mapping(actual_pgid, std::move(pg_mapping));
+ }
+ bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
+ bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
+ unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
+ pg_t prev_pgid(prev_seed, pgid.pool());
+ if (any_change && PastIntervals::is_new_interval(
+ t->acting_primary,
+ acting_primary,
+ t->acting,
+ acting,
+ t->up_primary,
+ up_primary,
+ t->up,
+ up,
+ t->size,
+ size,
+ t->min_size,
+ min_size,
+ t->pg_num,
+ pg_num,
+ t->pg_num_pending,
+ pg_num_pending,
+ t->sort_bitwise,
+ sort_bitwise,
+ t->recovery_deletes,
+ recovery_deletes,
+ t->peering_crush_bucket_count,
+ pi->peering_crush_bucket_count,
+ t->peering_crush_bucket_target,
+ pi->peering_crush_bucket_target,
+ t->peering_crush_bucket_barrier,
+ pi->peering_crush_bucket_barrier,
+ t->peering_crush_mandatory_member,
+ pi->peering_crush_mandatory_member,
+ prev_pgid)) {
+ force_resend = true;
+ }
+
+ bool unpaused = false;
+ bool should_be_paused = target_should_be_paused(t);
+ if (t->paused && !should_be_paused) {
+ unpaused = true;
+ }
+ if (t->paused != should_be_paused) {
+ ldout(cct, 10) << __func__ << " paused " << t->paused
+ << " -> " << should_be_paused << dendl;
+ t->paused = should_be_paused;
+ }
+
+ bool legacy_change =
+ t->pgid != pgid ||
+ is_pg_changed(
+ t->acting_primary, t->acting, acting_primary, acting,
+ t->used_replica || any_change);
+ bool split_or_merge = false;
+ if (t->pg_num) {
+ split_or_merge =
+ prev_pgid.is_split(t->pg_num, pg_num, nullptr) ||
+ prev_pgid.is_merge_source(t->pg_num, pg_num, nullptr) ||
+ prev_pgid.is_merge_target(t->pg_num, pg_num);
+ }
+
+ if (legacy_change || split_or_merge || force_resend) {
+ t->pgid = pgid;
+ t->acting = acting;
+ t->acting_primary = acting_primary;
+ t->up_primary = up_primary;
+ t->up = up;
+ t->size = size;
+ t->min_size = min_size;
+ t->pg_num = pg_num;
+ t->pg_num_mask = pg_num_mask;
+ t->pg_num_pending = pg_num_pending;
+ spg_t spgid(actual_pgid);
+ if (pi->is_erasure()) {
+ for (uint8_t i = 0; i < acting.size(); ++i) {
+ if (acting[i] == acting_primary) {
+ spgid.reset_shard(shard_id_t(i));
+ break;
+ }
+ }
+ }
+ t->actual_pgid = spgid;
+ t->sort_bitwise = sort_bitwise;
+ t->recovery_deletes = recovery_deletes;
+ t->peering_crush_bucket_count = pi->peering_crush_bucket_count;
+ t->peering_crush_bucket_target = pi->peering_crush_bucket_target;
+ t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier;
+ t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member;
+ ldout(cct, 10) << __func__ << " "
+ << " raw pgid " << pgid << " -> actual " << t->actual_pgid
+ << " acting " << acting
+ << " primary " << acting_primary << dendl;
+ t->used_replica = false;
+ if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_write && pi->is_replicated() && acting.size() > 1) {
+ int osd;
+ ceph_assert(is_read && acting[0] == acting_primary);
+ if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+ int p = rand() % acting.size();
+ if (p)
+ t->used_replica = true;
+ osd = acting[p];
+ ldout(cct, 10) << " chose random osd." << osd << " of " << acting
+ << dendl;
+ } else {
+ // look for a local replica. prefer the primary if the
+ // distance is the same.
+ int best = -1;
+ int best_locality = 0;
+ for (unsigned i = 0; i < acting.size(); ++i) {
+ int locality = osdmap->crush->get_common_ancestor_distance(
+ cct, acting[i], crush_location);
+ ldout(cct, 20) << __func__ << " localize: rank " << i
+ << " osd." << acting[i]
+ << " locality " << locality << dendl;
+ if (i == 0 ||
+ (locality >= 0 && best_locality >= 0 &&
+ locality < best_locality) ||
+ (best_locality < 0 && locality >= 0)) {
+ best = i;
+ best_locality = locality;
+ if (i)
+ t->used_replica = true;
+ }
+ }
+ ceph_assert(best >= 0);
+ osd = acting[best];
+ }
+ t->osd = osd;
+ } else {
+ t->osd = acting_primary;
+ }
+ }
+ if (legacy_change || unpaused || force_resend) {
+ return RECALC_OP_TARGET_NEED_RESEND;
+ }
+ if (split_or_merge &&
+ (osdmap->require_osd_release >= ceph_release_t::luminous ||
+ HAVE_FEATURE(osdmap->get_xinfo(acting_primary).features,
+ RESEND_ON_SPLIT))) {
+ return RECALC_OP_TARGET_NEED_RESEND;
+ }
+ return RECALC_OP_TARGET_NO_ACTION;
+}
+
+int Objecter::_map_session(op_target_t *target, OSDSession **s,
+ shunique_lock<ceph::shared_mutex>& sul)
+{
+ _calc_target(target, nullptr);
+ return _get_session(target->osd, s, sul);
+}
+
+void Objecter::_session_op_assign(OSDSession *to, Op *op)
+{
+ // to->lock is locked
+ ceph_assert(op->session == NULL);
+ ceph_assert(op->tid);
+
+ get_session(to);
+ op->session = to;
+ to->ops[op->tid] = op;
+
+ if (to->is_homeless()) {
+ num_homeless_ops++;
+ }
+
+ ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl;
+}
+
+void Objecter::_session_op_remove(OSDSession *from, Op *op)
+{
+ ceph_assert(op->session == from);
+ // from->lock is locked
+
+ if (from->is_homeless()) {
+ num_homeless_ops--;
+ }
+
+ from->ops.erase(op->tid);
+ put_session(from);
+ op->session = NULL;
+
+ ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl;
+}
+
+void Objecter::_session_linger_op_assign(OSDSession *to, LingerOp *op)
+{
+ // to lock is locked unique
+ ceph_assert(op->session == NULL);
+
+ if (to->is_homeless()) {
+ num_homeless_ops++;
+ }
+
+ get_session(to);
+ op->session = to;
+ to->linger_ops[op->linger_id] = op;
+
+ ldout(cct, 15) << __func__ << " " << to->osd << " " << op->linger_id
+ << dendl;
+}
+
+void Objecter::_session_linger_op_remove(OSDSession *from, LingerOp *op)
+{
+ ceph_assert(from == op->session);
+ // from->lock is locked unique
+
+ if (from->is_homeless()) {
+ num_homeless_ops--;
+ }
+
+ from->linger_ops.erase(op->linger_id);
+ put_session(from);
+ op->session = NULL;
+
+ ldout(cct, 15) << __func__ << " " << from->osd << " " << op->linger_id
+ << dendl;
+}
+
+void Objecter::_session_command_op_remove(OSDSession *from, CommandOp *op)
+{
+ ceph_assert(from == op->session);
+ // from->lock is locked
+
+ if (from->is_homeless()) {
+ num_homeless_ops--;
+ }
+
+ from->command_ops.erase(op->tid);
+ put_session(from);
+ op->session = NULL;
+
+ ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl;
+}
+
+void Objecter::_session_command_op_assign(OSDSession *to, CommandOp *op)
+{
+ // to->lock is locked
+ ceph_assert(op->session == NULL);
+ ceph_assert(op->tid);
+
+ if (to->is_homeless()) {
+ num_homeless_ops++;
+ }
+
+ get_session(to);
+ op->session = to;
+ to->command_ops[op->tid] = op;
+
+ ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl;
+}
+
+int Objecter::_recalc_linger_op_target(LingerOp *linger_op,
+ shunique_lock<ceph::shared_mutex>& sul)
+{
+ // rwlock is locked unique
+
+ int r = _calc_target(&linger_op->target, nullptr, true);
+ if (r == RECALC_OP_TARGET_NEED_RESEND) {
+ ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
+ << " pgid " << linger_op->target.pgid
+ << " acting " << linger_op->target.acting << dendl;
+
+ OSDSession *s = NULL;
+ r = _get_session(linger_op->target.osd, &s, sul);
+ ceph_assert(r == 0);
+
+ if (linger_op->session != s) {
+ // NB locking two sessions (s and linger_op->session) at the
+ // same time here is only safe because we are the only one that
+ // takes two, and we are holding rwlock for write. We use
+ // std::shared_mutex in OSDSession because lockdep doesn't know
+ // that.
+ unique_lock sl(s->lock);
+ _session_linger_op_remove(linger_op->session, linger_op);
+ _session_linger_op_assign(s, linger_op);
+ }
+
+ put_session(s);
+ return RECALC_OP_TARGET_NEED_RESEND;
+ }
+ return r;
+}
+
+void Objecter::_cancel_linger_op(Op *op)
+{
+ ldout(cct, 15) << "cancel_op " << op->tid << dendl;
+
+ ceph_assert(!op->should_resend);
+ if (op->has_completion()) {
+ op->onfinish = nullptr;
+ num_in_flight--;
+ }
+
+ _finish_op(op, 0);
+}
+
+void Objecter::_finish_op(Op *op, int r)
+{
+ ldout(cct, 15) << __func__ << " " << op->tid << dendl;
+
+ // op->session->lock is locked unique or op->session is null
+
+ if (!op->ctx_budgeted && op->budget >= 0) {
+ put_op_budget_bytes(op->budget);
+ op->budget = -1;
+ }
+
+ if (op->ontimeout && r != -ETIMEDOUT)
+ timer.cancel_event(op->ontimeout);
+
+ if (op->session) {
+ _session_op_remove(op->session, op);
+ }
+
+ logger->dec(l_osdc_op_active);
+
+ ceph_assert(check_latest_map_ops.find(op->tid) == check_latest_map_ops.end());
+
+ inflight_ops--;
+
+ op->put();
+}
+
+Objecter::MOSDOp *Objecter::_prepare_osd_op(Op *op)
+{
+ // rwlock is locked
+
+ int flags = op->target.flags;
+ flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
+
+ // Nothing checks this any longer, but needed for compatibility with
+ // pre-luminous osds
+ flags |= CEPH_OSD_FLAG_ONDISK;
+
+ if (!honor_pool_full)
+ flags |= CEPH_OSD_FLAG_FULL_FORCE;
+
+ op->target.paused = false;
+ op->stamp = ceph::coarse_mono_clock::now();
+
+ hobject_t hobj = op->target.get_hobj();
+ auto m = new MOSDOp(client_inc, op->tid,
+ hobj, op->target.actual_pgid,
+ osdmap->get_epoch(),
+ flags, op->features);
+
+ m->set_snapid(op->snapid);
+ m->set_snap_seq(op->snapc.seq);
+ m->set_snaps(op->snapc.snaps);
+
+ m->ops = op->ops;
+ m->set_mtime(op->mtime);
+ m->set_retry_attempt(op->attempts++);
+
+ if (!op->trace.valid() && cct->_conf->osdc_blkin_trace_all) {
+ op->trace.init("op", &trace_endpoint);
+ }
+
+ if (op->priority)
+ m->set_priority(op->priority);
+ else
+ m->set_priority(cct->_conf->osd_client_op_priority);
+
+ if (op->reqid != osd_reqid_t()) {
+ m->set_reqid(op->reqid);
+ }
+
+ logger->inc(l_osdc_op_send);
+ ssize_t sum = 0;
+ for (unsigned i = 0; i < m->ops.size(); i++) {
+ sum += m->ops[i].indata.length();
+ }
+ logger->inc(l_osdc_op_send_bytes, sum);
+
+ return m;
+}
+
+void Objecter::_send_op(Op *op)
+{
+ // rwlock is locked
+ // op->session->lock is locked
+
+ // backoff?  if this object falls within a backoff interval the OSD has
+ // sent us, hold the op here; it will be resent when the backoff clears.
+ auto p = op->session->backoffs.find(op->target.actual_pgid);
+ if (p != op->session->backoffs.end()) {
+ hobject_t hoid = op->target.get_hobj();
+ auto q = p->second.lower_bound(hoid);
+ if (q != p->second.begin()) {
+ --q;
+ if (hoid >= q->second.end) {
+ ++q;
+ }
+ }
+ if (q != p->second.end()) {
+ ldout(cct, 20) << __func__ << " ? " << q->first << " [" << q->second.begin
+ << "," << q->second.end << ")" << dendl;
+ int r = cmp(hoid, q->second.begin);
+ if (r == 0 || (r > 0 && hoid < q->second.end)) {
+ ldout(cct, 10) << __func__ << " backoff " << op->target.actual_pgid
+ << " id " << q->second.id << " on " << hoid
+ << ", queuing " << op << " tid " << op->tid << dendl;
+ return;
+ }
+ }
+ }
+
+ ceph_assert(op->tid > 0);
+ MOSDOp *m = _prepare_osd_op(op);
+
+ if (op->target.actual_pgid != m->get_spg()) {
+ ldout(cct, 10) << __func__ << " " << op->tid << " pgid change from "
+ << m->get_spg() << " to " << op->target.actual_pgid
+ << ", updating and reencoding" << dendl;
+ m->set_spg(op->target.actual_pgid);
+ m->clear_payload(); // reencode
+ }
+
+ ldout(cct, 15) << "_send_op " << op->tid << " to "
+ << op->target.actual_pgid << " on osd." << op->session->osd
+ << dendl;
+
+ ConnectionRef con = op->session->con;
+ ceph_assert(con);
+
+#if 0
+ // preallocated rx ceph::buffer?
+ if (op->con) {
+ ldout(cct, 20) << " revoking rx ceph::buffer for " << op->tid << " on "
+ << op->con << dendl;
+ op->con->revoke_rx_buffer(op->tid);
+ }
+ if (op->outbl &&
+ op->ontimeout == 0 && // only post rx_buffer if no timeout; see #9582
+ op->outbl->length()) {
+ op->outbl->invalidate_crc(); // messenger writes through c_str()
+ ldout(cct, 20) << " posting rx ceph::buffer for " << op->tid << " on " << con
+ << dendl;
+ op->con = con;
+ op->con->post_rx_buffer(op->tid, *op->outbl);
+ }
+#endif
+
+ op->incarnation = op->session->incarnation;
+
+ if (op->trace.valid()) {
+ m->trace.init("op msg", nullptr, &op->trace);
+ }
+ op->session->con->send_message(m);
+}
+
+int Objecter::calc_op_budget(const bc::small_vector_base<OSDOp>& ops)
+{
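+ // Budget writes by the bytes we send, and reads by the bytes we expect
+ // back (extent length, or xattr name + value lengths).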
+ int op_budget = 0;
+ for (auto i = ops.begin(); i != ops.end(); ++i) {
+ if (i->op.op & CEPH_OSD_OP_MODE_WR) {
+ op_budget += i->indata.length();
+ } else if (ceph_osd_op_mode_read(i->op.op)) {
+ if (ceph_osd_op_uses_extent(i->op.op)) {
+ if ((int64_t)i->op.extent.length > 0)
+ op_budget += (int64_t)i->op.extent.length;
+ } else if (ceph_osd_op_type_attr(i->op.op)) {
+ op_budget += i->op.xattr.name_len + i->op.xattr.value_len;
+ }
+ }
+ }
+ return op_budget;
+}
+
+void Objecter::_throttle_op(Op *op,
+ shunique_lock<ceph::shared_mutex>& sul,
+ int op_budget)
+{
+ ceph_assert(sul && sul.mutex() == &rwlock);
+ bool locked_for_write = sul.owns_lock();
+
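+ // Block on the byte and op throttles, dropping rwlock while we wait and
+ // retaking it in the same mode (shared or exclusive) afterwards.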
+ if (!op_budget)
+ op_budget = calc_op_budget(op->ops);
+ if (!op_throttle_bytes.get_or_fail(op_budget)) { //couldn't take right now
+ sul.unlock();
+ op_throttle_bytes.get(op_budget);
+ if (locked_for_write)
+ sul.lock();
+ else
+ sul.lock_shared();
+ }
+ if (!op_throttle_ops.get_or_fail(1)) { //couldn't take right now
+ sul.unlock();
+ op_throttle_ops.get(1);
+ if (locked_for_write)
+ sul.lock();
+ else
+ sul.lock_shared();
+ }
+}
+
+int Objecter::take_linger_budget(LingerOp *info)
+{
+ return 1;
+}
+
+/* This function DOES put the passed message before returning */
+void Objecter::handle_osd_op_reply(MOSDOpReply *m)
+{
+ ldout(cct, 10) << "in handle_osd_op_reply" << dendl;
+
+ // get pio
+ ceph_tid_t tid = m->get_tid();
+
+ shunique_lock sul(rwlock, ceph::acquire_shared);
+ if (!initialized) {
+ m->put();
+ return;
+ }
+
+ ConnectionRef con = m->get_connection();
+ auto priv = con->get_priv();
+ auto s = static_cast<OSDSession*>(priv.get());
+ if (!s || s->con != con) {
+ ldout(cct, 7) << __func__ << " no session on con " << con << dendl;
+ m->put();
+ return;
+ }
+
+ unique_lock sl(s->lock);
+
+ map<ceph_tid_t, Op *>::iterator iter = s->ops.find(tid);
+ if (iter == s->ops.end()) {
+ ldout(cct, 7) << "handle_osd_op_reply " << tid
+ << (m->is_ondisk() ? " ondisk" : (m->is_onnvram() ?
+ " onnvram" : " ack"))
+ << " ... stray" << dendl;
+ sl.unlock();
+ m->put();
+ return;
+ }
+
+ ldout(cct, 7) << "handle_osd_op_reply " << tid
+ << (m->is_ondisk() ? " ondisk" :
+ (m->is_onnvram() ? " onnvram" : " ack"))
+ << " uv " << m->get_user_version()
+ << " in " << m->get_pg()
+ << " attempt " << m->get_retry_attempt()
+ << dendl;
+ Op *op = iter->second;
+ op->trace.event("osd op reply");
+
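+ // If configured, resubmit a write after its first reply so the OSD sees
+ // the request a second time.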
+ if (retry_writes_after_first_reply && op->attempts == 1 &&
+ (op->target.flags & CEPH_OSD_FLAG_WRITE)) {
+ ldout(cct, 7) << "retrying write after first reply: " << tid << dendl;
+ if (op->has_completion()) {
+ num_in_flight--;
+ }
+ _session_op_remove(s, op);
+ sl.unlock();
+
+ _op_submit(op, sul, NULL);
+ m->put();
+ return;
+ }
+
+ if (m->get_retry_attempt() >= 0) {
+ if (m->get_retry_attempt() != (op->attempts - 1)) {
+ ldout(cct, 7) << " ignoring reply from attempt "
+ << m->get_retry_attempt()
+ << " from " << m->get_source_inst()
+ << "; last attempt " << (op->attempts - 1) << " sent to "
+ << op->session->con->get_peer_addr() << dendl;
+ m->put();
+ sl.unlock();
+ return;
+ }
+ } else {
+ // we don't know the request attempt because the server is old, so
+ // just accept this one. we may do ACK callbacks we shouldn't
+ // have, but that is better than doing callbacks out of order.
+ }
+
+ decltype(op->onfinish) onfinish;
+
+ int rc = m->get_result();
+
+ if (m->is_redirect_reply()) {
+ ldout(cct, 5) << " got redirect reply; redirecting" << dendl;
+ if (op->has_completion())
+ num_in_flight--;
+ _session_op_remove(s, op);
+ sl.unlock();
+
+ // FIXME: two redirects could race and reorder
+
+ op->tid = 0;
+ m->get_redirect().combine_with_locator(op->target.target_oloc,
+ op->target.target_oid.name);
+ op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED |
+ CEPH_OSD_FLAG_IGNORE_CACHE |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY);
+ _op_submit(op, sul, NULL);
+ m->put();
+ return;
+ }
+
+ if (rc == -EAGAIN) {
+ ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
+ if (op->has_completion())
+ num_in_flight--;
+ _session_op_remove(s, op);
+ sl.unlock();
+
+ op->tid = 0;
+ op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
+ op->target.pgid = pg_t();
+ _op_submit(op, sul, NULL);
+ m->put();
+ return;
+ }
+
+ sul.unlock();
+
+ if (op->objver)
+ *op->objver = m->get_user_version();
+ if (op->reply_epoch)
+ *op->reply_epoch = m->get_map_epoch();
+ if (op->data_offset)
+ *op->data_offset = m->get_header().data_off;
+
+ // got data?
+ if (op->outbl) {
+#if 0
+ if (op->con)
+ op->con->revoke_rx_buffer(op->tid);
+#endif
+ auto& bl = m->get_data();
+ if (op->outbl->length() == bl.length() &&
+ bl.get_num_buffers() <= 1) {
+ // this is here to keep previous users who *relied* on getting data
+ // read into existing buffers happy.  Notably,
+ // libradosstriper::RadosStriperImpl::aio_read().
+ ldout(cct,10) << __func__ << " copying resulting " << bl.length()
+ << " into existing ceph::buffer of length " << op->outbl->length()
+ << dendl;
+ cb::list t;
+ t = std::move(*op->outbl);
+ t.invalidate_crc(); // we're overwriting the raw buffers via c_str()
+ bl.begin().copy(bl.length(), t.c_str());
+ op->outbl->substr_of(t, 0, bl.length());
+ } else {
+ m->claim_data(*op->outbl);
+ }
+ op->outbl = 0;
+ }
+
+ // per-op result demuxing
+ vector<OSDOp> out_ops;
+ m->claim_ops(out_ops);
+
+ if (out_ops.size() != op->ops.size())
+ ldout(cct, 0) << "WARNING: tid " << op->tid << " reply ops " << out_ops
+ << " != request ops " << op->ops
+ << " from " << m->get_source_inst() << dendl;
+
+ ceph_assert(op->ops.size() == op->out_bl.size());
+ ceph_assert(op->ops.size() == op->out_rval.size());
+ ceph_assert(op->ops.size() == op->out_ec.size());
+ ceph_assert(op->ops.size() == op->out_handler.size());
+ auto pb = op->out_bl.begin();
+ auto pr = op->out_rval.begin();
+ auto pe = op->out_ec.begin();
+ auto ph = op->out_handler.begin();
+ ceph_assert(op->out_bl.size() == op->out_rval.size());
+ ceph_assert(op->out_bl.size() == op->out_handler.size());
+ auto p = out_ops.begin();
+ for (unsigned i = 0;
+ p != out_ops.end() && pb != op->out_bl.end();
+ ++i, ++p, ++pb, ++pr, ++pe, ++ph) {
+ ldout(cct, 10) << " op " << i << " rval " << p->rval
+ << " len " << p->outdata.length() << dendl;
+ if (*pb)
+ **pb = p->outdata;
+ // set rval before running handlers so that handlers
+ // can change it if e.g. decoding fails
+ if (*pr)
+ **pr = ceph_to_hostos_errno(p->rval);
+ if (*pe)
+ **pe = p->rval < 0 ? bs::error_code(-p->rval, osd_category()) :
+ bs::error_code();
+ if (*ph) {
+ std::move((*ph))(p->rval < 0 ?
+ bs::error_code(-p->rval, osd_category()) :
+ bs::error_code(),
+ p->rval, p->outdata);
+ }
+ }
+
+ // NOTE: since we only ever request ONDISK, we assume we will only
+ // ever get back one (type of) ack.
+
+ if (op->has_completion()) {
+ num_in_flight--;
+ onfinish = std::move(op->onfinish);
+ op->onfinish = nullptr;
+ }
+ logger->inc(l_osdc_op_reply);
+
+ /* get it before we call _finish_op() */
+ auto completion_lock = s->get_lock(op->target.base_oid);
+
+ ldout(cct, 15) << "handle_osd_op_reply completed tid " << tid << dendl;
+ _finish_op(op, 0);
+
+ ldout(cct, 5) << num_in_flight << " in flight" << dendl;
+
+ // serialize completions
+ if (completion_lock.mutex()) {
+ completion_lock.lock();
+ }
+ sl.unlock();
+
+ // do callbacks
+ if (Op::has_completion(onfinish)) {
+ Op::complete(std::move(onfinish), osdcode(rc), rc);
+ }
+ if (completion_lock.mutex()) {
+ completion_lock.unlock();
+ }
+
+ m->put();
+}
+
+void Objecter::handle_osd_backoff(MOSDBackoff *m)
+{
+ ldout(cct, 10) << __func__ << " " << *m << dendl;
+ shunique_lock sul(rwlock, ceph::acquire_shared);
+ if (!initialized) {
+ m->put();
+ return;
+ }
+
+ ConnectionRef con = m->get_connection();
+ auto priv = con->get_priv();
+ auto s = static_cast<OSDSession*>(priv.get());
+ if (!s || s->con != con) {
+ ldout(cct, 7) << __func__ << " no session on con " << con << dendl;
+ m->put();
+ return;
+ }
+
+ get_session(s);
+
+ unique_lock sl(s->lock);
+
+ switch (m->op) {
+ case CEPH_OSD_BACKOFF_OP_BLOCK:
+ {
+ // register
+ OSDBackoff& b = s->backoffs[m->pgid][m->begin];
+ s->backoffs_by_id.insert(make_pair(m->id, &b));
+ b.pgid = m->pgid;
+ b.id = m->id;
+ b.begin = m->begin;
+ b.end = m->end;
+
+ // ack with original backoff's epoch so that the osd can discard this if
+ // there was a pg split.
+ auto r = new MOSDBackoff(m->pgid, m->map_epoch,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK,
+ m->id, m->begin, m->end);
+ // this priority must match the MOSDOps from _prepare_osd_op
+ r->set_priority(cct->_conf->osd_client_op_priority);
+ con->send_message(r);
+ }
+ break;
+
+ case CEPH_OSD_BACKOFF_OP_UNBLOCK:
+ {
+ auto p = s->backoffs_by_id.find(m->id);
+ if (p != s->backoffs_by_id.end()) {
+ OSDBackoff *b = p->second;
+ if (b->begin != m->begin &&
+ b->end != m->end) {
+ lderr(cct) << __func__ << " got " << m->pgid << " id " << m->id
+ << " unblock on ["
+ << m->begin << "," << m->end << ") but backoff is ["
+ << b->begin << "," << b->end << ")" << dendl;
+ // hrmpf, unblock it anyway.
+ }
+ ldout(cct, 10) << __func__ << " unblock backoff " << b->pgid
+ << " id " << b->id
+ << " [" << b->begin << "," << b->end
+ << ")" << dendl;
+ auto spgp = s->backoffs.find(b->pgid);
+ ceph_assert(spgp != s->backoffs.end());
+ spgp->second.erase(b->begin);
+ if (spgp->second.empty()) {
+ s->backoffs.erase(spgp);
+ }
+ s->backoffs_by_id.erase(p);
+
+ // check for any ops to resend
+ for (auto& q : s->ops) {
+ if (q.second->target.actual_pgid == m->pgid) {
+ int r = q.second->target.contained_by(m->begin, m->end);
+ ldout(cct, 20) << __func__ << " contained_by " << r << " on "
+ << q.second->target.get_hobj() << dendl;
+ if (r) {
+ _send_op(q.second);
+ }
+ }
+ }
+ } else {
+ lderr(cct) << __func__ << " " << m->pgid << " id " << m->id
+ << " unblock on ["
+ << m->begin << "," << m->end << ") but backoff dne" << dendl;
+ }
+ }
+ break;
+
+ default:
+ ldout(cct, 10) << __func__ << " unrecognized op " << (int)m->op << dendl;
+ }
+
+ sul.unlock();
+ sl.unlock();
+
+ m->put();
+ put_session(s);
+}
+
+uint32_t Objecter::list_nobjects_seek(NListContext *list_context,
+ uint32_t pos)
+{
+ shared_lock rl(rwlock);
+ list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP,
+ pos, list_context->pool_id, string());
+ ldout(cct, 10) << __func__ << " " << list_context
+ << " pos " << pos << " -> " << list_context->pos << dendl;
+ pg_t actual = osdmap->raw_pg_to_pg(pg_t(pos, list_context->pool_id));
+ list_context->current_pg = actual.ps();
+ list_context->at_end_of_pool = false;
+ return pos;
+}
+
+uint32_t Objecter::list_nobjects_seek(NListContext *list_context,
+ const hobject_t& cursor)
+{
+ shared_lock rl(rwlock);
+ ldout(cct, 10) << "list_nobjects_seek " << list_context << dendl;
+ list_context->pos = cursor;
+ list_context->at_end_of_pool = false;
+ pg_t actual = osdmap->raw_pg_to_pg(pg_t(cursor.get_hash(), list_context->pool_id));
+ list_context->current_pg = actual.ps();
+ list_context->sort_bitwise = true;
+ return list_context->current_pg;
+}
+
+void Objecter::list_nobjects_get_cursor(NListContext *list_context,
+ hobject_t *cursor)
+{
+ shared_lock rl(rwlock);
+ if (list_context->list.empty()) {
+ *cursor = list_context->pos;
+ } else {
+ const librados::ListObjectImpl& entry = list_context->list.front();
+ const string *key = (entry.locator.empty() ? &entry.oid : &entry.locator);
+ uint32_t h = osdmap->get_pg_pool(list_context->pool_id)->hash_key(*key, entry.nspace);
+ *cursor = hobject_t(entry.oid, entry.locator, list_context->pool_snap_seq, h, list_context->pool_id, entry.nspace);
+ }
+}
+
+void Objecter::list_nobjects(NListContext *list_context, Context *onfinish)
+{
+ ldout(cct, 10) << __func__ << " pool_id " << list_context->pool_id
+ << " pool_snap_seq " << list_context->pool_snap_seq
+ << " max_entries " << list_context->max_entries
+ << " list_context " << list_context
+ << " onfinish " << onfinish
+ << " current_pg " << list_context->current_pg
+ << " pos " << list_context->pos << dendl;
+
+ shared_lock rl(rwlock);
+ const pg_pool_t *pool = osdmap->get_pg_pool(list_context->pool_id);
+ if (!pool) { // pool is gone
+ rl.unlock();
+ put_nlist_context_budget(list_context);
+ onfinish->complete(-ENOENT);
+ return;
+ }
+ int pg_num = pool->get_pg_num();
+ bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
+
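+ // Starting a fresh listing: record the sort order and pg count so later
+ // calls can detect when either changes.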
+ if (list_context->pos.is_min()) {
+ list_context->starting_pg_num = 0;
+ list_context->sort_bitwise = sort_bitwise;
+ list_context->starting_pg_num = pg_num;
+ }
+ if (list_context->sort_bitwise != sort_bitwise) {
+ list_context->pos = hobject_t(
+ object_t(), string(), CEPH_NOSNAP,
+ list_context->current_pg, list_context->pool_id, string());
+ list_context->sort_bitwise = sort_bitwise;
+ ldout(cct, 10) << " hobject sort order changed, restarting this pg at "
+ << list_context->pos << dendl;
+ }
+ if (list_context->starting_pg_num != pg_num) {
+ if (!sort_bitwise) {
+ // start reading from the beginning; the pgs have changed
+ ldout(cct, 10) << " pg_num changed; restarting with " << pg_num << dendl;
+ list_context->pos = collection_list_handle_t();
+ }
+ list_context->starting_pg_num = pg_num;
+ }
+
+ if (list_context->pos.is_max()) {
+ ldout(cct, 20) << __func__ << " end of pool, list "
+ << list_context->list << dendl;
+ if (list_context->list.empty()) {
+ list_context->at_end_of_pool = true;
+ }
+ // release the listing context's budget once all
+ // OPs (in the session) are finished
+ put_nlist_context_budget(list_context);
+ onfinish->complete(0);
+ return;
+ }
+
+ ObjectOperation op;
+ op.pg_nls(list_context->max_entries, list_context->filter,
+ list_context->pos, osdmap->get_epoch());
+ list_context->bl.clear();
+ auto onack = new C_NList(list_context, onfinish, this);
+ object_locator_t oloc(list_context->pool_id, list_context->nspace);
+
+ // note current_pg in case we don't have (or lose) SORTBITWISE
+ list_context->current_pg = pool->raw_hash_to_pg(list_context->pos.get_hash());
+ rl.unlock();
+
+ pg_read(list_context->current_pg, oloc, op,
+ &list_context->bl, 0, onack, &onack->epoch,
+ &list_context->ctx_budget);
+}
+
+void Objecter::_nlist_reply(NListContext *list_context, int r,
+ Context *final_finish, epoch_t reply_epoch)
+{
+ ldout(cct, 10) << __func__ << " " << list_context << dendl;
+
+ auto iter = list_context->bl.cbegin();
+ pg_nls_response_t response;
+ decode(response, iter);
+ if (!iter.end()) {
+ // decode (and ignore) the legacy extra-info blob, if present.
+ cb::list legacy_extra_info;
+ decode(legacy_extra_info, iter);
+ }
+
+ // if the osd returns 1 (newer code), or the handle is MAX, it means
+ // we hit the end of the pg.
+ if ((response.handle.is_max() || r == 1) &&
+ !list_context->sort_bitwise) {
+ // legacy OSD and !sortbitwise, figure out the next PG on our own
+ ++list_context->current_pg;
+ if (list_context->current_pg == list_context->starting_pg_num) {
+ // end of pool
+ list_context->pos = hobject_t::get_max();
+ } else {
+ // next pg
+ list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP,
+ list_context->current_pg,
+ list_context->pool_id, string());
+ }
+ } else {
+ list_context->pos = response.handle;
+ }
+
+ int response_size = response.entries.size();
+ ldout(cct, 20) << " response.entries.size " << response_size
+ << ", response.entries " << response.entries
+ << ", handle " << response.handle
+ << ", tentative new pos " << list_context->pos << dendl;
+ if (response_size) {
+ std::move(response.entries.begin(), response.entries.end(),
+ std::back_inserter(list_context->list));
+ response.entries.clear();
+ }
+
+ if (list_context->list.size() >= list_context->max_entries) {
+ ldout(cct, 20) << " hit max, returning results so far, "
+ << list_context->list << dendl;
+ // release the listing context's budget once all
+ // OPs (in the session) are finished
+ put_nlist_context_budget(list_context);
+ final_finish->complete(0);
+ return;
+ }
+
+ // continue!
+ list_nobjects(list_context, final_finish);
+}
+
+void Objecter::put_nlist_context_budget(NListContext *list_context)
+{
+ if (list_context->ctx_budget >= 0) {
+ ldout(cct, 10) << " release listing context's budget " <<
+ list_context->ctx_budget << dendl;
+ put_op_budget_bytes(list_context->ctx_budget);
+ list_context->ctx_budget = -1;
+ }
+}
+
+// snapshots
+
+void Objecter::create_pool_snap(int64_t pool, std::string_view snap_name,
+ decltype(PoolOp::onfinish)&& onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "create_pool_snap; pool: " << pool << "; snap: "
+ << snap_name << dendl;
+
+ const pg_pool_t *p = osdmap->get_pg_pool(pool);
+ if (!p) {
+ onfinish->defer(std::move(onfinish), osdc_errc::pool_dne, cb::list{});
+ return;
+ }
+ if (p->snap_exists(snap_name)) {
+ onfinish->defer(std::move(onfinish), osdc_errc::snapshot_exists,
+ cb::list{});
+ return;
+ }
+
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = pool;
+ op->name = snap_name;
+ op->onfinish = std::move(onfinish);
+ op->pool_op = POOL_OP_CREATE_SNAP;
+ pool_ops[op->tid] = op;
+
+ pool_op_submit(op);
+}
+
+struct CB_SelfmanagedSnap {
+ std::unique_ptr<ca::Completion<void(bs::error_code, snapid_t)>> fin;
+ CB_SelfmanagedSnap(decltype(fin)&& fin)
+ : fin(std::move(fin)) {}
+ void operator()(bs::error_code ec, const cb::list& bl) {
+ snapid_t snapid = 0;
+ if (!ec) {
+ try {
+ auto p = bl.cbegin();
+ decode(snapid, p);
+ } catch (const cb::error& e) {
+ ec = e.code();
+ }
+ }
+ fin->defer(std::move(fin), ec, snapid);
+ }
+};
+
+void Objecter::allocate_selfmanaged_snap(
+ int64_t pool,
+ std::unique_ptr<ca::Completion<void(bs::error_code, snapid_t)>> onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "allocate_selfmanaged_snap; pool: " << pool << dendl;
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = pool;
+ op->onfinish = PoolOp::OpComp::create(
+ service.get_executor(),
+ CB_SelfmanagedSnap(std::move(onfinish)));
+ op->pool_op = POOL_OP_CREATE_UNMANAGED_SNAP;
+ pool_ops[op->tid] = op;
+
+ pool_op_submit(op);
+}
+
+void Objecter::delete_pool_snap(
+ int64_t pool, std::string_view snap_name,
+ decltype(PoolOp::onfinish)&& onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "delete_pool_snap; pool: " << pool << "; snap: "
+ << snap_name << dendl;
+
+ const pg_pool_t *p = osdmap->get_pg_pool(pool);
+ if (!p) {
+ onfinish->defer(std::move(onfinish), osdc_errc::pool_dne, cb::list{});
+ return;
+ }
+
+ if (!p->snap_exists(snap_name)) {
+ onfinish->defer(std::move(onfinish), osdc_errc::snapshot_dne, cb::list{});
+ return;
+ }
+
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = pool;
+ op->name = snap_name;
+ op->onfinish = std::move(onfinish);
+ op->pool_op = POOL_OP_DELETE_SNAP;
+ pool_ops[op->tid] = op;
+
+ pool_op_submit(op);
+}
+
+void Objecter::delete_selfmanaged_snap(int64_t pool, snapid_t snap,
+ decltype(PoolOp::onfinish)&& onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "delete_selfmanaged_snap; pool: " << pool << "; snap: "
+ << snap << dendl;
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = pool;
+ op->onfinish = std::move(onfinish);
+ op->pool_op = POOL_OP_DELETE_UNMANAGED_SNAP;
+ op->snapid = snap;
+ pool_ops[op->tid] = op;
+
+ pool_op_submit(op);
+}
+
+void Objecter::create_pool(std::string_view name,
+ decltype(PoolOp::onfinish)&& onfinish,
+ int crush_rule)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "create_pool name=" << name << dendl;
+
+ if (osdmap->lookup_pg_pool_name(name) >= 0) {
+ onfinish->defer(std::move(onfinish), osdc_errc::pool_exists, cb::list{});
+ return;
+ }
+
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = 0;
+ op->name = name;
+ op->onfinish = std::move(onfinish);
+ op->pool_op = POOL_OP_CREATE;
+ pool_ops[op->tid] = op;
+ op->crush_rule = crush_rule;
+
+ pool_op_submit(op);
+}
+
+void Objecter::delete_pool(int64_t pool,
+ decltype(PoolOp::onfinish)&& onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "delete_pool " << pool << dendl;
+
+ if (!osdmap->have_pg_pool(pool))
+ onfinish->defer(std::move(onfinish), osdc_errc::pool_dne, cb::list{});
+ else
+ _do_delete_pool(pool, std::move(onfinish));
+}
+
+void Objecter::delete_pool(std::string_view pool_name,
+ decltype(PoolOp::onfinish)&& onfinish)
+{
+ unique_lock wl(rwlock);
+ ldout(cct, 10) << "delete_pool " << pool_name << dendl;
+
+ int64_t pool = osdmap->lookup_pg_pool_name(pool_name);
+ if (pool < 0)
+ // This only returns one error: -ENOENT.
+ onfinish->defer(std::move(onfinish), osdc_errc::pool_dne, cb::list{});
+ else
+ _do_delete_pool(pool, std::move(onfinish));
+}
+
+void Objecter::_do_delete_pool(int64_t pool,
+ decltype(PoolOp::onfinish)&& onfinish)
+
+{
+ auto op = new PoolOp;
+ op->tid = ++last_tid;
+ op->pool = pool;
+ op->name = "delete";
+ op->onfinish = std::move(onfinish);
+ op->pool_op = POOL_OP_DELETE;
+ pool_ops[op->tid] = op;
+ pool_op_submit(op);
+}
+
+void Objecter::pool_op_submit(PoolOp *op)
+{
+ // rwlock is locked
+ if (mon_timeout > timespan(0)) {
+ op->ontimeout = timer.add_event(mon_timeout,
+ [this, op]() {
+ pool_op_cancel(op->tid, -ETIMEDOUT); });
+ }
+ _pool_op_submit(op);
+}
+
+void Objecter::_pool_op_submit(PoolOp *op)
+{
+ // rwlock is locked unique
+
+ ldout(cct, 10) << "pool_op_submit " << op->tid << dendl;
+ auto m = new MPoolOp(monc->get_fsid(), op->tid, op->pool,
+ op->name, op->pool_op,
+ last_seen_osdmap_version);
+ if (op->snapid) m->snapid = op->snapid;
+ if (op->crush_rule) m->crush_rule = op->crush_rule;
+ monc->send_mon_message(m);
+ op->last_submit = ceph::coarse_mono_clock::now();
+
+ logger->inc(l_osdc_poolop_send);
+}
+
+/**
+ * Handle a reply to a PoolOp message. Check that we sent the message
+ * and give the caller responsibility for the returned cb::list.
+ * Then either call the finisher or stash the PoolOp, depending on
+ * whether we have a new enough map.
+ * Lastly, clean up the message and PoolOp.
+ */
+void Objecter::handle_pool_op_reply(MPoolOpReply *m)
+{
+ int rc = m->replyCode;
+ auto ec = rc < 0 ? bs::error_code(-rc, mon_category()) : bs::error_code();
+ FUNCTRACE(cct);
+ shunique_lock sul(rwlock, acquire_shared);
+ if (!initialized) {
+ sul.unlock();
+ m->put();
+ return;
+ }
+
+ ldout(cct, 10) << "handle_pool_op_reply " << *m << dendl;
+ ceph_tid_t tid = m->get_tid();
+ auto iter = pool_ops.find(tid);
+ if (iter != pool_ops.end()) {
+ PoolOp *op = iter->second;
+ ldout(cct, 10) << "have request " << tid << " at " << op << " Op: "
+ << ceph_pool_op_name(op->pool_op) << dendl;
+ cb::list bl{std::move(m->response_data)};
+ if (m->version > last_seen_osdmap_version)
+ last_seen_osdmap_version = m->version;
+ if (osdmap->get_epoch() < m->epoch) {
+ sul.unlock();
+ sul.lock();
+ // recheck op existence since we have let go of rwlock
+ // (for promotion) above.
+ iter = pool_ops.find(tid);
+ if (iter == pool_ops.end())
+ goto done; // op is gone.
+ if (osdmap->get_epoch() < m->epoch) {
+ ldout(cct, 20) << "waiting for client to reach epoch " << m->epoch
+ << " before calling back" << dendl;
+ _wait_for_new_map(OpCompletion::create(
+ service.get_executor(),
+ [o = std::move(op->onfinish),
+ bl = std::move(bl)](
+ bs::error_code ec) mutable {
+ o->defer(std::move(o), ec, bl);
+ }),
+ m->epoch,
+ ec);
+ } else {
+ // map epoch changed, probably because a MOSDMap message
+ // sneaked in. Do caller-specified callback now or else
+ // we lose it forever.
+ ceph_assert(op->onfinish);
+ op->onfinish->defer(std::move(op->onfinish), ec, std::move(bl));
+ }
+ } else {
+ ceph_assert(op->onfinish);
+ op->onfinish->defer(std::move(op->onfinish), ec, std::move(bl));
+ }
+ op->onfinish = nullptr;
+ if (!sul.owns_lock()) {
+ sul.unlock();
+ sul.lock();
+ }
+ iter = pool_ops.find(tid);
+ if (iter != pool_ops.end()) {
+ _finish_pool_op(op, 0);
+ }
+ } else {
+ ldout(cct, 10) << "unknown request " << tid << dendl;
+ }
+
+done:
+ // Not strictly necessary, since we'll release it on return.
+ sul.unlock();
+
+ ldout(cct, 10) << "done" << dendl;
+ m->put();
+}
+
+int Objecter::pool_op_cancel(ceph_tid_t tid, int r)
+{
+ ceph_assert(initialized);
+
+ unique_lock wl(rwlock);
+
+ auto it = pool_ops.find(tid);
+ if (it == pool_ops.end()) {
+ ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+ return -ENOENT;
+ }
+
+ ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+
+ PoolOp *op = it->second;
+ if (op->onfinish)
+ op->onfinish->defer(std::move(op->onfinish), osdcode(r), cb::list{});
+
+ _finish_pool_op(op, r);
+ return 0;
+}
+
+void Objecter::_finish_pool_op(PoolOp *op, int r)
+{
+ // rwlock is locked unique
+ pool_ops.erase(op->tid);
+ logger->set(l_osdc_poolop_active, pool_ops.size());
+
+ if (op->ontimeout && r != -ETIMEDOUT) {
+ timer.cancel_event(op->ontimeout);
+ }
+
+ delete op;
+}
+
+// pool stats
+
+void Objecter::get_pool_stats(
+ const std::vector<std::string>& pools,
+ decltype(PoolStatOp::onfinish)&& onfinish)
+{
+ ldout(cct, 10) << "get_pool_stats " << pools << dendl;
+
+ auto op = new PoolStatOp;
+ op->tid = ++last_tid;
+ op->pools = pools;
+ op->onfinish = std::move(onfinish);
+ if (mon_timeout > timespan(0)) {
+ op->ontimeout = timer.add_event(mon_timeout,
+ [this, op]() {
+ pool_stat_op_cancel(op->tid,
+ -ETIMEDOUT); });
+ } else {
+ op->ontimeout = 0;
+ }
+
+ unique_lock wl(rwlock);
+
+ poolstat_ops[op->tid] = op;
+
+ logger->set(l_osdc_poolstat_active, poolstat_ops.size());
+
+ _poolstat_submit(op);
+}
+
+void Objecter::_poolstat_submit(PoolStatOp *op)
+{
+ ldout(cct, 10) << "_poolstat_submit " << op->tid << dendl;
+ monc->send_mon_message(new MGetPoolStats(monc->get_fsid(), op->tid,
+ op->pools,
+ last_seen_pgmap_version));
+ op->last_submit = ceph::coarse_mono_clock::now();
+
+ logger->inc(l_osdc_poolstat_send);
+}
+
+void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m)
+{
+ ldout(cct, 10) << "handle_get_pool_stats_reply " << *m << dendl;
+ ceph_tid_t tid = m->get_tid();
+
+ unique_lock wl(rwlock);
+ if (!initialized) {
+ m->put();
+ return;
+ }
+
+ auto iter = poolstat_ops.find(tid);
+ if (iter != poolstat_ops.end()) {
+ PoolStatOp *op = poolstat_ops[tid];
+ ldout(cct, 10) << "have request " << tid << " at " << op << dendl;
+ if (m->version > last_seen_pgmap_version) {
+ last_seen_pgmap_version = m->version;
+ }
+ op->onfinish->defer(std::move(op->onfinish), bs::error_code{},
+ std::move(m->pool_stats), m->per_pool);
+ _finish_pool_stat_op(op, 0);
+ } else {
+ ldout(cct, 10) << "unknown request " << tid << dendl;
+ }
+ ldout(cct, 10) << "done" << dendl;
+ m->put();
+}
+
+int Objecter::pool_stat_op_cancel(ceph_tid_t tid, int r)
+{
+ ceph_assert(initialized);
+
+ unique_lock wl(rwlock);
+
+ auto it = poolstat_ops.find(tid);
+ if (it == poolstat_ops.end()) {
+ ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+ return -ENOENT;
+ }
+
+ ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+
+ auto op = it->second;
+ if (op->onfinish)
+ op->onfinish->defer(std::move(op->onfinish), osdcode(r),
+ bc::flat_map<std::string, pool_stat_t>{}, false);
+ _finish_pool_stat_op(op, r);
+ return 0;
+}
+
+void Objecter::_finish_pool_stat_op(PoolStatOp *op, int r)
+{
+ // rwlock is locked unique
+
+ poolstat_ops.erase(op->tid);
+ logger->set(l_osdc_poolstat_active, poolstat_ops.size());
+
+ if (op->ontimeout && r != -ETIMEDOUT)
+ timer.cancel_event(op->ontimeout);
+
+ delete op;
+}
+
+void Objecter::get_fs_stats(boost::optional<int64_t> poolid,
+ decltype(StatfsOp::onfinish)&& onfinish)
+{
+ ldout(cct, 10) << "get_fs_stats" << dendl;
+ unique_lock l(rwlock);
+
+ auto op = new StatfsOp;
+ op->tid = ++last_tid;
+ op->data_pool = poolid;
+ op->onfinish = std::move(onfinish);
+ if (mon_timeout > timespan(0)) {
+ op->ontimeout = timer.add_event(mon_timeout,
+ [this, op]() {
+ statfs_op_cancel(op->tid,
+ -ETIMEDOUT); });
+ } else {
+ op->ontimeout = 0;
+ }
+ statfs_ops[op->tid] = op;
+
+ logger->set(l_osdc_statfs_active, statfs_ops.size());
+
+ _fs_stats_submit(op);
+}
+
+void Objecter::_fs_stats_submit(StatfsOp *op)
+{
+ // rwlock is locked unique
+
+ ldout(cct, 10) << "fs_stats_submit" << op->tid << dendl;
+ monc->send_mon_message(new MStatfs(monc->get_fsid(), op->tid,
+ op->data_pool,
+ last_seen_pgmap_version));
+ op->last_submit = ceph::coarse_mono_clock::now();
+
+ logger->inc(l_osdc_statfs_send);
+}
+
+void Objecter::handle_fs_stats_reply(MStatfsReply *m)
+{
+ unique_lock wl(rwlock);
+ if (!initialized) {
+ m->put();
+ return;
+ }
+
+ ldout(cct, 10) << "handle_fs_stats_reply " << *m << dendl;
+ ceph_tid_t tid = m->get_tid();
+
+ if (statfs_ops.count(tid)) {
+ StatfsOp *op = statfs_ops[tid];
+ ldout(cct, 10) << "have request " << tid << " at " << op << dendl;
+ if (m->h.version > last_seen_pgmap_version)
+ last_seen_pgmap_version = m->h.version;
+ op->onfinish->defer(std::move(op->onfinish), bs::error_code{}, m->h.st);
+ _finish_statfs_op(op, 0);
+ } else {
+ ldout(cct, 10) << "unknown request " << tid << dendl;
+ }
+ m->put();
+ ldout(cct, 10) << "done" << dendl;
+}
+
+int Objecter::statfs_op_cancel(ceph_tid_t tid, int r)
+{
+ ceph_assert(initialized);
+
+ unique_lock wl(rwlock);
+
+ auto it = statfs_ops.find(tid);
+ if (it == statfs_ops.end()) {
+ ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+ return -ENOENT;
+ }
+
+ ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+
+ auto op = it->second;
+ if (op->onfinish)
+ op->onfinish->defer(std::move(op->onfinish), osdcode(r), ceph_statfs{});
+ _finish_statfs_op(op, r);
+ return 0;
+}
+
+void Objecter::_finish_statfs_op(StatfsOp *op, int r)
+{
+ // rwlock is locked unique
+
+ statfs_ops.erase(op->tid);
+ logger->set(l_osdc_statfs_active, statfs_ops.size());
+
+ if (op->ontimeout && r != -ETIMEDOUT)
+ timer.cancel_event(op->ontimeout);
+
+ delete op;
+}
+
+// scatter/gather
+
+void Objecter::_sg_read_finish(vector<ObjectExtent>& extents,
+ vector<cb::list>& resultbl,
+ cb::list *bl, Context *onfinish)
+{
+ // all done
+ ldout(cct, 15) << "_sg_read_finish" << dendl;
+
+ if (extents.size() > 1) {
+ Striper::StripedReadResult r;
+ auto bit = resultbl.begin();
+ for (auto eit = extents.begin();
+ eit != extents.end();
+ ++eit, ++bit) {
+ r.add_partial_result(cct, *bit, eit->buffer_extents);
+ }
+ bl->clear();
+ r.assemble_result(cct, *bl, false);
+ } else {
+ ldout(cct, 15) << " only one frag" << dendl;
+ *bl = std::move(resultbl[0]);
+ }
+
+ // done
+ uint64_t bytes_read = bl->length();
+ ldout(cct, 7) << "_sg_read_finish " << bytes_read << " bytes" << dendl;
+
+ if (onfinish) {
+ onfinish->complete(bytes_read);// > 0 ? bytes_read:m->get_result());
+ }
+}
+
+
+void Objecter::ms_handle_connect(Connection *con)
+{
+ ldout(cct, 10) << "ms_handle_connect " << con << dendl;
+ if (!initialized)
+ return;
+
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON)
+ resend_mon_ops();
+}
+
+bool Objecter::ms_handle_reset(Connection *con)
+{
+ if (!initialized)
+ return false;
+ if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
+ unique_lock wl(rwlock);
+
+ auto priv = con->get_priv();
+ auto session = static_cast<OSDSession*>(priv.get());
+ if (session) {
+ ldout(cct, 1) << "ms_handle_reset " << con << " session " << session
+ << " osd." << session->osd << dendl;
+ // the session may already have been closed if a just-handled osdmap
+ // marked the osd down
+ if (!(initialized && osdmap->is_up(session->osd))) {
+ ldout(cct, 1) << "ms_handle_reset aborted,initialized=" << initialized << dendl;
+ wl.unlock();
+ return false;
+ }
+ map<uint64_t, LingerOp *> lresend;
+ unique_lock sl(session->lock);
+ _reopen_session(session);
+ _kick_requests(session, lresend);
+ sl.unlock();
+ _linger_ops_resend(lresend, wl);
+ wl.unlock();
+ maybe_request_map();
+ }
+ return true;
+ }
+ return false;
+}
+
+void Objecter::ms_handle_remote_reset(Connection *con)
+{
+ /*
+ * treat these the same.
+ */
+ ms_handle_reset(con);
+}
+
+bool Objecter::ms_handle_refused(Connection *con)
+{
+ // just log for now
+ if (osdmap && (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD)) {
+ int osd = osdmap->identify_osd(con->get_peer_addr());
+ if (osd >= 0) {
+ ldout(cct, 1) << "ms_handle_refused on osd." << osd << dendl;
+ }
+ }
+ return false;
+}
+
+void Objecter::op_target_t::dump(Formatter *f) const
+{
+ f->dump_stream("pg") << pgid;
+ f->dump_int("osd", osd);
+ f->dump_stream("object_id") << base_oid;
+ f->dump_stream("object_locator") << base_oloc;
+ f->dump_stream("target_object_id") << target_oid;
+ f->dump_stream("target_object_locator") << target_oloc;
+ f->dump_int("paused", (int)paused);
+ f->dump_int("used_replica", (int)used_replica);
+ f->dump_int("precalc_pgid", (int)precalc_pgid);
+}
+
+void Objecter::_dump_active(OSDSession *s)
+{
+ for (auto p = s->ops.begin(); p != s->ops.end(); ++p) {
+ Op *op = p->second;
+ ldout(cct, 20) << op->tid << "\t" << op->target.pgid
+ << "\tosd." << (op->session ? op->session->osd : -1)
+ << "\t" << op->target.base_oid
+ << "\t" << op->ops << dendl;
+ }
+}
+
+void Objecter::_dump_active()
+{
+ ldout(cct, 20) << "dump_active .. " << num_homeless_ops << " homeless"
+ << dendl;
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ auto s = siter->second;
+ shared_lock sl(s->lock);
+ _dump_active(s);
+ sl.unlock();
+ }
+ _dump_active(homeless_session);
+}
+
+void Objecter::dump_active()
+{
+ shared_lock rl(rwlock);
+ _dump_active();
+ rl.unlock();
+}
+
+void Objecter::dump_requests(Formatter *fmt)
+{
+ // Read-lock on Objecter held here
+ fmt->open_object_section("requests");
+ dump_ops(fmt);
+ dump_linger_ops(fmt);
+ dump_pool_ops(fmt);
+ dump_pool_stat_ops(fmt);
+ dump_statfs_ops(fmt);
+ dump_command_ops(fmt);
+ fmt->close_section(); // requests object
+}
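+
+// Illustrative sketch only: fed through a JSON Formatter (as done by the
+// RequestStateHook admin socket handler further down), dump_requests()
+// emits roughly the following structure; all field values here are
+// hypothetical.
+//
+//   {
+//     "ops": [ { "tid": 42, "pg": "1.7f", "osd": 3, "attempts": 1,
+//                "osd_ops": [ "read 0~4096" ] } ],
+//     "linger_ops": [],
+//     "pool_ops": [],
+//     "pool_stat_ops": [],
+//     "statfs_ops": [],
+//     "command_ops": []
+//   }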
+
+void Objecter::_dump_ops(const OSDSession *s, Formatter *fmt)
+{
+ for (auto p = s->ops.begin(); p != s->ops.end(); ++p) {
+ Op *op = p->second;
+ auto age = std::chrono::duration<double>(ceph::coarse_mono_clock::now() - op->stamp);
+ fmt->open_object_section("op");
+ fmt->dump_unsigned("tid", op->tid);
+ op->target.dump(fmt);
+ fmt->dump_stream("last_sent") << op->stamp;
+ fmt->dump_float("age", age.count());
+ fmt->dump_int("attempts", op->attempts);
+ fmt->dump_stream("snapid") << op->snapid;
+ fmt->dump_stream("snap_context") << op->snapc;
+ fmt->dump_stream("mtime") << op->mtime;
+
+ fmt->open_array_section("osd_ops");
+ for (auto it = op->ops.begin(); it != op->ops.end(); ++it) {
+ fmt->dump_stream("osd_op") << *it;
+ }
+ fmt->close_section(); // osd_ops array
+
+ fmt->close_section(); // op object
+ }
+}
+
+void Objecter::dump_ops(Formatter *fmt)
+{
+ // Read-lock on Objecter held
+ fmt->open_array_section("ops");
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ OSDSession *s = siter->second;
+ shared_lock sl(s->lock);
+ _dump_ops(s, fmt);
+ sl.unlock();
+ }
+ _dump_ops(homeless_session, fmt);
+ fmt->close_section(); // ops array
+}
+
+void Objecter::_dump_linger_ops(const OSDSession *s, Formatter *fmt)
+{
+ for (auto p = s->linger_ops.begin(); p != s->linger_ops.end(); ++p) {
+ auto op = p->second;
+ fmt->open_object_section("linger_op");
+ fmt->dump_unsigned("linger_id", op->linger_id);
+ op->target.dump(fmt);
+ fmt->dump_stream("snapid") << op->snap;
+ fmt->dump_stream("registered") << op->registered;
+ fmt->close_section(); // linger_op object
+ }
+}
+
+void Objecter::dump_linger_ops(Formatter *fmt)
+{
+ // We have a read-lock on the objecter
+ fmt->open_array_section("linger_ops");
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ auto s = siter->second;
+ shared_lock sl(s->lock);
+ _dump_linger_ops(s, fmt);
+ sl.unlock();
+ }
+ _dump_linger_ops(homeless_session, fmt);
+ fmt->close_section(); // linger_ops array
+}
+
+void Objecter::_dump_command_ops(const OSDSession *s, Formatter *fmt)
+{
+ for (auto p = s->command_ops.begin(); p != s->command_ops.end(); ++p) {
+ auto op = p->second;
+ fmt->open_object_section("command_op");
+ fmt->dump_unsigned("command_id", op->tid);
+ fmt->dump_int("osd", op->session ? op->session->osd : -1);
+ fmt->open_array_section("command");
+ for (auto q = op->cmd.begin(); q != op->cmd.end(); ++q)
+ fmt->dump_string("word", *q);
+ fmt->close_section();
+ if (op->target_osd >= 0)
+ fmt->dump_int("target_osd", op->target_osd);
+ else
+ fmt->dump_stream("target_pg") << op->target_pg;
+ fmt->close_section(); // command_op object
+ }
+}
+
+void Objecter::dump_command_ops(Formatter *fmt)
+{
+ // We have a read-lock on the Objecter here
+ fmt->open_array_section("command_ops");
+ for (auto siter = osd_sessions.begin();
+ siter != osd_sessions.end(); ++siter) {
+ auto s = siter->second;
+ shared_lock sl(s->lock);
+ _dump_command_ops(s, fmt);
+ sl.unlock();
+ }
+ _dump_command_ops(homeless_session, fmt);
+ fmt->close_section(); // command_ops array
+}
+
+void Objecter::dump_pool_ops(Formatter *fmt) const
+{
+ fmt->open_array_section("pool_ops");
+ for (auto p = pool_ops.begin(); p != pool_ops.end(); ++p) {
+ auto op = p->second;
+ fmt->open_object_section("pool_op");
+ fmt->dump_unsigned("tid", op->tid);
+ fmt->dump_int("pool", op->pool);
+ fmt->dump_string("name", op->name);
+ fmt->dump_int("operation_type", op->pool_op);
+ fmt->dump_unsigned("crush_rule", op->crush_rule);
+ fmt->dump_stream("snapid") << op->snapid;
+ fmt->dump_stream("last_sent") << op->last_submit;
+ fmt->close_section(); // pool_op object
+ }
+ fmt->close_section(); // pool_ops array
+}
+
+void Objecter::dump_pool_stat_ops(Formatter *fmt) const
+{
+ fmt->open_array_section("pool_stat_ops");
+ for (auto p = poolstat_ops.begin();
+ p != poolstat_ops.end();
+ ++p) {
+ PoolStatOp *op = p->second;
+ fmt->open_object_section("pool_stat_op");
+ fmt->dump_unsigned("tid", op->tid);
+ fmt->dump_stream("last_sent") << op->last_submit;
+
+ fmt->open_array_section("pools");
+ for (const auto& it : op->pools) {
+ fmt->dump_string("pool", it);
+ }
+ fmt->close_section(); // pools array
+
+ fmt->close_section(); // pool_stat_op object
+ }
+ fmt->close_section(); // pool_stat_ops array
+}
+
+void Objecter::dump_statfs_ops(Formatter *fmt) const
+{
+ fmt->open_array_section("statfs_ops");
+ for (auto p = statfs_ops.begin(); p != statfs_ops.end(); ++p) {
+ auto op = p->second;
+ fmt->open_object_section("statfs_op");
+ fmt->dump_unsigned("tid", op->tid);
+ fmt->dump_stream("last_sent") << op->last_submit;
+ fmt->close_section(); // statfs_op object
+ }
+ fmt->close_section(); // statfs_ops array
+}
+
+Objecter::RequestStateHook::RequestStateHook(Objecter *objecter) :
+ m_objecter(objecter)
+{
+}
+
+int Objecter::RequestStateHook::call(std::string_view command,
+ const cmdmap_t& cmdmap,
+ Formatter *f,
+ std::ostream& ss,
+ cb::list& out)
+{
+ shared_lock rl(m_objecter->rwlock);
+ m_objecter->dump_requests(f);
+ return 0;
+}
+
+void Objecter::blocklist_self(bool set)
+{
+ ldout(cct, 10) << "blocklist_self " << (set ? "add" : "rm") << dendl;
+
+ vector<string> cmd;
+ cmd.push_back("{\"prefix\":\"osd blocklist\", ");
+ if (set)
+ cmd.push_back("\"blocklistop\":\"add\",");
+ else
+ cmd.push_back("\"blocklistop\":\"rm\",");
+ stringstream ss;
+ // this is somewhat imprecise in that we are blocklisting our first addr only
+ ss << messenger->get_myaddrs().front().get_legacy_str();
+ cmd.push_back("\"addr\":\"" + ss.str() + "\"");
+
+ auto m = new MMonCommand(monc->get_fsid());
+ m->cmd = cmd;
+
+ // NOTE: no fallback to legacy blacklist command implemented here
+ // since this is only used for test code.
+
+ monc->send_mon_message(m);
+}
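+
+// For reference, the fragments pushed above amount to the monitor command
+// "osd blocklist add <addr>" (or "... rm <addr>"), where <addr> is this
+// client's first messenger address in its legacy string form.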
+
+// commands
+
+void Objecter::handle_command_reply(MCommandReply *m)
+{
+ unique_lock wl(rwlock);
+ if (!initialized) {
+ m->put();
+ return;
+ }
+
+ ConnectionRef con = m->get_connection();
+ auto priv = con->get_priv();
+ auto s = static_cast<OSDSession*>(priv.get());
+ if (!s || s->con != con) {
+ ldout(cct, 7) << __func__ << " no session on con " << con << dendl;
+ m->put();
+ return;
+ }
+
+ shared_lock sl(s->lock);
+ auto p = s->command_ops.find(m->get_tid());
+ if (p == s->command_ops.end()) {
+ ldout(cct, 10) << "handle_command_reply tid " << m->get_tid()
+ << " not found" << dendl;
+ m->put();
+ sl.unlock();
+ return;
+ }
+
+ CommandOp *c = p->second;
+ if (!c->session ||
+ m->get_connection() != c->session->con) {
+ ldout(cct, 10) << "handle_command_reply tid " << m->get_tid()
+ << " got reply from wrong connection "
+ << m->get_connection() << " " << m->get_source_inst()
+ << dendl;
+ m->put();
+ sl.unlock();
+ return;
+ }
+
+ if (m->r == -EAGAIN) {
+ ldout(cct,10) << __func__ << " tid " << m->get_tid()
+ << " got EAGAIN, requesting map and resending" << dendl;
+ // NOTE: This might resend twice... once now, and once again when
+ // we get an updated osdmap and the PG is found to have moved.
+ _maybe_request_map();
+ _send_command(c);
+ m->put();
+ sl.unlock();
+ return;
+ }
+
+ sl.unlock();
+
+ unique_lock sul(s->lock);
+ _finish_command(c, m->r < 0 ? bs::error_code(-m->r, osd_category()) :
+ bs::error_code(), std::move(m->rs),
+ std::move(m->get_data()));
+ sul.unlock();
+
+ m->put();
+}
+
+Objecter::LingerOp::LingerOp(Objecter *o, uint64_t linger_id)
+ : objecter(o),
+ linger_id(linger_id),
+ watch_lock(ceph::make_shared_mutex(
+ fmt::format("LingerOp::watch_lock #{}", linger_id)))
+{}
+
+void Objecter::submit_command(CommandOp *c, ceph_tid_t *ptid)
+{
+ shunique_lock sul(rwlock, ceph::acquire_unique);
+
+ ceph_tid_t tid = ++last_tid;
+ ldout(cct, 10) << "_submit_command " << tid << " " << c->cmd << dendl;
+ c->tid = tid;
+
+ {
+ unique_lock hs_wl(homeless_session->lock);
+ _session_command_op_assign(homeless_session, c);
+ }
+
+ _calc_command_target(c, sul);
+ _assign_command_session(c, sul);
+ if (osd_timeout > timespan(0)) {
+ c->ontimeout = timer.add_event(osd_timeout,
+ [this, c, tid]() {
+ command_op_cancel(
+ c->session, tid,
+ osdc_errc::timed_out); });
+ }
+
+ if (!c->session->is_homeless()) {
+ _send_command(c);
+ } else {
+ _maybe_request_map();
+ }
+ if (c->map_check_error)
+ _send_command_map_check(c);
+ if (ptid)
+ *ptid = tid;
+
+ logger->inc(l_osdc_command_active);
+}
+
+int Objecter::_calc_command_target(CommandOp *c,
+ shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);
+
+ c->map_check_error = 0;
+
+ // ignore overlays, just like we do with pg ops
+ c->target.flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
+
+ if (c->target_osd >= 0) {
+ if (!osdmap->exists(c->target_osd)) {
+ c->map_check_error = -ENOENT;
+ c->map_check_error_str = "osd dne";
+ c->target.osd = -1;
+ return RECALC_OP_TARGET_OSD_DNE;
+ }
+ if (osdmap->is_down(c->target_osd)) {
+ c->map_check_error = -ENXIO;
+ c->map_check_error_str = "osd down";
+ c->target.osd = -1;
+ return RECALC_OP_TARGET_OSD_DOWN;
+ }
+ c->target.osd = c->target_osd;
+ } else {
+ int ret = _calc_target(&(c->target), nullptr, true);
+ if (ret == RECALC_OP_TARGET_POOL_DNE) {
+ c->map_check_error = -ENOENT;
+ c->map_check_error_str = "pool dne";
+ c->target.osd = -1;
+ return ret;
+ } else if (ret == RECALC_OP_TARGET_OSD_DOWN) {
+ c->map_check_error = -ENXIO;
+ c->map_check_error_str = "osd down";
+ c->target.osd = -1;
+ return ret;
+ }
+ }
+
+ OSDSession *s;
+ int r = _get_session(c->target.osd, &s, sul);
+ ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */
+
+ if (c->session != s) {
+ put_session(s);
+ return RECALC_OP_TARGET_NEED_RESEND;
+ }
+
+ put_session(s);
+
+ ldout(cct, 20) << "_recalc_command_target " << c->tid << " no change, "
+ << c->session << dendl;
+
+ return RECALC_OP_TARGET_NO_ACTION;
+}
+
+void Objecter::_assign_command_session(CommandOp *c,
+ shunique_lock<ceph::shared_mutex>& sul)
+{
+ ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);
+
+ OSDSession *s;
+ int r = _get_session(c->target.osd, &s, sul);
+ ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */
+
+ if (c->session != s) {
+ if (c->session) {
+ OSDSession *cs = c->session;
+ unique_lock csl(cs->lock);
+ _session_command_op_remove(c->session, c);
+ csl.unlock();
+ }
+ unique_lock sl(s->lock);
+ _session_command_op_assign(s, c);
+ }
+
+ put_session(s);
+}
+
+void Objecter::_send_command(CommandOp *c)
+{
+ ldout(cct, 10) << "_send_command " << c->tid << dendl;
+ ceph_assert(c->session);
+ ceph_assert(c->session->con);
+ auto m = new MCommand(monc->monmap.fsid);
+ m->cmd = c->cmd;
+ m->set_data(c->inbl);
+ m->set_tid(c->tid);
+ c->session->con->send_message(m);
+ logger->inc(l_osdc_command_send);
+}
+
+int Objecter::command_op_cancel(OSDSession *s, ceph_tid_t tid,
+ bs::error_code ec)
+{
+ ceph_assert(initialized);
+
+ unique_lock wl(rwlock);
+
+ auto it = s->command_ops.find(tid);
+ if (it == s->command_ops.end()) {
+ ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
+ return -ENOENT;
+ }
+
+ ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+
+ CommandOp *op = it->second;
+ _command_cancel_map_check(op);
+ unique_lock sl(op->session->lock);
+ _finish_command(op, ec, {}, {});
+ sl.unlock();
+ return 0;
+}
+
+void Objecter::_finish_command(CommandOp *c, bs::error_code ec,
+ string&& rs, cb::list&& bl)
+{
+ // rwlock is locked unique
+ // session lock is locked
+
+ ldout(cct, 10) << "_finish_command " << c->tid << " = " << ec << " "
+ << rs << dendl;
+
+ if (c->onfinish)
+ c->onfinish->defer(std::move(c->onfinish), ec, std::move(rs), std::move(bl));
+
+ if (c->ontimeout && ec != bs::errc::timed_out)
+ timer.cancel_event(c->ontimeout);
+
+ _session_command_op_remove(c->session, c);
+
+ c->put();
+
+ logger->dec(l_osdc_command_active);
+}
+
+Objecter::OSDSession::~OSDSession()
+{
+ // Caller is responsible for re-assigning or
+ // destroying any ops that were assigned to us
+ ceph_assert(ops.empty());
+ ceph_assert(linger_ops.empty());
+ ceph_assert(command_ops.empty());
+}
+
+Objecter::Objecter(CephContext *cct,
+ Messenger *m, MonClient *mc,
+ boost::asio::io_context& service) :
+ Dispatcher(cct), messenger(m), monc(mc), service(service)
+{
+ mon_timeout = cct->_conf.get_val<std::chrono::seconds>("rados_mon_op_timeout");
+ osd_timeout = cct->_conf.get_val<std::chrono::seconds>("rados_osd_op_timeout");
+}
+
+Objecter::~Objecter()
+{
+ ceph_assert(homeless_session->get_nref() == 1);
+ ceph_assert(num_homeless_ops == 0);
+ homeless_session->put();
+
+ ceph_assert(osd_sessions.empty());
+ ceph_assert(poolstat_ops.empty());
+ ceph_assert(statfs_ops.empty());
+ ceph_assert(pool_ops.empty());
+ ceph_assert(waiting_for_map.empty());
+ ceph_assert(linger_ops.empty());
+ ceph_assert(check_latest_map_lingers.empty());
+ ceph_assert(check_latest_map_ops.empty());
+ ceph_assert(check_latest_map_commands.empty());
+
+ ceph_assert(!m_request_state_hook);
+ ceph_assert(!logger);
+}
+
+/**
+ * Wait until this OSD map epoch is received before
+ * sending any more operations to OSDs. Use this
+ * when it is known that the client can't trust
+ * anything from before this epoch (e.g. due to
+ * client blocklist at this epoch).
+ */
+void Objecter::set_epoch_barrier(epoch_t epoch)
+{
+ unique_lock wl(rwlock);
+
+ ldout(cct, 7) << __func__ << ": barrier " << epoch << " (was "
+ << epoch_barrier << ") current epoch " << osdmap->get_epoch()
+ << dendl;
+ if (epoch > epoch_barrier) {
+ epoch_barrier = epoch;
+ _maybe_request_map();
+ }
+}
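+
+// Minimal usage sketch (illustrative; `objecter` and `blocklist_epoch` are
+// hypothetical caller-side names): a client that learns it was blocklisted
+// at a given epoch bars everything older before resuming I/O:
+//
+//   objecter->set_epoch_barrier(blocklist_epoch);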
+
+
+
+hobject_t Objecter::enumerate_objects_begin()
+{
+ return hobject_t();
+}
+
+hobject_t Objecter::enumerate_objects_end()
+{
+ return hobject_t::get_max();
+}
+
+template<typename T>
+struct EnumerationContext {
+ Objecter* objecter;
+ const hobject_t end;
+ const cb::list filter;
+ uint32_t max;
+ const object_locator_t oloc;
+ std::vector<T> ls;
+private:
+ fu2::unique_function<void(bs::error_code,
+ std::vector<T>,
+ hobject_t) &&> on_finish;
+public:
+ epoch_t epoch = 0;
+ int budget = -1;
+
+ EnumerationContext(Objecter* objecter,
+ hobject_t end, cb::list filter,
+ uint32_t max, object_locator_t oloc,
+ decltype(on_finish) on_finish)
+ : objecter(objecter), end(std::move(end)), filter(std::move(filter)),
+ max(max), oloc(std::move(oloc)), on_finish(std::move(on_finish)) {}
+
+ void operator()(bs::error_code ec,
+ std::vector<T> v,
+ hobject_t h) && {
+ if (budget >= 0) {
+ objecter->put_op_budget_bytes(budget);
+ budget = -1;
+ }
+
+ std::move(on_finish)(ec, std::move(v), std::move(h));
+ }
+};
+
+template<typename T>
+struct CB_EnumerateReply {
+ cb::list bl;
+
+ Objecter* objecter;
+ std::unique_ptr<EnumerationContext<T>> ctx;
+
+ CB_EnumerateReply(Objecter* objecter,
+ std::unique_ptr<EnumerationContext<T>>&& ctx) :
+ objecter(objecter), ctx(std::move(ctx)) {}
+
+ void operator()(bs::error_code ec) {
+ objecter->_enumerate_reply(std::move(bl), ec, std::move(ctx));
+ }
+};
+
+template<typename T>
+void Objecter::enumerate_objects(
+ int64_t pool_id,
+ std::string_view ns,
+ hobject_t start,
+ hobject_t end,
+ const uint32_t max,
+ const cb::list& filter_bl,
+ fu2::unique_function<void(bs::error_code,
+ std::vector<T>,
+ hobject_t) &&> on_finish) {
+ if (!end.is_max() && start > end) {
+ lderr(cct) << __func__ << ": start " << start << " > end " << end << dendl;
+ std::move(on_finish)(osdc_errc::precondition_violated, {}, {});
+ return;
+ }
+
+ if (max < 1) {
+ lderr(cct) << __func__ << ": result size may not be zero" << dendl;
+ std::move(on_finish)(osdc_errc::precondition_violated, {}, {});
+ return;
+ }
+
+ if (start.is_max()) {
+ std::move(on_finish)({}, {}, {});
+ return;
+ }
+
+ shared_lock rl(rwlock);
+ ceph_assert(osdmap->get_epoch());
+ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
+ rl.unlock();
+ lderr(cct) << __func__ << ": SORTBITWISE cluster flag not set" << dendl;
+ std::move(on_finish)(osdc_errc::not_supported, {}, {});
+ return;
+ }
+ const pg_pool_t* p = osdmap->get_pg_pool(pool_id);
+ if (!p) {
+ lderr(cct) << __func__ << ": pool " << pool_id << " DNE in osd epoch "
+ << osdmap->get_epoch() << dendl;
+ rl.unlock();
+ std::move(on_finish)(osdc_errc::pool_dne, {}, {});
+ return;
+ } else {
+ rl.unlock();
+ }
+
+ _issue_enumerate(start,
+ std::make_unique<EnumerationContext<T>>(
+ this, std::move(end), filter_bl,
+ max, object_locator_t{pool_id, ns},
+ std::move(on_finish)));
+}
+
+template
+void Objecter::enumerate_objects<librados::ListObjectImpl>(
+ int64_t pool_id,
+ std::string_view ns,
+ hobject_t start,
+ hobject_t end,
+ const uint32_t max,
+ const cb::list& filter_bl,
+ fu2::unique_function<void(bs::error_code,
+ std::vector<librados::ListObjectImpl>,
+ hobject_t) &&> on_finish);
+
+template
+void Objecter::enumerate_objects<neorados::Entry>(
+ int64_t pool_id,
+ std::string_view ns,
+ hobject_t start,
+ hobject_t end,
+ const uint32_t max,
+ const cb::list& filter_bl,
+ fu2::unique_function<void(bs::error_code,
+ std::vector<neorados::Entry>,
+ hobject_t) &&> on_finish);
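+
+// Illustrative call sketch (not compiled here; `objecter` and `pool_id` are
+// hypothetical caller-side names): list a whole pool in the default
+// namespace, fetching up to 1000 entries per round trip. When the returned
+// `next` equals the supplied end bound the listing is complete; otherwise
+// the caller re-issues the call starting at `next`.
+//
+//   objecter->enumerate_objects<librados::ListObjectImpl>(
+//     pool_id, "",
+//     objecter->enumerate_objects_begin(),
+//     objecter->enumerate_objects_end(),
+//     1000, {},
+//     [](bs::error_code ec, std::vector<librados::ListObjectImpl> ls,
+//        hobject_t next) {
+//       // consume `ls`; next == hobject_t::get_max() means the pool is done
+//     });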
+
+
+
+template<typename T>
+void Objecter::_issue_enumerate(hobject_t start,
+ std::unique_ptr<EnumerationContext<T>> ctx) {
+ ObjectOperation op;
+ auto c = ctx.get();
+ op.pg_nls(c->max, c->filter, start, osdmap->get_epoch());
+ auto on_ack = std::make_unique<CB_EnumerateReply<T>>(this, std::move(ctx));
+ // I hate having to do this. Try to find a cleaner way
+ // later.
+ auto epoch = &c->epoch;
+ auto budget = &c->budget;
+ auto pbl = &on_ack->bl;
+
+ // Issue. See you later in _enumerate_reply
+ pg_read(start.get_hash(),
+ c->oloc, op, pbl, 0,
+ Op::OpComp::create(service.get_executor(),
+ [c = std::move(on_ack)]
+ (bs::error_code ec) mutable {
+ (*c)(ec);
+ }), epoch, budget);
+}
+
+template
+void Objecter::_issue_enumerate<librados::ListObjectImpl>(
+ hobject_t start,
+ std::unique_ptr<EnumerationContext<librados::ListObjectImpl>> ctx);
+template
+void Objecter::_issue_enumerate<neorados::Entry>(
+ hobject_t start, std::unique_ptr<EnumerationContext<neorados::Entry>> ctx);
+
+template<typename T>
+void Objecter::_enumerate_reply(
+ cb::list&& bl,
+ bs::error_code ec,
+ std::unique_ptr<EnumerationContext<T>>&& ctx)
+{
+ if (ec) {
+ std::move(*ctx)(ec, {}, {});
+ return;
+ }
+
+ // Decode the results
+ auto iter = bl.cbegin();
+ pg_nls_response_template<T> response;
+
+ try {
+ response.decode(iter);
+ if (!iter.end()) {
+ // extra_info isn't used anywhere. We do this solely to preserve
+ // backward compatibility
+ cb::list legacy_extra_info;
+ decode(legacy_extra_info, iter);
+ }
+ } catch (const bs::system_error& e) {
+ std::move(*ctx)(e.code(), {}, {});
+ return;
+ }
+
+ shared_lock rl(rwlock);
+ auto pool = osdmap->get_pg_pool(ctx->oloc.get_pool());
+ rl.unlock();
+ if (!pool) {
+ // pool is gone, drop any results which are now meaningless.
+ std::move(*ctx)(osdc_errc::pool_dne, {}, {});
+ return;
+ }
+
+ hobject_t next;
+ if ((response.handle <= ctx->end)) {
+ next = response.handle;
+ } else {
+ next = ctx->end;
+
+ // drop anything after 'end'
+ while (!response.entries.empty()) {
+ uint32_t hash = response.entries.back().locator.empty() ?
+ pool->hash_key(response.entries.back().oid,
+ response.entries.back().nspace) :
+ pool->hash_key(response.entries.back().locator,
+ response.entries.back().nspace);
+ hobject_t last(response.entries.back().oid,
+ response.entries.back().locator,
+ CEPH_NOSNAP,
+ hash,
+ ctx->oloc.get_pool(),
+ response.entries.back().nspace);
+ if (last < ctx->end)
+ break;
+ response.entries.pop_back();
+ }
+ }
+
+ if (response.entries.size() <= ctx->max) {
+ ctx->max -= response.entries.size();
+ std::move(response.entries.begin(), response.entries.end(),
+ std::back_inserter(ctx->ls));
+ } else {
+ auto i = response.entries.begin();
+ while (ctx->max > 0) {
+ ctx->ls.push_back(std::move(*i));
+ --(ctx->max);
+ ++i;
+ }
+ uint32_t hash =
+ i->locator.empty() ?
+ pool->hash_key(i->oid, i->nspace) :
+ pool->hash_key(i->locator, i->nspace);
+
+ next = hobject_t{i->oid, i->locator,
+ CEPH_NOSNAP,
+ hash,
+ ctx->oloc.get_pool(),
+ i->nspace};
+ }
+
+ if (next == ctx->end || ctx->max == 0) {
+ std::move(*ctx)(ec, std::move(ctx->ls), std::move(next));
+ } else {
+ _issue_enumerate(next, std::move(ctx));
+ }
+}
+
+template
+void Objecter::_enumerate_reply<librados::ListObjectImpl>(
+ cb::list&& bl,
+ bs::error_code ec,
+ std::unique_ptr<EnumerationContext<librados::ListObjectImpl>>&& ctx);
+
+template
+void Objecter::_enumerate_reply<neorados::Entry>(
+ cb::list&& bl,
+ bs::error_code ec,
+ std::unique_ptr<EnumerationContext<neorados::Entry>>&& ctx);
+
+namespace {
+ using namespace librados;
+
+ template <typename T>
+ void do_decode(std::vector<T>& items, std::vector<cb::list>& bls)
+ {
+ for (auto bl : bls) {
+ auto p = bl.cbegin();
+ T t;
+ decode(t, p);
+ items.push_back(t);
+ }
+ }
+
+ struct C_ObjectOperation_scrub_ls : public Context {
+ cb::list bl;
+ uint32_t* interval;
+ std::vector<inconsistent_obj_t> *objects = nullptr;
+ std::vector<inconsistent_snapset_t> *snapsets = nullptr;
+ int* rval;
+
+ C_ObjectOperation_scrub_ls(uint32_t* interval,
+ std::vector<inconsistent_obj_t>* objects,
+ int* rval)
+ : interval(interval), objects(objects), rval(rval) {}
+ C_ObjectOperation_scrub_ls(uint32_t* interval,
+ std::vector<inconsistent_snapset_t>* snapsets,
+ int* rval)
+ : interval(interval), snapsets(snapsets), rval(rval) {}
+ void finish(int r) override {
+ if (r < 0 && r != -EAGAIN) {
+ if (rval)
+ *rval = r;
+ return;
+ }
+
+ if (rval)
+ *rval = 0;
+
+ try {
+ decode();
+ } catch (cb::error&) {
+ if (rval)
+ *rval = -EIO;
+ }
+ }
+ private:
+ void decode() {
+ scrub_ls_result_t result;
+ auto p = bl.cbegin();
+ result.decode(p);
+ *interval = result.interval;
+ if (objects) {
+ do_decode(*objects, result.vals);
+ } else {
+ do_decode(*snapsets, result.vals);
+ }
+ }
+ };
+
+ template <typename T>
+ void do_scrub_ls(::ObjectOperation* op,
+ const scrub_ls_arg_t& arg,
+ std::vector<T> *items,
+ uint32_t* interval,
+ int* rval)
+ {
+ OSDOp& osd_op = op->add_op(CEPH_OSD_OP_SCRUBLS);
+ op->flags |= CEPH_OSD_FLAG_PGOP;
+ ceph_assert(interval);
+ arg.encode(osd_op.indata);
+ unsigned p = op->ops.size() - 1;
+ auto h = new C_ObjectOperation_scrub_ls{interval, items, rval};
+ op->set_handler(h);
+ op->out_bl[p] = &h->bl;
+ op->out_rval[p] = rval;
+ }
+}
+
+void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after,
+ uint64_t max_to_get,
+ std::vector<librados::inconsistent_obj_t>* objects,
+ uint32_t* interval,
+ int* rval)
+{
+ scrub_ls_arg_t arg = {*interval, 0, start_after, max_to_get};
+ do_scrub_ls(this, arg, objects, interval, rval);
+}
+
+void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after,
+ uint64_t max_to_get,
+ std::vector<librados::inconsistent_snapset_t> *snapsets,
+ uint32_t *interval,
+ int *rval)
+{
+ scrub_ls_arg_t arg = {*interval, 1, start_after, max_to_get};
+ do_scrub_ls(this, arg, snapsets, interval, rval);
+}
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
new file mode 100644
index 000000000..163a3359d
--- /dev/null
+++ b/src/osdc/Objecter.h
@@ -0,0 +1,3910 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJECTER_H
+#define CEPH_OBJECTER_H
+
+#include <condition_variable>
+#include <list>
+#include <map>
+#include <mutex>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <variant>
+
+#include <boost/container/small_vector.hpp>
+#include <boost/asio.hpp>
+
+#include <fmt/format.h>
+
+#include "include/buffer.h"
+#include "include/ceph_assert.h"
+#include "include/ceph_fs.h"
+#include "include/common_fwd.h"
+#include "include/expected.hpp"
+#include "include/types.h"
+#include "include/rados/rados_types.hpp"
+#include "include/function2.hpp"
+#include "include/neorados/RADOS_Decodable.hpp"
+
+#include "common/admin_socket.h"
+#include "common/async/completion.h"
+#include "common/ceph_time.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_timer.h"
+#include "common/config_obs.h"
+#include "common/shunique_lock.h"
+#include "common/zipkin_trace.h"
+#include "common/Throttle.h"
+
+#include "mon/MonClient.h"
+
+#include "messages/MOSDOp.h"
+#include "msg/Dispatcher.h"
+
+#include "osd/OSDMap.h"
+
+class Context;
+class Messenger;
+class MonClient;
+class Message;
+
+class MPoolOpReply;
+
+class MGetPoolStatsReply;
+class MStatfsReply;
+class MCommandReply;
+class MWatchNotify;
+template<typename T>
+struct EnumerationContext;
+template<typename t>
+struct CB_EnumerateReply;
+
+inline constexpr std::size_t osdc_opvec_len = 2;
+using osdc_opvec = boost::container::small_vector<OSDOp, osdc_opvec_len>;
+
+// -----------------------------------------
+
+struct ObjectOperation {
+ osdc_opvec ops;
+ int flags = 0;
+ int priority = 0;
+
+ boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
+ boost::container::small_vector<
+ fu2::unique_function<void(boost::system::error_code, int,
+ const ceph::buffer::list& bl) &&>,
+ osdc_opvec_len> out_handler;
+ boost::container::small_vector<int*, osdc_opvec_len> out_rval;
+ boost::container::small_vector<boost::system::error_code*,
+ osdc_opvec_len> out_ec;
+
+ ObjectOperation() = default;
+ ObjectOperation(const ObjectOperation&) = delete;
+ ObjectOperation& operator =(const ObjectOperation&) = delete;
+ ObjectOperation(ObjectOperation&&) = default;
+ ObjectOperation& operator =(ObjectOperation&&) = default;
+ ~ObjectOperation() = default;
+
+ size_t size() const {
+ return ops.size();
+ }
+
+ void clear() {
+ ops.clear();
+ flags = 0;
+ priority = 0;
+ out_bl.clear();
+ out_handler.clear();
+ out_rval.clear();
+ out_ec.clear();
+ }
+
+ void set_last_op_flags(int flags) {
+ ceph_assert(!ops.empty());
+ ops.rbegin()->op.flags = flags;
+ }
+
+
+ void set_handler(fu2::unique_function<void(boost::system::error_code, int,
+ const ceph::buffer::list&) &&> f) {
+ if (f) {
+ if (out_handler.back()) {
+ // This happens seldom enough that we may as well keep folding
+ // functions together when we get another one rather than
+ // using a container.
+ out_handler.back() =
+ [f = std::move(f),
+ g = std::move(out_handler.back())]
+ (boost::system::error_code ec, int r,
+ const ceph::buffer::list& bl) mutable {
+ std::move(g)(ec, r, bl);
+ std::move(f)(ec, r, bl);
+ };
+ } else {
+ out_handler.back() = std::move(f);
+ }
+ }
+ ceph_assert(ops.size() == out_handler.size());
+ }
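+
+ // Note: a consequence of the folding above is that when set_handler() is
+ // invoked more than once for the same op, the previously registered
+ // handler (g) runs before the newly added one (f), i.e. handlers fire in
+ // registration order.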
+
+ void set_handler(Context *c) {
+ if (c)
+ set_handler([c = std::unique_ptr<Context>(c)](boost::system::error_code,
+ int r,
+ const ceph::buffer::list&) mutable {
+ c.release()->complete(r);
+ });
+
+ }
+
+ OSDOp& add_op(int op) {
+ ops.emplace_back();
+ ops.back().op.op = op;
+ out_bl.push_back(nullptr);
+ ceph_assert(ops.size() == out_bl.size());
+ out_handler.emplace_back();
+ ceph_assert(ops.size() == out_handler.size());
+ out_rval.push_back(nullptr);
+ ceph_assert(ops.size() == out_rval.size());
+ out_ec.push_back(nullptr);
+ ceph_assert(ops.size() == out_ec.size());
+ return ops.back();
+ }
+ void add_data(int op, uint64_t off, uint64_t len, ceph::buffer::list& bl) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.extent.offset = off;
+ osd_op.op.extent.length = len;
+ osd_op.indata.claim_append(bl);
+ }
+ void add_writesame(int op, uint64_t off, uint64_t write_len,
+ ceph::buffer::list& bl) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.writesame.offset = off;
+ osd_op.op.writesame.length = write_len;
+ osd_op.op.writesame.data_length = bl.length();
+ osd_op.indata.claim_append(bl);
+ }
+ void add_xattr(int op, const char *name, const ceph::buffer::list& data) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.xattr.name_len = (name ? strlen(name) : 0);
+ osd_op.op.xattr.value_len = data.length();
+ if (name)
+ osd_op.indata.append(name, osd_op.op.xattr.name_len);
+ osd_op.indata.append(data);
+ }
+ void add_xattr_cmp(int op, const char *name, uint8_t cmp_op,
+ uint8_t cmp_mode, const ceph::buffer::list& data) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.xattr.name_len = (name ? strlen(name) : 0);
+ osd_op.op.xattr.value_len = data.length();
+ osd_op.op.xattr.cmp_op = cmp_op;
+ osd_op.op.xattr.cmp_mode = cmp_mode;
+ if (name)
+ osd_op.indata.append(name, osd_op.op.xattr.name_len);
+ osd_op.indata.append(data);
+ }
+ void add_xattr(int op, std::string_view name, const ceph::buffer::list& data) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.xattr.name_len = name.size();
+ osd_op.op.xattr.value_len = data.length();
+ osd_op.indata.append(name.data(), osd_op.op.xattr.name_len);
+ osd_op.indata.append(data);
+ }
+ void add_xattr_cmp(int op, std::string_view name, uint8_t cmp_op,
+ uint8_t cmp_mode, const ceph::buffer::list& data) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.xattr.name_len = name.size();
+ osd_op.op.xattr.value_len = data.length();
+ osd_op.op.xattr.cmp_op = cmp_op;
+ osd_op.op.xattr.cmp_mode = cmp_mode;
+ if (!name.empty())
+ osd_op.indata.append(name.data(), osd_op.op.xattr.name_len);
+ osd_op.indata.append(data);
+ }
+ void add_call(int op, std::string_view cname, std::string_view method,
+ const ceph::buffer::list &indata,
+ ceph::buffer::list *outbl, Context *ctx, int *prval) {
+ OSDOp& osd_op = add_op(op);
+
+ unsigned p = ops.size() - 1;
+ set_handler(ctx);
+ out_bl[p] = outbl;
+ out_rval[p] = prval;
+
+ osd_op.op.cls.class_len = cname.size();
+ osd_op.op.cls.method_len = method.size();
+ osd_op.op.cls.indata_len = indata.length();
+ osd_op.indata.append(cname.data(), osd_op.op.cls.class_len);
+ osd_op.indata.append(method.data(), osd_op.op.cls.method_len);
+ osd_op.indata.append(indata);
+ }
+ void add_call(int op, std::string_view cname, std::string_view method,
+ const ceph::buffer::list &indata,
+ fu2::unique_function<void(boost::system::error_code,
+ const ceph::buffer::list&) &&> f) {
+ OSDOp& osd_op = add_op(op);
+
+ set_handler([f = std::move(f)](boost::system::error_code ec,
+ int,
+ const ceph::buffer::list& bl) mutable {
+ std::move(f)(ec, bl);
+ });
+
+ osd_op.op.cls.class_len = cname.size();
+ osd_op.op.cls.method_len = method.size();
+ osd_op.op.cls.indata_len = indata.length();
+ osd_op.indata.append(cname.data(), osd_op.op.cls.class_len);
+ osd_op.indata.append(method.data(), osd_op.op.cls.method_len);
+ osd_op.indata.append(indata);
+ }
+ void add_call(int op, std::string_view cname, std::string_view method,
+ const ceph::buffer::list &indata,
+ fu2::unique_function<void(boost::system::error_code, int,
+ const ceph::buffer::list&) &&> f) {
+ OSDOp& osd_op = add_op(op);
+
+ set_handler([f = std::move(f)](boost::system::error_code ec,
+ int r,
+ const ceph::buffer::list& bl) mutable {
+ std::move(f)(ec, r, bl);
+ });
+
+ osd_op.op.cls.class_len = cname.size();
+ osd_op.op.cls.method_len = method.size();
+ osd_op.op.cls.indata_len = indata.length();
+ osd_op.indata.append(cname.data(), osd_op.op.cls.class_len);
+ osd_op.indata.append(method.data(), osd_op.op.cls.method_len);
+ osd_op.indata.append(indata);
+ }
+ void add_pgls(int op, uint64_t count, collection_list_handle_t cookie,
+ epoch_t start_epoch) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.pgls.count = count;
+ osd_op.op.pgls.start_epoch = start_epoch;
+ encode(cookie, osd_op.indata);
+ }
+ void add_pgls_filter(int op, uint64_t count, const ceph::buffer::list& filter,
+ collection_list_handle_t cookie, epoch_t start_epoch) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.pgls.count = count;
+ osd_op.op.pgls.start_epoch = start_epoch;
+ std::string cname = "pg";
+ std::string mname = "filter";
+ encode(cname, osd_op.indata);
+ encode(mname, osd_op.indata);
+ osd_op.indata.append(filter);
+ encode(cookie, osd_op.indata);
+ }
+ void add_alloc_hint(int op, uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags) {
+ OSDOp& osd_op = add_op(op);
+ osd_op.op.alloc_hint.expected_object_size = expected_object_size;
+ osd_op.op.alloc_hint.expected_write_size = expected_write_size;
+ osd_op.op.alloc_hint.flags = flags;
+ }
+
+ // ------
+
+ // pg
+ void pg_ls(uint64_t count, ceph::buffer::list& filter,
+ collection_list_handle_t cookie, epoch_t start_epoch) {
+ if (filter.length() == 0)
+ add_pgls(CEPH_OSD_OP_PGLS, count, cookie, start_epoch);
+ else
+ add_pgls_filter(CEPH_OSD_OP_PGLS_FILTER, count, filter, cookie,
+ start_epoch);
+ flags |= CEPH_OSD_FLAG_PGOP;
+ }
+
+ void pg_nls(uint64_t count, const ceph::buffer::list& filter,
+ collection_list_handle_t cookie, epoch_t start_epoch) {
+ if (filter.length() == 0)
+ add_pgls(CEPH_OSD_OP_PGNLS, count, cookie, start_epoch);
+ else
+ add_pgls_filter(CEPH_OSD_OP_PGNLS_FILTER, count, filter, cookie,
+ start_epoch);
+ flags |= CEPH_OSD_FLAG_PGOP;
+ }
+
+ void scrub_ls(const librados::object_id_t& start_after,
+ uint64_t max_to_get,
+ std::vector<librados::inconsistent_obj_t> *objects,
+ uint32_t *interval,
+ int *rval);
+ void scrub_ls(const librados::object_id_t& start_after,
+ uint64_t max_to_get,
+ std::vector<librados::inconsistent_snapset_t> *objects,
+ uint32_t *interval,
+ int *rval);
+
+ void create(bool excl) {
+ OSDOp& o = add_op(CEPH_OSD_OP_CREATE);
+ o.op.flags = (excl ? CEPH_OSD_OP_FLAG_EXCL : 0);
+ }
+
+ struct CB_ObjectOperation_stat {
+ ceph::buffer::list bl;
+ uint64_t *psize;
+ ceph::real_time *pmtime;
+ time_t *ptime;
+ struct timespec *pts;
+ int *prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_stat(uint64_t *ps, ceph::real_time *pm, time_t *pt, struct timespec *_pts,
+ int *prval, boost::system::error_code* pec)
+ : psize(ps), pmtime(pm), ptime(pt), pts(_pts), prval(prval), pec(pec) {}
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list& bl) {
+ using ceph::decode;
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ try {
+ uint64_t size;
+ ceph::real_time mtime;
+ decode(size, p);
+ decode(mtime, p);
+ if (psize)
+ *psize = size;
+ if (pmtime)
+ *pmtime = mtime;
+ if (ptime)
+ *ptime = ceph::real_clock::to_time_t(mtime);
+ if (pts)
+ *pts = ceph::real_clock::to_timespec(mtime);
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+ void stat(uint64_t *psize, ceph::real_time *pmtime, int *prval) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, pmtime, nullptr, nullptr, prval,
+ nullptr));
+ out_rval.back() = prval;
+ }
+ void stat(uint64_t *psize, ceph::real_time *pmtime,
+ boost::system::error_code* ec) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, pmtime, nullptr, nullptr,
+ nullptr, ec));
+ out_ec.back() = ec;
+ }
+ void stat(uint64_t *psize, time_t *ptime, int *prval) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, nullptr, ptime, nullptr, prval,
+ nullptr));
+ out_rval.back() = prval;
+ }
+ void stat(uint64_t *psize, struct timespec *pts, int *prval) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, nullptr, nullptr, pts, prval, nullptr));
+ out_rval.back() = prval;
+ }
+ void stat(uint64_t *psize, ceph::real_time *pmtime, nullptr_t) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, pmtime, nullptr, nullptr, nullptr,
+ nullptr));
+ }
+ void stat(uint64_t *psize, time_t *ptime, nullptr_t) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, nullptr, ptime, nullptr, nullptr,
+ nullptr));
+ }
+ void stat(uint64_t *psize, struct timespec *pts, nullptr_t) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, nullptr, nullptr, pts, nullptr,
+ nullptr));
+ }
+ void stat(uint64_t *psize, nullptr_t, nullptr_t) {
+ add_op(CEPH_OSD_OP_STAT);
+ set_handler(CB_ObjectOperation_stat(psize, nullptr, nullptr, nullptr,
+ nullptr, nullptr));
+ }
+
+ // object cmpext
+ struct CB_ObjectOperation_cmpext {
+ int* prval = nullptr;
+ boost::system::error_code* ec = nullptr;
+ std::size_t* s = nullptr;
+ explicit CB_ObjectOperation_cmpext(int *prval)
+ : prval(prval) {}
+ CB_ObjectOperation_cmpext(boost::system::error_code* ec, std::size_t* s)
+ : ec(ec), s(s) {}
+
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list&) {
+ if (prval)
+ *prval = r;
+ if (this->ec)
+ *this->ec = ec;
+ if (s)
+ *s = static_cast<std::size_t>(-(MAX_ERRNO - r));
+ }
+ };
+
+ void cmpext(uint64_t off, ceph::buffer::list& cmp_bl, int *prval) {
+ add_data(CEPH_OSD_OP_CMPEXT, off, cmp_bl.length(), cmp_bl);
+ set_handler(CB_ObjectOperation_cmpext(prval));
+ out_rval.back() = prval;
+ }
+
+ void cmpext(uint64_t off, ceph::buffer::list&& cmp_bl, boost::system::error_code* ec,
+ std::size_t* s) {
+ add_data(CEPH_OSD_OP_CMPEXT, off, cmp_bl.length(), cmp_bl);
+ set_handler(CB_ObjectOperation_cmpext(ec, s));
+ out_ec.back() = ec;
+ }
+
+ // Used by C API
+ void cmpext(uint64_t off, uint64_t cmp_len, const char *cmp_buf, int *prval) {
+ ceph::buffer::list cmp_bl;
+ cmp_bl.append(cmp_buf, cmp_len);
+ add_data(CEPH_OSD_OP_CMPEXT, off, cmp_len, cmp_bl);
+ set_handler(CB_ObjectOperation_cmpext(prval));
+ out_rval.back() = prval;
+ }
+
+ void read(uint64_t off, uint64_t len, ceph::buffer::list *pbl, int *prval,
+ Context* ctx) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_READ, off, len, bl);
+ unsigned p = ops.size() - 1;
+ out_bl[p] = pbl;
+ out_rval[p] = prval;
+ set_handler(ctx);
+ }
+
+ void read(uint64_t off, uint64_t len, boost::system::error_code* ec,
+ ceph::buffer::list* pbl) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_READ, off, len, bl);
+ out_ec.back() = ec;
+ out_bl.back() = pbl;
+ }
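+
+ // Illustrative sketch (comment only, not part of this header): composing a
+ // compound operation that stats an object and reads its first 4 KiB. The
+ // finished op is handed to the Objecter for submission; the submission
+ // call itself lives outside this excerpt.
+ //
+ //   ObjectOperation op;
+ //   uint64_t size = 0;
+ //   boost::system::error_code stat_ec, read_ec;
+ //   ceph::buffer::list data;
+ //   op.stat(&size, nullptr, &stat_ec);
+ //   op.read(0, 4096, &read_ec, &data);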
+
+ template<typename Ex>
+ struct CB_ObjectOperation_sparse_read {
+ ceph::buffer::list* data_bl;
+ Ex* extents;
+ int* prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_sparse_read(ceph::buffer::list* data_bl,
+ Ex* extents,
+ int* prval,
+ boost::system::error_code* pec)
+ : data_bl(data_bl), extents(extents), prval(prval), pec(pec) {}
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list& bl) {
+ auto iter = bl.cbegin();
+ if (r >= 0) {
+ // NOTE: it's possible the sub-op has not been executed but the result
+ // code remains zeroed. Avoid the costly exception handling on a
+ // potential IO path.
+ if (bl.length() > 0) {
+ try {
+ decode(*extents, iter);
+ decode(*data_bl, iter);
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ } else if (prval) {
+ *prval = -EIO;
+ if (pec)
+ *pec = buffer::errc::end_of_buffer;
+ }
+ }
+ }
+ };
+ void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t, uint64_t>* m,
+ ceph::buffer::list* data_bl, int* prval) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl);
+ set_handler(CB_ObjectOperation_sparse_read(data_bl, m, prval, nullptr));
+ out_rval.back() = prval;
+ }
+ void sparse_read(uint64_t off, uint64_t len,
+ boost::system::error_code* ec,
+ std::vector<std::pair<uint64_t, uint64_t>>* m,
+ ceph::buffer::list* data_bl) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl);
+ set_handler(CB_ObjectOperation_sparse_read(data_bl, m, nullptr, ec));
+ out_ec.back() = ec;
+ }
+ void write(uint64_t off, ceph::buffer::list& bl,
+ uint64_t truncate_size,
+ uint32_t truncate_seq) {
+ add_data(CEPH_OSD_OP_WRITE, off, bl.length(), bl);
+ OSDOp& o = *ops.rbegin();
+ o.op.extent.truncate_size = truncate_size;
+ o.op.extent.truncate_seq = truncate_seq;
+ }
+ void write(uint64_t off, ceph::buffer::list& bl) {
+ write(off, bl, 0, 0);
+ }
+ void write_full(ceph::buffer::list& bl) {
+ add_data(CEPH_OSD_OP_WRITEFULL, 0, bl.length(), bl);
+ }
+ void writesame(uint64_t off, uint64_t write_len, ceph::buffer::list& bl) {
+ add_writesame(CEPH_OSD_OP_WRITESAME, off, write_len, bl);
+ }
+ void append(ceph::buffer::list& bl) {
+ add_data(CEPH_OSD_OP_APPEND, 0, bl.length(), bl);
+ }
+ void zero(uint64_t off, uint64_t len) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_ZERO, off, len, bl);
+ }
+ void truncate(uint64_t off) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_TRUNCATE, off, 0, bl);
+ }
+ void remove() {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_DELETE, 0, 0, bl);
+ }
+ void mapext(uint64_t off, uint64_t len) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_MAPEXT, off, len, bl);
+ }
+ void sparse_read(uint64_t off, uint64_t len) {
+ ceph::buffer::list bl;
+ add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl);
+ }
+
+ void checksum(uint8_t type, const ceph::buffer::list &init_value_bl,
+ uint64_t off, uint64_t len, size_t chunk_size,
+ ceph::buffer::list *pbl, int *prval, Context *ctx) {
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_CHECKSUM);
+ osd_op.op.checksum.offset = off;
+ osd_op.op.checksum.length = len;
+ osd_op.op.checksum.type = type;
+ osd_op.op.checksum.chunk_size = chunk_size;
+ osd_op.indata.append(init_value_bl);
+
+ unsigned p = ops.size() - 1;
+ out_bl[p] = pbl;
+ out_rval[p] = prval;
+ set_handler(ctx);
+ }
+
+ // object attrs
+ void getxattr(const char *name, ceph::buffer::list *pbl, int *prval) {
+ ceph::buffer::list bl;
+ add_xattr(CEPH_OSD_OP_GETXATTR, name, bl);
+ unsigned p = ops.size() - 1;
+ out_bl[p] = pbl;
+ out_rval[p] = prval;
+ }
+ void getxattr(std::string_view name, boost::system::error_code* ec,
+ buffer::list *pbl) {
+ ceph::buffer::list bl;
+ add_xattr(CEPH_OSD_OP_GETXATTR, name, bl);
+ out_bl.back() = pbl;
+ out_ec.back() = ec;
+ }
+
+ template<typename Vals>
+ struct CB_ObjectOperation_decodevals {
+ uint64_t max_entries;
+ Vals* pattrs;
+ bool* ptruncated;
+ int* prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_decodevals(uint64_t m, Vals* pa,
+ bool *pt, int *pr,
+ boost::system::error_code* pec)
+ : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr), pec(pec) {
+ if (ptruncated) {
+ *ptruncated = false;
+ }
+ }
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list& bl) {
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ try {
+ if (pattrs)
+ decode(*pattrs, p);
+ if (ptruncated) {
+ Vals ignore;
+ if (!pattrs) {
+ decode(ignore, p);
+ pattrs = &ignore;
+ }
+ if (!p.end()) {
+ decode(*ptruncated, p);
+ } else {
+ // The OSD did not provide this. Since old OSDs do not
+ // enforce omap result limits either, we can infer it from
+ // the size of the result
+ *ptruncated = (pattrs->size() == max_entries);
+ }
+ }
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+ template<typename Keys>
+ struct CB_ObjectOperation_decodekeys {
+ uint64_t max_entries;
+ Keys* pattrs;
+ bool *ptruncated;
+ int *prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_decodekeys(uint64_t m, Keys* pa, bool *pt,
+ int *pr, boost::system::error_code* pec)
+ : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr), pec(pec) {
+ if (ptruncated) {
+ *ptruncated = false;
+ }
+ }
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list& bl) {
+ if (r >= 0) {
+ using ceph::decode;
+ auto p = bl.cbegin();
+ try {
+ if (pattrs)
+ decode(*pattrs, p);
+ if (ptruncated) {
+ Keys ignore;
+ if (!pattrs) {
+ decode(ignore, p);
+ pattrs = &ignore;
+ }
+ if (!p.end()) {
+ decode(*ptruncated, p);
+ } else {
+ // the OSD did not provide this. since old OSDs do not
+ // enforce omap result limits either, we can infer it from
+ // the size of the result
+ *ptruncated = (pattrs->size() == max_entries);
+ }
+ }
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+ struct CB_ObjectOperation_decodewatchers {
+ std::list<obj_watch_t>* pwatchers;
+ int* prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_decodewatchers(std::list<obj_watch_t>* pw, int* pr,
+ boost::system::error_code* pec)
+ : pwatchers(pw), prval(pr), pec(pec) {}
+ void operator()(boost::system::error_code ec, int r,
+ const ceph::buffer::list& bl) {
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ try {
+ obj_list_watch_response_t resp;
+ decode(resp, p);
+ if (pwatchers) {
+ for (const auto& watch_item : resp.entries) {
+ obj_watch_t ow;
+ std::string sa = watch_item.addr.get_legacy_str();
+ strncpy(ow.addr, sa.c_str(), sizeof(ow.addr) - 1);
+ ow.addr[sizeof(ow.addr) - 1] = '\0';
+ ow.watcher_id = watch_item.name.num();
+ ow.cookie = watch_item.cookie;
+ ow.timeout_seconds = watch_item.timeout_seconds;
+ pwatchers->push_back(std::move(ow));
+ }
+ }
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+
+ struct CB_ObjectOperation_decodewatchersneo {
+ std::vector<neorados::ObjWatcher>* pwatchers;
+ int* prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_decodewatchersneo(std::vector<neorados::ObjWatcher>* pw,
+ int* pr,
+ boost::system::error_code* pec)
+ : pwatchers(pw), prval(pr), pec(pec) {}
+ void operator()(boost::system::error_code ec, int r,
+ const ceph::buffer::list& bl) {
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ try {
+ obj_list_watch_response_t resp;
+ decode(resp, p);
+ if (pwatchers) {
+ for (const auto& watch_item : resp.entries) {
+ neorados::ObjWatcher ow;
+ ow.addr = watch_item.addr.get_legacy_str();
+ ow.watcher_id = watch_item.name.num();
+ ow.cookie = watch_item.cookie;
+ ow.timeout_seconds = watch_item.timeout_seconds;
+ pwatchers->push_back(std::move(ow));
+ }
+ }
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+
+
+ struct CB_ObjectOperation_decodesnaps {
+ librados::snap_set_t *psnaps;
+ neorados::SnapSet *neosnaps;
+ int *prval;
+ boost::system::error_code* pec;
+ CB_ObjectOperation_decodesnaps(librados::snap_set_t* ps,
+ neorados::SnapSet* ns, int* pr,
+ boost::system::error_code* pec)
+ : psnaps(ps), neosnaps(ns), prval(pr), pec(pec) {}
+ void operator()(boost::system::error_code ec, int r, const ceph::buffer::list& bl) {
+ if (r >= 0) {
+ using ceph::decode;
+ auto p = bl.cbegin();
+ try {
+ obj_list_snap_response_t resp;
+ decode(resp, p);
+ if (psnaps) {
+ psnaps->clones.clear();
+ for (auto ci = resp.clones.begin();
+ ci != resp.clones.end();
+ ++ci) {
+ librados::clone_info_t clone;
+
+ clone.cloneid = ci->cloneid;
+ clone.snaps.reserve(ci->snaps.size());
+ clone.snaps.insert(clone.snaps.end(), ci->snaps.begin(),
+ ci->snaps.end());
+ clone.overlap = ci->overlap;
+ clone.size = ci->size;
+
+ psnaps->clones.push_back(clone);
+ }
+ psnaps->seq = resp.seq;
+ }
+
+ if (neosnaps) {
+ neosnaps->clones.clear();
+ for (auto&& c : resp.clones) {
+ neorados::CloneInfo clone;
+
+ clone.cloneid = std::move(c.cloneid);
+ clone.snaps.reserve(c.snaps.size());
+ std::move(c.snaps.begin(), c.snaps.end(),
+ std::back_inserter(clone.snaps));
+ clone.overlap = c.overlap;
+ clone.size = c.size;
+ neosnaps->clones.push_back(std::move(clone));
+ }
+ neosnaps->seq = resp.seq;
+ }
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ if (pec)
+ *pec = e.code();
+ }
+ }
+ }
+ };
+ void getxattrs(std::map<std::string,ceph::buffer::list> *pattrs, int *prval) {
+ add_op(CEPH_OSD_OP_GETXATTRS);
+ if (pattrs || prval) {
+ set_handler(CB_ObjectOperation_decodevals(0, pattrs, nullptr, prval,
+ nullptr));
+ out_rval.back() = prval;
+ }
+ }
+ void getxattrs(boost::system::error_code* ec,
+ boost::container::flat_map<std::string, ceph::buffer::list> *pattrs) {
+ add_op(CEPH_OSD_OP_GETXATTRS);
+ set_handler(CB_ObjectOperation_decodevals(0, pattrs, nullptr, nullptr, ec));
+ out_ec.back() = ec;
+ }
+ void setxattr(const char *name, const ceph::buffer::list& bl) {
+ add_xattr(CEPH_OSD_OP_SETXATTR, name, bl);
+ }
+ void setxattr(std::string_view name, const ceph::buffer::list& bl) {
+ add_xattr(CEPH_OSD_OP_SETXATTR, name, bl);
+ }
+ void setxattr(const char *name, const std::string& s) {
+ ceph::buffer::list bl;
+ bl.append(s);
+ add_xattr(CEPH_OSD_OP_SETXATTR, name, bl);
+ }
+ void cmpxattr(const char *name, uint8_t cmp_op, uint8_t cmp_mode,
+ const ceph::buffer::list& bl) {
+ add_xattr_cmp(CEPH_OSD_OP_CMPXATTR, name, cmp_op, cmp_mode, bl);
+ }
+ void cmpxattr(std::string_view name, uint8_t cmp_op, uint8_t cmp_mode,
+ const ceph::buffer::list& bl) {
+ add_xattr_cmp(CEPH_OSD_OP_CMPXATTR, name, cmp_op, cmp_mode, bl);
+ }
+ void rmxattr(const char *name) {
+ ceph::buffer::list bl;
+ add_xattr(CEPH_OSD_OP_RMXATTR, name, bl);
+ }
+ void rmxattr(std::string_view name) {
+ ceph::buffer::list bl;
+ add_xattr(CEPH_OSD_OP_RMXATTR, name, bl);
+ }
+ void setxattrs(std::map<std::string, ceph::buffer::list>& attrs) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(attrs, bl);
+ // pass the encoded attrs as the op payload; passing bl.length() would
+ // silently construct an empty, preallocated bufferlist and drop the data
+ add_xattr(CEPH_OSD_OP_RESETXATTRS, nullptr, bl);
+ }
+ void resetxattrs(const char *prefix, std::map<std::string, ceph::buffer::list>& attrs) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(attrs, bl);
+ add_xattr(CEPH_OSD_OP_RESETXATTRS, prefix, bl);
+ }
+
+ // trivialmap
+ void tmap_update(ceph::buffer::list& bl) {
+ add_data(CEPH_OSD_OP_TMAPUP, 0, 0, bl);
+ }
+
+ // objectmap
+ void omap_get_keys(const std::string &start_after,
+ uint64_t max_to_get,
+ std::set<std::string> *out_set,
+ bool *ptruncated,
+ int *prval) {
+ using ceph::encode;
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETKEYS);
+ ceph::buffer::list bl;
+ encode(start_after, bl);
+ encode(max_to_get, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ if (prval || ptruncated || out_set) {
+ set_handler(CB_ObjectOperation_decodekeys(max_to_get, out_set, ptruncated, prval,
+ nullptr));
+ out_rval.back() = prval;
+ }
+ }
+ void omap_get_keys(std::optional<std::string_view> start_after,
+ uint64_t max_to_get,
+ boost::system::error_code* ec,
+ boost::container::flat_set<std::string> *out_set,
+ bool *ptruncated) {
+ OSDOp& op = add_op(CEPH_OSD_OP_OMAPGETKEYS);
+ ceph::buffer::list bl;
+ encode(start_after ? *start_after : std::string_view{}, bl);
+ encode(max_to_get, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ set_handler(
+ CB_ObjectOperation_decodekeys(max_to_get, out_set, ptruncated, nullptr,
+ ec));
+ out_ec.back() = ec;
+ }
+
+ void omap_get_vals(const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_to_get,
+ std::map<std::string, ceph::buffer::list> *out_set,
+ bool *ptruncated,
+ int *prval) {
+ using ceph::encode;
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALS);
+ ceph::buffer::list bl;
+ encode(start_after, bl);
+ encode(max_to_get, bl);
+ encode(filter_prefix, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ if (prval || out_set || ptruncated) {
+ set_handler(CB_ObjectOperation_decodevals(max_to_get, out_set, ptruncated,
+ prval, nullptr));
+ out_rval.back() = prval;
+ }
+ }
+
+ void omap_get_vals(std::optional<std::string_view> start_after,
+ std::optional<std::string_view> filter_prefix,
+ uint64_t max_to_get,
+ boost::system::error_code* ec,
+ boost::container::flat_map<std::string, ceph::buffer::list> *out_set,
+ bool *ptruncated) {
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALS);
+ ceph::buffer::list bl;
+ encode(start_after ? *start_after : std::string_view{}, bl);
+ encode(max_to_get, bl);
+ encode(filter_prefix ? *filter_prefix : std::string_view{}, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ set_handler(CB_ObjectOperation_decodevals(max_to_get, out_set, ptruncated,
+ nullptr, ec));
+ out_ec.back() = ec;
+ }
+
+ void omap_get_vals_by_keys(const std::set<std::string> &to_get,
+ std::map<std::string, ceph::buffer::list> *out_set,
+ int *prval) {
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALSBYKEYS);
+ ceph::buffer::list bl;
+ encode(to_get, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ if (prval || out_set) {
+ set_handler(CB_ObjectOperation_decodevals(0, out_set, nullptr, prval,
+ nullptr));
+ out_rval.back() = prval;
+ }
+ }
+
+ void omap_get_vals_by_keys(
+ const boost::container::flat_set<std::string>& to_get,
+ boost::system::error_code* ec,
+ boost::container::flat_map<std::string, ceph::buffer::list> *out_set) {
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALSBYKEYS);
+ ceph::buffer::list bl;
+ encode(to_get, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ set_handler(CB_ObjectOperation_decodevals(0, out_set, nullptr, nullptr,
+ ec));
+ out_ec.back() = ec;
+ }
+
+ void omap_cmp(const std::map<std::string, pair<ceph::buffer::list,int> > &assertions,
+ int *prval) {
+ using ceph::encode;
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAP_CMP);
+ ceph::buffer::list bl;
+ encode(assertions, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ if (prval) {
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ }
+ }
+
+ void omap_cmp(const boost::container::flat_map<
+ std::string, pair<ceph::buffer::list, int>>& assertions,
+ boost::system::error_code *ec) {
+ OSDOp &op = add_op(CEPH_OSD_OP_OMAP_CMP);
+ ceph::buffer::list bl;
+ encode(assertions, bl);
+ op.op.extent.offset = 0;
+ op.op.extent.length = bl.length();
+ op.indata.claim_append(bl);
+ out_ec.back() = ec;
+ }
+
+ struct C_ObjectOperation_copyget : public Context {
+ ceph::buffer::list bl;
+ object_copy_cursor_t *cursor;
+ uint64_t *out_size;
+ ceph::real_time *out_mtime;
+ std::map<std::string,ceph::buffer::list> *out_attrs;
+ ceph::buffer::list *out_data, *out_omap_header, *out_omap_data;
+ std::vector<snapid_t> *out_snaps;
+ snapid_t *out_snap_seq;
+ uint32_t *out_flags;
+ uint32_t *out_data_digest;
+ uint32_t *out_omap_digest;
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *out_reqids;
+ mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes;
+ uint64_t *out_truncate_seq;
+ uint64_t *out_truncate_size;
+ int *prval;
+ C_ObjectOperation_copyget(object_copy_cursor_t *c,
+ uint64_t *s,
+ ceph::real_time *m,
+ std::map<std::string,ceph::buffer::list> *a,
+ ceph::buffer::list *d, ceph::buffer::list *oh,
+ ceph::buffer::list *o,
+ std::vector<snapid_t> *osnaps,
+ snapid_t *osnap_seq,
+ uint32_t *flags,
+ uint32_t *dd,
+ uint32_t *od,
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *oreqids,
+ mempool::osd_pglog::map<uint32_t, int> *oreqid_return_codes,
+ uint64_t *otseq,
+ uint64_t *otsize,
+ int *r)
+ : cursor(c),
+ out_size(s), out_mtime(m),
+ out_attrs(a), out_data(d), out_omap_header(oh),
+ out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq),
+ out_flags(flags), out_data_digest(dd), out_omap_digest(od),
+ out_reqids(oreqids),
+ out_reqid_return_codes(oreqid_return_codes),
+ out_truncate_seq(otseq),
+ out_truncate_size(otsize),
+ prval(r) {}
+ void finish(int r) override {
+ using ceph::decode;
+ // reqids are copied on ENOENT
+ if (r < 0 && r != -ENOENT)
+ return;
+ try {
+ auto p = bl.cbegin();
+ object_copy_data_t copy_reply;
+ decode(copy_reply, p);
+ if (r == -ENOENT) {
+ if (out_reqids)
+ *out_reqids = copy_reply.reqids;
+ return;
+ }
+ if (out_size)
+ *out_size = copy_reply.size;
+ if (out_mtime)
+ *out_mtime = ceph::real_clock::from_ceph_timespec(copy_reply.mtime);
+ if (out_attrs)
+ *out_attrs = copy_reply.attrs;
+ if (out_data)
+ out_data->claim_append(copy_reply.data);
+ if (out_omap_header)
+ out_omap_header->claim_append(copy_reply.omap_header);
+ if (out_omap_data)
+ *out_omap_data = copy_reply.omap_data;
+ if (out_snaps)
+ *out_snaps = copy_reply.snaps;
+ if (out_snap_seq)
+ *out_snap_seq = copy_reply.snap_seq;
+ if (out_flags)
+ *out_flags = copy_reply.flags;
+ if (out_data_digest)
+ *out_data_digest = copy_reply.data_digest;
+ if (out_omap_digest)
+ *out_omap_digest = copy_reply.omap_digest;
+ if (out_reqids)
+ *out_reqids = copy_reply.reqids;
+ if (out_reqid_return_codes)
+ *out_reqid_return_codes = copy_reply.reqid_return_codes;
+ if (out_truncate_seq)
+ *out_truncate_seq = copy_reply.truncate_seq;
+ if (out_truncate_size)
+ *out_truncate_size = copy_reply.truncate_size;
+ *cursor = copy_reply.cursor;
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ }
+ }
+ };
+
+ void copy_get(object_copy_cursor_t *cursor,
+ uint64_t max,
+ uint64_t *out_size,
+ ceph::real_time *out_mtime,
+ std::map<std::string,ceph::buffer::list> *out_attrs,
+ ceph::buffer::list *out_data,
+ ceph::buffer::list *out_omap_header,
+ ceph::buffer::list *out_omap_data,
+ std::vector<snapid_t> *out_snaps,
+ snapid_t *out_snap_seq,
+ uint32_t *out_flags,
+ uint32_t *out_data_digest,
+ uint32_t *out_omap_digest,
+ mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *out_reqids,
+ mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes,
+ uint64_t *truncate_seq,
+ uint64_t *truncate_size,
+ int *prval) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET);
+ osd_op.op.copy_get.max = max;
+ encode(*cursor, osd_op.indata);
+ encode(max, osd_op.indata);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_copyget *h =
+ new C_ObjectOperation_copyget(cursor, out_size, out_mtime,
+ out_attrs, out_data, out_omap_header,
+ out_omap_data, out_snaps, out_snap_seq,
+ out_flags, out_data_digest,
+ out_omap_digest, out_reqids,
+ out_reqid_return_codes, truncate_seq,
+ truncate_size, prval);
+ out_bl[p] = &h->bl;
+ set_handler(h);
+ }
+
+ void undirty() {
+ add_op(CEPH_OSD_OP_UNDIRTY);
+ }
+
+ struct C_ObjectOperation_isdirty : public Context {
+ ceph::buffer::list bl;
+ bool *pisdirty;
+ int *prval;
+ C_ObjectOperation_isdirty(bool *p, int *r)
+ : pisdirty(p), prval(r) {}
+ void finish(int r) override {
+ using ceph::decode;
+ if (r < 0)
+ return;
+ try {
+ auto p = bl.cbegin();
+ bool isdirty;
+ decode(isdirty, p);
+ if (pisdirty)
+ *pisdirty = isdirty;
+ } catch (const ceph::buffer::error& e) {
+ if (prval)
+ *prval = -EIO;
+ }
+ }
+ };
+
+ void is_dirty(bool *pisdirty, int *prval) {
+ add_op(CEPH_OSD_OP_ISDIRTY);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_isdirty *h =
+ new C_ObjectOperation_isdirty(pisdirty, prval);
+ out_bl[p] = &h->bl;
+ set_handler(h);
+ }
+
+ struct C_ObjectOperation_hit_set_ls : public Context {
+ ceph::buffer::list bl;
+ std::list< std::pair<time_t, time_t> > *ptls;
+ std::list< std::pair<ceph::real_time, ceph::real_time> > *putls;
+ int *prval;
+ C_ObjectOperation_hit_set_ls(std::list< std::pair<time_t, time_t> > *t,
+ std::list< std::pair<ceph::real_time,
+ ceph::real_time> > *ut,
+ int *r)
+ : ptls(t), putls(ut), prval(r) {}
+ void finish(int r) override {
+ using ceph::decode;
+ if (r < 0)
+ return;
+ try {
+ auto p = bl.cbegin();
+ std::list< std::pair<ceph::real_time, ceph::real_time> > ls;
+ decode(ls, p);
+ if (ptls) {
+ ptls->clear();
+ for (auto p = ls.begin(); p != ls.end(); ++p)
+ // round initial timestamp up to the next full second to
+ // keep this a valid interval.
+ ptls->push_back(
+ std::make_pair(ceph::real_clock::to_time_t(
+ ceph::ceil(p->first,
+ // Sadly, no time literals until C++14.
+ std::chrono::seconds(1))),
+ ceph::real_clock::to_time_t(p->second)));
+ }
+ if (putls)
+ putls->swap(ls);
+ } catch (const ceph::buffer::error& e) {
+ r = -EIO;
+ }
+ if (prval)
+ *prval = r;
+ }
+ };
+
+ /**
+ * List available HitSets.
+ *
+ * We will get back a list of time intervals. Note that the most
+ * recent range may have an empty end timestamp if it is still
+ * accumulating.
+ *
+ * @param pls [out] list of time intervals
+ * @param prval [out] return value
+ */
+ void hit_set_ls(std::list< std::pair<time_t, time_t> > *pls, int *prval) {
+ add_op(CEPH_OSD_OP_PG_HITSET_LS);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_hit_set_ls *h =
+ new C_ObjectOperation_hit_set_ls(pls, NULL, prval);
+ out_bl[p] = &h->bl;
+ set_handler(h);
+ }
+ void hit_set_ls(std::list<std::pair<ceph::real_time, ceph::real_time> > *pls,
+ int *prval) {
+ add_op(CEPH_OSD_OP_PG_HITSET_LS);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ C_ObjectOperation_hit_set_ls *h =
+ new C_ObjectOperation_hit_set_ls(NULL, pls, prval);
+ out_bl[p] = &h->bl;
+ set_handler(h);
+ }
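+ // A minimal usage sketch (illustrative only): build an ObjectOperation,
+ // queue hit_set_ls, and submit it through the Objecter read path. `oid`,
+ // `oloc` and `onack` are hypothetical placeholders, not defined here.
+ //
+ //   ObjectOperation op;
+ //   std::list<std::pair<time_t, time_t>> intervals;
+ //   int rval = 0;
+ //   op.hit_set_ls(&intervals, &rval);
+ //   objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, onack);
+ //   // on completion, `intervals` holds the HitSet time ranges.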
+
+ /**
+ * get HitSet
+ *
+ * Return an encoded HitSet that includes the provided time
+ * interval.
+ *
+ * @param stamp [in] timestamp
+ * @param pbl [out] target buffer for encoded HitSet
+ * @param prval [out] return value
+ */
+ void hit_set_get(ceph::real_time stamp, ceph::buffer::list *pbl, int *prval) {
+ OSDOp& op = add_op(CEPH_OSD_OP_PG_HITSET_GET);
+ op.op.hit_set_get.stamp = ceph::real_clock::to_ceph_timespec(stamp);
+ unsigned p = ops.size() - 1;
+ out_rval[p] = prval;
+ out_bl[p] = pbl;
+ }
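+ // Continuing the sketch above (same assumptions): a timestamp inside one
+ // of the returned intervals can be passed back to fetch that HitSet.
+ //
+ //   ceph::real_time stamp = /* an endpoint reported by hit_set_ls */;
+ //   ceph::buffer::list hitset_bl;
+ //   op.hit_set_get(stamp, &hitset_bl, &rval);
+ //   // hitset_bl receives the encoded HitSet covering `stamp`.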
+
+ void omap_get_header(ceph::buffer::list *bl, int *prval) {
+ add_op(CEPH_OSD_OP_OMAPGETHEADER);
+ unsigned p = ops.size() - 1;
+ out_bl[p] = bl;
+ out_rval[p] = prval;
+ }
+
+ void omap_get_header(boost::system::error_code* ec, ceph::buffer::list *bl) {
+ add_op(CEPH_OSD_OP_OMAPGETHEADER);
+ out_bl.back() = bl;
+ out_ec.back() = ec;
+ }
+
+ void omap_set(const map<string, ceph::buffer::list> &map) {
+ ceph::buffer::list bl;
+ encode(map, bl);
+ add_data(CEPH_OSD_OP_OMAPSETVALS, 0, bl.length(), bl);
+ }
+
+ void omap_set(const boost::container::flat_map<string, ceph::buffer::list>& map) {
+ ceph::buffer::list bl;
+ encode(map, bl);
+ add_data(CEPH_OSD_OP_OMAPSETVALS, 0, bl.length(), bl);
+ }
+
+ void omap_set_header(ceph::buffer::list &bl) {
+ add_data(CEPH_OSD_OP_OMAPSETHEADER, 0, bl.length(), bl);
+ }
+
+ void omap_clear() {
+ add_op(CEPH_OSD_OP_OMAPCLEAR);
+ }
+
+ void omap_rm_keys(const std::set<std::string> &to_remove) {
+ using ceph::encode;
+ ceph::buffer::list bl;
+ encode(to_remove, bl);
+ add_data(CEPH_OSD_OP_OMAPRMKEYS, 0, bl.length(), bl);
+ }
+ void omap_rm_keys(const boost::container::flat_set<std::string>& to_remove) {
+ ceph::buffer::list bl;
+ encode(to_remove, bl);
+ add_data(CEPH_OSD_OP_OMAPRMKEYS, 0, bl.length(), bl);
+ }
+
+ void omap_rm_range(std::string_view key_begin, std::string_view key_end) {
+ ceph::buffer::list bl;
+ using ceph::encode;
+ encode(key_begin, bl);
+ encode(key_end, bl);
+ add_data(CEPH_OSD_OP_OMAPRMKEYRANGE, 0, bl.length(), bl);
+ }
+
+ // object classes
+ void call(const char *cname, const char *method, ceph::buffer::list &indata) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, NULL, NULL, NULL);
+ }
+
+ void call(const char *cname, const char *method, ceph::buffer::list &indata,
+ ceph::buffer::list *outdata, Context *ctx, int *prval) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, outdata, ctx, prval);
+ }
+
+ void call(std::string_view cname, std::string_view method,
+ const ceph::buffer::list& indata, boost::system::error_code* ec) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, NULL, NULL, NULL);
+ out_ec.back() = ec;
+ }
+
+ void call(std::string_view cname, std::string_view method, const ceph::buffer::list& indata,
+ boost::system::error_code* ec, ceph::buffer::list *outdata) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, outdata, nullptr, nullptr);
+ out_ec.back() = ec;
+ }
+ void call(std::string_view cname, std::string_view method,
+ const ceph::buffer::list& indata,
+ fu2::unique_function<void (boost::system::error_code,
+ const ceph::buffer::list&) &&> f) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, std::move(f));
+ }
+ void call(std::string_view cname, std::string_view method,
+ const ceph::buffer::list& indata,
+ fu2::unique_function<void (boost::system::error_code, int,
+ const ceph::buffer::list&) &&> f) {
+ add_call(CEPH_OSD_OP_CALL, cname, method, indata, std::move(f));
+ }
+
+ // watch/notify
+ void watch(uint64_t cookie, __u8 op, uint32_t timeout = 0) {
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_WATCH);
+ osd_op.op.watch.cookie = cookie;
+ osd_op.op.watch.op = op;
+ osd_op.op.watch.timeout = timeout;
+ }
+
+ void notify(uint64_t cookie, uint32_t prot_ver, uint32_t timeout,
+ ceph::buffer::list &bl, ceph::buffer::list *inbl) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY);
+ osd_op.op.notify.cookie = cookie;
+ encode(prot_ver, *inbl);
+ encode(timeout, *inbl);
+ encode(bl, *inbl);
+ osd_op.indata.append(*inbl);
+ }
+
+ void notify_ack(uint64_t notify_id, uint64_t cookie,
+ ceph::buffer::list& reply_bl) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY_ACK);
+ ceph::buffer::list bl;
+ encode(notify_id, bl);
+ encode(cookie, bl);
+ encode(reply_bl, bl);
+ osd_op.indata.append(bl);
+ }
+
+ void list_watchers(std::list<obj_watch_t> *out,
+ int *prval) {
+ add_op(CEPH_OSD_OP_LIST_WATCHERS);
+ if (prval || out) {
+ set_handler(CB_ObjectOperation_decodewatchers(out, prval, nullptr));
+ out_rval.back() = prval;
+ }
+ }
+ void list_watchers(vector<neorados::ObjWatcher>* out,
+ boost::system::error_code* ec) {
+ add_op(CEPH_OSD_OP_LIST_WATCHERS);
+ set_handler(CB_ObjectOperation_decodewatchersneo(out, nullptr, ec));
+ out_ec.back() = ec;
+ }
+
+ void list_snaps(librados::snap_set_t *out, int *prval,
+ boost::system::error_code* ec = nullptr) {
+ add_op(CEPH_OSD_OP_LIST_SNAPS);
+ if (prval || out || ec) {
+ set_handler(CB_ObjectOperation_decodesnaps(out, nullptr, prval, ec));
+ out_rval.back() = prval;
+ out_ec.back() = ec;
+ }
+ }
+
+ void list_snaps(neorados::SnapSet *out, int *prval,
+ boost::system::error_code* ec = nullptr) {
+ add_op(CEPH_OSD_OP_LIST_SNAPS);
+ if (prval || out || ec) {
+ set_handler(CB_ObjectOperation_decodesnaps(nullptr, out, prval, ec));
+ out_rval.back() = prval;
+ out_ec.back() = ec;
+ }
+ }
+
+ void assert_version(uint64_t ver) {
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_ASSERT_VER);
+ osd_op.op.assert_ver.ver = ver;
+ }
+
+ void cmpxattr(const char *name, const ceph::buffer::list& val,
+ int op, int mode) {
+ add_xattr(CEPH_OSD_OP_CMPXATTR, name, val);
+ OSDOp& o = *ops.rbegin();
+ o.op.xattr.cmp_op = op;
+ o.op.xattr.cmp_mode = mode;
+ }
+
+ void rollback(uint64_t snapid) {
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_ROLLBACK);
+ osd_op.op.snap.snapid = snapid;
+ }
+
+ void copy_from(object_t src, snapid_t snapid, object_locator_t src_oloc,
+ version_t src_version, unsigned flags,
+ unsigned src_fadvise_flags) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM);
+ osd_op.op.copy_from.snapid = snapid;
+ osd_op.op.copy_from.src_version = src_version;
+ osd_op.op.copy_from.flags = flags;
+ osd_op.op.copy_from.src_fadvise_flags = src_fadvise_flags;
+ encode(src, osd_op.indata);
+ encode(src_oloc, osd_op.indata);
+ }
+ void copy_from2(object_t src, snapid_t snapid, object_locator_t src_oloc,
+ version_t src_version, unsigned flags,
+ uint32_t truncate_seq, uint64_t truncate_size,
+ unsigned src_fadvise_flags) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM2);
+ osd_op.op.copy_from.snapid = snapid;
+ osd_op.op.copy_from.src_version = src_version;
+ osd_op.op.copy_from.flags = flags;
+ osd_op.op.copy_from.src_fadvise_flags = src_fadvise_flags;
+ encode(src, osd_op.indata);
+ encode(src_oloc, osd_op.indata);
+ encode(truncate_seq, osd_op.indata);
+ encode(truncate_size, osd_op.indata);
+ }
+
+ /**
+ * writeback content to backing tier
+ *
+ * If object is marked dirty in the cache tier, write back content
+ * to backing tier. If the object is clean this is a no-op.
+ *
+ * If writeback races with an update, the update will block.
+ *
+ * use with IGNORE_CACHE to avoid triggering promote.
+ */
+ void cache_flush() {
+ add_op(CEPH_OSD_OP_CACHE_FLUSH);
+ }
+
+ /**
+ * writeback content to backing tier
+ *
+ * If object is marked dirty in the cache tier, write back content
+ * to backing tier. If the object is clean this is a no-op.
+ *
+ * If writeback races with an update, return EAGAIN. Requires that
+ * the SKIPRWLOCKS flag be set.
+ *
+ * use with IGNORE_CACHE to avoid triggering promote.
+ */
+ void cache_try_flush() {
+ add_op(CEPH_OSD_OP_CACHE_TRY_FLUSH);
+ }
+
+ /**
+ * evict object from cache tier
+ *
+ * If object is marked clean, remove the object from the cache tier.
+ * Otherwise, return EBUSY.
+ *
+ * use with IGNORE_CACHE to avoid triggering promote.
+ */
+ void cache_evict() {
+ add_op(CEPH_OSD_OP_CACHE_EVICT);
+ }
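+ // Illustrative sketch of the flush-then-evict sequence described above.
+ // The request-level flag names are assumed from the comments (IGNORE_CACHE,
+ // SKIPRWLOCKS) and are passed at submit time, not set on the op itself.
+ //
+ //   ObjectOperation flush_op;
+ //   flush_op.cache_try_flush();
+ //   // submit with CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_SKIPRWLOCKS;
+ //   ObjectOperation evict_op;
+ //   evict_op.cache_evict();
+ //   // submit with CEPH_OSD_FLAG_IGNORE_CACHE; EBUSY means still dirty.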
+
+ /*
+ * Extensible tier
+ */
+ void set_redirect(object_t tgt, snapid_t snapid, object_locator_t tgt_oloc,
+ version_t tgt_version, int flag) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_REDIRECT);
+ osd_op.op.copy_from.snapid = snapid;
+ osd_op.op.copy_from.src_version = tgt_version;
+ encode(tgt, osd_op.indata);
+ encode(tgt_oloc, osd_op.indata);
+ set_last_op_flags(flag);
+ }
+
+ void set_chunk(uint64_t src_offset, uint64_t src_length, object_locator_t tgt_oloc,
+ object_t tgt_oid, uint64_t tgt_offset, int flag) {
+ using ceph::encode;
+ OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_CHUNK);
+ encode(src_offset, osd_op.indata);
+ encode(src_length, osd_op.indata);
+ encode(tgt_oloc, osd_op.indata);
+ encode(tgt_oid, osd_op.indata);
+ encode(tgt_offset, osd_op.indata);
+ set_last_op_flags(flag);
+ }
+
+ void tier_promote() {
+ add_op(CEPH_OSD_OP_TIER_PROMOTE);
+ }
+
+ void unset_manifest() {
+ add_op(CEPH_OSD_OP_UNSET_MANIFEST);
+ }
+
+ void tier_flush() {
+ add_op(CEPH_OSD_OP_TIER_FLUSH);
+ }
+
+ void tier_evict() {
+ add_op(CEPH_OSD_OP_TIER_EVICT);
+ }
+
+ void set_alloc_hint(uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags) {
+ add_alloc_hint(CEPH_OSD_OP_SETALLOCHINT, expected_object_size,
+ expected_write_size, flags);
+
+ // CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+ // not worth a feature bit. Set FAILOK per-op flag to make
+ // sure older osds don't trip over an unsupported opcode.
+ set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+ }
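+ // Illustrative sketch: hint that the object will be written as a single
+ // 4 MiB chunk (the sizes here are arbitrary example values).
+ //
+ //   ObjectOperation op;
+ //   op.set_alloc_hint(4 << 20, 4 << 20, 0);
+ //   // FAILOK is set internally, so older OSDs simply ignore the hint.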
+
+ template<typename V>
+ void dup(V& sops) {
+ ops.clear();
+ std::copy(sops.begin(), sops.end(),
+ std::back_inserter(ops));
+ out_bl.resize(sops.size());
+ out_handler.resize(sops.size());
+ out_rval.resize(sops.size());
+ out_ec.resize(sops.size());
+ for (uint32_t i = 0; i < sops.size(); i++) {
+ out_bl[i] = &sops[i].outdata;
+ out_rval[i] = &sops[i].rval;
+ out_ec[i] = nullptr;
+ }
+ }
+
+ /**
+ * Pin/unpin an object in cache tier
+ */
+ void cache_pin() {
+ add_op(CEPH_OSD_OP_CACHE_PIN);
+ }
+
+ void cache_unpin() {
+ add_op(CEPH_OSD_OP_CACHE_UNPIN);
+ }
+};
+
+inline std::ostream& operator <<(std::ostream& m, const ObjectOperation& oo) {
+ auto i = oo.ops.cbegin();
+ m << '[';
+ while (i != oo.ops.cend()) {
+ if (i != oo.ops.cbegin())
+ m << ' ';
+ m << *i;
+ ++i;
+ }
+ m << ']';
+ return m;
+}
+
+
+// ----------------
+
+class Objecter : public md_config_obs_t, public Dispatcher {
+ using MOSDOp = _mosdop::MOSDOp<osdc_opvec>;
+public:
+ using OpSignature = void(boost::system::error_code);
+ using OpCompletion = ceph::async::Completion<OpSignature>;
+
+ // config observer bits
+ const char** get_tracked_conf_keys() const override;
+ void handle_conf_change(const ConfigProxy& conf,
+ const std::set <std::string> &changed) override;
+
+public:
+ Messenger *messenger;
+ MonClient *monc;
+ boost::asio::io_context& service;
+ // This strand guarantees sequenced, one-at-a-time execution, and apparently
+ // people sometimes depend on this.
+ boost::asio::io_context::strand finish_strand{service};
+ ZTracer::Endpoint trace_endpoint{"0.0.0.0", 0, "Objecter"};
+private:
+ std::unique_ptr<OSDMap> osdmap{std::make_unique<OSDMap>()};
+public:
+ using Dispatcher::cct;
+ std::multimap<std::string,std::string> crush_location;
+
+ std::atomic<bool> initialized{false};
+
+private:
+ std::atomic<uint64_t> last_tid{0};
+ std::atomic<unsigned> inflight_ops{0};
+ std::atomic<int> client_inc{-1};
+ uint64_t max_linger_id{0};
+ std::atomic<unsigned> num_in_flight{0};
+ std::atomic<int> global_op_flags{0}; // flags which are applied to each IO op
+ bool keep_balanced_budget = false;
+ bool honor_pool_full = true;
+
+ // If this is true, accumulate a set of blocklisted entities
+ // to be drained by consume_blocklist_events.
+ bool blocklist_events_enabled = false;
+ std::set<entity_addr_t> blocklist_events;
+ struct pg_mapping_t {
+ epoch_t epoch = 0;
+ std::vector<int> up;
+ int up_primary = -1;
+ std::vector<int> acting;
+ int acting_primary = -1;
+
+ pg_mapping_t() {}
+ pg_mapping_t(epoch_t epoch, std::vector<int> up, int up_primary,
+ std::vector<int> acting, int acting_primary)
+ : epoch(epoch), up(up), up_primary(up_primary),
+ acting(acting), acting_primary(acting_primary) {}
+ };
+ ceph::shared_mutex pg_mapping_lock =
+ ceph::make_shared_mutex("Objecter::pg_mapping_lock");
+ // pool -> pg mapping
+ std::map<int64_t, std::vector<pg_mapping_t>> pg_mappings;
+
+ // convenient accessors
+ bool lookup_pg_mapping(const pg_t& pg, pg_mapping_t* pg_mapping) {
+ std::shared_lock l{pg_mapping_lock};
+ auto it = pg_mappings.find(pg.pool());
+ if (it == pg_mappings.end())
+ return false;
+ auto& mapping_array = it->second;
+ if (pg.ps() >= mapping_array.size())
+ return false;
+ if (mapping_array[pg.ps()].epoch != pg_mapping->epoch) // stale
+ return false;
+ *pg_mapping = mapping_array[pg.ps()];
+ return true;
+ }
+ void update_pg_mapping(const pg_t& pg, pg_mapping_t&& pg_mapping) {
+ std::lock_guard l{pg_mapping_lock};
+ auto& mapping_array = pg_mappings[pg.pool()];
+ ceph_assert(pg.ps() < mapping_array.size());
+ mapping_array[pg.ps()] = std::move(pg_mapping);
+ }
+ void prune_pg_mapping(const mempool::osdmap::map<int64_t,pg_pool_t>& pools) {
+ std::lock_guard l{pg_mapping_lock};
+ for (auto& pool : pools) {
+ auto& mapping_array = pg_mappings[pool.first];
+ size_t pg_num = pool.second.get_pg_num();
+ if (mapping_array.size() != pg_num) {
+ // catch both pg_num increasing & decreasing
+ mapping_array.resize(pg_num);
+ }
+ }
+ for (auto it = pg_mappings.begin(); it != pg_mappings.end(); ) {
+ if (!pools.count(it->first)) {
+ // pool is gone
+ pg_mappings.erase(it++);
+ continue;
+ }
+ it++;
+ }
+ }
+
+public:
+ void maybe_request_map();
+
+ void enable_blocklist_events();
+private:
+
+ void _maybe_request_map();
+
+ version_t last_seen_osdmap_version = 0;
+ version_t last_seen_pgmap_version = 0;
+
+ mutable ceph::shared_mutex rwlock =
+ ceph::make_shared_mutex("Objecter::rwlock");
+ ceph::timer<ceph::coarse_mono_clock> timer;
+
+ PerfCounters* logger = nullptr;
+
+ uint64_t tick_event = 0;
+
+ void start_tick();
+ void tick();
+ void update_crush_location();
+
+ class RequestStateHook;
+
+ RequestStateHook *m_request_state_hook = nullptr;
+
+public:
+ /*** track pending operations ***/
+ // read
+
+ struct OSDSession;
+
+ struct op_target_t {
+ int flags = 0;
+
+ epoch_t epoch = 0; ///< latest epoch we calculated the mapping
+
+ object_t base_oid;
+ object_locator_t base_oloc;
+ object_t target_oid;
+ object_locator_t target_oloc;
+
+ ///< true if we are directed at base_pgid, not base_oid
+ bool precalc_pgid = false;
+
+ ///< true if we have ever mapped to a valid pool
+ bool pool_ever_existed = false;
+
+ ///< explicit pg target, if any
+ pg_t base_pgid;
+
+ pg_t pgid; ///< last (raw) pg we mapped to
+ spg_t actual_pgid; ///< last (actual) spg_t we mapped to
+ unsigned pg_num = 0; ///< last pg_num we mapped to
+ unsigned pg_num_mask = 0; ///< last pg_num_mask we mapped to
+ unsigned pg_num_pending = 0; ///< last pg_num_pending we mapped to
+ std::vector<int> up; ///< set of up osds for last pg we mapped to
+ std::vector<int> acting; ///< set of acting osds for last pg we mapped to
+ int up_primary = -1; ///< last up_primary we mapped to
+ int acting_primary = -1; ///< last acting_primary we mapped to
+ int size = -1; ///< the size of the pool when we were last mapped
+ int min_size = -1; ///< the min size of the pool when we were last mapped
+ bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise
+ bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering
+ uint32_t peering_crush_bucket_count = 0;
+ uint32_t peering_crush_bucket_target = 0;
+ uint32_t peering_crush_bucket_barrier = 0;
+ int32_t peering_crush_mandatory_member = CRUSH_ITEM_NONE;
+
+ bool used_replica = false;
+ bool paused = false;
+
+ int osd = -1; ///< the final target osd, or -1
+
+ epoch_t last_force_resend = 0;
+
+ op_target_t(object_t oid, object_locator_t oloc, int flags)
+ : flags(flags),
+ base_oid(oid),
+ base_oloc(oloc)
+ {}
+
+ explicit op_target_t(pg_t pgid)
+ : base_oloc(pgid.pool(), pgid.ps()),
+ precalc_pgid(true),
+ base_pgid(pgid)
+ {}
+
+ op_target_t() = default;
+
+ hobject_t get_hobj() {
+ return hobject_t(target_oid,
+ target_oloc.key,
+ CEPH_NOSNAP,
+ target_oloc.hash >= 0 ? target_oloc.hash : pgid.ps(),
+ target_oloc.pool,
+ target_oloc.nspace);
+ }
+
+ bool contained_by(const hobject_t& begin, const hobject_t& end) {
+ hobject_t h = get_hobj();
+ int r = cmp(h, begin);
+ return r == 0 || (r > 0 && h < end);
+ }
+
+ bool respects_full() const {
+ return
+ (flags & (CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_RWORDERED)) &&
+ !(flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE));
+ }
+
+ void dump(ceph::Formatter *f) const;
+ };
+
+ std::unique_ptr<ceph::async::Completion<void(boost::system::error_code)>>
+ OpContextVert(Context* c) {
+ if (c)
+ return ceph::async::Completion<void(boost::system::error_code)>::create(
+ service.get_executor(),
+ [c = std::unique_ptr<Context>(c)]
+ (boost::system::error_code e) mutable {
+ c.release()->complete(e);
+ });
+ else
+ return nullptr;
+ }
+
+ template<typename T>
+ std::unique_ptr<ceph::async::Completion<void(boost::system::error_code, T)>>
+ OpContextVert(Context* c, T* p) {
+
+ if (c || p)
+ return
+ ceph::async::Completion<void(boost::system::error_code, T)>::create(
+ service.get_executor(),
+ [c = std::unique_ptr<Context>(c), p]
+ (boost::system::error_code e, T r) mutable {
+ if (p)
+ *p = std::move(r);
+ if (c)
+ c.release()->complete(ceph::from_error_code(e));
+ });
+ else
+ return nullptr;
+ }
+
+ template<typename T>
+ std::unique_ptr<ceph::async::Completion<void(boost::system::error_code, T)>>
+ OpContextVert(Context* c, T& p) {
+ if (c)
+ return ceph::async::Completion<
+ void(boost::system::error_code, T)>::create(
+ service.get_executor(),
+ [c = std::unique_ptr<Context>(c), &p]
+ (boost::system::error_code e, T r) mutable {
+ p = std::move(r);
+ if (c)
+ c.release()->complete(ceph::from_error_code(e));
+ });
+ else
+ return nullptr;
+ }
+
+ struct Op : public RefCountedObject {
+ OSDSession *session = nullptr;
+ int incarnation = 0;
+
+ op_target_t target;
+
+ ConnectionRef con = nullptr; // for rx buffer only
+ uint64_t features = CEPH_FEATURES_SUPPORTED_DEFAULT; // explicitly specified op features
+
+ osdc_opvec ops;
+
+ snapid_t snapid = CEPH_NOSNAP;
+ SnapContext snapc;
+ ceph::real_time mtime;
+
+ ceph::buffer::list *outbl = nullptr;
+ boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
+ boost::container::small_vector<
+ fu2::unique_function<void(boost::system::error_code, int,
+ const ceph::buffer::list& bl) &&>,
+ osdc_opvec_len> out_handler;
+ boost::container::small_vector<int*, osdc_opvec_len> out_rval;
+ boost::container::small_vector<boost::system::error_code*,
+ osdc_opvec_len> out_ec;
+
+ int priority = 0;
+ using OpSig = void(boost::system::error_code);
+ using OpComp = ceph::async::Completion<OpSig>;
+ // Due to an irregularity of cmpxattr, we actually need the 'int'
+ // value for onfinish for legacy librados users. As such just
+ // preserve the Context* in this one case. That way we can have
+ // our callers just pass in a unique_ptr<OpComp> and not deal with
+ // our signature in Objecter being different than the exposed
+ // signature in RADOS.
+ //
+ // Add a function for the linger case, where we want better
+ // semantics than Context, but still need to be under the completion_lock.
+ std::variant<std::unique_ptr<OpComp>, fu2::unique_function<OpSig>,
+ Context*> onfinish;
+ uint64_t ontimeout = 0;
+
+ ceph_tid_t tid = 0;
+ int attempts = 0;
+
+ version_t *objver;
+ epoch_t *reply_epoch = nullptr;
+
+ ceph::coarse_mono_time stamp;
+
+ epoch_t map_dne_bound = 0;
+
+ int budget = -1;
+
+ /// true if we should resend this message on failure
+ bool should_resend = true;
+
+ /// true if the throttle budget is acquired and released for a series of
+ /// OPs instead of on a per-OP basis. When this flag is set, the budget is
+ /// acquired before sending the very first OP of the series and
+ /// released upon receiving the last OP reply.
+ bool ctx_budgeted = false;
+
+ int *data_offset;
+
+ osd_reqid_t reqid; // explicitly setting reqid
+ ZTracer::Trace trace;
+
+ static bool has_completion(decltype(onfinish)& f) {
+ return std::visit([](auto&& arg) { return bool(arg);}, f);
+ }
+ bool has_completion() {
+ return has_completion(onfinish);
+ }
+
+ static void complete(decltype(onfinish)&& f, boost::system::error_code ec,
+ int r) {
+ std::visit([ec, r](auto&& arg) {
+ if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
+ Context*>) {
+ arg->complete(r);
+ } else if constexpr (std::is_same_v<std::decay_t<decltype(arg)>,
+ fu2::unique_function<OpSig>>) {
+ std::move(arg)(ec);
+ } else {
+ arg->defer(std::move(arg), ec);
+ }
+ }, std::move(f));
+ }
+ void complete(boost::system::error_code ec, int r) {
+ complete(std::move(onfinish), ec, r);
+ }
+
+ Op(const object_t& o, const object_locator_t& ol, osdc_opvec&& _ops,
+ int f, std::unique_ptr<OpComp>&& fin,
+ version_t *ov, int *offset = nullptr,
+ ZTracer::Trace *parent_trace = nullptr) :
+ target(o, ol, f),
+ ops(std::move(_ops)),
+ out_bl(ops.size(), nullptr),
+ out_handler(ops.size()),
+ out_rval(ops.size(), nullptr),
+ out_ec(ops.size(), nullptr),
+ onfinish(std::move(fin)),
+ objver(ov),
+ data_offset(offset) {
+ if (target.base_oloc.key == o)
+ target.base_oloc.key.clear();
+ if (parent_trace && parent_trace->valid()) {
+ trace.init("op", nullptr, parent_trace);
+ trace.event("start");
+ }
+ }
+
+ Op(const object_t& o, const object_locator_t& ol, osdc_opvec&& _ops,
+ int f, Context* fin, version_t *ov, int *offset = nullptr,
+ ZTracer::Trace *parent_trace = nullptr) :
+ target(o, ol, f),
+ ops(std::move(_ops)),
+ out_bl(ops.size(), nullptr),
+ out_handler(ops.size()),
+ out_rval(ops.size(), nullptr),
+ out_ec(ops.size(), nullptr),
+ onfinish(fin),
+ objver(ov),
+ data_offset(offset) {
+ if (target.base_oloc.key == o)
+ target.base_oloc.key.clear();
+ if (parent_trace && parent_trace->valid()) {
+ trace.init("op", nullptr, parent_trace);
+ trace.event("start");
+ }
+ }
+
+ Op(const object_t& o, const object_locator_t& ol, osdc_opvec&& _ops,
+ int f, fu2::unique_function<OpSig>&& fin, version_t *ov, int *offset = nullptr,
+ ZTracer::Trace *parent_trace = nullptr) :
+ target(o, ol, f),
+ ops(std::move(_ops)),
+ out_bl(ops.size(), nullptr),
+ out_handler(ops.size()),
+ out_rval(ops.size(), nullptr),
+ out_ec(ops.size(), nullptr),
+ onfinish(std::move(fin)),
+ objver(ov),
+ data_offset(offset) {
+ if (target.base_oloc.key == o)
+ target.base_oloc.key.clear();
+ if (parent_trace && parent_trace->valid()) {
+ trace.init("op", nullptr, parent_trace);
+ trace.event("start");
+ }
+ }
+
+ bool operator<(const Op& other) const {
+ return tid < other.tid;
+ }
+
+ private:
+ ~Op() override {
+ trace.event("finish");
+ }
+ };
+
+ struct CB_Op_Map_Latest {
+ Objecter *objecter;
+ ceph_tid_t tid;
+ CB_Op_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t) {}
+ void operator()(boost::system::error_code err, version_t latest, version_t);
+ };
+
+ struct CB_Command_Map_Latest {
+ Objecter *objecter;
+ uint64_t tid;
+ CB_Command_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t) {}
+ void operator()(boost::system::error_code err, version_t latest, version_t);
+ };
+
+ struct C_Stat : public Context {
+ ceph::buffer::list bl;
+ uint64_t *psize;
+ ceph::real_time *pmtime;
+ Context *fin;
+ C_Stat(uint64_t *ps, ceph::real_time *pm, Context *c) :
+ psize(ps), pmtime(pm), fin(c) {}
+ void finish(int r) override {
+ using ceph::decode;
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ uint64_t s;
+ ceph::real_time m;
+ decode(s, p);
+ decode(m, p);
+ if (psize)
+ *psize = s;
+ if (pmtime)
+ *pmtime = m;
+ }
+ fin->complete(r);
+ }
+ };
+
+ struct C_GetAttrs : public Context {
+ ceph::buffer::list bl;
+ std::map<std::string,ceph::buffer::list>& attrset;
+ Context *fin;
+ C_GetAttrs(std::map<std::string, ceph::buffer::list>& set, Context *c) : attrset(set),
+ fin(c) {}
+ void finish(int r) override {
+ using ceph::decode;
+ if (r >= 0) {
+ auto p = bl.cbegin();
+ decode(attrset, p);
+ }
+ fin->complete(r);
+ }
+ };
+
+
+ // Pools and statistics
+ struct NListContext {
+ collection_list_handle_t pos;
+
+ // these are for !sortbitwise compat only
+ int current_pg = 0;
+ int starting_pg_num = 0;
+ bool sort_bitwise = false;
+
+ bool at_end_of_pool = false; ///< publicly visible end flag
+
+ int64_t pool_id = -1;
+ int pool_snap_seq = 0;
+ uint64_t max_entries = 0;
+ std::string nspace;
+
+ ceph::buffer::list bl; // raw data read to here
+ std::list<librados::ListObjectImpl> list;
+
+ ceph::buffer::list filter;
+
+ // The budget associated with this context. Once it is set (>= 0),
+ // the budget is not acquired/released on a per-OP basis; instead it
+ // is acquired before sending the first OP and released upon receiving
+ // the last OP reply.
+ int ctx_budget = -1;
+
+ bool at_end() const {
+ return at_end_of_pool;
+ }
+
+ uint32_t get_pg_hash_position() const {
+ return pos.get_hash();
+ }
+ };
+
+ struct C_NList : public Context {
+ NListContext *list_context;
+ Context *final_finish;
+ Objecter *objecter;
+ epoch_t epoch;
+ C_NList(NListContext *lc, Context * finish, Objecter *ob) :
+ list_context(lc), final_finish(finish), objecter(ob), epoch(0) {}
+ void finish(int r) override {
+ if (r >= 0) {
+ objecter->_nlist_reply(list_context, r, final_finish, epoch);
+ } else {
+ final_finish->complete(r);
+ }
+ }
+ };
+
+ struct PoolStatOp {
+ ceph_tid_t tid;
+ std::vector<std::string> pools;
+ using OpSig = void(boost::system::error_code,
+ boost::container::flat_map<std::string, pool_stat_t>,
+ bool);
+ using OpComp = ceph::async::Completion<OpSig>;
+ std::unique_ptr<OpComp> onfinish;
+ std::uint64_t ontimeout;
+ ceph::coarse_mono_time last_submit;
+ };
+
+ struct StatfsOp {
+ ceph_tid_t tid;
+ boost::optional<int64_t> data_pool;
+ using OpSig = void(boost::system::error_code,
+ const struct ceph_statfs);
+ using OpComp = ceph::async::Completion<OpSig>;
+
+ std::unique_ptr<OpComp> onfinish;
+ uint64_t ontimeout;
+
+ ceph::coarse_mono_time last_submit;
+ };
+
+ struct PoolOp {
+ ceph_tid_t tid = 0;
+ int64_t pool = 0;
+ std::string name;
+ using OpSig = void(boost::system::error_code, ceph::buffer::list);
+ using OpComp = ceph::async::Completion<OpSig>;
+ std::unique_ptr<OpComp> onfinish;
+ uint64_t ontimeout = 0;
+ int pool_op = 0;
+ int16_t crush_rule = 0;
+ snapid_t snapid = 0;
+ ceph::coarse_mono_time last_submit;
+
+ PoolOp() {}
+ };
+
+ // -- osd commands --
+ struct CommandOp : public RefCountedObject {
+ OSDSession *session = nullptr;
+ ceph_tid_t tid = 0;
+ std::vector<std::string> cmd;
+ ceph::buffer::list inbl;
+
+ // target_osd == -1 means target_pg is valid
+ const int target_osd = -1;
+ const pg_t target_pg;
+
+ op_target_t target;
+
+ epoch_t map_dne_bound = 0;
+ int map_check_error = 0; // error to return if the map check fails
+ const char *map_check_error_str = nullptr;
+
+ using OpSig = void(boost::system::error_code, std::string,
+ ceph::buffer::list);
+ using OpComp = ceph::async::Completion<OpSig>;
+ std::unique_ptr<OpComp> onfinish;
+
+ uint64_t ontimeout = 0;
+ ceph::coarse_mono_time last_submit;
+
+ CommandOp(
+ int target_osd,
+ std::vector<string>&& cmd,
+ ceph::buffer::list&& inbl,
+ decltype(onfinish)&& onfinish)
+ : cmd(std::move(cmd)),
+ inbl(std::move(inbl)),
+ target_osd(target_osd),
+ onfinish(std::move(onfinish)) {}
+
+ CommandOp(
+ pg_t pgid,
+ std::vector<string>&& cmd,
+ ceph::buffer::list&& inbl,
+ decltype(onfinish)&& onfinish)
+ : cmd(std::move(cmd)),
+ inbl(std::move(inbl)),
+ target_pg(pgid),
+ target(pgid),
+ onfinish(std::move(onfinish)) {}
+ };
+
+ void submit_command(CommandOp *c, ceph_tid_t *ptid);
+ int _calc_command_target(CommandOp *c,
+ ceph::shunique_lock<ceph::shared_mutex> &sul);
+ void _assign_command_session(CommandOp *c,
+ ceph::shunique_lock<ceph::shared_mutex> &sul);
+ void _send_command(CommandOp *c);
+ int command_op_cancel(OSDSession *s, ceph_tid_t tid,
+ boost::system::error_code ec);
+ void _finish_command(CommandOp *c, boost::system::error_code ec,
+ std::string&& rs, ceph::buffer::list&& bl);
+ void handle_command_reply(MCommandReply *m);
+
+ // -- lingering ops --
+
+ struct LingerOp : public RefCountedObject {
+ Objecter *objecter;
+ uint64_t linger_id{0};
+ op_target_t target{object_t(), object_locator_t(), 0};
+ snapid_t snap{CEPH_NOSNAP};
+ SnapContext snapc;
+ ceph::real_time mtime;
+
+ osdc_opvec ops;
+ ceph::buffer::list inbl;
+ version_t *pobjver{nullptr};
+
+ bool is_watch{false};
+ ceph::coarse_mono_time watch_valid_thru; ///< send time for last acked ping
+ boost::system::error_code last_error; ///< error from last failed ping|reconnect, if any
+ ceph::shared_mutex watch_lock;
+
+ // queue of pending async operations, with the timestamp of
+ // when they were queued.
+ std::list<ceph::coarse_mono_time> watch_pending_async;
+
+ uint32_t register_gen{0};
+ bool registered{false};
+ bool canceled{false};
+ using OpSig = void(boost::system::error_code, ceph::buffer::list);
+ using OpComp = ceph::async::Completion<OpSig>;
+ std::unique_ptr<OpComp> on_reg_commit;
+ std::unique_ptr<OpComp> on_notify_finish;
+ uint64_t notify_id{0};
+
+ fu2::unique_function<void(boost::system::error_code,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ ceph::buffer::list&& bl)> handle;
+ OSDSession *session{nullptr};
+
+ int ctx_budget{-1};
+ ceph_tid_t register_tid{0};
+ ceph_tid_t ping_tid{0};
+ epoch_t map_dne_bound{0};
+
+ void _queued_async() {
+ // watch_lock must be locked unique
+ watch_pending_async.push_back(ceph::coarse_mono_clock::now());
+ }
+ void finished_async() {
+ unique_lock l(watch_lock);
+ ceph_assert(!watch_pending_async.empty());
+ watch_pending_async.pop_front();
+ }
+
+ LingerOp(Objecter *o, uint64_t linger_id);
+ const LingerOp& operator=(const LingerOp& r) = delete;
+ LingerOp(const LingerOp& o) = delete;
+
+ uint64_t get_cookie() {
+ return reinterpret_cast<uint64_t>(this);
+ }
+ };
+
+ struct CB_Linger_Commit {
+ Objecter *objecter;
+ boost::intrusive_ptr<LingerOp> info;
+ ceph::buffer::list outbl; // used for notify only
+ CB_Linger_Commit(Objecter *o, LingerOp *l) : objecter(o), info(l) {}
+ ~CB_Linger_Commit() = default;
+
+ void operator()(boost::system::error_code ec) {
+ objecter->_linger_commit(info.get(), ec, outbl);
+ }
+ };
+
+ struct CB_Linger_Reconnect {
+ Objecter *objecter;
+ boost::intrusive_ptr<LingerOp> info;
+ CB_Linger_Reconnect(Objecter *o, LingerOp *l) : objecter(o), info(l) {}
+ ~CB_Linger_Reconnect() = default;
+
+ void operator()(boost::system::error_code ec) {
+ objecter->_linger_reconnect(info.get(), ec);
+ info.reset();
+ }
+ };
+
+ struct CB_Linger_Ping {
+ Objecter *objecter;
+ boost::intrusive_ptr<LingerOp> info;
+ ceph::coarse_mono_time sent;
+ uint32_t register_gen;
+ CB_Linger_Ping(Objecter *o, LingerOp *l, ceph::coarse_mono_time s)
+ : objecter(o), info(l), sent(s), register_gen(info->register_gen) {}
+ void operator()(boost::system::error_code ec) {
+ objecter->_linger_ping(info.get(), ec, sent, register_gen);
+ info.reset();
+ }
+ };
+
+ struct CB_Linger_Map_Latest {
+ Objecter *objecter;
+ uint64_t linger_id;
+ CB_Linger_Map_Latest(Objecter *o, uint64_t id) : objecter(o), linger_id(id) {}
+ void operator()(boost::system::error_code err, version_t latest, version_t);
+ };
+
+ // -- osd sessions --
+ struct OSDBackoff {
+ spg_t pgid;
+ uint64_t id;
+ hobject_t begin, end;
+ };
+
+ struct OSDSession : public RefCountedObject {
+ // pending ops
+ std::map<ceph_tid_t,Op*> ops;
+ std::map<uint64_t, LingerOp*> linger_ops;
+ std::map<ceph_tid_t,CommandOp*> command_ops;
+
+ // backoffs
+ std::map<spg_t,std::map<hobject_t,OSDBackoff>> backoffs;
+ std::map<uint64_t,OSDBackoff*> backoffs_by_id;
+
+ int osd;
+ // NB locking two sessions at the same time is only safe because
+ // it is only done in _recalc_linger_op_target with s and
+ // linger_op->session, and it holds rwlock for write. We disable
+ // lockdep (using std::shared_mutex) because lockdep doesn't know
+ // that.
+ std::shared_mutex lock;
+
+ int incarnation;
+ ConnectionRef con;
+ int num_locks;
+ std::unique_ptr<std::mutex[]> completion_locks;
+
+ OSDSession(CephContext *cct, int o) :
+ osd(o), incarnation(0), con(NULL),
+ num_locks(cct->_conf->objecter_completion_locks_per_session),
+ completion_locks(new std::mutex[num_locks]) {}
+
+ ~OSDSession() override;
+
+ bool is_homeless() { return (osd == -1); }
+
+ std::unique_lock<std::mutex> get_lock(object_t& oid);
+ };
+ std::map<int,OSDSession*> osd_sessions;
+
+ bool osdmap_full_flag() const;
+ bool osdmap_pool_full(const int64_t pool_id) const;
+
+
+ private:
+
+ /**
+ * Test pg_pool_t::FLAG_FULL on a pool
+ *
+ * @return true if the pool exists and has the flag set, or
+ * the global full flag is set, else false
+ */
+ bool _osdmap_pool_full(const int64_t pool_id) const;
+ bool _osdmap_pool_full(const pg_pool_t &p) const {
+ return p.has_flag(pg_pool_t::FLAG_FULL) && honor_pool_full;
+ }
+ void update_pool_full_map(std::map<int64_t, bool>& pool_full_map);
+
+ std::map<uint64_t, LingerOp*> linger_ops;
+ // we use this just to confirm a cookie is valid before dereferencing the ptr
+ std::set<LingerOp*> linger_ops_set;
+
+ std::map<ceph_tid_t,PoolStatOp*> poolstat_ops;
+ std::map<ceph_tid_t,StatfsOp*> statfs_ops;
+ std::map<ceph_tid_t,PoolOp*> pool_ops;
+ std::atomic<unsigned> num_homeless_ops{0};
+
+ OSDSession* homeless_session = new OSDSession(cct, -1);
+
+
+ // ops waiting for an osdmap with a new pool or confirmation that
+ // the pool does not exist (may be expanded to other uses later)
+ std::map<uint64_t, LingerOp*> check_latest_map_lingers;
+ std::map<ceph_tid_t, Op*> check_latest_map_ops;
+ std::map<ceph_tid_t, CommandOp*> check_latest_map_commands;
+
+ std::map<epoch_t,
+ std::vector<std::pair<std::unique_ptr<OpCompletion>,
+ boost::system::error_code>>> waiting_for_map;
+
+ ceph::timespan mon_timeout;
+ ceph::timespan osd_timeout;
+
+ MOSDOp *_prepare_osd_op(Op *op);
+ void _send_op(Op *op);
+ void _send_op_account(Op *op);
+ void _cancel_linger_op(Op *op);
+ void _finish_op(Op *op, int r);
+ static bool is_pg_changed(
+ int oldprimary,
+ const std::vector<int>& oldacting,
+ int newprimary,
+ const std::vector<int>& newacting,
+ bool any_change=false);
+ enum recalc_op_target_result {
+ RECALC_OP_TARGET_NO_ACTION = 0,
+ RECALC_OP_TARGET_NEED_RESEND,
+ RECALC_OP_TARGET_POOL_DNE,
+ RECALC_OP_TARGET_OSD_DNE,
+ RECALC_OP_TARGET_OSD_DOWN,
+ };
+ bool _osdmap_full_flag() const;
+ bool _osdmap_has_pool_full() const;
+ void _prune_snapc(
+ const mempool::osdmap::map<int64_t, snap_interval_set_t>& new_removed_snaps,
+ Op *op);
+
+ bool target_should_be_paused(op_target_t *op);
+ int _calc_target(op_target_t *t, Connection *con,
+ bool any_change = false);
+ int _map_session(op_target_t *op, OSDSession **s,
+ ceph::shunique_lock<ceph::shared_mutex>& lc);
+
+ void _session_op_assign(OSDSession *s, Op *op);
+ void _session_op_remove(OSDSession *s, Op *op);
+ void _session_linger_op_assign(OSDSession *to, LingerOp *op);
+ void _session_linger_op_remove(OSDSession *from, LingerOp *op);
+ void _session_command_op_assign(OSDSession *to, CommandOp *op);
+ void _session_command_op_remove(OSDSession *from, CommandOp *op);
+
+ int _assign_op_target_session(Op *op, ceph::shunique_lock<ceph::shared_mutex>& lc,
+ bool src_session_locked,
+ bool dst_session_locked);
+ int _recalc_linger_op_target(LingerOp *op,
+ ceph::shunique_lock<ceph::shared_mutex>& lc);
+
+ void _linger_submit(LingerOp *info,
+ ceph::shunique_lock<ceph::shared_mutex>& sul);
+ void _send_linger(LingerOp *info,
+ ceph::shunique_lock<ceph::shared_mutex>& sul);
+ void _linger_commit(LingerOp *info, boost::system::error_code ec,
+ ceph::buffer::list& outbl);
+ void _linger_reconnect(LingerOp *info, boost::system::error_code ec);
+ void _send_linger_ping(LingerOp *info);
+ void _linger_ping(LingerOp *info, boost::system::error_code ec,
+ ceph::coarse_mono_time sent, uint32_t register_gen);
+ boost::system::error_code _normalize_watch_error(boost::system::error_code ec);
+
+ friend class CB_Objecter_GetVersion;
+ friend class CB_DoWatchError;
+public:
+ template<typename CT>
+ auto linger_callback_flush(CT&& ct) {
+ boost::asio::async_completion<CT, void(void)> init(ct);
+ boost::asio::defer(finish_strand, std::move(init.completion_handler));
+ return init.result.get();
+ }
+
+private:
+ void _check_op_pool_dne(Op *op, std::unique_lock<std::shared_mutex> *sl);
+ void _send_op_map_check(Op *op);
+ void _op_cancel_map_check(Op *op);
+ void _check_linger_pool_dne(LingerOp *op, bool *need_unregister);
+ void _send_linger_map_check(LingerOp *op);
+ void _linger_cancel_map_check(LingerOp *op);
+ void _check_command_map_dne(CommandOp *op);
+ void _send_command_map_check(CommandOp *op);
+ void _command_cancel_map_check(CommandOp *op);
+
+ void _kick_requests(OSDSession *session, std::map<uint64_t, LingerOp *>& lresend);
+ void _linger_ops_resend(std::map<uint64_t, LingerOp *>& lresend,
+ std::unique_lock<ceph::shared_mutex>& ul);
+
+ int _get_session(int osd, OSDSession **session,
+ ceph::shunique_lock<ceph::shared_mutex>& sul);
+ void put_session(OSDSession *s);
+ void get_session(OSDSession *s);
+ void _reopen_session(OSDSession *session);
+ void close_session(OSDSession *session);
+
+ void _nlist_reply(NListContext *list_context, int r, Context *final_finish,
+ epoch_t reply_epoch);
+
+ void resend_mon_ops();
+
+ /**
+ * handle a budget for in-flight ops
+ * budget is taken whenever an op goes into the ops map
+ * and returned whenever an op is removed from the map
+ * If throttle_op needs to throttle it will unlock client_lock.
+ */
+ int calc_op_budget(const boost::container::small_vector_base<OSDOp>& ops);
+ void _throttle_op(Op *op, ceph::shunique_lock<ceph::shared_mutex>& sul,
+ int op_size = 0);
+ int _take_op_budget(Op *op, ceph::shunique_lock<ceph::shared_mutex>& sul) {
+ ceph_assert(sul && sul.mutex() == &rwlock);
+ int op_budget = calc_op_budget(op->ops);
+ if (keep_balanced_budget) {
+ _throttle_op(op, sul, op_budget);
+ } else { // update take_linger_budget to match this!
+ op_throttle_bytes.take(op_budget);
+ op_throttle_ops.take(1);
+ }
+ op->budget = op_budget;
+ return op_budget;
+ }
+ int take_linger_budget(LingerOp *info);
+ void put_op_budget_bytes(int op_budget) {
+ ceph_assert(op_budget >= 0);
+ op_throttle_bytes.put(op_budget);
+ op_throttle_ops.put(1);
+ }
+ void put_nlist_context_budget(NListContext *list_context);
+ Throttle op_throttle_bytes{cct, "objecter_bytes",
+ static_cast<int64_t>(
+ cct->_conf->objecter_inflight_op_bytes)};
+ Throttle op_throttle_ops{cct, "objecter_ops",
+ static_cast<int64_t>(
+ cct->_conf->objecter_inflight_ops)};
+ public:
+ Objecter(CephContext *cct, Messenger *m, MonClient *mc,
+ boost::asio::io_context& service);
+ ~Objecter() override;
+
+ void init();
+ void start(const OSDMap *o = nullptr);
+ void shutdown();
+
+ // These two templates replace osdmap_(get)|(put)_read. Simply wrap
+ // whatever functionality you want to use the OSDMap in a lambda like:
+ //
+ // with_osdmap([](const OSDMap& o) { o.do_stuff(); });
+ //
+ // or
+ //
+ // auto t = with_osdmap([&](const OSDMap& o) { return o.lookup_stuff(x); });
+ //
+ // Do not call into something that will try to lock the OSDMap from
+ // here or you will have great woe and misery.
+
+ template<typename Callback, typename...Args>
+ decltype(auto) with_osdmap(Callback&& cb, Args&&... args) {
+ shared_lock l(rwlock);
+ return std::forward<Callback>(cb)(*osdmap, std::forward<Args>(args)...);
+ }
+
+
+ /**
+ * Tell the objecter to throttle outgoing ops according to its
+ * budget (in _conf). If you do this, ops can block, in
+ * which case it will unlock client_lock and sleep until
+ * incoming messages reduce the used budget low enough for
+ * the ops to continue going; then it will lock client_lock again.
+ */
+ void set_balanced_budget() { keep_balanced_budget = true; }
+ void unset_balanced_budget() { keep_balanced_budget = false; }
+
+ void set_honor_pool_full() { honor_pool_full = true; }
+ void unset_honor_pool_full() { honor_pool_full = false; }
+
+ void _scan_requests(
+ OSDSession *s,
+ bool skipped_map,
+ bool cluster_full,
+ std::map<int64_t, bool> *pool_full_map,
+ std::map<ceph_tid_t, Op*>& need_resend,
+ std::list<LingerOp*>& need_resend_linger,
+ std::map<ceph_tid_t, CommandOp*>& need_resend_command,
+ ceph::shunique_lock<ceph::shared_mutex>& sul);
+
+ int64_t get_object_hash_position(int64_t pool, const std::string& key,
+ const std::string& ns);
+ int64_t get_object_pg_hash_position(int64_t pool, const std::string& key,
+ const std::string& ns);
+
+ // messages
+ public:
+ bool ms_dispatch(Message *m) override;
+ bool ms_can_fast_dispatch_any() const override {
+ return true;
+ }
+ bool ms_can_fast_dispatch(const Message *m) const override {
+ switch (m->get_type()) {
+ case CEPH_MSG_OSD_OPREPLY:
+ case CEPH_MSG_WATCH_NOTIFY:
+ return true;
+ default:
+ return false;
+ }
+ }
+ void ms_fast_dispatch(Message *m) override {
+ if (!ms_dispatch(m)) {
+ m->put();
+ }
+ }
+
+ void handle_osd_op_reply(class MOSDOpReply *m);
+ void handle_osd_backoff(class MOSDBackoff *m);
+ void handle_watch_notify(class MWatchNotify *m);
+ void handle_osd_map(class MOSDMap *m);
+ void wait_for_osd_map(epoch_t e=0);
+
+ template<typename CompletionToken>
+ auto wait_for_osd_map(CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, void()> init(token);
+ unique_lock l(rwlock);
+ if (osdmap->get_epoch()) {
+ l.unlock();
+ boost::asio::post(std::move(init.completion_handler));
+ } else {
+ waiting_for_map[0].emplace_back(
+ OpCompletion::create(
+ service.get_executor(),
+ [c = std::move(init.completion_handler)]
+ (boost::system::error_code) mutable {
+ std::move(c)();
+ }), boost::system::error_code{});
+ l.unlock();
+ }
+ return init.result.get();
+ }
+
+
+ /**
+ * Get the list of entities blocklisted since this was last called,
+ * and reset that list.
+ *
+ * Uses a std::set because the typical use case is to compare some
+ * other list of clients to see which overlap with the blocklisted
+ * addrs.
+ *
+ */
+ void consume_blocklist_events(std::set<entity_addr_t> *events);
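+ // Illustrative polling pattern (assumed from the comment above): enable
+ // collection once, then periodically drain the accumulated set and
+ // intersect it with the client addresses of interest.
+ //
+ //   objecter->enable_blocklist_events();
+ //   ...
+ //   std::set<entity_addr_t> events;
+ //   objecter->consume_blocklist_events(&events);
+ //   // `events` now holds addrs blocklisted since the previous call.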
+
+ int pool_snap_by_name(int64_t poolid,
+ const char *snap_name,
+ snapid_t *snap) const;
+ int pool_snap_get_info(int64_t poolid, snapid_t snap,
+ pool_snap_info_t *info) const;
+ int pool_snap_list(int64_t poolid, std::vector<uint64_t> *snaps);
+private:
+
+ void emit_blocklist_events(const OSDMap::Incremental &inc);
+ void emit_blocklist_events(const OSDMap &old_osd_map,
+ const OSDMap &new_osd_map);
+
+ // low-level
+ void _op_submit(Op *op, ceph::shunique_lock<ceph::shared_mutex>& lc,
+ ceph_tid_t *ptid);
+ void _op_submit_with_budget(Op *op,
+ ceph::shunique_lock<ceph::shared_mutex>& lc,
+ ceph_tid_t *ptid,
+ int *ctx_budget = NULL);
+ // public interface
+public:
+ void op_submit(Op *op, ceph_tid_t *ptid = NULL, int *ctx_budget = NULL);
+ bool is_active() {
+ std::shared_lock l(rwlock);
+ return !((!inflight_ops) && linger_ops.empty() &&
+ poolstat_ops.empty() && statfs_ops.empty());
+ }
+
+ /**
+ * Output in-flight requests
+ */
+ void _dump_active(OSDSession *s);
+ void _dump_active();
+ void dump_active();
+ void dump_requests(ceph::Formatter *fmt);
+ void _dump_ops(const OSDSession *s, ceph::Formatter *fmt);
+ void dump_ops(ceph::Formatter *fmt);
+ void _dump_linger_ops(const OSDSession *s, ceph::Formatter *fmt);
+ void dump_linger_ops(ceph::Formatter *fmt);
+ void _dump_command_ops(const OSDSession *s, ceph::Formatter *fmt);
+ void dump_command_ops(ceph::Formatter *fmt);
+ void dump_pool_ops(ceph::Formatter *fmt) const;
+ void dump_pool_stat_ops(ceph::Formatter *fmt) const;
+ void dump_statfs_ops(ceph::Formatter *fmt) const;
+
+ int get_client_incarnation() const { return client_inc; }
+ void set_client_incarnation(int inc) { client_inc = inc; }
+
+ bool have_map(epoch_t epoch);
+
+ struct CB_Objecter_GetVersion {
+ Objecter *objecter;
+ std::unique_ptr<OpCompletion> fin;
+
+ CB_Objecter_GetVersion(Objecter *o, std::unique_ptr<OpCompletion> c)
+ : objecter(o), fin(std::move(c)) {}
+ void operator()(boost::system::error_code ec, version_t newest,
+ version_t oldest) {
+ if (ec == boost::system::errc::resource_unavailable_try_again) {
+ // try again as instructed
+ objecter->_wait_for_latest_osdmap(std::move(*this));
+ } else if (ec) {
+ ceph::async::post(std::move(fin), ec);
+ } else {
+ auto l = std::unique_lock(objecter->rwlock);
+ objecter->_get_latest_version(oldest, newest, std::move(fin),
+ std::move(l));
+ }
+ }
+ };
+
+ template<typename CompletionToken>
+ auto wait_for_map(epoch_t epoch, CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, OpSignature> init(token);
+
+ if (osdmap->get_epoch() >= epoch) {
+ boost::asio::post(service,
+ ceph::async::bind_handler(
+ std::move(init.completion_handler),
+ boost::system::error_code()));
+ } else {
+ monc->get_version("osdmap",
+ CB_Objecter_GetVersion(
+ this,
+ OpCompletion::create(service.get_executor(),
+ std::move(init.completion_handler))));
+ }
+ return init.result.get();
+ }
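+ // Illustrative sketch: any Boost.Asio completion token matching the
+ // OpSignature works here; a lambda and use_future are shown as assumed
+ // examples.
+ //
+ //   objecter->wait_for_map(epoch,
+ //       [](boost::system::error_code ec) { /* map is now >= epoch */ });
+ //   // or: auto f = objecter->wait_for_map(epoch, boost::asio::use_future);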
+
+ void _wait_for_new_map(std::unique_ptr<OpCompletion>, epoch_t epoch,
+ boost::system::error_code = {});
+
+private:
+ void _wait_for_latest_osdmap(CB_Objecter_GetVersion&& c) {
+ monc->get_version("osdmap", std::move(c));
+ }
+
+public:
+
+ template<typename CompletionToken>
+ auto wait_for_latest_osdmap(CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, OpSignature> init(token);
+
+ monc->get_version("osdmap",
+ CB_Objecter_GetVersion(
+ this,
+ OpCompletion::create(service.get_executor(),
+ std::move(init.completion_handler))));
+ return init.result.get();
+ }
+
+ void wait_for_latest_osdmap(std::unique_ptr<OpCompletion> c) {
+ monc->get_version("osdmap",
+ CB_Objecter_GetVersion(this, std::move(c)));
+ }
+
+ template<typename CompletionToken>
+ auto get_latest_version(epoch_t oldest, epoch_t newest,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, OpSignature> init(token);
+ {
+ std::unique_lock wl(rwlock);
+ _get_latest_version(oldest, newest,
+ OpCompletion::create(
+ service.get_executor(),
+ std::move(init.completion_handler)),
+ std::move(wl));
+ }
+ return init.result.get();
+ }
+
+ void _get_latest_version(epoch_t oldest, epoch_t newest,
+ std::unique_ptr<OpCompletion> fin,
+ std::unique_lock<ceph::shared_mutex>&& ul);
+
+ /** Get the current set of global op flags */
+ int get_global_op_flags() const { return global_op_flags; }
+ /** Add a flag to the global op flags; not a truly atomic operation */
+ void add_global_op_flags(int flag) {
+ global_op_flags.fetch_or(flag);
+ }
+ /** Clear the passed flags from the global op flag set */
+ void clear_global_op_flag(int flags) {
+ global_op_flags.fetch_and(~flags);
+ }
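+ // Illustrative sketch (flag name assumed from the standard RADOS flag
+ // set): apply balanced reads to every subsequent op from this Objecter.
+ //
+ //   objecter->add_global_op_flags(CEPH_OSD_FLAG_BALANCE_READS);
+ //   ...
+ //   objecter->clear_global_op_flag(CEPH_OSD_FLAG_BALANCE_READS);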
+
+ /// cancel an in-progress request with the given return code
+private:
+ int op_cancel(OSDSession *s, ceph_tid_t tid, int r);
+ int _op_cancel(ceph_tid_t tid, int r);
+public:
+ int op_cancel(ceph_tid_t tid, int r);
+ int op_cancel(const std::vector<ceph_tid_t>& tidls, int r);
+
+ /**
+ * Any write op which is in progress at the start of this call shall no
+ * longer be in progress when this call ends. Operations started after the
+ * start of this call may still be in progress when this call ends.
+ *
+ * @return the latest possible epoch in which a cancelled op could have
+ * existed, or -1 if nothing was cancelled.
+ */
+ epoch_t op_cancel_writes(int r, int64_t pool=-1);
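+ // Illustrative sketch: fail all in-flight writes with -ECANCELED and keep
+ // the newest epoch in which a cancelled op could have existed.
+ //
+ //   epoch_t e = objecter->op_cancel_writes(-ECANCELED);
+ //   // per the comment above, -1 means nothing was actually cancelled.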
+
+ // commands
+ void osd_command(int osd, std::vector<std::string> cmd,
+ ceph::buffer::list inbl, ceph_tid_t *ptid,
+ decltype(CommandOp::onfinish)&& onfinish) {
+ ceph_assert(osd >= 0);
+ auto c = new CommandOp(
+ osd,
+ std::move(cmd),
+ std::move(inbl),
+ std::move(onfinish));
+ submit_command(c, ptid);
+ }
+ template<typename CompletionToken>
+ auto osd_command(int osd, std::vector<std::string> cmd,
+ ceph::buffer::list inbl, ceph_tid_t *ptid,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken,
+ CommandOp::OpSig> init(token);
+ osd_command(osd, std::move(cmd), std::move(inbl), ptid,
+ CommandOp::OpComp::create(service.get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ void pg_command(pg_t pgid, std::vector<std::string> cmd,
+ ceph::buffer::list inbl, ceph_tid_t *ptid,
+ decltype(CommandOp::onfinish)&& onfinish) {
+ auto *c = new CommandOp(
+ pgid,
+ std::move(cmd),
+ std::move(inbl),
+ std::move(onfinish));
+ submit_command(c, ptid);
+ }
+
+ template<typename CompletionToken>
+ auto pg_command(pg_t pgid, std::vector<std::string> cmd,
+ ceph::buffer::list inbl, ceph_tid_t *ptid,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken,
+ CommandOp::OpSig> init(token);
+ pg_command(pgid, std::move(cmd), std::move(inbl), ptid,
+ CommandOp::OpComp::create(service.get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+
+ // mid-level helpers
+ Op *prepare_mutate_op(
+ const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation& op, const SnapContext& snapc,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ osd_reqid_t reqid = osd_reqid_t(),
+ ZTracer::Trace *parent_trace = nullptr) {
+ Op *o = new Op(oid, oloc, std::move(op.ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver,
+ nullptr, parent_trace);
+ o->priority = op.priority;
+ o->mtime = mtime;
+ o->snapc = snapc;
+ o->out_rval.swap(op.out_rval);
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_ec.swap(op.out_ec);
+ o->reqid = reqid;
+ op.clear();
+ return o;
+ }
+ ceph_tid_t mutate(
+ const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation& op, const SnapContext& snapc,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ osd_reqid_t reqid = osd_reqid_t()) {
+ Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags,
+ oncommit, objver, reqid);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ void mutate(const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation&& op, const SnapContext& snapc,
+ ceph::real_time mtime, int flags,
+ std::unique_ptr<Op::OpComp>&& oncommit,
+ version_t *objver = NULL, osd_reqid_t reqid = osd_reqid_t(),
+ ZTracer::Trace *parent_trace = nullptr) {
+ Op *o = new Op(oid, oloc, std::move(op.ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, std::move(oncommit), objver,
+ nullptr, parent_trace);
+ o->priority = op.priority;
+ o->mtime = mtime;
+ o->snapc = snapc;
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_rval.swap(op.out_rval);
+ o->out_ec.swap(op.out_ec);
+ o->reqid = reqid;
+ op.clear();
+ op_submit(o);
+ }
+
+ Op *prepare_read_op(
+ const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation& op,
+ snapid_t snapid, ceph::buffer::list *pbl, int flags,
+ Context *onack, version_t *objver = NULL,
+ int *data_offset = NULL,
+ uint64_t features = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
+ Op *o = new Op(oid, oloc, std::move(op.ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onack, objver,
+ data_offset, parent_trace);
+ o->priority = op.priority;
+ o->snapid = snapid;
+ o->outbl = pbl;
+ if (!o->outbl && op.size() == 1 && op.out_bl[0] && op.out_bl[0]->length())
+ o->outbl = op.out_bl[0];
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_rval.swap(op.out_rval);
+ o->out_ec.swap(op.out_ec);
+ op.clear();
+ return o;
+ }
+ ceph_tid_t read(
+ const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation& op,
+ snapid_t snapid, ceph::buffer::list *pbl, int flags,
+ Context *onack, version_t *objver = NULL,
+ int *data_offset = NULL,
+ uint64_t features = 0) {
+ Op *o = prepare_read_op(oid, oloc, op, snapid, pbl, flags, onack, objver,
+ data_offset);
+ if (features)
+ o->features = features;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ void read(const object_t& oid, const object_locator_t& oloc,
+ ObjectOperation&& op, snapid_t snapid, ceph::buffer::list *pbl,
+ int flags, std::unique_ptr<Op::OpComp>&& onack,
+ version_t *objver = nullptr, int *data_offset = nullptr,
+ uint64_t features = 0, ZTracer::Trace *parent_trace = nullptr) {
+ Op *o = new Op(oid, oloc, std::move(op.ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, std::move(onack), objver,
+ data_offset, parent_trace);
+ o->priority = op.priority;
+ o->snapid = snapid;
+ o->outbl = pbl;
+ // no output buffer supplied by the caller: fall back to the single
+ // per-op out_bl (same behaviour as prepare_read_op() above)
+ if (!o->outbl && op.size() == 1 && op.out_bl[0] && op.out_bl[0]->length()) {
+ o->outbl = op.out_bl[0];
+ }
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_rval.swap(op.out_rval);
+ o->out_ec.swap(op.out_ec);
+ if (features)
+ o->features = features;
+ op.clear();
+ op_submit(o);
+ }
+
+
+ Op *prepare_pg_read_op(
+ uint32_t hash, object_locator_t oloc,
+ ObjectOperation& op, ceph::buffer::list *pbl, int flags,
+ Context *onack, epoch_t *reply_epoch,
+ int *ctx_budget) {
+ Op *o = new Op(object_t(), oloc,
+ std::move(op.ops),
+ flags | global_op_flags | CEPH_OSD_FLAG_READ |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY,
+ onack, NULL);
+ o->target.precalc_pgid = true;
+ o->target.base_pgid = pg_t(hash, oloc.pool);
+ o->priority = op.priority;
+ o->snapid = CEPH_NOSNAP;
+ o->outbl = pbl;
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_rval.swap(op.out_rval);
+ o->out_ec.swap(op.out_ec);
+ o->reply_epoch = reply_epoch;
+ if (ctx_budget) {
+ // budget is tracked by listing context
+ o->ctx_budgeted = true;
+ }
+ op.clear();
+ return o;
+ }
+ ceph_tid_t pg_read(
+ uint32_t hash, object_locator_t oloc,
+ ObjectOperation& op, ceph::buffer::list *pbl, int flags,
+ Context *onack, epoch_t *reply_epoch,
+ int *ctx_budget) {
+ Op *o = prepare_pg_read_op(hash, oloc, op, pbl, flags,
+ onack, reply_epoch, ctx_budget);
+ ceph_tid_t tid;
+ op_submit(o, &tid, ctx_budget);
+ return tid;
+ }
+
+ ceph_tid_t pg_read(
+ uint32_t hash, object_locator_t oloc,
+ ObjectOperation& op, ceph::buffer::list *pbl, int flags,
+ std::unique_ptr<Op::OpComp>&& onack, epoch_t *reply_epoch, int *ctx_budget) {
+ ceph_tid_t tid;
+ Op *o = new Op(object_t(), oloc,
+ std::move(op.ops),
+ flags | global_op_flags | CEPH_OSD_FLAG_READ |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY,
+ std::move(onack), nullptr);
+ o->target.precalc_pgid = true;
+ o->target.base_pgid = pg_t(hash, oloc.pool);
+ o->priority = op.priority;
+ o->snapid = CEPH_NOSNAP;
+ o->outbl = pbl;
+ o->out_bl.swap(op.out_bl);
+ o->out_handler.swap(op.out_handler);
+ o->out_rval.swap(op.out_rval);
+ o->out_ec.swap(op.out_ec);
+ o->reply_epoch = reply_epoch;
+ if (ctx_budget) {
+ // budget is tracked by listing context
+ o->ctx_budgeted = true;
+ }
+ op_submit(o, &tid, ctx_budget);
+ op.clear();
+ return tid;
+ }
+
+ // caller owns a ref
+ LingerOp *linger_register(const object_t& oid, const object_locator_t& oloc,
+ int flags);
+ ceph_tid_t linger_watch(LingerOp *info,
+ ObjectOperation& op,
+ const SnapContext& snapc, ceph::real_time mtime,
+ ceph::buffer::list& inbl,
+ decltype(info->on_reg_commit)&& oncommit,
+ version_t *objver);
+ ceph_tid_t linger_watch(LingerOp *info,
+ ObjectOperation& op,
+ const SnapContext& snapc, ceph::real_time mtime,
+ ceph::buffer::list& inbl,
+ Context* onfinish,
+ version_t *objver) {
+ return linger_watch(info, op, snapc, mtime, inbl,
+ OpContextVert<ceph::buffer::list>(onfinish, nullptr), objver);
+ }
+ ceph_tid_t linger_notify(LingerOp *info,
+ ObjectOperation& op,
+ snapid_t snap, ceph::buffer::list& inbl,
+ decltype(LingerOp::on_reg_commit)&& onfinish,
+ version_t *objver);
+ ceph_tid_t linger_notify(LingerOp *info,
+ ObjectOperation& op,
+ snapid_t snap, ceph::buffer::list& inbl,
+ ceph::buffer::list *poutbl,
+ Context* onack,
+ version_t *objver) {
+ return linger_notify(info, op, snap, inbl,
+ OpContextVert(onack, poutbl),
+ objver);
+ }
+ tl::expected<ceph::timespan,
+ boost::system::error_code> linger_check(LingerOp *info);
+ void linger_cancel(LingerOp *info); // releases a reference
+ void _linger_cancel(LingerOp *info);
+
+ void _do_watch_notify(boost::intrusive_ptr<LingerOp> info,
+ boost::intrusive_ptr<MWatchNotify> m);
+
+ /**
+ * set up initial ops in the op vector and allocate slots for the final ops.
+ *
+ * The caller is responsible for filling in the final ops_count ops.
+ *
+ * @param ops op vector
+ * @param ops_count number of final ops the caller will fill in
+ * @param extra_ops optional initial op(s) to copy in first
+ * @return index of the first final op (for the caller to fill in)
+ */
+ int init_ops(boost::container::small_vector_base<OSDOp>& ops, int ops_count,
+ ObjectOperation *extra_ops) {
+ int i;
+ int extra = 0;
+
+ if (extra_ops)
+ extra = extra_ops->ops.size();
+
+ ops.resize(ops_count + extra);
+
+ for (i=0; i<extra; i++) {
+ ops[i] = extra_ops->ops[i];
+ }
+
+ return i;
+ }
+
+
+ // high-level helpers
+ Op *prepare_stat_op(
+ const object_t& oid, const object_locator_t& oloc,
+ snapid_t snap, uint64_t *psize, ceph::real_time *pmtime,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_STAT;
+ C_Stat *fin = new C_Stat(psize, pmtime, onfinish);
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, fin, objver);
+ o->snapid = snap;
+ o->outbl = &fin->bl;
+ return o;
+ }
+ ceph_tid_t stat(
+ const object_t& oid, const object_locator_t& oloc,
+ snapid_t snap, uint64_t *psize, ceph::real_time *pmtime,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ Op *o = prepare_stat_op(oid, oloc, snap, psize, pmtime, flags,
+ onfinish, objver, extra_ops);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ Op *prepare_read_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snap, ceph::buffer::list *pbl,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_READ;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver,
+ nullptr, parent_trace);
+ o->snapid = snap;
+ o->outbl = pbl;
+ return o;
+ }
+ ceph_tid_t read(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snap, ceph::buffer::list *pbl,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ Op *o = prepare_read_op(oid, oloc, off, len, snap, pbl, flags,
+ onfinish, objver, extra_ops, op_flags);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
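+
+ // usage sketch (illustrative; objecter, oid, oloc, len and bl are assumed
+ // caller state):
+ //   C_SaferCond cond;
+ //   objecter->read(oid, oloc, 0 /*off*/, len, CEPH_NOSNAP, &bl, 0 /*flags*/,
+ //                  &cond);
+ //   int r = cond.wait();   // negative errno on failure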
+
+ Op *prepare_cmpext_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, ceph::buffer::list &cmp_bl,
+ snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_CMPEXT;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = cmp_bl.length();
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ ops[i].indata = cmp_bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver);
+ o->snapid = snap;
+ return o;
+ }
+
+ ceph_tid_t cmpext(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, ceph::buffer::list &cmp_bl,
+ snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ Op *o = prepare_cmpext_op(oid, oloc, off, cmp_bl, snap,
+ flags, onfinish, objver, extra_ops, op_flags);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snap,
+ ceph::buffer::list *pbl, int flags, uint64_t trunc_size,
+ __u32 trunc_seq, Context *onfinish,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_READ;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = trunc_size;
+ ops[i].op.extent.truncate_seq = trunc_seq;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver);
+ o->snapid = snap;
+ o->outbl = pbl;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t mapext(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, snapid_t snap, ceph::buffer::list *pbl,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_MAPEXT;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver);
+ o->snapid = snap;
+ o->outbl = pbl;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t getxattr(const object_t& oid, const object_locator_t& oloc,
+ const char *name, snapid_t snap, ceph::buffer::list *pbl, int flags,
+ Context *onfinish,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_GETXATTR;
+ ops[i].op.xattr.name_len = (name ? strlen(name) : 0);
+ ops[i].op.xattr.value_len = 0;
+ if (name)
+ ops[i].indata.append(name, ops[i].op.xattr.name_len);
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver);
+ o->snapid = snap;
+ o->outbl = pbl;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ ceph_tid_t getxattrs(const object_t& oid, const object_locator_t& oloc,
+ snapid_t snap, std::map<std::string,ceph::buffer::list>& attrset,
+ int flags, Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_GETXATTRS;
+ C_GetAttrs *fin = new C_GetAttrs(attrset, onfinish);
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, fin, objver);
+ o->snapid = snap;
+ o->outbl = &fin->bl;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ ceph_tid_t read_full(const object_t& oid, const object_locator_t& oloc,
+ snapid_t snap, ceph::buffer::list *pbl, int flags,
+ Context *onfinish, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags |
+ CEPH_OSD_FLAG_READ, onfinish, objver, extra_ops);
+ }
+
+
+ // writes
+ ceph_tid_t _modify(const object_t& oid, const object_locator_t& oloc,
+ osdc_opvec& ops,
+ ceph::real_time mtime,
+ const SnapContext& snapc, int flags,
+ Context *oncommit,
+ version_t *objver = NULL) {
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ Op *prepare_write_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0,
+ ZTracer::Trace *parent_trace = nullptr) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_WRITE;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ ops[i].indata = bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, std::move(oncommit), objver,
+ nullptr, parent_trace);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ return o;
+ }
+ ceph_tid_t write(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ Op *o = prepare_write_op(oid, oloc, off, len, snapc, bl, mtime, flags,
+ oncommit, objver, extra_ops, op_flags);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
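+
+ // usage sketch (illustrative; snapc and bl supplied by the caller):
+ //   C_SaferCond cond;
+ //   objecter->write(oid, oloc, 0 /*off*/, bl.length(), snapc, bl,
+ //                   ceph::real_clock::now(), 0 /*flags*/, &cond);
+ //   int r = cond.wait();   // negative errno on failure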
+ Op *prepare_append_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t len, const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_APPEND;
+ ops[i].op.extent.offset = 0;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = 0;
+ ops[i].op.extent.truncate_seq = 0;
+ ops[i].indata = bl;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ return o;
+ }
+ ceph_tid_t append(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t len, const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ Op *o = prepare_append_op(oid, oloc, len, snapc, bl, mtime, flags,
+ oncommit, objver, extra_ops);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
+ uint64_t trunc_size, __u32 trunc_seq,
+ Context *oncommit,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_WRITE;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ ops[i].op.extent.truncate_size = trunc_size;
+ ops[i].op.extent.truncate_seq = trunc_seq;
+ ops[i].indata = bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ Op *prepare_write_full_op(
+ const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, const ceph::buffer::list &bl,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_WRITEFULL;
+ ops[i].op.extent.offset = 0;
+ ops[i].op.extent.length = bl.length();
+ ops[i].indata = bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ return o;
+ }
+ ceph_tid_t write_full(
+ const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, const ceph::buffer::list &bl,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+ Op *o = prepare_write_full_op(oid, oloc, snapc, bl, mtime, flags,
+ oncommit, objver, extra_ops, op_flags);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ Op *prepare_writesame_op(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t write_len, uint64_t off,
+ const SnapContext& snapc, const ceph::buffer::list &bl,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_WRITESAME;
+ ops[i].op.writesame.offset = off;
+ ops[i].op.writesame.length = write_len;
+ ops[i].op.writesame.data_length = bl.length();
+ ops[i].indata = bl;
+ ops[i].op.flags = op_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ return o;
+ }
+ ceph_tid_t writesame(
+ const object_t& oid, const object_locator_t& oloc,
+ uint64_t write_len, uint64_t off,
+ const SnapContext& snapc, const ceph::buffer::list &bl,
+ ceph::real_time mtime, int flags,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL, int op_flags = 0) {
+
+ Op *o = prepare_writesame_op(oid, oloc, write_len, off, snapc, bl,
+ mtime, flags, oncommit, objver,
+ extra_ops, op_flags);
+
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t trunc(const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, ceph::real_time mtime, int flags,
+ uint64_t trunc_size, __u32 trunc_seq,
+ Context *oncommit, version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_TRUNCATE;
+ ops[i].op.extent.offset = trunc_size;
+ ops[i].op.extent.truncate_size = trunc_size;
+ ops[i].op.extent.truncate_seq = trunc_seq;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t zero(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len, const SnapContext& snapc,
+ ceph::real_time mtime, int flags, Context *oncommit,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_ZERO;
+ ops[i].op.extent.offset = off;
+ ops[i].op.extent.length = len;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t rollback_object(const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, snapid_t snapid,
+ ceph::real_time mtime, Context *oncommit,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_ROLLBACK;
+ ops[i].op.snap.snapid = snapid;
+ Op *o = new Op(oid, oloc, std::move(ops), CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t create(const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, ceph::real_time mtime, int global_flags,
+ int create_flags, Context *oncommit,
+ version_t *objver = NULL,
+ ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_CREATE;
+ ops[i].op.flags = create_flags;
+ Op *o = new Op(oid, oloc, std::move(ops), global_flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ Op *prepare_remove_op(
+ const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_DELETE;
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ return o;
+ }
+ ceph_tid_t remove(
+ const object_t& oid, const object_locator_t& oloc,
+ const SnapContext& snapc, ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ Op *o = prepare_remove_op(oid, oloc, snapc, mtime, flags,
+ oncommit, objver, extra_ops);
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ ceph_tid_t setxattr(const object_t& oid, const object_locator_t& oloc,
+ const char *name, const SnapContext& snapc, const ceph::buffer::list &bl,
+ ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_SETXATTR;
+ ops[i].op.xattr.name_len = (name ? strlen(name) : 0);
+ ops[i].op.xattr.value_len = bl.length();
+ if (name)
+ ops[i].indata.append(name, ops[i].op.xattr.name_len);
+ ops[i].indata.append(bl);
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit,
+ objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+ ceph_tid_t removexattr(const object_t& oid, const object_locator_t& oloc,
+ const char *name, const SnapContext& snapc,
+ ceph::real_time mtime, int flags,
+ Context *oncommit,
+ version_t *objver = NULL, ObjectOperation *extra_ops = NULL) {
+ osdc_opvec ops;
+ int i = init_ops(ops, 1, extra_ops);
+ ops[i].op.op = CEPH_OSD_OP_RMXATTR;
+ ops[i].op.xattr.name_len = (name ? strlen(name) : 0);
+ ops[i].op.xattr.value_len = 0;
+ if (name)
+ ops[i].indata.append(name, ops[i].op.xattr.name_len);
+ Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
+ CEPH_OSD_FLAG_WRITE, oncommit, objver);
+ o->mtime = mtime;
+ o->snapc = snapc;
+ ceph_tid_t tid;
+ op_submit(o, &tid);
+ return tid;
+ }
+
+ void list_nobjects(NListContext *p, Context *onfinish);
+ uint32_t list_nobjects_seek(NListContext *p, uint32_t pos);
+ uint32_t list_nobjects_seek(NListContext *list_context, const hobject_t& c);
+ void list_nobjects_get_cursor(NListContext *list_context, hobject_t *c);
+
+ hobject_t enumerate_objects_begin();
+ hobject_t enumerate_objects_end();
+
+ template<typename T>
+ friend struct EnumerationContext;
+ template<typename T>
+ friend struct CB_EnumerateReply;
+ template<typename T>
+ void enumerate_objects(
+ int64_t pool_id,
+ std::string_view ns,
+ hobject_t start,
+ hobject_t end,
+ const uint32_t max,
+ const ceph::buffer::list& filter_bl,
+ fu2::unique_function<void(boost::system::error_code,
+ std::vector<T>,
+ hobject_t) &&> on_finish);
+ template<typename T>
+ void _issue_enumerate(hobject_t start,
+ std::unique_ptr<EnumerationContext<T>>);
+ template<typename T>
+ void _enumerate_reply(
+ ceph::buffer::list&& bl,
+ boost::system::error_code ec,
+ std::unique_ptr<EnumerationContext<T>>&& ectx);
+
+ // -------------------------
+ // pool ops
+private:
+ void pool_op_submit(PoolOp *op);
+ void _pool_op_submit(PoolOp *op);
+ void _finish_pool_op(PoolOp *op, int r);
+ void _do_delete_pool(int64_t pool,
+ decltype(PoolOp::onfinish)&& onfinish);
+
+public:
+ void create_pool_snap(int64_t pool, std::string_view snapName,
+ decltype(PoolOp::onfinish)&& onfinish);
+ void create_pool_snap(int64_t pool, std::string_view snapName,
+ Context* c) {
+ create_pool_snap(pool, snapName,
+ OpContextVert<ceph::buffer::list>(c, nullptr));
+ }
+ void allocate_selfmanaged_snap(int64_t pool,
+ std::unique_ptr<ceph::async::Completion<
+ void(boost::system::error_code,
+ snapid_t)>> onfinish);
+ void allocate_selfmanaged_snap(int64_t pool, snapid_t* psnapid,
+ Context* c) {
+ allocate_selfmanaged_snap(pool,
+ OpContextVert(c, psnapid));
+ }
+ void delete_pool_snap(int64_t pool, std::string_view snapName,
+ decltype(PoolOp::onfinish)&& onfinish);
+ void delete_pool_snap(int64_t pool, std::string_view snapName,
+ Context* c) {
+ delete_pool_snap(pool, snapName,
+ OpContextVert<ceph::buffer::list>(c, nullptr));
+ }
+
+ void delete_selfmanaged_snap(int64_t pool, snapid_t snap,
+ decltype(PoolOp::onfinish)&& onfinish);
+ void delete_selfmanaged_snap(int64_t pool, snapid_t snap,
+ Context* c) {
+ delete_selfmanaged_snap(pool, snap,
+ OpContextVert<ceph::buffer::list>(c, nullptr));
+ }
+
+
+ void create_pool(std::string_view name,
+ decltype(PoolOp::onfinish)&& onfinish,
+ int crush_rule=-1);
+ void create_pool(std::string_view name, Context *onfinish,
+ int crush_rule=-1) {
+ create_pool(name,
+ OpContextVert<ceph::buffer::list>(onfinish, nullptr),
+ crush_rule);
+ }
+ void delete_pool(int64_t pool,
+ decltype(PoolOp::onfinish)&& onfinish);
+ void delete_pool(int64_t pool,
+ Context* onfinish) {
+ delete_pool(pool, OpContextVert<ceph::buffer::list>(onfinish, nullptr));
+ }
+
+ void delete_pool(std::string_view name,
+ decltype(PoolOp::onfinish)&& onfinish);
+
+ void delete_pool(std::string_view name,
+ Context* onfinish) {
+ delete_pool(name, OpContextVert<ceph::buffer::list>(onfinish, nullptr));
+ }
+
+ void handle_pool_op_reply(MPoolOpReply *m);
+ int pool_op_cancel(ceph_tid_t tid, int r);
+
+ // --------------------------
+ // pool stats
+private:
+ void _poolstat_submit(PoolStatOp *op);
+public:
+ void handle_get_pool_stats_reply(MGetPoolStatsReply *m);
+ void get_pool_stats(const std::vector<std::string>& pools,
+ decltype(PoolStatOp::onfinish)&& onfinish);
+ template<typename CompletionToken>
+ auto get_pool_stats(const std::vector<std::string>& pools,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken,
+ PoolStatOp::OpSig> init(token);
+ get_pool_stats(pools,
+ PoolStatOp::OpComp::create(
+ service.get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
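+ // usage sketch (illustrative): any Boost.Asio completion token should be
+ // usable here; e.g., assuming boost::asio::use_future is acceptable for
+ // PoolStatOp::OpSig:
+ //   auto fut = objecter->get_pool_stats(pools, boost::asio::use_future);
+ //   auto result = fut.get();   // throws on error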
+ int pool_stat_op_cancel(ceph_tid_t tid, int r);
+ void _finish_pool_stat_op(PoolStatOp *op, int r);
+
+ // ---------------------------
+ // df stats
+private:
+ void _fs_stats_submit(StatfsOp *op);
+public:
+ void handle_fs_stats_reply(MStatfsReply *m);
+ void get_fs_stats(boost::optional<int64_t> poolid,
+ decltype(StatfsOp::onfinish)&& onfinish);
+ template<typename CompletionToken>
+ auto get_fs_stats(boost::optional<int64_t> poolid,
+ CompletionToken&& token) {
+ boost::asio::async_completion<CompletionToken, StatfsOp::OpSig> init(token);
+ get_fs_stats(poolid,
+ StatfsOp::OpComp::create(service.get_executor(),
+ std::move(init.completion_handler)));
+ return init.result.get();
+ }
+ void get_fs_stats(struct ceph_statfs& result, boost::optional<int64_t> poolid,
+ Context *onfinish) {
+ get_fs_stats(poolid, OpContextVert(onfinish, result));
+ }
+ int statfs_op_cancel(ceph_tid_t tid, int r);
+ void _finish_statfs_op(StatfsOp *op, int r);
+
+ // ---------------------------
+ // some scatter/gather hackery
+
+ void _sg_read_finish(std::vector<ObjectExtent>& extents,
+ std::vector<ceph::buffer::list>& resultbl,
+ ceph::buffer::list *bl, Context *onfinish);
+
+ struct C_SGRead : public Context {
+ Objecter *objecter;
+ std::vector<ObjectExtent> extents;
+ std::vector<ceph::buffer::list> resultbl;
+ ceph::buffer::list *bl;
+ Context *onfinish;
+ C_SGRead(Objecter *ob,
+ std::vector<ObjectExtent>& e, std::vector<ceph::buffer::list>& r, ceph::buffer::list *b,
+ Context *c) :
+ objecter(ob), bl(b), onfinish(c) {
+ extents.swap(e);
+ resultbl.swap(r);
+ }
+ void finish(int r) override {
+ objecter->_sg_read_finish(extents, resultbl, bl, onfinish);
+ }
+ };
+
+ void sg_read_trunc(std::vector<ObjectExtent>& extents, snapid_t snap,
+ ceph::buffer::list *bl, int flags, uint64_t trunc_size,
+ __u32 trunc_seq, Context *onfinish, int op_flags = 0) {
+ if (extents.size() == 1) {
+ read_trunc(extents[0].oid, extents[0].oloc, extents[0].offset,
+ extents[0].length, snap, bl, flags, extents[0].truncate_size,
+ trunc_seq, onfinish, 0, 0, op_flags);
+ } else {
+ C_GatherBuilder gather(cct);
+ std::vector<ceph::buffer::list> resultbl(extents.size());
+ int i=0;
+ for (auto p = extents.begin(); p != extents.end(); ++p) {
+ read_trunc(p->oid, p->oloc, p->offset, p->length, snap, &resultbl[i++],
+ flags, p->truncate_size, trunc_seq, gather.new_sub(),
+ 0, 0, op_flags);
+ }
+ gather.set_finisher(new C_SGRead(this, extents, resultbl, bl, onfinish));
+ gather.activate();
+ }
+ }
+
+ void sg_read(std::vector<ObjectExtent>& extents, snapid_t snap, ceph::buffer::list *bl,
+ int flags, Context *onfinish, int op_flags = 0) {
+ sg_read_trunc(extents, snap, bl, flags, 0, 0, onfinish, op_flags);
+ }
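+
+ // usage sketch (illustrative): a striped read is normally driven via
+ // Striper (src/osdc/Striper.h); cct, ino, layout, off, len, bl and onfinish
+ // are assumed caller state:
+ //   std::vector<ObjectExtent> extents;
+ //   Striper::file_to_extents(cct, ino, &layout, off, len, 0 /*trunc_size*/,
+ //                            extents);
+ //   objecter->sg_read(extents, CEPH_NOSNAP, &bl, 0 /*flags*/, onfinish);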
+
+ void sg_write_trunc(std::vector<ObjectExtent>& extents, const SnapContext& snapc,
+ const ceph::buffer::list& bl, ceph::real_time mtime, int flags,
+ uint64_t trunc_size, __u32 trunc_seq,
+ Context *oncommit, int op_flags = 0) {
+ if (extents.size() == 1) {
+ write_trunc(extents[0].oid, extents[0].oloc, extents[0].offset,
+ extents[0].length, snapc, bl, mtime, flags,
+ extents[0].truncate_size, trunc_seq, oncommit,
+ 0, 0, op_flags);
+ } else {
+ C_GatherBuilder gcom(cct, oncommit);
+ auto it = bl.cbegin();
+ for (auto p = extents.begin(); p != extents.end(); ++p) {
+ ceph::buffer::list cur;
+ for (auto bit = p->buffer_extents.begin();
+ bit != p->buffer_extents.end();
+ ++bit) {
+ if (it.get_off() != bit->first) {
+ it.seek(bit->first);
+ }
+ it.copy(bit->second, cur);
+ }
+ ceph_assert(cur.length() == p->length);
+ write_trunc(p->oid, p->oloc, p->offset, p->length,
+ snapc, cur, mtime, flags, p->truncate_size, trunc_seq,
+ oncommit ? gcom.new_sub():0,
+ 0, 0, op_flags);
+ }
+ gcom.activate();
+ }
+ }
+
+ void sg_write(std::vector<ObjectExtent>& extents, const SnapContext& snapc,
+ const ceph::buffer::list& bl, ceph::real_time mtime, int flags,
+ Context *oncommit, int op_flags = 0) {
+ sg_write_trunc(extents, snapc, bl, mtime, flags, 0, 0, oncommit,
+ op_flags);
+ }
+
+ void ms_handle_connect(Connection *con) override;
+ bool ms_handle_reset(Connection *con) override;
+ void ms_handle_remote_reset(Connection *con) override;
+ bool ms_handle_refused(Connection *con) override;
+
+ void blocklist_self(bool set);
+
+private:
+ epoch_t epoch_barrier = 0;
+ bool retry_writes_after_first_reply =
+ cct->_conf->objecter_retry_writes_after_first_reply;
+
+public:
+ void set_epoch_barrier(epoch_t epoch);
+
+ PerfCounters *get_logger() {
+ return logger;
+ }
+};
+
+#endif
diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc
new file mode 100644
index 000000000..6f162e901
--- /dev/null
+++ b/src/osdc/Striper.cc
@@ -0,0 +1,537 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Striper.h"
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "osd/OSDMap.h"
+
+#include "common/config.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_striper
+#undef dout_prefix
+#define dout_prefix *_dout << "striper "
+
+using std::make_pair;
+using std::map;
+using std::pair;
+
+using ceph::bufferlist;
+
+namespace {
+
+object_t format_oid(const char* object_format, uint64_t object_no) {
+ char buf[strlen(object_format) + 32];
+ snprintf(buf, sizeof(buf), object_format, (long long unsigned)object_no);
+ return object_t(buf);
+}
+
+struct OrderByObject {
+ constexpr bool operator()(uint64_t object_no,
+ const striper::LightweightObjectExtent& rhs) const {
+ return object_no < rhs.object_no;
+ }
+ constexpr bool operator()(const striper::LightweightObjectExtent& lhs,
+ uint64_t object_no) const {
+ return lhs.object_no < object_no;
+ }
+};
+
+template <typename I>
+void add_partial_sparse_result(
+ CephContext *cct,
+ std::map<uint64_t, std::pair<ceph::buffer::list, uint64_t> >* partial,
+ uint64_t* total_intended_len, bufferlist& bl, I* it, const I& end_it,
+ uint64_t* bl_off, uint64_t tofs, uint64_t tlen) {
+ ldout(cct, 30) << " be " << tofs << "~" << tlen << dendl;
+
+ auto& s = *it;
+ while (tlen > 0) {
+ ldout(cct, 20) << " t " << tofs << "~" << tlen
+ << " bl has " << bl.length()
+ << " off " << *bl_off << dendl;
+ if (s == end_it) {
+ ldout(cct, 20) << " s at end" << dendl;
+ auto& r = (*partial)[tofs];
+ r.second = tlen;
+ *total_intended_len += r.second;
+ break;
+ }
+
+ ldout(cct, 30) << " s " << s->first << "~" << s->second << dendl;
+
+ // skip zero-length extent
+ if (s->second == 0) {
+ ldout(cct, 30) << " s len 0, skipping" << dendl;
+ ++s;
+ continue;
+ }
+
+ if (s->first > *bl_off) {
+ // gap in sparse read result
+ pair<bufferlist, uint64_t>& r = (*partial)[tofs];
+ size_t gap = std::min<size_t>(s->first - *bl_off, tlen);
+ ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl;
+ r.second = gap;
+ *total_intended_len += r.second;
+ *bl_off += gap;
+ tofs += gap;
+ tlen -= gap;
+ if (tlen == 0) {
+ continue;
+ }
+ }
+
+ ceph_assert(s->first <= *bl_off);
+ size_t left = (s->first + s->second) - *bl_off;
+ size_t actual = std::min<size_t>(left, tlen);
+
+ if (actual > 0) {
+ ldout(cct, 20) << " s has " << actual << ", copying" << dendl;
+ pair<bufferlist, uint64_t>& r = (*partial)[tofs];
+ bl.splice(0, actual, &r.first);
+ r.second = actual;
+ *total_intended_len += r.second;
+ *bl_off += actual;
+ tofs += actual;
+ tlen -= actual;
+ }
+ if (actual == left) {
+ ldout(cct, 30) << " s advancing" << dendl;
+ ++s;
+ }
+ }
+}
+
+} // anonymous namespace
+
+void Striper::file_to_extents(CephContext *cct, const char *object_format,
+ const file_layout_t *layout,
+ uint64_t offset, uint64_t len,
+ uint64_t trunc_size,
+ std::vector<ObjectExtent>& extents,
+ uint64_t buffer_offset)
+{
+ striper::LightweightObjectExtents lightweight_object_extents;
+ file_to_extents(cct, layout, offset, len, trunc_size, buffer_offset,
+ &lightweight_object_extents);
+
+ // convert lightweight object extents to heavyweight version
+ extents.reserve(lightweight_object_extents.size());
+ for (auto& lightweight_object_extent : lightweight_object_extents) {
+ auto& object_extent = extents.emplace_back(
+ object_t(format_oid(object_format, lightweight_object_extent.object_no)),
+ lightweight_object_extent.object_no,
+ lightweight_object_extent.offset, lightweight_object_extent.length,
+ lightweight_object_extent.truncate_size);
+
+ object_extent.oloc = OSDMap::file_to_object_locator(*layout);
+ object_extent.buffer_extents.reserve(
+ lightweight_object_extent.buffer_extents.size());
+ object_extent.buffer_extents.insert(
+ object_extent.buffer_extents.end(),
+ lightweight_object_extent.buffer_extents.begin(),
+ lightweight_object_extent.buffer_extents.end());
+ }
+}
+
+void Striper::file_to_extents(
+ CephContext *cct, const char *object_format,
+ const file_layout_t *layout,
+ uint64_t offset, uint64_t len,
+ uint64_t trunc_size,
+ map<object_t,std::vector<ObjectExtent> >& object_extents,
+ uint64_t buffer_offset)
+{
+ striper::LightweightObjectExtents lightweight_object_extents;
+ file_to_extents(cct, layout, offset, len, trunc_size, buffer_offset,
+ &lightweight_object_extents);
+
+ // convert lightweight object extents to heavyweight version
+ for (auto& lightweight_object_extent : lightweight_object_extents) {
+ auto oid = format_oid(object_format, lightweight_object_extent.object_no);
+ auto& object_extent = object_extents[oid].emplace_back(
+ oid, lightweight_object_extent.object_no,
+ lightweight_object_extent.offset, lightweight_object_extent.length,
+ lightweight_object_extent.truncate_size);
+
+ object_extent.oloc = OSDMap::file_to_object_locator(*layout);
+ object_extent.buffer_extents.reserve(
+ lightweight_object_extent.buffer_extents.size());
+ object_extent.buffer_extents.insert(
+ object_extent.buffer_extents.end(),
+ lightweight_object_extent.buffer_extents.begin(),
+ lightweight_object_extent.buffer_extents.end());
+ }
+}
+
+void Striper::file_to_extents(
+ CephContext *cct, const file_layout_t *layout, uint64_t offset,
+ uint64_t len, uint64_t trunc_size, uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents) {
+ ldout(cct, 10) << "file_to_extents " << offset << "~" << len << dendl;
+ ceph_assert(len > 0);
+
+ /*
+ * we want only one extent per object! this means that each extent
+ * we read may map into different bits of the final read
+ * buffer, hence buffer_extents.
+ */
+
+ __u32 object_size = layout->object_size;
+ __u32 su = layout->stripe_unit;
+ __u32 stripe_count = layout->stripe_count;
+ ceph_assert(object_size >= su);
+ if (stripe_count == 1) {
+ ldout(cct, 20) << " sc is one, reset su to os" << dendl;
+ su = object_size;
+ }
+ uint64_t stripes_per_object = object_size / su;
+ ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os "
+ << object_size << " stripes_per_object " << stripes_per_object
+ << dendl;
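+
+ // worked example (illustrative numbers): su = 64K, stripe_count = 4,
+ // object_size = 256K => stripes_per_object = 4. File offset 640K gives
+ // blockno = 10, stripeno = 2, stripepos = 2, objectsetno = 0, objectno = 2,
+ // and lands at object offset 128K (block_start 128K + block_off 0).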
+
+ uint64_t cur = offset;
+ uint64_t left = len;
+ while (left > 0) {
+ // layout into objects
+ uint64_t blockno = cur / su; // which block
+ // which horizontal stripe (Y)
+ uint64_t stripeno = blockno / stripe_count;
+ // which object in the object set (X)
+ uint64_t stripepos = blockno % stripe_count;
+ // which object set
+ uint64_t objectsetno = stripeno / stripes_per_object;
+ // object id
+ uint64_t objectno = objectsetno * stripe_count + stripepos;
+
+ // map range into object
+ uint64_t block_start = (stripeno % stripes_per_object) * su;
+ uint64_t block_off = cur % su;
+ uint64_t max = su - block_off;
+
+ uint64_t x_offset = block_start + block_off;
+ uint64_t x_len;
+ if (left > max)
+ x_len = max;
+ else
+ x_len = left;
+
+ ldout(cct, 20) << " off " << cur << " blockno " << blockno << " stripeno "
+ << stripeno << " stripepos " << stripepos << " objectsetno "
+ << objectsetno << " objectno " << objectno
+ << " block_start " << block_start << " block_off "
+ << block_off << " " << x_offset << "~" << x_len
+ << dendl;
+
+ striper::LightweightObjectExtent* ex = nullptr;
+ auto it = std::upper_bound(object_extents->begin(), object_extents->end(),
+ objectno, OrderByObject());
+ striper::LightweightObjectExtents::reverse_iterator rev_it(it);
+ if (rev_it == object_extents->rend() ||
+ rev_it->object_no != objectno ||
+ rev_it->offset + rev_it->length != x_offset) {
+ // expect up to "stripe-width - 1" vector shifts in the worst-case
+ ex = &(*object_extents->emplace(
+ it, objectno, x_offset, x_len,
+ object_truncate_size(cct, layout, objectno, trunc_size)));
+ ldout(cct, 20) << " added new " << *ex << dendl;
+ } else {
+ ex = &(*rev_it);
+ ceph_assert(ex->offset + ex->length == x_offset);
+
+ ldout(cct, 20) << " adding in to " << *ex << dendl;
+ ex->length += x_len;
+ }
+
+ ex->buffer_extents.emplace_back(cur - offset + buffer_offset, x_len);
+
+ ldout(cct, 15) << "file_to_extents " << *ex << dendl;
+ // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd "
+ // << ex.osd << " offset " << ex.offset << " len " << ex.len
+ // << " ... left " << left << dendl;
+
+ left -= x_len;
+ cur += x_len;
+ }
+}
+
+void Striper::extent_to_file(CephContext *cct, file_layout_t *layout,
+ uint64_t objectno, uint64_t off, uint64_t len,
+ std::vector<pair<uint64_t, uint64_t> >& extents)
+{
+ ldout(cct, 10) << "extent_to_file " << objectno << " " << off << "~"
+ << len << dendl;
+
+ __u32 object_size = layout->object_size;
+ __u32 su = layout->stripe_unit;
+ __u32 stripe_count = layout->stripe_count;
+ ceph_assert(object_size >= su);
+ uint64_t stripes_per_object = object_size / su;
+ ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl;
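+
+ // worked example (illustrative: su = 64K, stripe_count = 4,
+ // object_size = 256K): object 2, off 128K, len 64K maps back to the single
+ // file extent (640K, 64K) -- the inverse of file_to_extents above.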
+
+ uint64_t off_in_block = off % su;
+
+ extents.reserve(len / su + 1);
+
+ while (len > 0) {
+ uint64_t stripepos = objectno % stripe_count;
+ uint64_t objectsetno = objectno / stripe_count;
+ uint64_t stripeno = off / su + objectsetno * stripes_per_object;
+ uint64_t blockno = stripeno * stripe_count + stripepos;
+ uint64_t extent_off = blockno * su + off_in_block;
+ uint64_t extent_len = std::min(len, su - off_in_block);
+ extents.push_back(make_pair(extent_off, extent_len));
+
+ ldout(cct, 20) << " object " << off << "~" << extent_len
+ << " -> file " << extent_off << "~" << extent_len
+ << dendl;
+
+ off_in_block = 0;
+ off += extent_len;
+ len -= extent_len;
+ }
+}
+
+uint64_t Striper::object_truncate_size(CephContext *cct,
+ const file_layout_t *layout,
+ uint64_t objectno, uint64_t trunc_size)
+{
+ uint64_t obj_trunc_size;
+ if (trunc_size == 0 || trunc_size == (uint64_t)-1) {
+ obj_trunc_size = trunc_size;
+ } else {
+ __u32 object_size = layout->object_size;
+ __u32 su = layout->stripe_unit;
+ __u32 stripe_count = layout->stripe_count;
+ ceph_assert(object_size >= su);
+ uint64_t stripes_per_object = object_size / su;
+
+ uint64_t objectsetno = objectno / stripe_count;
+ uint64_t trunc_objectsetno = trunc_size / object_size / stripe_count;
+ if (objectsetno > trunc_objectsetno)
+ obj_trunc_size = 0;
+ else if (objectsetno < trunc_objectsetno)
+ obj_trunc_size = object_size;
+ else {
+ uint64_t trunc_blockno = trunc_size / su;
+ uint64_t trunc_stripeno = trunc_blockno / stripe_count;
+ uint64_t trunc_stripepos = trunc_blockno % stripe_count;
+ uint64_t trunc_objectno = trunc_objectsetno * stripe_count
+ + trunc_stripepos;
+ if (objectno < trunc_objectno)
+ obj_trunc_size = ((trunc_stripeno % stripes_per_object) + 1) * su;
+ else if (objectno > trunc_objectno)
+ obj_trunc_size = (trunc_stripeno % stripes_per_object) * su;
+ else
+ obj_trunc_size = (trunc_stripeno % stripes_per_object) * su
+ + (trunc_size % su);
+ }
+ }
+ ldout(cct, 20) << "object_truncate_size " << objectno << " "
+ << trunc_size << "->" << obj_trunc_size << dendl;
+ return obj_trunc_size;
+}
+
+uint64_t Striper::get_num_objects(const file_layout_t& layout,
+ uint64_t size)
+{
+ __u32 stripe_unit = layout.stripe_unit;
+ __u32 stripe_count = layout.stripe_count;
+ uint64_t period = layout.get_period();
+ uint64_t num_periods = (size + period - 1) / period;
+ uint64_t remainder_bytes = size % period;
+ uint64_t remainder_objs = 0;
+ if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count
+ * stripe_unit))
+ remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1)
+ / stripe_unit);
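+ // e.g. (illustrative): su = 64K, stripe_count = 4, object_size = 256K
+ // (period = 1M); size = 1M + 100K gives num_periods = 2, remainder_bytes =
+ // 100K < 256K, remainder_objs = 4 - 2 = 2, so 2*4 - 2 = 6 objects.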
+ return num_periods * stripe_count - remainder_objs;
+}
+
+uint64_t Striper::get_file_offset(CephContext *cct,
+ const file_layout_t *layout, uint64_t objectno, uint64_t off) {
+ ldout(cct, 10) << "get_file_offset " << objectno << " " << off << dendl;
+
+ __u32 object_size = layout->object_size;
+ __u32 su = layout->stripe_unit;
+ __u32 stripe_count = layout->stripe_count;
+ ceph_assert(object_size >= su);
+ uint64_t stripes_per_object = object_size / su;
+ ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl;
+
+ uint64_t off_in_block = off % su;
+
+ uint64_t stripepos = objectno % stripe_count;
+ uint64_t objectsetno = objectno / stripe_count;
+ uint64_t stripeno = off / su + objectsetno * stripes_per_object;
+ uint64_t blockno = stripeno * stripe_count + stripepos;
+ return blockno * su + off_in_block;
+}
+
+// StripedReadResult
+
+void Striper::StripedReadResult::add_partial_result(
+ CephContext *cct, bufferlist& bl,
+ const std::vector<pair<uint64_t,uint64_t> >& buffer_extents)
+{
+ ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length()
+ << " to " << buffer_extents << dendl;
+ for (auto p = buffer_extents.cbegin(); p != buffer_extents.cend(); ++p) {
+ pair<bufferlist, uint64_t>& r = partial[p->first];
+ size_t actual = std::min<uint64_t>(bl.length(), p->second);
+ bl.splice(0, actual, &r.first);
+ r.second = p->second;
+ total_intended_len += r.second;
+ }
+}
+
+void Striper::StripedReadResult::add_partial_result(
+ CephContext *cct, bufferlist&& bl,
+ const striper::LightweightBufferExtents& buffer_extents)
+{
+ ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length()
+ << " to " << buffer_extents << dendl;
+ for (auto& be : buffer_extents) {
+ auto& r = partial[be.first];
+ size_t actual = std::min<uint64_t>(bl.length(), be.second);
+ if (buffer_extents.size() == 1) {
+ r.first = std::move(bl);
+ } else {
+ bl.splice(0, actual, &r.first);
+ }
+ r.second = be.second;
+ total_intended_len += r.second;
+ }
+}
+
+void Striper::StripedReadResult::add_partial_sparse_result(
+ CephContext *cct, bufferlist& bl, const map<uint64_t, uint64_t>& bl_map,
+ uint64_t bl_off, const std::vector<pair<uint64_t,uint64_t> >& buffer_extents)
+{
+ ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length()
+ << " covering " << bl_map << " (offset " << bl_off << ")"
+ << " to " << buffer_extents << dendl;
+
+ if (bl_map.empty()) {
+ add_partial_result(cct, bl, buffer_extents);
+ return;
+ }
+
+ auto s = bl_map.cbegin();
+ for (auto& be : buffer_extents) {
+ ::add_partial_sparse_result(cct, &partial, &total_intended_len, bl, &s,
+ bl_map.end(), &bl_off, be.first, be.second);
+ }
+}
+
+void Striper::StripedReadResult::add_partial_sparse_result(
+ CephContext *cct, ceph::buffer::list&& bl,
+ const std::vector<std::pair<uint64_t, uint64_t>>& bl_map, uint64_t bl_off,
+ const striper::LightweightBufferExtents& buffer_extents) {
+ ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length()
+ << " covering " << bl_map << " (offset " << bl_off << ")"
+ << " to " << buffer_extents << dendl;
+
+ if (bl_map.empty()) {
+ add_partial_result(cct, std::move(bl), buffer_extents);
+ return;
+ }
+
+ auto s = bl_map.cbegin();
+ for (auto& be : buffer_extents) {
+ ::add_partial_sparse_result(cct, &partial, &total_intended_len, bl, &s,
+ bl_map.cend(), &bl_off, be.first, be.second);
+ }
+}
+
+void Striper::StripedReadResult::assemble_result(CephContext *cct,
+ bufferlist& bl,
+ bool zero_tail)
+{
+ ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail
+ << dendl;
+ size_t zeros = 0; // zeros preceding current position
+ for (auto& p : partial) {
+ size_t got = p.second.first.length();
+ size_t expect = p.second.second;
+ if (got) {
+ if (zeros) {
+ bl.append_zero(zeros);
+ zeros = 0;
+ }
+ bl.claim_append(p.second.first);
+ }
+ zeros += expect - got;
+ }
+ if (zero_tail && zeros) {
+ bl.append_zero(zeros);
+ }
+ partial.clear();
+}
+
+void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer, size_t length)
+{
+
+ ceph_assert(buffer && length == total_intended_len);
+
+ map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin();
+ if (p == partial.rend())
+ return;
+
+ uint64_t curr = length;
+ uint64_t end = p->first + p->second.second;
+ while (p != partial.rend()) {
+ // sanity check
+ ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
+ << " " << p->second.first.length() << " bytes"
+ << dendl;
+ ceph_assert(p->first == end - p->second.second);
+ end = p->first;
+
+ size_t len = p->second.first.length();
+ ceph_assert(curr >= p->second.second);
+ curr -= p->second.second;
+ if (len < p->second.second) {
+ if (len)
+ p->second.first.begin().copy(len, buffer + curr);
+ // FIPS zeroization audit 20191117: this memset is not security related.
+ memset(buffer + curr + len, 0, p->second.second - len);
+ } else {
+ p->second.first.begin().copy(len, buffer + curr);
+ }
+ ++p;
+ }
+ partial.clear();
+ ceph_assert(curr == 0);
+}
+
+uint64_t Striper::StripedReadResult::assemble_result(
+ CephContext *cct, std::map<uint64_t, uint64_t> *extent_map,
+ bufferlist *bl)
+{
+ ldout(cct, 10) << "assemble_result(" << this << ")" << dendl;
+ for (auto& p : partial) {
+ uint64_t off = p.first;
+ uint64_t len = p.second.first.length();
+ if (len > 0) {
+ (*extent_map)[off] = len;
+ bl->claim_append(p.second.first);
+ }
+ }
+ partial.clear();
+ return total_intended_len;
+}
diff --git a/src/osdc/Striper.h b/src/osdc/Striper.h
new file mode 100644
index 000000000..0761cd6c7
--- /dev/null
+++ b/src/osdc/Striper.h
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_STRIPER_H
+#define CEPH_STRIPER_H
+
+#include "include/common_fwd.h"
+#include "include/types.h"
+#include "osd/osd_types.h"
+#include "osdc/StriperTypes.h"
+
+
+//namespace ceph {
+
+ class Striper {
+ public:
+ static void file_to_extents(
+ CephContext *cct, const file_layout_t *layout, uint64_t offset,
+ uint64_t len, uint64_t trunc_size, uint64_t buffer_offset,
+ striper::LightweightObjectExtents* object_extents);
+
+ /*
+ * map (ino, layout, offset, len) to a (list of) ObjectExtents (byte
+ * ranges in objects on (primary) osds)
+ */
+ static void file_to_extents(CephContext *cct, const char *object_format,
+ const file_layout_t *layout,
+ uint64_t offset, uint64_t len,
+ uint64_t trunc_size,
+ std::map<object_t, std::vector<ObjectExtent> >& extents,
+ uint64_t buffer_offset=0);
+
+ static void file_to_extents(CephContext *cct, const char *object_format,
+ const file_layout_t *layout,
+ uint64_t offset, uint64_t len,
+ uint64_t trunc_size,
+ std::vector<ObjectExtent>& extents,
+ uint64_t buffer_offset=0);
+
+ static void file_to_extents(CephContext *cct, inodeno_t ino,
+ const file_layout_t *layout,
+ uint64_t offset, uint64_t len,
+ uint64_t trunc_size,
+ std::vector<ObjectExtent>& extents) {
+ // generate prefix/format
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llx.%%08llx", (long long unsigned)ino);
+
+ file_to_extents(cct, buf, layout, offset, len, trunc_size, extents);
+ }
+
+ /**
+ * reverse-map an object extent to file extents
+ */
+ static void extent_to_file(CephContext *cct, file_layout_t *layout,
+ uint64_t objectno, uint64_t off, uint64_t len,
+ std::vector<std::pair<uint64_t, uint64_t> >& extents);
+
+ static uint64_t object_truncate_size(
+ CephContext *cct, const file_layout_t *layout,
+ uint64_t objectno, uint64_t trunc_size);
+
+ static uint64_t get_num_objects(const file_layout_t& layout,
+ uint64_t size);
+
+ static uint64_t get_file_offset(CephContext *cct,
+ const file_layout_t *layout, uint64_t objectno, uint64_t off);
+ /*
+ * helper to assemble a striped result
+ */
+ class StripedReadResult {
+ // offset -> (data, intended length)
+ std::map<uint64_t, std::pair<ceph::buffer::list, uint64_t> > partial;
+ uint64_t total_intended_len = 0; // sum of the intended lengths in partial
+
+ public:
+ void add_partial_result(
+ CephContext *cct, ceph::buffer::list& bl,
+ const std::vector<std::pair<uint64_t,uint64_t> >& buffer_extents);
+ void add_partial_result(
+ CephContext *cct, ceph::buffer::list&& bl,
+ const striper::LightweightBufferExtents& buffer_extents);
+
+ /**
+ * add sparse read into results
+ *
+ * @param bl buffer
+ * @param bl_map std::map of which logical source extents this covers
+ * @param bl_off logical buffer offset (e.g., first bl_map key
+ * if the buffer is not sparse)
+ * @param buffer_extents output buffer extents the data maps to
+ */
+ void add_partial_sparse_result(
+ CephContext *cct, ceph::buffer::list& bl,
+ const std::map<uint64_t, uint64_t>& bl_map, uint64_t bl_off,
+ const std::vector<std::pair<uint64_t,uint64_t> >& buffer_extents);
+ void add_partial_sparse_result(
+ CephContext *cct, ceph::buffer::list&& bl,
+ const std::vector<std::pair<uint64_t, uint64_t>>& bl_map,
+ uint64_t bl_off,
+ const striper::LightweightBufferExtents& buffer_extents);
+
+ void assemble_result(CephContext *cct, ceph::buffer::list& bl,
+ bool zero_tail);
+
+ /**
+ * copy the read data into a flat buffer
+ *
+ * @param buffer destination buffer
+ * @param len length of the destination buffer (must equal the total
+ * intended length)
+ */
+ void assemble_result(CephContext *cct, char *buffer, size_t len);
+
+ uint64_t assemble_result(CephContext *cct,
+ std::map<uint64_t, uint64_t> *extent_map,
+ ceph::buffer::list *bl);
+ };
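+
+ // usage sketch for StripedReadResult (illustrative; the per-object
+ // bufferlists and their ObjectExtents come from reads mapped by
+ // file_to_extents above):
+ //   StripedReadResult res;
+ //   // for each completed object read:
+ //   res.add_partial_result(cct, obj_bl, extent.buffer_extents);
+ //   ceph::buffer::list out;
+ //   res.assemble_result(cct, out, true /*zero_tail*/);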
+
+ };
+
+//};
+
+#endif
diff --git a/src/osdc/StriperTypes.h b/src/osdc/StriperTypes.h
new file mode 100644
index 000000000..2ce8466a8
--- /dev/null
+++ b/src/osdc/StriperTypes.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OSDC_STRIPER_TYPES_H
+#define CEPH_OSDC_STRIPER_TYPES_H
+
+#include "include/types.h"
+#include <boost/container/small_vector.hpp>
+#include <ios>
+#include <utility>
+
+namespace striper {
+
+// off -> len extents in (striped) buffer being mapped
+typedef std::pair<uint64_t,uint64_t> BufferExtent;
+typedef boost::container::small_vector<
+ BufferExtent, 4> LightweightBufferExtents;
+
+struct LightweightObjectExtent {
+ LightweightObjectExtent() = delete;
+ LightweightObjectExtent(uint64_t object_no, uint64_t offset,
+ uint64_t length, uint64_t truncate_size)
+ : object_no(object_no), offset(offset), length(length),
+ truncate_size(truncate_size) {
+ }
+
+ uint64_t object_no;
+ uint64_t offset; // in-object
+ uint64_t length; // in-object
+ uint64_t truncate_size; // in-object
+ LightweightBufferExtents buffer_extents;
+};
+
+typedef boost::container::small_vector<
+ LightweightObjectExtent, 4> LightweightObjectExtents;
+
+inline std::ostream& operator<<(std::ostream& os,
+ const LightweightObjectExtent& ex) {
+ return os << "extent("
+ << ex.object_no << " "
+ << ex.offset << "~" << ex.length
+ << " -> " << ex.buffer_extents
+ << ")";
+}
+
+} // namespace striper
+
+#endif // CEPH_OSDC_STRIPER_TYPES_H
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
new file mode 100644
index 000000000..4f4e9aef8
--- /dev/null
+++ b/src/osdc/WritebackHandler.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OSDC_WRITEBACKHANDLER_H
+#define CEPH_OSDC_WRITEBACKHANDLER_H
+
+#include "include/Context.h"
+#include "include/types.h"
+#include "common/zipkin_trace.h"
+#include "osd/osd_types.h"
+
+class WritebackHandler {
+ public:
+ WritebackHandler() {}
+ virtual ~WritebackHandler() {}
+
+ virtual void read(const object_t& oid, uint64_t object_no,
+ const object_locator_t& oloc, uint64_t off, uint64_t len,
+ snapid_t snapid, ceph::buffer::list *pbl, uint64_t trunc_size,
+ __u32 trunc_seq, int op_flags,
+ const ZTracer::Trace &parent_trace, Context *onfinish) = 0;
+ /**
+ * check if a given extent read result may change due to a write
+ *
+ * Check if the content we see at the given read offset may change
+ * due to a write to this object.
+ *
+ * @param oid object
+ * @param read_off read offset
+ * @param read_len read length
+ * @param snapid read snapid
+ * @return true if the content visible at the given extent may change due
+ * to a write to this object
+ */
+ virtual bool may_copy_on_write(const object_t& oid, uint64_t read_off,
+ uint64_t read_len, snapid_t snapid) = 0;
+ virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
+ uint64_t off, uint64_t len,
+ const SnapContext& snapc,
+ const ceph::buffer::list &bl, ceph::real_time mtime,
+ uint64_t trunc_size, __u32 trunc_seq,
+ ceph_tid_t journal_tid,
+ const ZTracer::Trace &parent_trace,
+ Context *oncommit) = 0;
+
+ virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
+ ceph_tid_t original_journal_tid,
+ ceph_tid_t new_journal_tid) {}
+
+ virtual bool can_scattered_write() { return false; }
+ virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
+ std::vector<std::pair<uint64_t, ceph::buffer::list> >& io_vec,
+ const SnapContext& snapc, ceph::real_time mtime,
+ uint64_t trunc_size, __u32 trunc_seq,
+ Context *oncommit) {
+ return 0;
+ }
+};
+
+#endif
diff --git a/src/osdc/error_code.cc b/src/osdc/error_code.cc
new file mode 100644
index 000000000..5dc548385
--- /dev/null
+++ b/src/osdc/error_code.cc
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string>
+
+#include "common/error_code.h"
+#include "error_code.h"
+
+namespace bs = boost::system;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+class osdc_error_category : public ceph::converting_category {
+public:
+ osdc_error_category(){}
+ const char* name() const noexcept override;
+ const char* message(int ev, char*, std::size_t) const noexcept override;
+ std::string message(int ev) const override;
+ bs::error_condition default_error_condition(int ev) const noexcept
+ override;
+ bool equivalent(int ev, const bs::error_condition& c) const
+ noexcept override;
+ using ceph::converting_category::equivalent;
+ int from_code(int ev) const noexcept override;
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+const char* osdc_error_category::name() const noexcept {
+ return "osdc";
+}
+
+const char* osdc_error_category::message(int ev, char*,
+ std::size_t) const noexcept {
+ if (ev == 0)
+ return "No error";
+
+ switch (static_cast<osdc_errc>(ev)) {
+ case osdc_errc::pool_dne:
+ return "Pool does not exist";
+
+ case osdc_errc::pool_exists:
+ return "Pool already exists";
+
+ case osdc_errc::precondition_violated:
+ return "Precondition for operation not satisfied";
+
+ case osdc_errc::not_supported:
+ return "Operation not supported";
+
+ case osdc_errc::snapshot_exists:
+ return "Snapshot already exists";
+
+ case osdc_errc::snapshot_dne:
+ return "Snapshot does not exist";
+
+ case osdc_errc::timed_out:
+ return "Operation timed out";
+ }
+
+ return "Unknown error";
+}
+
+std::string osdc_error_category::message(int ev) const {
+ return message(ev, nullptr, 0);
+}
+
+bs::error_condition
+osdc_error_category::default_error_condition(int ev) const noexcept {
+ switch (static_cast<osdc_errc>(ev)) {
+ case osdc_errc::pool_dne:
+ return ceph::errc::does_not_exist;
+ case osdc_errc::pool_exists:
+ return ceph::errc::exists;
+ case osdc_errc::precondition_violated:
+ return bs::errc::invalid_argument;
+ case osdc_errc::not_supported:
+ return bs::errc::operation_not_supported;
+ case osdc_errc::snapshot_exists:
+ return ceph::errc::exists;
+ case osdc_errc::snapshot_dne:
+ return ceph::errc::does_not_exist;
+ case osdc_errc::timed_out:
+ return bs::errc::timed_out;
+ }
+
+ return { ev, *this };
+}
+
+bool osdc_error_category::equivalent(int ev,
+ const bs::error_condition& c) const noexcept {
+ if (static_cast<osdc_errc>(ev) == osdc_errc::pool_dne) {
+ if (c == bs::errc::no_such_file_or_directory) {
+ return true;
+ }
+ if (c == ceph::errc::not_in_map) {
+ return true;
+ }
+ }
+ if (static_cast<osdc_errc>(ev) == osdc_errc::pool_exists) {
+ if (c == bs::errc::file_exists) {
+ return true;
+ }
+ }
+ if (static_cast<osdc_errc>(ev) == osdc_errc::snapshot_exists) {
+ if (c == bs::errc::file_exists) {
+ return true;
+ }
+ }
+ if (static_cast<osdc_errc>(ev) == osdc_errc::snapshot_dne) {
+ if (c == bs::errc::no_such_file_or_directory) {
+ return true;
+ }
+ if (c == ceph::errc::not_in_map) {
+ return true;
+ }
+ }
+
+ return default_error_condition(ev) == c;
+}
+
+int osdc_error_category::from_code(int ev) const noexcept {
+ switch (static_cast<osdc_errc>(ev)) {
+ case osdc_errc::pool_dne:
+ return -ENOENT;
+ case osdc_errc::pool_exists:
+ return -EEXIST;
+ case osdc_errc::precondition_violated:
+ return -EINVAL;
+ case osdc_errc::not_supported:
+ return -EOPNOTSUPP;
+ case osdc_errc::snapshot_exists:
+ return -EEXIST;
+ case osdc_errc::snapshot_dne:
+ return -ENOENT;
+ case osdc_errc::timed_out:
+ return -ETIMEDOUT;
+ }
+ return -EDOM;
+}
+
+const bs::error_category& osdc_category() noexcept {
+ static const osdc_error_category c;
+ return c;
+}
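
Taken together, osdc_error_category gives osdc_errc values human-readable messages, a mapping onto generic and Ceph error conditions, and (through from_code) a path back to classic negative errno values. The following is a hedged usage sketch, assuming the files added by this change are built into the tree; it exercises only the public boost::system::error_category interface:

// Hypothetical usage sketch -- not part of this change.
#include <iostream>
#include "osdc/error_code.h"

int main() {
  const auto& cat = osdc_category();
  // message() resolves the enum value to the strings defined above;
  // value 1 corresponds to osdc_errc::pool_dne.
  std::cout << cat.name() << ": " << cat.message(1) << std::endl;
  // default_error_condition() maps pool_dne onto the "does not exist"
  // condition so callers can test it without knowing this category.
  std::cout << cat.default_error_condition(1).message() << std::endl;
  return 0;
}
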
diff --git a/src/osdc/error_code.h b/src/osdc/error_code.h
new file mode 100644
index 000000000..53c9e3c3a
--- /dev/null
+++ b/src/osdc/error_code.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/system/error_code.hpp>
+
+#include "include/rados.h"
+
+const boost::system::error_category& osdc_category() noexcept;
+
+enum class osdc_errc {
+ pool_dne = 1,
+ pool_exists,
+ // Come the revolution, we'll just kill your program. Maybe.
+ precondition_violated,
+ not_supported,
+ snapshot_exists,
+ snapshot_dne,
+ timed_out
+};
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::osdc_errc> {
+ static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::osdc_errc> {
+ static const bool value = false;
+};
+}
+
+// implicit conversion:
+inline boost::system::error_code make_error_code(osdc_errc e) noexcept {
+ return { static_cast<int>(e), osdc_category() };
+}
+
+// explicit conversion:
+inline boost::system::error_condition make_error_condition(osdc_errc e) noexcept {
+ return { static_cast<int>(e), osdc_category() };
+}
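
Because is_error_code_enum<osdc_errc> is true and make_error_code is found by argument-dependent lookup, an osdc_errc value converts implicitly to a boost::system::error_code, and the equivalent() logic in error_code.cc lets that code compare against generic POSIX-style conditions. A hedged sketch of that usage, assuming this header is on the include path:

// Hypothetical usage sketch -- not part of this change.
#include <iostream>
#include <boost/system/error_code.hpp>
#include "osdc/error_code.h"

int main() {
  // Implicit conversion via is_error_code_enum + make_error_code().
  boost::system::error_code ec = osdc_errc::pool_dne;

  std::cout << ec.message() << std::endl;           // "Pool does not exist"
  // equivalent() accepts the generic condition as well as the
  // Ceph-specific does_not_exist mapping, so this prints 1.
  std::cout << (ec == boost::system::errc::no_such_file_or_directory)
            << std::endl;
  return 0;
}
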