63 files changed, 54887 insertions, 0 deletions
diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt
new file mode 100644
index 000000000..55415fb37
--- /dev/null
+++ b/src/os/CMakeLists.txt
@@ -0,0 +1,100 @@
+set(libos_srcs  # sources that always go into the static "os" library
+  ObjectStore.cc
+  Transaction.cc
+  DBObjectMap.cc
+  memstore/MemStore.cc
+  kstore/KStore.cc
+  kstore/kstore_types.cc
+  fs/FS.cc)
+
+if(WITH_BLUESTORE)  # optional BlueStore sources
+  list(APPEND libos_srcs
+    bluestore/Allocator.cc
+    bluestore/BitmapFreelistManager.cc
+    bluestore/BlueFS.cc
+    bluestore/bluefs_types.cc
+    bluestore/BlueRocksEnv.cc
+    bluestore/BlueStore.cc
+    bluestore/simple_bitmap.cc
+    bluestore/bluestore_types.cc
+    bluestore/fastbmap_allocator_impl.cc
+    bluestore/FreelistManager.cc
+    bluestore/StupidAllocator.cc
+    bluestore/BitmapAllocator.cc
+    bluestore/AvlAllocator.cc
+    bluestore/BtreeAllocator.cc
+    bluestore/HybridAllocator.cc
+  )
+endif(WITH_BLUESTORE)
+
+if(WITH_ZBD)  # NOTE(review): adds bluestore/ sources without checking WITH_BLUESTORE
+  list(APPEND libos_srcs
+    bluestore/ZonedFreelistManager.cc
+    bluestore/ZonedAllocator.cc)
+endif()
+
+if(WITH_FUSE)  # the matching FUSE::FUSE link is added further down
+  list(APPEND libos_srcs
+    FuseStore.cc)
+endif(WITH_FUSE)
+
+if(HAVE_LIBXFS)
+  list(APPEND libos_srcs
+    fs/XFS.cc)
+endif()
+
+if(HAVE_LIBZFS)  # ZFS files form an OBJECT library so the ZFS include dirs apply only to them
+  add_library(os_zfs_objs OBJECT
+    filestore/ZFSFileStoreBackend.cc
+    fs/ZFS.cc)
+  target_include_directories(os_zfs_objs SYSTEM PRIVATE
+    ${ZFS_INCLUDE_DIRS})
+  list(APPEND libos_srcs $<TARGET_OBJECTS:os_zfs_objs>)
+endif()
+
+add_library(os STATIC ${libos_srcs})
+target_link_libraries(os blk)
+
+target_link_libraries(os heap_profiler kv)
+
+if(WITH_BLUEFS)  # separate shared library, installed to lib
+  add_library(bluefs SHARED  # BlueRocksEnv.cc is also listed in libos_srcs above
+    bluestore/BlueRocksEnv.cc)
+  target_include_directories(bluefs SYSTEM PUBLIC
+    $<TARGET_PROPERTY:RocksDB::RocksDB,INTERFACE_INCLUDE_DIRECTORIES>)
+  target_link_libraries(bluefs global)
+  install(TARGETS bluefs DESTINATION lib)
+endif(WITH_BLUEFS)
+
+if(WITH_FUSE)
+  target_link_libraries(os FUSE::FUSE)
+endif()
+
+if(HAVE_LIBZFS)
+  target_link_libraries(os ${ZFS_LIBRARIES})
+endif()
+
+if(WITH_LTTNG)  # build objectstore-tp and bluestore-tp before os
+  add_dependencies(os objectstore-tp)
+  add_dependencies(os bluestore-tp)
+endif()
+
+if(WITH_JAEGER)
+  add_dependencies(os jaeger_base)
+  target_link_libraries(os jaeger_base)
+endif()
+
+target_link_libraries(os kv)  # NOTE(review): kv is already linked above; this repeat looks redundant
+
+add_dependencies(os compressor_plugins)  # build-order only; add_dependencies links nothing
+add_dependencies(os crypto_plugins)
+
+
+if(WITH_BLUESTORE)  # command-line tool, installed to bin
+  add_executable(ceph-bluestore-tool
+    bluestore/bluestore_tool.cc)
+  target_link_libraries(ceph-bluestore-tool
+    os global)
+  install(TARGETS ceph-bluestore-tool
+    DESTINATION bin)
+endif()
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
new file mode 100644
index 000000000..7da9a67be
--- /dev/null
+++ b/src/os/DBObjectMap.cc
@@ -0,0 +1,1424 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include <iostream>
+#include <set>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "DBObjectMap.h"
+#include <errno.h>
+
+#include "common/debug.h"
+#include "common/config.h"
+#include "include/ceph_assert.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore "
+
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+
+const string DBObjectMap::USER_PREFIX = "_USER_";  // starts every per-header prefix; also the suffix of the user key table
+const string DBObjectMap::XATTR_PREFIX = "_AXATTR_";  // suffix of a header's xattr table
+const string DBObjectMap::SYS_PREFIX = "_SYS_";  // suffix of a header's system table; used bare for the global state
+const string DBObjectMap::COMPLETE_PREFIX = "_COMPLETE_";  // suffix of a header's complete-region table
+const string DBObjectMap::HEADER_KEY = "HEADER";  // key of the encoded _Header inside a header's system table
+const string DBObjectMap::USER_HEADER_KEY = "USER_HEADER";  // key of the caller-supplied header blob
+const string DBObjectMap::GLOBAL_STATE_KEY = "HEADER";  // same text as HEADER_KEY, but looked up under the bare SYS_PREFIX
+const string DBObjectMap::HOBJECT_TO_SEQ = "_HOBJTOSEQ_";  // table from ghobject keys to encoded headers
+
+// Legacy prefixes (not referenced by the code in this part of the file)
+const string DBObjectMap::LEAF_PREFIX = "_LEAF_";
+const string DBObjectMap::REVERSE_LEAF_PREFIX = "_REVLEAF_";
+
+// Append `in` to `*out`, escaping the characters that have a structural
+// role in the keys built in this file:
+//   '%' -> "%p"   (the escape character itself)
+//   '.' -> "%e"   (field separator written by ghobject_key)
+//   '_' -> "%u"   (appears in the table prefixes such as "_USER_")
+// Every other byte is copied through unchanged.  `out` must be non-null.
+static void append_escaped(const string &in, string *out)
+{
+  for (const char c : in) {
+    switch (c) {
+    case '%': out->append("%p"); break;
+    case '.': out->append("%e"); break;
+    case '_': out->append("%u"); break;
+    default:  out->push_back(c); break;
+    }
+  }
+}
+// Verify the header tree and, for v2 stores or when force is set, every complete table; repair clears bad complete tables.  Returns the error count, or -1 when tables were repaired and no other error remains.
+int DBObjectMap::check(std::ostream &out, bool repair, bool force)
+{
+  int errors = 0, comp_errors = 0;
+  bool repaired = false;
+  map<uint64_t, uint64_t> parent_to_num_children;  // parent seq -> children seen during this scan
+  map<uint64_t, uint64_t> parent_to_actual_num_children;  // seq -> num_children stored in that header
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    _Header header;
+    bufferlist bl = iter->value();
+    while (true) {  // start at the object's header, then follow parent links
+      auto bliter = bl.cbegin();
+      header.decode(bliter);
+      if (header.seq != 0)
+	parent_to_actual_num_children[header.seq] = header.num_children;
+
+      if (state.v == 2 || force) {
+	// Check complete table: each key must be greater than the previous entry's end
+	bool complete_error = false;
+	boost::optional<string> prev;
+	KeyValueDB::Iterator complete_iter = db->get_iterator(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+	for (complete_iter->seek_to_first(); complete_iter->valid();
+	     complete_iter->next()) {
+	  if (prev && prev >= complete_iter->key()) {
+	     out << "Bad complete for " << header.oid << std::endl;
+	     complete_error = true;
+	     break;
+	  }
+	  prev = string(complete_iter->value().c_str(), complete_iter->value().length() - 1);  // value minus its final byte; NOTE(review): underflows if the value is empty
+	}
+	if (complete_error) {
+	  out << "Complete mapping for " << header.seq << " :" << std::endl;
+	  for (complete_iter->seek_to_first(); complete_iter->valid();
+	       complete_iter->next()) {
+	    out << complete_iter->key() << " -> " << string(complete_iter->value().c_str(), complete_iter->value().length() - 1) << std::endl;
+	  }
+	  if (repair) {
+	    repaired = true;
+	    KeyValueDB::Transaction t = db->get_transaction();
+	    t->rmkeys_by_prefix(USER_PREFIX + header_key(header.seq) + COMPLETE_PREFIX);
+	    db->submit_transaction(t);  // NOTE(review): the submit result is not checked
+	    out << "Cleared complete mapping to repair" << std::endl;
+	  } else {
+	    errors++;  // Only count when not repaired
+	    comp_errors++;  // Track errors here for version update
+	  }
+	}
+      }
+
+      if (header.parent == 0)  // no parent: top of this chain
+	break;
+
+      if (!parent_to_num_children.count(header.parent))
+	parent_to_num_children[header.parent] = 0;
+      parent_to_num_children[header.parent]++;
+      if (parent_to_actual_num_children.count(header.parent))  // parent was already visited
+	break;
+
+      set<string> to_get;
+      map<string, bufferlist> got;
+      to_get.insert(HEADER_KEY);
+      db->get(sys_parent_prefix(header), to_get, &got);
+      if (got.empty()) {
+	out << "Missing: seq " << header.parent << std::endl;
+	errors++;
+	break;
+      } else {
+	bl = got.begin()->second;  // decode the parent on the next pass
+      }
+    }
+  }
+
+  for (map<uint64_t, uint64_t>::iterator i = parent_to_num_children.begin();
+       i != parent_to_num_children.end();
+       parent_to_num_children.erase(i++)) {  // entries are erased as they are checked
+    if (!parent_to_actual_num_children.count(i->first))
+      continue;
+    if (parent_to_actual_num_children[i->first] != i->second) {
+      out << "Invalid: seq " << i->first << " recorded children: "
+	  << parent_to_actual_num_children[i->first] << " found: "
+	  << i->second << std::endl;
+      errors++;
+    }
+    parent_to_actual_num_children.erase(i->first);
+  }
+
+  // Only advance the version from 2 to 3 here
+  // Mark as legacy because there are still older structures
+  // we don't update.  The value of legacy is only used
+  // for internal assertions.
+  if (comp_errors == 0 && state.v == 2 && repair) {
+    state.v = 3;
+    state.legacy = true;
+    set_state();
+  }
+
+  if (errors == 0 && repaired)
+    return -1;
+  return errors;
+}
+
+// Build the key for oid used in the HOBJECT_TO_SEQ table, shaped as
+//   <name>.<key>.<nspace>.<snap>.<pool>.<hash>[.<generation>.<shard>]
+// with name/key/nspace escaped; the bracketed tail only when one is set.
+string DBObjectMap::ghobject_key(const ghobject_t &oid) {
+  const auto &ho = oid.hobj;
+  string key;
+  append_escaped(ho.oid.name, &key);
+  key.push_back('.');
+  append_escaped(ho.get_key(), &key);
+  key.push_back('.');
+  append_escaped(ho.nspace, &key);
+  key.push_back('.');
+  char tail[1000];
+  char *pos = tail;
+  char *const limit = tail + sizeof(tail);
+  if (ho.snap == CEPH_NOSNAP) {
+    pos += snprintf(pos, limit - pos, "head");
+  } else if (ho.snap == CEPH_SNAPDIR) {
+    pos += snprintf(pos, limit - pos, "snapdir");
+  } else {
+    pos += snprintf(pos, limit - pos, "%llx", (long long unsigned)ho.snap);
+  }
+  pos += (ho.pool == -1)
+    ? snprintf(pos, limit - pos, ".none")
+    : snprintf(pos, limit - pos, ".%llx", (long long unsigned)ho.pool);
+  pos += snprintf(pos, limit - pos, ".%.*X", (int)(sizeof(uint32_t)*2), ho.get_hash());
+  const bool has_gen_or_shard =
+    oid.generation != ghobject_t::NO_GEN || oid.shard_id != shard_id_t::NO_SHARD;
+  if (has_gen_or_shard) {
+    pos += snprintf(pos, limit - pos, ".%llx", (long long unsigned)oid.generation);
+    pos += snprintf(pos, limit - pos, ".%x", (int)oid.shard_id);
+  }
+  return key.append(tail);
+}
+
+//    ok: pglog%u3%efs1...0.none.0017B237
+//   bad: plana8923501-10...4c.3.ffffffffffffffff.2
+// fixed: plana8923501-10...4c.3.CB767F2D.ffffffffffffffff.2
+// Classify a key read from a v1 store: skip the first five '.'-separated
+// fields, then look at the next one.  A key that ends right after an
+// 8-character field (the hash) is fine; a key that has yet another '.'
+// after that field is reported as buggy.  cct is only used for logging.
+// returns 0 for false, 1 for true, negative for error
+int DBObjectMap::is_buggy_ghobject_key_v1(CephContext* cct,
+					  const string &in)
+{
+  const char *const start = in.c_str();
+  const char *p = start;
+  // skip 5 .'s
+  for (int skipped = 0; skipped < 5; ++skipped) {
+    while (*p && *p != '.')
+      ++p;
+    if (*p)
+      ++p;  // step over the '.'; a NUL here or right after it is an error
+    if (!*p) {
+      derr << "unexpected null at " << (int)(p-start) << dendl;
+      return -EINVAL;
+    }
+  }
+  // we are now either at a hash value (32 bits, 8 chars) or a generation
+  // value (64 bits) '.' and shard id.  count the dots!
+  const char *const field = p;
+  while (*p && *p != '.')
+    ++p;
+  if (*p == '\0') {
+    if (p - field == 8)
+      return 0;
+    derr << "hash value is not 8 chars" << dendl;
+    return -EINVAL;  // the hash value is always 8 chars.
+  }
+  // kept for parity; the scan above only ever stops on NUL or '.'
+  if (*p != '.') { // the shard follows.
+    derr << "missing final . and shard id at " << (int)(p-start) << dendl;
+    return -EINVAL;
+  }
+  return 1;
+}
+
+
+// Thin alias for ghobject_key(); presumably the HOBJECT_TO_SEQ key of oid.
+string DBObjectMap::map_header_key(const ghobject_t &oid) {
+  return ghobject_key(oid);
+}
+
+// Format seq in decimal, zero-padded to at least 2*sizeof(seq) = 16 digits.
+string DBObjectMap::header_key(uint64_t seq) {
+  char digits[100];
+  snprintf(digits, sizeof(digits), "%.*" PRId64, (int)(2*sizeof(seq)), seq);
+  return digits;
+}
+
+// KV prefix of the complete-region table that belongs to header.
+string DBObjectMap::complete_prefix(Header header) {
+  return USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX;
+}
+
+// KV prefix of the user key table that belongs to header.
+string DBObjectMap::user_prefix(Header header) {
+  return USER_PREFIX + header_key(header->seq) + USER_PREFIX;
+}
+
+// KV prefix of header's system table (where USER_HEADER_KEY is stored).
+string DBObjectMap::sys_prefix(Header header) {
+  return USER_PREFIX + header_key(header->seq) + SYS_PREFIX;
+}
+
+// KV prefix of the xattr table that belongs to header.
+string DBObjectMap::xattr_prefix(Header header) {
+  return USER_PREFIX + header_key(header->seq) + XATTR_PREFIX;
+}
+
+// KV prefix of the system table of header's parent (keyed by header.parent).
+string DBObjectMap::sys_parent_prefix(_Header header) {
+  return USER_PREFIX + header_key(header.parent) + SYS_PREFIX;
+}
+
+// Clear the invalid flag and, on first use only, create the underlying
+// iterators (plus a nested iterator over the parent header, if there is one).
+int DBObjectMap::DBObjectMapIteratorImpl::init() {
+  invalid = false;
+  if (!ready) {
+    ceph_assert(!parent_iter);
+    if (header->parent) {
+      Header parent_header = map->lookup_parent(header);
+      if (!parent_header) {
+        ceph_abort();
+        return -EINVAL;
+      }
+      parent_iter = std::make_shared<DBObjectMapIteratorImpl>(map, parent_header);
+    }
+    key_iter = map->db->get_iterator(map->user_prefix(header));
+    ceph_assert(key_iter);
+    complete_iter = map->db->get_iterator(map->complete_prefix(header));
+    ceph_assert(complete_iter);
+    cur_iter = key_iter;
+    ceph_assert(cur_iter);
+    ready = true;
+  }
+  return 0;
+}
+
+// Iterator over oid's keys (empty if oid has no header); it takes over the header lock.
+ObjectMap::ObjectMapIterator DBObjectMap::get_iterator(const ghobject_t &oid) {
+  MapHeaderLock oid_lock(this, oid);
+  Header found = lookup_map_header(oid_lock, oid);
+  if (found) {
+    DBObjectMapIterator it = _get_iterator(found);
+    it->hlock.swap(oid_lock);
+    return it;
+  }
+  return ObjectMapIterator(new EmptyIteratorImpl());
+}
+
+// Rewind to the first key.  The parent iterator (if any) and this level's
+// key iterator are both rewound, then adjust() selects the current one.
+// The member r records the last status so status() can report it.
+// Returns a negative error from either seek, otherwise adjust()'s result.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_first()
+{
+  init();
+  r = parent_iter ? parent_iter->seek_to_first() : 0;
+  if (r >= 0)
+    r = key_iter->seek_to_first();
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// Move both underlying iterators past their final entry (seek to the last
+// key and, if there is one, step once more), then run adjust().
+// NOTE(review): this appears to leave the iterator beyond the last key
+// rather than on it -- confirm that callers expect that.
+int DBObjectMap::DBObjectMapIteratorImpl::seek_to_last()
+{
+  init();
+  r = 0;
+  // parent level first
+  if (parent_iter) {
+    if ((r = parent_iter->seek_to_last()) < 0)
+      return r;
+    if (parent_iter->valid() && (r = parent_iter->next()) < 0)
+      return r;
+  }
+  // then this level's own keys
+  if ((r = key_iter->seek_to_last()) < 0)
+    return r;
+  if (key_iter->valid() && (r = key_iter->next()) < 0)
+    return r;
+  return adjust();
+}
+
+// Seek both levels with lower_bound(to): the parent iterator (if any) and
+// this level's key iterator are positioned independently, then adjust()
+// selects the current one.  Returns a negative error from either seek,
+// otherwise adjust()'s result.
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound(const string &to)
+{
+  init();
+  r = parent_iter ? parent_iter->lower_bound(to) : 0;
+  if (r >= 0)
+    r = key_iter->lower_bound(to);
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// Like lower_bound(to), but when the resulting position is valid and not
+// served by the parent iterator, move on via next_parent().
+int DBObjectMap::DBObjectMapIteratorImpl::lower_bound_parent(const string &to)
+{
+  const int ret = lower_bound(to);
+  if (ret < 0)
+    return ret;
+  const bool must_advance = valid() && !on_parent();
+  return must_advance ? next_parent() : ret;
+}
+
+// Seek both levels with upper_bound(after): the parent iterator (if any)
+// and this level's key iterator are positioned independently, then
+// adjust() selects the current one.  Returns a negative error from either
+// seek, otherwise adjust()'s result.
+int DBObjectMap::DBObjectMapIteratorImpl::upper_bound(const string &after)
+{
+  init();
+  r = parent_iter ? parent_iter->upper_bound(after) : 0;
+  if (r >= 0)
+    r = key_iter->upper_bound(after);
+  if (r < 0)
+    return r;
+  return adjust();
+}
+
+// True once initialised and not flagged invalid; cur_iter must then be valid.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid() {
+  const bool usable = ready && !invalid;
+  ceph_assert(!usable || cur_iter->valid());
+  return usable;
+}
+
+// True when the parent iterator supplies the next key: it is valid and this
+// level's key iterator is either exhausted or positioned at a larger key.
+bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent() {
+  if (!parent_iter || !parent_iter->valid())
+    return false;
+  return !key_iter->valid() || key_iter->key() > parent_iter->key();
+}
+
+// Step the current underlying iterator, then re-run adjust(); requires valid().
+int DBObjectMap::DBObjectMapIteratorImpl::next() {
+  ceph_assert(cur_iter->valid());
+  ceph_assert(valid());
+  cur_iter->next();
+  return adjust();
+}
+
+// Advance to the next entry that is served by the parent iterator.  If the
+// parent is absent or runs out, the iterator is marked invalid.
+int DBObjectMap::DBObjectMapIteratorImpl::next_parent() {
+  if ((r = next()) < 0)
+    return r;
+  for (;;) {
+    if (!parent_iter || !parent_iter->valid()) {
+      invalid = true;
+      return 0;
+    }
+    if (on_parent())
+      return 0;
+    ceph_assert(valid());
+    if ((r = lower_bound(parent_iter->key())) < 0)
+      return r;
+  }
+}
+// Report whether to_test lies inside an entry of the complete table; on a hit *begin / *end (when non-null) receive that entry's key and end.  Returns true/false as int.
+int DBObjectMap::DBObjectMapIteratorImpl::in_complete_region(const string &to_test,
+							     string *begin,
+							     string *end)
+{
+  /* Locate the last entry whose key is <= to_test.  This is clumsy because
+   * one cannot call prev() on end(), nor can one test for == begin().
+   */
+  complete_iter->upper_bound(to_test);
+  if (complete_iter->valid()) {
+    complete_iter->prev();
+    if (!complete_iter->valid()) {  // no entry has a key <= to_test
+      complete_iter->upper_bound(to_test);  // re-seek before bailing out
+      return false;
+    }
+  } else {
+    complete_iter->seek_to_last();  // no key is > to_test, so try the last entry
+    if (!complete_iter->valid())
+      return false;
+  }
+  // complete_iter now sits on the last entry whose key is <= to_test.
+  ceph_assert(complete_iter->key() <= to_test);
+  ceph_assert(complete_iter->value().length() >= 1);
+  string _end(complete_iter->value().c_str(),
+	      complete_iter->value().length() - 1);  // region end: the value minus its final byte
+  if (_end.empty() || _end > to_test) {  // an empty end or an end beyond to_test means "inside"
+    if (begin)
+      *begin = complete_iter->key();
+    if (end)
+      *end = _end;
+    return true;
+  } else {
+    complete_iter->next();
+    ceph_assert(!complete_iter->valid() || complete_iter->key() > to_test);
+    return false;
+  }
+}
+
+/**
+ * Advance parent_iter past every key this level hides: keys inside a complete
+ * region, and the key that key_iter currently sits on.  Then pick cur_iter:
+ * the parent if valid_parent(), else key_iter if valid, else mark invalid.
+ */
+int DBObjectMap::DBObjectMapIteratorImpl::adjust() {
+  string region_begin, region_end;
+  while (parent_iter && parent_iter->valid()) {
+    if (in_complete_region(parent_iter->key(), &region_begin, &region_end)) {
+      if (!region_end.empty()) {
+        parent_iter->lower_bound(region_end);
+      } else {
+        // an empty end means the region has no upper bound
+        parent_iter->seek_to_last();
+        if (parent_iter->valid())
+          parent_iter->next();
+      }
+      continue;
+    }
+    if (!key_iter->valid() || key_iter->key() != parent_iter->key())
+      break;
+    parent_iter->next();
+  }
+  if (valid_parent())
+    cur_iter = parent_iter;
+  else if (key_iter->valid())
+    cur_iter = key_iter;
+  else
+    invalid = true;
+  ceph_assert(invalid || cur_iter->valid());
+  return 0;
+}
+
+
+// Key at the current position, taken from whichever iterator is current.
+string DBObjectMap::DBObjectMapIteratorImpl::key() {
+  return cur_iter->key();
+}
+
+// Value at the current position, taken from whichever iterator is current.
+bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() {
+  return cur_iter->value();
+}
+
+// Last status stored in r by the seek and advance operations.
+int DBObjectMap::DBObjectMapIteratorImpl::status() {
+  return r;
+}
+
+// Write the pairs in `set` into oid's user key table, creating oid's header
+// if it does not exist yet.  Returns -EINVAL if no header could be obtained,
+// 0 without writing when check_spos() returns true, else the submit result.
+int DBObjectMap::set_keys(const ghobject_t &oid,
+			  const map<string, bufferlist> &set,
+			  const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_create_map_header(oid_lock, oid, txn);
+  if (!hdr)
+    return -EINVAL;
+  if (check_spos(oid, hdr, spos))
+    return 0;
+  txn->set(user_prefix(hdr), set);
+  return db->submit_transaction(txn);
+}
+
+// Store bl as oid's user header, creating oid's header record if needed.
+// Returns -EINVAL without a header, 0 if check_spos() is true, else submit's result.
+int DBObjectMap::set_header(const ghobject_t &oid, const bufferlist &bl,
+			    const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_create_map_header(oid_lock, oid, txn);
+  if (!hdr)
+    return -EINVAL;
+  if (check_spos(oid, hdr, spos))
+    return 0;
+  _set_header(hdr, bl, txn);
+  return db->submit_transaction(txn);
+}
+
+// Queue in t a write that stores bl under USER_HEADER_KEY in header's
+// system table (the prefix returned by sys_prefix()).
+void DBObjectMap::_set_header(Header header, const bufferlist &bl,
+			      KeyValueDB::Transaction t) {
+  const map<string, bufferlist> entry{{USER_HEADER_KEY, bl}};
+  t->set(sys_prefix(header), entry);
+}
+
+// Read oid's user header into *bl.  An object without a header is not an
+// error: 0 is returned and *bl is not modified.  Otherwise returns whatever
+// _get_header() returns.
+int DBObjectMap::get_header(const ghobject_t &oid,
+			    bufferlist *bl)
+{
+  MapHeaderLock oid_lock(this, oid);
+  Header found = lookup_map_header(oid_lock, oid);
+  return found ? _get_header(found, bl) : 0;
+}
+
+// Read the user header that applies to `header` into *bl: try the header
+// itself, then each ancestor in turn, stopping at the first that has a
+// USER_HEADER_KEY entry.  Returns 0 (with *bl untouched if none was found),
+// a negative db->get() error, or -EINVAL if a parent cannot be looked up.
+int DBObjectMap::_get_header(Header header, bufferlist *bl) {
+  const set<string> to_get{USER_HEADER_KEY};
+  map<string, bufferlist> out;
+  while (true) {
+    out.clear();
+    int r = db->get(sys_prefix(header), to_get, &out);
+    if (r < 0)
+      return r;
+    if ((r == 0 && !out.empty()) || !header->parent)
+      break;
+    Header current(header);
+    header = lookup_parent(current);
+    if (!header)  // was dereferenced unchecked by the next sys_prefix()
+      return -EINVAL;
+  }
+  if (!out.empty())
+    bl->swap(out.begin()->second);
+  return 0;
+}
+
+// Drop oid's mapping: detach oid from its header (remove_map_header), lower
+// the header's child count and let _clear() deal with what became unused.
+// Returns -ENOENT if oid has no header, 0 if check_spos() is true.
+int DBObjectMap::clear(const ghobject_t &oid,
+		       const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  if (!hdr)
+    return -ENOENT;
+  if (check_spos(oid, hdr, spos))
+    return 0;
+  remove_map_header(oid_lock, oid, hdr, txn);
+  ceph_assert(hdr->num_children > 0);
+  --hdr->num_children;
+  const int r = _clear(hdr, txn);
+  return r < 0 ? r : db->submit_transaction(txn);
+}
+
+// Starting at header, walk towards the root: a header that still has
+// children is rewritten via set_header() and the walk stops; a childless one
+// is removed via clear_header() and its parent's child count is decremented
+// before continuing there.  Returns 0, or -EINVAL if a parent is missing.
+int DBObjectMap::_clear(Header header, KeyValueDB::Transaction t) {
+  for (;;) {
+    if (header->num_children) {
+      set_header(header, t);
+      return 0;
+    }
+    clear_header(header, t);
+    if (!header->parent)
+      return 0;
+    Header up = lookup_parent(header);
+    if (!up)
+      return -EINVAL;
+    ceph_assert(up->num_children > 0);
+    --up->num_children;
+    header.swap(up);
+  }
+}
+
+// Queue in t a write that stores the user header _get_header() finds for
+// `header` (its own or an ancestor's) directly on `header`.
+int DBObjectMap::copy_up_header(Header header,
+				KeyValueDB::Transaction t)
+{
+  bufferlist inherited;
+  const int r = _get_header(header, &inherited);
+  if (r >= 0)
+    _set_header(header, inherited, t);
+  return r < 0 ? r : 0;
+}
+
+// Remove the keys in to_clear from oid.  A legacy header that still has a
+// parent gets its surviving keys and user header copied up so that the parent
+// link can be dropped.  Returns -ENOENT without a header, else 0 or an error.
+int DBObjectMap::rm_keys(const ghobject_t &oid,
+			 const set<string> &to_clear,
+			 const SequencerPosition *spos) {
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  KeyValueDB::Transaction t = db->get_transaction();
+  if (check_spos(oid, header, spos))
+    return 0;
+  t->rmkeys(user_prefix(header), to_clear);
+  if (!header->parent)
+    return db->submit_transaction(t);
+  ceph_assert(state.legacy);
+  {
+    // Legacy (v2) stores only: copy up every key visible through the parent,
+    // except to_clear, so this oid no longer relies on the complete table.
+    map<string, bufferlist> to_write;
+    ObjectMapIterator iter = _get_iterator(header);
+    for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+      if (iter->status())
+        return iter->status();
+      if (!to_clear.count(iter->key()))
+        to_write[iter->key()] = iter->value();
+    }
+    t->set(user_prefix(header), to_write);
+  } // destruct iter which has parent in_use
+  int r = copy_up_header(header, t);  // errors here used to be dropped
+  if (r < 0)
+    return r;
+  Header parent = lookup_parent(header);
+  if (!parent)
+    return -EINVAL;
+  parent->num_children--;
+  r = _clear(parent, t);  // likewise: do not submit a half-built transaction
+  if (r < 0)
+    return r;
+  header->parent = 0;
+  set_map_header(hl, oid, *header, t);
+  t->rmkeys_by_prefix(complete_prefix(header));
+  return db->submit_transaction(t);
+}
+
+// Reset oid's keys and user header while keeping its xattrs: the xattrs are
+// read out, the current header is released the way clear() does it, and a
+// new header (generated without a parent) receives the saved xattrs.
+int DBObjectMap::clear_keys_header(const ghobject_t &oid,
+				   const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header old_hdr = lookup_map_header(oid_lock, oid);
+  if (!old_hdr)
+    return -ENOENT;
+  if (check_spos(oid, old_hdr, spos))
+    return 0;
+  // save old attrs
+  KeyValueDB::Iterator xit = db->get_iterator(xattr_prefix(old_hdr));
+  if (!xit)
+    return -EINVAL;
+  map<string, bufferlist> saved_attrs;
+  for (xit->seek_to_first(); !xit->status() && xit->valid(); xit->next())
+    saved_attrs.insert(make_pair(xit->key(), xit->value()));
+  if (xit->status())
+    return xit->status();
+  // remove current header
+  remove_map_header(oid_lock, oid, old_hdr, txn);
+  ceph_assert(old_hdr->num_children > 0);
+  --old_hdr->num_children;
+  const int r = _clear(old_hdr, txn);
+  if (r < 0)
+    return r;
+
+  // create new header
+  Header fresh = generate_new_header(oid, Header());
+  set_map_header(oid_lock, oid, *fresh, txn);
+  if (!saved_attrs.empty())
+    txn->set(xattr_prefix(fresh), saved_attrs);
+  return db->submit_transaction(txn);
+}
+
+// Read oid's user header into *_header and all key/values into *out; -ENOENT without a header, errors from either read are returned.
+int DBObjectMap::get(const ghobject_t &oid, bufferlist *_header,
+		     map<string, bufferlist> *out) {
+  MapHeaderLock hl(this, oid);
+  Header header = lookup_map_header(hl, oid);
+  if (!header)
+    return -ENOENT;
+  if (int r = _get_header(header, _header); r < 0)  // this result used to be dropped
+    return r;
+  ObjectMapIterator iter = _get_iterator(header);
+  for (iter->seek_to_first(); iter->valid(); iter->next()) {
+    if (iter->status())
+      return iter->status();
+    out->insert(make_pair(iter->key(), iter->value()));
+  }
+  return 0;
+}
+
+// Collect the name of every key visible for oid into *keys.  Returns
+// -ENOENT if oid has no header, a non-zero iterator status, or 0.
+int DBObjectMap::get_keys(const ghobject_t &oid, set<string> *keys) {
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  if (!hdr)
+    return -ENOENT;
+  ObjectMapIterator it = _get_iterator(hdr);
+  for (it->seek_to_first(); it->valid(); it->next()) {
+    if (it->status())
+      return it->status();
+    keys->insert(it->key());
+  }
+  return 0;
+}
+
+// Look up each of in_keys for header.  Keys that exist are added to
+// *out_keys and, with their values, to *out_values; either output may be
+// null to skip it.  Returns a non-zero iterator status, or 0.
+int DBObjectMap::scan(Header header,
+		      const set<string> &in_keys,
+		      set<string> *out_keys,
+		      map<string, bufferlist> *out_values) {
+  ObjectMapIterator cursor = _get_iterator(header);
+  for (const string &wanted : in_keys) {
+    cursor->lower_bound(wanted);
+    if (cursor->status())
+      return cursor->status();
+    if (!cursor->valid() || cursor->key() != wanted)
+      continue;
+    if (out_keys)
+      out_keys->insert(wanted);
+    if (out_values)
+      out_values->insert(make_pair(cursor->key(), cursor->value()));
+  }
+  return 0;
+}
+
+// Fetch the values of those `keys` that exist for oid into *out.
+// Returns -ENOENT if oid has no header, otherwise scan()'s result.
+int DBObjectMap::get_values(const ghobject_t &oid,
+			    const set<string> &keys,
+			    map<string, bufferlist> *out)
+{
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  return hdr ? scan(hdr, keys, nullptr, out) : -ENOENT;
+}
+
+// Report in *out which of `keys` exist for oid (names only, no values).
+// Returns -ENOENT if oid has no header, otherwise scan()'s result.
+int DBObjectMap::check_keys(const ghobject_t &oid,
+			    const set<string> &keys,
+			    set<string> *out)
+{
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  return hdr ? scan(hdr, keys, out, nullptr) : -ENOENT;
+}
+
+// Read the xattrs named in to_get from oid's xattr table into *out.
+// Returns -ENOENT if oid has no header, otherwise db->get()'s result.
+int DBObjectMap::get_xattrs(const ghobject_t &oid,
+			    const set<string> &to_get,
+			    map<string, bufferlist> *out)
+{
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  return hdr ? db->get(xattr_prefix(hdr), to_get, out) : -ENOENT;
+}
+
+// Collect the names of all xattrs stored for oid into *out.  Returns -ENOENT
+// without a header, -EINVAL without an iterator, else the iterator's status.
+int DBObjectMap::get_all_xattrs(const ghobject_t &oid, set<string> *out) {
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  if (!hdr)
+    return -ENOENT;
+  KeyValueDB::Iterator xit = db->get_iterator(xattr_prefix(hdr));
+  if (!xit)
+    return -EINVAL;
+  for (xit->seek_to_first(); !xit->status() && xit->valid(); xit->next())
+    out->insert(xit->key());
+  return xit->status();
+}
+
+// Write the pairs in to_set into oid's xattr table, creating oid's header if
+// needed.  -EINVAL without a header, 0 if check_spos() is true, else submit's result.
+int DBObjectMap::set_xattrs(const ghobject_t &oid, const map<string, bufferlist> &to_set,
+			    const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_create_map_header(oid_lock, oid, txn);
+  if (!hdr)
+    return -EINVAL;
+  if (check_spos(oid, hdr, spos))
+    return 0;
+  txn->set(xattr_prefix(hdr), to_set);
+  return db->submit_transaction(txn);
+}
+
+// Remove the xattrs named in to_remove from oid's xattr table.
+// -ENOENT without a header, 0 if check_spos() is true, else submit's result.
+int DBObjectMap::remove_xattrs(const ghobject_t &oid, const set<string> &to_remove,
+			       const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  MapHeaderLock oid_lock(this, oid);
+  Header hdr = lookup_map_header(oid_lock, oid);
+  if (!hdr)
+    return -ENOENT;
+  if (check_spos(oid, hdr, spos))
+    return 0;
+  txn->rmkeys(xattr_prefix(hdr), to_remove);
+  return db->submit_transaction(txn);
+}
+
+// ONLY USED FOR TESTING: clone in which oid and target end up as children of oid's old header.
+// Marks the state as legacy (the version itself is not changed) so asserts such as the one in rm_keys() pass.
+int DBObjectMap::legacy_clone(const ghobject_t &oid,
+		       const ghobject_t &target,
+		       const SequencerPosition *spos)
+{
+  state.legacy = true;
+
+  if (oid == target)
+    return 0;
+
+  MapHeaderLock _l1(this, std::min(oid, target));  // the smaller ghobject is always locked first
+  MapHeaderLock _l2(this, std::max(oid, target));
+  MapHeaderLock *lsource, *ltarget;
+  if (oid > target) {
+    lsource = &_l2;
+    ltarget= &_l1;
+  } else {
+    lsource = &_l1;
+    ltarget= &_l2;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    Header destination = lookup_map_header(*ltarget, target);
+    if (destination) {  // target already mapped: release that mapping first
+      if (check_spos(target, destination, spos))
+	return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, target, destination, t);
+      _clear(destination, t);  // NOTE(review): the result is not checked
+    }
+  }
+
+  Header parent = lookup_map_header(*lsource, oid);
+  if (!parent)
+    return db->submit_transaction(t);
+
+  Header source = generate_new_header(oid, parent);
+  Header destination = generate_new_header(target, parent);
+  if (spos)
+    destination->spos = *spos;
+
+  parent->num_children = 2;  // exactly the two headers generated above
+  set_header(parent, t);
+  set_map_header(*lsource, oid, *source, t);
+  set_map_header(*ltarget, target, *destination, t);
+
+  map<string, bufferlist> to_set;
+  KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(parent));
+  for (xattr_iter->seek_to_first();
+       xattr_iter->valid();
+       xattr_iter->next())
+    to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+  t->set(xattr_prefix(source), to_set);
+  t->set(xattr_prefix(destination), to_set);
+  t->rmkeys_by_prefix(xattr_prefix(parent));  // the xattrs now live on the two children only
+  return db->submit_transaction(t);
+}
+
+// Make target an independent copy of oid: the user header, the xattrs and
+// every key are copied into a freshly generated header for target.  An
+// existing target mapping is released first; if oid has no header, target is
+// left without one.  Returns 0, a negative error, or submit's result.
+int DBObjectMap::clone(const ghobject_t &oid,
+		       const ghobject_t &target,
+		       const SequencerPosition *spos)
+{
+  if (oid == target)
+    return 0;
+
+  // Locks are taken in ghobject order (min first), presumably so that two
+  // concurrent clones always acquire them in the same order.
+  MapHeaderLock _l1(this, std::min(oid, target));
+  MapHeaderLock _l2(this, std::max(oid, target));
+  MapHeaderLock *lsource = (oid > target) ? &_l2 : &_l1;
+  MapHeaderLock *ltarget = (oid > target) ? &_l1 : &_l2;
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    Header destination = lookup_map_header(*ltarget, target);
+    if (destination) {
+      if (check_spos(target, destination, spos))
+	return 0;
+      destination->num_children--;
+      remove_map_header(*ltarget, target, destination, t);
+      // Propagate failures like clear() does instead of submitting anyway.
+      int r = _clear(destination, t);
+      if (r < 0)
+	return r;
+    }
+  }
+
+  Header source = lookup_map_header(*lsource, oid);
+  if (!source)
+    return db->submit_transaction(t);
+
+  Header destination = generate_new_header(target, Header());
+  if (spos)
+    destination->spos = *spos;
+  set_map_header(*ltarget, target, *destination, t);
+  bufferlist bl;
+  int r = _get_header(source, &bl);
+  if (r < 0)
+    return r;
+  _set_header(destination, bl, t);
+
+  map<string, bufferlist> to_set;
+  KeyValueDB::Iterator xattr_iter = db->get_iterator(xattr_prefix(source));
+  if (!xattr_iter)  // same guard as the other xattr readers in this file
+    return -EINVAL;
+  for (xattr_iter->seek_to_first(); xattr_iter->valid(); xattr_iter->next())
+    to_set.insert(make_pair(xattr_iter->key(), xattr_iter->value()));
+  t->set(xattr_prefix(destination), to_set);
+
+  map<string, bufferlist> to_write;
+  ObjectMapIterator iter = _get_iterator(source);
+  for (iter->seek_to_first() ; iter->valid() ; iter->next()) {
+    if (iter->status())
+      return iter->status();
+    to_write[iter->key()] = iter->value();
+  }
+  t->set(user_prefix(destination), to_write);
+  return db->submit_transaction(t);
+}
+// Re-key every HOBJECT_TO_SEQ entry that is_buggy_ghobject_key_v1() flags, at most 300 rewrites per transaction, then record version 2.  Returns 0 or a negative error.
+int DBObjectMap::upgrade_to_v2()
+{
+  dout(1) << __func__ << " start" << dendl;
+  KeyValueDB::Iterator iter = db->get_iterator(HOBJECT_TO_SEQ);
+  iter->seek_to_first();
+  while (iter->valid()) {  // one pass of this loop builds and submits one batch
+    unsigned count = 0;
+    KeyValueDB::Transaction t = db->get_transaction();
+    set<string> remove;
+    map<string, bufferlist> add;
+    for (;
+        iter->valid() && count < 300;
+        iter->next()) {
+      dout(20) << __func__ << " key is " << iter->key() << dendl;
+      int r = is_buggy_ghobject_key_v1(cct, iter->key());
+      if (r < 0) {
+	derr << __func__ << " bad key '" << iter->key() << "'" << dendl;
+	return r;
+      }
+      if (!r) {
+	dout(20) << __func__ << " " << iter->key() << " ok" << dendl;
+	continue;  // not flagged: left as is and not counted towards the batch
+      }
+
+      // decode header to get oid
+      _Header hdr;
+      bufferlist bl = iter->value();
+      auto bliter = bl.cbegin();
+      hdr.decode(bliter);
+
+      string newkey(ghobject_key(hdr.oid));
+      dout(20) << __func__ << " " << iter->key() << " -> " << newkey << dendl;
+      add[newkey] = iter->value();
+      remove.insert(iter->key());
+      ++count;
+    }
+
+    if (!remove.empty()) {
+      dout(20) << __func__ << " updating " << remove.size() << " keys" << dendl;
+      t->rmkeys(HOBJECT_TO_SEQ, remove);
+      t->set(HOBJECT_TO_SEQ, add);
+      int r = db->submit_transaction(t);
+      if (r < 0)
+	return r;
+    }
+  }
+
+  state.v = 2;
+
+  set_state();  // persists the new version synchronously
+  return 0;
+}
+
+// Persist the in-memory state synchronously while holding header_lock; the
+// write must succeed (asserted).
+void DBObjectMap::set_state() {
+  std::lock_guard l{header_lock};
+  KeyValueDB::Transaction txn = db->get_transaction();
+  write_state(txn);
+  const int ret = db->submit_transaction_sync(txn);
+  ceph_assert(ret == 0);
+  dout(1) << __func__ << " done" << dendl;
+}
+
+// Load the persisted state (GLOBAL_STATE_KEY under SYS_PREFIX) into `state`;
+// if it is absent, initialise state for a new store.  Returns a db error or 0.
+int DBObjectMap::get_state() {
+  const set<string> wanted{GLOBAL_STATE_KEY};
+  map<string, bufferlist> found;
+  const int r = db->get(SYS_PREFIX, wanted, &found);
+  if (r < 0)
+    return r;
+  if (found.empty()) {
+    // New store
+    state.v = State::CUR_VERSION;
+    state.seq = 1;
+    state.legacy = false;
+  } else {
+    auto bliter = found.begin()->second.cbegin();
+    state.decode(bliter);
+  }
+  return 0;
+}
+
+/// Load state, upgrade a v1 store if do_upgrade allows it, then run check().
+/// Returns 0, -ENOTSUP (too old / upgrade refused), -EINVAL or a propagated error.
+int DBObjectMap::init(bool do_upgrade)
+{
+  int ret = get_state();
+  if (ret < 0)
+    return ret;
+  if (state.v < 1) {
+    dout(1) << "DBObjectMap is *very* old; upgrade to an older version first"
+	    << dendl;
+    return -ENOTSUP;
+  }
+  if (state.v < 2) { // Needs upgrade
+    if (!do_upgrade) {
+      dout(1) << "DBObjectMap requires an upgrade,"
+	      << " set filestore_update_to" << dendl;
+      return -ENOTSUP;
+    }
+    int r = upgrade_to_v2();
+    if (r < 0)
+      return r;
+  }
+  ostringstream ss;
+  int errors = check(ss, true);
+  if (errors) {
+    derr << ss.str() << dendl;
+    if (errors > 0)  // a negative count is logged but does not fail init
+      return -EINVAL;
+  }
+  dout(20) << "(init)dbobjectmap: seq is " << state.seq << dendl;
+  return 0;
+}
+
+/// Make prior operations durable; given oid, first record *spos in its header.
+int DBObjectMap::sync(const ghobject_t *oid,
+		      const SequencerPosition *spos) {
+  KeyValueDB::Transaction txn = db->get_transaction();
+  if (!oid) {
+    std::lock_guard l{header_lock};
+    write_state(txn);
+    return db->submit_transaction_sync(txn);
+  }
+
+  ceph_assert(spos);
+  MapHeaderLock hl(this, *oid);
+  Header header = lookup_map_header(hl, *oid);
+  if (header) {
+    dout(10) << "oid: " << *oid << " setting spos to "
+             << *spos << dendl;
+    header->spos = *spos;
+    set_map_header(hl, *oid, *header, txn);
+  }
+  /* It may appear that the tail of this path and the no-oid path above
+   * can be combined, but on this path the transaction must be submitted
+   * under *both* the MapHeaderLock and the full header_lock.
+   *
+   * See 2b63dd25fc1c73fa42e52e9ea4ab5a45dd9422a0 and bug 9891.
+   */
+  std::lock_guard l{header_lock};
+  write_state(txn);
+  return db->submit_transaction_sync(txn);
+}
+
+/// Stage State into _t, or write it in its own transaction if _t is null.
+int DBObjectMap::write_state(KeyValueDB::Transaction _t) {
+  ceph_assert(ceph_mutex_is_locked_by_me(header_lock));
+  dout(20) << "dbobjectmap: seq is " << state.seq << dendl;
+  const bool own_txn = !_t;
+  KeyValueDB::Transaction txn = own_txn ? db->get_transaction() : _t;
+  map<string, bufferlist> kv;
+  state.encode(kv[GLOBAL_STATE_KEY]);
+  txn->set(SYS_PREFIX, kv);
+  return own_txn ? db->submit_transaction(txn) : 0;
+}
+
+
+/// Look up oid's leaf header (cache, then DB); null if missing or get() fails.
+DBObjectMap::Header DBObjectMap::_lookup_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid)
+{
+  ceph_assert(l.get_locked() == oid);
+  _Header *raw = new _Header();
+  {
+    std::lock_guard cache_guard{cache_lock};
+    if (caches.lookup(oid, raw)) {
+      ceph_assert(!in_use.count(raw->seq));
+      in_use.insert(raw->seq);
+      return Header(raw, RemoveOnDelete(this));
+    }
+  }
+
+  bufferlist encoded;
+  const int rc = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &encoded);
+  if (rc < 0 || encoded.length() == 0) {
+    delete raw;  // never wrapped in a Header, so free it directly
+    return Header();
+  }
+
+  Header found(raw, RemoveOnDelete(this));
+  auto p = encoded.cbegin();
+  found->decode(p);
+  {
+    std::lock_guard cache_guard{cache_lock};
+    caches.add(oid, *found);
+  }
+
+  ceph_assert(!in_use.count(raw->seq));
+  in_use.insert(raw->seq);
+  return found;
+}
+
+/// Allocate the next seq for oid (copying parent's seq/spos) and write State.
+DBObjectMap::Header DBObjectMap::_generate_new_header(const ghobject_t &oid,
+						      Header parent)
+{
+  Header fresh(new _Header(), RemoveOnDelete(this));
+  fresh->seq = state.seq++;
+  fresh->oid = oid;
+  fresh->num_children = 1;
+  if (parent) {
+    fresh->parent = parent->seq;
+    fresh->spos = parent->spos;
+  }
+  ceph_assert(!in_use.count(fresh->seq));
+  in_use.insert(fresh->seq);
+  write_state();  // no transaction passed, so State is submitted on its own
+  return fresh;
+}
+
+/// Wait until input's parent seq is not in_use, load the parent header node
+/// from the DB and mark it in_use; aborts if the parent cannot be read.
+DBObjectMap::Header DBObjectMap::lookup_parent(Header input)
+{
+  std::unique_lock l{header_lock};
+  header_cond.wait(l, [&input, this] { return !in_use.count(input->parent); });
+
+  dout(20) << "lookup_parent: parent " << input->parent
+       << " for seq " << input->seq << dendl;
+
+  set<string> keys;
+  keys.insert(HEADER_KEY);
+  map<string, bufferlist> out;
+  int r = db->get(sys_parent_prefix(input), keys, &out);
+  if (r < 0 || out.empty()) {
+    // an unreadable or missing parent node is treated as fatal
+    ceph_abort();
+    return Header();
+  }
+
+  Header parent(new _Header(), RemoveOnDelete(this));
+  auto p = out.begin()->second.cbegin();
+  parent->decode(p);
+  ceph_assert(parent->seq == input->parent);
+  dout(20) << "lookup_parent: parent seq is " << parent->seq << " with parent "
+       << parent->parent << dendl;
+  in_use.insert(parent->seq);
+  return parent;
+}
+
+/// Return oid's leaf header, creating one and staging it in t if absent.
+DBObjectMap::Header DBObjectMap::lookup_create_map_header(
+  const MapHeaderLock &hl,
+  const ghobject_t &oid,
+  KeyValueDB::Transaction t)
+{
+  std::lock_guard guard{header_lock};
+  if (Header existing = _lookup_map_header(hl, oid))
+    return existing;
+  Header created = _generate_new_header(oid, Header());
+  set_map_header(hl, oid, *created, t);
+  return created;
+}
+
+/// Stage removal of every key space belonging to header's node into t.
+void DBObjectMap::clear_header(Header header, KeyValueDB::Transaction t)
+{
+  dout(20) << "clear_header: clearing seq " << header->seq << dendl;
+  t->rmkeys_by_prefix(user_prefix(header));
+  t->rmkeys_by_prefix(sys_prefix(header));
+  if (state.legacy)  // Needed when header.parent != 0
+    t->rmkeys_by_prefix(complete_prefix(header));
+  t->rmkeys_by_prefix(xattr_prefix(header));
+  const set<string> node_key{header_key(header->seq)};
+  t->rmkeys(USER_PREFIX, node_key);
+}
+
+/// Stage header's encoding under HEADER_KEY within its node's sys prefix.
+void DBObjectMap::set_header(Header header, KeyValueDB::Transaction t) {
+  dout(20) << "set_header: setting seq " << header->seq << dendl;
+  map<string, bufferlist> kv;
+  header->encode(kv[HEADER_KEY]);
+  t->set(sys_prefix(header), kv);
+}
+
+/// Stage removal of oid's leaf entry from HOBJECT_TO_SEQ in t and drop oid
+/// from the header cache (the cache changes immediately, not when t commits).
+void DBObjectMap::remove_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid,
+  Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "remove_map_header: removing " << header->seq
+	   << " oid " << oid << dendl;
+  const set<string> leaf_key{map_header_key(oid)};
+  t->rmkeys(HOBJECT_TO_SEQ, leaf_key);
+
+  std::lock_guard cache_guard{cache_lock};
+  caches.clear(oid);
+}
+
+/// Stage header as oid's leaf entry in HOBJECT_TO_SEQ and refresh the cache.
+void DBObjectMap::set_map_header(
+  const MapHeaderLock &l,
+  const ghobject_t &oid, _Header header,
+  KeyValueDB::Transaction t)
+{
+  ceph_assert(l.get_locked() == oid);
+  dout(20) << "set_map_header: setting " << header.seq
+	   << " oid " << oid << " parent seq "
+	   << header.parent << dendl;
+  map<string, bufferlist> leaf;
+  header.encode(leaf[map_header_key(oid)]);
+  t->set(HOBJECT_TO_SEQ, leaf);
+  // the cache is updated right away, independent of when t is submitted
+  std::lock_guard cache_guard{cache_lock};
+  caches.add(oid, header);
+}
+
+/// Decide whether the op at *spos is to be skipped for this header: true iff
+/// spos is non-null and *spos is not greater than header->spos.
+bool DBObjectMap::check_spos(const ghobject_t &oid,
+			     Header header,
+			     const SequencerPosition *spos)
+{
+  if (!spos || *spos > header->spos) {
+    if (spos)
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+	       << *spos << dendl;
+    else
+      dout(10) << "oid: " << oid << " not skipping op, *spos "
+	       << "empty" << dendl;
+    dout(10) << " > header.spos " << header->spos << dendl;
+    return false;
+  }
+  dout(10) << "oid: " << oid << " skipping op, *spos " << *spos
+	   << " <= header.spos " << header->spos << dendl;
+  return true;
+}
+
+/// Append the oid of every leaf header stored under HOBJECT_TO_SEQ to *out.
+int DBObjectMap::list_objects(vector<ghobject_t> *out) {
+  KeyValueDB::Iterator it = db->get_iterator(HOBJECT_TO_SEQ);
+  for (it->seek_to_first(); it->valid(); it->next()) {
+    _Header leaf;
+    bufferlist encoded = it->value();
+    auto p = encoded.cbegin();
+    leaf.decode(p);
+    out->push_back(leaf.oid);
+  }
+  return 0;
+}
+
+/// Append each leaf header followed by its ancestors; -ENOENT if a parent is missing.
+int DBObjectMap::list_object_headers(vector<_Header> *out)
+{
+  int error = 0;
+  KeyValueDB::Iterator it = db->get_iterator(HOBJECT_TO_SEQ);
+  for (it->seek_to_first(); it->valid(); it->next()) {
+    bufferlist bl = it->value();
+    auto leaf_p = bl.cbegin();
+    _Header header;
+    header.decode(leaf_p);
+    out->push_back(header);
+    while (header.parent) {  // follow parent links until a node has none
+      set<string> to_get;
+      to_get.insert(HEADER_KEY);
+      map<string, bufferlist> got;
+      db->get(sys_parent_prefix(header), to_get, &got);
+      if (got.empty()) {
+        dout(0) << "Missing: seq " << header.parent << dendl;
+        error = -ENOENT;
+        break;  // give up on this chain only; the outer scan continues
+      }
+      bl = got.begin()->second;
+      auto parent_p = bl.cbegin();
+      header.decode(parent_p);
+      out->push_back(header);
+    }
+  }
+  return error;
+}
+
+/// Print seq, parent, num_children and oid of a header, space separated.
+ostream& operator<<(ostream& out, const DBObjectMap::_Header& h)
+{
+  out << "seq=" << h.seq << " parent=" << h.parent;
+  out << " num_children=" << h.num_children << " ghobject=" << h.oid;
+  return out;
+}
+
+/// Move the omap stored for `from` to `to`, discarding whatever `to` held.
+/// Returns 0 right away when from == to or when check_spos() reports that
+/// the op at *spos is to be skipped; otherwise the submit result.
+int DBObjectMap::rename(const ghobject_t &from,
+		       const ghobject_t &to,
+		       const SequencerPosition *spos)
+{
+  if (from == to)
+    return 0;
+
+  // The smaller oid is always locked first (presumably to keep a single
+  // lock order across concurrent callers).
+  MapHeaderLock _l1(this, std::min(from, to));
+  MapHeaderLock _l2(this, std::max(from, to));
+  const bool from_is_larger = from > to;
+  MapHeaderLock &lsource = from_is_larger ? _l2 : _l1;
+  MapHeaderLock &ltarget = from_is_larger ? _l1 : _l2;
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  {
+    Header destination = lookup_map_header(ltarget, to);
+    if (destination) {
+      if (check_spos(to, destination, spos))
+        return 0;
+      destination->num_children--;
+      remove_map_header(ltarget, to, destination, t);
+      _clear(destination, t);
+    }
+  }
+
+  Header hdr = lookup_map_header(lsource, from);
+  if (hdr) {
+    // re-point the source header at the new oid
+    remove_map_header(lsource, from, hdr, t);
+    hdr->oid = to;
+    set_map_header(ltarget, to, *hdr, t);
+  }
+
+  return db->submit_transaction(t);
+}
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
new file mode 100644
index 000000000..444f21eb8
--- /dev/null
+++ b/src/os/DBObjectMap.h
@@ -0,0 +1,584 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef DBOBJECTMAP_DB_H
+#define DBOBJECTMAP_DB_H
+
+#include "include/buffer_fwd.h"
+#include <set>
+#include <map>
+#include <string>
+
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/ObjectMap.h"
+#include "kv/KeyValueDB.h"
+#include "osd/osd_types.h"
+#include "common/ceph_mutex.h"
+#include "common/simple_cache.hpp"
+#include <boost/optional/optional_io.hpp>
+
+#include "SequencerPosition.h"
+
+/**
+ * DBObjectMap: Implements ObjectMap in terms of KeyValueDB
+ *
+ * Prefix space structure:
+ *
+ * @see complete_prefix
+ * @see user_prefix
+ * @see sys_prefix
+ *
+ * - HOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->header.seq and
+ *                   corresponding omap header
+ * - SYS_PREFIX: GLOBAL_STATE_KEY - contains next seq number
+ *                                  @see State
+ *                                  @see write_state
+ *                                  @see init
+ *                                  @see generate_new_header
+ * - USER_PREFIX + header_key(header->seq) + USER_PREFIX
+ *              : key->value for header->seq
+ * - USER_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below
+ * - USER_PREFIX + header_key(header->seq) + XATTR_PREFIX: xattrs
+ * - USER_PREFIX + header_key(header->seq) + SYS_PREFIX
+ *              : USER_HEADER_KEY - omap header for header->seq
+ *              : HEADER_KEY - encoding of header for header->seq
+ *
+ * For each node (represented by a header), we
+ * store three mappings: the key mapping, the complete mapping, and the parent.
+ * The complete mapping (COMPLETE_PREFIX space) is key->key.  Each x->y entry in
+ * this mapping indicates that the key mapping contains all entries on [x,y).
+ * Note, the max string is represented by "", so ""->"" indicates that the parent
+ * is unnecessary (@see rm_keys).  When looking up a key not contained in
+ * the complete set, we have to check the parent if we don't find it in the
+ * key set.  During rm_keys, we copy keys from the parent and update the
+ * complete set to reflect the change (@see rm_keys).
+ */
+class DBObjectMap : public ObjectMap {
+public:
+
+  KeyValueDB *get_db() override { return db.get(); }  ///< raw pointer to the backing KeyValueDB
+
+  /**
+   * Serializes access to the next seq number (state.seq) as well as the in_use set
+   */
+  ceph::mutex header_lock = ceph::make_mutex("DBOBjectMap");
+  ceph::condition_variable header_cond;
+  ceph::condition_variable map_header_cond;
+
+  /**
+   * Set of header seqs currently in use
+   */
+  std::set<uint64_t> in_use;
+  std::set<ghobject_t> map_header_in_use;
+
+  /**
+   * Takes the map_header_in_use entry in constructor, releases in
+   * destructor
+   */
+  class MapHeaderLock {
+    DBObjectMap *db;
+    boost::optional<ghobject_t> locked;  ///< oid currently held, if any
+    MapHeaderLock(const MapHeaderLock &) = delete;
+    MapHeaderLock &operator=(const MapHeaderLock &) = delete;
+  public:
+    explicit MapHeaderLock(DBObjectMap *db) : db(db) {}
+    /// Block until oid is absent from map_header_in_use, then claim it.
+    MapHeaderLock(DBObjectMap *db, const ghobject_t &oid) : db(db), locked(oid) {
+      std::unique_lock guard{db->header_lock};
+      db->map_header_cond.wait(guard, [db, this] {
+        return db->map_header_in_use.count(*locked) == 0;
+      });
+      db->map_header_in_use.insert(*locked);
+    }
+
+    const ghobject_t &get_locked() const {
+      ceph_assert(locked);
+      return *locked;
+    }
+
+    /// Exchange held oids with o; both must belong to the same DBObjectMap.
+    void swap(MapHeaderLock &o) {
+      ceph_assert(db == o.db);
+      // centos6's boost optional doesn't seem to have swap :(
+      const boost::optional<ghobject_t> theirs = o.locked;
+      o.locked = locked;
+      locked = theirs;
+    }
+
+    ~MapHeaderLock() {
+      if (!locked)
+        return;
+      std::lock_guard guard{db->header_lock};
+      ceph_assert(db->map_header_in_use.count(*locked));
+      db->map_header_cond.notify_all();
+      db->map_header_in_use.erase(*locked);
+    }
+  };
+
+  DBObjectMap(CephContext* cct, KeyValueDB *db)
+    : ObjectMap(cct, db),
+      caches(cct->_conf->filestore_omap_header_cache_size)  // header cache sized from config (presumably max entries)
+    {}
+
+  int set_keys(
+    const ghobject_t &oid,
+    const std::map<std::string, ceph::buffer::list> &set,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int set_header(
+    const ghobject_t &oid,
+    const ceph::buffer::list &bl,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int get_header(
+    const ghobject_t &oid,
+    ceph::buffer::list *bl
+    ) override;
+
+  int clear(
+    const ghobject_t &oid,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int clear_keys_header(
+    const ghobject_t &oid,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int rm_keys(
+    const ghobject_t &oid,
+    const std::set<std::string> &to_clear,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int get(
+    const ghobject_t &oid,
+    ceph::buffer::list *header,
+    std::map<std::string, ceph::buffer::list> *out
+    ) override;
+
+  int get_keys(
+    const ghobject_t &oid,
+    std::set<std::string> *keys
+    ) override;
+
+  int get_values(
+    const ghobject_t &oid,
+    const std::set<std::string> &keys,
+    std::map<std::string, ceph::buffer::list> *out
+    ) override;
+
+  int check_keys(
+    const ghobject_t &oid,
+    const std::set<std::string> &keys,
+    std::set<std::string> *out
+    ) override;
+
+  int get_xattrs(
+    const ghobject_t &oid,
+    const std::set<std::string> &to_get,
+    std::map<std::string, ceph::buffer::list> *out
+    ) override;
+
+  int get_all_xattrs(
+    const ghobject_t &oid,
+    std::set<std::string> *out
+    ) override;
+
+  int set_xattrs(
+    const ghobject_t &oid,
+    const std::map<std::string, ceph::buffer::list> &to_set,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int remove_xattrs(
+    const ghobject_t &oid,
+    const std::set<std::string> &to_remove,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int clone(
+    const ghobject_t &oid,
+    const ghobject_t &target,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int rename(
+    const ghobject_t &from,
+    const ghobject_t &to,
+    const SequencerPosition *spos=0
+    ) override;
+
+  int legacy_clone(
+    const ghobject_t &oid,
+    const ghobject_t &target,
+    const SequencerPosition *spos=0
+    ) override;
+
+  /// Read initial state from backing store
+  int get_state();
+  /// Write current state settings to DB
+  void set_state();
+  /// Read initial state and upgrade or initialize state
+  int init(bool upgrade = false);
+
+  /// Upgrade store to current version
+  int upgrade_to_v2();
+
+  /// Consistency check, debug, there must be no parallel writes
+  int check(std::ostream &out, bool repair = false, bool force = false) override;
+
+  /// Ensure that all previous operations are durable
+  int sync(const ghobject_t *oid=0, const SequencerPosition *spos=0) override;
+
+  void compact() override {  // forwards to the backing KeyValueDB
+    ceph_assert(db);  // a backing DB must be attached
+    db->compact();
+  }
+
+  /// Util, get all objects, there must be no other concurrent access
+  int list_objects(std::vector<ghobject_t> *objs ///< [out] objects
+    );
+
+  struct _Header;
+  // Util, get all object headers, there must be no other concurrent access
+  int list_object_headers(std::vector<_Header> *out ///< [out] headers
+    );
+
+  ObjectMapIterator get_iterator(const ghobject_t &oid) override;
+
+  static const std::string USER_PREFIX;
+  static const std::string XATTR_PREFIX;
+  static const std::string SYS_PREFIX;
+  static const std::string COMPLETE_PREFIX;
+  static const std::string HEADER_KEY;
+  static const std::string USER_HEADER_KEY;
+  static const std::string GLOBAL_STATE_KEY;
+  static const std::string HOBJECT_TO_SEQ;
+
+  /// Legacy
+  static const std::string LEAF_PREFIX;
+  static const std::string REVERSE_LEAF_PREFIX;
+
+  /// persistent state for store @see generate_new_header
+  struct State {
+    static const __u8 CUR_VERSION = 3;  // version assigned to a brand-new store by get_state()
+    __u8 v;  // store version; init() refuses v < 1 and upgrades v < 2
+    uint64_t seq;  // next header seq to hand out (see _generate_new_header)
+    // legacy is false when complete regions never used
+    bool legacy;
+    State() : v(0), seq(1), legacy(false) {}
+    explicit State(uint64_t seq) : v(0), seq(seq), legacy(false) {}
+
+    void encode(ceph::buffer::list &bl) const {  // always writes encoding version 3
+      ENCODE_START(3, 1, bl);
+      encode(v, bl);
+      encode(seq, bl);
+      encode(legacy, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      DECODE_START(3, bl);
+      if (struct_v >= 2)  // v is only present from encoding version 2 on
+	decode(v, bl);
+      else
+	v = 0;
+      decode(seq, bl);
+      if (struct_v >= 3)  // legacy is only present from encoding version 3 on
+	decode(legacy, bl);
+      else
+	legacy = false;
+      DECODE_FINISH(bl);
+    }
+
+    void dump(ceph::Formatter *f) const {
+      f->dump_unsigned("v", v);
+      f->dump_unsigned("seq", seq);
+      f->dump_bool("legacy", legacy);
+    }
+
+    static void generate_test_instances(std::list<State*> &o) {  // pushes heap-allocated instances into o
+      o.push_back(new State(0));
+      o.push_back(new State(20));
+    }
+  } state;
+
+  struct _Header {
+    uint64_t seq;  // this node's sequence number
+    uint64_t parent;  // seq of the parent node; 0 when there is none
+    uint64_t num_children;  // initialized to 1 by the constructor
+
+    ghobject_t oid;
+
+    SequencerPosition spos;  // compared against incoming positions in check_spos()
+
+    void encode(ceph::buffer::list &bl) const {
+      coll_t unused;  // default-constructed placeholder that is still encoded
+      ENCODE_START(2, 1, bl);
+      encode(seq, bl);
+      encode(parent, bl);
+      encode(num_children, bl);
+      encode(unused, bl);
+      encode(oid, bl);
+      encode(spos, bl);
+      ENCODE_FINISH(bl);
+    }
+
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      coll_t unused;  // decoded and then discarded
+      DECODE_START(2, bl);
+      decode(seq, bl);
+      decode(parent, bl);
+      decode(num_children, bl);
+      decode(unused, bl);
+      decode(oid, bl);
+      if (struct_v >= 2)  // spos is only present from encoding version 2 on
+	decode(spos, bl);
+      DECODE_FINISH(bl);
+    }
+
+    void dump(ceph::Formatter *f) const {  // note: spos is not dumped
+      f->dump_unsigned("seq", seq);
+      f->dump_unsigned("parent", parent);
+      f->dump_unsigned("num_children", num_children);
+      f->dump_stream("oid") << oid;
+    }
+
+    static void generate_test_instances(std::list<_Header*> &o) {  // pushes heap-allocated instances into o
+      o.push_back(new _Header);
+      o.push_back(new _Header);
+      o.back()->parent = 20;
+      o.back()->seq = 30;
+    }
+
+    size_t length() {  // size of the in-memory struct, not of its encoding
+      return sizeof(_Header);
+    }
+
+    _Header() : seq(0), parent(0), num_children(1) {}
+  };
+
+  /// String munging (public for testing)
+  static std::string ghobject_key(const ghobject_t &oid);
+  static std::string ghobject_key_v0(coll_t c, const ghobject_t &oid);
+  static int is_buggy_ghobject_key_v1(CephContext* cct,
+				      const std::string &in);
+private:
+  /// Implicit lock on Header->seq
+  typedef std::shared_ptr<_Header> Header;
+  ceph::mutex cache_lock = ceph::make_mutex("DBObjectMap::CacheLock");
+  SimpleLRU<ghobject_t, _Header> caches;
+
+  std::string map_header_key(const ghobject_t &oid);
+  std::string header_key(uint64_t seq);
+  std::string complete_prefix(Header header);
+  std::string user_prefix(Header header);
+  std::string sys_prefix(Header header);
+  std::string xattr_prefix(Header header);
+  std::string sys_parent_prefix(_Header header);
+  std::string sys_parent_prefix(Header header) {  // convenience overload for a shared Header
+    return sys_parent_prefix(*header);  // forwards to the by-value _Header overload
+  }
+
+  class EmptyIteratorImpl : public ObjectMapIteratorImpl {  // iterator over nothing: never valid
+  public:
+    int seek_to_first() override { return 0; }
+    int seek_to_last() { return 0; }  // not marked override, unlike the other seeks
+    int upper_bound(const std::string &after) override { return 0; }
+    int lower_bound(const std::string &to) override { return 0; }
+    bool valid() override { return false; }  // there is never a current entry
+    int next() override { ceph_abort(); return 0; }  // aborts: must not be called since valid() is false
+    std::string key() override { ceph_abort(); return ""; }  // aborts, same reason
+    ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); }  // aborts, same reason
+    int status() override { return 0; }
+  };
+
+
+  /// Iterator
+  class DBObjectMapIteratorImpl : public ObjectMapIteratorImpl {
+  public:
+    DBObjectMap *map;  ///< the DBObjectMap that created this iterator (see _get_iterator)
+
+    /// NOTE: implicit lock hlock->get_locked() when returned out of the class
+    MapHeaderLock hlock;
+    /// NOTE: implicit lock on header->seq AND for all ancestors
+    Header header;
+
+    /// parent_iter == NULL iff no parent
+    std::shared_ptr<DBObjectMapIteratorImpl> parent_iter;
+    KeyValueDB::Iterator key_iter;
+    KeyValueDB::Iterator complete_iter;
+
+    /// cur_iter points to currently valid iterator
+    std::shared_ptr<ObjectMapIteratorImpl> cur_iter;
+    int r;  ///< starts at 0; NOTE(review): presumably what status() reports -- confirm in the .cc
+
+    /// init() called, key_iter, complete_iter, parent_iter filled in
+    bool ready;
+    /// past end
+    bool invalid;
+
+    DBObjectMapIteratorImpl(DBObjectMap *map, Header header) :
+      map(map), hlock(map), header(header), r(0), ready(false), invalid(true) {}  // hlock starts without an oid
+    int seek_to_first() override;
+    int seek_to_last();
+    int upper_bound(const std::string &after) override;
+    int lower_bound(const std::string &to) override;
+    bool valid() override;
+    int next() override;
+    std::string key() override;
+    ceph::buffer::list value() override;
+    int status() override;
+
+    bool on_parent() {  // true when cur_iter is the parent's iterator
+      return cur_iter == parent_iter;
+    }
+
+    /// skips to next valid parent entry
+    int next_parent();
+    
+    /// first parent() >= to
+    int lower_bound_parent(const std::string &to);
+
+    /**
+     * Tests whether to_test is in complete region
+     *
+     * postcondition: complete_iter will be max s.t. complete_iter->value > to_test
+     */
+    int in_complete_region(const std::string &to_test, ///< [in] key to test
+			   std::string *begin,         ///< [out] beginning of region
+			   std::string *end            ///< [out] end of region
+      ); ///< @returns true if to_test is in the complete region, else false
+
+  private:
+    int init();
+    bool valid_parent();
+    int adjust();
+  };
+
+  typedef std::shared_ptr<DBObjectMapIteratorImpl> DBObjectMapIterator;
+  DBObjectMapIterator _get_iterator(Header header) {  // new iterator over header, bound to this map
+    return std::make_shared<DBObjectMapIteratorImpl>(this, header);
+  }
+
+  /// sys
+
+  /// Removes node corresponding to header
+  void clear_header(Header header, KeyValueDB::Transaction t);
+
+  /// Set node containing input to new contents
+  void set_header(Header input, KeyValueDB::Transaction t);
+
+  /// Remove leaf node corresponding to oid
+  void remove_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid,
+    Header header,
+    KeyValueDB::Transaction t);
+
+  /// Set leaf node for oid to the value of header
+  void set_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid, _Header header,
+    KeyValueDB::Transaction t);
+
+  /// Returns true if the op at *spos should be skipped (*spos <= header->spos)
+  bool check_spos(const ghobject_t &oid,
+		  Header header,
+		  const SequencerPosition *spos);
+
+  /// Lookup or create header for oid
+  Header lookup_create_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid,
+    KeyValueDB::Transaction t);
+
+  /**
+   * Generate new header for oid with new seq number
+   *
+   * Has the side effect of synchronously saving the new DBObjectMap state
+   */
+  Header _generate_new_header(const ghobject_t &oid, Header parent);
+  Header generate_new_header(const ghobject_t &oid, Header parent) {  // locking wrapper
+    std::lock_guard l{header_lock};  // _generate_new_header touches state.seq and in_use
+    return _generate_new_header(oid, parent);
+  }
+
+  /// Lookup leaf header for oid
+  Header _lookup_map_header(
+    const MapHeaderLock &l,
+    const ghobject_t &oid);
+  Header lookup_map_header(  // locking wrapper around _lookup_map_header
+    const MapHeaderLock &l2,
+    const ghobject_t &oid) {
+    std::lock_guard l{header_lock};  // _lookup_map_header reads and updates in_use
+    return _lookup_map_header(l2, oid);
+  }
+
+  /// Lookup header node for input
+  Header lookup_parent(Header input);
+
+
+  /// Helpers
+  int _get_header(Header header, ceph::buffer::list *bl);
+
+  /// Scan keys in header into out_keys and out_values (if nonnull)
+  int scan(Header header,
+	   const std::set<std::string> &in_keys,
+	   std::set<std::string> *out_keys,
+	   std::map<std::string, ceph::buffer::list> *out_values);
+
+  /// Remove header and all related prefixes
+  int _clear(Header header,
+	     KeyValueDB::Transaction t);
+
+  /* Scan complete region bumping *begin to the beginning of any
+   * containing region and adding all complete region keys between
+   * the updated begin and end to the complete_keys_to_remove set */
+  int merge_new_complete(DBObjectMapIterator &iter,
+			 std::string *begin,
+			 const std::string &end,
+			 std::set<std::string> *complete_keys_to_remove);
+
+  /// Writes out State (mainly next_seq)
+  int write_state(KeyValueDB::Transaction _t =
+		  KeyValueDB::Transaction());
+
+  /// Copies header entry from parent @see rm_keys
+  int copy_up_header(Header header,
+		     KeyValueDB::Transaction t);
+
+  /// Sets header @see set_header
+  void _set_header(Header header, const ceph::buffer::list &bl,
+		   KeyValueDB::Transaction t);
+
+  /**
+   * Removes header seq lock and possibly object lock
+   * once Header is out of scope
+   * @see lookup_parent
+   * @see generate_new_header
+   */
+  class RemoveOnDelete {
+  public:
+    DBObjectMap *db;
+    explicit RemoveOnDelete(DBObjectMap *db) : db(db) {}
+    /// Deleter for Header: drop header->seq from in_use, wake waiters, free it.
+    void operator() (_Header *header) {
+      std::lock_guard guard{db->header_lock};
+      ceph_assert(db->in_use.count(header->seq));
+      db->in_use.erase(header->seq);
+      db->header_cond.notify_all();
+      delete header;
+    }
+  };
+  friend class RemoveOnDelete;
+};
+WRITE_CLASS_ENCODER(DBObjectMap::_Header)
+WRITE_CLASS_ENCODER(DBObjectMap::State)
+
+std::ostream& operator<<(std::ostream& out, const DBObjectMap::_Header& h);
+
+#endif
diff --git a/src/os/FuseStore.cc b/src/os/FuseStore.cc
new file mode 100644
index 000000000..a1a9aa6d0
--- /dev/null
+++ b/src/os/FuseStore.cc
@@ -0,0 +1,1287 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/compat.h"
+#include "include/ceph_fuse.h"
+#include "FuseStore.h"
+#include "os/ObjectStore.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+
+#include <fuse_lowlevel.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>           /* Definition of AT_* constants */
+#include <sys/stat.h>
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#define dout_context store->cct
+#define dout_subsys ceph_subsys_fuse
+#include "common/debug.h"
+#undef dout_prefix
+#define dout_prefix *_dout << "fuse "
+
+using std::less;
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+
+// some fuse-y bits of state
+//
+// Holds the libfuse handles that FuseStore::start() creates and
+// FuseStore::loop()/stop() later use.
+struct fs_info {
+  struct fuse_args args;   // argument vector handed to fuse_parse_cmdline()
+  struct fuse *f;          // handle returned by fuse_new()
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+  struct fuse_chan *ch;    // channel returned by fuse_mount() (FUSE < 3.0 only)
+#endif
+  char *mountpoint;        // mount point as reported by fuse_parse_cmdline()
+};
+
+/**
+ * Attach an OpenFile for path p to fi->fh.
+ *
+ * If p is already present in open_files the existing entry is reused and
+ * its reference count bumped.  Otherwise f is called to load the contents;
+ * a negative result from f is returned unchanged and nothing is registered.
+ *
+ * @param p  path used as the key in open_files
+ * @param fi fuse file info whose fh receives the OpenFile pointer
+ * @param f  loader filling the supplied bufferlist, returns < 0 on error
+ * @return 0 on success, otherwise the negative value returned by f
+ */
+int FuseStore::open_file(string p, struct fuse_file_info *fi,
+			 std::function<int(bufferlist *bl)> f)
+{
+  // Already open: hand out the cached entry and take another reference.
+  auto it = open_files.find(p);
+  if (it != open_files.end()) {
+    OpenFile *existing = it->second;
+    fi->fh = reinterpret_cast<uint64_t>(existing);
+    ++existing->ref;
+    return 0;
+  }
+
+  // First open: let the caller-supplied loader fetch the contents.
+  bufferlist contents;
+  const int r = f(&contents);
+  if (r < 0) {
+    return r;
+  }
+
+  OpenFile *fresh = new OpenFile;
+  fresh->path = p;
+  fresh->bl = std::move(contents);
+  ++fresh->ref;
+  open_files[p] = fresh;
+  fi->fh = reinterpret_cast<uint64_t>(fresh);
+  return 0;
+}
+
+/// Remember the store and mount point and allocate the fuse state block.
+FuseStore::FuseStore(ObjectStore *s, string p)
+  : store(s),
+    mount_point(p),
+    info(new fs_info()),
+    fuse_thread(this)
+{
+}
+
+/// Free the fuse state block allocated by the constructor.
+FuseStore::~FuseStore()
+{
+  delete info;
+}
+
+/*
+ * Path layout understood by parse_fn():
+ *
+ * / - root directory
+ * type - objectstore type
+ * $cid/
+ * $cid/bitwise_hash_start - lowest hash value
+ * $cid/bitwise_hash_end - highest hash value
+ * $cid/bitwise_hash_bits - how many bits are significant
+ * $cid/pgmeta/ - pgmeta object
+ * $cid/all/ - all objects
+ * $cid/all/$obj/
+ * $cid/all/$obj/bitwise_hash
+ * $cid/all/$obj/data
+ * $cid/all/$obj/omap_header
+ * $cid/all/$obj/omap/$key
+ * $cid/all/$obj/attr/$name
+ * $cid/by_bitwise_hash/$hash/$bits/$obj - all objects with this (bitwise) hash (prefix)
+ */
+// Path kinds returned by parse_fn(); negative errno values are used for
+// paths that do not parse.
+enum {
+  FN_ROOT = 1,            // empty path
+  FN_TYPE,                // type
+  FN_COLLECTION,          // $cid
+  FN_HASH_START,          // $cid/bitwise_hash_start
+  FN_HASH_END,            // $cid/bitwise_hash_end
+  FN_HASH_BITS,           // $cid/bitwise_hash_bits
+  FN_OBJECT,              // an object directory ($cid/pgmeta, .../$obj)
+  FN_OBJECT_HASH,         // <object>/bitwise_hash
+  FN_OBJECT_DATA,         // <object>/data
+  FN_OBJECT_OMAP_HEADER,  // <object>/omap_header
+  FN_OBJECT_OMAP,         // <object>/omap
+  FN_OBJECT_OMAP_VAL,     // <object>/omap/$key
+  FN_OBJECT_ATTR,         // <object>/attr
+  FN_OBJECT_ATTR_VAL,     // <object>/attr/$name
+  FN_ALL,                 // $cid/all
+  FN_HASH_DIR,            // $cid/by_bitwise_hash and $cid/by_bitwise_hash/$hash
+  FN_HASH_VAL,            // $cid/by_bitwise_hash/$hash/$bits
+};
+
+/**
+ * Split a FUSE path into '/'-separated components and classify it.
+ *
+ * @param cct        context used for debug logging
+ * @param path       path relative to the mount point
+ * @param cid        [out] collection parsed from the first component
+ * @param oid        [out] object, set for object-level paths
+ * @param key        [out] omap key or attr name for *_VAL paths
+ * @param hash       [out] hash prefix shifted into the high bits of a
+ *                   32-bit value, set for by_bitwise_hash/$hash/$bits paths
+ * @param hash_bits  [out] $bits for by_bitwise_hash/$hash/$bits paths
+ * @return one of the FN_* kinds, or -ENOENT if the path is not recognised
+ */
+static int parse_fn(CephContext* cct, const char *path, coll_t *cid,
+		    ghobject_t *oid, string *key,
+		    uint32_t *hash, uint32_t *hash_bits)
+{
+  list<string> v;
+  for (const char *p = path; *p; ++p) {
+    if (*p == '/')
+      continue;
+    const char *e;
+    for (e = p + 1; *e && *e != '/'; e++) ;
+    string c(p, e-p);
+    v.push_back(c);
+    p = e;
+    if (!*p)
+      break;
+  }
+  ldout(cct, 10) << __func__ << " path " << path << " -> " << v << dendl;
+
+  if (v.empty())
+    return FN_ROOT;
+
+  if (v.front() == "type")
+    return FN_TYPE;
+
+  if (!cid->parse(v.front())) {
+    return -ENOENT;
+  }
+  if (v.size() == 1)
+    return FN_COLLECTION;
+  v.pop_front();
+
+  if (v.front() == "bitwise_hash_start")
+    return FN_HASH_START;
+  if (v.front() == "bitwise_hash_end")
+    return FN_HASH_END;
+  if (v.front() == "bitwise_hash_bits")
+    return FN_HASH_BITS;
+  if (v.front() == "pgmeta") {
+    spg_t pgid;
+    if (cid->is_pg(&pgid)) {
+      *oid = pgid.make_pgmeta_oid();
+      v.pop_front();
+      if (v.empty())
+	return FN_OBJECT;
+      goto do_object;
+    }
+    return -ENOENT;
+  }
+  if (v.front() == "all") {
+    v.pop_front();
+    if (v.empty())
+      return FN_ALL;
+    goto do_dir;
+  }
+  if (v.front() == "by_bitwise_hash") {
+    v.pop_front();
+    if (v.empty())
+      return FN_HASH_DIR;
+    // A 32-bit hash prefix is at most 8 hex digits.  Anything longer would
+    // make the shift computed below negative, which is undefined.
+    if (v.front().length() > 8)
+      return -ENOENT;
+    unsigned long hv, hm;
+    int r = sscanf(v.front().c_str(), "%lx", &hv);
+    if (r != 1)
+      return -ENOENT;
+    // left-align the prefix: each hex digit supplies 4 of the 32 bits
+    int shift = 32 - v.front().length() * 4;
+    v.pop_front();
+    if (v.empty())
+      return FN_HASH_DIR;
+    // hm is unsigned long, so it must be scanned with %lu
+    r = sscanf(v.front().c_str(), "%lu", &hm);
+    if (r != 1)
+      return -ENOENT;
+    if (hm < 1 || hm > 32)
+      return -ENOENT;
+    v.pop_front();
+    *hash = hv << shift;//hobject_t::_reverse_bits(hv << shift);
+    *hash_bits = hm;
+    if (v.empty())
+      return FN_HASH_VAL;
+    goto do_dir;
+  }
+  return -ENOENT;
+
+ do_dir:
+  {
+    // next component names the object
+    string o = v.front();
+    if (!oid->parse(o)) {
+      return -ENOENT;
+    }
+    v.pop_front();
+    if (v.empty())
+      return FN_OBJECT;
+  }
+
+ do_object:
+  // v.front() is the entry inside the object directory
+  if (v.front() == "data")
+    return FN_OBJECT_DATA;
+  if (v.front() == "omap_header")
+    return FN_OBJECT_OMAP_HEADER;
+  if (v.front() == "omap") {
+    v.pop_front();
+    if (v.empty())
+      return FN_OBJECT_OMAP;
+    *key = v.front();
+    v.pop_front();
+    if (v.empty())
+      return FN_OBJECT_OMAP_VAL;
+    return -ENOENT;
+  }
+  if (v.front() == "attr") {
+    v.pop_front();
+    if (v.empty())
+      return FN_OBJECT_ATTR;
+    *key = v.front();
+    v.pop_front();
+    if (v.empty())
+      return FN_OBJECT_ATTR_VAL;
+    return -ENOENT;
+  }
+  if (v.front() == "bitwise_hash")
+    return FN_OBJECT_HASH;
+  return -ENOENT;
+}
+
+
+/**
+ * FUSE getattr handler: fill *stbuf for the entry named by path.
+ *
+ * Directories get mode S_IFDIR|0700, everything else S_IFREG|0700; uid and
+ * gid are always 0.  st_size is set per entry kind (see the cases below).
+ *
+ * @return 0 on success, -ENOENT when the path, collection, object, key or
+ *         attribute does not exist, or another negative error from the store
+ */
+static int os_getattr(const char *path, struct stat *stbuf
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+                      , struct fuse_file_info *fi
+#endif
+                      )
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (t < 0)
+    return t;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  stbuf->st_size = 0;
+  stbuf->st_uid = 0;
+  stbuf->st_gid = 0;
+  stbuf->st_mode = S_IFREG | 0700;
+
+  auto ch = fs->store->open_collection(cid);
+
+  // Every object-level entry is looked up through the collection handle.
+  // If the collection could not be opened none of them can exist, and the
+  // store calls below must not be handed a null handle.
+  switch (t) {
+  case FN_OBJECT:
+  case FN_OBJECT_HASH:
+  case FN_OBJECT_DATA:
+  case FN_OBJECT_OMAP_HEADER:
+  case FN_OBJECT_OMAP:
+  case FN_OBJECT_OMAP_VAL:
+  case FN_OBJECT_ATTR:
+  case FN_OBJECT_ATTR_VAL:
+    if (!ch)
+      return -ENOENT;
+    break;
+  }
+
+  switch (t) {
+  case FN_OBJECT_OMAP:
+  case FN_OBJECT_ATTR:
+  case FN_OBJECT:
+  case FN_OBJECT_DATA:
+  case FN_OBJECT_OMAP_HEADER:
+  case FN_OBJECT_OMAP_VAL:
+    {
+      spg_t pgid;
+      if (cid.is_pg(&pgid)) {
+	int bits = fs->store->collection_bits(ch);
+	if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+	  // sorry, not part of this PG
+	  return -ENOENT;
+	}
+      }
+    }
+    break;
+  }
+
+  switch (t) {
+  case FN_OBJECT_OMAP:
+  case FN_OBJECT_ATTR:
+  case FN_OBJECT:
+    if (!fs->store->exists(ch, oid))
+      return -ENOENT;
+    // fall-thru
+  case FN_ALL:
+  case FN_HASH_DIR:
+  case FN_HASH_VAL:
+  case FN_COLLECTION:
+    if (!fs->store->collection_exists(cid))
+      return -ENOENT;
+    // fall-thru
+  case FN_ROOT:
+    stbuf->st_mode = S_IFDIR | 0700;
+    return 0;
+
+  case FN_TYPE:
+    // type string plus the trailing newline os_open() appends
+    stbuf->st_size = fs->store->get_type().length() + 1;
+    break;
+
+  case FN_OBJECT_HASH:
+    if (!fs->store->exists(ch, oid))
+      return -ENOENT;
+    // 8 hex digits plus newline
+    stbuf->st_size = 9;
+    return 0;
+
+  case FN_HASH_END:
+    if (!ch)
+      return -ENOENT;
+    if (fs->store->collection_bits(ch) < 0)
+      return -ENOENT;
+    // fall-thru
+  case FN_HASH_START:
+    // 8 hex digits plus newline
+    stbuf->st_size = 9;
+    return 0;
+
+  case FN_HASH_BITS:
+    {
+      if (!ch)
+	return -ENOENT;
+      int bits = fs->store->collection_bits(ch);
+      if (bits < 0)
+	return -ENOENT;
+      // size is the length of the decimal rendering plus newline
+      char buf[12];
+      snprintf(buf, sizeof(buf), "%d\n", bits);
+      stbuf->st_size = strlen(buf);
+    }
+    return 0;
+
+  case FN_OBJECT_DATA:
+    {
+      if (!fs->store->exists(ch, oid))
+	return -ENOENT;
+      int r = fs->store->stat(ch, oid, stbuf);
+      if (r < 0)
+	return r;
+    }
+    break;
+
+  case FN_OBJECT_OMAP_HEADER:
+    {
+      if (!fs->store->exists(ch, oid))
+	return -ENOENT;
+      bufferlist bl;
+      fs->store->omap_get_header(ch, oid, &bl);
+      stbuf->st_size = bl.length();
+    }
+    break;
+
+  case FN_OBJECT_OMAP_VAL:
+    {
+      if (!fs->store->exists(ch, oid))
+	return -ENOENT;
+      set<string> k;
+      k.insert(key);
+      map<string,bufferlist> v;
+      fs->store->omap_get_values(ch, oid, k, &v);
+      if (!v.count(key)) {
+	return -ENOENT;
+      }
+      stbuf->st_size = v[key].length();
+    }
+    break;
+
+  case FN_OBJECT_ATTR_VAL:
+    {
+      if (!fs->store->exists(ch, oid))
+	return -ENOENT;
+      bufferptr v;
+      int r = fs->store->getattr(ch, oid, key.c_str(), v);
+      // a missing attribute is reported as a missing file
+      if (r == -ENODATA)
+	r = -ENOENT;
+      if (r < 0)
+	return r;
+      stbuf->st_size = v.length();
+    }
+    break;
+
+  default:
+    return -ENOENT;
+  }
+
+  return 0;
+}
+
+/**
+ * FUSE readdir handler: feed the entries of the directory named by path to
+ * filler.
+ *
+ * For object listings (FN_ALL / FN_HASH_VAL) the directory offset encodes
+ * the bitwise hash of the last entry in the bits above hash_shift and the
+ * number of entries already returned for that hash in the bits below it.
+ * For omap and attr listings the offset is simply the number of entries
+ * already returned.
+ *
+ * @return 0 on success, -ENOENT if the path or its collection is unknown,
+ *         or a negative error from collection_list()
+ */
+static int os_readdir(const char *path,
+		      void *buf,
+		      fuse_fill_dir_t filler,
+		      off_t offset,
+		      struct fuse_file_info *fi
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+                      , enum fuse_readdir_flags
+#endif
+                      )
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+		     << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (t < 0)
+    return t;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  auto ch = fs->store->open_collection(cid);
+
+  // we can't shift 32 bits or else off_t will go negative
+  const int hash_shift = 31;
+
+  switch (t) {
+  case FN_ROOT:
+    {
+      filler_compat(filler, buf, "type", NULL, 0);
+      vector<coll_t> cls;
+      fs->store->list_collections(cls);
+      for (const auto& c : cls) {
+	int r = filler_compat(filler, buf, stringify(c).c_str(), NULL, 0);
+	if (r > 0)
+	  break;
+      }
+    }
+    break;
+
+  case FN_COLLECTION:
+    {
+      if (!ch) {
+	return -ENOENT;
+      }
+      filler_compat(filler, buf, "bitwise_hash_start", NULL, 0);
+      if (fs->store->collection_bits(ch) >= 0) {
+	filler_compat(filler, buf, "bitwise_hash_end", NULL, 0);
+	filler_compat(filler, buf, "bitwise_hash_bits", NULL, 0);
+      }
+      filler_compat(filler, buf, "all", NULL, 0);
+      filler_compat(filler, buf, "by_bitwise_hash", NULL, 0);
+      spg_t pgid;
+      if (cid.is_pg(&pgid) &&
+	  fs->store->exists(ch, pgid.make_pgmeta_oid())) {
+	filler_compat(filler, buf, "pgmeta", NULL, 0);
+      }
+    }
+    break;
+
+  case FN_OBJECT:
+    {
+      filler_compat(filler, buf, "bitwise_hash", NULL, 0);
+      filler_compat(filler, buf, "data", NULL, 0);
+      filler_compat(filler, buf, "omap", NULL, 0);
+      filler_compat(filler, buf, "attr", NULL, 0);
+      filler_compat(filler, buf, "omap_header", NULL, 0);
+    }
+    break;
+
+  case FN_HASH_VAL:
+  case FN_ALL:
+    {
+      // listing goes through the handle; no handle means no such directory
+      if (!ch) {
+	return -ENOENT;
+      }
+      // decode the resume position from the directory offset
+      uint32_t bitwise_hash = (offset >> hash_shift) & 0xffffffff;
+      uint32_t hashoff = offset - (bitwise_hash << hash_shift);
+      int skip = hashoff;
+      ghobject_t next = cid.get_min_hobj();
+      if (offset) {
+	// obey the offset
+	next.hobj.set_hash(hobject_t::_reverse_bits(bitwise_hash));
+      } else if (t == FN_HASH_VAL) {
+	next.hobj.set_hash(hobject_t::_reverse_bits(hash_value));
+      }
+      ghobject_t last;
+      if (t == FN_HASH_VAL) {
+	last = next;
+	// Bits below the prefix are all ones at the end of the range.  A
+	// 32-bit prefix leaves no such bits; shifting by 32 is undefined.
+	const uint32_t low_mask =
+	  hash_bits >= 32 ? 0 : (0xffffffffu >> hash_bits);
+	// Widen before adding 1 so a range reaching 0xffffffff yields
+	// 0x100000000 instead of wrapping to 0.
+	const uint64_t rev_end =
+	  static_cast<uint64_t>(hash_value | low_mask) + 1;
+	if (rev_end >= 0x100000000)
+	  last = ghobject_t::get_max();
+	else
+	  last.hobj.set_hash(
+	    hobject_t::_reverse_bits(static_cast<uint32_t>(rev_end)));
+      } else {
+	last = ghobject_t::get_max();
+      }
+      ldout(fs->store->cct, 10) << __func__ << std::hex
+			 << " offset " << offset << " hash "
+			 << hobject_t::_reverse_bits(hash_value)
+			 << std::dec
+			 << "/" << hash_bits
+			 << " first " << next << " last " << last
+			 << dendl;
+      while (true) {
+	vector<ghobject_t> ls;
+	int r = fs->store->collection_list(
+	  ch, next, last, 1000, &ls, &next);
+	if (r < 0)
+	  return r;
+	for (const auto& p : ls) {
+	  if (skip) {
+	    --skip;
+	    continue;
+	  }
+	  uint32_t cur_bitwise_hash = p.hobj.get_bitwise_key_u32();
+	  if (cur_bitwise_hash != bitwise_hash) {
+	    bitwise_hash = cur_bitwise_hash;
+	    hashoff = 0;
+	  }
+	  ++hashoff;
+	  uint64_t cur_off = ((uint64_t)bitwise_hash << hash_shift) |
+	    (uint64_t)hashoff;
+	  string s = stringify(p);
+	  r = filler_compat(filler, buf, s.c_str(), NULL, cur_off);
+	  if (r)
+	    break;
+	}
+	// filler buffer full
+	if (r)
+	  break;
+	if (next == ghobject_t::get_max() || next == last)
+	  break;
+      }
+    }
+    break;
+
+  case FN_OBJECT_OMAP:
+    {
+      if (!ch) {
+	return -ENOENT;
+      }
+      set<string> keys;
+      fs->store->omap_get_keys(ch, oid, &keys);
+      unsigned skip = offset;
+      for (const auto& k : keys) {
+	if (skip) {
+	  --skip;
+	  continue;
+	}
+	++offset;
+	int r = filler_compat(filler, buf, k.c_str(), NULL, offset);
+	if (r)
+	  break;
+      }
+    }
+    break;
+
+  case FN_OBJECT_ATTR:
+    {
+      if (!ch) {
+	return -ENOENT;
+      }
+      map<string,bufferptr,less<>> aset;
+      fs->store->getattrs(ch, oid, aset);
+      unsigned skip = offset;
+      for (const auto& a : aset) {
+	if (skip) {
+	  --skip;
+	  continue;
+	}
+	++offset;
+	int r = filler_compat(filler, buf, a.first.c_str(), NULL, offset);
+	if (r)
+	  break;
+      }
+    }
+    break;
+  }
+  return 0;
+}
+
+/**
+ * FUSE open handler.
+ *
+ * Synthetic files (type, bitwise_hash_*, object bitwise_hash) get a private
+ * OpenFile holding the rendered text.  Object data, attr values, omap
+ * values and the omap header go through FuseStore::open_file().
+ *
+ * The private OpenFile is created with ref == 1 so that the matching
+ * os_release() call frees it, and the rendered text lives in a local
+ * bufferlist rather than a heap allocation that was never deleted.
+ *
+ * @return 0 on success, or a negative error from parse_fn() or the store
+ */
+static int os_open(const char *path, struct fuse_file_info *fi)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int t = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (t < 0)
+    return t;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  auto ch = fs->store->open_collection(cid);
+
+  // rendered contents for synthetic files; only used when have_content
+  bufferlist content;
+  bool have_content = false;
+  switch (t) {
+  case FN_TYPE:
+    have_content = true;
+    content.append(fs->store->get_type());
+    content.append("\n");
+    break;
+
+  case FN_HASH_START:
+    {
+      have_content = true;
+      spg_t pgid;
+      if (cid.is_pg(&pgid)) {
+	unsigned long h;
+	h = hobject_t::_reverse_bits(pgid.ps());
+	char buf[10];
+	snprintf(buf, sizeof(buf), "%08lx\n", h);
+	content.append(buf);
+      } else {
+	content.append("00000000\n");
+      }
+    }
+    break;
+
+  case FN_HASH_END:
+    {
+      if (!ch) {
+	return -ENOENT;
+      }
+      spg_t pgid;
+      unsigned long h;
+      if (cid.is_pg(&pgid)) {
+	int bits = fs->store->collection_bits(ch);
+	if (bits >= 0) {
+	  // fill every bit below the significant ones
+	  uint64_t rev_start = hobject_t::_reverse_bits(pgid.ps());
+	  uint64_t rev_end = (rev_start | (0xffffffff >> bits));
+	  h = rev_end;
+	} else {
+	  return -ENOENT;
+	}
+      } else {
+	h = 0xffffffff;
+      }
+      char buf[10];
+      snprintf(buf, sizeof(buf), "%08lx\n", h);
+      have_content = true;
+      content.append(buf);
+    }
+    break;
+
+  case FN_HASH_BITS:
+    {
+      if (!ch) {
+	return -ENOENT;
+      }
+      int r = fs->store->collection_bits(ch);
+      if (r < 0)
+        return r;
+      char buf[12];
+      snprintf(buf, sizeof(buf), "%d\n", r);
+      have_content = true;
+      content.append(buf);
+    }
+    break;
+
+  case FN_OBJECT_HASH:
+    {
+      have_content = true;
+      char buf[10];
+      snprintf(buf, sizeof(buf), "%08x\n",
+	       (unsigned)oid.hobj.get_bitwise_key_u32());
+      content.append(buf);
+    }
+    break;
+
+  case FN_OBJECT_DATA:
+    {
+      int r = fs->open_file(
+	path, fi,
+	[&](bufferlist *out) {
+	  return fs->store->read(ch, oid, 0, 0, *out);
+	});
+      if (r < 0) {
+        return r;
+      }
+    }
+    break;
+
+  case FN_OBJECT_ATTR_VAL:
+    {
+      int r = fs->open_file(
+	path, fi,
+	[&](bufferlist *out) {
+	  bufferptr bp;
+	  int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+	  if (r < 0)
+	    return r;
+	  out->append(bp);
+	  return 0;
+	});
+      if (r < 0)
+        return r;
+    }
+    break;
+
+  case FN_OBJECT_OMAP_VAL:
+    {
+      int r = fs->open_file(
+	path, fi,
+	[&](bufferlist *out) {
+	  set<string> k;
+	  k.insert(key);
+	  map<string,bufferlist> v;
+	  int r = fs->store->omap_get_values(ch, oid, k, &v);
+	  if (r < 0)
+	    return r;
+	  *out = v[key];
+	  return 0;
+	});
+      if (r < 0)
+	return r;
+    }
+    break;
+
+  case FN_OBJECT_OMAP_HEADER:
+    {
+      int r = fs->open_file(
+	path, fi,
+	[&](bufferlist *out) {
+	  return fs->store->omap_get_header(ch, oid, out);
+	});
+      if (r < 0)
+       return r;
+    }
+    break;
+  }
+
+  if (have_content) {
+    // Private handle, not registered in open_files.  It is owned by this
+    // one open, so it starts with a single reference.
+    FuseStore::OpenFile *o = new FuseStore::OpenFile;
+    o->bl = std::move(content);
+    o->ref = 1;
+    fi->fh = reinterpret_cast<uint64_t>(o);
+  }
+  return 0;
+}
+
+/**
+ * FUSE mkdir handler.
+ *
+ * mkdir on an object path touches the object; mkdir on a collection path
+ * creates the collection.  Any other path kind is refused with -EPERM.
+ *
+ * @param mode for PG collections the low 9 bits give the collection's bit
+ *             count (must be < 32); ignored otherwise
+ * @return 0 on success, negative errno otherwise
+ */
+static int os_mkdir(const char *path, mode_t mode)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (f < 0)
+    return f;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  ObjectStore::CollectionHandle ch;
+
+  ObjectStore::Transaction t;
+  switch (f) {
+  case FN_OBJECT:
+    {
+      ch = fs->store->open_collection(cid);
+      if (!ch) {
+	return -ENOENT;
+      }
+      spg_t pgid;
+      if (cid.is_pg(&pgid)) {
+	int bits = fs->store->collection_bits(ch);
+	if (bits >= 0 && !oid.match(bits, pgid.ps())) {
+	  // sorry, not part of this PG
+	  return -EINVAL;
+	}
+      }
+      // ch is already open and validated above; no need to reopen it
+      t.touch(cid, oid);
+    }
+    break;
+
+  case FN_COLLECTION:
+    if (cid.is_pg()) {
+      // use the mode for the bit count.  e.g., mkdir --mode=0003
+      // mnt/0.7_head will create 0.7 with bits = 3.
+      mode &= 0777;
+      if (mode >= 32)
+	return -EINVAL;
+    } else {
+      mode = 0;
+    }
+    t.create_collection(cid, mode);
+    ch = fs->store->create_new_collection(cid);
+    break;
+
+  default:
+    return -EPERM;
+  }
+
+  if (!t.empty()) {
+    fs->store->queue_transaction(ch, std::move(t));
+  }
+
+  return 0;
+}
+
+/// FUSE chmod handler: logs the request and reports success without
+/// changing anything.
+static int os_chmod(const char *path, mode_t mode
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+                    , struct fuse_file_info *fi
+#endif
+                    )
+{
+  FuseStore *fs =
+    static_cast<FuseStore*>(fuse_get_context()->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  return 0;
+}
+
+/**
+ * FUSE create handler for object data, attr values and omap values.
+ *
+ * Loads the current contents (if any) into a private, dirty OpenFile.  A
+ * missing attr or omap key is created empty through a transaction.
+ *
+ * The private OpenFile is created with ref == 1 so that the matching
+ * os_release() call frees it, and the contents are staged in a local
+ * bufferlist rather than a heap allocation that was never deleted.
+ *
+ * @return 0, or a negative error from parse_fn()
+ */
+static int os_create(const char *path, mode_t mode, struct fuse_file_info *fi)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (f < 0)
+    return f;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+  ObjectStore::Transaction t;
+  // staged contents; only used when have_content
+  bufferlist content;
+  bool have_content = false;
+  switch (f) {
+  case FN_OBJECT_DATA:
+    {
+      have_content = true;
+      fs->store->read(ch, oid, 0, 0, content);
+    }
+    break;
+
+  case FN_OBJECT_ATTR_VAL:
+    {
+      have_content = true;
+      bufferptr bp;
+      int r = fs->store->getattr(ch, oid, key.c_str(), bp);
+      if (r == -ENODATA) {
+	// attribute does not exist yet: create it empty
+	bufferlist empty;
+	t.setattr(cid, oid, key.c_str(), empty);
+      }
+      content.append(bp);
+    }
+    break;
+
+  case FN_OBJECT_OMAP_VAL:
+    {
+      have_content = true;
+      set<string> k;
+      k.insert(key);
+      map<string,bufferlist> v;
+      fs->store->omap_get_values(ch, oid, k, &v);
+      if (v.count(key) == 0) {
+	// key does not exist yet: create it empty
+	map<string,bufferlist> aset;
+	aset[key] = bufferlist();
+	t.omap_setkeys(cid, oid, aset);
+      } else {
+	content = v[key];
+      }
+    }
+    break;
+  }
+
+  if (!t.empty()) {
+    fs->store->queue_transaction(ch, std::move(t));
+  }
+
+  if (have_content) {
+    // Private handle, not registered in open_files.  It is owned by this
+    // one open, so it starts with a single reference.
+    FuseStore::OpenFile *o = new FuseStore::OpenFile;
+    o->bl = std::move(content);
+    o->dirty = true;
+    o->ref = 1;
+    fi->fh = reinterpret_cast<uint64_t>(o);
+  }
+  return 0;
+}
+
+/**
+ * FUSE release handler: drop one reference on the OpenFile stored in
+ * fi->fh and free it once the count reaches zero.
+ *
+ * os_open() and os_create() return success without setting fi->fh for
+ * path kinds they do not handle, so fh may be 0 here; like os_read(),
+ * os_write() and os_flush(), treat that as nothing to do.
+ *
+ * @return always 0
+ */
+static int os_release(const char *path, struct fuse_file_info *fi)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  std::lock_guard<std::mutex> l(fs->lock);
+  FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+  if (!o)
+    return 0;
+  if (--o->ref == 0) {
+    ldout(fs->store->cct, 10) << __func__ << " closing last " << o->path << dendl;
+    fs->open_files.erase(o->path);
+    delete o;
+  }
+  return 0;
+}
+
+/**
+ * FUSE read handler: copy up to size bytes starting at offset from the
+ * OpenFile buffer in fi->fh into buf.
+ *
+ * @return number of bytes copied; 0 when there is no OpenFile or offset is
+ *         at or past the end of the buffer
+ */
+static int os_read(const char *path, char *buf, size_t size, off_t offset,
+		   struct fuse_file_info *fi)
+{
+  FuseStore *fs =
+    static_cast<FuseStore*>(fuse_get_context()->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+		     << " size " << size << dendl;
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  FuseStore::OpenFile *file = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+  if (!file) {
+    return 0;
+  }
+  const auto total = file->bl.length();
+  if (offset >= total) {
+    return 0;
+  }
+  // clamp the request to what is actually available
+  if (offset + size > total) {
+    size = total - offset;
+  }
+
+  bufferlist chunk;
+  chunk.substr_of(file->bl, offset, size);
+  memcpy(buf, chunk.c_str(), chunk.length());
+  return chunk.length();
+}
+
+/**
+ * FUSE write handler: splice size bytes from buf into the OpenFile buffer
+ * in fi->fh at offset and mark the file dirty.
+ *
+ * The new buffer is assembled as
+ *   [old bytes before offset][zero fill if offset is past EOF][buf][old tail]
+ * where the old tail is whatever followed offset + size in the old buffer.
+ *
+ * @return size, or 0 when there is no OpenFile
+ */
+static int os_write(const char *path, const char *buf, size_t size,
+		    off_t offset, struct fuse_file_info *fi)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << " offset " << offset
+		     << " size " << size << dendl;
+  std::lock_guard<std::mutex> l(fs->lock);
+  FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+  if (!o)
+    return 0;
+
+  // length before any modification; o->bl may be emptied below
+  const size_t old_len = o->bl.length();
+  const size_t off = offset;
+
+  bufferlist final;
+  if (off > old_len) {
+    // Writing past EOF: keep everything and zero-fill the gap.  Nothing of
+    // the old buffer follows the write, so it can be claimed wholesale.
+    final.claim_append(o->bl);
+    final.append_zero(off - old_len);
+  } else if (off) {
+    // Writing inside (or right at the end of) the buffer: keep the head.
+    final.substr_of(o->bl, 0, off);
+  }
+  final.append(buf, size);
+  if (off + size < old_len) {
+    // preserve the part of the old buffer that follows the written range
+    bufferlist rest;
+    rest.substr_of(o->bl, off + size, old_len - off - size);
+    final.claim_append(rest);
+  }
+  o->bl = final;
+  o->dirty = true;
+  return size;
+}
+
+/**
+ * FUSE flush handler: if the OpenFile in fi->fh is dirty, queue a
+ * transaction storing its buffer as the object data, attr value, omap
+ * value or omap header named by path.
+ *
+ * Once the buffer has been queued the dirty flag is cleared, so later
+ * flushes of an unchanged buffer do not rewrite it.
+ *
+ * @return 0, or a negative error from parse_fn()
+ */
+int os_flush(const char *path, struct fuse_file_info *fi)
+{
+  fuse_context *fc = fuse_get_context();
+  FuseStore *fs = static_cast<FuseStore*>(fc->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  int f = parse_fn(fs->store->cct, path, &cid, &oid, &key, &hash_value,
+		   &hash_bits);
+  if (f < 0)
+    return f;
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  FuseStore::OpenFile *o = reinterpret_cast<FuseStore::OpenFile*>(fi->fh);
+  if (!o)
+    return 0;
+  if (!o->dirty)
+    return 0;
+
+  ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+
+  ObjectStore::Transaction t;
+
+  switch (f) {
+  case FN_OBJECT_DATA:
+    t.write(cid, oid, 0, o->bl.length(), o->bl);
+    break;
+
+  case FN_OBJECT_ATTR_VAL:
+    t.setattr(cid, oid, key.c_str(), o->bl);
+    break;
+
+  case FN_OBJECT_OMAP_VAL:
+    {
+      map<string,bufferlist> aset;
+      aset[key] = o->bl;
+      t.omap_setkeys(cid, oid, aset);
+      break;
+    }
+
+  case FN_OBJECT_OMAP_HEADER:
+    t.omap_setheader(cid, oid, o->bl);
+    break;
+
+  default:
+    // nothing to write back for other path kinds
+    return 0;
+  }
+
+  fs->store->queue_transaction(ch, std::move(t));
+  // the buffer has been handed to the store; os_write() sets this again
+  o->dirty = false;
+
+  return 0;
+}
+
+/**
+ * FUSE unlink/rmdir handler.
+ *
+ * Depending on the path kind this removes an omap key, an attr, an object
+ * or an (empty) collection, resets the omap header to empty, or truncates
+ * the object data to zero.  Other path kinds are refused with -EPERM.
+ *
+ * @return 0 on success, -ENOTEMPTY for a non-empty collection, -EPERM for
+ *         unsupported paths, or a negative error from parse_fn() or
+ *         collection_empty()
+ */
+static int os_unlink(const char *path)
+{
+  FuseStore *fs =
+    static_cast<FuseStore*>(fuse_get_context()->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  const int kind = parse_fn(fs->store->cct, path, &cid, &oid, &key,
+			    &hash_value, &hash_bits);
+  if (kind < 0) {
+    return kind;
+  }
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+  ObjectStore::Transaction txn;
+
+  switch (kind) {
+  case FN_COLLECTION:
+    {
+      // only empty collections may be removed
+      bool is_empty;
+      const int r = fs->store->collection_empty(ch, &is_empty);
+      if (r < 0) {
+	return r;
+      }
+      if (!is_empty) {
+	return -ENOTEMPTY;
+      }
+      txn.remove_collection(cid);
+    }
+    break;
+
+  case FN_OBJECT:
+    txn.remove(cid, oid);
+    break;
+
+  case FN_OBJECT_DATA:
+    txn.truncate(cid, oid, 0);
+    break;
+
+  case FN_OBJECT_OMAP_HEADER:
+    {
+      // the header cannot be removed, only emptied
+      bufferlist blank;
+      txn.omap_setheader(cid, oid, blank);
+    }
+    break;
+
+  case FN_OBJECT_OMAP_VAL:
+    txn.omap_rmkey(cid, oid, key);
+    break;
+
+  case FN_OBJECT_ATTR_VAL:
+    txn.rmattr(cid, oid, key.c_str());
+    break;
+
+  default:
+    return -EPERM;
+  }
+
+  fs->store->queue_transaction(ch, std::move(txn));
+  return 0;
+}
+
+/**
+ * FUSE truncate handler.
+ *
+ * Attr values, omap values and the omap header only accept truncation to
+ * zero, which is a no-op here.  Object data is truncated both in any
+ * cached OpenFile buffer (shrink only) and in the store.
+ *
+ * @return 0 on success, -EPERM for unsupported paths or sizes, or a
+ *         negative error from parse_fn()
+ */
+static int os_truncate(const char *path, off_t size
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+                       , struct fuse_file_info *fi
+#endif
+                       )
+{
+  FuseStore *fs =
+    static_cast<FuseStore*>(fuse_get_context()->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << " size " << size << dendl;
+
+  coll_t cid;
+  ghobject_t oid;
+  string key;
+  uint32_t hash_value, hash_bits;
+  const int kind = parse_fn(fs->store->cct, path, &cid, &oid, &key,
+			    &hash_value, &hash_bits);
+  if (kind < 0) {
+    return kind;
+  }
+
+  switch (kind) {
+  case FN_OBJECT_OMAP_VAL:
+  case FN_OBJECT_ATTR_VAL:
+  case FN_OBJECT_OMAP_HEADER:
+    // these can only be "truncated" to nothing
+    return size ? -EPERM : 0;
+  case FN_OBJECT_DATA:
+    break;
+  default:
+    return -EPERM;
+  }
+
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  // shrink the cached copy, if this file is currently open
+  auto cached = fs->open_files.find(path);
+  if (cached != fs->open_files.end()) {
+    FuseStore::OpenFile *file = cached->second;
+    if (file->bl.length() > size) {
+      bufferlist head;
+      head.substr_of(file->bl, 0, size);
+      file->bl.swap(head);
+    }
+  }
+
+  ObjectStore::CollectionHandle ch = fs->store->open_collection(cid);
+  ObjectStore::Transaction txn;
+  txn.truncate(cid, oid, size);
+  fs->store->queue_transaction(ch, std::move(txn));
+  return 0;
+}
+
+/**
+ * FUSE statfs handler: report the store's total and available space in
+ * units of a nominal 4096-byte block.
+ *
+ * @return 0 on success, or the negative error from ObjectStore::statfs()
+ */
+static int os_statfs(const char *path, struct statvfs *stbuf)
+{
+  FuseStore *fs =
+    static_cast<FuseStore*>(fuse_get_context()->private_data);
+  ldout(fs->store->cct, 10) << __func__ << " " << path << dendl;
+  std::lock_guard<std::mutex> l(fs->lock);
+
+  struct store_statfs_t st;
+  const int r = fs->store->statfs(&st);
+  if (r < 0) {
+    return r;
+  }
+
+  // nominal block size; not the store's real allocation unit
+  constexpr int kBlockSize = 4096;
+  stbuf->f_bsize = kBlockSize;
+  stbuf->f_blocks = st.total / kBlockSize;
+  stbuf->f_bavail = st.available / kBlockSize;
+  stbuf->f_bfree = stbuf->f_bavail;
+
+  ldout(fs->store->cct, 10) << __func__ << " " << path << ": " 
+    << stbuf->f_bavail << "/" << stbuf->f_blocks << dendl;
+  return 0;
+}
+
+// FUSE operation table handed to fuse_main()/fuse_new().
+//
+// Entries set to 0 are operations this filesystem does not implement.
+// rmdir shares os_unlink with unlink.  getdir and utime are only
+// initialised when building against FUSE < 3.0.
+static struct fuse_operations fs_oper = {
+  getattr: os_getattr,
+  readlink: 0,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+  getdir: 0,
+#endif
+  mknod: 0,
+  mkdir: os_mkdir,
+  unlink: os_unlink,
+  rmdir: os_unlink,
+  symlink: 0,
+  rename: 0,
+  link: 0,
+  chmod: os_chmod,
+  chown: 0,
+  truncate: os_truncate,
+#if FUSE_VERSION < FUSE_MAKE_VERSION(3, 0)
+  utime: 0,
+#endif
+  open: os_open,
+  read: os_read,
+  write: os_write,
+  statfs: os_statfs,
+  flush: os_flush,
+  release: os_release,
+  fsync: 0,
+  setxattr: 0,
+  getxattr: 0,
+  listxattr: 0,
+  removexattr: 0,
+  opendir: 0,
+  readdir: os_readdir,
+  releasedir: 0,
+  fsyncdir: 0,
+  init: 0,
+  destroy: 0,
+  access: 0,
+  create: os_create,
+};
+
+/**
+ * Run the filesystem through fuse_main() with this FuseStore as the
+ * private data.
+ *
+ * @return whatever fuse_main() returns
+ */
+int FuseStore::main()
+{
+  // The trailing "-d" is only counted when fuse_debug is set.
+  const char *argv[] = {
+    "foo",
+    mount_point.c_str(),
+    "-f",
+    "-d", // debug
+  };
+  const bool want_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+  const int argc = want_debug ? 4 : 3;
+  return fuse_main(argc, const_cast<char**>(argv), &fs_oper,
+		   static_cast<void*>(this));
+}
+
+/**
+ * Parse the built-in argument vector, create and mount the fuse
+ * filesystem, and start the thread that runs FuseStore::loop().
+ *
+ * If mounting fails after the fuse handle was created (FUSE >= 3.0), the
+ * handle is destroyed again, matching the cleanup the FUSE < 3.0 path does
+ * when fuse_new() fails after a successful mount.
+ *
+ * @return 0 on success, -EINVAL if the arguments do not parse, -EIO if
+ *         fuse_new() or fuse_mount() fails
+ */
+int FuseStore::start()
+{
+  dout(10) << __func__ << dendl;
+
+  memset(&info->args, 0, sizeof(info->args));
+  const char *v[] = {
+    "foo",
+    mount_point.c_str(),
+    "-f", // foreground
+    "-d", // debug
+  };
+  // "-d" is only counted when fuse_debug is set
+  int c = 3;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+  int rc;
+  struct fuse_cmdline_opts opts = {};
+#endif
+  auto fuse_debug = store->cct->_conf.get_val<bool>("fuse_debug");
+  if (fuse_debug)
+    ++c;
+  fuse_args a = FUSE_ARGS_INIT(c, (char**)v);
+  info->args = a;
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+  if (fuse_parse_cmdline(&info->args, &opts) == -1) {
+#else
+  if (fuse_parse_cmdline(&info->args, &info->mountpoint, NULL, NULL) == -1) {
+#endif
+    derr << __func__ << " failed to parse args" << dendl;
+    return -EINVAL;
+  }
+
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+  info->mountpoint = opts.mountpoint;
+  info->f = fuse_new(&info->args, &fs_oper, sizeof(fs_oper), (void*)this);
+  if (!info->f) {
+    derr << __func__ << " fuse_new failed" << dendl;
+    return -EIO;
+  }
+
+  rc = fuse_mount(info->f, info->mountpoint);
+  if (rc != 0) {
+    derr << __func__ << " fuse_mount failed" << dendl;
+    // do not leak the handle created just above
+    fuse_destroy(info->f);
+    info->f = nullptr;
+    return -EIO;
+  }
+#else
+  info->ch = fuse_mount(info->mountpoint, &info->args);
+  if (!info->ch) {
+    derr << __func__ << " fuse_mount failed" << dendl;
+    return -EIO;
+  }
+
+  info->f = fuse_new(info->ch, &info->args, &fs_oper, sizeof(fs_oper),
+		     (void*)this);
+  if (!info->f) {
+    fuse_unmount(info->mountpoint, info->ch);
+    derr << __func__ << " fuse_new failed" << dendl;
+    return -EIO;
+  }
+#endif
+
+  fuse_thread.create("fusestore");
+  dout(10) << __func__ << " done" << dendl;
+  return 0;
+}
+
+/**
+ * Run fuse_loop() on the fuse handle until it returns.
+ *
+ * @return the value returned by fuse_loop(); non-zero values are logged
+ */
+int FuseStore::loop()
+{
+  dout(10) << __func__ << " enter" << dendl;
+  const int rc = fuse_loop(info->f);
+  if (rc != 0) {
+    derr << __func__ << " got " << cpp_strerror(rc) << dendl;
+  }
+  dout(10) << __func__ << " exit" << dendl;
+  return rc;
+}
+
+/**
+ * Unmount the filesystem, wait for the fuse thread to finish and destroy
+ * the fuse handle.
+ *
+ * @return always 0
+ */
+int FuseStore::stop()
+{
+  dout(10) << __func__ << " enter" << dendl;
+  // Unmount before joining.
+  // NOTE(review): presumably the unmount is what makes fuse_loop() in the
+  // fuse thread return -- confirm against the libfuse documentation.
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0)
+  fuse_unmount(info->f);
+#else
+  fuse_unmount(info->mountpoint, info->ch);
+#endif
+  fuse_thread.join();
+  // the handle is only destroyed once the thread using it has exited
+  fuse_destroy(info->f);
+  dout(10) << __func__ << " exit" << dendl;
+  return 0;
+}
diff --git a/src/os/FuseStore.h b/src/os/FuseStore.h
new file mode 100644
index 000000000..a3000d89d
--- /dev/null
+++ b/src/os/FuseStore.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_FUSESTORE_H
+#define CEPH_OS_FUSESTORE_H
+
+#include <string>
+#include <map>
+#include <mutex>
+#include <functional>
+
+#include "common/Thread.h"
+#include "include/buffer.h"
+
+class ObjectStore;
+
+/**
+ * Exposes an ObjectStore through a FUSE mount.
+ *
+ * The FUSE operation handlers reach this object through the fuse
+ * context's private data and take `lock` while working on the store and
+ * on `open_files`.
+ */
+class FuseStore {
+public:
+  ObjectStore *store;        ///< backing store (not deleted by this class)
+  std::string mount_point;   ///< directory passed to fuse as the mount point
+  struct fs_info *info;      ///< fuse state; allocated in ctor, freed in dtor
+  std::mutex lock;           ///< taken by the FUSE operation handlers
+
+  /// In-memory copy of one open file.
+  struct OpenFile {
+    std::string path;        ///< key in open_files (empty for private files)
+    ceph::buffer::list bl;   ///< current file contents
+    bool dirty = false;      ///< set when bl was modified and needs flushing
+    int ref = 0;             ///< reference count; freed when it drops to 0
+  };
+  /// Files opened through open_file(), keyed by path.
+  std::map<std::string,OpenFile*> open_files;
+
+  /// Attach the OpenFile for p to fi->fh, loading its contents with f if
+  /// it is not open yet.  Returns 0 or the negative error returned by f.
+  int open_file(std::string p, struct fuse_file_info *fi,
+		std::function<int(ceph::buffer::list *bl)> f);
+
+  /// Thread whose entry point runs FuseStore::loop().
+  class FuseThread : public Thread {
+    FuseStore *fs;
+  public:
+    explicit FuseThread(FuseStore *f) : fs(f) {}
+    void *entry() override {
+      fs->loop();
+      return NULL;
+    }
+  } fuse_thread;
+
+  FuseStore(ObjectStore *s, std::string p);
+  ~FuseStore();
+
+  int main();    ///< run via fuse_main()
+  int start();   ///< create and mount the filesystem, start fuse_thread
+  int loop();    ///< run fuse_loop(); executed by fuse_thread
+  int stop();    ///< unmount, join fuse_thread and destroy the fuse handle
+};
+
+#endif
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
new file mode 100644
index 000000000..517d0ca98
--- /dev/null
+++ b/src/os/ObjectMap.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef OS_KEYVALUESTORE_H
+#define OS_KEYVALUESTORE_H
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "kv/KeyValueDB.h"
+#include "common/hobject.h"
+
+class SequencerPosition;
+
+/**
+ * Encapsulates the FileStore key value store
+ * (abstract per-object interface: map keys/values, a header and xattrs)
+ * Implementations of this interface will be used to implement TMAP
+ */
+class ObjectMap {
+public:
+  CephContext* cct;
+  boost::scoped_ptr<KeyValueDB> db;  ///< owns the KeyValueDB handed to the constructor
+  /// Set keys and values from specified map
+  virtual int set_keys(
+    const ghobject_t &oid,              ///< [in] object containing map
+    const std::map<std::string, ceph::buffer::list> &set,  ///< [in] key to value map to set
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// Set header
+  virtual int set_header(
+    const ghobject_t &oid,              ///< [in] object containing map
+    const ceph::buffer::list &bl,               ///< [in] header to set
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// Retrieve header
+  virtual int get_header(
+    const ghobject_t &oid,              ///< [in] object containing map
+    ceph::buffer::list *bl                      ///< [out] retrieved header
+    ) = 0;
+
+  /// Clear all map keys and values from oid
+  virtual int clear(
+    const ghobject_t &oid,             ///< [in] object containing map
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// Clear all map keys and values in to_clear from oid
+  virtual int rm_keys(
+    const ghobject_t &oid,              ///< [in] object containing map
+    const std::set<std::string> &to_clear,        ///< [in] Keys to clear
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// Clear all omap keys and the header
+  virtual int clear_keys_header(
+    const ghobject_t &oid,              ///< [in] oid to clear
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// Get all keys and values
+  virtual int get(
+    const ghobject_t &oid,             ///< [in] object containing map
+    ceph::buffer::list *header,                ///< [out] Returned Header
+    std::map<std::string, ceph::buffer::list> *out       ///< [out] Returned keys and values
+    ) = 0;
+
+  /// Get keys defined on oid
+  virtual int get_keys(
+    const ghobject_t &oid,             ///< [in] object containing map
+    std::set<std::string> *keys                  ///< [out] Keys defined on oid
+    ) = 0;
+
+  /// Get values for supplied keys
+  virtual int get_values(
+    const ghobject_t &oid,             ///< [in] object containing map
+    const std::set<std::string> &keys,           ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out       ///< [out] Returned keys and values
+    ) = 0;
+
+  /// Check key existence
+  virtual int check_keys(
+    const ghobject_t &oid,             ///< [in] object containing map
+    const std::set<std::string> &keys,           ///< [in] Keys to check
+    std::set<std::string> *out                   ///< [out] Subset of keys defined on oid
+    ) = 0;
+
+  /// Get xattrs
+  virtual int get_xattrs(
+    const ghobject_t &oid,             ///< [in] object
+    const std::set<std::string> &to_get,         ///< [in] keys to get
+    std::map<std::string, ceph::buffer::list> *out       ///< [out] subset of attrs/vals defined
+    ) = 0;
+
+  /// Get all xattrs
+  virtual int get_all_xattrs(
+    const ghobject_t &oid,             ///< [in] object
+    std::set<std::string> *out                   ///< [out] attr names (a set of strings, no values)
+    ) = 0;
+
+  /// Set xattrs in to_set
+  virtual int set_xattrs(
+    const ghobject_t &oid,                ///< [in] object
+    const std::map<std::string, ceph::buffer::list> &to_set,///< [in] attrs/values to set
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+  /// remove xattrs in to_remove
+  virtual int remove_xattrs(
+    const ghobject_t &oid,               ///< [in] object
+    const std::set<std::string> &to_remove,        ///< [in] attrs to remove
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) = 0;
+
+
+  /// Clone keys from oid map to target map (default: does nothing, returns 0)
+  virtual int clone(
+    const ghobject_t &oid,             ///< [in] object containing map
+    const ghobject_t &target,           ///< [in] target of clone
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) { return 0; }
+
+  /// Rename map because of name change (default: does nothing, returns 0)
+  virtual int rename(
+    const ghobject_t &from,             ///< [in] object containing map
+    const ghobject_t &to,               ///< [in] new name
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) { return 0; }
+
+  /// For testing clone keys from oid map to target map using faster but more complex method
+  virtual int legacy_clone(
+    const ghobject_t &oid,             ///< [in] object containing map
+    const ghobject_t &target,           ///< [in] target of clone
+    const SequencerPosition *spos=0     ///< [in] sequencer position
+    ) { return 0; }
+
+  /// Ensure all previous writes are durable (default: does nothing, returns 0)
+  virtual int sync(
+    const ghobject_t *oid=0,          ///< [in] object
+    const SequencerPosition *spos=0   ///< [in] Sequencer
+    ) { return 0; }
+
+  virtual int check(std::ostream &out, bool repair = false, bool force = false) { return 0; }  // default: no-op
+
+  virtual void compact() {}  // default: no-op
+
+  typedef KeyValueDB::SimplestIteratorImpl ObjectMapIteratorImpl;
+  typedef std::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
+  virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
+    return ObjectMapIterator();  // default: an empty (null) shared_ptr
+  }
+
+  virtual KeyValueDB *get_db() { return nullptr; }  // default returns nullptr, not the `db` member
+
+  ObjectMap(CephContext* cct, KeyValueDB *db) : cct(cct), db(db) {}
+  virtual ~ObjectMap() {}
+};
+
+#endif
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
new file mode 100644
index 000000000..d40593891
--- /dev/null
+++ b/src/os/ObjectStore.cc
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+#include <ctype.h>
+#include <sstream>
+#include "ObjectStore.h"
+#include "common/Formatter.h"
+#include "common/safe_io.h"
+
+#include "memstore/MemStore.h"
+#if defined(WITH_BLUESTORE)
+#include "bluestore/BlueStore.h"
+#endif
+#ifndef WITH_SEASTAR
+#include "kstore/KStore.h"
+#endif
+
+using std::string;
+
+std::unique_ptr<ObjectStore> ObjectStore::create(
+  CephContext *cct,
+  const string& type,
+  const string& data)
+{
+#if defined(WITH_BLUESTORE)
+  // "random" is answered with a BlueStore instance as well
+  if (type == "bluestore" || type == "random")
+    return std::make_unique<BlueStore>(cct, data);
+#endif
+  if (type == "memstore")
+    return std::make_unique<MemStore>(cct, data);
+  // unrecognised type, or a backend that was compiled out
+  return nullptr;
+}
+
+#ifndef WITH_SEASTAR
+std::unique_ptr<ObjectStore> ObjectStore::create(
+  CephContext *cct,
+  const string& type,
+  const string& data,
+  const string& journal,
+  osflagbits_t flags)
+{
+  if (type == "filestore") {
+    lgeneric_derr(cct) << __func__ << ": FileStore has been deprecated and is no longer supported" << dendl;
+    return nullptr;
+  }
+  const bool is_kstore = (type == "kstore");
+  if (is_kstore && cct->check_experimental_feature_enabled("kstore"))
+    return std::make_unique<KStore>(cct, data);
+  // anything else goes to the 3-argument factory; journal and flags are not used
+  return create(cct, type, data);
+}
+#endif
+
+int ObjectStore::probe_block_device_fsid(
+  CephContext *cct,
+  const string& path,
+  uuid_d *fsid)
+{
+  // Returns 0 with *fsid filled in when a backend claims the device.
+#if defined(WITH_BLUESTORE)
+  // first try bluestore -- it has a crc on its header and will fail
+  // reliably.
+  const int rc = BlueStore::get_block_device_fsid(cct, path, fsid);
+  if (rc == 0) {
+    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
+			  << *fsid << dendl;
+    return rc;
+  }
+#endif
+
+  // nothing recognised the device
+  return -EINVAL;
+}
+
+int ObjectStore::write_meta(const std::string& key,
+			    const std::string& value)
+{
+  // The value is written with a single newline appended; path and key
+  // are handed to safe_write_file() as its first two arguments.
+  const string contents = value + "\n";
+  const int rc = safe_write_file(path.c_str(), key.c_str(),
+				 contents.c_str(), contents.length(), 0600);
+  // failures pass through unchanged; any other result is reported as 0
+  return rc < 0 ? rc : 0;
+}
+
+int ObjectStore::read_meta(const std::string& key,
+			   std::string *value)
+{
+  char buf[4096];  // at most sizeof(buf) bytes are requested from the file
+  int r = safe_read_file(path.c_str(), key.c_str(),
+			 buf, sizeof(buf));
+  if (r <= 0)
+    return r;  // error (negative) or nothing read (0): *value is left untouched
+  // drop trailing whitespace; isspace() must be given an unsigned char
+  // value -- passing a negative plain char is undefined behaviour
+  while (r && isspace(static_cast<unsigned char>(buf[r-1])))
+    --r;
+  *value = string(buf, r);
+  return 0;
+}
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
new file mode 100644
index 000000000..4c837b84d
--- /dev/null
+++ b/src/os/ObjectStore.h
@@ -0,0 +1,786 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_OBJECTSTORE_H
+#define CEPH_OBJECTSTORE_H
+
+#include "include/buffer.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/stringify.h"
+#include "include/types.h"
+
+#include "osd/osd_types.h"
+#include "common/TrackedOp.h"
+#include "common/WorkQueue.h"
+#include "ObjectMap.h"
+#include "os/Transaction.h"
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <map>
+#include <memory>
+#include <vector>
+
+#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) || defined(_WIN32)
+#include <sys/statvfs.h>
+#else
+#include <sys/vfs.h>    /* or <sys/statfs.h> */
+#endif
+
+namespace ceph {
+  class Formatter;
+}
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+class Logger;
+class ContextQueue;
+
+static inline void encode(const std::map<std::string,ceph::buffer::ptr> *attrset, ceph::buffer::list &bl) {
+  using ceph::encode;  // make the ceph:: overloads visible to the unqualified call below
+  encode(*attrset, bl);  // attrset is dereferenced unconditionally, so it must not be null
+}
+
+// Flag bits for osflagbits_t (the type of the `flags` argument of ObjectStore::create)
+typedef uint32_t osflagbits_t;
+const int SKIP_JOURNAL_REPLAY = 1 << 0;  // bit 0
+const int SKIP_MOUNT_OMAP = 1 << 1;  // bit 1
+
+class ObjectStore {
+protected:
+  std::string path;
+
+public:
+  using Transaction = ceph::os::Transaction;
+
+  CephContext* cct;
+  /**
+   * create - create an ObjectStore instance.
+   *
+   * This is invoked once at initialization time.
+   *
+   * @param type type of store. This is a std::string from the configuration file.
+   * @param data path (or other descriptor) for data
+   * @param journal path (or other descriptor) for journal (optional)
+   * @param flags which filestores should check if applicable
+   */
+#ifndef WITH_SEASTAR
+  static std::unique_ptr<ObjectStore> create(
+    CephContext *cct,
+    const std::string& type,
+    const std::string& data,
+    const std::string& journal,
+    osflagbits_t flags = 0);
+#endif
+  static std::unique_ptr<ObjectStore> create(
+    CephContext *cct,
+    const std::string& type,
+    const std::string& data);
+
+  /**
+   * probe a block device to learn the uuid of the owning OSD
+   *
+   * @param cct cct
+   * @param path path to device
+   * @param fsid [out] osd uuid
+   */
+  static int probe_block_device_fsid(
+    CephContext *cct,
+    const std::string& path,
+    uuid_d *fsid);
+
+  /**
+   * Fetch Object Store statistics.
+   *
+   * Currently only latency of write and apply times are measured.
+   *
+   * This appears to be called with nothing locked.
+   */
+  virtual objectstore_perf_stat_t get_cur_stats() = 0;
+
+  /**
+   * Fetch Object Store performance counters.
+   *
+   *
+   * This appears to be called with nothing locked.
+   */
+  virtual const PerfCounters* get_perf_counters() const = 0;
+
+  /**
+   * a collection also orders transactions
+   *
+   * Any transactions queued under a given collection will be applied in
+   * sequence.  Transactions queued under different collections may run
+   * in parallel.
+   *
+   * ObjectStore users may get collection handles with open_collection() (or,
+   * for bootstrapping a new collection, create_new_collection()).
+   */
+  struct CollectionImpl : public RefCountedObject {
+    const coll_t cid;  ///< collection id, fixed at construction
+
+    /// wait for any queued transactions to apply
+    // block until any previous transactions are visible.  specifically,
+    // collection_list and collection_empty need to reflect prior operations.
+    virtual void flush() = 0;
+
+    /**
+     * Async flush_commit
+     *
+     * There are two cases:
+     * 1) collection is currently idle: the method returns true.  c is
+     *    not touched.
+     * 2) collection is not idle: the method returns false and c is
+     *    called asynchronously with a value of 0 once all transactions
+     *    queued on this collection prior to the call have been applied
+     *    and committed.
+     */
+    virtual bool flush_commit(Context *c) = 0;
+
+    const coll_t &get_cid() {  // accessor for cid; note it is not const-qualified
+      return cid;
+    }
+  protected:
+    CollectionImpl() = delete;  // a cid must always be supplied
+    CollectionImpl(CephContext* cct, const coll_t& c) : RefCountedObject(cct), cid(c) {}
+    ~CollectionImpl() = default;  // protected; presumably released via the RefCountedObject base -- TODO confirm
+  };
+  using CollectionHandle = ceph::ref_t<CollectionImpl>;
+
+
+  /*********************************
+   *
+   * Object Contents and semantics
+   *
+   * All ObjectStore objects are identified as a named object
+   * (ghobject_t and hobject_t) in a named collection (coll_t).
+   * ObjectStore operations support the creation, mutation, deletion
+   * and enumeration of objects within a collection.  Enumeration is
+   * in sorted key order (where keys are sorted by hash). Object names
+   * are globally unique.
+   *
+   * Each object has four distinct parts: byte data, xattrs, omap_header
+   * and omap entries.
+   *
+   * The data portion of an object is conceptually equivalent to a
+   * file in a file system. Random and Partial access for both read
+   * and write operations is required. The ability to have a sparse
+   * implementation of the data portion of an object is beneficial for
+   * some workloads, but not required. There is a system-wide limit on
+   * the maximum size of an object, which is typically around 100 MB.
+   *
+   * Xattrs are equivalent to the extended attributes of file
+   * systems. Xattrs are a set of key/value pairs.  Sub-value access
+   * is not required. It is possible to enumerate the set of xattrs in
+   * key order.  At the implementation level, xattrs are used
+   * exclusively internal to Ceph and the implementer can expect the
+   * total size of all of the xattrs on an object to be relatively
+   * small, i.e., less than 64KB. Much of Ceph assumes that accessing
+   * xattrs on temporally adjacent object accesses (recent past or
+   * near future) is inexpensive.
+   *
+   * omap_header is a single blob of data. It can be read or written
+   * in total.
+   *
+   * Omap entries are conceptually the same as xattrs
+   * but in a different address space. In other words, you can have
+   * the same key as an xattr and an omap entry and they have distinct
+   * values. Enumeration of xattrs doesn't include omap entries and
+   * vice versa. The size and access characteristics of omap entries
+   * are very different from xattrs. In particular, the value portion
+   * of an omap entry can be quite large (MBs).  More importantly, the
+   * interface must support efficient range queries on omap entries even
+   * when there are a large numbers of entries.
+   *
+   *********************************/
+
+  /*******************************
+   *
+   * Collections
+   *
+   * A collection is simply a grouping of objects. Collections have
+   * names (coll_t) and can be enumerated in order.  Like an
+   * individual object, a collection also has a set of xattrs.
+   *
+   *
+   */
+
+
+  int queue_transaction(CollectionHandle& ch,
+			Transaction&& t,
+			TrackedOpRef op = TrackedOpRef(),
+			ThreadPool::TPHandle *handle = NULL) {
+    std::vector<Transaction> batch;  // one-element batch for queue_transactions()
+    batch.emplace_back(std::move(t));
+    return queue_transactions(ch, batch, op, handle);
+  }
+
+  virtual int queue_transactions(
+    CollectionHandle& ch, std::vector<Transaction>& tls,
+    TrackedOpRef op = TrackedOpRef(),
+    ThreadPool::TPHandle *handle = NULL) = 0;
+
+
+ public:
+  ObjectStore(CephContext* cct,
+	      const std::string& path_) : path(path_), cct(cct) {}
+  virtual ~ObjectStore() {}
+
+  // no copying
+  explicit ObjectStore(const ObjectStore& o) = delete;
+  const ObjectStore& operator=(const ObjectStore& o) = delete;
+
+  // versioning
+  virtual int upgrade() {
+    return 0;
+  }
+
+  virtual void get_db_statistics(ceph::Formatter *f) { }
+  virtual void generate_db_histogram(ceph::Formatter *f) { }
+  virtual int flush_cache(std::ostream *os = NULL) { return -1; }
+  virtual void dump_perf_counters(ceph::Formatter *f) {}
+  virtual void dump_cache_stats(ceph::Formatter *f) {}
+  virtual void dump_cache_stats(std::ostream& os) {}
+
+  virtual std::string get_type() = 0;
+
+  // mgmt
+  virtual bool test_mount_in_use() = 0;
+  virtual int mount() = 0;
+  virtual int umount() = 0;
+  virtual int fsck(bool deep) {
+    return -EOPNOTSUPP;
+  }
+  virtual int repair(bool deep) {
+    return -EOPNOTSUPP;
+  }
+  virtual int quick_fix() {
+    return -EOPNOTSUPP;
+  }
+
+  virtual void set_cache_shards(unsigned num) { }
+
+  /**
+   * Returns 0 if the hobject is valid, -error otherwise
+   *
+   * Errors:
+   * -ENAMETOOLONG: locator/namespace/name too large
+   */
+  virtual int validate_hobject_key(const hobject_t &obj) const = 0;
+
+  virtual unsigned get_max_attr_name_length() = 0;
+  virtual int mkfs() = 0;  // wipe
+  virtual int mkjournal() = 0; // journal only
+  virtual bool needs_journal() = 0;  //< requires a journal
+  virtual bool wants_journal() = 0;  //< prefers a journal
+  virtual bool allows_journal() = 0; //< allows a journal
+  virtual void prepare_for_fast_shutdown() {}
+  virtual bool has_null_manager() const { return false; }
+  // return store min allocation size, if applicable
+  virtual uint64_t get_min_alloc_size() const {
+    return 0;
+  }
+
+  /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda)
+  virtual int get_devices(std::set<std::string> *devls) {
+    return -EOPNOTSUPP;
+  }
+
+  /// true if a txn is readable immediately after it is queued.
+  virtual bool is_sync_onreadable() const {
+    return true;
+  }
+
+  /**
+   * is_rotational
+   *
+   * Check whether store is backed by a rotational (HDD) or non-rotational
+   * (SSD) device.
+   *
+   * This must be usable *before* the store is mounted.
+   *
+   * @return true for HDD, false for SSD
+   */
+  virtual bool is_rotational() {
+    return true;
+  }
+
+  /**
+   * is_journal_rotational
+   *
+   * Check whether journal is backed by a rotational (HDD) or non-rotational
+   * (SSD) device.
+   *
+   *
+   * @return true for HDD, false for SSD
+   */
+  virtual bool is_journal_rotational() {
+    return true;
+  }
+
+  virtual std::string get_default_device_class() {
+    return is_rotational() ? "hdd" : "ssd";
+  }
+
+  virtual int get_numa_node(
+    int *numa_node,
+    std::set<int> *nodes,
+    std::set<std::string> *failed) {
+    return -EOPNOTSUPP;
+  }
+
+
+  virtual bool can_sort_nibblewise() {
+    return false;   // assume a backend cannot, unless it says otherwise
+  }
+
+  virtual int statfs(struct store_statfs_t *buf,
+		     osd_alert_list_t* alerts = nullptr) = 0;
+  virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+			  bool *per_pool_omap) = 0;
+
+  virtual void collect_metadata(std::map<std::string,std::string> *pm) { }
+
+  /**
+   * write_meta - write a simple configuration key out-of-band
+   *
+   * Write a simple key/value pair for basic store configuration
+   * (e.g., a uuid or magic number) to an unopened/unmounted store.
+   * The default implementation writes this to a plaintext file in the
+   * path.
+   *
+   * A newline is appended.
+   *
+   * @param key key name (e.g., "fsid")
+   * @param value value (e.g., a uuid rendered as a std::string)
+   * @returns 0 for success, or an error code
+   */
+  virtual int write_meta(const std::string& key,
+			 const std::string& value);
+
+  /**
+   * read_meta - read a simple configuration key out-of-band
+   *
+   * Read a simple key value from an unopened/mounted store.
+   *
+   * Trailing whitespace is stripped off.
+   *
+   * @param key key name
+   * @param value pointer to value std::string
+   * @returns 0 for success, or an error code
+   */
+  virtual int read_meta(const std::string& key,
+			std::string *value);
+
+  /**
+   * get ideal max value for collection_list()
+   *
+   * default to some arbitrary values; the implementation will override.
+   */
+  virtual int get_ideal_list_max() { return 64; }
+
+
+  /**
+   * get a collection handle
+   *
+   * Provide a trivial handle as a default to avoid converting legacy
+   * implementations.
+   */
+  virtual CollectionHandle open_collection(const coll_t &cid) = 0;
+
+  /**
+   * get a collection handle for a soon-to-be-created collection
+   *
+   * This handle must be used by queue_transaction that includes a
+   * create_collection call in order to become valid.  It will become the
+   * reference to the created collection.
+   */
+  virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;
+
+  /**
+   * set the ContextQueue for a collection
+   *
+   * After that, oncommits of Transaction will queue into commit_queue.
+   * And osd ShardThread will call oncommits.
+   */
+  virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;
+
+  /**
+   * Synchronous read operations
+   */
+
+  /**
+   * exists -- Test for existence of object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @returns true if object exists, false otherwise
+   */
+  virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;
+  /**
+   * set_collection_opts -- set pool options for a collection
+   *
+   * @param cid collection
+   * @param opts new collection options
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int set_collection_opts(
+    CollectionHandle& c,
+    const pool_opts_t& opts) = 0;
+
+  /**
+   * stat -- get information for an object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param st output information for the object
+   * @param allow_eio if false, assert on -EIO operation failure
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int stat(
+    CollectionHandle &c,
+    const ghobject_t& oid,
+    struct stat *st,
+    bool allow_eio = false) = 0;
+  /**
+   * read -- read a byte range of data from an object
+   *
+   * Note: if reading from an offset past the end of the object, we
+   * return 0 (not, say, -EINVAL).
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param offset location offset of first byte to be read
+   * @param len number of bytes to be read
+   * @param bl output ceph::buffer::list
+   * @param op_flags is CEPH_OSD_OP_FLAG_*
+   * @returns number of bytes read on success, or negative error code on failure.
+   */
+   virtual int read(
+     CollectionHandle &c,
+     const ghobject_t& oid,
+     uint64_t offset,
+     size_t len,
+     ceph::buffer::list& bl,
+     uint32_t op_flags = 0) = 0;
+
+  /**
+   * fiemap -- get extent map of data of an object
+   *
+   * Returns an encoded map of the extents of an object's data portion
+   * (std::map<offset,size>).
+   *
+   * A non-enlightened implementation is free to return the extent (offset, len)
+   * as the sole extent.
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param offset location offset of first byte to be read
+   * @param len number of bytes to be read
+   * @param bl output ceph::buffer::list for extent map information.
+   * @returns 0 on success, negative error code on failure.
+   */
+   virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+		      uint64_t offset, size_t len, ceph::buffer::list& bl) = 0;
+   virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
+		      uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) = 0;
+
+  /**
+   * readv -- read specific intervals from an object;
+   * caller must call fiemap to fill in the extent-map first.
+   *
+   * Note: if reading from an offset past the end of the object, we
+   * return 0 (not, say, -EINVAL). Also the default version of readv
+   * reads each extent separately synchronously, which can become horribly
+   * inefficient if the physical layout of the pushing object gets massively
+   * fragmented and hence should be overridden by any real os that
+   * cares about the performance.
+   *
+   * @param c collection for object
+   * @param oid oid of object
+   * @param m intervals to be read; pruned in place when a read comes up short
+   * @param bl output ceph::buffer::list
+   * @param op_flags is CEPH_OSD_OP_FLAG_*
+   * @returns number of bytes read on success, or negative error code on failure.
+   */
+   virtual int readv(
+     CollectionHandle &c,
+     const ghobject_t& oid,
+     interval_set<uint64_t>& m,
+     ceph::buffer::list& bl,
+     uint32_t op_flags = 0) {
+     int total = 0;  // running sum of the per-extent read() results
+     for (auto p = m.begin(); p != m.end(); p++) {
+       ceph::buffer::list t;
+       int r = read(c, oid, p.get_start(), p.get_len(), t, op_flags);
+       if (r < 0)
+         return r;  // first failing extent aborts; bl keeps what was appended so far
+       total += r;
+       // prune fiemap, if necessary
+       if (p.get_len() != t.length()) {
+          auto save = p++;  // step past this interval before it is erased or resized
+          if (t.length() == 0) {
+            m.erase(save); // Remove this empty interval
+          } else {
+            save.set_len(t.length()); // fix interval length
+            bl.claim_append(t);
+          }
+          // Remove any other follow-up intervals present too
+          while (p != m.end()) {
+            save = p++;
+            m.erase(save);
+          }
+          break;  // a short read ends the whole readv
+       }
+       bl.claim_append(t);
+     }
+     return total;
+   }
+
+  /**
+   * dump_onode -- dumps onode metadata in human readable form,
+   * intended primarily for debugging
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param section_name section name to create and print under
+   * @param f Formatter class instance to print to
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int dump_onode(
+    CollectionHandle &c,
+    const ghobject_t& oid,
+    const std::string& section_name,
+    ceph::Formatter *f) {
+    return -ENOTSUP;
+  }
+
+  /**
+   * getattr -- get an xattr of an object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param name name of attr to read
+   * @param value place to put output result.
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
+		      const char *name, ceph::buffer::ptr& value) = 0;
+
+  /**
+   * getattr -- get an xattr of an object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param name name of attr to read
+   * @param value place to put output result.
+   * @returns 0 on success, negative error code on failure.
+   */
+  int getattr(
+    CollectionHandle &c, const ghobject_t& oid,
+    const std::string& name, ceph::buffer::list& value) {
+    ceph::buffer::ptr raw;
+    const int rc = getattr(c, oid, name.c_str(), raw);  // ptr-based virtual overload
+    value.push_back(raw);  // appended regardless of rc
+    return rc;
+  }
+
+  /**
+   * getattrs -- get all of the xattrs of an object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param aset place to put output result.
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
+		       std::map<std::string,ceph::buffer::ptr, std::less<>>& aset) = 0;
+
+  /**
+   * getattrs -- get all of the xattrs of an object
+   *
+   * @param cid collection for object
+   * @param oid oid of object
+   * @param aset place to put output result.
+   * @returns 0 on success, negative error code on failure.
+   */
+  int getattrs(CollectionHandle &c, const ghobject_t& oid,
+	       std::map<std::string,ceph::buffer::list,std::less<>>& aset) {
+    std::map<std::string,ceph::buffer::ptr,std::less<>> raw_attrs;
+    const int rc = getattrs(c, oid, raw_attrs);
+    // append every fetched ptr to the list stored under the same name
+    for (const auto& [attr_name, attr_ptr] : raw_attrs)
+      aset[attr_name].append(attr_ptr);
+    return rc;
+  }
+
+
+  // collections
+
+  /**
+   * list_collections -- get all of the collections known to this ObjectStore
+   *
+   * @param ls list of the collections in sorted order.
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int list_collections(std::vector<coll_t>& ls) = 0;
+
+  /**
+   * does a collection exist?
+   *
+   * @param c collection
+   * @returns true if it exists, false otherwise
+   */
+  virtual bool collection_exists(const coll_t& c) = 0;
+
+  /**
+   * is a collection empty?
+   *
+   * @param c collection
+   * @param empty true if the specified collection is empty, false otherwise
+   * @returns 0 on success, negative error code on failure.
+   */
+  virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;
+
+  /**
+   * return the number of significant bits of the coll_t::pgid.
+   *
+   * This should return what the last create_collection or split_collection
+   * set.  A legacy backend may return -EAGAIN if the value is unavailable
+   * (because we upgraded from an older version, e.g., FileStore).
+   */
+  virtual int collection_bits(CollectionHandle& c) = 0;
+
+
+  /**
+   * list contents of a collection that fall in the range [start, end), returning no more than a specified number of results
+   *
+   * @param c collection
+   * @param start list objects that sort >= this value
+   * @param end list objects that sort < this value
+   * @param max return no more than this many results
+   * @param seq return no objects with snap < seq
+   * @param ls [out] result
+   * @param next [out] next item sorts >= this value
+   * @return zero on success, or negative error
+   */
+  virtual int collection_list(CollectionHandle &c,
+			      const ghobject_t& start, const ghobject_t& end,
+			      int max,
+			      std::vector<ghobject_t> *ls, ghobject_t *next) = 0;
+
+  virtual int collection_list_legacy(CollectionHandle &c,
+                                     const ghobject_t& start,
+                                     const ghobject_t& end, int max,
+                                     std::vector<ghobject_t> *ls,
+                                     ghobject_t *next) {
+    return collection_list(c, start, end, max, ls, next);
+  }
+
+  /// OMAP
+  /// Get omap contents
+  virtual int omap_get(
+    CollectionHandle &c,     ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value std::map
+    ) = 0;
+
+  /// Get omap header
+  virtual int omap_get_header(
+    CollectionHandle &c,     ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    bool allow_eio = false ///< [in] don't assert on eio
+    ) = 0;
+
+  /// Get keys defined on oid
+  virtual int omap_get_keys(
+    CollectionHandle &c,   ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    std::set<std::string> *keys      ///< [out] Keys defined on oid
+    ) = 0;
+
+  /// Get key values
+  virtual int omap_get_values(
+    CollectionHandle &c,         ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::set<std::string> &keys,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) = 0;
+
+#ifdef WITH_SEASTAR
+  virtual int omap_get_values(
+    CollectionHandle &c,         ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::optional<std::string> &start_after,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) = 0;
+#endif
+
+  /// Filters keys into out which are defined on oid
+  virtual int omap_check_keys(
+    CollectionHandle &c,     ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out         ///< [out] Subset of keys defined on oid
+    ) = 0;
+
+  /**
+   * Returns an object map iterator
+   *
+   * Warning!  The returned iterator is an implicit lock on filestore
+   * operations in c.  Do not use filestore methods on c while the returned
+   * iterator is live.  (Filling in a transaction is no problem).
+   *
+   * @return iterator, null on error
+   */
+  virtual ObjectMap::ObjectMapIterator get_omap_iterator(
+    CollectionHandle &c,   ///< [in] collection
+    const ghobject_t &oid  ///< [in] object
+    ) = 0;
+
+  virtual int flush_journal() { return -EOPNOTSUPP; }
+
+  virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }
+
+  virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; }
+
+  /**
+   * Set and get internal fsid for this instance. No external data is modified
+   */
+  virtual void set_fsid(uuid_d u) = 0;
+  virtual uuid_d get_fsid() = 0;
+
+  /**
+  * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store
+  * - num objects - total (including whiteouts) object count to measure used space for.
+  */
+  virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;
+
+
+  // DEBUG
+  virtual void inject_data_error(const ghobject_t &oid) {}
+  virtual void inject_mdata_error(const ghobject_t &oid) {}
+
+  virtual void compact() {}
+  virtual bool has_builtin_csum() const {
+    return false;
+  }
+};
+
+#endif
diff --git a/src/os/SequencerPosition.h b/src/os/SequencerPosition.h
new file mode 100644
index 000000000..5ba4699a2
--- /dev/null
+++ b/src/os/SequencerPosition.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_OS_SEQUENCERPOSITION_H
+#define __CEPH_OS_SEQUENCERPOSITION_H
+
+#include "include/types.h"
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+#include <ostream>
+
+/**
+ * Position of one op: a sequence number, a transaction within it, an op within that.
+ */
+struct SequencerPosition {
+  uint64_t seq;  ///< seq
+  uint32_t trans; ///< transaction in that seq (0-based)
+  uint32_t op;    ///< op in that transaction (0-based)
+
+  SequencerPosition(uint64_t s=0, int32_t t=0, int32_t o=0) : seq(s), trans(t), op(o) {}  // NOTE(review): t/o are int32_t but stored in uint32_t members; confirm negatives are never passed
+
+  auto operator<=>(const SequencerPosition&) const = default;  // memberwise, in declaration order: seq, trans, op
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);  // presumably (struct version, compat version) -- see include/encoding.h
+    encode(seq, bl);    // field order here must match decode() below
+    encode(trans, bl);
+    encode(op, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(seq, p);     // same field order as encode()
+    decode(trans, p);
+    decode(op, p);
+    DECODE_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_unsigned("seq", seq);  // all three fields are emitted as unsigned values
+    f->dump_unsigned("trans", trans);
+    f->dump_unsigned("op", op);
+  }
+  static void generate_test_instances(std::list<SequencerPosition*>& o) {
+    o.push_back(new SequencerPosition);  // all-zero default; raw pointers, presumably freed by the caller
+    o.push_back(new SequencerPosition(1, 2, 3));
+    o.push_back(new SequencerPosition(4, 5, 6));
+  }
+};
+WRITE_CLASS_ENCODER(SequencerPosition)
+
+inline std::ostream& operator<<(std::ostream& out, const SequencerPosition& t) {
+  return out << t.seq << "." << t.trans << "." << t.op;  // dotted form: seq.trans.op
+}
+
+#endif
diff --git a/src/os/Transaction.cc b/src/os/Transaction.cc
new file mode 100644
index 000000000..f99b25220
--- /dev/null
+++ b/src/os/Transaction.cc
@@ -0,0 +1,583 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "os/Transaction.h"
+#include "common/Formatter.h"
+
+using std::less;
+using std::list;
+using std::map;
+using std::ostream;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+
+// Copy the still-encoded region at p (a u32 count, then count pairs of u32-length-
+// prefixed blobs) into *out, advancing p past it without decoding the blobs.
+void decode_str_str_map_to_bl(bufferlist::const_iterator& p, bufferlist *out)
+{
+  auto region_begin = p;       // iterator copy taken at the start of the region
+  __u32 remaining;
+  decode(remaining, p);
+  unsigned region_len = 4;     // bytes taken by the count itself
+  for (; remaining > 0; --remaining) {
+    for (int part = 0; part < 2; ++part) {  // two blobs per entry (presumably key, then value)
+      __u32 part_len;
+      decode(part_len, p);
+      p += part_len;
+      region_len += 4 + part_len;
+    }
+  }
+  region_begin.copy(region_len, *out);
+}
+
+// Copy the still-encoded region at p (u32 count, then count u32-length-prefixed blobs) into *out.
+void decode_str_set_to_bl(bufferlist::const_iterator& p, bufferlist *out)
+{
+  auto region_begin = p;       // iterator copy taken at the start of the region
+  __u32 remaining;
+  decode(remaining, p);
+  unsigned region_len = 4;     // bytes taken by the count itself
+  for (; remaining > 0; --remaining) {
+    __u32 item_len;
+    decode(item_len, p);
+    p += item_len;             // skip the blob instead of decoding it
+    region_len += 4 + item_len;
+  }
+  region_begin.copy(region_len, *out);
+}
+
+namespace ceph::os {
+// Emit an "ops" array into f holding one "op" object per operation, in iteration order.
+void Transaction::dump(ceph::Formatter *f)
+{
+  f->open_array_section("ops");
+  iterator i = begin();
+  int op_num = 0;
+  bool stop_looping = false;  // set once an unrecognised op code is seen
+  while (i.have_op() && !stop_looping) {
+    Transaction::Op *op = i.decode_op();
+    f->open_object_section("op");
+    f->dump_int("op_num", op_num);
+    // Cases whose op carries a payload pull it from the iterator (decode_bl, decode_string, ...).
+    switch (op->op) {
+    case Transaction::OP_NOP:
+      f->dump_string("op_name", "nop");
+      break;
+    case Transaction::OP_CREATE:
+      {
+	coll_t cid = i.get_cid(op->cid);
+	ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "create");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+
+    case Transaction::OP_TOUCH:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "touch");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+      
+    case Transaction::OP_WRITE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	bufferlist bl;
+	i.decode_bl(bl);
+	f->dump_string("op_name", "write");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+        f->dump_unsigned("length", len);
+        f->dump_unsigned("offset", off);
+        f->dump_unsigned("bufferlist length", bl.length());
+      }
+      break;
+      
+    case Transaction::OP_ZERO:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	f->dump_string("op_name", "zero");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+        f->dump_unsigned("offset", off);
+	f->dump_unsigned("length", len);
+      }
+      break;
+      
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+	f->dump_string("op_name", "trim_cache");
+      }
+      break;
+      
+    case Transaction::OP_TRUNCATE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+	f->dump_string("op_name", "truncate");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->dump_unsigned("offset", off);
+      }
+      break;
+      
+    case Transaction::OP_REMOVE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "remove");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+      
+    case Transaction::OP_SETATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        string name = i.decode_string();
+	bufferlist bl;
+	i.decode_bl(bl);
+	f->dump_string("op_name", "setattr");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->dump_string("name", name);
+	f->dump_unsigned("length", bl.length());
+      }
+      break;
+      
+    case Transaction::OP_SETATTRS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	map<string, bufferptr> aset;
+	i.decode_attrset(aset);
+	f->dump_string("op_name", "setattrs");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->open_object_section("attr_lens");  // only value lengths are dumped, not the values
+	for (map<string,bufferptr>::iterator p = aset.begin();
+	    p != aset.end(); ++p) {
+	  f->dump_unsigned(p->first.c_str(), p->second.length());
+	}
+	f->close_section();
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        string name = i.decode_string();
+	f->dump_string("op_name", "rmattr");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->dump_string("name", name);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "rmattrs");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+      
+    case Transaction::OP_CLONE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+	f->dump_string("op_name", "clone");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("src_oid") << oid;
+	f->dump_stream("dst_oid") << noid;
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	f->dump_string("op_name", "clonerange");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("src_oid") << oid;
+	f->dump_stream("dst_oid") << noid;
+	f->dump_unsigned("offset", off);
+	f->dump_unsigned("len", len);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+	f->dump_string("op_name", "clonerange2");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("src_oid") << oid;
+	f->dump_stream("dst_oid") << noid;
+	f->dump_unsigned("src_offset", srcoff);
+	f->dump_unsigned("len", len);
+	f->dump_unsigned("dst_offset", dstoff);
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	f->dump_string("op_name", "mkcoll");
+	f->dump_stream("collection") << cid;
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+	using ceph::decode;  // presumably so unqualified decode() below is not captured by the member Transaction::decode
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t type = op->hint;
+        f->dump_string("op_name", "coll_hint");
+        f->dump_stream("collection") << cid;
+        f->dump_unsigned("type", type);
+        bufferlist hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {  // other hint types: payload is not interpreted
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          f->dump_unsigned("pg_num", pg_num);
+          f->dump_unsigned("expected_num_objects", num_objs);
+        }
+      }
+      break;
+
+    case Transaction::OP_COLL_SET_BITS:
+      {
+	coll_t cid = i.get_cid(op->cid);
+	f->dump_string("op_name", "coll_set_bits");
+	f->dump_stream("collection") << cid;
+	f->dump_unsigned("bits", op->split_bits);
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	f->dump_string("op_name", "rmcoll");
+	f->dump_stream("collection") << cid;
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      {
+        coll_t ocid = i.get_cid(op->cid);
+        coll_t ncid = i.get_cid(op->dest_cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "collection_add");
+	f->dump_stream("src_collection") << ocid;
+	f->dump_stream("dst_collection") << ncid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+       {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "collection_remove");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+       }
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+       {
+        coll_t ocid = i.get_cid(op->cid);
+        coll_t ncid = i.get_cid(op->dest_cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->open_object_section("collection_move");  // NOTE(review): only case emitting a nested section and no "op_name"; output shape kept as-is
+	f->dump_stream("src_collection") << ocid;
+	f->dump_stream("dst_collection") << ncid;
+	f->dump_stream("oid") << oid;
+	f->close_section();
+       }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        string name = i.decode_string();
+	bufferlist bl;
+	i.decode_bl(bl);
+	f->dump_string("op_name", "collection_setattr");
+	f->dump_stream("collection") << cid;
+	f->dump_string("name", name);
+	f->dump_unsigned("length", bl.length());
+      }
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        string name = i.decode_string();
+	f->dump_string("op_name", "collection_rmattr");
+	f->dump_stream("collection") << cid;
+	f->dump_string("name", name);
+      }
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      {
+	f->dump_string("op_name", "collection_rename");  // only the name is dumped for this op
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	f->dump_string("op_name", "omap_clear");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+      }
+      break;
+
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	map<string, bufferlist> aset;
+	i.decode_attrset(aset);
+	f->dump_string("op_name", "omap_setkeys");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->open_object_section("attr_lens");  // only value lengths are dumped, not the values
+	for (map<string, bufferlist>::iterator p = aset.begin();
+	    p != aset.end(); ++p) {
+	  f->dump_unsigned(p->first.c_str(), p->second.length());
+	}
+	f->close_section();
+      }
+      break;
+
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	set<string> keys;
+	i.decode_keyset(keys);
+	f->dump_string("op_name", "omap_rmkeys");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->open_array_section("attrs");
+	for (auto& k : keys) {
+	  f->dump_string("", k.c_str());
+	}
+	f->close_section();
+      }
+      break;
+
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	bufferlist bl;
+	i.decode_bl(bl);
+	f->dump_string("op_name", "omap_setheader");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->dump_stream("header_length") << bl.length();
+      }
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+        coll_t dest = i.get_cid(op->dest_cid);
+	f->dump_string("op_name", "op_split_collection_create");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("bits") << bits;
+	f->dump_stream("rem") << rem;
+	f->dump_stream("dest") << dest;
+      }
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+        coll_t dest = i.get_cid(op->dest_cid);
+	f->dump_string("op_name", "op_split_collection");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("bits") << bits;
+	f->dump_stream("rem") << rem;
+	f->dump_stream("dest") << dest;
+      }
+      break;
+
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        coll_t dest = i.get_cid(op->dest_cid);
+	f->dump_string("op_name", "op_merge_collection");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("dest") << dest;
+	f->dump_stream("bits") << bits;
+      }
+      break;
+
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+	f->dump_string("op_name", "op_omap_rmkeyrange");
+	f->dump_stream("collection") << cid;
+	f->dump_stream("oid") << oid;
+	f->dump_string("first", first);
+	f->dump_string("last", last);
+      }
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+        coll_t old_cid = i.get_cid(op->cid);
+        ghobject_t old_oid = i.get_oid(op->oid);
+        coll_t new_cid = i.get_cid(op->dest_cid);
+        ghobject_t new_oid = i.get_oid(op->dest_oid);
+	f->dump_string("op_name", "op_coll_move_rename");
+	f->dump_stream("old_collection") << old_cid;
+	f->dump_stream("old_oid") << old_oid;
+	f->dump_stream("new_collection") << new_cid;
+	f->dump_stream("new_oid") << new_oid;
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t old_oid = i.get_oid(op->oid);
+        ghobject_t new_oid = i.get_oid(op->dest_oid);
+	f->dump_string("op_name", "op_try_rename");  // own name, so dumps can tell it apart from OP_COLL_MOVE_RENAME
+	f->dump_stream("collection") << cid;
+	f->dump_stream("old_oid") << old_oid;
+	f->dump_stream("new_oid") << new_oid;
+      }
+      break;
+	
+    case Transaction::OP_SETALLOCHINT:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t expected_object_size = op->expected_object_size;
+        uint64_t expected_write_size = op->expected_write_size;
+        uint32_t alloc_hint_flags = op->hint;
+        f->dump_string("op_name", "op_setallochint");
+        f->dump_stream("collection") << cid;
+        f->dump_stream("oid") << oid;
+        f->dump_stream("expected_object_size") << expected_object_size;
+        f->dump_stream("expected_write_size") << expected_write_size;
+        f->dump_string("alloc_hint_flags", ceph_osd_alloc_hint_flag_string(alloc_hint_flags));
+      }
+      break;
+
+    default:
+      f->dump_string("op_name", "unknown");
+      f->dump_unsigned("op_code", op->op);
+      stop_looping = true;  // no further ops are dumped after an unknown op code
+      break;
+    }
+    f->close_section();
+    op_num++;
+  }
+  f->close_section();
+}
+
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+void Transaction::generate_test_instances(list<Transaction*>& o)
+{
+  // Shared fixtures: two collections, three objects and one small payload.
+  coll_t cid_a(spg_t(pg_t(1,2), shard_id_t::NO_SHARD));
+  coll_t cid_b(spg_t(pg_t(4,5), shard_id_t::NO_SHARD));
+  ghobject_t obj1(hobject_t("obj", "", 123, 456, -1, ""));
+  ghobject_t obj2(hobject_t("obj2", "", 123, 456, -1, ""));
+  ghobject_t obj3(hobject_t("obj3", "", 123, 456, -1, ""));
+  bufferlist payload;
+  payload.append("some data");
+  // 1) an empty transaction
+  o.push_back(new Transaction);
+  // 2) a lone no-op
+  Transaction *nop_txn = new Transaction;
+  nop_txn->nop();
+  o.push_back(nop_txn);
+  // 3) object data ops: touch, write, zero, truncate, remove
+  Transaction *data_txn = new Transaction;
+  data_txn->touch(cid_a, obj1);
+  data_txn->write(cid_a, obj1, 1, payload.length(), payload);
+  data_txn->zero(cid_a, obj1, 22, 33);
+  data_txn->truncate(cid_a, obj1, 99);
+  data_txn->remove(cid_a, obj1);
+  o.push_back(data_txn);
+  // 4) attribute, clone and collection ops
+  Transaction *meta_txn = new Transaction;
+  meta_txn->setattr(cid_a, obj1, "key", payload);
+  map<string,bufferptr,less<>> attrs;
+  attrs["a"] = buffer::copy("this", 4);
+  attrs["b"] = buffer::copy("that", 4);
+  meta_txn->setattrs(cid_a, obj1, attrs);
+  meta_txn->rmattr(cid_a, obj1, "b");
+  meta_txn->rmattrs(cid_a, obj1);
+  meta_txn->clone(cid_a, obj1, obj2);
+  meta_txn->clone(cid_a, obj1, obj3);
+  meta_txn->clone_range(cid_a, obj1, obj2, 1, 12, 99);
+  meta_txn->create_collection(cid_a, 12);
+  meta_txn->collection_move_rename(cid_a, obj2, cid_b, obj3);
+  meta_txn->remove_collection(cid_a);
+  o.push_back(meta_txn);
+}
+// Streams "Transaction(<address of tx>)"; the ops themselves are not printed here.
+ostream& operator<<(ostream& out, const Transaction& tx) {
+  out << "Transaction(" << &tx << ")";
+  return out;
+}
+
+#pragma GCC diagnostic pop
+#pragma GCC diagnostic warning "-Wpragmas"
+
+}
diff --git a/src/os/Transaction.h b/src/os/Transaction.h
new file mode 100644
index 000000000..f28a257fc
--- /dev/null
+++ b/src/os/Transaction.h
@@ -0,0 +1,1301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+
+#include "include/Context.h"
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+#include "osd/osd_types.h"
+
+#define OPS_PER_PTR 32
+
+void decode_str_str_map_to_bl(ceph::buffer::list::const_iterator& p, ceph::buffer::list *out);
+void decode_str_set_to_bl(ceph::buffer::list::const_iterator& p, ceph::buffer::list *out);
+
+
+/*********************************
+ * transaction
+ *
+ * A Transaction represents a sequence of primitive mutation
+ * operations.
+ *
+ * Three events in the life of a Transaction result in
+ * callbacks. Any Transaction can contain any number of callback
+ * objects (Context) for any combination of the three classes of
+ * callbacks:
+ *
+ *    on_applied_sync, on_applied, and on_commit.
+ *
+ * The "on_applied" and "on_applied_sync" callbacks are invoked when
+ * the modifications requested by the Transaction are visible to
+ * subsequent ObjectStore operations, i.e., the results are
+ * readable. The only conceptual difference between on_applied and
+ * on_applied_sync is the specific thread and locking environment in
+ * which the callbacks operate.  "on_applied_sync" is called
+ * directly by an ObjectStore execution thread. It is expected to
+ * execute quickly and must not acquire any locks of the calling
+ * environment. Conversely, "on_applied" is called from the separate
+ * Finisher thread, meaning that it can contend for calling
+ * environment locks. NB, on_applied and on_applied_sync are
+ * sometimes called on_readable and on_readable_sync.
+ *
+ * The "on_commit" callback is also called from the Finisher thread
+ * and indicates that all of the mutations have been durably
+ * committed to stable storage (i.e., are now software/hardware
+ * crashproof).
+ *
+ * At the implementation level, each mutation primitive (and its
+ * associated data) can be serialized to a single buffer.  That
+ * serialization, however, does not copy any data, but (using the
+ * ceph::buffer::list library) will reference the original buffers.  This
+ * implies that the buffer that contains the data being submitted
+ * must remain stable until the on_commit callback completes.  In
+ * practice, ceph::buffer::list handles all of this for you and this
+ * subtlety is only relevant if you are referencing an existing
+ * buffer via buffer::raw_static.
+ *
+ * Some implementations of ObjectStore choose to implement their own
+ * form of journaling that uses the serialized form of a
+ * Transaction. This requires that the encode/decode logic properly
+ * version itself and handle version upgrades that might change the
+ * format of the encoded Transaction. This has already happened a
+ * couple of times and the Transaction object contains some helper
+ * variables that aid in this legacy decoding:
+ *
+ *   sobject_encoding detects an older/simpler version of oid
+ *   present in pre-bobtail versions of ceph.  use_pool_override
+ *   also detects a situation where the pool of an oid can be
+ *   overridden for legacy operations/buffers.  For non-legacy
+ *   implementations of ObjectStore, neither of these fields are
+ *   relevant.
+ *
+ *
+ * TRANSACTION ISOLATION
+ *
+ * Except as noted above, isolation is the responsibility of the
+ * caller. In other words, if any storage element (storage element
+ * == any of the four portions of an object as described above) is
+ * altered by a transaction (including deletion), the caller
+ * promises not to attempt to read that element while the
+ * transaction is pending (here pending means from the time of
+ * issuance until the "on_applied_sync" callback has been
+ * received). Violations of isolation need not be detected by
+ * ObjectStore and there is no corresponding error mechanism for
+ * reporting an isolation violation (crashing would be the
+ * appropriate way to report an isolation violation if detected).
+ *
+ * Enumeration operations may violate transaction isolation as
+ * described above when a storage element is being created or
+ * deleted as part of a transaction. In this case, ObjectStore is
+ * allowed to consider the enumeration operation to either precede
+ * or follow the violating transaction element. In other words, the
+ * presence/absence of the mutated element in the enumeration is
+ * entirely at the discretion of ObjectStore. The arbitrary ordering
+ * applies independently to each transaction element. For example,
+ * if a transaction contains two mutating elements, "create A" and
+ * "delete B", and an enumeration operation is performed while this
+ * transaction is pending, it is permissible for ObjectStore to
+ * report any of the four possible combinations of the existence of
+ * A and B.
+ *
+ */
+namespace ceph::os {
+class Transaction {
+public:
+  enum {  // op codes stored in Op::op; trailing comments list each op's arguments
+    OP_NOP =          0,
+    OP_CREATE =       7,   // cid, oid
+    OP_TOUCH =        9,   // cid, oid
+    OP_WRITE =        10,  // cid, oid, offset, len, bl
+    OP_ZERO =         11,  // cid, oid, offset, len
+    OP_TRUNCATE =     12,  // cid, oid, off (dump() reports Op::off as "offset")
+    OP_REMOVE =       13,  // cid, oid
+    OP_SETATTR =      14,  // cid, oid, attrname, bl
+    OP_SETATTRS =     15,  // cid, oid, attrset
+    OP_RMATTR =       16,  // cid, oid, attrname
+    OP_CLONE =        17,  // cid, oid, newoid
+    OP_CLONERANGE =   18,  // cid, oid, newoid, offset, len
+    OP_CLONERANGE2 =  30,  // cid, oid, newoid, srcoff, len, dstoff
+
+    OP_TRIMCACHE =    19,  // cid, oid, offset, len  **DEPRECATED**
+
+    OP_MKCOLL =       20,  // cid
+    OP_RMCOLL =       21,  // cid
+    OP_COLL_ADD =     22,  // cid, oldcid, oid
+    OP_COLL_REMOVE =  23,  // cid, oid
+    OP_COLL_SETATTR = 24,  // cid, attrname, bl
+    OP_COLL_RMATTR =  25,  // cid, attrname
+    OP_COLL_SETATTRS = 26,  // cid, attrset
+    OP_COLL_MOVE =    8,   // newcid, oldcid, oid
+
+    OP_RMATTRS =      28,  // cid, oid
+    OP_COLL_RENAME =       29,  // cid, newcid
+
+    OP_OMAP_CLEAR = 31,   // cid, oid
+    OP_OMAP_SETKEYS = 32, // cid, oid, attrset
+    OP_OMAP_RMKEYS = 33,  // cid, oid, keyset
+    OP_OMAP_SETHEADER = 34, // cid, oid, header
+    OP_SPLIT_COLLECTION = 35, // cid, bits, rem, destination
+    OP_SPLIT_COLLECTION2 = 36, /* cid, bits, rem, destination
+				    doesn't create the destination */
+    OP_OMAP_RMKEYRANGE = 37,  // cid, oid, firstkey, lastkey
+    OP_COLL_MOVE_RENAME = 38,   // oldcid, oldoid, newcid, newoid
+
+    OP_SETALLOCHINT = 39,  // cid, oid, object_size, write_size, hint flags
+    OP_COLL_HINT = 40, // cid, type, bl
+
+    OP_TRY_RENAME = 41,   // oldcid, oldoid, newoid
+
+    OP_COLL_SET_BITS = 42, // cid, bits
+
+    OP_MERGE_COLLECTION = 43, // cid, destination, bits
+  };
+
+  // Transaction hint type
+  enum {  // values of Op::hint for OP_COLL_HINT
+    COLL_HINT_EXPECTED_NUM_OBJECTS = 1,  // hint bl holds: u32 pg_num, u64 expected_num_objects
+  };
+
+  struct Op {  // fixed-layout (packed) record for one operation; which fields matter depends on op
+    ceph_le32 op;                     //one of the OP_* codes
+    ceph_le32 cid;                    //collection id, resolved via iterator get_cid()
+    ceph_le32 oid;                    //object id, resolved via iterator get_oid()
+    ceph_le64 off;
+    ceph_le64 len;
+    ceph_le32 dest_cid;               //OP_COLL_ADD, OP_COLL_MOVE, OP_COLL_MOVE_RENAME, split/merge ops
+    ceph_le32 dest_oid;               //OP_CLONE, OP_CLONERANGE, OP_CLONERANGE2, OP_COLL_MOVE_RENAME, OP_TRY_RENAME
+    ceph_le64 dest_off;               //OP_CLONERANGE2 (destination offset)
+    ceph_le32 hint;                   //OP_COLL_HINT,OP_SETALLOCHINT
+    ceph_le64 expected_object_size;   //OP_SETALLOCHINT
+    ceph_le64 expected_write_size;    //OP_SETALLOCHINT
+    ceph_le32 split_bits;             //OP_SPLIT_COLLECTION(2),OP_MERGE_COLLECTION,OP_COLL_SET_BITS,
+                                      //OP_MKCOLL
+    ceph_le32 split_rem;              //OP_SPLIT_COLLECTION, OP_SPLIT_COLLECTION2
+  } __attribute__ ((packed)) ;
+
+  struct TransactionData {
+    // Fixed-layout (packed) summary fields; encode()/decode() below copy
+    // sizeof(TransactionData) raw bytes of this struct to/from a buffer.
+    ceph_le64 ops;
+    ceph_le32 largest_data_len;
+    ceph_le32 largest_data_off;
+    ceph_le32 largest_data_off_in_data_bl;
+    ceph_le32 fadvise_flags;
+
+    // A fresh instance has every field set to zero.
+    TransactionData() noexcept :
+      ops(0),
+      largest_data_len(0),
+      largest_data_off(0),
+      largest_data_off_in_data_bl(0),
+      fadvise_flags(0) { }
+
+    // Moves are user-defined so that the source ends up zeroed, i.e. equal
+    // to a default-constructed TransactionData, instead of keeping its values.
+    TransactionData(TransactionData&& other) noexcept :
+      TransactionData(static_cast<const TransactionData&>(other)) {
+      const TransactionData zeroed;
+      other = zeroed;  // const lvalue source: selects the copy assignment
+    }
+
+    TransactionData& operator=(TransactionData&& other) noexcept {
+      // Take a plain copy of other's fields first, then zero other.  With
+      // this == &other the net effect is that *this is zeroed.
+      *this = static_cast<const TransactionData&>(other);
+      const TransactionData zeroed;
+      other = zeroed;
+      return *this;
+    }
+
+    // Copies are ordinary memberwise copies and leave the source untouched.
+    TransactionData(const TransactionData& other) = default;
+    TransactionData& operator=(const TransactionData& other) = default;
+
+    // Append this struct's in-memory bytes to bl; the representation is
+    // exactly the packed field layout declared above.
+    void encode(ceph::buffer::list& bl) const {
+      bl.append((char*)this, sizeof(TransactionData));
+    }
+
+    // Overwrite this struct with sizeof(TransactionData) bytes obtained
+    // through the iterator bl.
+    void decode(ceph::buffer::list::const_iterator &bl) {
+      bl.copy(sizeof(TransactionData), (char*)this);
+    }
+  } __attribute__ ((packed)) ;
+
+
+private:
+  TransactionData data;
+
+  std::map<coll_t, uint32_t> coll_index;
+  std::map<ghobject_t, uint32_t> object_index;
+
+  uint32_t coll_id = 0;
+  uint32_t object_id = 0;
+
+  ceph::buffer::list data_bl;
+  ceph::buffer::list op_bl;
+
+  std::list<Context *> on_applied;
+  std::list<Context *> on_commit;
+  std::list<Context *> on_applied_sync;
+
+public:
+  Transaction() = default;
+
+  explicit Transaction(ceph::buffer::list::const_iterator &dp) {  // construct by decoding from dp
+    decode(dp);
+  }
+  explicit Transaction(ceph::buffer::list &nbl) {  // construct by decoding from the start of nbl
+    auto dp = nbl.cbegin();
+    decode(dp);
+  }
+
+  // Move construction: every member is moved from other, and other's id
+  // counters are reset to 0 so that it does not keep stale values.
+  Transaction(Transaction&& other) noexcept :
+    data(std::move(other.data)),
+    coll_index(std::move(other.coll_index)),
+    object_index(std::move(other.object_index)),
+    coll_id(std::exchange(other.coll_id, 0)),
+    object_id(std::exchange(other.object_id, 0)),
+    data_bl(std::move(other.data_bl)),
+    op_bl(std::move(other.op_bl)),
+    on_applied(std::move(other.on_applied)),
+    on_commit(std::move(other.on_commit)),
+    on_applied_sync(std::move(other.on_applied_sync)) {
+    // nothing left to do: the counters were zeroed by std::exchange above
+  }
+
+  Transaction& operator=(Transaction&& other) noexcept {
+    if (this == &other) return *this;  // self-move: keep ops and registered callbacks intact
+    data = std::move(other.data);  // TransactionData's move leaves other.data zeroed
+    coll_index = std::move(other.coll_index);
+    object_index = std::move(other.object_index);
+    coll_id = other.coll_id;
+    object_id = other.object_id;
+    data_bl = std::move(other.data_bl);
+    op_bl = std::move(other.op_bl);
+    on_applied = std::move(other.on_applied);
+    on_commit = std::move(other.on_commit);
+    on_applied_sync = std::move(other.on_applied_sync);
+    other.coll_id = other.object_id = 0;  // reset the moved-from id counters
+    return *this;
+  }
+
+  Transaction(const Transaction& other) = default;
+  Transaction& operator=(const Transaction& other) = default;
+
+  // expose object_index (ghobject_t -> id map) read-only, for FileStore::Op's benefit
+  const std::map<ghobject_t, uint32_t>& get_object_index() const {
+    return object_index;
+  }
+
+  /* Operations on callback contexts */
+  void register_on_applied(Context *c) {
+    // queue c on the on_applied list; a null callback is ignored
+    if (c) on_applied.push_back(c);
+  }
+  void register_on_commit(Context *c) {
+    // queue c on the on_commit list; a null callback is ignored
+    if (c) on_commit.push_back(c);
+  }
+  void register_on_applied_sync(Context *c) {
+    // queue c on the on_applied_sync list; a null callback is ignored
+    if (c) on_applied_sync.push_back(c);
+  }
+  void register_on_complete(Context *c) {
+    if (!c) return;  // a null callback is ignored
+    RunOnDeleteRef shared_runner(std::make_shared<RunOnDelete>(c));  // one wrapper shared by both registrations
+    register_on_applied(new ContainerContext<RunOnDeleteRef>(shared_runner));
+    register_on_commit(new ContainerContext<RunOnDeleteRef>(shared_runner));
+  }
+  bool has_contexts() const {
+    // true as soon as any of the three callback lists holds an entry
+    const bool none_registered =
+	on_commit.empty() && on_applied.empty() && on_applied_sync.empty();
+    return !none_registered;
+  }
+
+  static void collect_contexts(
+    std::vector<Transaction>& t,
+    Context **out_on_applied,
+    Context **out_on_commit,
+    Context **out_on_applied_sync) {
+    // Move the callbacks of every transaction in t out of it and hand back
+    // one Context per callback class through the three out-pointers.
+    ceph_assert(out_on_applied);
+    ceph_assert(out_on_commit);
+    ceph_assert(out_on_applied_sync);
+    // gather into local lists using the list-based overload
+    std::list<Context *> applied, committed, applied_sync;
+    collect_contexts(t, &applied, &committed, &applied_sync);
+    // fold each list into a single Context
+    *out_on_applied = C_Contexts::list_to_context(applied);
+    *out_on_commit = C_Contexts::list_to_context(committed);
+    *out_on_applied_sync = C_Contexts::list_to_context(applied_sync);
+  }
+  static void collect_contexts(
+    std::vector<Transaction>& t,
+    std::list<Context*> *out_on_applied,
+    std::list<Context*> *out_on_commit,
+    std::list<Context*> *out_on_applied_sync) {
+    ceph_assert(out_on_applied);
+    ceph_assert(out_on_commit);
+    ceph_assert(out_on_applied_sync);
+    // splice moves the nodes, so each transaction's callback lists end up empty
+    for (Transaction& txn : t) {
+	out_on_applied->splice(out_on_applied->end(), txn.on_applied);
+	out_on_commit->splice(out_on_commit->end(), txn.on_commit);
+	out_on_applied_sync->splice(out_on_applied_sync->end(), txn.on_applied_sync);
+    }
+  }
+  static Context *collect_all_contexts(Transaction& t) {
+    // move t's applied, commit and applied_sync callbacks (in that order) into one list
+    std::list<Context*> merged;
+    merged.splice(merged.end(), t.on_applied);
+    merged.splice(merged.end(), t.on_commit);
+    merged.splice(merged.end(), t.on_applied_sync);
+    return C_Contexts::list_to_context(merged);
+  }
+
+  /// Wrap the on_applied callback list via C_Contexts::list_to_context.
+  Context *get_on_applied() {
+    Context *combined = C_Contexts::list_to_context(on_applied);
+    return combined;
+  }
+  /// Wrap the on_commit callback list via C_Contexts::list_to_context.
+  Context *get_on_commit() {
+    Context *combined = C_Contexts::list_to_context(on_commit);
+    return combined;
+  }
+  /// Wrap the on_applied_sync callback list via C_Contexts::list_to_context.
+  Context *get_on_applied_sync() {
+    Context *combined = C_Contexts::list_to_context(on_applied_sync);
+    return combined;
+  }
+
+  /// Overwrite the transaction-wide fadvise flags with `flags`.
+  void set_fadvise_flags(uint32_t flags)
+  {
+    data.fadvise_flags = flags;
+  }
+  /// OR `flag` into the transaction-wide fadvise flags.
+  void set_fadvise_flag(uint32_t flag)
+  {
+    const uint32_t merged = data.fadvise_flags | flag;
+    data.fadvise_flags = merged;
+  }
+  /// Current transaction-wide fadvise flags.
+  uint32_t get_fadvise_flags() {
+    const uint32_t current = data.fadvise_flags;
+    return current;
+  }
+
+  /// Exchange the complete state of this Transaction with `other`.
+  void swap(Transaction& other) noexcept {
+    // encoded payload
+    op_bl.swap(other.op_bl);
+    data_bl.swap(other.data_bl);
+    std::swap(data, other.data);
+
+    // collection/object index tables and their next-id counters
+    std::swap(coll_index, other.coll_index);
+    std::swap(coll_id, other.coll_id);
+    std::swap(object_index, other.object_index);
+    std::swap(object_id, other.object_id);
+
+    // completion callbacks
+    std::swap(on_applied, other.on_applied);
+    std::swap(on_commit, other.on_commit);
+    std::swap(on_applied_sync, other.on_applied_sync);
+  }
+
+  /// Rewrite the collection/object indices stored in one Op through the
+  /// supplied translation tables.
+  ///
+  /// @param op  operation to patch in place
+  /// @param cm  replacement id for each collection index referenced by `op`
+  /// @param om  replacement id for each object index referenced by `op`
+  ///
+  /// Every index is bounds-checked against the table it is looked up in
+  /// before it is translated; an unrecognised opcode aborts.
+  void _update_op(Op* op,
+    std::vector<uint32_t> &cm,
+    std::vector<uint32_t> &om) {
+
+    switch (op->op) {
+    case OP_NOP:
+      break;
+
+    // ops that name one collection and one object
+    case OP_CREATE:
+    case OP_TOUCH:
+    case OP_REMOVE:
+    case OP_SETATTR:
+    case OP_SETATTRS:
+    case OP_RMATTR:
+    case OP_RMATTRS:
+    case OP_COLL_REMOVE:
+    case OP_OMAP_CLEAR:
+    case OP_OMAP_SETKEYS:
+    case OP_OMAP_RMKEYS:
+    case OP_OMAP_RMKEYRANGE:
+    case OP_OMAP_SETHEADER:
+    case OP_WRITE:
+    case OP_ZERO:
+    case OP_TRUNCATE:
+    case OP_SETALLOCHINT:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->oid < om.size());
+      op->cid = cm[op->cid];
+      op->oid = om[op->oid];
+      break;
+
+    // ops that name one collection and a source/destination object pair
+    case OP_CLONERANGE2:
+    case OP_CLONE:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->oid < om.size());
+      ceph_assert(op->dest_oid < om.size());
+      op->cid = cm[op->cid];
+      op->oid = om[op->oid];
+      op->dest_oid = om[op->dest_oid];
+      break;
+
+    // ops that name a collection only
+    case OP_MKCOLL:
+    case OP_RMCOLL:
+    case OP_COLL_SETATTR:
+    case OP_COLL_RMATTR:
+    case OP_COLL_SETATTRS:
+    case OP_COLL_HINT:
+    case OP_COLL_SET_BITS:
+      ceph_assert(op->cid < cm.size());
+      op->cid = cm[op->cid];
+      break;
+
+    case OP_COLL_ADD:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->oid < om.size());
+      // dest_cid is a collection index and is looked up in cm below, so it
+      // has to be range-checked against cm, not om.
+      ceph_assert(op->dest_cid < cm.size());
+      op->cid = cm[op->cid];
+      op->dest_cid = cm[op->dest_cid];
+      op->oid = om[op->oid];
+      break;
+
+    case OP_COLL_MOVE_RENAME:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->oid < om.size());
+      ceph_assert(op->dest_cid < cm.size());
+      ceph_assert(op->dest_oid < om.size());
+      op->cid = cm[op->cid];
+      op->oid = om[op->oid];
+      op->dest_cid = cm[op->dest_cid];
+      op->dest_oid = om[op->dest_oid];
+      break;
+
+    case OP_TRY_RENAME:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->oid < om.size());
+      ceph_assert(op->dest_oid < om.size());
+      op->cid = cm[op->cid];
+      op->oid = om[op->oid];
+      op->dest_oid = om[op->dest_oid];
+      break;
+
+    // ops that name a source and a destination collection
+    case OP_SPLIT_COLLECTION2:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->dest_cid < cm.size());
+      op->cid = cm[op->cid];
+      op->dest_cid = cm[op->dest_cid];
+      break;
+
+    case OP_MERGE_COLLECTION:
+      ceph_assert(op->cid < cm.size());
+      ceph_assert(op->dest_cid < cm.size());
+      op->cid = cm[op->cid];
+      op->dest_cid = cm[op->dest_cid];
+      break;
+
+    default:
+      ceph_abort_msg("Unknown OP");
+    }
+  }
+  /// Apply _update_op() to every Op stored in `bl`, one buffer segment at a
+  /// time. Each segment must hold a whole number of Op records.
+  void _update_op_bl(
+    ceph::buffer::list& bl,
+    std::vector<uint32_t> &cm,
+    std::vector<uint32_t> &om) {
+    for (auto& segment : bl.buffers()) {
+      ceph_assert(segment.length() % sizeof(Op) == 0);
+
+      // The ops are patched in place, hence the const_cast.
+      char* cursor = const_cast<char*>(segment.c_str());
+      char* const limit = cursor + segment.length();
+      for (; cursor < limit; cursor += sizeof(Op)) {
+        _update_op(reinterpret_cast<Op*>(cursor), cm, om);
+      }
+    }
+  }
+  /// Append the operations of the parameter to this Transaction. The parameter's callback lists are moved into this Transaction; its encoded operations and data are appended here but are not cleared from the parameter.
+  void append(Transaction& other) {
+    // Fold the other transaction's counters and hints into ours.
+    data.ops = data.ops + other.data.ops;
+    if (other.data.largest_data_len > data.largest_data_len) {
+      data.largest_data_len = other.data.largest_data_len;
+      data.largest_data_off = other.data.largest_data_off;
+      // other.data_bl is appended after our current data_bl further down,
+      // so its offset shifts by our current data_bl length.
+      data.largest_data_off_in_data_bl =
+        data_bl.length() + other.data.largest_data_off_in_data_bl;
+    }
+    data.fadvise_flags = data.fadvise_flags | other.data.fadvise_flags;
+
+    // Take over the other transaction's callbacks.
+    on_applied.splice(on_applied.end(), other.on_applied);
+    on_commit.splice(on_commit.end(), other.on_commit);
+    on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync);
+
+    // Build translation tables: other's collection/object index -> the
+    // index the same collection/object has (or now gets) in this one.
+    std::vector<uint32_t> cm(other.coll_index.size());
+    for (const auto& coll_entry : other.coll_index) {
+      cm[coll_entry.second] = _get_coll_id(coll_entry.first);
+    }
+
+    std::vector<uint32_t> om(other.object_index.size());
+    for (const auto& obj_entry : other.object_index) {
+      om[obj_entry.second] = _get_object_id(obj_entry.first);
+    }
+
+    // other.op_bl must not be modified by the append, so the index rewrite
+    // is done on a private flat copy of it.
+    ceph::buffer::list other_op_bl;
+    {
+      ceph::buffer::ptr flat_copy(other.op_bl.length());
+      other.op_bl.begin().copy(other.op_bl.length(), flat_copy.c_str());
+      other_op_bl.append(std::move(flat_copy));
+    }
+
+    // Every coll/object index inside the copied ops is replaced by the
+    // corresponding index of the combined transaction.
+    _update_op_bl(other_op_bl, cm, om);
+
+    // Finally attach the rewritten ops and the payload data.
+    op_bl.append(other_op_bl);
+    data_bl.append(other.data_bl);
+  }
+
+  /** Inquires about the Transaction as a whole. */
+
+  /// How big is the encoded Transaction buffer?
+  uint64_t get_encoded_bytes() {
+    // layout: data_bl + op_bl + coll_index + object_index + data
+
+    // Fixed-size part: the two u32 element counts of coll_index and
+    // object_index plus the transaction data struct itself.
+    size_t index_and_header = sizeof(__u32) * 2 + sizeof(data);
+
+    // One u32 id per entry of either index map.
+    index_and_header +=
+      (coll_index.size() + object_index.size()) * sizeof(__u32);
+
+    // The encoded keys of both index maps.
+    for (const auto& coll_entry : coll_index) {
+      index_and_header += coll_entry.first.encoded_size();
+    }
+    for (const auto& obj_entry : object_index) {
+      index_and_header += obj_entry.first.encoded_size();
+    }
+
+    return data_bl.length() + op_bl.length() + index_and_header;
+  }
+
+  /// Retain old version for regression testing purposes
+  uint64_t get_encoded_bytes_test() {
+    using ceph::encode;
+    // layout: data_bl + op_bl + coll_index + object_index + data
+    // The two index maps are actually encoded into a scratch buffer so
+    // that their encoded length can be read back from it.
+    ceph::buffer::list index_scratch;
+    encode(coll_index, index_scratch);
+    encode(object_index, index_scratch);
+
+    return data_bl.length() + op_bl.length() + index_scratch.length() +
+      sizeof(data);
+  }
+
+  /// Alias for get_encoded_bytes().
+  uint64_t get_num_bytes() {
+    const uint64_t encoded = get_encoded_bytes();
+    return encoded;
+  }
+  /// Size of largest data buffer to the "write" operation encountered so far
+  uint32_t get_data_length() {
+    const uint32_t largest = data.largest_data_len;
+    return largest;
+  }
+  /// offset within the encoded buffer to the start of the largest data buffer that's encoded
+  uint32_t get_data_offset() {
+    // A zero offset means no data buffer has been recorded.
+    if (!data.largest_data_off_in_data_bl) {
+      return 0;  // none
+    }
+    // Skip the bytes that precede data_bl's contents in the encoding:
+    // struct_v, compat_v, the struct length and the data_bl length.
+    return data.largest_data_off_in_data_bl +
+      sizeof(__u8) +      // encode struct_v
+      sizeof(__u8) +      // encode compat_v
+      sizeof(__u32) +     // encode len
+      sizeof(__u32);      // data_bl len
+  }
+  /// offset of buffer as aligned to destination within object.
+  int get_data_alignment() {
+    // With no data recorded there is nothing to align.
+    if (!data.largest_data_len) {
+      return 0;
+    }
+    // Negated data offset, reduced by masking with ~CEPH_PAGE_MASK.
+    return (0 - get_data_offset()) & ~CEPH_PAGE_MASK;
+  }
+  /// Is the Transaction empty (no operations)
+  bool empty() {
+    if (data.ops) {
+      return false;
+    }
+    return true;
+  }
+  /// Number of operations in the transaction
+  int get_num_ops() {
+    const int op_count = data.ops;
+    return op_count;
+  }
+
+  /**
+   * iterator
+   *
+   * Helper object to parse Transactions.
+   *
+   * ObjectStore instances use this object to step down the encoded
+   * buffer decoding operation codes and parameters as we go.
+   *
+   */
+  class iterator {
+    Transaction *t;              // transaction being walked
+
+    uint64_t ops;                // ops not yet returned by decode_op()
+    char* op_buffer_p;           // next Op record inside t->op_bl
+
+    ceph::buffer::list::const_iterator data_bl_p;  // read cursor in t->data_bl
+
+  public:
+    // Reverse lookup tables: transaction-local index -> collection/object.
+    std::vector<coll_t> colls;
+    std::vector<ghobject_t> objects;
+
+  private:
+    /// Only Transaction (a friend) may construct an iterator; see begin().
+    explicit iterator(Transaction *t)
+      : t(t),
+        ops(t->data.ops),
+        op_buffer_p(t->op_bl.c_str()),
+        data_bl_p(t->data_bl.cbegin()),
+        colls(t->coll_index.size()),
+        objects(t->object_index.size()) {
+      // Invert the key -> index maps into index -> key vectors.
+      for (const auto& coll_entry : t->coll_index) {
+        colls[coll_entry.second] = coll_entry.first;
+      }
+      for (const auto& obj_entry : t->object_index) {
+        objects[obj_entry.second] = obj_entry.first;
+      }
+    }
+
+    friend class Transaction;
+
+  public:
+
+    /// Is there at least one more op to decode?
+    bool have_op() {
+      return ops != 0;
+    }
+    /// Return the next Op record and advance past it.
+    Op* decode_op() {
+      ceph_assert(ops > 0);
+
+      Op* const current = reinterpret_cast<Op*>(op_buffer_p);
+      --ops;
+      op_buffer_p += sizeof(Op);
+
+      return current;
+    }
+    /// Decode the next string from the data buffer.
+    std::string decode_string() {
+      using ceph::decode;
+      std::string decoded;
+      decode(decoded, data_bl_p);
+      return decoded;
+    }
+    /// Decode the next buffer::ptr from the data buffer into `bp`.
+    void decode_bp(ceph::buffer::ptr& bp) {
+      using ceph::decode;
+      decode(bp, data_bl_p);
+    }
+    /// Decode the next buffer::list from the data buffer into `bl`.
+    void decode_bl(ceph::buffer::list& bl) {
+      using ceph::decode;
+      decode(bl, data_bl_p);
+    }
+    /// Decode the next attribute map (buffer::ptr values) into `aset`.
+    void decode_attrset(std::map<std::string,ceph::buffer::ptr>& aset) {
+      using ceph::decode;
+      decode(aset, data_bl_p);
+    }
+    /// Decode the next attribute map (buffer::list values) into `aset`.
+    void decode_attrset(std::map<std::string,ceph::buffer::list>& aset) {
+      using ceph::decode;
+      decode(aset, data_bl_p);
+    }
+    /// Hand the data cursor and `pbl` to decode_str_str_map_to_bl().
+    void decode_attrset_bl(ceph::buffer::list *pbl) {
+      decode_str_str_map_to_bl(data_bl_p, pbl);
+    }
+    /// Decode the next key set from the data buffer into `keys`.
+    void decode_keyset(std::set<std::string> &keys) {
+      using ceph::decode;
+      decode(keys, data_bl_p);
+    }
+    /// Hand the data cursor and `pbl` to decode_str_set_to_bl().
+    void decode_keyset_bl(ceph::buffer::list *pbl) {
+      decode_str_set_to_bl(data_bl_p, pbl);
+    }
+
+    /// Object stored under transaction-local index `oid_id` (range-checked).
+    const ghobject_t &get_oid(uint32_t oid_id) {
+      ceph_assert(oid_id < objects.size());
+      return objects[oid_id];
+    }
+    /// Collection stored under transaction-local index `cid_id` (range-checked).
+    const coll_t &get_cid(uint32_t cid_id) {
+      ceph_assert(cid_id < colls.size());
+      return colls[cid_id];
+    }
+    /// fadvise flags of the underlying transaction.
+    uint32_t get_fadvise_flags() const {
+      return t->get_fadvise_flags();
+    }
+
+    /// All objects referenced by the transaction, ordered by local index.
+    const std::vector<ghobject_t> &get_objects() const {
+      return objects;
+    }
+  };
+
+  /// Create an iterator over this transaction, starting at its first op.
+  iterator begin() {
+    iterator first(this);
+    return first;
+  }
+
+private:
+  void _build_actions_from_tbl();
+
+  /**
+   * Helper functions to encode the various mutation elements of a
+   * transaction.  These are 1:1 with the operation codes (see
+   * enumeration above).  These routines ensure that the
+   * encoder/creator of a transaction gets the right data in the
+   * right place. Sadly, there's no corresponding version nor any
+   * form of seat belts for the decoder.
+   */
+  Op* _get_next_op() {
+    constexpr auto op_size = sizeof(Op);
+    // Grow the append buffer by a whole batch of ops once the unused tail
+    // can no longer hold a single one.
+    const bool tail_too_small =
+      op_bl.get_append_buffer_unused_tail_length() < op_size;
+    if (tail_too_small) {
+      op_bl.reserve(op_size * OPS_PER_PTR);
+    }
+    // append_hole ensures bptr merging, so even a huge number of ops
+    // shouldn't overpopulate bl::_buffers.
+    char* const slot = op_bl.append_hole(op_size).c_str();
+    // Hand the caller a fully zeroed record.
+    memset(slot, 0, op_size);
+    return reinterpret_cast<Op*>(slot);
+  }
+  /// Transaction-local index of `coll`; a collection seen for the first
+  /// time is assigned the next free index (coll_id) and recorded.
+  uint32_t _get_coll_id(const coll_t& coll) {
+    const auto known = coll_index.find(coll);
+    if (known == coll_index.end()) {
+      const uint32_t fresh_id = coll_id++;
+      coll_index[coll] = fresh_id;
+      return fresh_id;
+    }
+    return known->second;
+  }
+  /// Transaction-local index of `oid`; an object seen for the first time
+  /// is assigned the next free index (object_id) and recorded.
+  uint32_t _get_object_id(const ghobject_t& oid) {
+    const auto known = object_index.find(oid);
+    if (known == object_index.end()) {
+      const uint32_t fresh_id = object_id++;
+      object_index[oid] = fresh_id;
+      return fresh_id;
+    }
+    return known->second;
+  }
+
+public:
+  /// noop. 'nuf said
+  void nop() {
+    // A zeroed op slot carrying only the NOP opcode.
+    Op* const op = _get_next_op();
+    op->op = OP_NOP;
+    data.ops = data.ops + 1;
+  }
+  /**
+   * create
+   *
+   * create an object that does not yet exist
+   * (behavior is undefined if the object already exists)
+   */
+  void create(const coll_t& cid, const ghobject_t& oid) {
+    // Take a zeroed op slot and fill in the opcode plus the
+    // transaction-local indices of the collection and the object.
+    Op* const op = _get_next_op();
+    op->op = OP_CREATE;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /**
+   * touch
+   *
+   * Ensure the existence of an object in a collection. Create an
+   * empty object if necessary
+   */
+  void touch(const coll_t& cid, const ghobject_t& oid) {
+    Op* const op = _get_next_op();
+    op->op = OP_TOUCH;
+    // Operands are stored as transaction-local indices.
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /**
+   * Write data to an offset within an object. If the object is too
+   * small, it is expanded as needed.  It is possible to specify an
+   * offset beyond the current end of an object and it will be
+   * expanded as needed. Simple implementations of ObjectStore will
+   * just zero the data between the old end of the object and the
+   * newly provided data. More sophisticated implementations of
+   * ObjectStore will omit the untouched data and store it as a
+   * "hole" in the file.
+   *
+   * Note that a 0-length write does not affect the size of the object.
+   */
+  void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len,
+             const ceph::buffer::list& write_data, uint32_t flags = 0) {
+    using ceph::encode;
+    // Remember where in data_bl this payload is going to start.
+    const uint32_t data_bl_len_before = data_bl.length();
+
+    Op* const op = _get_next_op();
+    op->op = OP_WRITE;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    op->off = off;
+    op->len = len;
+    encode(write_data, data_bl);
+
+    // The declared length has to match the payload.
+    ceph_assert(len == write_data.length());
+    data.fadvise_flags = data.fadvise_flags | flags;
+
+    // Track the largest payload seen so far and where it lives.
+    if (write_data.length() > data.largest_data_len) {
+      data.largest_data_len = write_data.length();
+      data.largest_data_off = off;
+      // NOTE(review): the extra u32 presumably skips the length prefix
+      // that encode() wrote ahead of the payload — confirm.
+      data.largest_data_off_in_data_bl = data_bl_len_before + sizeof(__u32);
+    }
+    data.ops = data.ops + 1;
+  }
+  /**
+   * zero out the indicated byte range within an object. Some
+   * ObjectStore instances may optimize this to release the
+   * underlying storage space.
+   *
+   * If the zero range extends beyond the end of the object, the object
+   * size is extended, just as if we were writing a buffer full of zeros.
+   * EXCEPT if the length is 0, in which case (just like a 0-length write)
+   * we do not adjust the object size.
+   */
+  void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
+    Op* const op = _get_next_op();
+    op->op = OP_ZERO;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // byte range to zero
+    op->off = off;
+    op->len = len;
+    data.ops = data.ops + 1;
+  }
+  /// Discard all data in the object beyond the specified size.
+  void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) {
+    Op* const op = _get_next_op();
+    op->op = OP_TRUNCATE;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // `off` is the size the object is cut down to.
+    op->off = off;
+    data.ops = data.ops + 1;
+  }
+  /// Remove an object. All four parts of the object are removed.
+  void remove(const coll_t& cid, const ghobject_t& oid) {
+    Op* const op = _get_next_op();
+    op->op = OP_REMOVE;
+    // Operands are stored as transaction-local indices.
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /// Set an xattr of an object
+  void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, ceph::buffer::list& val) {
+    // Convert the C string once and forward to the std::string overload.
+    const std::string attr_name(name);
+    setattr(cid, oid, attr_name, val);
+  }
+  /// Set an xattr of an object
+  void setattr(const coll_t& cid, const ghobject_t& oid, const std::string& s, ceph::buffer::list& val) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_SETATTR;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // Payload in data_bl: attribute name first, then its value.
+    encode(s, data_bl);
+    encode(val, data_bl);
+    data.ops = data.ops + 1;
+  }
+  /// Set multiple xattrs of an object
+  void setattrs(const coll_t& cid,
+                const ghobject_t& oid,
+                const std::map<std::string,ceph::buffer::ptr,std::less<>>& attrset) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_SETATTRS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The whole name -> value map is the op's payload.
+    encode(attrset, data_bl);
+    data.ops = data.ops + 1;
+  }
+  /// Set multiple xattrs of an object
+  void setattrs(const coll_t& cid,
+                const ghobject_t& oid,
+                const std::map<std::string,ceph::buffer::list,std::less<>>& attrset) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_SETATTRS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The whole name -> value map is the op's payload.
+    encode(attrset, data_bl);
+    data.ops = data.ops + 1;
+  }
+  /// remove an xattr from an object
+  void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) {
+    // Convert the C string once and forward to the std::string overload.
+    const std::string attr_name(name);
+    rmattr(cid, oid, attr_name);
+  }
+  /// remove an xattr from an object
+  void rmattr(const coll_t& cid, const ghobject_t& oid, const std::string& s) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_RMATTR;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // Payload: the name of the attribute to drop.
+    encode(s, data_bl);
+    data.ops = data.ops + 1;
+  }
+  /// remove all xattrs from an object
+  void rmattrs(const coll_t& cid, const ghobject_t& oid) {
+    Op* const op = _get_next_op();
+    op->op = OP_RMATTRS;
+    // No payload: only the collection and object indices are recorded.
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /**
+   * Clone an object into another object.
+   *
+   * Low-cost (e.g., O(1)) cloning (if supported) is best, but
+   * fallback to an O(n) copy is allowed.  All four parts of the
+   * object are cloned (data, xattrs, omap header, omap
+   * entries).
+   *
+   * The destination named object may already exist, in
+   * which case its previous contents are discarded.
+   */
+  void clone(const coll_t& cid, const ghobject_t& oid,
+             const ghobject_t& noid) {
+    Op* const op = _get_next_op();
+    op->op = OP_CLONE;
+    op->cid = _get_coll_id(cid);
+    // Source object first, then the clone target.
+    op->oid = _get_object_id(oid);
+    op->dest_oid = _get_object_id(noid);
+    data.ops = data.ops + 1;
+  }
+  /**
+   * Clone a byte range from one object to another.
+   *
+   * The data portion of the destination object receives a copy of a
+   * portion of the data from the source object. None of the other
+   * three parts of an object is copied from the source.
+   *
+   * The destination object size may be extended to the dstoff + len.
+   *
+   * The source range *must* overlap with the source object data. If it does
+   * not the result is undefined.
+   */
+  void clone_range(const coll_t& cid, const ghobject_t& oid,
+                   const ghobject_t& noid,
+                   uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
+    Op* const op = _get_next_op();
+    op->op = OP_CLONERANGE2;
+    op->cid = _get_coll_id(cid);
+    // Source object first, then the destination object.
+    op->oid = _get_object_id(oid);
+    op->dest_oid = _get_object_id(noid);
+    // Source range and the offset it lands at in the destination.
+    op->off = srcoff;
+    op->len = srclen;
+    op->dest_off = dstoff;
+    data.ops = data.ops + 1;
+  }
+
+  /// Create the collection
+  void create_collection(const coll_t& cid, int bits) {
+    Op* const op = _get_next_op();
+    op->op = OP_MKCOLL;
+    op->cid = _get_coll_id(cid);
+    // `bits` is carried in the op's split_bits field.
+    op->split_bits = bits;
+    data.ops = data.ops + 1;
+  }
+
+  /**
+   * Give the collection a hint.
+   *
+   * @param cid  - collection id.
+   * @param type - hint type.
+   * @param hint - the hint payload, which contains the customized
+   *               data along with the hint type.
+   */
+  void collection_hint(const coll_t& cid, uint32_t type, const ceph::buffer::list& hint) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_COLL_HINT;
+    op->cid = _get_coll_id(cid);
+    // The hint type travels in the op, the hint payload in data_bl.
+    op->hint = type;
+    encode(hint, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// remove the collection, the collection must be empty
+  void remove_collection(const coll_t& cid) {
+    Op* const op = _get_next_op();
+    op->op = OP_RMCOLL;
+    // Only the collection index is recorded.
+    op->cid = _get_coll_id(cid);
+    data.ops = data.ops + 1;
+  }
+  /// Move `oid` from `oldcid` into `cid`. Deprecated.
+  void collection_move(const coll_t& cid, const coll_t &oldcid, const ghobject_t& oid)
+    __attribute__ ((deprecated)) {
+    // NOTE: this is encoded as a fixed combo of ADD + REMOVE.  The two
+    // ops always appear together, so this is effectively a single MOVE.
+
+    // First op: add the object to the destination collection.
+    Op* add_op = _get_next_op();
+    add_op->op = OP_COLL_ADD;
+    add_op->cid = _get_coll_id(oldcid);
+    add_op->oid = _get_object_id(oid);
+    add_op->dest_cid = _get_coll_id(cid);
+    data.ops = data.ops + 1;
+
+    // Second op: remove it from the source collection.
+    Op* remove_op = _get_next_op();
+    remove_op->op = OP_COLL_REMOVE;
+    remove_op->cid = _get_coll_id(oldcid);
+    remove_op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /// Record an OP_COLL_MOVE_RENAME taking `oldoid` in `oldcid` to `oid`
+  /// in `cid`.
+  void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+                              const coll_t &cid, const ghobject_t& oid) {
+    Op* const op = _get_next_op();
+    op->op = OP_COLL_MOVE_RENAME;
+    // source
+    op->cid = _get_coll_id(oldcid);
+    op->oid = _get_object_id(oldoid);
+    // destination
+    op->dest_cid = _get_coll_id(cid);
+    op->dest_oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /// Record an OP_TRY_RENAME of `oldoid` to `oid` within collection `cid`.
+  void try_rename(const coll_t &cid, const ghobject_t& oldoid,
+                  const ghobject_t& oid) {
+    Op* const op = _get_next_op();
+    op->op = OP_TRY_RENAME;
+    op->cid = _get_coll_id(cid);
+    // old name first, then the new one
+    op->oid = _get_object_id(oldoid);
+    op->dest_oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove omap from oid
+  void omap_clear(
+    const coll_t &cid,           ///< [in] Collection containing oid
+    const ghobject_t &oid  ///< [in] Object from which to remove omap
+    ) {
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_CLEAR;
+    // No payload: only the collection and object indices are recorded.
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    data.ops = data.ops + 1;
+  }
+  /// Set keys on oid omap.  Replaces duplicate keys.
+  void omap_setkeys(
+    const coll_t& cid,                           ///< [in] Collection containing oid
+    const ghobject_t &oid,                ///< [in] Object to update
+    const std::map<std::string, ceph::buffer::list> &attrset ///< [in] Replacement keys and values
+    ) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_SETKEYS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The key/value map is encoded into data_bl as the op's payload.
+    encode(attrset, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Set keys on an oid omap (ceph::buffer::list variant).
+  void omap_setkeys(
+    const coll_t &cid,                           ///< [in] Collection containing oid
+    const ghobject_t &oid,                ///< [in] Object to update
+    const ceph::buffer::list &attrset_bl          ///< [in] Replacement keys and values
+    ) {
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_SETKEYS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // attrset_bl is appended to data_bl as-is, without an encode() call.
+    data_bl.append(attrset_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove keys from oid omap
+  void omap_rmkeys(
+    const coll_t &cid,             ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object from which to remove the omap
+    const std::set<std::string> &keys ///< [in] Keys to clear
+    ) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_RMKEYS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The key set is encoded into data_bl as the op's payload.
+    encode(keys, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove key from oid omap
+  void omap_rmkey(
+    const coll_t &cid,             ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object from which to remove the omap
+    const std::string& key ///< [in] Keys to clear
+    ) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_RMKEYS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // Payload: a u32 element count of one, followed by the single key.
+    const uint32_t key_count = 1;
+    encode(key_count, data_bl);
+    encode(key, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove keys from oid omap
+  void omap_rmkeys(
+    const coll_t &cid,             ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object from which to remove the omap
+    const ceph::buffer::list &keys_bl ///< [in] Keys to clear
+    ) {
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_RMKEYS;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // keys_bl is appended to data_bl as-is, without an encode() call.
+    data_bl.append(keys_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove key range from oid omap
+  void omap_rmkeyrange(
+    const coll_t &cid,             ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object from which to remove the omap keys
+    const std::string& first,    ///< [in] first key in range
+    const std::string& last      ///< [in] first key past range, range is [first,last)
+    ) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_RMKEYRANGE;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // Payload: lower bound first, then the exclusive upper bound.
+    encode(first, data_bl);
+    encode(last, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Remove key range from oid omap
+  void omap_rmkeyrange(
+    const coll_t &cid,      ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object from which to remove the omap keys
+    const ceph::buffer::list &keys_bl ///< [in] range of keys to clear
+    ) {
+    // cid is taken by const reference (it used to be copied by value) and
+    // the buffer type is spelled like in the sibling overloads.
+    Op* _op = _get_next_op();
+    _op->op = OP_OMAP_RMKEYRANGE;
+    _op->cid = _get_coll_id(cid);
+    _op->oid = _get_object_id(oid);
+    // keys_bl is appended to data_bl as-is, without an encode() call.
+    data_bl.append(keys_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Set omap header
+  void omap_setheader(
+    const coll_t &cid,             ///< [in] Collection containing oid
+    const ghobject_t &oid,  ///< [in] Object
+    const ceph::buffer::list &bl    ///< [in] Header value
+    ) {
+    using ceph::encode;
+    Op* const op = _get_next_op();
+    op->op = OP_OMAP_SETHEADER;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The header value is encoded into data_bl as the op's payload.
+    encode(bl, data_bl);
+    data.ops = data.ops + 1;
+  }
+
+  /// Split collection based on given prefixes, objects matching the specified bits/rem are
+  /// moved to the new collection
+  void split_collection(
+    const coll_t &cid,
+    uint32_t bits,
+    uint32_t rem,
+    const coll_t &destination) {
+    Op* const op = _get_next_op();
+    op->op = OP_SPLIT_COLLECTION2;
+    // source collection first, then the collection receiving the objects
+    op->cid = _get_coll_id(cid);
+    op->dest_cid = _get_coll_id(destination);
+    // bits/rem select which objects are moved
+    op->split_bits = bits;
+    op->split_rem = rem;
+    data.ops = data.ops + 1;
+  }
+
+  /// Merge collection into another.
+  void merge_collection(
+    const coll_t &cid,          ///< [in] Collection to merge
+    const coll_t &destination,  ///< [in] Collection it is merged into
+    uint32_t bits) {
+    // Both collections are taken by const reference; they are only used
+    // for the index lookups below, so the by-value copies were unnecessary.
+    Op* _op = _get_next_op();
+    _op->op = OP_MERGE_COLLECTION;
+    _op->cid = _get_coll_id(cid);
+    _op->dest_cid = _get_coll_id(destination);
+    // `bits` is carried in the op's split_bits field.
+    _op->split_bits = bits;
+    data.ops = data.ops + 1;
+  }
+
+  /// Record an OP_COLL_SET_BITS for `cid`, carrying `bits` in split_bits.
+  void collection_set_bits(
+    const coll_t &cid,
+    int bits) {
+    Op* const op = _get_next_op();
+    op->op = OP_COLL_SET_BITS;
+    op->cid = _get_coll_id(cid);
+    op->split_bits = bits;
+    data.ops = data.ops + 1;
+  }
+
+  /// Set allocation hint for an object
+  /// make 0 values(expected_object_size, expected_write_size) noops for all implementations
+  void set_alloc_hint(
+    const coll_t &cid,
+    const ghobject_t &oid,
+    uint64_t expected_object_size,
+    uint64_t expected_write_size,
+    uint32_t flags
+  ) {
+    Op* const op = _get_next_op();
+    op->op = OP_SETALLOCHINT;
+    op->cid = _get_coll_id(cid);
+    op->oid = _get_object_id(oid);
+    // The two expected sizes are stored verbatim; `flags` goes into the
+    // op's hint field.
+    op->expected_object_size = expected_object_size;
+    op->expected_write_size = expected_write_size;
+    op->hint = flags;
+    data.ops = data.ops + 1;
+  }
+
+  void encode(ceph::buffer::list& bl) const {  // serialize this Transaction into bl
+    // layout: data_bl + op_bl + coll_index + object_index + data (decode() reads the fields back in this same order)
+    ENCODE_START(9, 9, bl);  // NOTE(review): arguments look like struct version 9 / compat version 9 — confirm against the macro
+    encode(data_bl, bl);
+    encode(op_bl, bl);
+    encode(coll_index, bl);
+    encode(object_index, bl);
+    data.encode(bl);  // the transaction data struct encodes itself
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator &bl) {  // rebuild this Transaction from the encoding at bl
+    DECODE_START(9, bl);  // NOTE(review): 9 presumably is the newest struct version understood — confirm against the macro
+    DECODE_OLDEST(9);  // NOTE(review): presumably rejects encodings older than version 9 — confirm against the macro
+
+    decode(data_bl, bl);  // fields are read in the same order encode() writes them
+    decode(op_bl, bl);
+    decode(coll_index, bl);
+    decode(object_index, bl);
+    data.decode(bl);
+    coll_id = coll_index.size();  // _get_coll_id() hands out coll_id as the next new index, so resume after the decoded entries
+    object_id = object_index.size();  // likewise for _get_object_id()
+
+    DECODE_FINISH(bl);
+  }
+
+  void dump(ceph::Formatter *f);
+  static void generate_test_instances(std::list<Transaction*>& o);
+};
+WRITE_CLASS_ENCODER(Transaction)  // NOTE(review): presumably generates free encode()/decode() wrappers around the member functions — confirm against the macro definition
+WRITE_CLASS_ENCODER(Transaction::TransactionData)  // same, for the nested TransactionData type
+
+std::ostream& operator<<(std::ostream& out, const Transaction& tx);
+
+}
diff --git a/src/os/bluestore/Allocator.cc b/src/os/bluestore/Allocator.cc
new file mode 100644
index 000000000..3acdeacdf
--- /dev/null
+++ b/src/os/bluestore/Allocator.cc
@@ -0,0 +1,224 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Allocator.h"
+#include <bit>
+#include "StupidAllocator.h"
+#include "BitmapAllocator.h"
+#include "AvlAllocator.h"
+#include "BtreeAllocator.h"
+#include "HybridAllocator.h"
+#ifdef HAVE_LIBZBD
+#include "ZonedAllocator.h"
+#endif
+#include "common/debug.h"
+#include "common/admin_socket.h"
+#define dout_subsys ceph_subsys_bluestore
+
+using std::string;
+using std::to_string;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+/**
+ * Admin-socket hook exposing three per-allocator commands:
+ *   "bluestore allocator dump <name>"          - capacity, unit, type and all free extents
+ *   "bluestore allocator score <name>"         - Allocator::get_fragmentation_score()
+ *   "bluestore allocator fragmentation <name>" - Allocator::get_fragmentation()
+ *
+ * If the allocator was given an empty name, the hook's own address is used
+ * as the name. If registering the first command fails the hook disables
+ * itself (alloc is reset to nullptr) and registers nothing else.
+ */
+class Allocator::SocketHook : public AdminSocketHook {
+  Allocator *alloc;   ///< allocator served by this hook; nullptr when disabled
+
+  friend class Allocator;
+  std::string name;   ///< suffix appended to every command string
+public:
+  SocketHook(Allocator *alloc, std::string_view _name) :
+    alloc(alloc), name(_name)
+  {
+    AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+    if (name.empty()) {
+      name = to_string((uintptr_t)this);
+    }
+    if (admin_socket) {
+      int r = admin_socket->register_command(
+	("bluestore allocator dump " + name).c_str(),
+	this,
+	"dump allocator free regions");
+      if (r != 0)
+        alloc = nullptr; //some collision, disable
+      if (alloc) {
+        // The first registration succeeded, so the remaining two are
+        // expected to succeed as well.
+        r = admin_socket->register_command(
+	  ("bluestore allocator score " + name).c_str(),
+	  this,
+	  "give score on allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+        ceph_assert(r == 0);
+        r = admin_socket->register_command(
+          ("bluestore allocator fragmentation " + name).c_str(),
+          this,
+          "give allocator fragmentation (0-no fragmentation, 1-absolute fragmentation)");
+        ceph_assert(r == 0);
+      }
+    }
+  }
+  ~SocketHook()
+  {
+    // Only unregister when the commands were registered, i.e. the hook was
+    // not disabled in the constructor.
+    AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+    if (admin_socket && alloc) {
+      admin_socket->unregister_commands(this);
+    }
+  }
+
+  /// Dispatch one of the registered commands and write the result through f.
+  /// Returns 0 on success and -ENOSYS (with a message in ss) for any other
+  /// command string.
+  int call(std::string_view command,
+	   const cmdmap_t& cmdmap,
+	   const bufferlist&,
+	   Formatter *f,
+	   std::ostream& ss,
+	   bufferlist& out) override {
+    int r = 0;
+    if (command == "bluestore allocator dump " + name) {
+      f->open_object_section("allocator_dump");
+      f->dump_unsigned("capacity", alloc->get_capacity());
+      f->dump_unsigned("alloc_unit", alloc->get_block_size());
+      f->dump_string("alloc_type", alloc->get_type());
+      f->dump_string("alloc_name", name);
+
+      f->open_array_section("extents");
+      // Take the extent as uint64_t (what Allocator::foreach delivers) and
+      // format through unsigned long long, so that values are not truncated
+      // on platforms where size_t is narrower than 64 bits.
+      auto iterated_allocation = [&](uint64_t off, uint64_t len) {
+        ceph_assert(len > 0);
+        f->open_object_section("free");
+        char off_hex[30];
+        char len_hex[30];
+        // snprintf always NUL-terminates within the given size.
+        snprintf(off_hex, sizeof(off_hex), "0x%llx",
+                 static_cast<unsigned long long>(off));
+        snprintf(len_hex, sizeof(len_hex), "0x%llx",
+                 static_cast<unsigned long long>(len));
+        f->dump_string("offset", off_hex);
+        f->dump_string("length", len_hex);
+        f->close_section();
+      };
+      alloc->foreach(iterated_allocation);
+      f->close_section();
+      f->close_section();
+    } else if (command == "bluestore allocator score " + name) {
+      f->open_object_section("fragmentation_score");
+      f->dump_float("fragmentation_rating", alloc->get_fragmentation_score());
+      f->close_section();
+    } else if (command == "bluestore allocator fragmentation " + name) {
+      f->open_object_section("fragmentation");
+      f->dump_float("fragmentation_rating", alloc->get_fragmentation());
+      f->close_section();
+    } else {
+      ss << "Invalid command" << std::endl;
+      r = -ENOSYS;
+    }
+    return r;
+  }
+
+};
+Allocator::Allocator(std::string_view name,
+                     int64_t _capacity,
+                     int64_t _block_size)
+ : device_size(_capacity),
+   block_size(_block_size)
+{
+  asok_hook = new SocketHook(this, name);
+}
+
+
+// Destroys the admin-socket hook created by the constructor; the hook's own
+// destructor takes care of unregistering its commands.
+Allocator::~Allocator()
+{
+  delete asok_hook;
+}
+
+// Returns the name stored in the admin-socket hook. This is the name passed
+// to the constructor, or the hook's address rendered as a decimal string
+// when that name was empty.
+const string& Allocator::get_name() const {
+  return asok_hook->name;
+}
+
+/**
+ * Factory: build the allocator implementation selected by `type`.
+ *
+ * Recognized types are "stupid", "bitmap", "avl", "btree", "hybrid" and,
+ * when built with HAVE_LIBZBD, "zoned". zone_size and first_sequential_zone
+ * are only consumed by the zoned allocator.
+ *
+ * @return a newly allocated allocator owned by the caller, or nullptr
+ *         (after logging an error) when `type` is not recognized.
+ */
+Allocator *Allocator::create(
+  CephContext* cct,
+  std::string_view type,
+  int64_t size,
+  int64_t block_size,
+  int64_t zone_size,
+  int64_t first_sequential_zone,
+  std::string_view name)
+{
+  // Every branch assigns to `alloc` so that the function has a single exit
+  // and the unknown-type check below is the only place returning nullptr.
+  Allocator* alloc = nullptr;
+  if (type == "stupid") {
+    alloc = new StupidAllocator(cct, size, block_size, name);
+  } else if (type == "bitmap") {
+    alloc = new BitmapAllocator(cct, size, block_size, name);
+  } else if (type == "avl") {
+    alloc = new AvlAllocator(cct, size, block_size, name);
+  } else if (type == "btree") {
+    alloc = new BtreeAllocator(cct, size, block_size, name);
+  } else if (type == "hybrid") {
+    alloc = new HybridAllocator(cct, size, block_size,
+      cct->_conf.get_val<uint64_t>("bluestore_hybrid_alloc_mem_cap"),
+      name);
+#ifdef HAVE_LIBZBD
+  } else if (type == "zoned") {
+    alloc = new ZonedAllocator(cct, size, block_size, zone_size,
+                               first_sequential_zone, name);
+#endif
+  }
+  if (alloc == nullptr) {
+    lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
+	     << type << dendl;
+  }
+  return alloc;
+}
+
+// Convenience overload: folds the extent vector into an interval_set and
+// forwards it to the virtual bulk release().
+void Allocator::release(const PExtentVector& release_vec)
+{
+  interval_set<uint64_t> to_release;
+  for (const auto& extent : release_vec) {
+    to_release.insert(extent.offset, extent.length);
+  }
+  release(to_release);
+}
+
+/**
+ * Gives fragmentation a numeric value.
+ *
+ * The algorithm assigns a value to each free (unallocated) extent.
+ * The value of a single extent is its size multiplied by a per-byte value.
+ * The per-byte value is greater for larger extents:
+ * if an extent of size X has per-byte value p, then an extent of size 2*X
+ * has per-byte value 1.1*p.
+ *
+ * This could be expressed in logarithms, but for speed it is interpolated
+ * linearly inside power-of-two ranges.
+ * [1]    [2..3] [4..7] [8..15] ...
+ * ^      ^      ^      ^
+ * 1.1^0  1.1^1  1.1^2  1.1^3 ...   (per-byte value at the start of the range)
+ *
+ * The final score is the position of the actual sum of extent values between
+ * the value of the same free space in one single extent ("ideal") and the
+ * value of the same free space split into 1-byte extents ("terrible").
+ *
+ * @return 0 for no fragmentation up to 1 for absolute fragmentation; 0 is
+ *         also returned when there is no free space, or so little (a single
+ *         byte) that the ideal and terrible layouts coincide.
+ */
+double Allocator::get_fragmentation_score()
+{
+  // how much more 2X bytes in one chunk are worth than X + X bytes
+  static const double double_size_worth = 1.1 ;
+  std::vector<double> scales{1};
+  double score_sum = 0;
+  size_t sum = 0;
+
+  // Value of one free extent of v bytes; v must be non-zero.
+  auto get_score = [&](size_t v) -> double {
+    size_t sc = sizeof(v) * 8 - std::countl_zero(v) - 1; //assign to grade depending on log2(len)
+    while (scales.size() <= sc + 1) {
+      //unlikely expand scales vector
+      scales.push_back(scales[scales.size() - 1] * double_size_worth);
+    }
+
+    size_t sc_shifted = size_t(1) << sc;
+    double x = double(v - sc_shifted) / sc_shifted; //x is <0,1) in its scale grade
+    // linear interpolation between the two ends of its scale grade
+    double score = (sc_shifted    ) * scales[sc]   * (1-x) +
+                   (sc_shifted * 2) * scales[sc+1] * x;
+    return score;
+  };
+
+  auto iterated_allocation = [&](size_t off, size_t len) {
+    ceph_assert(len > 0);
+    score_sum += get_score(len);
+    sum += len;
+  };
+  foreach(iterated_allocation);
+
+  if (sum == 0) {
+    // No free space: nothing can be fragmented, and get_score(0) would
+    // shift by an out-of-range amount.
+    return 0;
+  }
+
+  double ideal = get_score(sum);
+  double terrible = sum * get_score(1);
+  if (ideal == terrible) {
+    // Happens when a single byte is free: every layout scores the same, so
+    // avoid the 0/0 division and report no fragmentation.
+    return 0;
+  }
+  return (ideal - score_sum) / (ideal - terrible);
+}
diff --git a/src/os/bluestore/Allocator.h b/src/os/bluestore/Allocator.h
new file mode 100644
index 000000000..e378007c3
--- /dev/null
+++ b/src/os/bluestore/Allocator.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_OS_BLUESTORE_ALLOCATOR_H
+#define CEPH_OS_BLUESTORE_ALLOCATOR_H
+
+#include <functional>
+#include <ostream>
+#include "include/ceph_assert.h"
+#include "bluestore_types.h"
+
+/**
+ * Abstract base class of the BlueStore space allocators.
+ *
+ * Stores the device capacity and block size and owns an admin-socket hook
+ * (SocketHook) that exposes dump/score/fragmentation commands for the
+ * instance. Concrete allocators implement the pure virtual methods.
+ */
+class Allocator {
+public:
+  /// @param name        name used by the admin-socket commands
+  /// @param _capacity   device size, reported by get_capacity()
+  /// @param _block_size block size, reported by get_block_size()
+  Allocator(std::string_view name,
+	    int64_t _capacity,
+	    int64_t _block_size);
+  virtual ~Allocator();
+
+  /*
+  * returns allocator type name as per names in config
+  */
+  virtual const char* get_type() const = 0;
+
+  /*
+   * Allocate required number of blocks in n number of extents.
+   * Min and Max number of extents are limited by:
+   * a. alloc unit
+   * b. max_alloc_size.
+   * as no extent can be lesser than block_size and greater than max_alloc size.
+   * Apart from that extents can vary between these lower and higher limits according
+   * to free block search algorithm and availability of contiguous space.
+   */
+  virtual int64_t allocate(uint64_t want_size, uint64_t block_size,
+			   uint64_t max_alloc_size, int64_t hint,
+			   PExtentVector *extents) = 0;
+
+  /// Convenience overload: allocate with max_alloc_size equal to want_size.
+  int64_t allocate(uint64_t want_size, uint64_t block_size,
+		   int64_t hint, PExtentVector *extents) {
+    return allocate(want_size, block_size, want_size, hint, extents);
+  }
+
+  /* Bulk release. Implementations may override this method to handle the whole
+   * set at once. This could save e.g. unnecessary mutex dance. */
+  virtual void release(const interval_set<uint64_t>& release_set) = 0;
+  /// Converts the extent vector to an interval_set and calls the bulk release.
+  void release(const PExtentVector& release_set);
+
+  virtual void dump() = 0;
+  /// Invoke notify(offset, length) for free extents; used by the
+  /// admin-socket dump and by get_fragmentation_score().
+  virtual void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) = 0;
+
+  virtual void init_add_free(uint64_t offset, uint64_t length) = 0;
+  virtual void init_rm_free(uint64_t offset, uint64_t length) = 0;
+
+  virtual uint64_t get_free() = 0;
+  /// Default implementation reports no fragmentation.
+  virtual double get_fragmentation()
+  {
+    return 0.0;
+  }
+  /// Score computed from the free extents reported by foreach().
+  virtual double get_fragmentation_score();
+  virtual void shutdown() = 0;
+
+  /// Factory for the allocator implementation named by `type`.
+  /// Returns nullptr when the type is not recognized.
+  static Allocator *create(
+    CephContext* cct,
+    std::string_view type,
+    int64_t size,
+    int64_t block_size,
+    int64_t zone_size = 0,
+    int64_t first_sequential_zone = 0,
+    const std::string_view name = ""
+    );
+
+
+  /// Name under which the admin-socket commands are registered.
+  const std::string& get_name() const;
+  int64_t get_capacity() const
+  {
+    return device_size;
+  }
+  int64_t get_block_size() const
+  {
+    return block_size;
+  }
+
+private:
+  class SocketHook;
+  SocketHook* asok_hook = nullptr;  ///< created in the ctor, deleted in the dtor
+protected:
+  const int64_t device_size = 0;
+  const int64_t block_size = 0;
+};
+
+#endif
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc
new file mode 100644
index 000000000..afa541862
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "AvlAllocator.h"
+
+#include <bit>
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef  dout_prefix
+#define dout_prefix *_dout << "AvlAllocator "
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);
+
+namespace {
+  // a light-weight "range_seg_t", which only used as the key when searching in
+  // range_tree and range_size_tree
+  // It only needs the `start` and `end` members because the tree comparators
+  // (range_seg_t::before_t / shorter_t) are templates that read just those.
+  struct range_t {
+    uint64_t start;
+    uint64_t end;
+  };
+}
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ *
+ * First-fit search over range_tree (ordered by offset): starts at the first
+ * segment that ends after *cursor, walks to the end of the tree, then wraps
+ * around and walks from the beginning up to the starting segment.
+ *
+ * cursor - in/out search position; set to (segment start + size) for every
+ *          segment examined, including the one finally returned
+ * size   - number of bytes that must fit into the segment
+ * align  - not used by this implementation
+ *
+ * Returns the start offset of the first segment at least `size` long, or
+ * -1ULL when none is found or when a search limit (max_search_count /
+ * max_search_bytes, each disabled when 0) is exceeded.
+ */
+uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
+					 uint64_t size,
+					 uint64_t align)
+{
+  const auto compare = range_tree.key_comp();
+  uint32_t search_count = 0;
+  uint64_t search_bytes = 0;
+  // With the before_t comparator only the key's start (*cursor) takes part
+  // in this lookup; the second field of the key is not read here.
+  auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
+  for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
+    uint64_t offset = rs->start;
+    *cursor = offset + size;
+    if (offset + size <= rs->end) {
+      return offset;
+    }
+    if (max_search_count > 0 && ++search_count > max_search_count) {
+      return -1ULL;
+    }
+    // distance covered so far, measured from the first examined segment
+    if (search_bytes = rs->start - rs_start->start;
+	max_search_bytes > 0 && search_bytes > max_search_bytes) {
+      return -1ULL;
+    }
+  }
+
+  if (*cursor == 0) {
+    // If we already started from beginning, don't bother with searching from beginning
+    return -1ULL;
+  }
+  // If we reached end, start from beginning till cursor.
+  // search_count and search_bytes carry over from the first pass, so the
+  // limits apply to both passes together.
+  for (auto rs = range_tree.begin(); rs != rs_start; ++rs) {
+    uint64_t offset = rs->start;
+    *cursor = offset + size;
+    if (offset + size <= rs->end) {
+      return offset;
+    }
+    if (max_search_count > 0 && ++search_count > max_search_count) {
+      return -1ULL;
+    }
+    if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) {
+      return -1ULL;
+    }
+  }
+  return -1ULL;
+}
+
+// Best-fit lookup in the size-ordered tree: returns the start offset of the
+// smallest free segment able to hold `size` bytes, or -1ULL if there is
+// none. `align` is not used.
+uint64_t AvlAllocator::_pick_block_fits(uint64_t size,
+					uint64_t align)
+{
+  // A key of length `size` positions the iterator on the first segment that
+  // is not shorter than the request.
+  auto candidate = range_size_tree.lower_bound(range_t{0, size},
+                                               range_size_tree.key_comp());
+  while (candidate != range_size_tree.end()) {
+    const uint64_t begin = candidate->start;
+    if (begin + size <= candidate->end) {
+      return begin;
+    }
+    ++candidate;
+  }
+  return -1ULL;
+}
+
+// Marks [start, start + size) as free, merging it with the free segment
+// that ends exactly at `start` and/or the one that begins exactly at the new
+// end, so that adjacent free space is kept as a single segment.
+// A segment whose bounds change is first taken out of range_size_tree and
+// re-inserted afterwards, because that tree is keyed by segment length.
+void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+  ceph_assert(size != 0);
+
+  uint64_t end = start + size;
+
+  // first segment lying entirely after the new range (end <= its start)
+  auto rs_after = range_tree.upper_bound(range_t{start, end},
+					 range_tree.key_comp());
+
+  /* Make sure we don't overlap with either of our neighbors */
+  // rs_before stays at end() when there is no segment preceding rs_after
+  auto rs_before = range_tree.end();
+  if (rs_after != range_tree.begin()) {
+    rs_before = std::prev(rs_after);
+  }
+
+  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
+  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
+
+  if (merge_before && merge_after) {
+    // the new range bridges both neighbors: extend rs_after backwards to
+    // rs_before's start and drop rs_before
+    _range_size_tree_rm(*rs_before);
+    _range_size_tree_rm(*rs_after);
+    rs_after->start = rs_before->start;
+    range_tree.erase_and_dispose(rs_before, dispose_rs{});
+    _range_size_tree_try_insert(*rs_after);
+  } else if (merge_before) {
+    _range_size_tree_rm(*rs_before);
+    rs_before->end = end;
+    _range_size_tree_try_insert(*rs_before);
+  } else if (merge_after) {
+    _range_size_tree_rm(*rs_after);
+    rs_after->start = start;
+    _range_size_tree_try_insert(*rs_after);
+  } else {
+    // no adjacent neighbor: insert a new segment in front of rs_after
+    _try_insert_range(start, end, &rs_after);
+  }
+}
+
+// Removes [start, end) from the free segment `rs`.
+// Depending on whether free space remains on the left and/or right of the
+// removed part, the segment is split in two, trimmed on one side, or erased
+// entirely. The caller must make sure [start, end) lies within *rs; this
+// function does not check it.
+void AvlAllocator::_process_range_removal(uint64_t start, uint64_t end,
+  AvlAllocator::range_tree_t::iterator& rs)
+{
+  bool left_over = (rs->start != start);
+  bool right_over = (rs->end != end);
+
+  // the segment's length is about to change (or it goes away), so take it
+  // out of the size-ordered tree first
+  _range_size_tree_rm(*rs);
+
+  if (left_over && right_over) {
+    auto old_right_end = rs->end;
+    auto insert_pos = rs;
+    ceph_assert(insert_pos != range_tree.end());
+    ++insert_pos;
+    rs->end = start;
+
+    // Insert tail first to be sure insert_pos hasn't been disposed.
+    // This wouldn't dispose rs though since it's out of range_size_tree.
+    // Don't care about a small chance of 'not-the-best-choice-for-removal' case
+    // which might happen if rs has the lowest size.
+    _try_insert_range(end, old_right_end, &insert_pos);
+    _range_size_tree_try_insert(*rs);
+
+  } else if (left_over) {
+    // keep the head [rs->start, start)
+    rs->end = start;
+    _range_size_tree_try_insert(*rs);
+  } else if (right_over) {
+    // keep the tail [end, rs->end)
+    rs->start = end;
+    _range_size_tree_try_insert(*rs);
+  } else {
+    // exact match: the whole segment is consumed
+    range_tree.erase_and_dispose(rs, dispose_rs{});
+  }
+}
+
+// Takes [start, start + size) out of the free space. The span must be
+// non-empty, no larger than the current free total, and fully contained in
+// one free segment; all of this is asserted.
+void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
+{
+  ceph_assert(size != 0);
+  ceph_assert(size <= num_free);
+
+  const uint64_t end = start + size;
+  auto seg = range_tree.find(range_t{start, end}, range_tree.key_comp());
+  // The span has to be completely covered by the segment that was found.
+  ceph_assert(seg != range_tree.end());
+  ceph_assert(seg->start <= start);
+  ceph_assert(seg->end >= end);
+
+  _process_range_removal(start, end, seg);
+}
+
+// Removes whatever part of [start, start + size) is currently free and
+// reports the outcome piecewise through cb(offset, length, found):
+// found == true for sub-ranges that were free and have been removed,
+// found == false for sub-ranges that were not in the tree.
+// The callbacks are issued in ascending offset order and together cover the
+// whole requested range.
+void AvlAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
+  std::function<void(uint64_t, uint64_t, bool)> cb)
+{
+  uint64_t end = start + size;
+
+  ceph_assert(size != 0);
+
+  auto rs = range_tree.find(range_t{ start, end },
+    range_tree.key_comp());
+
+  if (rs == range_tree.end() || rs->start >= end) {
+    // nothing in the requested range is free
+    cb(start, size, false);
+    return;
+  }
+
+  do {
+
+    // remember the successor now: _process_range_removal() may erase rs
+    auto next_rs = rs;
+    ++next_rs;
+
+    if (start < rs->start) {
+      // gap in front of this free segment
+      cb(start, rs->start - start, false);
+      start = rs->start;
+    }
+    auto range_end = std::min(rs->end, end);
+    _process_range_removal(start, range_end, rs);
+    cb(start, range_end - start, true);
+    start = range_end;
+
+    rs = next_rs;
+  } while (rs != range_tree.end() && rs->start < end && start < end);
+  if (start < end) {
+    // trailing part past the last overlapping free segment
+    cb(start, end - start, false);
+  }
+}
+
+// Collects extents until `want` bytes have been gathered or the single-extent
+// allocator fails. Each request is capped at max_alloc_size. Returns the
+// number of bytes obtained (possibly less than `want`), or -ENOSPC when not
+// even one extent could be allocated. Extents are appended to *extents.
+int64_t AvlAllocator::_allocate(
+  uint64_t want,
+  uint64_t unit,
+  uint64_t max_alloc_size,
+  int64_t  hint, // unused, for now!
+  PExtentVector* extents)
+{
+  uint64_t total = 0;
+  while (total < want) {
+    const uint64_t chunk = std::min(max_alloc_size, want - total);
+    uint64_t off, len;
+    if (_allocate(chunk, unit, &off, &len) < 0) {
+      // Out of suitable space; keep what has been gathered so far.
+      break;
+    }
+    extents->emplace_back(off, len);
+    total += len;
+  }
+  if (total == 0) {
+    return -ENOSPC;
+  }
+  return total;
+}
+
+// Allocates one extent of up to `size` bytes.
+// On success returns 0, stores the extent in *offset / *length and removes
+// it from the free trees; the returned length may be smaller than `size`.
+// Returns -ENOSPC when the largest free segment is smaller than `unit` or no
+// fitting segment is found.
+int AvlAllocator::_allocate(
+  uint64_t size,
+  uint64_t unit,
+  uint64_t *offset,
+  uint64_t *length)
+{
+  // the last element of the size-ordered tree is the largest free segment
+  uint64_t max_size = 0;
+  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
+    max_size = p->end - p->start;
+  }
+
+  bool force_range_size_alloc = false;
+  if (max_size < size) {
+    if (max_size < unit) {
+      return -ENOSPC;
+    }
+    // the request cannot be met in full: shrink it to what the largest
+    // segment offers (p2align presumably rounds down to a multiple of unit)
+    size = p2align(max_size, unit);
+    ceph_assert(size > 0);
+    force_range_size_alloc = true;
+  }
+
+  const int free_pct = num_free * 100 / device_size;
+  uint64_t start = 0;
+  // If we're running low on space, find a range by size by looking up in the size
+  // sorted tree (best-fit), instead of searching in the area pointed by cursor
+  if (force_range_size_alloc ||
+      max_size < range_size_alloc_threshold ||
+      free_pct < range_size_alloc_free_pct) {
+    start = -1ULL;
+  } else {
+    /*
+     * Find the largest power of 2 block size that evenly divides the
+     * requested size. This is used to try to allocate blocks with similar
+     * alignment from the same area (i.e. same cursor bucket) but it does
+     * not guarantee that other allocations sizes may exist in the same
+     * region.
+     */
+    uint64_t align = size & -size;
+    ceph_assert(align != 0);
+    // one cursor per alignment; the slot index is derived from align via cbits()
+    uint64_t* cursor = &lbas[cbits(align) - 1];
+    start = _pick_block_after(cursor, size, unit);
+    dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
+  }
+  // first-fit was skipped or failed: fall back to best-fit by size
+  if (start == -1ULL) {
+    do {
+      start = _pick_block_fits(size, unit);
+      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
+      if (start != uint64_t(-1ULL)) {
+        break;
+      }
+      // try to collect smaller extents as we could fail to retrieve
+      // that large block due to misaligned extents
+      size = p2align(size >> 1, unit);
+    } while (size >= unit);
+  }
+  if (start == -1ULL) {
+    return -ENOSPC;
+  }
+
+  _remove_from_tree(start, size);
+
+  *offset = start;
+  *length = size;
+  return 0;
+}
+
+// Returns every interval of release_set to the free trees. Each interval
+// must end within the device. Does not take the lock itself.
+void AvlAllocator::_release(const interval_set<uint64_t>& release_set)
+{
+  auto it = release_set.begin();
+  while (it != release_set.end()) {
+    const auto off = it.get_start();
+    const auto len = it.get_len();
+    ceph_assert(off + len <= uint64_t(device_size));
+    ldout(cct, 10) << __func__ << std::hex
+      << " offset 0x" << off
+      << " length 0x" << len
+      << std::dec << dendl;
+    _add_to_tree(off, len);
+    ++it;
+  }
+}
+
+// Returns every extent of the vector to the free trees, logging each one.
+// Does not take the lock itself.
+void AvlAllocator::_release(const PExtentVector& release_set) {
+  for (auto it = release_set.begin(); it != release_set.end(); ++it) {
+    ldout(cct, 10) << __func__ << std::hex
+      << " offset 0x" << it->offset
+      << " length 0x" << it->length
+      << std::dec << dendl;
+    _add_to_tree(it->offset, it->length);
+  }
+}
+
+// Empties both trees. Every segment is linked into both of them, so the
+// size-ordered tree is cleared first and the nodes are then deleted once,
+// through the offset-ordered tree's disposer.
+void AvlAllocator::_shutdown()
+{
+  range_size_tree.clear();
+  range_tree.clear_and_dispose(dispose_rs{});
+}
+
+// Constructor for descendants that limit the number of tracked segments.
+// The tuning knobs are read once from the bluestore_avl_alloc_* config
+// options; max_mem is converted into a segment-count cap by dividing by
+// sizeof(range_seg_t), so max_mem == 0 yields a cap of 0 (unlimited).
+AvlAllocator::AvlAllocator(CephContext* cct,
+                           int64_t device_size,
+                           int64_t block_size,
+                           uint64_t max_mem,
+                           std::string_view name) :
+  Allocator(name, device_size, block_size),
+  range_size_alloc_threshold(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+  range_size_alloc_free_pct(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+  max_search_count(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
+  max_search_bytes(
+    cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")),
+  range_count_cap(max_mem / sizeof(range_seg_t)),
+  cct(cct)
+{}
+
+// Public constructor: delegates with max_mem == 0, i.e. without a cap on
+// the number of tracked segments.
+AvlAllocator::AvlAllocator(CephContext* cct,
+			   int64_t device_size,
+			   int64_t block_size,
+			   std::string_view name) :
+  AvlAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
+{}
+
+// Releases all tracked segments by running shutdown().
+AvlAllocator::~AvlAllocator()
+{
+  shutdown();
+}
+
+// Public entry point: validates and normalizes the request, then performs
+// the allocation under the allocator lock.
+// `unit` must be a power of two and `want` a multiple of it (asserted).
+// max_alloc_size == 0 means "no per-extent limit beyond want"; values that
+// do not fit the extent length field are reduced to the largest
+// block-aligned value that does.
+int64_t AvlAllocator::allocate(
+  uint64_t want,
+  uint64_t unit,
+  uint64_t max_alloc_size,
+  int64_t  hint, // unused, for now!
+  PExtentVector* extents)
+{
+  ldout(cct, 10) << __func__ << std::hex
+                 << " want 0x" << want
+                 << " unit 0x" << unit
+                 << " max_alloc_size 0x" << max_alloc_size
+                 << " hint 0x" << hint
+                 << std::dec << dendl;
+  ceph_assert(std::has_single_bit(unit));
+  ceph_assert(want % unit == 0);
+
+  if (max_alloc_size == 0) {
+    max_alloc_size = want;
+  }
+
+  // largest value representable in an extent's length field
+  constexpr auto cap =
+    std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+  if (max_alloc_size >= cap) {
+    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
+  }
+
+  std::lock_guard<std::mutex> guard(lock);
+  return _allocate(want, unit, max_alloc_size, hint, extents);
+}
+
+// Locked wrapper around _release().
+void AvlAllocator::release(const interval_set<uint64_t>& release_set) {
+  std::lock_guard<std::mutex> guard(lock);
+  _release(release_set);
+}
+
+// Returns the total number of free bytes, read under the lock.
+uint64_t AvlAllocator::get_free()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  const uint64_t free_bytes = num_free;
+  return free_bytes;
+}
+
+// Locked wrapper around _get_fragmentation().
+double AvlAllocator::get_fragmentation()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  const double ratio = _get_fragmentation();
+  return ratio;
+}
+
+// Locked wrapper around _dump().
+void AvlAllocator::dump()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  _dump();
+}
+
+// Logs, at level 0, every segment of the offset-ordered tree followed by
+// every segment of the size-ordered tree. Each line shows the segment's
+// start and end offsets in hex, separated by '~'.
+void AvlAllocator::_dump() const
+{
+  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
+  for (auto it = range_tree.begin(); it != range_tree.end(); ++it) {
+    ldout(cct, 0) << std::hex
+      << "0x" << it->start << "~" << it->end
+      << std::dec
+      << dendl;
+  }
+  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
+  for (auto it = range_size_tree.begin(); it != range_size_tree.end(); ++it) {
+    ldout(cct, 0) << std::hex
+      << "0x" << it->start << "~" << it->end
+      << std::dec
+      << dendl;
+  }
+}
+
+// Locked wrapper around _foreach(); notify runs while the lock is held.
+void AvlAllocator::foreach(
+  std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+  std::lock_guard<std::mutex> guard(lock);
+  _foreach(notify);
+}
+
+// Calls notify(start, length) for every free segment, in ascending offset
+// order (the iteration order of range_tree).
+void AvlAllocator::_foreach(
+  std::function<void(uint64_t offset, uint64_t length)> notify) const
+{
+  for (auto it = range_tree.begin(); it != range_tree.end(); ++it) {
+    const uint64_t seg_start = it->start;
+    const uint64_t seg_len = it->end - it->start;
+    notify(seg_start, seg_len);
+  }
+}
+
+// Marks [offset, offset + length) as free. A zero length is a no-op (it is
+// still logged). The range must end within the device.
+void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+  ldout(cct, 10) << __func__ << std::hex
+                 << " offset 0x" << offset
+                 << " length 0x" << length
+                 << std::dec << dendl;
+  if (length == 0) {
+    return;
+  }
+  std::lock_guard<std::mutex> guard(lock);
+  ceph_assert(offset + length <= uint64_t(device_size));
+  _add_to_tree(offset, length);
+}
+
+// Marks [offset, offset + length) as no longer free. A zero length is a
+// no-op (it is still logged). The range must end within the device.
+void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+  ldout(cct, 10) << __func__ << std::hex
+                 << " offset 0x" << offset
+                 << " length 0x" << length
+                 << std::dec << dendl;
+  if (length == 0) {
+    return;
+  }
+  std::lock_guard<std::mutex> guard(lock);
+  ceph_assert(offset + length <= uint64_t(device_size));
+  _remove_from_tree(offset, length);
+}
+
+// Locked wrapper around _shutdown().
+void AvlAllocator::shutdown()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  _shutdown();
+}
diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h
new file mode 100644
index 000000000..d79242a52
--- /dev/null
+++ b/src/os/bluestore/AvlAllocator.h
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+#include <boost/intrusive/avl_set.hpp>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+
+/// One free segment [start, end). Each instance carries two intrusive hooks
+/// so it can be linked into an offset-ordered tree and a size-ordered tree
+/// at the same time.
+struct range_seg_t {
+  MEMPOOL_CLASS_HELPERS();  ///< memory monitoring
+  uint64_t start;   ///< starting offset of this segment
+  uint64_t end;	    ///< ending offset (non-inclusive)
+
+  range_seg_t(uint64_t start, uint64_t end)
+    : start{start},
+      end{end}
+  {}
+
+  /// Offset ordering: lhs sorts before rhs when it ends at or before the
+  /// point where rhs starts, so overlapping ranges compare as equivalent.
+  struct before_t {
+    template<typename KeyLeft, typename KeyRight>
+    bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
+      return lhs.end <= rhs.start;
+    }
+  };
+  boost::intrusive::avl_set_member_hook<> offset_hook;
+
+  /// Size ordering: shorter segments first, ties broken by start offset.
+  struct shorter_t {
+    template<typename KeyType>
+    bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
+      const auto lhs_size = lhs.end - lhs.start;
+      const auto rhs_size = rhs.end - rhs.start;
+      if (lhs_size != rhs_size) {
+        return lhs_size < rhs_size;
+      }
+      return lhs.start < rhs.start;
+    }
+  };
+
+  /// Number of bytes covered by the segment.
+  inline uint64_t length() const {
+    return end - start;
+  }
+  boost::intrusive::avl_set_member_hook<> size_hook;
+};
+
+/**
+ * Allocator that tracks free space as range_seg_t segments linked into two
+ * intrusive AVL trees: range_tree (ordered by offset) and range_size_tree
+ * (ordered by length). Allocation uses a cursor-based first-fit search and
+ * falls back to best-fit by size. Public methods take `lock`; the
+ * underscore-prefixed helpers do not.
+ */
+class AvlAllocator : public Allocator {
+  /// disposer used when a segment is erased from range_tree
+  struct dispose_rs {
+    void operator()(range_seg_t* p)
+    {
+      delete p;
+    }
+  };
+
+protected:
+  /*
+  * ctor intended for the usage from descendant class(es) which
+  * provides handling for spilled over entries
+  * (when entry count >= max_entries)
+  */
+  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+    uint64_t max_mem,
+    std::string_view name);
+
+public:
+  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+	       std::string_view name);
+  ~AvlAllocator();
+  const char* get_type() const override
+  {
+    return "avl";
+  }
+  int64_t allocate(
+    uint64_t want,
+    uint64_t unit,
+    uint64_t max_alloc_size,
+    int64_t  hint,
+    PExtentVector *extents) override;
+  void release(const interval_set<uint64_t>& release_set) override;
+  uint64_t get_free() override;
+  double get_fragmentation() override;
+
+  void dump() override;
+  void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) override;
+  void init_add_free(uint64_t offset, uint64_t length) override;
+  void init_rm_free(uint64_t offset, uint64_t length) override;
+  void shutdown() override;
+
+private:
+  // pick a range by search from cursor forward
+  uint64_t _pick_block_after(
+    uint64_t *cursor,
+    uint64_t size,
+    uint64_t align);
+  // pick a range with exactly the same size or larger
+  uint64_t _pick_block_fits(
+    uint64_t size,
+    uint64_t align);
+  // allocate a single extent of up to `size` bytes
+  int _allocate(
+    uint64_t size,
+    uint64_t unit,
+    uint64_t *offset,
+    uint64_t *length);
+
+  using range_tree_t = 
+    boost::intrusive::avl_set<
+      range_seg_t,
+      boost::intrusive::compare<range_seg_t::before_t>,
+      boost::intrusive::member_hook<
+	range_seg_t,
+	boost::intrusive::avl_set_member_hook<>,
+	&range_seg_t::offset_hook>>;
+  range_tree_t range_tree;    ///< main range tree
+  /*
+   * The range_size_tree should always contain the
+   * same number of segments as the range_tree.
+   * The only difference is that the range_size_tree
+   * is ordered by segment sizes.
+   */
+  using range_size_tree_t =
+    boost::intrusive::avl_multiset<
+      range_seg_t,
+      boost::intrusive::compare<range_seg_t::shorter_t>,
+      boost::intrusive::member_hook<
+	range_seg_t,
+	boost::intrusive::avl_set_member_hook<>,
+	&range_seg_t::size_hook>,
+      boost::intrusive::constant_time_size<true>>;
+  range_size_tree_t range_size_tree;
+
+  uint64_t num_free = 0;     ///< total bytes in freelist
+
+  /*
+   * This value defines the number of elements in the lbas array.
+   * The value of 64 was chosen as it covers all power of 2 buckets
+   * up to UINT64_MAX.
+   * This is the equivalent of highest-bit of UINT64_MAX.
+   */
+  static constexpr unsigned MAX_LBAS = 64;
+  uint64_t lbas[MAX_LBAS] = {0};  ///< first-fit search cursors, one per alignment
+
+  /*
+   * Minimum size which forces the dynamic allocator to change
+   * it's allocation strategy.  Once the allocator cannot satisfy
+   * an allocation of this size then it switches to using more
+   * aggressive strategy (i.e search by size rather than offset).
+   */
+  uint64_t range_size_alloc_threshold = 0;
+  /*
+   * The minimum free space, in percent, which must be available
+   * in allocator to continue allocations in a first-fit fashion.
+   * Once the allocator's free space drops below this level we dynamically
+   * switch to using best-fit allocations.
+   */
+  int range_size_alloc_free_pct = 0;
+  /*
+   * Maximum number of segments to check in the first-fit mode, without this
+   * limit, fragmented device can see lots of iterations and
+   * _pick_block_after() becomes the performance limiting factor on
+   * high-performance storage. 0 disables the limit.
+   */
+  const uint32_t max_search_count;
+  /*
+   * Maximum distance to search forward from the last offset, without this
+   * limit, fragmented device can see lots of iterations and
+   * _pick_block_after() becomes the performance limiting factor on
+   * high-performance storage. 0 disables the limit.
+   *
+   * Kept as a 64-bit value: it is initialized from a size-typed config
+   * option and compared against 64-bit byte distances, so a narrower type
+   * would silently truncate settings of 4 GiB and above (4 GiB itself
+   * would become 0, i.e. "unlimited").
+   */
+  const uint64_t max_search_bytes;
+  /*
+  * Max amount of range entries allowed. 0 - unlimited
+  */
+  uint64_t range_count_cap = 0;
+
+  /// Unlinks r from the size-ordered tree and deducts its length from
+  /// num_free; r stays in range_tree.
+  void _range_size_tree_rm(range_seg_t& r) {
+    ceph_assert(num_free >= r.length());
+    num_free -= r.length();
+    range_size_tree.erase(r);
+
+  }
+  /// Re-links r into the size-ordered tree if the entry cap allows it;
+  /// otherwise r has been spilled over and is removed from range_tree too.
+  void _range_size_tree_try_insert(range_seg_t& r) {
+    if (_try_insert_range(r.start, r.end)) {
+      range_size_tree.insert(r);
+      num_free += r.length();
+    } else {
+      range_tree.erase_and_dispose(r, dispose_rs{});
+    }
+  }
+  /// Decides whether [start, end) may be tracked given range_count_cap.
+  /// - Below the cap (or no cap): accepted.
+  /// - At the cap and longer than the shortest tracked segment: accepted,
+  ///   and the shortest segment is evicted via _spillover_range().
+  /// - Otherwise: [start, end) itself goes to _spillover_range().
+  /// When accepted and insert_pos is given, a new segment is created and
+  /// linked into both trees in front of *insert_pos.
+  /// @return true if the range was accepted, false if it was spilled over
+  bool _try_insert_range(uint64_t start,
+                         uint64_t end,
+                        range_tree_t::iterator* insert_pos = nullptr) {
+    bool res = !range_count_cap || range_size_tree.size() < range_count_cap;
+    bool remove_lowest = false;
+    if (!res) {
+      if (end - start > _lowest_size_available()) {
+        remove_lowest = true;
+        res = true;
+      }
+    }
+    if (!res) {
+      _spillover_range(start, end);
+    } else {
+      // NB:  we should do insertion before the following removal
+      // to avoid potential iterator disposal insertion might depend on.
+      if (insert_pos) {
+        auto new_rs = new range_seg_t{ start, end };
+        range_tree.insert_before(*insert_pos, *new_rs);
+        range_size_tree.insert(*new_rs);
+        num_free += new_rs->length();
+      }
+      if (remove_lowest) {
+        auto r = range_size_tree.begin();
+        _range_size_tree_rm(*r);
+        _spillover_range(r->start, r->end);
+        range_tree.erase_and_dispose(*r, dispose_rs{});
+      }
+    }
+    return res;
+  }
+  virtual void _spillover_range(uint64_t start, uint64_t end) {
+    // this should be overriden when range count cap is present,
+    // i.e. (range_count_cap > 0)
+    ceph_assert(false);
+  }
+protected:
+  // called when extent to be released/marked free
+  virtual void _add_to_tree(uint64_t start, uint64_t size);
+
+protected:
+  CephContext* cct;
+  std::mutex lock;
+
+  /// (number of free segments - 1) / (number of free blocks - 1);
+  /// 0 when at most one block is free.
+  double _get_fragmentation() const {
+    auto free_blocks = p2align(num_free, (uint64_t)block_size) / block_size;
+    if (free_blocks <= 1) {
+      return .0;
+    }
+    return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
+  }
+  void _dump() const;
+  void _foreach(std::function<void(uint64_t offset, uint64_t length)>) const;
+
+  /// Length of the shortest tracked segment, or 0 when none is tracked.
+  uint64_t _lowest_size_available() {
+    auto rs = range_size_tree.begin();
+    return rs != range_size_tree.end() ? rs->length() : 0;
+  }
+
+  int64_t _allocate(
+    uint64_t want,
+    uint64_t unit,
+    uint64_t max_alloc_size,
+    int64_t  hint,
+    PExtentVector *extents);
+
+  void _release(const interval_set<uint64_t>& release_set);
+  void _release(const PExtentVector&  release_set);
+  void _shutdown();
+
+  void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
+  void _remove_from_tree(uint64_t start, uint64_t size);
+  void _try_remove_from_tree(uint64_t start, uint64_t size,
+    std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
+
+  uint64_t _get_free() const {
+    return num_free;
+  }
+};
diff --git a/src/os/bluestore/BitmapAllocator.cc b/src/os/bluestore/BitmapAllocator.cc
new file mode 100644
index 000000000..2decfcb87
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.cc
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapAllocator.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "fbmap_alloc " << this << " "
+
+// Construct the allocator: forwards name/capacity/alloc_unit to the
+// Allocator base, remembers the CephContext for logging and then calls
+// _init() with the same capacity and allocation unit.
+BitmapAllocator::BitmapAllocator(CephContext* _cct,
+					 int64_t capacity,
+					 int64_t alloc_unit,
+					 std::string_view name) :
+    Allocator(name, capacity, alloc_unit),
+    cct(_cct)
+{
+  ldout(cct, 10) << __func__ << " 0x" << std::hex << capacity << "/"
+		 << alloc_unit << std::dec << dendl;
+  // NOTE(review): the meaning of the trailing `false` is defined by _init(),
+  // which is not visible here.
+  _init(capacity, alloc_unit, false);
+}
+
+// Allocate space via _allocate_l2() and append the resulting extents to
+// *extents.
+//
+// Returns the amount reported as allocated by _allocate_l2(), or -ENOSPC
+// when nothing was allocated.  At debug level 10 each newly appended
+// extent is logged.
+int64_t BitmapAllocator::allocate(
+  uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+  int64_t hint, PExtentVector *extents)
+{
+  ldout(cct, 10) << __func__ << std::hex << " 0x" << want_size
+                 << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+                 << std::dec << dendl;
+
+  // remember where the extents appended by this call begin
+  const size_t first_new = extents->size();
+  uint64_t got = 0;
+  _allocate_l2(want_size, alloc_unit, max_alloc_size, hint, &got, extents);
+  if (got == 0) {
+    return -ENOSPC;
+  }
+
+  if (cct->_conf->subsys.should_gather<dout_subsys, 10>()) {
+    for (size_t idx = first_new; idx < extents->size(); ++idx) {
+      const auto& ext = (*extents)[idx];
+      ldout(cct, 10) << __func__
+                     << " extent: 0x" << std::hex << ext.offset << "~" << ext.length
+                     << "/" << alloc_unit << "," << max_alloc_size << "," << hint
+                     << std::dec << dendl;
+    }
+  }
+  return int64_t(got);
+}
+
+// Return every interval of release_set to the allocator via _free_l2().
+//
+// Note: the per-interval logging and the bounds assertion against
+// device_size are only executed when level-10 logging is gathered.
+void BitmapAllocator::release(
+  const interval_set<uint64_t>& release_set)
+{
+  const bool verbose = cct->_conf->subsys.should_gather<dout_subsys, 10>();
+  if (verbose) {
+    for (auto& [start, extent_len] : release_set) {
+      ldout(cct, 10) << __func__ << " 0x" << std::hex << start << "~"
+                     << extent_len << std::dec << dendl;
+      ceph_assert(start + extent_len <= (uint64_t)device_size);
+    }
+  }
+  _free_l2(release_set);
+  ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+
+// Mark the allocation-unit aligned part of [offset, offset + length) as free.
+//
+// The start is rounded up and the remaining length rounded down to the
+// minimum allocation size, so only whole units inside the range are
+// touched.  A range that ends before the first aligned boundary contains
+// no such unit and is ignored.
+void BitmapAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+  ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+                 << std::dec << dendl;
+
+  auto mas = get_min_alloc_size();
+  uint64_t offs = round_up_to(offset, mas);
+  const uint64_t end = offset + length;
+  if (end < offs) {
+    // Without this guard `end - offs` wraps around to a huge value which,
+    // after alignment, may even slip past the device_size assertion below.
+    ldout(cct, 10) << __func__ << " no aligned unit in range, skipped"
+                   << dendl;
+    return;
+  }
+  uint64_t l = p2align(end - offs, mas);
+  ceph_assert(offs + l <= (uint64_t)device_size);
+
+  _mark_free(offs, l);
+  ldout(cct, 10) << __func__ << " done" << dendl;
+}
+// Mark the allocation-unit aligned part of [offset, offset + length) as
+// allocated.
+//
+// The start is rounded up and the remaining length rounded down to the
+// minimum allocation size.  A range that ends before the first aligned
+// boundary contains no whole unit and is ignored.
+void BitmapAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+  ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+                 << std::dec << dendl;
+  auto mas = get_min_alloc_size();
+  uint64_t offs = round_up_to(offset, mas);
+  const uint64_t end = offset + length;
+  if (end < offs) {
+    // Without this guard `end - offs` wraps around to a huge value which,
+    // after alignment, may even slip past the device_size assertion below.
+    ldout(cct, 10) << __func__ << " no aligned unit in range, skipped"
+                   << dendl;
+    return;
+  }
+  uint64_t l = p2align(end - offs, mas);
+  ceph_assert(offs + l <= (uint64_t)device_size);
+  _mark_allocated(offs, l);
+  ldout(cct, 10) << __func__ << " done" << dendl;
+}
+
+// Log the call at level 1 and delegate to _shutdown().
+void BitmapAllocator::shutdown()
+{
+  ldout(cct, 1) << __func__ << dendl;
+  _shutdown();
+}
+
+// Log (at level 0) one line per statistics bin returned by collect_stats():
+// the bin index, the printed size bound for that bin and the number of
+// extents that fell into it.
+void BitmapAllocator::dump()
+{
+  // bin -> interval count
+  std::map<size_t, size_t> bins_overall;
+  collect_stats(bins_overall);
+  for (const auto& [bin, count] : bins_overall) {
+    // Shift in 64 bits: a plain `1 << (bin + 1)` is an int shift and
+    // overflows once bin reaches 30.
+    const uint64_t bound = (uint64_t(1) << (bin + 1)) * get_min_alloc_size();
+    ldout(cct, 0) << __func__
+                  << " bin " << bin
+                  << "(< " << byte_u_t(bound) << ")"
+                  << " : " << count << " extents"
+                  << dendl;
+  }
+}
diff --git a/src/os/bluestore/BitmapAllocator.h b/src/os/bluestore/BitmapAllocator.h
new file mode 100644
index 000000000..a418718aa
--- /dev/null
+++ b/src/os/bluestore/BitmapAllocator.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+#define CEPH_OS_BLUESTORE_BITMAPFASTALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "fastbmap_allocator_impl.h"
+#include "include/mempool.h"
+#include "common/debug.h"
+
+// Allocator implementation that adapts the
+// AllocatorLevel02<AllocatorLevel01Loose> bitmap to the Allocator interface.
+class BitmapAllocator : public Allocator,
+  public AllocatorLevel02<AllocatorLevel01Loose> {
+  CephContext* cct;
+
+public:
+  BitmapAllocator(CephContext* _cct, int64_t capacity, int64_t alloc_unit,
+                  std::string_view name);
+  ~BitmapAllocator() override = default;
+
+  // Identifier of this allocator flavour.
+  const char* get_type() const override { return "bitmap"; }
+
+  int64_t allocate(
+    uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+    int64_t hint, PExtentVector *extents) override;
+
+  void release(const interval_set<uint64_t>& release_set) override;
+
+  // keep the remaining Allocator::release overloads visible
+  using Allocator::release;
+
+  // Free space, as reported by get_available().
+  uint64_t get_free() override { return get_available(); }
+
+  void dump() override;
+
+  // Forwards notify to foreach_internal().
+  void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) override
+  {
+    foreach_internal(notify);
+  }
+
+  // Fragmentation score, as reported by get_fragmentation_internal().
+  double get_fragmentation() override { return get_fragmentation_internal(); }
+
+  void init_add_free(uint64_t offset, uint64_t length) override;
+  void init_rm_free(uint64_t offset, uint64_t length) override;
+
+  void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc
new file mode 100644
index 000000000..bec6ace86
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.cc
@@ -0,0 +1,613 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BitmapFreelistManager.h"
+
+#include <bit>
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+#include "include/stringify.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "freelist "
+
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+
+// Build the key for `offset` into *key.
+// Reserves 10 bytes of capacity up front, then hands the offset to
+// _key_encode_u64() (presumably appending a fixed-width encoding of the
+// 64-bit value -- confirm against os/kv.h).
+void make_offset_key(uint64_t offset, std::string *key)
+{
+  key->reserve(10);
+  _key_encode_u64(offset, key);
+}
+
+// Merge operator combining an existing value with an operand by bytewise
+// XOR.
+struct XorMergeOperator : public KeyValueDB::MergeOperator {
+  // No existing value: the operand becomes the new value verbatim.
+  void merge_nonexistent(
+    const char *rdata, size_t rlen, std::string *new_value) override {
+    new_value->assign(rdata, rlen);
+  }
+
+  // Both values must have the same length; the result is the existing
+  // value with every byte XOR-ed with the matching operand byte.
+  void merge(
+    const char *ldata, size_t llen,
+    const char *rdata, size_t rlen,
+    std::string *new_value) override {
+    ceph_assert(llen == rlen);
+    new_value->assign(ldata, llen);
+    auto& out = *new_value;
+    for (size_t pos = 0; pos != rlen; ++pos) {
+      out[pos] ^= rdata[pos];
+    }
+  }
+
+  // We use each operator name and each prefix to construct the
+  // overall RocksDB operator name for consistency check at open time.
+  const char *name() const override {
+    return "bitwise_xor";
+  }
+};
+
+// Register a fresh XorMergeOperator with the DB for the given prefix.
+void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
+{
+  db->set_merge_operator(prefix, std::make_shared<XorMergeOperator>());
+}
+
+// Store the two key prefixes and reset the enumeration bit position.
+// The remaining members are not set by the initializer list; the visible
+// code assigns them in create(), _read_cfg()/_load_from_db(), _init_misc()
+// and enumerate_reset().
+BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
+					     string meta_prefix,
+					     string bitmap_prefix)
+  : FreelistManager(cct),
+    meta_prefix(meta_prefix),
+    bitmap_prefix(bitmap_prefix),
+    enumerate_bl_pos(0)
+{
+}
+
+// Initialise a new freelist of new_size bytes with the given block
+// granularity and queue its metadata into txn.
+//
+// granularity must be a power of two.  The size is aligned down to it and
+// the block count is rounded up to a whole number of keys; the blocks past
+// the end of the device are flagged as allocated.  zone_size and
+// first_sequential_zone are not used by this implementation.
+// Always returns 0.
+int BitmapFreelistManager::create(uint64_t new_size, uint64_t granularity,
+                                  uint64_t zone_size, uint64_t first_sequential_zone,
+                                  KeyValueDB::Transaction txn)
+{
+  bytes_per_block = granularity;
+  ceph_assert(std::has_single_bit(bytes_per_block));
+  size = p2align(new_size, bytes_per_block);
+  blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
+
+  _init_misc();
+
+  blocks = size_2_block_count(size);
+  const uint64_t rounded_bytes = blocks * bytes_per_block;
+  if (rounded_bytes > size) {
+    dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
+             << " to 0x" << rounded_bytes
+             << " (0x" << blocks << " blocks)" << std::dec << dendl;
+    // set past-eof blocks as allocated
+    _xor(size, rounded_bytes - size, txn);
+  }
+  dout(1) << __func__
+          << " size 0x" << std::hex << size
+          << " bytes_per_block 0x" << bytes_per_block
+          << " blocks 0x" << blocks
+          << " blocks_per_key 0x" << blocks_per_key
+          << std::dec << dendl;
+
+  // encode one 64-bit value and queue it under meta_prefix/<key>
+  auto put_meta = [&](const char *key, uint64_t value) {
+    bufferlist bl;
+    encode(value, bl);
+    txn->set(meta_prefix, key, bl);
+  };
+  put_meta("bytes_per_block", bytes_per_block);
+  put_meta("blocks_per_key", blocks_per_key);
+  put_meta("blocks", blocks);
+  put_meta("size", size);
+  return 0;
+}
+
+// Grow the persisted freelist from old_size to the current `size` member.
+//
+// The padding blocks that were flagged as allocated past the old end of
+// device are flipped back, `size`/`blocks` are re-aligned, the new padding
+// is flagged as allocated and the updated "blocks"/"size" metadata is
+// committed synchronously.
+//
+// Returns 0 on success or the negative error reported by the DB commit.
+int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db)
+{
+  ceph_assert(old_size < size);
+  ceph_assert(std::has_single_bit(bytes_per_block));
+
+  KeyValueDB::Transaction txn = db->get_transaction();
+
+  auto blocks0 = size_2_block_count(old_size);
+  if (blocks0 * bytes_per_block > old_size) {
+    dout(10) << __func__ << " rounding1 blocks up from 0x" << std::hex
+             << old_size << " to 0x" << (blocks0 * bytes_per_block)
+             << " (0x" << blocks0 << " blocks)" << std::dec << dendl;
+    // reset past-eof blocks to unallocated
+    _xor(old_size, blocks0 * bytes_per_block - old_size, txn);
+  }
+
+  size = p2align(size, bytes_per_block);
+  blocks = size_2_block_count(size);
+
+  if (blocks * bytes_per_block > size) {
+    dout(10) << __func__ << " rounding2 blocks up from 0x" << std::hex
+             << size << " to 0x" << (blocks * bytes_per_block)
+             << " (0x" << blocks << " blocks)" << std::dec << dendl;
+    // set past-eof blocks as allocated
+    _xor(size, blocks * bytes_per_block - size, txn);
+  }
+
+  dout(10) << __func__
+           << " size 0x" << std::hex << size
+           << " bytes_per_block 0x" << bytes_per_block
+           << " blocks 0x" << blocks
+           << " blocks_per_key 0x" << blocks_per_key
+           << std::dec << dendl;
+  {
+    bufferlist bl;
+    encode(blocks, bl);
+    txn->set(meta_prefix, "blocks", bl);
+  }
+  {
+    bufferlist bl;
+    encode(size, bl);
+    txn->set(meta_prefix, "size", bl);
+  }
+  // Do not pretend success if the commit failed: the in-memory size would
+  // silently diverge from what is stored in the DB.
+  int r = db->submit_transaction_sync(txn);
+  if (r < 0) {
+    derr << __func__ << " failed to commit expanded freelist, r = " << r
+         << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Read and decode the "size" record stored under meta_prefix into *res.
+// Returns 0 on success, or -ENOENT when the lookup fails.
+int BitmapFreelistManager::read_size_meta_from_db(KeyValueDB* kvdb,
+  uint64_t* res)
+{
+  bufferlist raw;
+  if (kvdb->get(meta_prefix, "size", &raw) < 0) {
+    derr << __func__ << " missing size meta in DB" << dendl;
+    return -ENOENT;
+  }
+  auto cursor = raw.cbegin();
+  decode(*res, cursor);
+  return 0;
+}
+
+// Load the freelist geometry from the records stored under meta_prefix.
+// Recognised keys are "bytes_per_block", "blocks", "size" and
+// "blocks_per_key"; any other key is reported via derr and skipped.
+void BitmapFreelistManager::_load_from_db(KeyValueDB* kvdb)
+{
+  KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
+
+  // load meta
+  for (it->lower_bound(string()); it->valid(); it->next()) {
+    const string k = it->key();
+
+    // pick the member this record belongs to
+    uint64_t *target = nullptr;
+    if (k == "bytes_per_block") {
+      target = &bytes_per_block;
+    } else if (k == "blocks") {
+      target = &blocks;
+    } else if (k == "size") {
+      target = &size;
+    } else if (k == "blocks_per_key") {
+      target = &blocks_per_key;
+    }
+
+    if (target == nullptr) {
+      derr << __func__ << " unrecognized meta " << k << dendl;
+      continue;
+    }
+
+    bufferlist bl = it->value();
+    auto p = bl.cbegin();
+    decode(*target, p);
+    dout(10) << __func__ << " " << k << " 0x" << std::hex << *target
+             << std::dec << dendl;
+  }
+}
+
+
+// Load the freelist geometry and bring it in sync with the DB.
+//
+// The values are first read through cfg_reader; if that fails for any
+// reason they are loaded from the DB meta records instead.  Afterwards the
+// size is reconciled with the DB (see _sync) and the derived masks/buffers
+// are rebuilt.  Always returns 0.
+int BitmapFreelistManager::init(KeyValueDB *kvdb, bool db_in_read_only,
+  std::function<int(const std::string&, std::string*)> cfg_reader)
+{
+  dout(1) << __func__ << dendl;
+  if (_read_cfg(cfg_reader) != 0) {
+    dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+    _load_from_db(kvdb);
+  }
+  _sync(kvdb, db_in_read_only);
+
+  dout(10) << __func__ << std::hex
+           << " size 0x" << size
+           << " bytes_per_block 0x" << bytes_per_block
+           << " blocks 0x" << blocks
+           << " blocks_per_key 0x" << blocks_per_key
+           << std::dec << dendl;
+  _init_misc();
+  return 0;
+}
+
+// Read size, blocks, bytes_per_block and blocks_per_key through cfg_reader
+// (keys "bfm_size", "bfm_blocks", "bfm_bytes_per_block",
+// "bfm_blocks_per_key", in that order).
+//
+// Returns 0 when all four values were read and parsed, the reader's error
+// code for the first key it could not provide, or -EINVAL when a value
+// fails to parse.  Members assigned before a failure keep the new value.
+int BitmapFreelistManager::_read_cfg(
+  std::function<int(const std::string&, std::string*)> cfg_reader)
+{
+  dout(1) << __func__ << dendl;
+
+  string err;
+
+  const std::pair<const char*, uint64_t*> fields[] = {
+    {"bfm_size", &size},
+    {"bfm_blocks", &blocks},
+    {"bfm_bytes_per_block", &bytes_per_block},
+    {"bfm_blocks_per_key", &blocks_per_key}};
+
+  for (const auto& [key, dst] : fields) {
+    string val;
+    int r = cfg_reader(key, &val);
+    if (r != 0) {
+      // this is expected for legacy deployed OSDs
+      dout(0) << __func__ << " " << key << " not found in bdev meta" << dendl;
+      return r;
+    }
+    *dst = strict_iecstrtoll(val, &err);
+    if (!err.empty()) {
+      derr << __func__ << " Failed to parse - "
+           << key << ":" << val
+           << ", error: " << err << dendl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+// Recompute everything derived from bytes_per_block / blocks_per_key:
+// the block and key masks, bytes_per_key and the all-ones value
+// (blocks_per_key bits wide) used when a whole key is flipped.
+void BitmapFreelistManager::_init_misc()
+{
+  block_mask = ~(bytes_per_block - 1);
+  bytes_per_key = bytes_per_block * blocks_per_key;
+  key_mask = ~(bytes_per_key - 1);
+
+  bufferptr ones(blocks_per_key >> 3);
+  memset(ones.c_str(), 0xff, ones.length());
+  all_set_bl.clear();
+  all_set_bl.append(ones);
+
+  dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
+           << ", key_mask 0x" << key_mask << std::dec
+           << dendl;
+}
+
+// Public entry point for _sync(); always runs it in read-only mode, so the
+// expand-and-commit branch of _sync() is never taken from here.
+void BitmapFreelistManager::sync(KeyValueDB* kvdb)
+{
+  _sync(kvdb, true);
+}
+
+// Reconcile the in-memory `size` with the "size" record stored in the DB.
+//
+// - DB size larger than ours: reload all meta from the DB.
+// - DB size smaller than ours and writes allowed: expand the persisted
+//   freelist to our size.
+// - otherwise: nothing to do.
+// Asserts that the size record can be read and that expansion succeeds.
+void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only)
+{
+  dout(10) << __func__ << " checks if size sync is needed" << dendl;
+  uint64_t size_db = 0;
+  int r = read_size_meta_from_db(kvdb, &size_db);
+  ceph_assert(r >= 0);
+
+  if (size_db > size) {
+    // this might happen when OSD passed the following sequence:
+    // upgrade -> downgrade -> expand -> upgrade
+    // One needs to run expand once again to syncup
+    dout(1) << __func__ << " fall back to legacy meta repo" << dendl;
+    _load_from_db(kvdb);
+    return;
+  }
+
+  if (size_db < size && !read_only) {
+    dout(1) << __func__ << " committing new size 0x" << std::hex << size
+            << std::dec << dendl;
+    r = _expand(size_db, kvdb);
+    ceph_assert(r == 0);
+  }
+}
+
+// Nothing to tear down here; the call is only logged at level 1.
+void BitmapFreelistManager::shutdown()
+{
+  dout(1) << __func__ << dendl;
+}
+
+// Rewind the enumeration state (iterator, cached value, bit position and
+// offset) under the lock so the next enumerate_next() starts from scratch.
+void BitmapFreelistManager::enumerate_reset()
+{
+  std::lock_guard l(lock);
+  enumerate_p.reset();
+  enumerate_bl.clear();
+  enumerate_bl_pos = 0;
+  enumerate_offset = 0;
+}
+
+// Return the index of the first zero bit of bl at or after `start`
+// (bit i lives in byte i >> 3 at position i & 7), or -1 if there is none.
+int get_next_clear_bit(bufferlist& bl, int start)
+{
+  const char *data = bl.c_str();
+  const int total_bits = bl.length() << 3;
+  for (int pos = start; pos < total_bits; ++pos) {
+    const unsigned char mask = 1 << (pos & 7);
+    if (!(data[pos >> 3] & mask)) {
+      return pos;
+    }
+  }
+  return -1; // not found
+}
+
+// Return the index of the first one bit of bl at or after `start`
+// (bit i lives in byte i / 8 at position i % 8), or -1 if there is none.
+int get_next_set_bit(bufferlist& bl, int start)
+{
+  const char *data = bl.c_str();
+  const int total_bits = bl.length() << 3;
+  for (int pos = start; pos < total_bits; ++pos) {
+    const unsigned char mask = 1 << (pos % 8);
+    if (data[pos / 8] & mask) {
+      return pos;
+    }
+  }
+  return -1; // not found
+}
+
+// Produce the next run of clear bits in the persisted bitmap.
+//
+// On success *offset/*length describe the run and true is returned; false
+// means the enumeration has reached `size`.  The position is kept in the
+// enumerate_* members (under `lock`), so callers start a walk with
+// enumerate_reset() and then call this repeatedly.
+//
+// Clear bits are treated as free space here: create()/_expand() set the
+// bits of past-eof blocks to mark them allocated.
+bool BitmapFreelistManager::enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length)
+{
+  std::lock_guard l(lock);
+
+  // initial base case is a bit awkward
+  if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
+    dout(10) << __func__ << " start" << dendl;
+    enumerate_p = kvdb->get_iterator(bitmap_prefix);
+    enumerate_p->lower_bound(string());
+    // we assert that the first block is always allocated; it's true,
+    // and it simplifies our lives a bit.
+    ceph_assert(enumerate_p->valid());
+    string k = enumerate_p->key();
+    const char *p = k.c_str();
+    _key_decode_u64(p, &enumerate_offset);
+    enumerate_bl = enumerate_p->value();
+    ceph_assert(enumerate_offset == 0);
+    ceph_assert(get_next_set_bit(enumerate_bl, 0) == 0);
+  }
+
+  if (enumerate_offset >= size) {
+    dout(10) << __func__ << " end" << dendl;
+    return false;
+  }
+
+  // skip set bits to find offset
+  while (true) {
+    enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
+    if (enumerate_bl_pos >= 0) {
+      *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+      dout(30) << __func__ << " found clear bit, key 0x" << std::hex
+	       << enumerate_offset << " bit 0x" << enumerate_bl_pos
+	       << " offset 0x" << *offset
+	       << std::dec << dendl;
+      break;
+    }
+    dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
+	     << std::dec << dendl;
+    enumerate_p->next();
+    enumerate_bl.clear();
+    if (!enumerate_p->valid()) {
+      // ran out of keys: the run starts right after the last key seen
+      enumerate_offset += bytes_per_key;
+      enumerate_bl_pos = 0;
+      *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
+      break;
+    }
+    string k = enumerate_p->key();
+    const char *p = k.c_str();
+    uint64_t next = enumerate_offset + bytes_per_key;
+    _key_decode_u64(p, &enumerate_offset);
+    enumerate_bl = enumerate_p->value();
+    enumerate_bl_pos = 0;
+    if (enumerate_offset > next) {
+      // a gap between consecutive keys: the run starts at the first
+      // offset that has no key
+      dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
+	       << enumerate_offset << std::dec << dendl;
+      *offset = next;
+      break;
+    }
+  }
+
+  // skip clear bits to find the end
+  uint64_t end = 0;
+  if (enumerate_p->valid()) {
+    while (true) {
+      enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
+      if (enumerate_bl_pos >= 0) {
+	end = _get_offset(enumerate_offset, enumerate_bl_pos);
+	dout(30) << __func__ << " found set bit, key 0x" << std::hex
+		 << enumerate_offset << " bit 0x" << enumerate_bl_pos
+		 << " offset 0x" << end << std::dec
+		 << dendl;
+	// never report past the last whole allocation unit
+	end = std::min(get_alloc_units() * bytes_per_block, end);
+	*length = end - *offset;
+        dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+		 << std::dec << dendl;
+	return true;
+      }
+      dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
+	       << std::dec << dendl;
+      enumerate_p->next();
+      enumerate_bl.clear();
+      enumerate_bl_pos = 0;
+      if (!enumerate_p->valid()) {
+	break;
+      }
+      string k = enumerate_p->key();
+      const char *p = k.c_str();
+      _key_decode_u64(p, &enumerate_offset);
+      enumerate_bl = enumerate_p->value();
+    }
+  }
+
+  // no terminating set bit was found: the run extends to the end of the
+  // allocatable area, and the state is parked at the end so the next call
+  // returns false
+  if (enumerate_offset < size) {
+    end = get_alloc_units() * bytes_per_block;
+    *length = end - *offset;
+    dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+	     << std::dec << dendl;
+    enumerate_offset = size;
+    enumerate_bl_pos = blocks_per_key;
+    return true;
+  }
+
+  dout(10) << __func__ << " end" << dendl;
+  return false;
+}
+
+// Walk the whole freelist from the beginning and log every extent
+// returned by enumerate_next() at level 20.
+void BitmapFreelistManager::dump(KeyValueDB *kvdb)
+{
+  enumerate_reset();
+  uint64_t extent_off = 0;
+  uint64_t extent_len = 0;
+  while (enumerate_next(kvdb, &extent_off, &extent_len)) {
+    dout(20) << __func__ << " 0x" << std::hex << extent_off << "~"
+             << extent_len << std::dec << dendl;
+  }
+}
+
+// Record [offset, offset + length) as allocated by flipping its bits in
+// txn.  Only logs when this is a null manager.
+void BitmapFreelistManager::allocate(
+  uint64_t offset, uint64_t length,
+  KeyValueDB::Transaction txn)
+{
+  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << dendl;
+  if (is_null_manager()) {
+    return;
+  }
+  _xor(offset, length, txn);
+}
+
+// Record [offset, offset + length) as released by flipping its bits in
+// txn.  Only logs when this is a null manager.
+void BitmapFreelistManager::release(
+  uint64_t offset, uint64_t length,
+  KeyValueDB::Transaction txn)
+{
+  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << dendl;
+  if (is_null_manager()) {
+    return;
+  }
+  _xor(offset, length, txn);
+}
+
+// Queue merge operations into txn that flip the bit of every block in
+// [offset, offset + length).
+//
+// offset and length must be block aligned.  The range is split per key:
+// a range inside a single key produces one merge; otherwise the first key
+// gets the bits from the start block to the end of the key, every key in
+// between gets the all-ones value, and the last key gets the bits from 0
+// to the final block.
+void BitmapFreelistManager::_xor(
+  uint64_t offset, uint64_t length,
+  KeyValueDB::Transaction txn)
+{
+  // must be block aligned
+  ceph_assert((offset & block_mask) == offset);
+  ceph_assert((length & block_mask) == length);
+
+  uint64_t first_key = offset & key_mask;
+  uint64_t last_key = (offset + length - 1) & key_mask;
+  dout(20) << __func__ << " first_key 0x" << std::hex << first_key
+	   << " last_key 0x" << last_key << std::dec << dendl;
+
+  if (first_key == last_key) {
+    bufferptr p(blocks_per_key >> 3);
+    p.zero();
+    // s/e: first and last (inclusive) bit index within this key
+    unsigned s = (offset & ~key_mask) / bytes_per_block;
+    unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+    for (unsigned i = s; i <= e; ++i) {
+      p[i >> 3] ^= 1ull << (i & 7);
+    }
+    string k;
+    make_offset_key(first_key, &k);
+    bufferlist bl;
+    bl.append(p);
+    // the log statement is deliberately left open so the hexdump lands
+    // in the same log entry; dendl below terminates it
+    dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+    bl.hexdump(*_dout, false);
+    *_dout << dendl;
+    txn->merge(bitmap_prefix, k, bl);
+  } else {
+    // first key
+    {
+      bufferptr p(blocks_per_key >> 3);
+      p.zero();
+      // from the start block up to (excluding) the end of the key
+      unsigned s = (offset & ~key_mask) / bytes_per_block;
+      unsigned e = blocks_per_key;
+      for (unsigned i = s; i < e; ++i) {
+	p[i >> 3] ^= 1ull << (i & 7);
+      }
+      string k;
+      make_offset_key(first_key, &k);
+      bufferlist bl;
+      bl.append(p);
+      dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+      bl.hexdump(*_dout, false);
+      *_dout << dendl;
+      txn->merge(bitmap_prefix, k, bl);
+      first_key += bytes_per_key;
+    }
+    // middle keys
+    while (first_key < last_key) {
+      string k;
+      make_offset_key(first_key, &k);
+      dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
+      	 << ": ";
+      all_set_bl.hexdump(*_dout, false);
+      *_dout << dendl;
+      txn->merge(bitmap_prefix, k, all_set_bl);
+      first_key += bytes_per_key;
+    }
+    ceph_assert(first_key == last_key);
+    // last key
+    {
+      bufferptr p(blocks_per_key >> 3);
+      p.zero();
+      // from bit 0 up to (including) the last block of the range
+      unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
+      for (unsigned i = 0; i <= e; ++i) {
+	p[i >> 3] ^= 1ull << (i & 7);
+      }
+      string k;
+      make_offset_key(first_key, &k);
+      bufferlist bl;
+      bl.append(p);
+      dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
+      bl.hexdump(*_dout, false);
+      *_dout << dendl;
+      txn->merge(bitmap_prefix, k, bl);
+    }
+  }
+}
+
+// Number of blocks needed for target_size bytes, rounded up to a whole
+// multiple of blocks_per_key.
+uint64_t BitmapFreelistManager::size_2_block_count(uint64_t target_size) const
+{
+  const auto whole_blocks = target_size / bytes_per_block;
+  const auto in_last_key = whole_blocks % blocks_per_key;
+  if (in_last_key == 0) {
+    return whole_blocks;
+  }
+  // pad the partially used last key up to a full one
+  return whole_blocks - in_last_key + blocks_per_key;
+}
+
+// Append the "bfm_*" key/value pairs describing this freelist to *res.
+// With target_size == 0 the current size/blocks are reported; otherwise
+// the values are computed for target_size aligned down to a whole number
+// of blocks.
+void BitmapFreelistManager::get_meta(
+  uint64_t target_size,
+  std::vector<std::pair<string, string>>* res) const
+{
+  uint64_t reported_size = size;
+  uint64_t reported_blocks = blocks;
+  if (target_size != 0) {
+    reported_size = p2align(target_size, bytes_per_block);
+    reported_blocks = size_2_block_count(reported_size);
+  }
+
+  res->emplace_back("bfm_blocks", stringify(reported_blocks));
+  res->emplace_back("bfm_size", stringify(reported_size));
+  res->emplace_back("bfm_bytes_per_block", stringify(bytes_per_block));
+  res->emplace_back("bfm_blocks_per_key", stringify(blocks_per_key));
+}
diff --git a/src/os/bluestore/BitmapFreelistManager.h b/src/os/bluestore/BitmapFreelistManager.h
new file mode 100644
index 000000000..8e4ea8fd3
--- /dev/null
+++ b/src/os/bluestore/BitmapFreelistManager.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_BITMAPFREELISTMANAGER_H
+
+#include "FreelistManager.h"
+
+#include <string>
+#include <mutex>
+
+#include "common/ceph_mutex.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+
+// FreelistManager that keeps one bit per block in the key/value DB.
+// Bits are grouped blocks_per_key to a key under bitmap_prefix and are
+// flipped with XOR merges; the geometry is stored under meta_prefix.
+class BitmapFreelistManager : public FreelistManager {
+  std::string meta_prefix, bitmap_prefix;
+  // NOTE(review): not referenced by the implementation file; verify before
+  // removing.
+  std::shared_ptr<KeyValueDB::MergeOperator> merge_op;
+  ceph::mutex lock = ceph::make_mutex("BitmapFreelistManager::lock");
+
+  // All scalars are value-initialised so that getters and the enumeration
+  // code never read indeterminate values before create()/init() ran.
+  uint64_t size = 0;            ///< size of device (bytes)
+  uint64_t bytes_per_block = 0; ///< bytes per block (bdev_block_size)
+  uint64_t blocks_per_key = 0;  ///< blocks (bits) per key/value pair
+  uint64_t bytes_per_key = 0;   ///< bytes per key/value pair
+  uint64_t blocks = 0;          ///< size of device (blocks, size rounded up)
+
+  uint64_t block_mask = 0;  ///< mask to convert byte offset to block offset
+  uint64_t key_mask = 0;    ///< mask to convert offset to key offset
+
+  ceph::buffer::list all_set_bl;
+
+  KeyValueDB::Iterator enumerate_p;
+  uint64_t enumerate_offset = 0; ///< logical offset; position
+  ceph::buffer::list enumerate_bl;   ///< current key at enumerate_offset
+  int enumerate_bl_pos = 0;      ///< bit position in enumerate_bl
+
+  /// byte offset of bit `bit` inside the key that starts at key_off
+  uint64_t _get_offset(uint64_t key_off, int bit) {
+    return key_off + bit * bytes_per_block;
+  }
+
+  void _init_misc();
+
+  void _xor(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn);
+
+  int _read_cfg(
+    std::function<int(const std::string&, std::string*)> cfg_reader);
+
+  /// grow the persisted freelist from old_size to the current size
+  int _expand(uint64_t old_size, KeyValueDB* db);
+
+  uint64_t size_2_block_count(uint64_t target_size) const;
+
+  int read_size_meta_from_db(KeyValueDB* kvdb, uint64_t* res);
+  void _sync(KeyValueDB* kvdb, bool read_only);
+
+  void _load_from_db(KeyValueDB* kvdb);
+
+public:
+  BitmapFreelistManager(CephContext* cct, std::string meta_prefix,
+                        std::string bitmap_prefix);
+
+  static void setup_merge_operator(KeyValueDB *db, std::string prefix);
+
+  int create(uint64_t size, uint64_t granularity,
+             uint64_t zone_size, uint64_t first_sequential_zone,
+             KeyValueDB::Transaction txn) override;
+
+  int init(KeyValueDB *kvdb, bool db_in_read_only,
+    std::function<int(const std::string&, std::string*)> cfg_reader) override;
+
+  void shutdown() override;
+  void sync(KeyValueDB* kvdb) override;
+
+  void dump(KeyValueDB *kvdb) override;
+
+  void enumerate_reset() override;
+  bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) override;
+
+  void allocate(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) override;
+  void release(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) override;
+
+  /// device size in bytes
+  inline uint64_t get_size() const override {
+    return size;
+  }
+  /// number of whole blocks that fit into the device size
+  inline uint64_t get_alloc_units() const override {
+    return size / bytes_per_block;
+  }
+  /// block size in bytes
+  inline uint64_t get_alloc_size() const override {
+    return bytes_per_block;
+  }
+  void get_meta(uint64_t target_size,
+    std::vector<std::pair<std::string, std::string>>*) const override;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
new file mode 100644
index 000000000..710021f07
--- /dev/null
+++ b/src/os/bluestore/BlueFS.cc
@@ -0,0 +1,4682 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include <chrono>
+#include "boost/algorithm/string.hpp" 
+#include "bluestore_common.h"
+#include "BlueFS.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/perf_counters.h"
+#include "Allocator.h"
+#include "include/ceph_assert.h"
+#include "common/admin_socket.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluefs
+#undef dout_prefix
+#define dout_prefix *_dout << "bluefs "
+using TOPNSPC::common::cmd_getval;
+
+using std::byte;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::to_string;
+using std::vector;
+using std::chrono::duration;
+using std::chrono::seconds;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
+			      bluefs_file_reader_buffer, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
+
+// Discard callback installed for BDEV_WAL (see discard_cb[] in the BlueFS
+// constructor).  `priv` is the owning BlueFS, `priv2` an
+// interval_set<uint64_t>; both are forwarded to BlueFS::handle_discard().
+static void wal_discard_cb(void *priv, void* priv2) {
+  auto *fs = static_cast<BlueFS*>(priv);
+  auto *ranges = static_cast<interval_set<uint64_t>*>(priv2);
+  fs->handle_discard(BlueFS::BDEV_WAL, *ranges);
+}
+
+// Discard callback installed for BDEV_DB (see discard_cb[] in the BlueFS
+// constructor).  `priv` is the owning BlueFS, `priv2` an
+// interval_set<uint64_t>; both are forwarded to BlueFS::handle_discard().
+static void db_discard_cb(void *priv, void* priv2) {
+  auto *fs = static_cast<BlueFS*>(priv);
+  auto *ranges = static_cast<interval_set<uint64_t>*>(priv2);
+  fs->handle_discard(BlueFS::BDEV_DB, *ranges);
+}
+
+// Discard callback installed for BDEV_SLOW (see discard_cb[] in the BlueFS
+// constructor).  `priv` is the owning BlueFS, `priv2` an
+// interval_set<uint64_t>; both are forwarded to BlueFS::handle_discard().
+static void slow_discard_cb(void *priv, void* priv2) {
+  auto *fs = static_cast<BlueFS*>(priv);
+  auto *ranges = static_cast<interval_set<uint64_t>*>(priv2);
+  fs->handle_discard(BlueFS::BDEV_SLOW, *ranges);
+}
+
+/// Admin-socket hook exposing BlueFS diagnostics.
+///
+/// Commands handled:
+///  - "bluestore bluefs device info [alloc_size]" : per-device space report
+///  - "bluefs stats"                              : block extents + volume selector dump
+///  - "bluefs files list"                         : per-file, per-device byte usage
+///  - "bluefs debug_inject_read_zeros"            : bumps BlueFS::inject_read_zeros
+///
+/// Instances are only obtained through create(); the destructor unregisters
+/// every command bound to the hook.
+class BlueFS::SocketHook : public AdminSocketHook {
+  BlueFS* bluefs;
+public:
+  /// Builds a hook and registers its commands with the context's admin
+  /// socket.
+  ///
+  /// @return the new hook, or nullptr when there is no admin socket or the
+  ///         first command could not be registered.  Failure to register
+  ///         any of the remaining commands is treated as fatal (assert).
+  static BlueFS::SocketHook* create(BlueFS* bluefs)
+  {
+    BlueFS::SocketHook* hook = nullptr;
+    AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+    if (admin_socket) {
+      hook = new BlueFS::SocketHook(bluefs);
+      int r = admin_socket->register_command("bluestore bluefs device info "
+                                             "name=alloc_size,type=CephInt,req=false",
+                                             hook,
+                                             "Shows space report for bluefs devices. "
+                                             "This also includes an estimation for space "
+                                             "available to bluefs at main device. "
+                                             "alloc_size, if set, specifies the custom bluefs "
+                                             "allocation unit size for the estimation above.");
+      if (r != 0) {
+        ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
+        delete hook;
+        hook = nullptr;
+      } else {
+        r = admin_socket->register_command("bluefs stats",
+                                           hook,
+                                           "Dump internal statistics for bluefs."
+                                           "");
+        ceph_assert(r == 0);
+        r = admin_socket->register_command("bluefs files list", hook,
+                                           "print files in bluefs");
+        ceph_assert(r == 0);
+        r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
+                                           "Injects 8K zeros into next BlueFS read. Debug only.");
+        ceph_assert(r == 0);
+      }
+    }
+    return hook;
+  }
+
+  ~SocketHook() {
+    AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
+    admin_socket->unregister_commands(this);
+  }
+private:
+  SocketHook(BlueFS* bluefs) :
+    bluefs(bluefs) {}
+
+  /// Dispatches one admin-socket command.
+  ///
+  /// @return 0 on success, -EINVAL for an alloc_size that is negative or not
+  ///         a power of two, -ENOSYS for an unknown command.
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           const bufferlist&,
+           Formatter *f,
+           std::ostream& errss,
+           bufferlist& out) override {
+    if (command == "bluestore bluefs device info") {
+      int64_t alloc_size = 0;
+      cmd_getval(cmdmap, "alloc_size", alloc_size);
+      // Negative sizes are rejected up front: they are meaningless, and
+      // INT64_MIN would overflow in the power-of-two test (alloc_size - 1).
+      if (alloc_size < 0 || (alloc_size & (alloc_size - 1)) != 0) {
+        errss << "Invalid allocation size:'" << alloc_size << "'" << std::endl;
+        return -EINVAL;
+      }
+      // 0 (or parameter omitted) means "use the configured shared unit".
+      if (alloc_size == 0)
+        alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
+      f->open_object_section("bluefs_device_info");
+      for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
+        if (bluefs->bdev[dev]) {
+          f->open_object_section("dev");
+          f->dump_string("device", bluefs->get_device_name(dev));
+          ceph_assert(bluefs->alloc[dev]);
+          auto total = bluefs->get_total(dev);
+          auto free = bluefs->get_free(dev);
+          auto used = bluefs->get_used(dev);
+
+          f->dump_int("total", total);
+          f->dump_int("free", free);
+          f->dump_int("bluefs_used", used);
+          if (bluefs->is_shared_alloc(dev)) {
+            size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
+            f->dump_int("bluefs max available", avail);
+          }
+          f->close_section();
+        }
+      }
+
+      f->close_section();
+    } else if (command == "bluefs stats") {
+      std::stringstream ss;
+      bluefs->dump_block_extents(ss);
+      bluefs->dump_volume_selector(ss);
+      out.append(ss);
+    } else if (command == "bluefs files list") {
+      const char* devnames[3] = {"wal","db","slow"};
+      // nodes.lock is held for the whole walk over dir_map/file_map.
+      std::lock_guard l(bluefs->nodes.lock);
+      f->open_array_section("files");
+      for (auto &d : bluefs->nodes.dir_map) {
+        std::string dir = d.first;
+        for (auto &r : d.second->file_map) {
+          f->open_object_section("file");
+          f->dump_string("name", (dir + "/" + r.first).c_str());
+          // Bytes of this file's extents, summed per device id.
+          std::vector<size_t> sizes;
+          sizes.resize(bluefs->bdev.size());
+          for(auto& i : r.second->fnode.extents) {
+            sizes[i.bdev] += i.length;
+          }
+          for (size_t i = 0; i < sizes.size(); i++) {
+            if (sizes[i]>0) {
+              // Ids beyond the named trio are reported as "dev-<id>".
+              if (i < sizeof(devnames) / sizeof(*devnames))
+                f->dump_int(devnames[i], sizes[i]);
+              else
+                f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
+            }
+          }
+          f->close_section();
+        }
+      }
+      f->close_section();
+      f->flush(out);
+    } else if (command == "bluefs debug_inject_read_zeros") {
+      bluefs->inject_read_zeros++;
+    } else {
+      errss << "Invalid command" << std::endl;
+      return -ENOSYS;
+    }
+    return 0;
+  }
+};
+
+// Sizes every per-device table to MAX_BDEV slots, wires the discard
+// callbacks for the WAL/DB/SLOW devices and registers the admin-socket hook.
+BlueFS::BlueFS(CephContext* cct)
+  : cct(cct),
+    bdev(MAX_BDEV),
+    ioc(MAX_BDEV),
+    block_reserved(MAX_BDEV),
+    alloc(MAX_BDEV),
+    alloc_size(MAX_BDEV, 0)
+{
+  dirty.pending_release.resize(MAX_BDEV);
+
+  // Only these three device slots get a discard callback.
+  discard_cb[BDEV_SLOW] = slow_discard_cb;
+  discard_cb[BDEV_DB] = db_discard_cb;
+  discard_cb[BDEV_WAL] = wal_discard_cb;
+
+  // SocketHook::create() may return nullptr.
+  asok_hook = SocketHook::create(this);
+}
+
+// Tears down in three passes: wait on every IO context, close and free the
+// block devices, then free the IO contexts.  The admin-socket hook is
+// removed first.
+BlueFS::~BlueFS()
+{
+  delete asok_hook;
+
+  for (auto* ctx : ioc) {
+    if (ctx) {
+      ctx->aio_wait();
+    }
+  }
+
+  for (auto* dev : bdev) {
+    if (!dev) {
+      continue;
+    }
+    dev->close();
+    delete dev;
+  }
+
+  for (auto* ctx : ioc) {
+    delete ctx;
+  }
+}
+
+// Builds the "bluefs" perf-counter set (indices l_bluefs_first ..
+// l_bluefs_last), stores it in `logger` and adds it to the context's
+// perf-counter collection.  _shutdown_logger() undoes this.
+//
+// NOTE(review): several nicks are duplicated ("walb" for wal_total_bytes and
+// bytes_written_wal, "mxwb" for max_bytes_wal and max_bytes_slow).  They are
+// left untouched here because nicks are user-visible; confirm before renaming.
+void BlueFS::_init_logger()
+{
+  PerfCountersBuilder b(cct, "bluefs",
+                        l_bluefs_first, l_bluefs_last);
+
+  // --- capacity gauges ---
+  b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
+            "Total bytes (main db device)",
+            "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
+            "Used bytes (main db device)",
+            "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
+            "Total bytes (wal device)",
+            "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
+            "Used bytes (wal device)",
+            "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
+            "Total bytes (slow device)",
+            "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
+            "Used bytes (slow device)",
+            "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluefs_num_files, "num_files", "File count",
+            "f", PerfCountersBuilder::PRIO_USEFUL);
+
+  // --- metadata log ---
+  b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
+            "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
+                    "Compactions of the metadata log");
+  b.add_u64_counter(l_bluefs_log_write_count, "log_write_count",
+                    "Write op count to the metadata log");
+  b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
+                    "Bytes written to the metadata log",
+                    "j",
+                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+
+  // --- WAL / SST write accounting ---
+  b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
+                    "Files written to WAL");
+  b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
+                    "Files written to SSTs");
+  b.add_u64_counter(l_bluefs_write_count_wal, "write_count_wal",
+                    "Write op count to WAL");
+  b.add_u64_counter(l_bluefs_write_count_sst, "write_count_sst",
+                    "Write op count to SSTs");
+  // Tagged as bytes like its bytes_written_sst/_slow siblings (the unit was
+  // previously missing on this one counter only).
+  b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
+                    "Bytes written to WAL",
+                    "walb",
+                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
+                    "Bytes written to SSTs",
+                    "sstb",
+                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
+                    "Bytes written to WAL/SSTs at slow device",
+                    "slwb",
+                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+
+  // --- allocation high-water marks ---
+  b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
+                    "Maximum bytes allocated from WAL",
+                    "mxwb",
+                    PerfCountersBuilder::PRIO_INTERESTING,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
+                    "Maximum bytes allocated from DB",
+                    "mxdb",
+                    PerfCountersBuilder::PRIO_INTERESTING,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
+                    "Maximum bytes allocated from SLOW",
+                    "mxwb",
+                    PerfCountersBuilder::PRIO_INTERESTING,
+                    unit_t(UNIT_BYTES));
+
+  // --- allocation unit sizes (set from _init_alloc()) ---
+  b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
+                    "Allocation unit size (in bytes) for primary/shared device",
+                    "aumb",
+                    PerfCountersBuilder::PRIO_CRITICAL,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
+                    "Allocation unit size (in bytes) for standalone DB device",
+                    "audb",
+                    PerfCountersBuilder::PRIO_CRITICAL,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
+                    "Allocation unit size (in bytes) for standalone WAL device",
+                    "auwb",
+                    PerfCountersBuilder::PRIO_CRITICAL,
+                    unit_t(UNIT_BYTES));
+
+  // --- random reads ---
+  b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
+                    "random read requests processed",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
+                    "Bytes requested in random read mode",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
+                    "random reads requests going to disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
+                    "Bytes read from disk in random read mode",
+                    "rrb",
+                    PerfCountersBuilder::PRIO_INTERESTING,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
+                    "random reads requests going to WAL disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
+                    "random reads requests going to DB disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
+                    "random reads requests going to main disk",
+                    "rrsb",
+                    PerfCountersBuilder::PRIO_INTERESTING,
+                    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
+                    "random read requests processed using prefetch buffer",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
+                    "Bytes read from prefetch buffer in random read mode",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+  // --- buffered reads ---
+  b.add_u64_counter(l_bluefs_read_count, "read_count",
+                    "buffered read requests processed",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
+                    "Bytes requested in buffered read mode",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
+                    "buffered reads requests going to disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
+                    "Bytes read in buffered mode from disk",
+                    "rb",
+                    PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
+                    "reads requests going to WAL disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
+                    "reads requests going to DB disk",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
+                    "reads requests going to main disk",
+                    "rsb",
+                    PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
+                    "prefetch read requests processed",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
+                    "Bytes requested in prefetch read mode",
+                    NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+  // --- generic writes ---
+  b.add_u64_counter(l_bluefs_write_count, "write_count",
+                    "Write requests processed");
+  b.add_u64_counter(l_bluefs_write_disk_count, "write_disk_count",
+                    "Write requests sent to disk");
+  b.add_u64_counter(l_bluefs_write_bytes, "write_bytes",
+                    "Bytes written", NULL,
+                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+  // --- log compaction latency ---
+  b.add_time_avg(l_bluefs_compaction_lat, "compact_lat",
+                 "Average bluefs log compaction latency",
+                 "c__t",
+                 PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_time_avg(l_bluefs_compaction_lock_lat, "compact_lock_lat",
+                 "Average lock duration while compacting bluefs log",
+                 "c_lt",
+                 PerfCountersBuilder::PRIO_INTERESTING);
+
+  // --- allocation fallbacks ---
+  b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
+                    "Amount of allocations that required fallback to "
+                    " slow/shared device",
+                    "asdf",
+                    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
+                    "Amount of allocations that required fallback to shared device's "
+                    "regular unit size",
+                    "assf",
+                    PerfCountersBuilder::PRIO_USEFUL);
+
+  // --- all-zero read detection (see _read_and_check()) ---
+  b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
+            "How many times bluefs read found page with all 0s");
+  b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
+            "How many times bluefs read found transient page with all 0s");
+
+  logger = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+}
+
+// Counterpart of _init_logger(): detaches `logger` from the context's
+// perf-counter collection, then frees it.
+void BlueFS::_shutdown_logger()
+{
+  auto* collection = cct->get_perfcounters_collection();
+  collection->remove(logger);
+  delete logger;
+}
+
+// Refreshes the total/used byte gauges of the WAL, DB and SLOW devices.
+// A device without an allocator is skipped.
+void BlueFS::_update_logger_stats()
+{
+  auto publish = [this](unsigned dev, int total_idx, int used_idx) {
+    if (!alloc[dev]) {
+      return;
+    }
+    logger->set(total_idx, _get_total(dev));
+    logger->set(used_idx, _get_used(dev));
+  };
+
+  publish(BDEV_WAL, l_bluefs_wal_total_bytes, l_bluefs_wal_used_bytes);
+  publish(BDEV_DB, l_bluefs_db_total_bytes, l_bluefs_db_used_bytes);
+  publish(BDEV_SLOW, l_bluefs_slow_total_bytes, l_bluefs_slow_used_bytes);
+}
+
+// Opens the block device at `path` and installs it in slot `id`.
+//
+//  trim          - when set, a discard covering the whole device is attempted
+//  reserved      - recorded in block_reserved[id]
+//  _shared_alloc - when non-null the device is opened without the exclusive
+//                  lock and its allocator (`_shared_alloc->a`) is adopted for
+//                  this slot; only one shared device may be registered
+//
+// Returns 0 on success or the negative result of BlockDevice::open().
+int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
+                             uint64_t reserved,
+                             bluefs_shared_alloc_context_t* _shared_alloc)
+{
+  dout(10) << __func__ << " bdev " << id << " path " << path << " "
+           << reserved << dendl;
+  ceph_assert(id < bdev.size());
+  ceph_assert(bdev[id] == NULL);
+
+  const bool shared = (_shared_alloc != nullptr);
+  BlockDevice *dev = BlockDevice::create(cct, path, NULL, NULL,
+                                         discard_cb[id],
+                                         static_cast<void*>(this));
+  block_reserved[id] = reserved;
+  if (shared) {
+    dev->set_no_exclusive_lock();
+  }
+
+  const int rc = dev->open(path);
+  if (rc < 0) {
+    delete dev;
+    return rc;
+  }
+
+  if (trim) {
+    interval_set<uint64_t> everything;
+    everything.insert(0, dev->get_size());
+    dev->try_discard(everything, false);
+  }
+
+  dout(1) << __func__ << " bdev " << id << " path " << path
+          << " size " << byte_u_t(dev->get_size()) << dendl;
+  bdev[id] = dev;
+  ioc[id] = new IOContext(cct, NULL);
+
+  if (shared) {
+    ceph_assert(!shared_alloc);
+    shared_alloc = _shared_alloc;
+    alloc[id] = shared_alloc->a;
+    shared_alloc_id = id;
+  }
+  return 0;
+}
+
+// Forwards to supported_bdev_label() of the device in slot `id`; the slot
+// must be in range and populated.
+bool BlueFS::bdev_support_label(unsigned id)
+{
+  ceph_assert(id < bdev.size());
+  ceph_assert(bdev[id]);
+  auto* dev = bdev[id];
+  return dev->supported_bdev_label();
+}
+
+// Size of the device in slot `id`, or 0 when the slot is out of range or
+// empty.
+uint64_t BlueFS::get_block_device_size(unsigned id) const
+{
+  if (id >= bdev.size() || !bdev[id]) {
+    return 0;
+  }
+  return bdev[id]->get_size();
+}
+
+// Hands `to_release` back to the allocator of device `id`.  For the shared
+// device the released byte count is also subtracted from
+// shared_alloc->bluefs_used.
+void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
+{
+  dout(10) << __func__ << " bdev " << id << dendl;
+  ceph_assert(alloc[id]);
+  alloc[id]->release(to_release);
+  if (!is_shared_alloc(id)) {
+    return;
+  }
+  shared_alloc->bluefs_used -= to_release.size();
+}
+
+// Sum of _get_used() over all MAX_BDEV device slots.
+uint64_t BlueFS::get_used()
+{
+  uint64_t total_used = 0;
+  unsigned dev = 0;
+  while (dev < MAX_BDEV) {
+    total_used += _get_used(dev);
+    ++dev;
+  }
+  return total_used;
+}
+
+// Bytes in use on device `id`:
+//  - 0 when the slot has no allocator,
+//  - shared_alloc->bluefs_used for the shared device,
+//  - otherwise total capacity minus the allocator's free space.
+uint64_t BlueFS::_get_used(unsigned id) const
+{
+  if (!alloc[id]) {
+    return 0;
+  }
+  if (is_shared_alloc(id)) {
+    uint64_t shared_used = shared_alloc->bluefs_used;
+    return shared_used;
+  }
+  uint64_t own_used = _get_total(id) - alloc[id]->get_free();
+  return own_used;
+}
+
+// Checked variant of _get_used(): `id` must be in range and the slot must
+// have an allocator.
+uint64_t BlueFS::get_used(unsigned id)
+{
+  ceph_assert(id < alloc.size());
+  ceph_assert(alloc[id]);
+  const uint64_t used = _get_used(id);
+  return used;
+}
+
+// Device size of slot `id` minus the bytes recorded in block_reserved[id].
+uint64_t BlueFS::_get_total(unsigned id) const
+{
+  ceph_assert(id < bdev.size());
+  ceph_assert(id < block_reserved.size());
+  const uint64_t raw_size = get_block_device_size(id);
+  return raw_size - block_reserved[id];
+}
+
+// Public wrapper around _get_total().
+uint64_t BlueFS::get_total(unsigned id)
+{
+  const uint64_t total = _get_total(id);
+  return total;
+}
+
+// Free bytes reported by the allocator of device `id`.
+//
+// `id` must be in range and the slot must have an allocator; both are
+// asserted, matching get_used(unsigned), so a missing allocator fails with
+// a diagnosable assertion instead of a null dereference.
+uint64_t BlueFS::get_free(unsigned id)
+{
+  ceph_assert(id < alloc.size());
+  ceph_assert(alloc[id]);
+  return alloc[id]->get_free();
+}
+
+// Emits the perf counters held by `logger` into `f`, wrapped in a
+// "bluefs_perf_counters" object section.
+void BlueFS::dump_perf_counters(Formatter *f)
+{
+  constexpr bool schema = false;
+  constexpr bool histograms = false;
+  f->open_object_section("bluefs_perf_counters");
+  logger->dump_formatted(f, schema, histograms);
+  f->close_section();
+}
+
+// Writes one line per usable device to `out`: device id, total size (hex)
+// and bytes in use (hex plus human-readable).
+//
+// Devices with no allocator are skipped as well as empty slots.  This is
+// reachable through the "bluefs stats" admin-socket command, which is
+// registered in the constructor, i.e. possibly while a device has been
+// added but its allocator is not (or no longer) set up; get_free() would
+// otherwise dereference a null allocator.
+void BlueFS::dump_block_extents(ostream& out)
+{
+  for (unsigned i = 0; i < MAX_BDEV; ++i) {
+    if (!bdev[i] || !alloc[i]) {
+      continue;
+    }
+    auto total = get_total(i);
+    auto free = get_free(i);
+
+    out << i << " : device size 0x" << std::hex << total
+        << " : using 0x" << total - free
+        << std::dec << "(" << byte_u_t(total - free) << ")";
+    out << "\n";
+  }
+}
+
+// Calls fn(offset, length) for every extent of every file in
+// nodes.file_map that lives on device `id`.  nodes.lock is held for the
+// whole traversal, including the callbacks.
+void BlueFS::foreach_block_extents(
+  unsigned id,
+  std::function<void(uint64_t, uint32_t)> fn)
+{
+  std::lock_guard nl(nodes.lock);
+  dout(10) << __func__ << " bdev " << id << dendl;
+  ceph_assert(id < alloc.size());
+  for (auto& entry : nodes.file_map) {
+    auto& extents = entry.second->fnode.extents;
+    for (auto& ext : extents) {
+      if (ext.bdev != id) {
+        continue;
+      }
+      fn(ext.offset, ext.length);
+    }
+  }
+}
+
+// Formats a fresh BlueFS on the already-added block devices.
+//
+// Sets up a temporary logger, allocators and (if none was supplied) a
+// default volume selector, allocates the log file (ino 1), writes the
+// initial log transaction and the superblock to BDEV_DB, then tears the
+// temporary state down again.  Always returns 0; allocation failure for the
+// log is an assertion.
+//
+//  osd_uuid - stored in the superblock
+//  layout   - stored in the superblock as memorized_layout
+int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
+{
+  dout(1) << __func__
+	  << " osd_uuid " << osd_uuid
+	  << dendl;
+
+  // set volume selector if not provided before/outside
+  // (each device is passed in at 95% of its raw size)
+  if (vselector == nullptr) {
+    vselector.reset(
+      new OriginalVolumeSelector(
+        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+  }
+
+  // _init_alloc() writes to `logger`, so the logger has to exist first.
+  _init_logger();
+  _init_alloc();
+
+  super.version = 0;
+  super.block_size = bdev[BDEV_DB]->get_block_size();
+  super.osd_uuid = osd_uuid;
+  super.uuid.generate_random();
+  dout(1) << __func__ << " uuid " << super.uuid << dendl;
+
+  // init log
+  FileRef log_file = ceph::make_ref<File>();
+  log_file->fnode.ino = 1;
+  log_file->vselector_hint = vselector->get_hint_for_log();
+  int r = _allocate(
+    vselector->select_prefer_bdev(log_file->vselector_hint),
+    cct->_conf->bluefs_max_log_runway,
+    0,
+    &log_file->fnode);
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+  ceph_assert(r == 0);
+  log.writer = _create_writer(log_file);
+
+  // initial txn
+  ceph_assert(log.seq_live == 1);
+  log.t.seq = 1;
+  log.t.op_init();
+  _flush_and_sync_log_LD();
+
+  // write supers
+  super.log_fnode = log_file->fnode;
+  super.memorized_layout = layout;
+  _write_super(BDEV_DB);
+  _flush_bdev();
+
+  // clean up
+  // (everything set up above is transient; the instance is left unmounted)
+  super = bluefs_super_t();
+  _close_writer(log.writer);
+  log.writer = NULL;
+  vselector.reset(nullptr);
+  _stop_alloc();
+  _shutdown_logger();
+  if (shared_alloc) {
+    ceph_assert(shared_alloc->need_init);
+    shared_alloc->need_init = false;
+  }
+
+  dout(10) << __func__ << " success" << dendl;
+  return 0;
+}
+
+// Chooses the allocation unit for every device slot, publishes the units to
+// the perf counters, and creates an allocator for each attached device that
+// does not use the shared allocator.  Requires `logger` to be initialised.
+void BlueFS::_init_alloc()
+{
+  dout(20) << __func__ << dendl;
+
+  // WAL unit: bluefs_alloc_size when a WAL device exists, otherwise 0.
+  size_t wal_alloc_size = 0;
+  if (bdev[BDEV_WAL]) {
+    wal_alloc_size = cct->_conf->bluefs_alloc_size;
+    alloc_size[BDEV_WAL] = wal_alloc_size;
+  }
+  logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
+
+
+  // Shared unit: the configured value, raised to the shared allocator's
+  // block size if that is larger; it must be a multiple of that block size.
+  uint64_t shared_alloc_size = cct->_conf->bluefs_shared_alloc_size;
+  if (shared_alloc && shared_alloc->a) {
+    uint64_t unit = shared_alloc->a->get_block_size();
+    shared_alloc_size = std::max(
+      unit,
+      shared_alloc_size);
+    ceph_assert(0 == p2phase(shared_alloc_size, unit));
+  }
+  // With a SLOW device present DB uses bluefs_alloc_size and SLOW the shared
+  // unit; without one, DB takes the shared unit and SLOW gets 0.
+  if (bdev[BDEV_SLOW]) {
+    alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
+    alloc_size[BDEV_SLOW] = shared_alloc_size;
+  } else {
+    alloc_size[BDEV_DB] = shared_alloc_size;
+    alloc_size[BDEV_SLOW] = 0;
+  }
+  logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
+  logger->set(l_bluefs_main_alloc_unit, alloc_size[BDEV_SLOW]);
+  // new wal and db devices are never shared
+  if (bdev[BDEV_NEWWAL]) {
+    alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
+  }
+  if (bdev[BDEV_NEWDB]) {
+    alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
+  }
+
+  for (unsigned id = 0; id < bdev.size(); ++id) {
+    if (!bdev[id]) {
+      continue;
+    }
+    ceph_assert(bdev[id]->get_size());
+    if (is_shared_alloc(id)) {
+      // The shared allocator was already installed by add_block_device();
+      // only log it here.
+      dout(1) << __func__ << " shared, id " << id << std::hex
+              << ", capacity 0x" << bdev[id]->get_size()
+              << ", block size 0x" << alloc_size[id]
+              << std::dec << dendl;
+    } else {
+      ceph_assert(alloc_size[id]);
+      // Allocator name: "bluefs-wal|db|slow" for the first three slots,
+      // otherwise "bluefs-" followed by this object's address as a number.
+      std::string name = "bluefs-";
+      const char* devnames[] = { "wal","db","slow" };
+      if (id <= BDEV_SLOW)
+        name += devnames[id];
+      else
+        name += to_string(uintptr_t(this));
+      dout(1) << __func__ << " new, id " << id << std::hex
+              << ", allocator name " << name
+              << ", allocator type " << cct->_conf->bluefs_allocator
+              << ", capacity 0x" << bdev[id]->get_size()
+              << ", block size 0x" << alloc_size[id]
+              << std::dec << dendl;
+      alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
+				    bdev[id]->get_size(),
+				    alloc_size[id],
+				    0, 0,
+				    name);
+      // Free space starts at block_reserved[id] and spans _get_total(id)
+      // bytes, i.e. everything past the reserved prefix.
+      alloc[id]->init_add_free(
+        block_reserved[id],
+        _get_total(id));
+    }
+  }
+}
+
+// Drains pending discards on every attached device, then shuts down and
+// frees each allocator owned by BlueFS.  The shared allocator is left
+// untouched.
+void BlueFS::_stop_alloc()
+{
+  dout(20) << __func__ << dendl;
+  for (auto* dev : bdev) {
+    if (dev) {
+      dev->discard_drain();
+    }
+  }
+
+  for (size_t id = 0; id < alloc.size(); ++id) {
+    if (!alloc[id] || is_shared_alloc(id)) {
+      continue;
+    }
+    alloc[id]->shutdown();
+    delete alloc[id];
+    alloc[id] = nullptr;
+  }
+}
+
+// Reads `len` bytes at `off` from device `ndev` and appends them to `*pbl`,
+// guarding against spurious all-zero blocks.
+//
+// Every block-aligned, block-sized piece of the data is inspected; if one of
+// them is entirely zero the read is repeated and the second result is what
+// gets appended.  A mismatch between the two reads is logged and counted in
+// l_bluefs_read_zeros_errors.  When inject_read_zeros is non-zero and the
+// read spans at least two blocks, two blocks' worth of zeros are spliced
+// into the middle of the first read (debug aid).
+//
+// Returns 0 on success, otherwise the non-zero result of _bdev_read(); on
+// failure nothing is appended to `*pbl`.
+int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
+                            ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
+{
+  dout(10) << __func__ << " dev " << int(ndev)
+           << ": 0x" << std::hex << off << "~" << len << std::dec
+           << (buffered ? " buffered" : "")
+           << dendl;
+  int r;
+  bufferlist bl;
+  r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
+  if (r != 0) {
+    return r;
+  }
+  uint64_t block_size = bdev[ndev]->get_block_size();
+  if (inject_read_zeros) {
+    if (len >= block_size * 2) {
+      derr << __func__ << " injecting error, zeros at "
+           << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+           << "~" << (block_size * 2) << std::dec << dendl;
+      //use beginning, replace 8K in the middle with zeros, use tail
+      bufferlist temp;
+      bl.splice(0, len / 2 - block_size, &temp);
+      temp.append(buffer::create(block_size * 2, 0));
+      bl.splice(block_size * 2, len / 2 - block_size, &temp);
+      bl = temp;
+      inject_read_zeros--;
+    }
+  }
+  //make a check if there is a block with all 0
+  uint64_t to_check_len = len;
+  // bytes up to the first block boundary; they are never inspected
+  uint64_t skip = p2nphase(off, block_size);
+  if (skip >= to_check_len) {
+    // No aligned block inside the range, so nothing to verify -- but the
+    // data still has to reach the caller (it used to be dropped here while
+    // reporting success).
+    pbl->append(bl);
+    return r;
+  }
+  auto it = bl.begin(skip);
+  to_check_len -= skip;
+  bool all_zeros = false;
+  while (all_zeros == false && to_check_len >= block_size) {
+    // checking 0s step
+    unsigned block_left = block_size;
+    unsigned avail;
+    const char* data;
+    all_zeros = true;
+    // a block may be spread over several buffer segments
+    while (all_zeros && block_left > 0) {
+      avail = it.get_ptr_and_advance(block_left, &data);
+      block_left -= avail;
+      all_zeros = mem_is_zero(data, avail);
+    }
+    // skipping step
+    // (non-zero data found: move the iterator to the end of this block)
+    while (block_left > 0) {
+      avail = it.get_ptr_and_advance(block_left, &data);
+      block_left -= avail;
+    }
+    to_check_len -= block_size;
+  }
+  if (all_zeros) {
+    logger->inc(l_bluefs_read_zeros_candidate, 1);
+    bufferlist bl_reread;
+    r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
+    if (r != 0) {
+      return r;
+    }
+    // check if both read gave the same
+    if (!bl.contents_equal(bl_reread)) {
+      // report problems to log, but continue, maybe it will be good now...
+      derr << __func__ << " initial read of " << int(ndev)
+           << ": 0x" << std::hex << off << "~" << len
+           << std::dec << ": different then re-read " << dendl;
+      logger->inc(l_bluefs_read_zeros_errors, 1);
+    }
+    // use second read will be better if is different
+    pbl->append(bl_reread);
+  } else {
+    pbl->append(bl);
+  }
+  return r;
+}
+
+// Random-read counterpart of _read_and_check(): fills `buf` with `len`
+// bytes from `off` on device `ndev`.  If any block-aligned, block-sized
+// piece of the result is all zeros the read is repeated; when the second
+// read differs it is logged, counted in l_bluefs_read_zeros_errors and
+// copied over `buf`.  With inject_read_zeros set and at least two blocks
+// read, the middle two blocks of `buf` are zeroed first (debug aid).
+//
+// Returns 0 on success or the non-zero result of _bdev_read_random().
+int BlueFS::_read_random_and_check(
+  uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+  dout(10) << __func__ << " dev " << int(ndev)
+           << ": 0x" << std::hex << off << "~" << len << std::dec
+           << (buffered ? " buffered" : "")
+           << dendl;
+  int r = _bdev_read_random(ndev, off, len, buf, buffered);
+  if (r != 0) {
+    return r;
+  }
+
+  const uint64_t block_size = bdev[ndev]->get_block_size();
+  if (inject_read_zeros && len >= block_size * 2) {
+    derr << __func__ << " injecting error, zeros at "
+         << int(ndev) << ": 0x" << std::hex << (off + len / 2)
+         << "~" << (block_size * 2) << std::dec << dendl;
+    //zero middle 8K
+    memset(buf + len / 2 - block_size, 0, block_size * 2);
+    inject_read_zeros--;
+  }
+
+  // Only whole blocks starting at a block boundary are examined.
+  const uint64_t lead = p2nphase(off, block_size);
+  if (lead >= len) {
+    return r;
+  }
+  const char* cursor = buf + lead;
+  uint64_t remaining = len - lead;
+
+  bool zero_block_seen = false;
+  for (; remaining >= block_size;
+       cursor += block_size, remaining -= block_size) {
+    if (mem_is_zero(cursor, block_size)) {
+      // at least one block is all zeros
+      zero_block_seen = true;
+      break;
+    }
+  }
+  if (!zero_block_seen) {
+    return r;
+  }
+
+  logger->inc(l_bluefs_read_zeros_candidate, 1);
+  std::unique_ptr<char[]> second(new char[len]);
+  r = _bdev_read_random(ndev, off, len, &second[0], buffered);
+  if (r != 0) {
+    return r;
+  }
+  // check if both read gave the same
+  if (memcmp(buf, &second[0], len) != 0) {
+    derr << __func__ << " initial read of " << int(ndev)
+         << ": 0x" << std::hex << off << "~" << len
+         << std::dec << ": different then re-read " << dendl;
+    logger->inc(l_bluefs_read_zeros_errors, 1);
+    // second read is probably better
+    memcpy(buf, &second[0], len);
+  }
+  return r;
+}
+
+// Reads from device `ndev` via BlockDevice::read().  For WAL, DB and SLOW
+// the matching read_disk_bytes_* counter is bumped by `len` beforehand;
+// other devices are not accounted.
+int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
+  ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
+{
+  int counter = 0;
+  if (ndev == BDEV_WAL) {
+    counter = l_bluefs_read_disk_bytes_wal;
+  } else if (ndev == BDEV_DB) {
+    counter = l_bluefs_read_disk_bytes_db;
+  } else if (ndev == BDEV_SLOW) {
+    counter = l_bluefs_read_disk_bytes_slow;
+  }
+  if (counter != 0) {
+    logger->inc(counter, len);
+  }
+  auto* dev = bdev[ndev];
+  return dev->read(off, len, pbl, ioc, buffered);
+}
+
+// Reads from device `ndev` via BlockDevice::read_random().  For WAL, DB and
+// SLOW the matching read_random_disk_bytes_* counter is bumped by `len`
+// beforehand; other devices are not accounted.
+int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
+  char* buf, bool buffered)
+{
+  int counter = 0;
+  if (ndev == BDEV_WAL) {
+    counter = l_bluefs_read_random_disk_bytes_wal;
+  } else if (ndev == BDEV_DB) {
+    counter = l_bluefs_read_random_disk_bytes_db;
+  } else if (ndev == BDEV_SLOW) {
+    counter = l_bluefs_read_random_disk_bytes_slow;
+  }
+  if (counter != 0) {
+    logger->inc(counter, len);
+  }
+  auto* dev = bdev[ndev];
+  return dev->read_random(off, len, buf, buffered);
+}
+
+// Mounts BlueFS: opens the superblock, sets up allocators, replays the log,
+// marks every file extent as allocated and prepares the log writer for
+// appending.
+//
+// Returns 0 on success.  On failure the negative error from _open_super()
+// or _replay() is returned, the in-memory superblock is reset and the perf
+// counters created here are released again (previously they were leaked
+// and left registered, so a failed mount could never be cleanly retried).
+int BlueFS::mount()
+{
+  dout(1) << __func__ << dendl;
+
+  _init_logger();
+  int r = _open_super();
+  if (r < 0) {
+    derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
+    goto out;
+  }
+
+  // set volume selector if not provided before/outside
+  if (vselector == nullptr) {
+    vselector.reset(
+      new OriginalVolumeSelector(
+        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
+  }
+
+  _init_alloc();
+
+  r = _replay(false, false);
+  if (r < 0) {
+    derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+    _stop_alloc();
+    goto out;
+  }
+
+  // init freelist
+  for (auto& p : nodes.file_map) {
+    dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
+    for (auto& q : p.second->fnode.extents) {
+      bool is_shared = is_shared_alloc(q.bdev);
+      ceph_assert(!is_shared || (is_shared && shared_alloc));
+      // Extents on the shared device are only accounted while the shared
+      // allocator still needs initialisation; extents on BlueFS-owned
+      // devices are always removed from the free set.
+      if (is_shared && shared_alloc->need_init && shared_alloc->a) {
+        shared_alloc->bluefs_used += q.length;
+        alloc[q.bdev]->init_rm_free(q.offset, q.length);
+      } else if (!is_shared) {
+        alloc[q.bdev]->init_rm_free(q.offset, q.length);
+      }
+    }
+  }
+  if (shared_alloc) {
+    shared_alloc->need_init = false;
+    dout(1) << __func__ << " shared_bdev_used = "
+            << shared_alloc->bluefs_used << dendl;
+  } else {
+    dout(1) << __func__ << " shared bdev not used"
+            << dendl;
+  }
+
+  // set up the log for future writes
+  log.writer = _create_writer(_get_file(1));
+  ceph_assert(log.writer->file->fnode.ino == 1);
+  log.writer->pos = log.writer->file->fnode.size;
+  log.writer->file->fnode.reset_delta();
+  dout(10) << __func__ << " log write pos set to 0x"
+           << std::hex << log.writer->pos << std::dec
+           << dendl;
+  // update log size
+  logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
+  return 0;
+
+ out:
+  super = bluefs_super_t();
+  // Both failure paths are taken after _init_logger(); undo it here.
+  _shutdown_logger();
+  return r;
+}
+
+/**
+ * Compare `layout` with the layout memorized in the superblock, if any.
+ *
+ * @return 0 when the superblock holds no memorized layout or when it
+ *         equals `layout`; -EIO when the two differ.
+ */
+int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
+{
+  // nothing recorded -> nothing to verify
+  if (!super.memorized_layout) {
+    dout(10) << __func__ << " no memorized_layout in bluefs superblock"
+             << dendl;
+    return 0;
+  }
+
+  if (!(layout == *super.memorized_layout)) {
+    derr << __func__ << " memorized layout doesn't fit current one" << dendl;
+    return -EIO;
+  }
+
+  dout(10) << __func__ << " bluefs layout verified positively" << dendl;
+  return 0;
+}
+
+/**
+ * Take BlueFS offline: sync metadata, optionally verify the volume
+ * selector, close the log writer and drop all in-memory state
+ * (pending log transaction, volume selector, allocators, file and
+ * directory maps, superblock, logger).
+ *
+ * @param avoid_compact forwarded unchanged to sync_metadata()
+ */
+void BlueFS::umount(bool avoid_compact)
+{
+  dout(1) << __func__ << dendl;
+
+  sync_metadata(avoid_compact);
+
+  const bool verify_vselector =
+    cct->_conf->bluefs_check_volume_selector_on_umount;
+  if (verify_vselector) {
+    _check_vselector_LNF();
+  }
+
+  // tear down the log
+  _close_writer(log.writer);
+  log.writer = nullptr;
+  log.t.clear();
+
+  // release volume selector and allocators
+  vselector.reset(nullptr);
+  _stop_alloc();
+
+  // forget the namespace and the superblock
+  nodes.file_map.clear();
+  nodes.dir_map.clear();
+  super = bluefs_super_t();
+
+  _shutdown_logger();
+}
+
+/**
+ * Rewrite the log and the layout after a new DB or WAL device was attached.
+ *
+ * @param id     BDEV_NEWDB or BDEV_NEWWAL
+ * @param layout layout passed on to _rewrite_log_and_layout_sync_LNF_LD()
+ * @return 0 on success, -EINVAL when `id` is neither BDEV_NEWDB nor
+ *         BDEV_NEWWAL (previously only guarded by a plain assert(), i.e.
+ *         silently reported as success in NDEBUG builds)
+ */
+int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
+{
+  dout(1) << __func__ << dendl;
+
+  switch (id) {
+  case BDEV_NEWDB:
+    {
+      // the log stays on WAL when there is one, otherwise it moves to the
+      // new DB device (named BDEV_DB once the rename took place)
+      int new_log_dev_cur = BDEV_WAL;
+      int new_log_dev_next = BDEV_WAL;
+      if (!bdev[BDEV_WAL]) {
+        new_log_dev_cur = BDEV_NEWDB;
+        new_log_dev_next = BDEV_DB;
+      }
+      _rewrite_log_and_layout_sync_LNF_LD(false,
+        BDEV_NEWDB,
+        new_log_dev_cur,
+        new_log_dev_next,
+        RENAME_DB2SLOW,
+        layout);
+    }
+    break;
+
+  case BDEV_NEWWAL:
+    _rewrite_log_and_layout_sync_LNF_LD(false,
+      BDEV_DB,
+      BDEV_NEWWAL,
+      BDEV_WAL,
+      REMOVE_WAL,
+      layout);
+    break;
+
+  default:
+    derr << __func__ << " unexpected device id " << id << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Add DB and WAL device metadata to `*pm`, using the key prefixes
+ * "bluefs_db_" and "bluefs_wal_".
+ *
+ * @param pm            output map handed to each device's collect_metadata()
+ * @param skip_bdev_id  when equal to BDEV_DB the DB device is not reported;
+ *                      the WAL device is reported regardless of this value
+ */
+void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
+{
+  const bool report_db = (skip_bdev_id != BDEV_DB) && bdev[BDEV_DB];
+  if (report_db) {
+    bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
+  }
+
+  const bool report_wal = bdev[BDEV_WAL];
+  if (report_wal) {
+    bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
+  }
+}
+
+/**
+ * Let every attached block device (slots 0 .. MAX_BDEV-1) add its device
+ * names to `*ls`; empty slots are skipped.
+ */
+void BlueFS::get_devices(set<string> *ls)
+{
+  for (unsigned id = 0; id != MAX_BDEV; ++id) {
+    auto& dev = bdev[id];
+    if (!dev) {
+      continue;
+    }
+    dev->get_devices(ls);
+  }
+}
+
+/**
+ * Consistency check entry point. Currently only logs the call and
+ * reports success.
+ *
+ * @return always 0
+ */
+int BlueFS::fsck()
+{
+  // No dedicated checks here: the original author's note suggests
+  // everything is believed to be verified during mount already.
+  dout(1) << __func__ << dendl;
+  return 0;
+}
+
+/**
+ * Bump the superblock version, encode the superblock followed by its
+ * crc32c, zero-pad to get_super_length() and write it to device `dev`
+ * at get_super_offset().
+ *
+ * @param dev index into bdev[] of the device receiving the superblock
+ * @return 0 on success, otherwise the negative error returned by the
+ *         device write (previously ignored)
+ */
+int BlueFS::_write_super(int dev)
+{
+  ++super.version;
+  // build superblock
+  bufferlist bl;
+  encode(super, bl);
+  // the checksum covers the encoded superblock only and is appended to it
+  uint32_t crc = bl.crc32c(-1);
+  encode(crc, bl);
+  dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
+  dout(10) << __func__ << " superblock " << super.version << dendl;
+  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+  ceph_assert_always(bl.length() <= get_super_length());
+  bl.append_zero(get_super_length() - bl.length());
+
+  int r = bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
+  if (r < 0) {
+    derr << __func__ << " failed to write superblock v " << super.version
+         << " to bdev " << dev << ": " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  dout(20) << __func__ << " v " << super.version
+           << " crc 0x" << std::hex << crc
+           << " offset 0x" << get_super_offset() << std::dec
+           << dendl;
+  return 0;
+}
+
+/**
+ * Read the superblock from BDEV_DB into `super` and validate its crc32c.
+ *
+ * @return 0 on success; the negative error from _bdev_read(); -EIO when
+ *         the block cannot be decoded or the stored checksum does not
+ *         match the computed one
+ */
+int BlueFS::_open_super()
+{
+  dout(10) << __func__ << dendl;
+
+  bufferlist bl;
+  uint32_t expected_crc = 0, crc = 0;
+  int r;
+
+  // always the second block
+  r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
+                 &bl, ioc[BDEV_DB], false);
+  if (r < 0)
+    return r;
+
+  // Decoding happens before the checksum can be verified (the encoded
+  // length is only known afterwards), so a damaged block may make decode()
+  // throw; report that as a regular I/O error instead of letting the
+  // exception escape.
+  try {
+    auto p = bl.cbegin();
+    decode(super, p);
+    {
+      // checksum covers exactly the bytes consumed by the superblock
+      bufferlist t;
+      t.substr_of(bl, 0, p.get_off());
+      crc = t.crc32c(-1);
+    }
+    decode(expected_crc, p);
+  } catch (const ceph::buffer::error& e) {
+    derr << __func__ << " failed to decode superblock: " << e.what()
+         << dendl;
+    return -EIO;
+  }
+  if (crc != expected_crc) {
+    derr << __func__ << " bad crc on superblock, expected 0x"
+         << std::hex << expected_crc << " != actual 0x" << crc << std::dec
+         << dendl;
+    return -EIO;
+  }
+  dout(10) << __func__ << " superblock " << super.version << dendl;
+  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+  return 0;
+}
+
+/**
+ * Validate the extents of `fnode` against the per-device usage bitmaps
+ * and flip the corresponding bits.
+ *
+ * @param fnode       file node whose extents are checked
+ * @param used_blocks array of bitmaps indexed by device id, one bit per
+ *                    device block
+ * @param is_alloc    true when allocating (bits must be clear), false when
+ *                    deallocating (bits must be set)
+ * @param op_name     label used in error messages
+ * @return 0 on success, the error from _verify_alloc_granularity(), or
+ *         -EFAULT when a block is referenced twice / freed twice
+ */
+int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
+  boost::dynamic_bitset<uint64_t>* used_blocks,
+  bool is_alloc, //true when allocating, false when deallocating
+  const char* op_name)
+{
+  for (const auto& ext : fnode.extents) {
+    const auto id = ext.bdev;
+    ceph_assert(id < MAX_BDEV);
+    ceph_assert(bdev[id]);
+    // let's use minimal allocation unit we can have
+    const auto alloc_unit = bdev[id]->get_block_size();
+
+    const int r = _verify_alloc_granularity(id, ext.offset, ext.length,
+                                            alloc_unit, op_name);
+    if (r < 0) {
+      return r;
+    }
+
+    // a bit already in the requested state is a conflict; otherwise toggle
+    bool conflict = false;
+    auto toggle = [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+      if (bs.test(pos) == is_alloc) {
+        conflict = true;
+      } else {
+        bs.flip(pos);
+      }
+    };
+    apply_for_bitset_range(ext.offset, ext.length, alloc_unit,
+                           used_blocks[id], toggle);
+    if (!conflict) {
+      continue;
+    }
+
+    derr << __func__ << " " << op_name << " invalid extent " << int(ext.bdev)
+      << ": 0x" << std::hex << ext.offset << "~" << ext.length << std::dec
+      << (is_alloc ?
+          ": duplicate reference, ino " : ": double free, ino ")
+      << fnode.ino << dendl;
+    return -EFAULT;
+  }
+  return 0;
+}
+
+/**
+ * Check that both `offset` and `length` are multiples of `alloc_unit`.
+ * The test is done by masking with (alloc_unit - 1).
+ *
+ * @param id  device id, used in the error message only
+ * @param op  operation label, used in the error message only
+ * @return 0 when aligned, -EFAULT otherwise
+ */
+int BlueFS::_verify_alloc_granularity(
+  __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
+{
+  const uint64_t low_bits = alloc_unit - 1;
+  const bool aligned = ((offset | length) & low_bits) == 0;
+  if (aligned) {
+    return 0;
+  }
+
+  derr << __func__ << " " << op << " of " << (int)id
+       << ":0x" << std::hex << offset << "~" << length << std::dec
+       << " does not align to alloc_size 0x"
+       << std::hex << alloc_unit << std::dec << dendl;
+  return -EFAULT;
+}
+
+/**
+ * Replay the BlueFS log described by super.log_fnode.
+ *
+ * Transactions are read block by block and applied in order until a
+ * record with a foreign uuid, an unexpected sequence number, or a
+ * truncated/undecodable multi-block transaction is met.
+ *
+ * @param noop      when true no directory/file state is rebuilt; only the
+ *                  fnode of the log file itself (ino 1) is kept up to date
+ * @param to_stdout when true every decoded transaction and op is echoed
+ *                  to std::cout in addition to the debug log
+ * @return 0 on success; -EIO on an undecodable transaction, an unknown op
+ *         or (in !noop mode) a file with zero link count; the error from
+ *         _check_allocations() when allocation checking is enabled
+ *
+ * The FileReader used for the log is heap allocated; every return path
+ * below releases it.
+ */
+int BlueFS::_replay(bool noop, bool to_stdout)
+{
+  dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
+  ino_last = 1;  // by the log
+  uint64_t log_seq = 0;
+
+  FileRef log_file;
+  log_file = _get_file(1);
+
+  log_file->fnode = super.log_fnode;
+  if (!noop) {
+    log_file->vselector_hint =
+      vselector->get_hint_for_log();
+  }
+  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
+  if (unlikely(to_stdout)) {
+    std::cout << " log_fnode " << super.log_fnode << std::endl;
+  } 
+
+  FileReader *log_reader = new FileReader(
+    log_file, cct->_conf->bluefs_max_prefetch,
+    false,  // !random
+    true);  // ignore eof
+
+  bool seen_recs = false;
+
+  boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
+
+  if (!noop) {
+    if (cct->_conf->bluefs_log_replay_check_allocations) {
+      for (size_t i = 0; i < MAX_BDEV; ++i) {
+	if (bdev[i] != nullptr) {
+          // let's use minimal allocation unit we can have
+          auto au = bdev[i]->get_block_size();
+          //hmm... on 32TB/4K drive this would take 1GB RAM!!!
+	  used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
+	}
+      }
+      // check initial log layout
+      int r = _check_allocations(log_file->fnode,
+				 used_blocks, true, "Log from super");
+      if (r < 0) {
+	delete log_reader;
+	return r;
+      }
+    }
+  }
+  
+  while (true) {
+    ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
+    uint64_t pos = log_reader->buf.pos;
+    uint64_t read_pos = pos;
+    bufferlist bl;
+    {
+      int r = _read(log_reader, read_pos, super.block_size,
+		    &bl, NULL);
+      if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
+	r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
+      }
+      assert(r == (int)super.block_size);
+      read_pos += r;
+    }
+    uint64_t more = 0;
+    uint64_t seq;
+    uuid_d uuid;
+    {
+      // peek at the transaction header to learn its total length
+      auto p = bl.cbegin();
+      __u8 a, b;
+      uint32_t len;
+      decode(a, p);
+      decode(b, p);
+      decode(len, p);
+      decode(uuid, p);
+      decode(seq, p);
+      if (len + 6 > bl.length()) {
+	more = round_up_to(len + 6 - bl.length(), super.block_size);
+      }
+    }
+    if (uuid != super.uuid) {
+      if (seen_recs) {
+	dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+		 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
+		 << dendl;
+      } else {
+	derr << __func__ << " 0x" << std::hex << pos << std::dec
+		 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
+		 << ", block dump: \n";
+	bufferlist t;
+	t.substr_of(bl, 0, super.block_size);
+	t.hexdump(*_dout);
+	*_dout << dendl;
+      }
+      break;
+    }
+    if (seq != log_seq + 1) {
+      if (seen_recs) {
+	dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+		 << ": stop: seq " << seq << " != expected " << log_seq + 1
+		 << dendl;;
+      } else {
+	derr << __func__ << " 0x" << std::hex << pos << std::dec
+	     << ": stop: seq " << seq << " != expected " << log_seq + 1
+	     << dendl;;
+      }
+      break;
+    }
+    if (more) {
+      dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
+               << " more bytes" << dendl;
+      bufferlist t;
+      int r = _read(log_reader, read_pos, more, &t, NULL);
+      if (r < (int)more) {
+	dout(10) << __func__ << " 0x" << std::hex << pos
+                 << ": stop: len is 0x" << bl.length() + more << std::dec
+                 << ", which is past eof" << dendl;
+	if (cct->_conf->bluefs_replay_recovery) {
+	  //try to search for more data
+	  r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
+	  if (r < (int)more) {
+	    //in normal mode we must read r==more, for recovery it is too strict
+	    break;
+	  }
+	}
+      }
+      ceph_assert(r == (int)more);
+      bl.claim_append(t);
+      read_pos += r;
+    }
+    bluefs_transaction_t t;
+    try {
+      auto p = bl.cbegin();
+      decode(t, p);
+      seen_recs = true;
+    }
+    catch (ceph::buffer::error& e) {
+      // Multi-block transactions might be incomplete due to unexpected
+      // power off. Hence let's treat that as a regular stop condition.
+      if (seen_recs && more) {
+        dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+                 << ": stop: failed to decode: " << e.what()
+                 << dendl;
+      } else {
+        derr << __func__ << " 0x" << std::hex << pos << std::dec
+             << ": stop: failed to decode: " << e.what()
+             << dendl;
+        delete log_reader;
+        return -EIO;
+      }
+      break;
+    }
+    ceph_assert(seq == t.seq);
+    dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
+             << ": " << t << dendl;
+    if (unlikely(to_stdout)) {
+      std::cout << " 0x" << std::hex << pos << std::dec
+                << ": " << t << std::endl;
+    }
+
+    auto p = t.op_bl.cbegin();
+    auto pos0 = pos;
+    while (!p.end()) {
+      pos = pos0 + p.get_off();
+      __u8 op;
+      decode(op, p);
+      switch (op) {
+
+      case bluefs_transaction_t::OP_INIT:
+	dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                 << ":  op_init" << dendl;
+        if (unlikely(to_stdout)) {
+          std::cout << " 0x" << std::hex << pos << std::dec
+                    << ":  op_init" << std::endl;
+        }
+
+	ceph_assert(t.seq == 1);
+	break;
+
+      case bluefs_transaction_t::OP_JUMP:
+        {
+	  uint64_t next_seq;
+	  uint64_t offset;
+	  decode(next_seq, p);
+	  decode(offset, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+		   << ":  op_jump seq " << next_seq
+		   << " offset 0x" << std::hex << offset << std::dec << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_jump seq " << next_seq
+                      << " offset 0x" << std::hex << offset << std::dec
+                      << std::endl;
+          }
+
+	  ceph_assert(next_seq > log_seq);
+	  log_seq = next_seq - 1; // we will increment it below
+	  uint64_t skip = offset - read_pos;
+	  if (skip) {
+	    bufferlist junk;
+	    int r = _read(log_reader, read_pos, skip, &junk,
+			  NULL);
+	    if (r != (int)skip) {
+	      dout(10) << __func__ << " 0x" << std::hex << read_pos
+		       << ": stop: failed to skip to " << offset
+		       << std::dec << dendl;
+	      ceph_abort_msg("problem with op_jump");
+	    }
+	  }
+	}
+	break;
+
+      case bluefs_transaction_t::OP_JUMP_SEQ:
+        {
+	  uint64_t next_seq;
+	  decode(next_seq, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_jump_seq " << next_seq << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_jump_seq " << next_seq << std::endl;
+          }
+
+	  ceph_assert(next_seq > log_seq);
+	  log_seq = next_seq - 1; // we will increment it below
+	}
+	break;
+
+      case bluefs_transaction_t::OP_ALLOC_ADD:
+	// LEGACY, do nothing but read params
+        {
+          __u8 id;
+          uint64_t offset, length;
+          decode(id, p);
+          decode(offset, p);
+          decode(length, p);
+        }
+	break;
+
+      case bluefs_transaction_t::OP_ALLOC_RM:
+	// LEGACY, do nothing but read params
+        {
+          __u8 id;
+          uint64_t offset, length;
+          decode(id, p);
+          decode(offset, p);
+          decode(length, p);
+        }
+        break;
+
+      case bluefs_transaction_t::OP_DIR_LINK:
+        {
+	  string dirname, filename;
+	  uint64_t ino;
+	  decode(dirname, p);
+	  decode(filename, p);
+	  decode(ino, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_dir_link " << " " << dirname << "/" << filename
+                   << " to " << ino
+		   << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_dir_link " << " " << dirname << "/" << filename
+                      << " to " << ino
+                      << std::endl;
+          }
+
+	  if (!noop) {
+	    FileRef file = _get_file(ino);
+	    ceph_assert(file->fnode.ino);
+	    map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
+	    ceph_assert(q != nodes.dir_map.end());
+	    map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+	    ceph_assert(r == q->second->file_map.end());
+
+            // the hint depends on the directory, so move the usage
+            // accounting from the old hint to the new one
+            vselector->sub_usage(file->vselector_hint, file->fnode);
+            file->vselector_hint =
+              vselector->get_hint_by_dir(dirname);
+            vselector->add_usage(file->vselector_hint, file->fnode);
+
+	    q->second->file_map[filename] = file;
+	    ++file->refs;
+	  }
+	}
+	break;
+
+      case bluefs_transaction_t::OP_DIR_UNLINK:
+        {
+	  string dirname, filename;
+	  decode(dirname, p);
+	  decode(filename, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_dir_unlink " << " " << dirname << "/" << filename
+                   << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_dir_unlink " << " " << dirname << "/" << filename
+                      << std::endl;
+          }
+ 
+	  if (!noop) {
+	    map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
+	    ceph_assert(q != nodes.dir_map.end());
+	    map<string,FileRef>::iterator r = q->second->file_map.find(filename);
+	    ceph_assert(r != q->second->file_map.end());
+            ceph_assert(r->second->refs > 0); 
+	    --r->second->refs;
+	    q->second->file_map.erase(r);
+	  }
+	}
+	break;
+
+      case bluefs_transaction_t::OP_DIR_CREATE:
+        {
+	  string dirname;
+	  decode(dirname, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_dir_create " << dirname << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_dir_create " << dirname << std::endl;
+          }
+
+	  if (!noop) {
+	    map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
+	    ceph_assert(q == nodes.dir_map.end());
+	    nodes.dir_map[dirname] = ceph::make_ref<Dir>();
+	  }
+	}
+	break;
+
+      case bluefs_transaction_t::OP_DIR_REMOVE:
+        {
+	  string dirname;
+	  decode(dirname, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_dir_remove " << dirname << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_dir_remove " << dirname << std::endl;
+          }
+
+	  if (!noop) {
+	    map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
+	    ceph_assert(q != nodes.dir_map.end());
+	    ceph_assert(q->second->file_map.empty());
+	    nodes.dir_map.erase(q);
+	  }
+	}
+	break;
+
+      case bluefs_transaction_t::OP_FILE_UPDATE:
+        {
+	  bluefs_fnode_t fnode;
+	  decode(fnode, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_file_update " << " " << fnode << " " << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_file_update " << " " << fnode << std::endl;
+          }
+          if (!noop) {
+	    FileRef f = _get_file(fnode.ino);
+	    // drop the old extents from the bitmap before the new fnode
+	    // is installed; they get re-added below
+	    if (cct->_conf->bluefs_log_replay_check_allocations) {
+              int r = _check_allocations(f->fnode,
+		used_blocks, false, "OP_FILE_UPDATE");
+              if (r < 0) {
+                delete log_reader;
+                return r;
+              }
+            }
+            if (fnode.ino != 1) {
+              vselector->sub_usage(f->vselector_hint, f->fnode);
+            }
+            f->fnode = fnode;
+            if (fnode.ino != 1) {
+              vselector->add_usage(f->vselector_hint, f->fnode);
+            }
+
+	    if (fnode.ino > ino_last) {
+	      ino_last = fnode.ino;
+	    }
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              int r = _check_allocations(f->fnode,
+		used_blocks, true, "OP_FILE_UPDATE");
+              if (r < 0) {
+                delete log_reader;
+                return r;
+              }
+            }
+	  } else if (noop && fnode.ino == 1) {
+	    FileRef f = _get_file(fnode.ino);
+	    f->fnode = fnode;
+	  }
+        }
+	break;
+      case bluefs_transaction_t::OP_FILE_UPDATE_INC:
+	{
+	  bluefs_fnode_delta_t delta;
+	  decode(delta, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+	    << ":  op_file_update_inc " << " " << delta << " " << dendl;
+	  if (unlikely(to_stdout)) {
+	    std::cout << " 0x" << std::hex << pos << std::dec
+	      << ":  op_file_update_inc " << " " << delta << std::endl;
+	  }
+	  if (!noop) {
+	    FileRef f = _get_file(delta.ino);
+	    bluefs_fnode_t& fnode = f->fnode;
+	    if (delta.offset != fnode.allocated) {
+	      derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
+		   << " fnode=" << fnode
+		   << " delta=" << delta
+		   << dendl;
+	      ceph_assert(delta.offset == fnode.allocated);
+	    }
+	    if (cct->_conf->bluefs_log_replay_check_allocations) {
+              int r = _check_allocations(fnode,
+		used_blocks, false, "OP_FILE_UPDATE_INC");
+              if (r < 0) {
+                delete log_reader;
+                return r;
+              }
+            }
+
+	    fnode.ino = delta.ino;
+	    fnode.mtime = delta.mtime;
+	    if (fnode.ino != 1) {
+	      vselector->sub_usage(f->vselector_hint, fnode);
+	    }
+	    fnode.size = delta.size;
+	    fnode.claim_extents(delta.extents);
+	    dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+		     << ":  op_file_update_inc produced " << " " << fnode << " " << dendl;
+
+	    if (fnode.ino != 1) {
+	      vselector->add_usage(f->vselector_hint, fnode);
+	    }
+
+	    if (fnode.ino > ino_last) {
+	      ino_last = fnode.ino;
+	    }
+	    if (cct->_conf->bluefs_log_replay_check_allocations) {
+              int r = _check_allocations(f->fnode,
+		used_blocks, true, "OP_FILE_UPDATE_INC");
+              if (r < 0) {
+                delete log_reader;
+                return r;
+              }
+	    }
+	  } else if (noop && delta.ino == 1) {
+	    // we need to track bluefs log, even in noop mode
+	    FileRef f = _get_file(1);
+	    bluefs_fnode_t& fnode = f->fnode;
+	    fnode.ino = delta.ino;
+	    fnode.mtime = delta.mtime;
+	    fnode.size = delta.size;
+	    fnode.claim_extents(delta.extents);
+	  }
+	}
+      break;
+
+      case bluefs_transaction_t::OP_FILE_REMOVE:
+        {
+	  uint64_t ino;
+	  decode(ino, p);
+	  dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
+                   << ":  op_file_remove " << ino << dendl;
+          if (unlikely(to_stdout)) {
+            std::cout << " 0x" << std::hex << pos << std::dec
+                      << ":  op_file_remove " << ino << std::endl;
+          }
+
+          if (!noop) {
+            // note: this `p` shadows the op iterator within this scope
+            auto p = nodes.file_map.find(ino);
+            ceph_assert(p != nodes.file_map.end());
+            vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+	      int r = _check_allocations(p->second->fnode,
+		used_blocks, false, "OP_FILE_REMOVE");
+              if (r < 0) {
+		delete log_reader;
+		return r;
+              }
+            }
+            nodes.file_map.erase(p);
+          }
+        }
+	break;
+
+      default:
+	derr << __func__ << " 0x" << std::hex << pos << std::dec
+             << ": stop: unrecognized op " << (int)op << dendl;
+	delete log_reader;
+        return -EIO;
+      }
+    }
+    ceph_assert(p.end());
+
+    // we successfully replayed the transaction; bump the seq and log size
+    ++log_seq;
+    log_file->fnode.size = log_reader->buf.pos;
+  }
+  if (!noop) {
+    vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+    log.seq_live = log_seq + 1;
+    dirty.seq_live = log_seq + 1;
+    log.t.seq = log.seq_live;
+    dirty.seq_stable = log_seq;
+  }
+
+  dout(10) << __func__ << " log file size was 0x"
+           << std::hex << log_file->fnode.size << std::dec << dendl;
+  if (unlikely(to_stdout)) {
+    std::cout << " log file size was 0x"
+              << std::hex << log_file->fnode.size << std::dec << std::endl;
+  }
+
+  delete log_reader;
+
+  if (!noop) {
+    // verify file link counts are all >0
+    for (auto& p : nodes.file_map) {
+      if (p.second->refs == 0 &&
+	  p.second->fnode.ino > 1) {
+	derr << __func__ << " file with link count 0: " << p.second->fnode
+	     << dendl;
+	return -EIO;
+      }
+    }
+  }
+  // reflect file count in logger
+  logger->set(l_bluefs_num_files, nodes.file_map.size());
+
+  dout(10) << __func__ << " done" << dendl;
+  return 0;
+}
+
+/**
+ * Dump the content of the BlueFS log to stdout without mounting.
+ *
+ * Must not be called on a mounted BlueFS (asserted via log.writer).
+ * The logger and the in-memory superblock are released on every exit
+ * path, including a failure to read the superblock.
+ *
+ * @return 0 on success, otherwise the negative error from _open_super()
+ *         or _replay()
+ */
+int BlueFS::log_dump()
+{
+  // only dump log file's content
+  ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
+  _init_logger();
+  int r = _open_super();
+  if (r < 0) {
+    derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
+  } else {
+    // noop replay that echoes every transaction to stdout
+    r = _replay(true, true);
+    if (r < 0) {
+      derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
+    }
+  }
+  _shutdown_logger();
+  super = bluefs_super_t();
+  return r;
+}
+
+/**
+ * Move BlueFS data from the devices in `devs_source` onto the already
+ * attached device `dev_target`, then rewrite the log and the layout.
+ *
+ * Every file except the log (ino 1) that owns at least one extent on a
+ * source device is read completely, re-allocated on the target, written
+ * there, and its old extents are released. Extents of untouched files
+ * are renamed when the target device id changes (SLOW -> DB).
+ *
+ * @param cct          context supplying bluefs_buffered_io
+ * @param devs_source  ids of the devices to be emptied
+ * @param dev_target   id of the receiving device, must be < MAX_BDEV
+ * @param layout       layout handed to _rewrite_log_and_layout_sync_LNF_LD()
+ * @return 0 on success, -EIO when reading a source extent fails,
+ *         -ENOSPC when the target allocation fails
+ */
+int BlueFS::device_migrate_to_existing(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target,
+  const bluefs_layout_t& layout)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  dout(10) << __func__ << " devs_source " << devs_source
+           << " dev_target " << dev_target << dendl;
+  assert(dev_target < (int)MAX_BDEV);
+
+  int flags = 0;
+  flags |= devs_source.count(BDEV_DB) ?
+    (REMOVE_DB | RENAME_SLOW2DB) : 0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+  int dev_target_new = dev_target;
+
+  // Slow device without separate DB one is addressed via BDEV_DB
+  // Hence need renaming.
+  if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
+    dev_target_new = BDEV_DB;
+    dout(0) << __func__ << " super to be written to " << dev_target << dendl;
+  }
+
+  for (auto& [ino, file_ref] : nodes.file_map) {
+    //do not copy log
+    if (ino == 1) {
+      continue;
+    }
+    dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
+
+    vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
+
+    // capture by reference: the predicate is used synchronously and a
+    // by-value capture would copy the whole devs_source set per file
+    bool rewrite = std::any_of(
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
+      [&](const auto& ext) {
+        return ext.bdev != dev_target && devs_source.count(ext.bdev);
+      });
+    if (rewrite) {
+      dout(10) << __func__ << "  migrating" << dendl;
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
+      // read entire file
+      bufferlist bl;
+      for (const auto &old_ext : old_fnode_extents) {
+        buf.resize(old_ext.length);
+        int r = _bdev_read_random(old_ext.bdev,
+          old_ext.offset,
+          old_ext.length,
+          (char*)&buf.at(0),
+          buffered);
+        if (r != 0) {
+          // report the device the read was actually issued to
+          derr << __func__ << " failed to read 0x" << std::hex
+               << old_ext.offset << "~" << old_ext.length << std::dec
+               << " from " << (int)old_ext.bdev << dendl;
+          return -EIO;
+        }
+        bl.append((char*)&buf[0], old_ext.length);
+      }
+
+      // write entire file
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
+      if (l < 0) {
+        derr << __func__ << " unable to allocate len 0x" << std::hex
+             << bl.length() << std::dec << " from " << (int)dev_target
+             << ": " << cpp_strerror(l) << dendl;
+        return -ENOSPC;
+      }
+
+      uint64_t off = 0;
+      for (auto& i : file_ref->fnode.extents) {
+        bufferlist cur;
+        uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+        ceph_assert(cur_len > 0);
+        cur.substr_of(bl, off, cur_len);
+        int r = bdev[dev_target]->write(i.offset, cur, buffered);
+        ceph_assert(r == 0);
+        off += cur_len;
+      }
+
+      // release old extents
+      for (const auto &old_ext : old_fnode_extents) {
+        PExtentVector to_release;
+        to_release.emplace_back(old_ext.offset, old_ext.length);
+        alloc[old_ext.bdev]->release(to_release);
+        if (is_shared_alloc(old_ext.bdev)) {
+          // bluefs_used is counted in bytes, not in number of extents
+          shared_alloc->bluefs_used -= old_ext.length;
+        }
+      }
+
+      // update fnode
+      for (auto& i : file_ref->fnode.extents) {
+        i.bdev = dev_target_new;
+      }
+    } else {
+      for (auto& ext : file_ref->fnode.extents) {
+        if (dev_target != dev_target_new && ext.bdev == dev_target) {
+          dout(20) << __func__ << "  " << " ... adjusting extent 0x"
+                   << std::hex << ext.offset << std::dec
+                   << " bdev " << dev_target << " -> " << dev_target_new
+                   << dendl;
+          ext.bdev = dev_target_new;
+        }
+      }
+    }
+    vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur = bdev[BDEV_WAL] ?
+    BDEV_WAL :
+    bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next = new_log_dev_cur;
+
+  if (devs_source.count(new_log_dev_cur)) {
+    // SLOW device is addressed via BDEV_DB too hence either WAL or DB
+    new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
+      BDEV_DB :
+      BDEV_WAL;
+
+    dout(0) << __func__ << " log moved from " << new_log_dev_cur
+      << " to " << new_log_dev_next << dendl;
+
+    new_log_dev_cur =
+      (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
+        BDEV_SLOW :
+        new_log_dev_next;
+  }
+
+  _rewrite_log_and_layout_sync_LNF_LD(
+    false,
+    (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags,
+    layout);
+  return 0;
+}
+
+/**
+ * Move BlueFS data from the devices in `devs_source` onto a newly
+ * attached device (`dev_target` is BDEV_NEWDB or BDEV_NEWWAL), then
+ * rewrite the log and the layout.
+ *
+ * Every file except the log (ino 1) that owns at least one extent on a
+ * source device is read completely, re-allocated on the target, written
+ * there, and its old extents are released.
+ *
+ * @param cct          context supplying bluefs_buffered_io
+ * @param devs_source  ids of the devices to be emptied
+ * @param dev_target   BDEV_NEWDB or BDEV_NEWWAL
+ * @param layout       layout handed to _rewrite_log_and_layout_sync_LNF_LD()
+ * @return 0 on success, -EIO when reading a source extent fails,
+ *         -ENOSPC when the target allocation fails
+ */
+int BlueFS::device_migrate_to_new(
+  CephContext *cct,
+  const set<int>& devs_source,
+  int dev_target,
+  const bluefs_layout_t& layout)
+{
+  vector<byte> buf;
+  bool buffered = cct->_conf->bluefs_buffered_io;
+
+  dout(10) << __func__ << " devs_source " << devs_source
+           << " dev_target " << dev_target << dendl;
+  assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
+
+  int flags = 0;
+
+  flags |= devs_source.count(BDEV_DB) ?
+    (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
+    0;
+  flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
+  int dev_target_new = dev_target; //FIXME: remove, makes no sense
+
+  for (auto& [ino, file_ref] : nodes.file_map) {
+    //do not copy log
+    if (ino == 1) {
+      continue;
+    }
+    dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
+
+    // NOTE(review): unlike device_migrate_to_existing() there is no
+    // matching add_usage() at the end of this loop - confirm that is
+    // intended.
+    vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
+
+    // capture by reference: the predicate is used synchronously and a
+    // by-value capture would copy the whole devs_source set per file
+    bool rewrite = std::any_of(
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
+      [&](const auto& ext) {
+        return ext.bdev != dev_target && devs_source.count(ext.bdev);
+      });
+    if (rewrite) {
+      dout(10) << __func__ << "  migrating" << dendl;
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
+      // read entire file
+      bufferlist bl;
+      for (const auto &old_ext : old_fnode_extents) {
+        buf.resize(old_ext.length);
+        int r = _bdev_read_random(old_ext.bdev,
+          old_ext.offset,
+          old_ext.length,
+          (char*)&buf.at(0),
+          buffered);
+        if (r != 0) {
+          // report the device the read was actually issued to
+          derr << __func__ << " failed to read 0x" << std::hex
+               << old_ext.offset << "~" << old_ext.length << std::dec
+               << " from " << (int)old_ext.bdev << dendl;
+          return -EIO;
+        }
+        bl.append((char*)&buf[0], old_ext.length);
+      }
+
+      // write entire file
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
+      if (l < 0) {
+        derr << __func__ << " unable to allocate len 0x" << std::hex
+             << bl.length() << std::dec << " from " << (int)dev_target
+             << ": " << cpp_strerror(l) << dendl;
+        return -ENOSPC;
+      }
+
+      uint64_t off = 0;
+      for (auto& i : file_ref->fnode.extents) {
+        bufferlist cur;
+        uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
+        ceph_assert(cur_len > 0);
+        cur.substr_of(bl, off, cur_len);
+        int r = bdev[dev_target]->write(i.offset, cur, buffered);
+        ceph_assert(r == 0);
+        off += cur_len;
+      }
+
+      // release old extents
+      for (const auto &old_ext : old_fnode_extents) {
+        PExtentVector to_release;
+        to_release.emplace_back(old_ext.offset, old_ext.length);
+        alloc[old_ext.bdev]->release(to_release);
+        if (is_shared_alloc(old_ext.bdev)) {
+          // bluefs_used is counted in bytes, not in number of extents
+          shared_alloc->bluefs_used -= old_ext.length;
+        }
+      }
+
+      // update fnode
+      for (auto& i : file_ref->fnode.extents) {
+        i.bdev = dev_target_new;
+      }
+    }
+  }
+  // new logging device in the current naming scheme
+  int new_log_dev_cur =
+    bdev[BDEV_NEWWAL] ?
+      BDEV_NEWWAL :
+      bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
+        BDEV_WAL :
+        bdev[BDEV_NEWDB] ?
+          BDEV_NEWDB :
+          bdev[BDEV_DB] && !(flags & REMOVE_DB)?
+            BDEV_DB :
+            BDEV_SLOW;
+
+  // new logging device in new naming scheme
+  int new_log_dev_next =
+    new_log_dev_cur == BDEV_NEWWAL ?
+      BDEV_WAL :
+      new_log_dev_cur == BDEV_NEWDB ?
+        BDEV_DB :
+        new_log_dev_cur;
+
+  int super_dev =
+    dev_target == BDEV_NEWDB ?
+      BDEV_NEWDB :
+      bdev[BDEV_DB] ?
+        BDEV_DB :
+        BDEV_SLOW;
+
+  _rewrite_log_and_layout_sync_LNF_LD(
+    false,
+    super_dev,
+    new_log_dev_cur,
+    new_log_dev_next,
+    flags,
+    layout);
+  return 0;
+}
+
+/// Return the in-memory File registered for `ino`. When the inode is not
+/// present in nodes.file_map an empty File is created, registered under
+/// `ino` and returned instead.
+BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
+{
+  auto found = nodes.file_map.find(ino);
+  if (found != nodes.file_map.end()) {
+    dout(30) << __func__ << " ino " << ino << " = " << found->second << dendl;
+    return found->second;
+  }
+
+  FileRef fresh = ceph::make_ref<File>();
+  nodes.file_map[ino] = fresh;
+  // keep the files gauge in sync with the map size
+  logger->set(l_bluefs_num_files, nodes.file_map.size());
+  dout(30) << __func__ << " ino " << ino << " = " << fresh
+	   << " (new)" << dendl;
+  return fresh;
+}
+
+
+/**
+To modify fnode both FileWriter::lock and File::lock must be obtained.
+The special case is when we modify bluefs log (ino 1) or
+we are compacting log (ino 0).
+
+In any case it is enough to hold File::lock to be sure fnode will not be modified.
+*/
+// Stream helper: wraps a file reference so that the matching operator<<
+// can print the file's fnode while holding File::lock.
+struct lock_fnode_print {
+  BlueFS::FileRef file;
+
+  lock_fnode_print(BlueFS::FileRef f)
+    : file(f)
+  {}
+};
+// Streams the wrapped file's fnode; File::lock is held while formatting.
+std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock)
+{
+  const auto& target = to_lock.file;
+  std::lock_guard guard(target->lock);
+  out << target->fnode;
+  return out;
+}
+
+/// Drop one link (reference) from `file`. Requires log.lock and nodes.lock
+/// to be held by the caller (asserted); takes dirty.lock internally.
+///
+/// When the last link goes away the file is destroyed: its usage is removed
+/// from the volume selector, a file-remove op is queued in the pending log
+/// transaction, it is erased from nodes.file_map, its extents are queued for
+/// release and any pending request to serialize it is retracted.
+void BlueFS::_drop_link_D(FileRef file)
+{
+  dout(20) << __func__ << " had refs " << file->refs
+	   << " on " << lock_fnode_print(file) << dendl;
+  ceph_assert(file->refs > 0);
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+  ceph_assert(ceph_mutex_is_locked(nodes.lock));
+
+  --file->refs;
+  if (file->refs != 0) {
+    // still linked from somewhere else, nothing more to do
+    return;
+  }
+
+  dout(20) << __func__ << " destroying " << file->fnode << dendl;
+  ceph_assert(file->num_reading.load() == 0);
+  vselector->sub_usage(file->vselector_hint, file->fnode);
+  log.t.op_file_remove(file->fnode.ino);
+  nodes.file_map.erase(file->fnode.ino);
+  logger->set(l_bluefs_num_files, nodes.file_map.size());
+  file->deleted = true;
+
+  std::lock_guard dl(dirty.lock);
+  // hand every extent of the file over to the pending-release set
+  for (auto& ext : file->fnode.extents) {
+    dirty.pending_release[ext.bdev].insert(ext.offset, ext.length);
+  }
+  if (file->dirty_seq <= dirty.seq_stable) {
+    return;
+  }
+  // retract request to serialize changes
+  ceph_assert(dirty.files.count(file->dirty_seq));
+  auto& pending = dirty.files[file->dirty_seq];
+  pending.erase(pending.iterator_to(*file));
+  file->dirty_seq = dirty.seq_stable;
+}
+
+/**
+ * Random-access read of up to `len` bytes at file offset `off` into `out`.
+ *
+ * Unless the reader has ignore_eof set, the request is clipped to the file
+ * size recorded in the fnode. Offsets covered by the reader's buffer are
+ * served from that buffer; offsets outside of it are read from the block
+ * device straight into `out` (the reader's buffer is not refilled here).
+ *
+ * File::num_reading is incremented for the duration of the call.
+ * Device read errors and offsets that cannot be mapped to an extent are
+ * treated as fatal (ceph_assert).
+ *
+ * @return number of bytes copied to `out`
+ */
+int64_t BlueFS::_read_random(
+  FileReader *h,         ///< [in] read from here
+  uint64_t off,          ///< [in] offset
+  uint64_t len,          ///< [in] this many bytes
+  char *out)             ///< [out] copy it here
+{
+  auto* buf = &h->buf;
+
+  int64_t ret = 0;
+  dout(10) << __func__ << " h " << h
+           << " 0x" << std::hex << off << "~" << len << std::dec
+	   << " from " << lock_fnode_print(h->file) << dendl;
+
+  ++h->file->num_reading;
+
+  // clip the request to the file size unless the reader ignores EOF
+  if (!h->ignore_eof &&
+      off + len > h->file->fnode.size) {
+    if (off > h->file->fnode.size)
+      len = 0;
+    else
+      len = h->file->fnode.size - off;
+    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+	     << std::hex << len << std::dec << dendl;
+  }
+  logger->inc(l_bluefs_read_random_count, 1);
+  logger->inc(l_bluefs_read_random_bytes, len);
+
+  std::shared_lock s_lock(h->lock);
+  buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+  while (len > 0) {
+    if (off < buf->bl_off || off >= buf->get_buf_end()) {
+      // offset is outside the buffered window: read from the device directly
+      // into `out`; the reader's buffer is not used in this branch, so the
+      // shared lock is dropped for the duration of the I/O
+      s_lock.unlock();
+      uint64_t x_off = 0;
+      auto p = h->file->fnode.seek(off, &x_off);
+      ceph_assert(p != h->file->fnode.extents.end());
+      // never cross the end of the current extent in a single read
+      uint64_t l = std::min(p->length - x_off, len);
+      //hard cap to 1GB
+      l = std::min(l, uint64_t(1) << 30);
+      dout(20) << __func__ << " read random 0x"
+	       << std::hex << x_off << "~" << l << std::dec
+	       << " of " << *p << dendl;
+      int r;
+      if (!cct->_conf->bluefs_check_for_zeros) {
+	r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
+			      cct->_conf->bluefs_buffered_io);
+      } else {
+	r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
+			cct->_conf->bluefs_buffered_io);
+      }
+      ceph_assert(r == 0);
+      off += l;
+      len -= l;
+      ret += l;
+      out += l;
+
+      logger->inc(l_bluefs_read_random_disk_count, 1);
+      logger->inc(l_bluefs_read_random_disk_bytes, l);
+      // re-take the shared lock only if another iteration will inspect buf
+      if (len > 0) {
+	s_lock.lock();
+      }
+    } else {
+      // offset is inside the buffered window: copy out of the buffer
+      auto left = buf->get_buf_remaining(off);
+      int64_t r = std::min(len, left);
+      logger->inc(l_bluefs_read_random_buffer_count, 1);
+      logger->inc(l_bluefs_read_random_buffer_bytes, r);
+      dout(20) << __func__ << " left 0x" << std::hex << left
+	      << " 0x" << off << "~" << len << std::dec
+	      << dendl;
+
+      auto p = buf->bl.begin();
+      p.seek(off - buf->bl_off);
+      p.copy(r, out);
+      out += r;
+
+      // NOTE(review): the statements up to `dendl` sit inside the dout(30)
+      // statement opened here - presumably they only run when level-30
+      // logging is gathered; confirm against the dout macro.
+      dout(30) << __func__ << " result chunk (0x"
+	       << std::hex << r << std::dec << " bytes):\n";
+      bufferlist t;
+      t.substr_of(buf->bl, off - buf->bl_off, r);
+      t.hexdump(*_dout);
+      *_dout << dendl;
+
+      off += r;
+      len -= r;
+      ret += r;
+      buf->pos += r;
+    }
+  }
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
+  --h->file->num_reading;
+  return ret;
+}
+
+/**
+ * Buffered read of up to `len` bytes at file offset `off`.
+ *
+ * Data always goes through the reader's buffer (h->buf): when `off` is not
+ * covered by the buffer, the buffer is refilled from the block device
+ * starting at the block-aligned offset, then the requested bytes are handed
+ * out from it. The result is referenced into `outbl` and/or copied to `out`;
+ * when both are null the call only fills the buffer (prefetch).
+ *
+ * Unless the reader has ignore_eof set, the request is clipped to the file
+ * size recorded in the fnode. If the offset cannot be mapped to an extent the
+ * loop stops early and fewer bytes than requested are returned.
+ *
+ * File::num_reading is incremented for the duration of the call. Device read
+ * errors are treated as fatal (ceph_assert).
+ *
+ * @return number of bytes delivered
+ */
+int64_t BlueFS::_read(
+  FileReader *h,         ///< [in] read from here
+  uint64_t off,          ///< [in] offset
+  size_t len,            ///< [in] this many bytes
+  bufferlist *outbl,     ///< [out] optional: reference the result here
+  char *out)             ///< [out] optional: or copy it here
+{
+  FileReaderBuffer *buf = &(h->buf);
+
+  // no destination at all means the caller only wants the buffer filled
+  bool prefetch = !outbl && !out;
+  dout(10) << __func__ << " h " << h
+           << " 0x" << std::hex << off << "~" << len << std::dec
+	   << " from " << lock_fnode_print(h->file)
+	   << (prefetch ? " prefetch" : "")
+	   << dendl;
+
+  ++h->file->num_reading;
+
+  // clip the request to the file size unless the reader ignores EOF
+  if (!h->ignore_eof &&
+      off + len > h->file->fnode.size) {
+    if (off > h->file->fnode.size)
+      len = 0;
+    else
+      len = h->file->fnode.size - off;
+    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
+	     << std::hex << len << std::dec << dendl;
+  }
+  logger->inc(l_bluefs_read_count, 1);
+  logger->inc(l_bluefs_read_bytes, len);
+  if (prefetch) {
+    logger->inc(l_bluefs_read_prefetch_count, 1);
+    logger->inc(l_bluefs_read_prefetch_bytes, len);
+  }
+
+  if (outbl)
+    outbl->clear();
+
+  int64_t ret = 0;
+  std::shared_lock s_lock(h->lock);
+  while (len > 0) {
+    size_t left;
+    if (off < buf->bl_off || off >= buf->get_buf_end()) {
+      // buffer miss: trade the shared lock for an exclusive one to refill
+      s_lock.unlock();
+      std::unique_lock u_lock(h->lock);
+      buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
+      if (off < buf->bl_off || off >= buf->get_buf_end()) {
+        // if precondition hasn't changed during locking upgrade.
+        buf->bl.clear();
+        // the buffer always starts on a block boundary
+        buf->bl_off = off & super.block_mask();
+        uint64_t x_off = 0;
+        auto p = h->file->fnode.seek(buf->bl_off, &x_off);
+	if (p == h->file->fnode.extents.end()) {
+	  dout(5) << __func__ << " reading less then required "
+		  << ret << "<" << ret + len << dendl;
+	  break;
+	}
+
+        // cover the in-block head of the request as well, in whole blocks
+        uint64_t want = round_up_to(len + (off & ~super.block_mask()),
+				    super.block_size);
+        want = std::max(want, buf->max_prefetch);
+        // never cross the end of the current extent in a single read
+        uint64_t l = std::min(p->length - x_off, want);
+        //hard cap to 1GB
+	l = std::min(l, uint64_t(1) << 30);
+        // do not fetch past the block-rounded end of file
+        uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
+        if (!h->ignore_eof &&
+	    buf->bl_off + l > eof_offset) {
+	  l = eof_offset - buf->bl_off;
+        }
+        dout(20) << __func__ << " fetching 0x"
+                 << std::hex << x_off << "~" << l << std::dec
+                 << " of " << *p << dendl;
+	int r;
+	// when reading BlueFS log (only happens on startup) use non-buffered io
+	// it makes it in sync with logic in _flush_range()
+	bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io;
+	if (!cct->_conf->bluefs_check_for_zeros) {
+	  r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+			 use_buffered_io);
+	} else {
+	  r = _read_and_check(
+	    p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
+	    use_buffered_io);
+	}
+	logger->inc(l_bluefs_read_disk_count, 1);
+	logger->inc(l_bluefs_read_disk_bytes, l);
+
+        ceph_assert(r == 0);
+      }
+      u_lock.unlock();
+      s_lock.lock();
+      // we should recheck if buffer is valid after lock downgrade
+      continue; 
+    }
+    // buffer hit: hand out as much as the buffer holds from `off`
+    left = buf->get_buf_remaining(off);
+    dout(20) << __func__ << " left 0x" << std::hex << left
+             << " len 0x" << len << std::dec << dendl;
+
+    int64_t r = std::min(len, left);
+    if (outbl) {
+      bufferlist t;
+      t.substr_of(buf->bl, off - buf->bl_off, r);
+      outbl->claim_append(t);
+    }
+    if (out) {
+      auto p = buf->bl.begin();
+      p.seek(off - buf->bl_off);
+      p.copy(r, out);
+      out += r;
+    }
+
+    // NOTE(review): the statements up to `dendl` sit inside the dout(30)
+    // statement opened here - presumably they only run when level-30
+    // logging is gathered; confirm against the dout macro.
+    dout(30) << __func__ << " result chunk (0x"
+             << std::hex << r << std::dec << " bytes):\n";
+    bufferlist t;
+    t.substr_of(buf->bl, off - buf->bl_off, r);
+    t.hexdump(*_dout);
+    *_dout << dendl;
+
+    off += r;
+    len -= r;
+    ret += r;
+    buf->pos += r;
+  }
+
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
+  // whatever was referenced into outbl must match the returned byte count
+  ceph_assert(!outbl || (int)outbl->length() == ret);
+  --h->file->num_reading;
+  return ret;
+}
+
+/**
+ * Ask the block devices backing `f` to invalidate their cache for the file
+ * byte range [offset, offset + length).
+ *
+ * When `offset` is not block aligned it is rounded down to the block
+ * boundary and `length` is extended by the same amount and rounded up to a
+ * whole number of blocks, so the widened range still covers every byte of
+ * the original request. The range is then walked extent by extent; it is
+ * silently truncated at the end of the file's extent list.
+ *
+ * File::lock is held for the duration of the call.
+ *
+ * @param f       file whose cached data should be dropped
+ * @param offset  logical offset within the file
+ * @param length  number of bytes to invalidate
+ */
+void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
+{
+  std::lock_guard l(f->lock);
+  dout(10) << __func__ << " file " << f->fnode
+	   << " 0x" << std::hex << offset << "~" << length << std::dec
+           << dendl;
+  const uint64_t head = offset & ~super.block_mask();
+  if (head) {
+    offset &= super.block_mask();
+    // moving the start back by `head` bytes must grow the length by the same
+    // amount, otherwise the tail of the requested range is left out
+    length = round_up_to(length + head, super.block_size);
+  }
+  uint64_t x_off = 0;
+  auto p = f->fnode.seek(offset, &x_off);
+  while (length > 0 && p != f->fnode.extents.end()) {
+    uint64_t x_len = std::min(p->length - x_off, length);
+    bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
+    dout(20) << __func__  << " 0x" << std::hex << x_off << "~" << x_len
+             << std:: dec << " of " << *p << dendl;
+    offset += x_len;
+    length -= x_len;
+    // continue from the very beginning of the next extent; without this the
+    // same extent would be invalidated again instead of the following ones
+    ++p;
+    x_off = 0;
+  }
+}
+
+
+/// Conservative upper bound for the encoded size of transaction `t`:
+/// the current op payload plus two blocks of slack, rounded up to the
+/// largest allocation unit among the WAL, DB and SLOW devices.
+uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
+{
+  uint64_t largest_unit = std::max({alloc_size[BDEV_WAL],
+				    alloc_size[BDEV_DB],
+				    alloc_size[BDEV_SLOW]});
+
+  // payload so far plus two blocks of slack for the final encoding
+  const auto padded_len = t->op_bl.length() + super.block_size * 2;
+  return round_up_to(padded_len, largest_unit);
+}
+
+/// Build the "starter" transaction of a log: op_init, an incremental update
+/// of `fnode` and a jump to `expected_final_size`.
+///
+/// With `out == nullptr` nothing is encoded and only the estimated size of
+/// the transaction is returned. Otherwise the transaction is encoded into
+/// `out`, padded to exactly `expected_final_size` bytes (which must be
+/// non-zero and large enough - both asserted) and that size is returned.
+uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
+                                           bluefs_fnode_t& fnode,
+                                           uint64_t expected_final_size,
+                                           bufferlist* out)
+{
+  bluefs_transaction_t starter;
+  starter.seq = start_seq;
+  starter.uuid = super.uuid;
+  starter.op_init();
+  starter.op_file_update_inc(fnode);
+  // op_jump has a fixed encoded size, so placeholder arguments are good
+  // enough when the transaction is built for size estimation only
+  starter.op_jump(start_seq, expected_final_size);
+
+  const bool estimate_only = (out == nullptr);
+  if (estimate_only) {
+    return _estimate_transaction_size(&starter);
+  }
+
+  ceph_assert(expected_final_size > 0);
+  out->reserve(expected_final_size);
+  encode(starter, *out);
+  // make sure we're not wrong about the size
+  ceph_assert(out->length() <= expected_final_size);
+  _pad_bl(*out, expected_final_size);
+  return expected_final_size;
+}
+
+/**
+ * Rough estimate of how large the log would be if it described only the
+ * current set of files and directories.
+ *
+ * The estimate is a fixed 8 KiB base plus a per-file fnode cost, a
+ * per-directory cost and a per-file link cost, rounded up to the superblock's
+ * block size. The per-item constants are guesses (see the fixme below).
+ *
+ * Takes nodes.lock.
+ */
+uint64_t BlueFS::_estimate_log_size_N()
+{
+  std::lock_guard nl(nodes.lock);
+  int avg_dir_size = 40;  // fixme
+  int avg_file_size = 12;
+  uint64_t size = 4096 * 2;
+  size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
+  // one dir-create record per directory: the cost scales with the number of
+  // directories (multiplied, like the per-file terms, rather than added)
+  size += nodes.dir_map.size() * (1 + avg_dir_size);
+  size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
+  return round_up_to(size, super.block_size);
+}
+
+/// Compact the log, synchronously or asynchronously depending on
+/// bluefs_compact_log_sync. Does nothing when compaction is disabled via
+/// bluefs_replay_recovery_disable_compact.
+void BlueFS::compact_log()/*_LNF_LD_NF_D*/
+{
+  if (cct->_conf->bluefs_replay_recovery_disable_compact) {
+    return;
+  }
+  if (cct->_conf->bluefs_compact_log_sync) {
+    _compact_log_sync_LNF_LD();
+    return;
+  }
+  _compact_log_async_LD_LNF_D();
+}
+
+/// Decide whether a log compaction should be started now.
+///
+/// Returns false when a compaction is already in progress, when the log is
+/// smaller than bluefs_log_compact_min_size, or when the ratio of the current
+/// log size to the estimated compacted size is below
+/// bluefs_log_compact_min_ratio. Takes log.lock briefly to sample the size.
+bool BlueFS::_should_start_compact_log_L_N()
+{
+  if (log_is_compacting.load() == true) {
+    // compaction is already running
+    return false;
+  }
+
+  uint64_t current;
+  {
+    std::lock_guard ll(log.lock);
+    current = log.writer->file->fnode.size;
+  }
+  uint64_t expected = _estimate_log_size_N();
+  float ratio = static_cast<float>(current) / static_cast<float>(expected);
+  dout(10) << __func__ << " current 0x" << std::hex << current
+	   << " expected " << expected << std::dec
+	   << " ratio " << ratio
+	   << dendl;
+
+  const bool too_small = current < cct->_conf->bluefs_log_compact_min_size;
+  const bool too_dense = ratio < cct->_conf->bluefs_log_compact_min_ratio;
+  return !(too_small || too_dense);
+}
+
+/// Dump the whole in-memory metadata (every fnode except ino 1, every
+/// directory and every directory link) into transaction `t`.
+///
+/// `t` gets `start_seq` and the superblock uuid. When `bdev_update_flags` is
+/// non-zero the device id of every extent is rewritten in place according to
+/// the REMOVE_* / RENAME_* flags before the fnode is recorded.
+/// `capture_before_seq` only affects what is logged for each file.
+/// Takes nodes.lock for the whole dump and each File::lock while the file is
+/// being processed.
+void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
+                                        bluefs_transaction_t *t,
+					int bdev_update_flags,
+                                        uint64_t capture_before_seq)
+{
+  dout(20) << __func__ << dendl;
+  t->seq = start_seq;
+  t->uuid = super.uuid;
+
+  std::lock_guard nl(nodes.lock);
+
+  for (auto& [ino, file_ref] : nodes.file_map) {
+    if (ino == 1) {
+      // ino 1 is not part of the dump
+      continue;
+    }
+    ceph_assert(ino > 1);
+    std::lock_guard fl(file_ref->lock);
+    auto& fnode = file_ref->fnode;
+
+    if (bdev_update_flags != 0) {
+      for (auto& e : fnode.extents) {
+        const auto bdev = e.bdev;
+        ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
+
+        // the device ids tested below are mutually exclusive
+        auto remapped = bdev;
+        if (bdev == BDEV_SLOW) {
+          if (bdev_update_flags & RENAME_SLOW2DB) {
+            remapped = BDEV_DB;
+          }
+        } else if (bdev == BDEV_DB) {
+          if (bdev_update_flags & RENAME_DB2SLOW) {
+            remapped = BDEV_SLOW;
+          }
+        } else if (bdev == BDEV_NEWDB) {
+          // REMOVE_DB xor RENAME_DB
+          ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
+          ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
+          remapped = BDEV_DB;
+        } else if (bdev == BDEV_NEWWAL) {
+          ceph_assert(bdev_update_flags & REMOVE_WAL);
+          remapped = BDEV_WAL;
+        }
+        e.bdev = remapped;
+      }
+    }
+
+    const bool just_modified =
+      !(capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq);
+    if (just_modified) {
+      dout(20) << __func__ << " op_file_update just modified, dirty_seq="
+               << file_ref->dirty_seq << " " << fnode << dendl;
+    } else {
+      dout(20) << __func__ << " op_file_update " << fnode << dendl;
+    }
+    t->op_file_update(fnode);
+  }
+
+  for (auto& [dir_name, dir] : nodes.dir_map) {
+    dout(20) << __func__ << " op_dir_create " << dir_name << dendl;
+    t->op_dir_create(dir_name);
+    for (auto& [link_name, linked] : dir->file_map) {
+      dout(20) << __func__ << " op_dir_link " << dir_name << "/" << link_name
+	       << " to " << linked->fnode.ino << dendl;
+      t->op_dir_link(dir_name, link_name, linked->fnode.ino);
+    }
+  }
+}
+
+/**
+ * Synchronous log compaction.
+ *
+ * Picks the preferred device for the log file (sampled under log.lock) and
+ * rewrites the log there via _rewrite_log_and_layout_sync_LNF_LD(), with
+ * fallback allocations permitted, the superblock written to BDEV_DB, no
+ * device-rename flags and the memorized layout left as is.
+ *
+ * The l_bluefs_log_compactions counter is bumped by
+ * _rewrite_log_and_layout_sync_LNF_LD() itself, so it must not be incremented
+ * again here - doing so counted every sync compaction twice.
+ */
+void BlueFS::_compact_log_sync_LNF_LD()
+{
+  dout(10) << __func__ << dendl;
+  uint8_t prefer_bdev;
+  {
+    std::lock_guard ll(log.lock);
+    prefer_bdev =
+      vselector->select_prefer_bdev(log.writer->file->vselector_hint);
+  }
+  _rewrite_log_and_layout_sync_LNF_LD(true,
+    BDEV_DB,
+    prefer_bdev,
+    prefer_bdev,
+    0,
+    super.memorized_layout);
+}
+
+/*
+ * SYNC LOG COMPACTION
+ *
+ * 0. Lock the log completely through the whole procedure
+ *
+ * 1. Build new log. It will include log's starter and compacted metadata
+ *    body. Jump op appended to the starter will link the pieces together.
+ *
+ * 2. Write out new log's content
+ *
+ * 3. Write out new superblock. This includes relevant device layout update.
+ *
+ * 4. Finalization. Old space release.
+ */
+
+/**
+ * Rewrite the whole log from the in-memory metadata and write a new
+ * superblock, holding log.lock for the entire procedure.
+ *
+ * Anything pending in log.t is dropped (the full state is dumped instead),
+ * the new log is written and flushed, the superblock is updated and written,
+ * and the extents of the old log are queued for release. The
+ * l_bluefs_log_compactions counter is incremented once per call.
+ * Allocation failures are treated as fatal (ceph_assert).
+ *
+ * @param permit_dev_fallback  passed to _allocate() for the new log space;
+ *                             when true, log_dev must equal log_dev_new
+ *                             (asserted)
+ * @param super_dev            device the superblock is written to
+ * @param log_dev              device the new log space is allocated from
+ * @param log_dev_new          device id recorded for the log extents in the
+ *                             persistent (on-disk) fnode
+ * @param flags                device update flags, passed to
+ *                             _compact_log_dump_metadata_NF()
+ * @param layout               stored as super.memorized_layout before the
+ *                             superblock is written
+ */
+void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
+					 int super_dev,
+					 int log_dev,
+					 int log_dev_new,
+					 int flags,
+					 std::optional<bluefs_layout_t> layout)
+{
+  // we substitute log_dev with log_dev_new for new allocations below
+  // and permitting fallback allocations prevents such a substitution
+  ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
+              !permit_dev_fallback);
+
+  dout(10) << __func__ << " super_dev:" << super_dev
+                       << " log_dev:" << log_dev
+                       << " log_dev_new:" << log_dev_new
+		       << " flags:" << flags
+		       << " seq:" << log.seq_live
+		       << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+
+  // Part 0.
+  // Lock the log totally till the end of the procedure
+  std::lock_guard ll(log.lock);
+  // start of the interval reported as l_bluefs_compaction_lock_lat
+  auto t0 = mono_clock::now();
+
+  File *log_file = log.writer->file.get();
+  bluefs_fnode_t fnode_tail;
+  // log.t.seq is always set to current live seq
+  ceph_assert(log.t.seq == log.seq_live);
+  // Capturing entire state. Dump anything that has been stored there.
+  log.t.clear();
+  log.t.seq = log.seq_live;
+  // From now on, no changes to log.t are permitted until we finish rewriting log.
+  // Can allow dirty to remain dirty - log.seq_live will not change.
+
+  //
+  // Part 1.
+  // Build new log starter and compacted metadata body
+  // 1.1. Build full compacted meta transaction.
+  //      Encode a bluefs transaction that dumps all of the in-memory fnodes
+  //      and names.
+  //      This might be pretty large and its allocation map can exceed
+  //      superblock size. Hence instead we'll need log starter part which
+  //      goes to superblock and refers that new meta through op_update_inc.
+  // 1.2.  Allocate space for the above transaction
+  //       using its size estimation.
+  // 1.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  // 1.4   Building new log persistent fnode representation which will
+  //       finally land to disk.
+  //       Depending on input parameters we might need to perform device ids
+  //       rename - runtime and persistent replicas should be different when we
+  //       are in the device migration process.
+  // 1.5   Store starter fnode to run-time superblock, to be written out later.
+  //       It doesn't contain compacted meta to fit relevant alocation map into
+  //       superblock.
+  // 1.6   Proceed building new log persistent fnode representation.
+  //       Will add log tail with compacted meta extents from 1.1.
+  //       Device rename applied as well
+  //
+  // 1.7.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  //
+  // 1.8.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to proper sequence no
+  //
+
+
+  // 1.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
+
+  // 1.2 Allocate the space required for the compacted meta transaction
+  // (bluefs_max_log_runway is added on top of the estimate)
+  uint64_t compacted_meta_need =
+    _estimate_transaction_size(&compacted_meta_t) +
+      cct->_conf->bluefs_max_log_runway;
+
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
+
+  int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
+    permit_dev_fallback);
+  ceph_assert(r == 0);
+
+
+  // 1.3 Allocate the space required for the starter part of the new log.
+  // estimate new log fnode size to be referenced from superblock
+  // hence use dummy fnode and jump parameters
+  uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
+
+  bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
+  r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
+    permit_dev_fallback);
+  ceph_assert(r == 0);
+
+  // 1.4 Building starter fnode
+  bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
+  // extents are iterated by value on purpose: only the copy appended to the
+  // persistent fnode gets the renamed device id
+  for (auto p : fnode_starter.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
+    }
+    fnode_persistent.append_extent(p);
+  }
+
+  // 1.5 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = fnode_persistent;
+
+  // 1.6 Proceed building new log persistent fnode representation
+  // we'll build incremental update starting from this point
+  fnode_persistent.reset_delta();
+  for (auto p : fnode_tail.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
+    }
+    fnode_persistent.append_extent(p);
+  }
+
+  // 1.7 Encode new log fnode
+  // This will flush incremental part of fnode_persistent only.
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
+
+  // 1.8 Encode compacted meta transaction
+  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
+  // hopefully "compact_meta_need" estimation provides enough extra space
+  // for this op, assert below if not
+  compacted_meta_t.op_jump_seq(log.seq_live);
+
+  bufferlist compacted_meta_bl;
+  encode(compacted_meta_t, compacted_meta_bl);
+  _pad_bl(compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+
+  //
+  // Part 2
+  // Write out new log's content
+  // 2.1. Build the full runtime new log's fnode
+  //
+  // 2.2. Write out new log's
+  //
+  // 2.3. Do flush and wait for completion through flush_bdev()
+  //
+  // 2.4. Finalize log update
+  //      Update all sequence numbers
+  //
+
+  // 2.1 Build the full runtime new log's fnode
+  // The new fnode (starter + tail extents) is assembled in old_log_fnode and
+  // then swapped with the log file's fnode; after the swap old_log_fnode
+  // holds the previous log fnode, as its name says.
+  bluefs_fnode_t old_log_fnode;
+  old_log_fnode.swap(fnode_starter);
+  old_log_fnode.clone_extents(fnode_tail);
+  old_log_fnode.reset_delta();
+  log_file->fnode.swap(old_log_fnode);
+
+  // 2.2 Write out new log's content
+  // Get rid off old writer
+  _close_writer(log.writer);
+  // Make new log writer and stage new log's content writing
+  log.writer = _create_writer(log_file);
+  log.writer->append(starter_bl);
+  log.writer->append(compacted_meta_bl);
+
+  // 2.3 Do flush and wait for completion through flush_bdev()
+  _flush_special(log.writer);
+#ifdef HAVE_LIBAIO
+  if (!cct->_conf->bluefs_sync_write) {
+    list<aio_t> completed_ios;
+    _claim_completed_aios(log.writer, &completed_ios);
+    _wait_for_aio(log.writer);
+    completed_ios.clear();
+  }
+#endif
+  _flush_bdev();
+
+  // 2.4 Finalize log update
+  ++log.seq_live;
+  dirty.seq_live = log.seq_live;
+  log.t.seq = log.seq_live;
+  // move the log file's accounted usage from the old extents to the new ones
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+
+  // Part 3.
+  // Write out new superblock to reflect all the changes.
+  //
+
+  super.memorized_layout = layout;
+  _write_super(super_dev);
+  _flush_bdev();
+
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  // Part 4
+  // Finalization. Release old space.
+  //
+  {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
+    std::lock_guard dl(dirty.lock);
+    for (auto& r : old_log_fnode.extents) {
+      dirty.pending_release[r.bdev].insert(r.offset, r.length);
+    }
+  }
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
+}
+
+/*
+ * ASYNC LOG COMPACTION
+ *
+ * 0. Lock the log and forbid its extension. The former covers just
+ *    a part of the below procedure while the latter spans over it
+ *    completely.
+ * 1. Allocate a new extent to continue the log, and then log an event
+ *    that jumps the log write position to the new extent.  At this point, the
+ *    old extent(s) won't be written to, and reflect everything to compact.
+ *    New events will be written to the new region that we'll keep.
+ *    The latter will finally become new log tail on compaction completion.
+ *
+ * 2. Build new log. It will include log's starter, compacted metadata
+ *    body and the above tail. Jump ops appended to the starter and meta body
+ *    will link the pieces together. Log's lock is released in the middle of
+ *    the process to permit parallel access to it.
+ *
+ * 3. Write out new log's content.
+ *
+ * 4. Write out new superblock to reflect all the changes.
+ *
+ * 5. Apply new log fnode, log is locked for a while.
+ *
+ * 6. Finalization. Clean up, old space release and total unlocking.
+ */
+
+void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
+{
+  dout(10) << __func__ << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+  uint64_t old_log_jump_to = 0;
+
+  // Part 0.
+  // Lock the log and forbid its expansion and other compactions
+
+  // only one compaction allowed at one time
+  bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
+  if (old_is_comp) {
+    dout(10) << __func__ << " ongoing" <<dendl;
+    return;
+  }
+  // lock log's run-time structures for a while
+  log.lock.lock();
+  auto t0 = mono_clock::now();
+
+  // Part 1.
+  // Prepare current log for jumping into it.
+  // 1. Allocate extent
+  // 2. Update op to log
+  // 3. Jump op to log
+  // During that, no one else can write to log, otherwise we risk jumping backwards.
+  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
+
+  //signal _maybe_extend_log that expansion of log is temporary inacceptable
+  bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
+  ceph_assert(old_forbidden == false);
+
+  //
+  // Part 1.
+  // Prepare current log for jumping into it.
+  // 1.1. Allocate extent
+  // 1.2. Save log's fnode extents and add new extents
+  // 1.3. Update op to log
+  // 1.4. Jump op to log
+  // During that, no one else can write to log, otherwise we risk jumping backwards.
+  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
+
+  // 1.1 allocate new log extents and store them at fnode_tail
+  File *log_file = log.writer->file.get();
+  old_log_jump_to = log_file->fnode.get_allocated();
+  bluefs_fnode_t fnode_tail;
+  uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
+  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
+           << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
+  int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+		    cct->_conf->bluefs_max_log_runway,
+                    0,
+                    &fnode_tail);
+  ceph_assert(r == 0);
+
+  // 1.2 save log's fnode extents and add new extents
+  bluefs_fnode_t old_log_fnode(log_file->fnode);
+  log_file->fnode.clone_extents(fnode_tail);
+  //adjust usage as flush below will need it
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+
+  // 1.3 update the log file change and log a jump to the offset where we want to
+  // write the new entries
+  log.t.op_file_update_inc(log_file->fnode);
+
+  // 1.4 jump to new position should mean next seq
+  log.t.op_jump(log.seq_live + 1, old_log_jump_to);
+  uint64_t seq_now = log.seq_live;
+  // we need to flush all bdev because we will be streaming all dirty files to log
+  // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
+  // then flush_bdev() will not be necessary
+  _flush_bdev();
+  _flush_and_sync_log_jump_D(old_log_jump_to, runway);
+
+  //
+  // Part 2.
+  // Build new log starter and compacted metadata body
+  // 2.1.  Build full compacted meta transaction.
+  //       While still holding the lock, encode a bluefs transaction
+  //       that dumps all of the in-memory fnodes and names.
+  //       This might be pretty large and its allocation map can exceed
+  //       superblock size. Hence instead we'll need log starter part which
+  //       goes to superblock and refers that new meta through op_update_inc.
+  // 2.2.  After releasing the lock allocate space for the above transaction
+  //       using its size estimation.
+  //       Then build tailing list of extents which consists of these
+  //       newly allocated extents followed by ones from Part 1.
+  // 2.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  //       Effectively we start building new log fnode here.
+  // 2.4.  Store starter fnode to run-time superblock, to be written out later
+  // 2.5.  Finalize new log's fnode building
+  //       This will include log's starter and tailing extents built at 2.2
+  // 2.6.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  // 2.7.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to the log tail from 1.1 before
+  //       encoding.
+  //
+
+  // 2.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
+
+  // now state is captured to compacted_meta_t,
+  // current log can be used to write to,
+  //ops in log will be continuation of captured state
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
+  log.lock.unlock();
+
+  // 2.2 Allocate the space required for the compacted meta transaction
+  uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
+           << dendl;
+  {
+    bluefs_fnode_t fnode_pre_tail;
+    // do allocate
+    r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                  compacted_meta_need,
+                  0,
+                  &fnode_pre_tail);
+    ceph_assert(r == 0);
+    // build trailing list of extents in fnode_tail,
+    // this will include newly allocated extents for compacted meta
+    // and aux extents allocated at step 1.1
+    fnode_pre_tail.claim_extents(fnode_tail.extents);
+    fnode_tail.swap_extents(fnode_pre_tail);
+  }
+
+  // 2.3 Allocate the space required for the starter part of the new log.
+  // Start building New log fnode
+  FileRef new_log = nullptr;
+  new_log = ceph::make_ref<File>();
+  new_log->fnode.ino = log_file->fnode.ino;
+  new_log->fnode.mtime = mtime;
+  // Estimate the required space
+  uint64_t starter_need =
+    _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
+  // and now allocate and store at new_log_fnode
+  r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                starter_need,
+                0,
+                &new_log->fnode);
+  ceph_assert(r == 0);
+
+  // 2.4 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = new_log->fnode;
+
+  // 2.5 Finalize new log's fnode building
+  // start collecting new log fnode updates (to make op_update_inc later)
+  // since this point. This will include compacted meta from 2.2 and aux
+  // extents from 1.1.
+  new_log->fnode.reset_delta();
+  new_log->fnode.claim_extents(fnode_tail.extents);
+
+  // 2.6 Encode new log fnode
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
+    &starter_bl);
+
+  // 2.7 Encode compacted meta transaction,
+  dout(20) << __func__
+           << " new_log jump seq " << seq_now
+           << std::hex << " offset 0x" << starter_need + compacted_meta_need
+	   << std::dec << dendl;
+  // Extend compacted_meta transaction with a jump to the new log tail.
+  // Hopefully "compact_meta_need" estimation provides enough extra space
+  // for this new jump, assert below if not
+  compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
+  // Now do encoding and padding
+  bufferlist compacted_meta_bl;
+  compacted_meta_bl.reserve(compacted_meta_need);
+  encode(compacted_meta_t, compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+  _pad_bl(compacted_meta_bl, compacted_meta_need);
+
+  //
+  // Part 3.
+  // Write out new log's content
+  // 3.1 Stage new log's content writing
+  // 3.2 Do flush and wait for completion through flush_bdev()
+  //
+
+  // 3.1 Stage new log's content writing
+  // Make new log writer and append bufferlists to write out.
+  FileWriter *new_log_writer = _create_writer(new_log);
+  // And append all new log's bufferlists to write out.
+  new_log_writer->append(starter_bl);
+  new_log_writer->append(compacted_meta_bl);
+
+  // 3.2. flush and wait
+  _flush_special(new_log_writer);
+  _flush_bdev(new_log_writer, false); // do not check log.lock is locked
+
+  // Part 4.
+  // Write out new superblock to reflect all the changes.
+  //
+
+  _write_super(BDEV_DB);
+  _flush_bdev();
+
+  // Part 5.
+  // Apply new log fnode
+  //
+
+  // we need to acquire log's lock back at this point
+  log.lock.lock();
+  // Reconstruct actual log object from the new one.
+  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+  log_file->fnode.size =
+    log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
+  log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
+  log_file->fnode.swap_extents(new_log->fnode);
+  // update log's writer
+  log.writer->pos = log.writer->file->fnode.size;
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
+  // and unlock
+  log.lock.unlock();
+
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  //Part 6.
+  // Finalization
+  // 6.1 Permit log's extension, forbidden at step 0.
+  //
+  // 6.2 Release the new log writer
+  //
+  // 6.3 Release old space
+  //
+  // 6.4. Enable other compactions
+  //
+
+  // 6.1 Permit log's extension, forbidden at step 0.
+  old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
+  ceph_assert(old_forbidden == true);
+  //to wake up if someone was in need of expanding log
+  log_cond.notify_all();
+
+  // 6.2 Release the new log writer
+  _close_writer(new_log_writer);
+  new_log_writer = nullptr;
+  new_log = nullptr;
+
+  // 6.3 Release old space
+  {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
+    std::lock_guard dl(dirty.lock);
+    for (auto& r : old_log_fnode.extents) {
+      dirty.pending_release[r.bdev].insert(r.offset, r.length);
+    }
+  }
+
+  // 6.4. Enable other compactions
+  old_is_comp = atomic_exchange(&log_is_compacting, false);
+  ceph_assert(old_is_comp);
+}
+
+void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
+{
+  // Zero-fill bl up to the next multiple of max(pad_size, block size).
+  const uint64_t unit = std::max(pad_size, uint64_t(super.block_size));
+  if (const uint64_t tail = bl.length() % unit; tail != 0) {
+    const uint64_t fill = unit - tail;
+    dout(10) << __func__ << " padding with 0x" << std::hex << fill << " zeros" << std::dec << dendl;
+    bl.append_zero(fill);
+  }
+}
+
+
+// Advances the live sequence number of both dirty and log; returns the previous one.
+uint64_t BlueFS::_log_advance_seq()
+{
+  // Caller holds both locks; the two live counters are kept in lockstep.
+  ceph_assert(ceph_mutex_is_locked(dirty.lock));
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+  ceph_assert(dirty.seq_stable < dirty.seq_live);
+  ceph_assert(log.t.seq == log.seq_live);
+  // The pending transaction keeps this seq; it will become seq_stable once
+  // the transaction is written.
+  const uint64_t closed_seq = log.seq_live;
+  log.t.uuid = super.uuid;
+  ++log.seq_live;
+  ++dirty.seq_live;
+  ceph_assert(dirty.seq_live == log.seq_live);
+  return closed_seq;
+}
+
+
+// Appends an op_file_update_inc to log.t for every file queued in
+// dirty.files[seq]; ops already stored in log.t are left untouched.
+void BlueFS::_consume_dirty(uint64_t seq)
+{
+  ceph_assert(ceph_mutex_is_locked(dirty.lock));
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+
+  // The caller has just advanced the live seq, so nothing may be added to
+  // dirty.files[seq] any more.
+  const auto bucket = dirty.files.find(seq);
+  if (bucket == dirty.files.end()) {
+    return;
+  }
+  auto& dirty_list = bucket->second;
+  dout(20) << __func__ << " " << dirty_list.size() << " dirty.files" << dendl;
+  // fnode is protected indirectly: files are added to dirty.files only from
+  // _fsync(), under the writer lock, which waits for the log sync to finish.
+  for (auto& file : dirty_list) {
+    dout(20) << __func__ << "   op_file_update_inc " << file.fnode << dendl;
+    log.t.op_file_update_inc(file.fnode);
+  }
+}
+
+// Extends log if its free space is smaller than bluefs_min_log_runway.
+// Returns space available *BEFORE* adding new space; signed so callers can detect <0 (-EWOULDBLOCK: extension needed but currently forbidden).
+int64_t BlueFS::_maybe_extend_log()
+{
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+  // allocate some more space (before we run out)?
+  // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
+  int64_t runway = log.writer->file->fnode.get_allocated() -
+    log.writer->get_effective_write_pos();
+  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
+    dout(10) << __func__ << " allocating more log runway (0x"
+	     << std::hex << runway << std::dec  << " remaining)" << dendl;
+    /*
+     * Usually, when we are low on space in log, we just allocate new extent,
+     * put update op(log) to log and we are fine.
+     * Problem - it interferes with log compaction:
+     * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
+     * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
+     * full state capture into new log.
+     * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
+     * both logs, but old log is different than new log.
+     *
+     * Possible solutions:
+     * - stall extending log until we finish compacting and switch log (CURRENT)
+     * - re-run compaction with more runway for old log
+     * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
+     */
+    if (log_forbidden_to_expand.load() == true) {
+      return -EWOULDBLOCK; // nothing was allocated; _flush_and_sync_log_LD waits on log_cond and retries
+    }
+    vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode); // re-added below once the new extents are in the fnode
+    int r = _allocate(
+      vselector->select_prefer_bdev(log.writer->file->vselector_hint),
+      cct->_conf->bluefs_max_log_runway,
+      0,
+      &log.writer->file->fnode);
+    ceph_assert(r == 0);
+    vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
+    log.t.op_file_update_inc(log.writer->file->fnode); // record the grown log fnode in the pending transaction
+  }
+  return runway;
+}
+
+void BlueFS::_flush_and_sync_log_core(int64_t runway)
+{
+  // Encodes the pending log transaction, pads it to a block boundary,
+  // appends it to the log writer and flushes; log.t is reset for reuse.
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+  dout(10) << __func__ << " " << log.t << dendl;
+
+  bufferlist bl;
+  bl.reserve(super.block_size);
+  encode(log.t, bl);
+  // zero-fill the last, partially used block
+  if (const size_t used = bl.length() % super.block_size; used != 0) {
+    bl.append_zero(super.block_size - used);
+  }
+
+  logger->inc(l_bluefs_log_write_count, 1);
+  logger->inc(l_bluefs_logged_bytes, bl.length());
+
+  // The transaction has to fit into the runway that existed before any log
+  // growth; otherwise it cannot be read back by _replay -> data loss.
+  ceph_assert(bl.length() <= runway);
+  log.writer->append(bl);
+
+  // start a fresh transaction for the now-live sequence number
+  log.t.clear();
+  log.t.seq = log.seq_live;
+
+  const uint64_t grown_by = _flush_special(log.writer);
+  vselector->add_usage(log.writer->file->vselector_hint, grown_by);
+}
+
+// Raises dirty.seq_stable to seq and clears dirty.files up to (including) seq_stable; takes dirty.lock itself.
+void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
+{
+  std::lock_guard dl(dirty.lock);
+  // seq_stable only moves forward here; an older or equal seq takes the else branch and changes nothing
+  // clean dirty files
+  if (seq > dirty.seq_stable) {
+    dirty.seq_stable = seq;
+    dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
+    // the early break below relies on dirty.files iterating in ascending seq order
+    // undirty all files that were already streamed to log
+    auto p = dirty.files.begin();
+    while (p != dirty.files.end()) {
+      if (p->first > dirty.seq_stable) {
+        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
+        break;
+      }
+      // unlink every file that was queued under this seq
+      auto l = p->second.begin();
+      while (l != p->second.end()) {
+        File *file = &*l;
+        ceph_assert(file->dirty_seq <= dirty.seq_stable);
+        dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
+        file->dirty_seq = dirty.seq_stable;
+        p->second.erase(l++); // advance first, then erase the element the old iterator pointed at
+      }
+      // the per-seq list is empty now, so the map entry itself can go
+      ceph_assert(p->second.empty());
+      dirty.files.erase(p++);
+    }
+  } else {
+    dout(20) << __func__ << " seq_stable " << dirty.seq_stable
+             << " already >= out seq " << seq
+             << ", we lost a race against another log flush, done" << dendl;
+  }
+}
+
+void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
+{
+  // Per device: queue freed ranges for discard, or release them to the allocator.
+  for (unsigned dev = 0; dev < to_release.size(); ++dev) {
+    auto& ranges = to_release[dev]; // a non-empty set guarantees alloc[dev] is not null
+    if (ranges.empty()) {
+      continue;
+    }
+    if (bdev[dev]->try_discard(ranges)) {
+      continue; // queued for discard; nothing is released here
+    }
+    alloc[dev]->release(ranges);
+    if (is_shared_alloc(dev)) {
+      shared_alloc->bluefs_used -= ranges.size();
+    }
+  }
+}
+
+int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
+{ // Writes the pending log transaction and flushes it to the device; takes log.lock and dirty.lock itself and always returns 0.
+  int64_t available_runway;
+  do {
+    log.lock.lock();
+    dirty.lock.lock();
+    if (want_seq && want_seq <= dirty.seq_stable) { // want_seq == 0 skips this shortcut
+      dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
+	       << dirty.seq_stable << ", done" << dendl;
+      dirty.lock.unlock();
+      log.lock.unlock();
+      return 0;
+    }
+    // make sure the log has runway; a negative result means extension is currently forbidden
+    available_runway = _maybe_extend_log();
+    if (available_runway == -EWOULDBLOCK) {
+      // we are in need of adding runway, but we are during log-switch from compaction
+      dirty.lock.unlock();
+      //instead log.lock.unlock() do move ownership
+      std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
+      while (log_forbidden_to_expand.load()) {
+	log_cond.wait(ll);
+      }
+    } else { // ll went out of scope at the brace on this line and released log.lock; the loop re-takes both locks
+      ceph_assert(available_runway >= 0);
+    }
+  } while (available_runway < 0);
+  // from here on both log.lock and dirty.lock are held
+  ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
+  uint64_t seq =_log_advance_seq();
+  _consume_dirty(seq);
+  vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
+  to_release.swap(dirty.pending_release); // take the pending releases; dirty.pending_release is left with empty sets
+  dirty.lock.unlock();
+  // the write itself runs under log.lock only
+  _flush_and_sync_log_core(available_runway);
+  _flush_bdev(log.writer);
+  logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
+  //now log.lock is no longer needed
+  log.lock.unlock();
+  // both calls run only after the log write has been flushed to the device
+  _clear_dirty_set_stable_D(seq);
+  _release_pending_allocations(to_release);
+
+  _update_logger_stats();
+  return 0;
+}
+
+// Flushes log and immediately adjusts log_writer pos. Caller holds log.lock; dirty.lock is taken here. Always returns 0.
+int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
+				     int64_t available_runway)
+{
+  ceph_assert(ceph_mutex_is_locked(log.lock));
+  // a zero jump target is rejected
+  ceph_assert(jump_to);
+  // we synchronize writing to log, by lock to log.lock
+  // dirty.lock is held only while the seq is advanced and the dirty files are moved into log.t
+  dirty.lock.lock();
+  uint64_t seq =_log_advance_seq();
+  _consume_dirty(seq);
+  vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
+  to_release.swap(dirty.pending_release);
+  dirty.lock.unlock();
+  _flush_and_sync_log_core(available_runway);
+  // the transaction is staged; now move the writer position and the log size to jump_to
+  dout(10) << __func__ << " jumping log offset from 0x" << std::hex
+	   << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
+  log.writer->pos = jump_to;
+  vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size); // swap the old size for the new one in the usage stats
+  log.writer->file->fnode.size = jump_to;
+  vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
+  // wait for the staged log write to reach the device
+  _flush_bdev(log.writer);
+  // both calls run only after the flush above
+  _clear_dirty_set_stable_D(seq);
+  _release_pending_allocations(to_release);
+
+  logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
+  _update_logger_stats();
+  return 0;
+}
+
+ceph::bufferlist BlueFS::FileWriter::flush_buffer(
+  CephContext* const cct,
+  const bool partial,
+  const unsigned length,
+  const bluefs_super_t& super)
+{ // Builds and returns the bufferlist to write: the cached tail block (if `partial`) plus buffered data up to `length` bytes, zero-padded to a block boundary.
+  ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
+  ceph::bufferlist bl;
+  if (partial) { // start the output with the whole cached tail block, emptying tail_block
+    tail_block.splice(0, tail_block.length(), &bl);
+  }
+  const auto remaining_len = length - bl.length(); // bytes still to be taken from `buffer`
+  buffer.splice(0, remaining_len, &bl);
+  if (buffer.length()) {
+    dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
+             << " unflushed" << dendl;
+  }
+  if (const unsigned tail = bl.length() & ~super.block_mask(); tail) { // tail: bytes in the last, incomplete block (assuming block_mask() clears the in-block bits)
+    const auto padding_len = super.block_size - tail;
+    dout(20) << __func__ << " caching tail of 0x"
+             << std::hex << tail
+             << " and padding block with 0x" << padding_len
+             << " buffer.length() " << buffer.length()
+             << std::dec << dendl;
+    // We need to go through the `buffer_appender` to get a chance to
+    // preserve in-memory contiguity and not mess with the alignment.
+    // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
+    buffer_appender.append_zero(padding_len);
+    buffer.splice(buffer.length() - padding_len, padding_len, &bl); // move the zeros just appended from `buffer` to the end of bl
+    // Deep copy the tail here. This allows to avoid costlier copy on
+    // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
+    // of memory allocations.
+    // The alternative approach would be to place the entire tail and
+    // padding on a dedicated, 4 KB long memory chunk. This shouldn't
+    // trigger the rebuild while still being less expensive.
+    buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail); // the `tail` bytes that precede the padding in bl
+    buffer.splice(buffer.length() - tail, tail, &tail_block); // keep that copy in tail_block for the next partial flush
+  } else { // output ends on a block boundary: nothing to remember
+    tail_block.clear();
+  }
+  return bl;
+}
+
+int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
+{
+  // Queues h->file in dirty.files under the live seq so that a later log flush
+  // (_consume_dirty) records its fnode; returns 0. Caller holds h->lock; takes dirty.lock.
+  ceph_assert(ceph_mutex_is_locked(h->lock));
+  std::lock_guard dl(dirty.lock);
+  if (h->file->deleted) {
+    dout(10) << __func__ << "  deleted, no-op" << dendl;
+    return 0;
+  }
+  h->file->fnode.mtime = ceph_clock_now();
+  ceph_assert(h->file->fnode.ino >= 1);
+  const uint64_t prev_seq = h->file->dirty_seq; // saved: dirty_seq is overwritten before it is logged as "was"
+  if (prev_seq <= dirty.seq_stable) {
+    h->file->dirty_seq = dirty.seq_live;
+    dirty.files[h->file->dirty_seq].push_back(*h->file);
+    dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
+	     << " (was clean)" << dendl;
+  } else if (prev_seq != dirty.seq_live) {
+    // need re-dirty, erase from list first
+    ceph_assert(dirty.files.count(h->file->dirty_seq));
+    auto it = dirty.files[prev_seq].iterator_to(*h->file);
+    dirty.files[prev_seq].erase(it);
+    h->file->dirty_seq = dirty.seq_live;
+    dirty.files[h->file->dirty_seq].push_back(*h->file);
+    dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
+	     << " (was " << prev_seq << ")" << dendl;
+  } else {
+    dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
+	     << " (unchanged, do nothing) " << dendl;
+  }
+  return 0;
+}
+
+void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
+{ // Takes h->lock and flushes the range starting at offset, length bytes long, via _flush_range_F; that call's result is discarded.
+  _maybe_check_vselector_LNF();
+  std::unique_lock hl(h->lock);
+  _flush_range_F(h, offset, length);
+}
+
+int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
+{ // Writes buffered data for the given range of a regular file (ino > 1), allocating extents and growing fnode.size as needed. Caller holds h->lock.
+  ceph_assert(ceph_mutex_is_locked(h->lock));
+  ceph_assert(h->file->num_readers.load() == 0);
+  ceph_assert(h->file->fnode.ino > 1);
+
+  dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
+	   << " 0x" << offset << "~" << length << std::dec
+	   << " to " << h->file->fnode << dendl;
+  if (h->file->deleted) {
+    dout(10) << __func__ << "  deleted, no-op" << dendl;
+    return 0;
+  }
+
+  bool buffered = cct->_conf->bluefs_buffered_io;
+  // a range that ends at or before the current write position needs no work
+  if (offset + length <= h->pos)
+    return 0;
+  if (offset < h->pos) { // clip the range so that it starts at the current write position
+    length -= h->pos - offset;
+    offset = h->pos;
+    dout(10) << " still need 0x"
+             << std::hex << offset << "~" << length << std::dec
+             << dendl;
+  }
+  std::lock_guard file_lock(h->file->lock); // held until return, covering the fnode changes and _flush_data
+  ceph_assert(offset <= h->file->fnode.size);
+
+  uint64_t allocated = h->file->fnode.get_allocated();
+  vselector->sub_usage(h->file->vselector_hint, h->file->fnode); // re-added with the updated fnode before returning
+  // do not bother to dirty the file if we are overwriting
+  // previously allocated extents.
+  if (allocated < offset + length) {
+    // we should never run out of log space here; see the min runway check
+    // in _flush_and_sync_log.
+    int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
+		      offset + length - allocated,
+                      0,
+		      &h->file->fnode);
+    if (r < 0) {
+      derr << __func__ << " allocated: 0x" << std::hex << allocated
+           << " offset: 0x" << offset << " length: 0x" << length << std::dec
+           << dendl;
+      vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
+      ceph_abort_msg("bluefs enospc");
+      return r; // presumably unreachable if ceph_abort_msg never returns - confirm
+    }
+    h->file->is_dirty = true;
+  }
+  if (h->file->fnode.size < offset + length) { // the write extends the file: grow the size and mark the file dirty
+    h->file->fnode.size = offset + length;
+    h->file->is_dirty = true;
+  }
+
+  dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
+  int res = _flush_data(h, offset, length, buffered);
+  vselector->add_usage(h->file->vselector_hint, h->file->fnode);
+  return res;
+}
+
+int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
+{ // Takes `length` buffered bytes from h, block-aligns them and issues the device writes extent by extent starting at file offset `offset`; always returns 0.
+  if (h->file->fnode.ino > 1) { // the lock checks apply to regular files only
+    ceph_assert(ceph_mutex_is_locked(h->lock));
+    ceph_assert(ceph_mutex_is_locked(h->file->lock));
+  }
+  uint64_t x_off = 0;
+  auto p = h->file->fnode.seek(offset, &x_off); // p: extent holding `offset`; x_off: offset inside that extent
+  ceph_assert(p != h->file->fnode.extents.end());
+  dout(20) << __func__ << " in " << *p << " x_off 0x"
+           << std::hex << x_off << std::dec << dendl;
+
+  unsigned partial = x_off & ~super.block_mask(); // presumably the number of bytes the start sits past a block boundary - confirm block_mask() semantics
+  if (partial) {
+    dout(20) << __func__ << " using partial tail 0x"
+             << std::hex << partial << std::dec << dendl;
+    x_off -= partial; // pull the start back by `partial`; flush_buffer re-emits the cached tail block first
+    offset -= partial;
+    length += partial;
+    dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
+    for (auto p : h->iocv) { // NOTE: this `p` shadows the extent iterator declared above
+      if (p) {
+	p->aio_wait();
+      }
+    }
+  }
+
+  auto bl = h->flush_buffer(cct, partial, length, super);
+  ceph_assert(bl.length() >= length);
+  h->pos = offset + length; // the write position advances by the unpadded length
+  length = bl.length(); // from here on `length` is the padded number of bytes to write
+
+  logger->inc(l_bluefs_write_count, 1);
+  logger->inc(l_bluefs_write_bytes, length);
+
+  switch (h->writer_type) { // per-writer-type counters; other writer types are not counted here
+  case WRITER_WAL:
+    logger->inc(l_bluefs_write_count_wal, 1);
+    logger->inc(l_bluefs_bytes_written_wal, length);
+    break;
+  case WRITER_SST:
+    logger->inc(l_bluefs_write_count_sst, 1);
+    logger->inc(l_bluefs_bytes_written_sst, length);
+    break;
+  }
+
+  dout(30) << "dump:\n";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  uint64_t bloff = 0; // offset into bl of the next chunk to write
+  uint64_t bytes_written_slow = 0;
+  while (length > 0) { // one write per extent touched
+    logger->inc(l_bluefs_write_disk_count, 1);
+
+    uint64_t x_len = std::min(p->length - x_off, length); // what fits into the rest of this extent
+    bufferlist t;
+    t.substr_of(bl, bloff, x_len);
+    if (cct->_conf->bluefs_sync_write) {
+      bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
+    } else {
+      bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
+    }
+    h->dirty_devs[p->bdev] = true; // read later by _flush_bdev(h) to pick the devices to flush
+    if (p->bdev == BDEV_SLOW) {
+      bytes_written_slow += t.length();
+    }
+
+    bloff += x_len;
+    length -= x_len;
+    ++p; // no end() check: the loop relies on the extents covering the whole padded range
+    x_off = 0;
+  }
+  if (bytes_written_slow) {
+    logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
+  }
+  for (unsigned i = 0; i < MAX_BDEV; ++i) { // submit the aios queued above; completion is not awaited here
+    if (bdev[i]) {
+      if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
+        bdev[i]->aio_submit(h->iocv[i]);
+      }
+    }
+  }
+  dout(20) << __func__ << " h " << h << " pos now 0x"
+           << std::hex << h->pos << std::dec << dendl;
+  return 0;
+}
+
+#ifdef HAVE_LIBAIO
+// Moves the running_aios of every IOContext of h onto the tail of *ls, so old
+// aios (along with their bufferlist refs) do not stick around in memory.
+void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
+{
+  for (auto ioc : h->iocv) {
+    // slots without an IOContext are skipped
+    if (!ioc) continue;
+    ls->splice(ls->end(), ioc->running_aios);
+  }
+  dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
+}
+
+void BlueFS::_wait_for_aio(FileWriter *h)
+{
+  // Blocks until the aios of every IOContext of h have completed (aio_wait on each).
+  // NOTE: this is safe to call without a lock, as long as our reference is stable.
+  utime_t start;
+  lgeneric_subdout(cct, bluefs, 10) << __func__;
+  start = ceph_clock_now(); // NOTE(review): sits between the dout opener and dendl, so presumably runs only when level 10 logging is gathered - confirm
+  *_dout << " " << h << dendl;
+  for (auto p : h->iocv) {
+    if (p) {
+      p->aio_wait();
+    }
+  }
+  dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
+}
+#endif
+
+void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
+{ // Appends len bytes from buf to h's buffer, flushing whenever the buffer reaches bluefs_min_flush_size or the 1GB cap.
+  bool flushed_sum = false; // true once any flush in the loop actually wrote data
+  {
+    std::unique_lock hl(h->lock);
+    size_t max_size = 1ull << 30; // cap to 1GB
+    while (len > 0) {
+      bool need_flush = true; // stays true when the buffer is already at the cap and nothing can be appended
+      auto l0 = h->get_buffer_length();
+      if (l0 < max_size) {
+	size_t l = std::min(len, max_size - l0); // append only as much as keeps the buffer within the cap
+	h->append(buf, l);
+	buf += l;
+	len -= l;
+	need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
+      }
+      if (need_flush) {
+	bool flushed = false;
+	int r = _flush_F(h, true, &flushed);
+	ceph_assert(r == 0);
+	flushed_sum |= flushed;
+	// make sure we've made any progress with flush hence the
+	// loop doesn't iterate forever
+	ceph_assert(h->get_buffer_length() < max_size);
+      }
+    }
+  }
+  if (flushed_sum) { // h->lock has been released by now
+    _maybe_compact_log_LNF_NF_LD_D();
+  }
+}
+
+void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
+{
+  // Flush under h->lock, then consider log compaction once the lock is dropped.
+  int r = 0;
+  bool did_flush = false;
+  {
+    std::unique_lock hl(h->lock);
+    r = _flush_F(h, force, &did_flush);
+    ceph_assert(r == 0);
+  }
+  if (r != 0 || !did_flush) return;
+  _maybe_compact_log_LNF_NF_LD_D();
+}
+
+int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
+{
+  // Flushes everything buffered in h from h->pos on; without `force`, a buffer below
+  // bluefs_min_flush_size is left alone. *flushed (optional) reports if a flush was issued.
+  ceph_assert(ceph_mutex_is_locked(h->lock));
+  const uint64_t pending = h->get_buffer_length();
+  const uint64_t start = h->pos;
+  if (flushed != nullptr) {
+    *flushed = false;
+  }
+  if (!force && pending < cct->_conf->bluefs_min_flush_size) {
+    dout(10) << __func__ << " " << h << " ignoring, length " << pending
+             << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
+             << dendl;
+    return 0;
+  }
+  if (pending == 0) {
+    dout(10) << __func__ << " " << h << " no dirty data on " << h->file->fnode << dendl;
+    return 0;
+  }
+  dout(10) << __func__ << " " << h << " 0x"
+           << std::hex << start << "~" << pending << std::dec
+           << " to " << h->file->fnode << dendl;
+  ceph_assert(h->pos <= h->file->fnode.size);
+  const int rc = _flush_range_F(h, start, pending);
+  if (flushed != nullptr) {
+    *flushed = true;
+  }
+  return rc;
+}
+
+// Flush path for the bluefs special files (ino <= 1).
+// Adds no extents to h and does not mark it dirty: the log file (or its
+// compacting replacement) needs no dirtying when its size changes because
+// replay is smart enough to discover that on its own.
+// Returns the number of bytes by which fnode.size grew (0 if it did not).
+uint64_t BlueFS::_flush_special(FileWriter *h)
+{
+  ceph_assert(h->file->fnode.ino <= 1);
+  const uint64_t length = h->get_buffer_length();
+  const uint64_t offset = h->pos;
+  ceph_assert(length + offset <= h->file->fnode.get_allocated());
+  const uint64_t end = offset + length;
+  uint64_t grown_by = 0;
+  if (auto& size = h->file->fnode.size; size < end) {
+    grown_by = end - size;
+    size = end;
+  }
+  _flush_data(h, offset, length, false);
+  return grown_by;
+}
+
+int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
+{ // Shrinks h's file to `offset` bytes and records the new fnode in the pending log transaction; growing a file aborts.
+  std::lock_guard hl(h->lock);
+  dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
+           << " file " << h->file->fnode << dendl;
+  if (h->file->deleted) {
+    dout(10) << __func__ << "  deleted, no-op" << dendl;
+    return 0;
+  }
+
+  // we never truncate internal log files
+  ceph_assert(h->file->fnode.ino > 1);
+
+  // truncate off unflushed data?
+  if (h->pos < offset &&
+      h->pos + h->get_buffer_length() > offset) { // the cut would fall inside the unflushed buffer: not supported
+    dout(20) << __func__ << " tossing out last " << offset - h->pos
+	     << " unflushed bytes" << dendl;
+    ceph_abort_msg("actually this shouldn't happen");
+  }
+  if (h->get_buffer_length()) { // flush whatever is buffered before the size is changed
+    int r = _flush_F(h, true);
+    if (r < 0)
+      return r;
+  }
+  if (offset == h->file->fnode.size) {
+    return 0;  // no-op!
+  }
+  if (offset > h->file->fnode.size) {
+    ceph_abort_msg("truncate up not supported");
+  }
+  ceph_assert(h->file->fnode.size >= offset);
+  _flush_bdev(h); // flush outstanding writes of h before the new size is logged
+  // log.lock is taken while h->lock is still held
+  std::lock_guard ll(log.lock);
+  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size); // swap the old size for the new one in the usage stats
+  h->file->fnode.size = offset;
+  h->file->is_dirty = true;
+  vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
+  log.t.op_file_update_inc(h->file->fnode);
+  return 0;
+}
+
+int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
+{ // Flushes h's data to the device and, if the file's metadata is not yet stable, flushes the log up to its dirty seq.
+  _maybe_check_vselector_LNF();
+  std::unique_lock hl(h->lock); // held until return, including the log flush below
+  uint64_t old_dirty_seq = 0; // stays 0 when the file's metadata is already stable
+  {
+    dout(10) << __func__ << " " << h << " " << h->file->fnode
+             << " dirty " << h->file->is_dirty << dendl;
+    int r = _flush_F(h, true);
+    if (r < 0)
+      return r;
+    _flush_bdev(h);
+    if (h->file->is_dirty) { // queue the file in dirty.files under the live seq
+      _signal_dirty_to_log_D(h);
+      h->file->is_dirty = false;
+    }
+    {
+      std::lock_guard dl(dirty.lock);
+      if (dirty.seq_stable < h->file->dirty_seq) {
+	old_dirty_seq = h->file->dirty_seq;
+	dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
+		 << ") on " << h->file->fnode << ", flushing log" << dendl;
+      }
+    }
+  }
+  if (old_dirty_seq) {
+    _flush_and_sync_log_LD(old_dirty_seq); // its return value is not checked
+  }
+  _maybe_compact_log_LNF_NF_LD_D();
+  // errors are reported only for the data flush above
+  return 0;
+}
+
+// Waits for h's outstanding aios and flushes the devices h wrote to. Be careful - h->lock (ino > 1) or log.lock (ino == 1) must be taken; this is asserted unless check_mutext_locked is false.
+void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked)
+{
+  if (check_mutext_locked) {
+    if (h->file->fnode.ino > 1) {
+      ceph_assert(ceph_mutex_is_locked(h->lock));
+    } else if (h->file->fnode.ino == 1) {
+      ceph_assert(ceph_mutex_is_locked(log.lock));
+    }
+  }
+  std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs; // snapshot, then reset the writer's flags
+  h->dirty_devs.fill(false);
+#ifdef HAVE_LIBAIO
+  if (!cct->_conf->bluefs_sync_write) {
+    list<aio_t> completed_ios;
+    _claim_completed_aios(h, &completed_ios);
+    _wait_for_aio(h);
+    completed_ios.clear(); // drop the claimed aios once the wait is over
+  }
+#endif
+  _flush_bdev(flush_devs);
+}
+
+void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
+{
+  // Flushes the flagged devices. NOTE: this is safe to call without a lock.
+  dout(20) << __func__ << dendl;
+  for (unsigned dev = 0; dev != dirty_bdevs.size(); ++dev) {
+    if (!dirty_bdevs[dev]) continue;
+    bdev[dev]->flush();
+  }
+}
+
+void BlueFS::_flush_bdev()
+{
+  // NOTE: this is safe to call without a lock.
+  dout(20) << __func__ << dendl;
+  for (unsigned dev = 0; dev < MAX_BDEV; ++dev) {
+    if (!bdev[dev]) continue;
+    // Allocating from BDEV_SLOW is unexpected, so that device is flushed
+    // only when some of its space is actually in use.
+    const bool unused_slow = (dev == BDEV_SLOW) && !_get_used(dev);
+    if (!unused_slow) bdev[dev]->flush();
+  }
+}
+
+const char* BlueFS::get_device_name(unsigned id)
+{
+  // Maps a device id to its symbolic name; ids >= MAX_BDEV yield "BDEV_INV".
+  static const char* const kNames[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
+  return id < MAX_BDEV ? kNames[id] : "BDEV_INV";
+}
+
+// Allocates at least `len` bytes on device `id` (rounded up to the allocation
+// unit) and appends the resulting extents to `node`. On failure it may retry
+// with the shared allocator's unit and, if permitted, on the next device id.
+// Returns 0 on success or -ENOSPC when the space could not be allocated.
+int BlueFS::_allocate(uint8_t id, uint64_t len,
+		      uint64_t alloc_unit,
+		      bluefs_fnode_t* node,
+                      size_t alloc_attempts,
+                      bool permit_dev_fallback)
+{
+  dout(10) << __func__ << " len 0x" << std::hex << len
+           << " au 0x" << alloc_unit
+           << std::dec << " from " << (int)id
+           << " cooldown " << cooldown_deadline
+           << dendl;
+  ceph_assert(id < alloc.size());
+  int64_t alloc_len = 0;
+  PExtentVector extents;
+  uint64_t hint = 0;
+  int64_t need = len;
+  bool shared = is_shared_alloc(id);
+  auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
+  bool was_cooldown = false;
+  if (alloc[id]) {
+    if (!alloc_unit) {
+      alloc_unit = alloc_size[id];
+    }
+    // do not attempt shared_allocator with bluefs alloc unit
+    // when cooling down, fallback to slow dev alloc unit.
+    if (shared && alloc_unit != shared_unit) {
+       if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
+           cooldown_deadline) {
+         logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+         alloc_unit = shared_unit;
+         was_cooldown = true;
+       } else if (cooldown_deadline.fetch_and(0)) {
+         // we might get false cooldown_deadline reset at this point
+         // but that's mostly harmless.
+         dout(1) << __func__ << " shared allocation cooldown period elapsed" << dendl;
+       }
+    }
+    need = round_up_to(len, alloc_unit);
+    if (!node->extents.empty() && node->extents.back().bdev == id) {
+      hint = node->extents.back().end();
+    }
+    ++alloc_attempts;
+    extents.reserve(4);  // 4 should be (more than) enough for most allocations
+    alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
+  }
+  if (alloc_len < 0 || alloc_len < need) {
+    if (alloc[id]) {
+      if (alloc_len > 0) {
+        alloc[id]->release(extents);
+      }
+      if (!was_cooldown && shared) {
+        auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
+        cooldown_deadline = delay_s +
+          duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
+        dout(1) << __func__ << " shared allocation cooldown set for "
+                << delay_s << "s" << dendl;
+      }
+      dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
+	      << " on bdev " << (int)id
+              << ", allocator name " << alloc[id]->get_name()
+              << ", allocator type " << alloc[id]->get_type()
+              << ", capacity 0x" << alloc[id]->get_capacity()
+              << ", block size 0x" << alloc[id]->get_block_size()
+              << ", alloc unit 0x" << alloc_unit
+              << ", free 0x" << alloc[id]->get_free()
+              << ", fragmentation " << alloc[id]->get_fragmentation()
+              << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
+	      << std::dec << dendl;
+    } else {
+      dout(20) << __func__ << " alloc-id not set on index="<< (int)id
+               << " unable to allocate 0x" << std::hex << need
+	       << " on bdev " << (int)id << std::dec << dendl;
+    }
+    if (alloc[id] && shared && alloc_unit != shared_unit) {
+      alloc_unit = shared_unit;
+      dout(20) << __func__ << " fallback to bdev "
+	       << (int)id
+               << " with alloc unit 0x" << std::hex << alloc_unit
+               << std::dec << dendl;
+      logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+      return _allocate(id,
+                       len,
+                       alloc_unit,
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
+    } else if (permit_dev_fallback && id != BDEV_SLOW &&
+               size_t(id) + 1 < alloc.size() && alloc[id + 1]) { // never index past the end of alloc
+      dout(20) << __func__ << " fallback to bdev " << (int)id + 1 << dendl;
+      if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
+        logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
+      }
+      return _allocate(id + 1,
+                       len,
+                       0, // back to default alloc unit
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
+    } else {
+      derr << __func__ << " allocation failed, needed 0x" << std::hex << need << dendl;
+    }
+    return -ENOSPC;
+  } else {
+    uint64_t used = _get_used(id);
+    if (max_bytes[id] < used) {
+      logger->set(max_bytes_pcounters[id], used);
+      max_bytes[id] = used;
+    }
+    if (shared) {
+      shared_alloc->bluefs_used += alloc_len;
+    }
+  }
+
+  for (auto& p : extents) {
+    node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+  }
+
+  return 0;
+}
+
+/**
+ * Make sure file `f` has space allocated for the byte range [off, off + len).
+ *
+ * Runs under log.lock and f->lock. If the file is flagged deleted, or its
+ * current allocation already covers the range, nothing is changed.
+ *
+ * @param f    file to extend (its fnode.ino must be > 1)
+ * @param off  start offset of the range
+ * @param len  length of the range
+ * @return 0 on success or no-op; the negative result of _allocate() otherwise
+ */
+int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
+{
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard file_guard(f->lock);
+  dout(10) << __func__ << " file " << f->fnode << " 0x"
+           << std::hex << off << "~" << len << std::dec << dendl;
+  if (f->deleted) {
+    dout(10) << __func__ << "  deleted, no-op" << dendl;
+    return 0;
+  }
+  ceph_assert(f->fnode.ino > 1);
+
+  const uint64_t have = f->fnode.get_allocated();
+  const uint64_t range_end = off + len;
+  if (range_end <= have) {
+    // already fully backed by allocated space
+    return 0;
+  }
+
+  // The fnode is about to change: remove it from the volume selector's
+  // accounting first and re-add it afterwards, regardless of the outcome.
+  vselector->sub_usage(f->vselector_hint, f->fnode);
+  const auto preferred_dev = vselector->select_prefer_bdev(f->vselector_hint);
+  const int rc = _allocate(preferred_dev,
+    range_end - have,
+    0,
+    &f->fnode);
+  vselector->add_usage(f->vselector_hint, f->fnode);
+  if (rc < 0) {
+    return rc;
+  }
+
+  // record the extended fnode in the pending log transaction
+  log.t.op_file_update_inc(f->fnode);
+  return 0;
+}
+
+// Flush pending BlueFS metadata.
+//
+// If the current log transaction is non-empty or there are dirty files, calls
+// _flush_bdev() followed by _flush_and_sync_log_LD(); otherwise only logs that
+// there is nothing to do. Unless avoid_compact is true, finishes by calling
+// _maybe_compact_log_LNF_NF_LD_D().
+void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
+{
+  bool can_skip_flush;
+  {
+    // Sample both conditions while holding log.lock and dirty.lock; the
+    // locks are released at the end of this scope, before any flushing.
+    std::lock_guard ll(log.lock);
+    std::lock_guard dl(dirty.lock);
+    can_skip_flush = log.t.empty() && dirty.files.empty();
+  }
+  if (can_skip_flush) {
+    dout(10) << __func__ << " - no pending log events" << dendl;
+  } else {
+    utime_t start;
+    // NOTE(review): `start` is assigned between the opening log macro and
+    // `dendl`, i.e. inside the log statement. Presumably the clock is then
+    // only read when this debug level is actually gathered -- confirm against
+    // the dout macro definitions before reordering these three lines.
+    lgeneric_subdout(cct, bluefs, 10) << __func__;
+    start = ceph_clock_now();
+    *_dout <<  dendl;
+    _flush_bdev(); // FIXME?
+    _flush_and_sync_log_LD();
+    dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
+  }
+
+  if (!avoid_compact) {
+    _maybe_compact_log_LNF_NF_LD_D();
+  }
+}
+
+// Compact the BlueFS log if compaction is not disabled by
+// bluefs_replay_recovery_disable_compact and _should_start_compact_log_L_N()
+// asks for it. bluefs_compact_log_sync selects the sync or async variant.
+// The elapsed time is reported to the l_bluefs_compaction_lat counter.
+void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
+{
+  if (cct->_conf->bluefs_replay_recovery_disable_compact) {
+    return;
+  }
+  if (!_should_start_compact_log_L_N()) {
+    return;
+  }
+
+  const auto started = mono_clock::now();
+  if (cct->_conf->bluefs_compact_log_sync) {
+    _compact_log_sync_LNF_LD();
+  } else {
+    _compact_log_async_LD_LNF_D();
+  }
+  logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - started);
+}
+
+// Open dirname/filename for writing and return a new FileWriter in *h.
+//
+// - The directory must already exist in nodes.dir_map, otherwise -ENOENT.
+// - File missing: with overwrite == true this is an error (-ENOENT);
+//   otherwise a new File with a fresh ino is created and linked into the dir.
+// - File present: with overwrite == true the existing contents are kept
+//   ("overwrite in place"); otherwise the file is truncated, i.e. its size is
+//   reset to 0 and its former extents are queued in dirty.pending_release.
+//
+// The metadata changes are recorded in the pending log transaction (log.t).
+// The writer type is chosen from the file name suffix (".log" -> WRITER_WAL,
+// ".sst" -> WRITER_SST). Returns 0 on success.
+int BlueFS::open_for_write(
+  std::string_view dirname,
+  std::string_view filename,
+  FileWriter **h,
+  bool overwrite)/*_LND*/
+{
+  _maybe_check_vselector_LNF();
+  FileRef file;
+  bool create = false;
+  bool truncate = false;
+  mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
+  {
+  // Everything in this scope runs under log.lock and nodes.lock
+  // (dirty.lock is added near the end of the scope).
+  std::lock_guard ll(log.lock);
+  std::lock_guard nl(nodes.lock);
+  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+  map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
+  DirRef dir;
+  if (p == nodes.dir_map.end()) {
+    // the dir is NOT created implicitly; a missing dir is an error
+    dout(20) << __func__ << "  dir " << dirname
+	     << " does not exist" << dendl;
+    return -ENOENT;
+  } else {
+    dir = p->second;
+  }
+
+  map<string,FileRef>::iterator q = dir->file_map.find(filename);
+  if (q == dir->file_map.end()) {
+    if (overwrite) {
+      // overwrite requires an existing file
+      dout(20) << __func__ << " dir " << dirname << " (" << dir
+	       << ") file " << filename
+	       << " does not exist" << dendl;
+      return -ENOENT;
+    }
+    // create a new file: allocate the next ino and register it both in the
+    // global ino map and in the directory
+    file = ceph::make_ref<File>();
+    file->fnode.ino = ++ino_last;
+    nodes.file_map[ino_last] = file;
+    dir->file_map[string{filename}] = file;
+    ++file->refs;
+    create = true;
+    logger->set(l_bluefs_num_files, nodes.file_map.size());
+  } else {
+    // overwrite existing file?
+    file = q->second;
+    if (overwrite) {
+      dout(20) << __func__ << " dir " << dirname << " (" << dir
+	       << ") file " << filename
+	       << " already exists, overwrite in place" << dendl;
+    } else {
+      dout(20) << __func__ << " dir " << dirname << " (" << dir
+	       << ") file " << filename
+	       << " already exists, truncate + overwrite" << dendl;
+      // take the old fnode out of the vselector accounting before it is
+      // emptied; it is re-added below via add_usage()
+      vselector->sub_usage(file->vselector_hint, file->fnode);
+      file->fnode.size = 0;
+      // steal the extents so they can be handed to dirty.pending_release
+      pending_release_extents.swap(file->fnode.extents);
+      truncate = true;
+
+      file->fnode.clear_extents();
+    }
+  }
+  ceph_assert(file->fnode.ino > 1);
+
+  file->fnode.mtime = ceph_clock_now();
+  file->vselector_hint = vselector->get_hint_by_dir(dirname);
+  if (create || truncate) {
+    vselector->add_usage(file->vselector_hint, file->fnode); // update file count
+  }
+
+  dout(20) << __func__ << " mapping " << dirname << "/" << filename
+	   << " vsel_hint " << file->vselector_hint
+	   << dendl;
+
+  // record the fnode (and, for new files, the dir link) in the log transaction
+  log.t.op_file_update(file->fnode);
+  if (create)
+    log.t.op_dir_link(dirname, filename, file->fnode.ino);
+
+  // queue the truncated file's former extents for release, under dirty.lock
+  std::lock_guard dl(dirty.lock);
+  for (auto& p : pending_release_extents) {
+    dirty.pending_release[p.bdev].insert(p.offset, p.length);
+  }
+  }
+  // the writer is created after the locks above have been released
+  *h = _create_writer(file);
+
+  // classify the writer by file name suffix and bump the matching counter
+  if (boost::algorithm::ends_with(filename, ".log")) {
+    (*h)->writer_type = BlueFS::WRITER_WAL;
+    if (logger && !overwrite) {
+      logger->inc(l_bluefs_files_written_wal);
+    }
+  } else if (boost::algorithm::ends_with(filename, ".sst")) {
+    (*h)->writer_type = BlueFS::WRITER_SST;
+    if (logger) {
+      logger->inc(l_bluefs_files_written_sst);
+    }
+  }
+
+  dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
+  return 0;
+}
+
+// Allocate a FileWriter for `f` and give it one IOContext per block device
+// slot that is populated in bdev[]. The caller owns the returned writer.
+BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
+{
+  auto *writer = new FileWriter(f);
+  for (unsigned dev = 0; dev < MAX_BDEV; ++dev) {
+    if (!bdev[dev]) {
+      continue;  // no device in this slot
+    }
+    writer->iocv[dev] = new IOContext(cct, nullptr);
+  }
+  return writer;
+}
+
+// Wait for outstanding aio on every IOContext of writer `h` and delete those
+// contexts. Only slots with a device in bdev[] are visited. Also logs when
+// the file has reached 1 GiB or more.
+void BlueFS::_drain_writer(FileWriter *h)
+{
+  dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
+  //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
+  for (unsigned dev = 0; dev < MAX_BDEV; ++dev) {
+    if (!bdev[dev] || !h->iocv[dev]) {
+      continue;
+    }
+    h->iocv[dev]->aio_wait();
+    delete h->iocv[dev];
+  }
+
+  // sanity
+  constexpr uint64_t large_file_threshold = 1ull << 30;
+  if (h->file->fnode.size >= large_file_threshold) {
+    dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
+  }
+}
+
+// Drain writer `h` and destroy it. Unlike close_writer(), this variant does
+// not take h->lock itself.
+void BlueFS::_close_writer(FileWriter *h)
+{
+  FileWriter *const writer = h;
+  _drain_writer(writer);
+  delete writer;
+}
+// Drain writer `h` while holding h->lock, then destroy it. The lock guard is
+// scoped so that it is released before the writer (and its lock) is deleted.
+void BlueFS::close_writer(FileWriter *h)
+{
+  FileWriter *const writer = h;
+  {
+    std::lock_guard writer_guard(writer->lock);
+    _drain_writer(writer);
+  }
+  delete writer;
+}
+
+// Debug helper: read the dirty_seq of the file behind writer `h`, under
+// h->lock.
+uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
+{
+  std::lock_guard writer_guard(h->lock);
+  const uint64_t seq = h->file->dirty_seq;
+  return seq;
+}
+
+// Debug helper: report the dirty_devs[dev] flag of writer `h`, read under
+// h->lock.
+bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
+{
+  std::lock_guard writer_guard(h->lock);
+  const bool is_dirty = h->dirty_devs[dev];
+  return is_dirty;
+}
+
+// Open dirname/filename for reading and return a new FileReader in *h.
+//
+// The lookup runs under nodes.lock. For random access the reader is created
+// with a 4096 byte buffer limit, otherwise with bluefs_max_prefetch.
+// Returns 0 on success, -ENOENT if the directory or the file is unknown.
+int BlueFS::open_for_read(
+  std::string_view dirname,
+  std::string_view filename,
+  FileReader **h,
+  bool random)/*_N*/
+{
+  _maybe_check_vselector_LNF();
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << "/" << filename
+           << (random ? " (random)":" (sequential)") << dendl;
+
+  auto dir_it = nodes.dir_map.find(dirname);
+  if (dir_it == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef dir = dir_it->second;
+
+  auto file_it = dir->file_map.find(filename);
+  if (file_it == dir->file_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " (" << dir
+             << ") file " << filename
+             << " not found" << dendl;
+    return -ENOENT;
+  }
+  File *target = file_it->second.get();
+
+  *h = new FileReader(target,
+                      random ? 4096 : cct->_conf->bluefs_max_prefetch,
+                      random, false);
+  dout(10) << __func__ << " h " << *h << " on " << target->fnode << dendl;
+  return 0;
+}
+
+// Move old_dirname/old_filename to new_dirname/new_filename.
+//
+// Runs under log.lock and nodes.lock. Both directories and the source file
+// must exist, otherwise -ENOENT is returned. If the destination name already
+// exists it is unlinked first (it must refer to a different file than the
+// source; this is asserted). The link/unlink operations are recorded in the
+// pending log transaction. Returns 0 on success.
+int BlueFS::rename(
+  std::string_view old_dirname, std::string_view old_filename,
+  std::string_view new_dirname, std::string_view new_filename)/*_LND*/
+{
+  std::lock_guard ll(log.lock);
+  std::lock_guard nl(nodes.lock);
+  dout(10) << __func__ << " " << old_dirname << "/" << old_filename
+           << " -> " << new_dirname << "/" << new_filename << dendl;
+
+  // locate the source directory and file
+  auto p = nodes.dir_map.find(old_dirname);
+  if (p == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef old_dir = p->second;
+  auto q = old_dir->file_map.find(old_filename);
+  if (q == old_dir->file_map.end()) {
+    dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
+             << ") file " << old_filename
+             << " not found" << dendl;
+    return -ENOENT;
+  }
+  FileRef file = q->second;
+
+  // locate the destination directory
+  p = nodes.dir_map.find(new_dirname);
+  if (p == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef new_dir = p->second;
+
+  // an existing destination entry is replaced
+  q = new_dir->file_map.find(new_filename);
+  if (q != new_dir->file_map.end()) {
+    // log the destination dir here (the name printed is new_dirname)
+    dout(20) << __func__ << " dir " << new_dirname << " (" << new_dir
+             << ") file " << new_filename
+             << " already exists, unlinking" << dendl;
+    ceph_assert(q->second != file);
+    log.t.op_dir_unlink(new_dirname, new_filename);
+    _drop_link_D(q->second);
+  }
+
+  dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
+           << " " << file->fnode << dendl;
+
+  // update the in-memory maps, then record the same change in the log
+  new_dir->file_map[string{new_filename}] = file;
+  old_dir->file_map.erase(string{old_filename});
+
+  log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
+  log.t.op_dir_unlink(old_dirname, old_filename);
+  return 0;
+}
+
+// Create directory `dirname`. Runs under log.lock and nodes.lock.
+// Returns -EEXIST if it is already present, 0 otherwise; the creation is
+// recorded in the pending log transaction.
+int BlueFS::mkdir(std::string_view dirname)/*_LN*/
+{
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << dendl;
+
+  const bool already_there =
+    nodes.dir_map.find(dirname) != nodes.dir_map.end();
+  if (already_there) {
+    dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
+    return -EEXIST;
+  }
+
+  nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
+  log.t.op_dir_create(dirname);
+  return 0;
+}
+
+// Remove directory `dirname`. Runs under log.lock and nodes.lock.
+// Returns -ENOENT if it does not exist, -ENOTEMPTY if it still holds files,
+// 0 otherwise; the removal is recorded in the pending log transaction.
+int BlueFS::rmdir(std::string_view dirname)/*_LN*/
+{
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << dendl;
+
+  auto dir_it = nodes.dir_map.find(dirname);
+  if (dir_it == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
+    return -ENOENT;
+  }
+
+  // hold our own reference so the Dir outlives the map entry
+  DirRef victim = dir_it->second;
+  if (!victim->file_map.empty()) {
+    dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
+    return -ENOTEMPTY;
+  }
+
+  nodes.dir_map.erase(dir_it);
+  log.t.op_dir_remove(dirname);
+  return 0;
+}
+
+// Report whether `dirname` is present in nodes.dir_map (checked under
+// nodes.lock).
+bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
+{
+  std::lock_guard nodes_guard(nodes.lock);
+  const bool found = nodes.dir_map.find(dirname) != nodes.dir_map.end();
+  dout(10) << __func__ << " " << dirname << " = " << (int)found << dendl;
+  return found;
+}
+
+// Look up dirname/filename under nodes.lock and report its size and mtime.
+//
+// @param size   if non-null, receives fnode.size
+// @param mtime  if non-null, receives fnode.mtime
+// @return 0 on success, -ENOENT if the directory or the file is unknown
+int BlueFS::stat(std::string_view dirname, std::string_view filename,
+                 uint64_t *size, utime_t *mtime)/*_N*/
+{
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+
+  auto dir_it = nodes.dir_map.find(dirname);
+  if (dir_it == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef dir = dir_it->second;
+
+  auto file_it = dir->file_map.find(filename);
+  if (file_it == dir->file_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " (" << dir
+             << ") file " << filename
+             << " not found" << dendl;
+    return -ENOENT;
+  }
+
+  File *found = file_it->second.get();
+  dout(10) << __func__ << " " << dirname << "/" << filename
+           << " " << found->fnode << dendl;
+  if (size != nullptr) {
+    *size = found->fnode.size;
+  }
+  if (mtime != nullptr) {
+    *mtime = found->fnode.mtime;
+  }
+  return 0;
+}
+
+// Take the lock flag of dirname/filename and return a FileLock in *plock.
+//
+// Runs under log.lock and nodes.lock. The directory must exist (-ENOENT
+// otherwise). A missing file is created on the fly and recorded in the
+// pending log transaction. If the file exists and is already locked,
+// -ENOLCK is returned. Returns 0 on success.
+int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
+                      FileLock **plock)/*_LN*/
+{
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+
+  auto dir_it = nodes.dir_map.find(dirname);
+  if (dir_it == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef dir = dir_it->second;
+
+  FileRef file;
+  auto file_it = dir->file_map.find(filename);
+  if (file_it != dir->file_map.end()) {
+    file = file_it->second;
+    if (file->locked) {
+      dout(10) << __func__ << " already locked" << dendl;
+      return -ENOLCK;
+    }
+  } else {
+    dout(20) << __func__ << " dir " << dirname << " (" << dir
+             << ") file " << filename
+             << " not found, creating" << dendl;
+    // create the lock file: new ino, registered globally and in the dir
+    file = ceph::make_ref<File>();
+    file->fnode.ino = ++ino_last;
+    file->fnode.mtime = ceph_clock_now();
+    nodes.file_map[ino_last] = file;
+    dir->file_map[string{filename}] = file;
+    logger->set(l_bluefs_num_files, nodes.file_map.size());
+    ++file->refs;
+    log.t.op_file_update(file->fnode);
+    log.t.op_dir_link(dirname, filename, file->fnode.ino);
+  }
+
+  file->locked = true;
+  *plock = new FileLock(file);
+  dout(10) << __func__ << " locked " << file->fnode
+           << " with " << *plock << dendl;
+  return 0;
+}
+
+// Release a lock obtained from lock_file(): clears the file's locked flag
+// (asserting it was set) and deletes `fl`. Runs under nodes.lock.
+// Always returns 0.
+int BlueFS::unlock_file(FileLock *fl)/*_N*/
+{
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
+  auto& locked_file = fl->file;
+  ceph_assert(locked_file->locked);
+  locked_file->locked = false;
+  delete fl;
+  return 0;
+}
+
+// List a directory into *ls.
+//
+// A single trailing '/' on dirname is ignored. An empty dirname lists the
+// directory names themselves; otherwise the files of that directory are
+// listed. "." and ".." are appended at the end. Runs under nodes.lock.
+// Returns 0 on success, -ENOENT if the named directory does not exist (in
+// which case *ls is left untouched).
+int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
+{
+  // dirname may contain a trailing /
+  if (!dirname.empty() && dirname.back() == '/') {
+    dirname.remove_suffix(1);
+  }
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << dendl;
+
+  if (!dirname.empty()) {
+    // list files in dir
+    auto dir_it = nodes.dir_map.find(dirname);
+    if (dir_it == nodes.dir_map.end()) {
+      dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+      return -ENOENT;
+    }
+    DirRef dir = dir_it->second;
+    ls->reserve(dir->file_map.size() + 2);  // +2 for "." and ".."
+    for (auto& entry : dir->file_map) {
+      ls->push_back(entry.first);
+    }
+  } else {
+    // list dirs
+    ls->reserve(nodes.dir_map.size() + 2);  // +2 for "." and ".."
+    for (auto& entry : nodes.dir_map) {
+      ls->push_back(entry.first);
+    }
+  }
+
+  ls->push_back(".");
+  ls->push_back("..");
+  return 0;
+}
+
+// Remove the directory entry dirname/filename.
+//
+// Runs under log.lock and nodes.lock. Returns -ENOENT if the directory or
+// file is unknown and -EBUSY if the file is currently locked. On success the
+// unlink is recorded in the pending log transaction, the link is dropped via
+// _drop_link_D(), and 0 is returned.
+int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
+{
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
+
+  auto dir_it = nodes.dir_map.find(dirname);
+  if (dir_it == nodes.dir_map.end()) {
+    dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
+    return -ENOENT;
+  }
+  DirRef dir = dir_it->second;
+
+  auto file_it = dir->file_map.find(filename);
+  if (file_it == dir->file_map.end()) {
+    dout(20) << __func__ << " file " << dirname << "/" << filename
+             << " not found" << dendl;
+    return -ENOENT;
+  }
+
+  // keep our own reference; the map entry is erased below
+  FileRef victim = file_it->second;
+  if (victim->locked) {
+    dout(20) << __func__ << " file " << dirname << "/" << filename
+             << " is locked" << dendl;
+    return -EBUSY;
+  }
+
+  dir->file_map.erase(file_it);
+  log.t.op_dir_unlink(dirname, filename);
+  _drop_link_D(victim);
+  return 0;
+}
+
+// Report is_rotational() of the first device present in the order
+// BDEV_WAL, BDEV_DB; falls back to BDEV_SLOW (dereferenced without a null
+// check) when neither is present.
+bool BlueFS::wal_is_rotational()
+{
+  const auto& wal_dev = bdev[BDEV_WAL];
+  if (wal_dev) {
+    return wal_dev->is_rotational();
+  }
+  const auto& db_dev = bdev[BDEV_DB];
+  if (db_dev) {
+    return db_dev->is_rotational();
+  }
+  return bdev[BDEV_SLOW]->is_rotational();
+}
+
+// Report is_rotational() of BDEV_DB when that device is present, otherwise
+// of BDEV_SLOW (dereferenced without a null check).
+bool BlueFS::db_is_rotational()
+{
+  const auto& db_dev = bdev[BDEV_DB];
+  return db_dev ? db_dev->is_rotational()
+                : bdev[BDEV_SLOW]->is_rotational();
+}
+
+/*
+  Algorithm.
+  _do_replay_recovery_read is used when the bluefs log ends abruptly although it seems that more data should be there.
+  The idea is to search the disk for the definition of extents that would have been appended to the bluefs log later,
+  and to check whether using such an extent produces a healthy bluefs transaction.
+  We encode the already known bluefs log extents and search the disk for these bytes.
+  When we find them, we decode the bytes that follow as an extent.
+  We read that whole extent and then check whether, merged with the existing log part, it gives a proper bluefs transaction.
+ */
+// Try to recover a continuation of the bluefs log by scanning the devices.
+//
+// The tail of the encoded log extent list (its last 32 bytes) is searched for
+// in all disk regions not covered by any known file. Bytes following a hit are
+// decoded as a candidate extent; the candidate is accepted if the already
+// known log data from replay_pos onwards, followed by the candidate's
+// contents, decodes as a bluefs_transaction_t.
+//
+// @param log_reader   reader of the log file; on success its fnode gets the
+//                     recovered extent appended and buf.pos is advanced
+// @param replay_pos   log offset of the transaction being replayed
+// @param read_offset  log offset at which data is missing; must equal
+//                     replay_pos plus the amount of data still readable
+// @param read_len     number of bytes the caller wanted to read
+// @param bl           receives the recovered bytes (at most read_len)
+// @return number of bytes appended to bl, or 0 if no candidate was found
+int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
+                                    size_t replay_pos,
+                                    size_t read_offset,
+                                    size_t read_len,
+                                    bufferlist* bl) {
+  dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
+    " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
+
+  bluefs_fnode_t& log_fnode = log_reader->file->fnode;
+  bufferlist bin_extents;
+  ::encode(log_fnode.extents, bin_extents);
+  dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
+
+  // cannot process if too small to effectively search
+  ceph_assert(bin_extents.length() >= 32);
+  bufferlist last_32;
+  last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
+
+  //read fixed part from replay_pos to end of bluefs_log extents
+  bufferlist fixed;
+  uint64_t e_off = 0;
+  auto e = log_fnode.seek(replay_pos, &e_off);
+  ceph_assert(e != log_fnode.extents.end());
+  int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
+                     cct->_conf->bluefs_buffered_io);
+  ceph_assert(r == 0);
+  //capture dev of last good extent
+  uint8_t last_e_dev = e->bdev;
+  uint64_t last_e_off = e->offset;
+  ++e;
+  while (e != log_fnode.extents.end()) {
+    r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
+                   cct->_conf->bluefs_buffered_io);
+    ceph_assert(r == 0);
+    last_e_dev = e->bdev;
+    ++e;
+  }
+  ceph_assert(replay_pos + fixed.length() == read_offset);
+
+  dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
+
+  // Lexicographic order on (bdev, offset, length). std::set requires a strict
+  // weak ordering, so a later field may only decide when all earlier fields
+  // compare equal.
+  struct compare {
+    bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
+      if (a.bdev != b.bdev) return a.bdev < b.bdev;
+      if (a.offset != b.offset) return a.offset < b.offset;
+      return a.length < b.length;
+    }
+  };
+  // candidates that were already examined; each is tried at most once
+  std::set<bluefs_extent_t, compare> extents_rejected;
+  for (int dcnt = 0; dcnt < 3; dcnt++) {
+    // start with the device that holds the last good log extent
+    uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
+    if (bdev[dev] == nullptr) continue;
+    dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
+    // regions to scan = whole device minus everything owned by known files
+    interval_set<uint64_t> disk_regions;
+    disk_regions.insert(0, bdev[dev]->get_size());
+    for (const auto& f : nodes.file_map) {
+      const auto& file_extents = f.second->fnode.extents;
+      for (const auto& p : file_extents) {
+        if (p.bdev == dev) {
+          disk_regions.erase(p.offset, p.length);
+        }
+      }
+    }
+    size_t disk_regions_count = disk_regions.num_intervals();
+    dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
+
+    auto reg = disk_regions.lower_bound(last_e_off);
+    //for all except first, start from beginning
+    last_e_off = 0;
+    if (reg == disk_regions.end()) {
+      reg = disk_regions.begin();
+    }
+    const uint64_t chunk_size = 4 * 1024 * 1024;
+    const uint64_t page_size = 4096;
+    const uint64_t max_extent_size = 16;
+    // bytes carried over from the end of one chunk to the front of the next,
+    // so that a match straddling a chunk boundary is still found
+    uint64_t overlay_size = last_32.length() + max_extent_size;
+    for (size_t i = 0; i < disk_regions_count; reg++, i++) {
+      // visit every region once, wrapping around at the end
+      if (reg == disk_regions.end()) {
+        reg = disk_regions.begin();
+      }
+      uint64_t pos = reg.get_start();
+      uint64_t len = reg.get_len();
+
+      // layout: [one page holding the overlay][chunk data]
+      std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
+      char* raw_data = raw_data_p.get();
+      memset(raw_data, 0, page_size);
+
+      while (len > last_32.length()) {
+        uint64_t chunk_len = len > chunk_size ? chunk_size : len;
+        dout(5) << __func__ << " read "
+                << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
+                << std::dec << dendl;
+        r = _bdev_read_random(dev, pos, chunk_len,
+          raw_data + page_size, cct->_conf->bluefs_buffered_io);
+        ceph_assert(r == 0);
+
+        //search for fixed_last_32
+        char* chunk_b = raw_data + page_size;
+        char* chunk_e = chunk_b + chunk_len;
+
+        char* search_b = chunk_b - overlay_size;
+        char* search_e = chunk_e;
+
+        for (char* sp = search_b; ; sp += last_32.length()) {
+          sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
+          if (sp == nullptr) {
+            break;
+          }
+
+          // n points just past the matched pattern: a candidate encoded extent
+          char* n = sp + last_32.length();
+          dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
+          bufferlist test;
+          test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
+          bluefs_extent_t ne;
+          try {
+            bufferlist::const_iterator p = test.begin();
+            ::decode(ne, p);
+          } catch (buffer::error& e) {
+            continue;
+          }
+          if (extents_rejected.count(ne) != 0) {
+            dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
+            continue;
+          }
+          //insert as rejected already. if we succeed, it wouldn't make difference.
+          extents_rejected.insert(ne);
+
+          // sanity-check the decoded extent before reading from it
+          if (ne.bdev >= MAX_BDEV ||
+              bdev[ne.bdev] == nullptr ||
+              ne.length > 16 * 1024 * 1024 ||
+              (ne.length & 4095) != 0 ||
+              ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
+              (ne.offset & 4095) != 0) {
+            dout(5) << __func__ << " refusing extent " << ne << dendl;
+            continue;
+          }
+          dout(5) << __func__ << " checking extent " << ne << dendl;
+
+          //read candidate extent - whole
+          bufferlist candidate;
+          candidate.append(fixed);
+          r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
+                         cct->_conf->bluefs_buffered_io);
+          ceph_assert(r == 0);
+
+          //check if transaction & crc is ok
+          bluefs_transaction_t t;
+          try {
+            bufferlist::const_iterator p = candidate.begin();
+            ::decode(t, p);
+          }
+          catch (buffer::error& e) {
+            dout(5) << __func__ << " failed match" << dendl;
+            continue;
+          }
+
+          //success, it seems a probable candidate
+          uint64_t l = std::min<uint64_t>(ne.length, read_len);
+          //trim to required size
+          bufferlist requested_read;
+          requested_read.substr_of(candidate, fixed.length(), l);
+          bl->append(requested_read);
+          dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
+          log_fnode.append_extent(ne);
+          log_fnode.recalc_allocated();
+          log_reader->buf.pos += l;
+          return l;
+        }
+        //save overlay for next search
+        memcpy(search_b, chunk_e - overlay_size, overlay_size);
+        pos += chunk_len;
+        len -= chunk_len;
+      }
+    }
+  }
+  return 0;
+}
+
+// Consistency check of the volume selector's accounting.
+//
+// Builds a fresh selector via clone_empty(), feeds it every file from
+// nodes.file_map and asserts that it compares equal to the live selector.
+// Does nothing when clone_empty() returns null. On mismatch both states are
+// dumped to the log before the assertion fires.
+void BlueFS::_check_vselector_LNF() {
+  BlueFSVolumeSelector* expected = vselector->clone_empty();
+  if (expected == nullptr) {
+    return;
+  }
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  // Checking vselector is under log, nodes and file(s) locks,
+  // so any modification of vselector must be under at least one of those locks.
+  for (auto& entry : nodes.file_map) {
+    auto& file = entry.second;
+    file->lock.lock();
+    expected->add_usage(file->vselector_hint, file->fnode);
+  }
+
+  const bool matches = vselector->compare(expected);
+  if (!matches) {
+    dout(0) << "Current:";
+    vselector->dump(*_dout);
+    *_dout << dendl;
+    dout(0) << "Expected:";
+    expected->dump(*_dout);
+    *_dout << dendl;
+  }
+  ceph_assert(matches);
+
+  // release the per-file locks taken above
+  for (auto& entry : nodes.file_map) {
+    entry.second->lock.unlock();
+  }
+  delete expected;
+}
+
+// Sum up, over all ranges reported by alloc[dev]->foreach(), the bytes that
+// are usable in alloc_size-aligned units: each range is trimmed up to the
+// next alloc_size boundary and its remaining length rounded down to a
+// multiple of alloc_size. Returns 0 when the device has no allocator.
+// NOTE(review): the `& (alloc_size - 1)` mask assumes alloc_size is a power
+// of two -- confirm with callers.
+size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
+{
+  size_t usable = 0;
+  auto count_aligned = [&](size_t off, size_t len) {
+    //only count in size that is alloc_size aligned
+    const size_t misalign = off & (alloc_size - 1);
+    const size_t skip = (misalign == 0) ? 0 : alloc_size - misalign;
+    if (skip < len) {
+      len -= skip;
+      usable += p2align(len, alloc_size);
+    }
+  };
+  if (alloc[dev]) {
+    alloc[dev]->foreach(count_aligned);
+  }
+  return usable;
+}
+// ===============================================
+// OriginalVolumeSelector
+
+// The hint for the log is the BDEV_WAL device id, carried in a void*.
+void* OriginalVolumeSelector::get_hint_for_log() const
+{
+  void* const wal_hint = reinterpret_cast<void*>(BlueFS::BDEV_WAL);
+  return wal_hint;
+}
+// Map a directory name to a device id, returned as a void* hint.
+// Defaults to BDEV_DB. Names longer than 5 characters that end in ".slow"
+// select BDEV_SLOW if slow_total is non-zero; names ending in ".wal" select
+// BDEV_WAL if wal_total is non-zero.
+void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+  uint8_t dev = BlueFS::BDEV_DB;
+  // the "db.slow" and "db.wal" directory names are hard-coded to
+  // match up with bluestore.  the slow device is always the second
+  // one (when a dedicated block.db device is present and used at
+  // bdev 0).  the wal device is always last.
+  const bool long_enough = dirname.length() > 5;
+  if (long_enough) {
+    if (slow_total && boost::algorithm::ends_with(dirname, ".slow")) {
+      dev = BlueFS::BDEV_SLOW;
+    } else if (wal_total && boost::algorithm::ends_with(dirname, ".wal")) {
+      dev = BlueFS::BDEV_WAL;
+    }
+  }
+  return reinterpret_cast<void*>(dev);
+}
+
+// Inverse of the get_hint_* functions: the hint pointer's value is the
+// device id, truncated to uint8_t.
+uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
+{
+  const uint64_t raw = reinterpret_cast<uint64_t>(hint);
+  return static_cast<uint8_t>(raw);
+}
+
+// Append two (path, size) entries to res: `base` with db_total and
+// `base + ".slow"` with slow_total.
+void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+  // use fake non-zero value if needed to avoid RocksDB complains
+  const auto slow_size = slow_total ? slow_total : db_total;
+  res.emplace_back(base, db_total);
+  res.emplace_back(base + ".slow", slow_size);
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "OriginalVolumeSelector: "
+
+// Write the three configured totals to sout on a single line.
+void OriginalVolumeSelector::dump(ostream& sout) {
+  sout << "wal_total:" << wal_total;
+  sout << ", db_total:" << db_total;
+  sout << ", slow_total:" << slow_total;
+  sout << std::endl;
+}
+
+// ===============================================
+// FitToFastVolumeSelector
+
+// Report a single path entry for `base`.
+void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+  // size of the last db_path has no effect
+  res.emplace_back(base, 1);
+}
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
new file mode 100644
index 000000000..adfc8eb0a
--- /dev/null
+++ b/src/os/bluestore/BlueFS.h
@@ -0,0 +1,766 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_H
+#define CEPH_OS_BLUESTORE_BLUEFS_H
+
+#include <atomic>
+#include <mutex>
+#include <limits>
+
+#include "bluefs_types.h"
+#include "blk/BlockDevice.h"
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "include/common_fwd.h"
+
+#include "boost/intrusive/list.hpp"
+#include "boost/dynamic_bitset.hpp"
+
+class Allocator;
+
+enum {  // BlueFS perf-counter ids
+  l_bluefs_first = 732600,  // base value; every following id counts up from here
+  l_bluefs_db_total_bytes,
+  l_bluefs_db_used_bytes,
+  l_bluefs_wal_total_bytes,
+  l_bluefs_wal_used_bytes,
+  l_bluefs_slow_total_bytes,
+  l_bluefs_slow_used_bytes,
+  l_bluefs_num_files,
+  l_bluefs_log_bytes,
+  l_bluefs_log_compactions,
+  l_bluefs_log_write_count,
+  l_bluefs_logged_bytes,
+  l_bluefs_files_written_wal,
+  l_bluefs_files_written_sst,
+  l_bluefs_write_count_wal,
+  l_bluefs_write_count_sst,
+  l_bluefs_bytes_written_wal,
+  l_bluefs_bytes_written_sst,
+  l_bluefs_bytes_written_slow,
+  l_bluefs_max_bytes_wal,   // the three max_bytes ids are looked up per bdev
+  l_bluefs_max_bytes_db,    // through BlueFS::max_bytes_pcounters
+  l_bluefs_max_bytes_slow,
+  l_bluefs_main_alloc_unit,
+  l_bluefs_db_alloc_unit,
+  l_bluefs_wal_alloc_unit,
+  l_bluefs_read_random_count,
+  l_bluefs_read_random_bytes,
+  l_bluefs_read_random_disk_count,
+  l_bluefs_read_random_disk_bytes,
+  l_bluefs_read_random_disk_bytes_wal,
+  l_bluefs_read_random_disk_bytes_db,
+  l_bluefs_read_random_disk_bytes_slow,
+  l_bluefs_read_random_buffer_count,
+  l_bluefs_read_random_buffer_bytes,
+  l_bluefs_read_count,
+  l_bluefs_read_bytes,
+  l_bluefs_read_disk_count,
+  l_bluefs_read_disk_bytes,
+  l_bluefs_read_disk_bytes_wal,
+  l_bluefs_read_disk_bytes_db,
+  l_bluefs_read_disk_bytes_slow,
+  l_bluefs_read_prefetch_count,
+  l_bluefs_read_prefetch_bytes,
+  l_bluefs_write_count,
+  l_bluefs_write_disk_count,
+  l_bluefs_write_bytes,
+  l_bluefs_compaction_lat,
+  l_bluefs_compaction_lock_lat,
+  l_bluefs_alloc_shared_dev_fallbacks,
+  l_bluefs_alloc_shared_size_fallbacks,
+  l_bluefs_read_zeros_candidate,
+  l_bluefs_read_zeros_errors,
+  l_bluefs_last,  // one past the final id above
+};
+
+/// Interface for policies that pick a block device for BlueFS files via opaque hints.
+class BlueFSVolumeSelector {
+public:
+  using paths = std::vector<std::pair<std::string, uint64_t>>;  ///< (path, size) pairs
+
+  virtual ~BlueFSVolumeSelector() = default;
+  virtual void* get_hint_for_log() const = 0;
+  virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
+
+  virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+  virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
+  virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
+  virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
+  virtual uint8_t select_prefer_bdev(void* hint) = 0;
+  virtual void get_paths(const std::string& base, paths& res) const = 0;
+  virtual void dump(std::ostream& sout) = 0;
+
+  /* used for sanity checking of vselector; the defaults opt out (no clone, always equal) */
+  virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
+  virtual bool compare(BlueFSVolumeSelector* other) { return true; }
+};
+
+struct bluefs_shared_alloc_context_t {
+  bool need_init{false};
+  Allocator* a{nullptr};
+  uint64_t alloc_unit{0};
+  std::atomic<uint64_t> bluefs_used{0};
+  /// attach an allocator: raises need_init and zeroes the usage counter
+  void set(Allocator* _a, uint64_t _au) {
+    a = _a;
+    alloc_unit = _au;
+    need_init = true;
+    bluefs_used.store(0);
+  }
+  /// detach: clears only a and alloc_unit; need_init and bluefs_used keep their values
+  void reset() {
+    a = nullptr;
+    alloc_unit = 0;
+  }
+};
+
+class BlueFS {
+public:
+  CephContext* cct;
+  static constexpr unsigned MAX_BDEV = 5;
+  static constexpr unsigned BDEV_WAL = 0;
+  static constexpr unsigned BDEV_DB = 1;
+  static constexpr unsigned BDEV_SLOW = 2;
+  static constexpr unsigned BDEV_NEWWAL = 3;
+  static constexpr unsigned BDEV_NEWDB = 4;
+
+  enum {
+    WRITER_UNKNOWN,
+    WRITER_WAL,
+    WRITER_SST,
+  };
+
+  /// In-memory state of a single BlueFS file: the fnode plus runtime
+  /// bookkeeping (dirty tracking, open-handle counters, lock).
+  struct File : public RefCountedObject {
+    MEMPOOL_CLASS_HELPERS();
+
+    bluefs_fnode_t fnode;
+    int refs = 0;
+    uint64_t dirty_seq = 0;
+    bool locked = false;
+    bool deleted = false;
+    bool is_dirty = false;
+    /// hook that links this file into a dirty_file_list_t
+    boost::intrusive::list_member_hook<> dirty_item;
+
+    // num_readers / num_writers are bumped by FileReader / FileWriter for
+    // their lifetime; ~File() asserts that all three counters are zero.
+    std::atomic_int num_readers{0};
+    std::atomic_int num_writers{0};
+    std::atomic_int num_reading{0};
+
+    void* vselector_hint = nullptr;
+    /* lock protects fnode and the other parts that can be modified during
+       read & write operations.
+       It does not protect values that are fixed.
+       It does not need to be taken during one-time operations:
+       _replay, device_migrate_to_existing, device_migrate_to_new */
+    ceph::mutex lock = ceph::make_mutex("BlueFS::File::lock");
+
+  private:
+    FRIEND_MAKE_REF(File);
+    // Construction is private and goes through the FRIEND_MAKE_REF
+    // friend; every member above carries its own initializer.
+    File() = default;
+
+    // A File must not go away while handles are open or it is locked.
+    ~File() override {
+      ceph_assert(num_readers.load() == 0);
+      ceph_assert(num_writers.load() == 0);
+      ceph_assert(num_reading.load() == 0);
+      ceph_assert(!locked);
+    }
+  };
+  using FileRef = ceph::ref_t<File>;
+
+  /// intrusive list of Files linked through File::dirty_item
+  using dirty_file_list_t = boost::intrusive::list<
+    File,
+    boost::intrusive::member_hook<
+      File,
+      boost::intrusive::list_member_hook<>, &File::dirty_item>>;
+
+  struct Dir : public RefCountedObject {
+    MEMPOOL_CLASS_HELPERS();
+    /// entries of this directory (presumably keyed by file name); std::less<> permits string_view lookups
+    mempool::bluefs::map<std::string, FileRef, std::less<>> file_map;
+
+  private:
+    FRIEND_MAKE_REF(Dir);
+    Dir() = default;
+  };
+  using DirRef = ceph::ref_t<Dir>;
+
+  struct FileWriter {
+    MEMPOOL_CLASS_HELPERS();
+    // Write handle on a File: appended data accumulates in `buffer`.
+    FileRef file;
+    uint64_t pos = 0;       ///< start offset for buffer
+  private:
+    ceph::buffer::list buffer;      ///< new data to write (at end of file)
+    ceph::buffer::list tail_block;  ///< existing partial block at end of file, if any
+  public:
+    unsigned get_buffer_length() const {
+      return buffer.length();
+    }
+    ceph::bufferlist flush_buffer(
+      CephContext* cct,
+      const bool partial,
+      const unsigned length,
+      const bluefs_super_t& super);
+    ceph::buffer::list::page_aligned_appender buffer_appender;  ///< for const char* only; built from `buffer`, so it must be declared after it
+  public:
+    int writer_type = 0;    ///< WRITER_*
+    int write_hint = WRITE_LIFE_NOT_SET;  ///< raised to WRITE_LIFE_MEDIUM for ino 1 by the constructor
+
+    ceph::mutex lock = ceph::make_mutex("BlueFS::FileWriter::lock");
+    std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
+    std::array<bool, MAX_BDEV> dirty_devs; ///< one flag per bdev, all false after construction
+    // Counts itself in file->num_writers for as long as the writer exists.
+    FileWriter(FileRef f)
+      : file(std::move(f)),
+       buffer_appender(buffer.get_page_aligned_appender(
+                         g_conf()->bluefs_alloc_size / CEPH_PAGE_SIZE)) {
+      ++file->num_writers;
+      iocv.fill(nullptr);
+      dirty_devs.fill(false);
+      if (file->fnode.ino == 1) {
+	write_hint = WRITE_LIFE_MEDIUM;
+      }
+    }
+    // NOTE: caller must call BlueFS::close_writer()
+    ~FileWriter() {
+      --file->num_writers;
+    }
+
+    // note: BlueRocksEnv uses this append exclusively, so it's safe
+    // to use buffer_appender exclusively here (e.g., its notion of
+    // offset will remain accurate).
+    void append(const char *buf, size_t len) {
+      uint64_t l0 = get_buffer_length();
+      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());  // buffered length is reported as unsigned
+      buffer_appender.append(buf, len);
+    }
+    // Same as the const char* overload, for callers holding std::byte data.
+    void append(const std::byte *buf, size_t len) {
+      // allow callers to use byte type instead of char* as we simply pass byte array
+      append((const char*)buf, len);
+    }
+
+    // note: used internally only, for ino 1 or 0.
+    void append(ceph::buffer::list& bl) {
+      uint64_t l0 = get_buffer_length();
+      ceph_assert(l0 + bl.length() <= std::numeric_limits<unsigned>::max());
+      buffer.claim_append(bl);
+    }
+    // Append len zero bytes through the appender, with the same length guard.
+    void append_zero(size_t len) {
+      uint64_t l0 = get_buffer_length();
+      ceph_assert(l0 + len <= std::numeric_limits<unsigned>::max());
+      buffer_appender.append_zero(len);
+    }
+    // Start offset of the buffer plus the number of bytes currently buffered.
+    uint64_t get_effective_write_pos() {
+      return pos + buffer.length();
+    }
+  };
+
+  /// Prefetch buffer and read cursor that belong to one FileReader.
+  struct FileReaderBuffer {
+    MEMPOOL_CLASS_HELPERS();
+
+    uint64_t bl_off = 0;    ///< prefetch buffer logical offset
+    ceph::buffer::list bl;          ///< prefetch buffer
+    uint64_t pos = 0;       ///< current logical offset
+    uint64_t max_prefetch;  ///< max allowed prefetch
+
+    explicit FileReaderBuffer(uint64_t mpf)
+      : max_prefetch(mpf) {}
+
+    /// logical offset one past the last buffered byte
+    uint64_t get_buf_end() const {
+      return bl_off + bl.length();
+    }
+    /// number of buffered bytes available from p onwards; 0 if p is not buffered
+    uint64_t get_buf_remaining(uint64_t p) const {
+      return (p >= bl_off && p < get_buf_end()) ? get_buf_end() - p : 0;
+    }
+    /// advance the read cursor by n bytes
+    void skip(size_t n) {
+      pos += n;
+    }
+    /// Drop the whole buffer when [offset, offset+length) touches it.  We do
+    /// not trim to the extent, for simplicity.  An extent that starts before
+    /// the buffer but reaches into it must invalidate too, or stale data stays.
+    void invalidate_cache(uint64_t offset, uint64_t length) {
+      const bool starts_inside = offset >= bl_off && offset < get_buf_end();
+      const bool reaches_in = offset < bl_off && bl.length() > 0 && length > bl_off - offset;
+      if (starts_inside || reaches_in) { bl.clear(); bl_off = 0; }
+    }
+  };
+
+  /// Read handle on a File: owns a prefetch buffer and counts as one of the
+  /// file's readers for as long as it exists.
+  struct FileReader {
+    MEMPOOL_CLASS_HELPERS();
+
+    FileRef file;
+    FileReaderBuffer buf;
+    bool random;            ///< value of the `rand` constructor argument
+    bool ignore_eof;        ///< used when reading our log file
+
+    ceph::shared_mutex lock {
+     ceph::make_shared_mutex(std::string(), false, false, false)
+    };
+
+    /// @param f file to read; moved into place, as FileWriter and FileLock do
+    /// @param mpf max prefetch handed to the buffer
+    FileReader(FileRef f, uint64_t mpf, bool rand, bool ie)
+      : file(std::move(f)), buf(mpf), random(rand), ignore_eof(ie) {
+      ++file->num_readers;
+    }
+    ~FileReader() {
+      --file->num_readers;
+    }
+  };
+
+  struct FileLock {
+    MEMPOOL_CLASS_HELPERS();
+    /// file this lock object refers to; FileLock is the handle type of lock_file() / unlock_file()
+    FileRef file;
+    explicit FileLock(FileRef f) : file(std::move(f)) {}
+  };
+
+private:
+  PerfCounters *logger = nullptr;
+
+  uint64_t max_bytes[MAX_BDEV] = {0};        // one slot per bdev id, all zero initially
+  uint64_t max_bytes_pcounters[MAX_BDEV] = { // perf-counter id for each bdev id
+    l_bluefs_max_bytes_wal,   // [BDEV_WAL]
+    l_bluefs_max_bytes_db,    // [BDEV_DB]
+    l_bluefs_max_bytes_slow,  // [BDEV_SLOW]
+    l_bluefs_max_bytes_wal,   // [BDEV_NEWWAL] shares the wal counter
+    l_bluefs_max_bytes_db,    // [BDEV_NEWDB] shares the db counter
+  };
+
+  // cache
+  struct {
+    ceph::mutex lock = ceph::make_mutex("BlueFS::nodes.lock");
+    mempool::bluefs::map<std::string, DirRef, std::less<>> dir_map; ///< dirname -> Dir
+    mempool::bluefs::unordered_map<uint64_t, FileRef> file_map;     ///< ino -> File
+  } nodes;
+
+  bluefs_super_t super;        ///< latest superblock (as last written)
+  uint64_t ino_last = 0;       ///< last assigned ino (this one is in use)
+
+  struct {
+    ceph::mutex lock = ceph::make_mutex("BlueFS::log.lock");
+    uint64_t seq_live{1};           ///< seq that the log is currently writing to; mirrors dirty.seq_live
+    FileWriter* writer{nullptr};    ///< null until a log writer is installed
+    bluefs_transaction_t t;         ///< NOTE(review): presumably the pending log transaction - confirm
+  } log;
+
+  struct {
+    ceph::mutex lock = ceph::make_mutex("BlueFS::dirty.lock");
+    uint64_t seq_stable = 0; //seq that is now stable on disk
+    uint64_t seq_live = 1;   //seq that is ongoing and dirty files will be written to
+    // map of dirty files, files of same dirty_seq are grouped into list.
+    std::map<uint64_t, dirty_file_list_t> files;
+    std::vector<interval_set<uint64_t>> pending_release; ///< extents to release
+    // TODO: it should be examined what makes pending_release immune to
+    // eras in a way similar to dirty_files. Hints:
+    // 1) we have actually only 2 eras: log_seq and log_seq+1
+    // 2) we usually not remove extents from files. And when we do, we force log-syncing.
+  } dirty;
+
+  ceph::condition_variable log_cond;                             ///< used for state control between log flush / log compaction
+  std::atomic<bool> log_is_compacting{false};                    ///< signals that bluefs log is already ongoing compaction
+  std::atomic<bool> log_forbidden_to_expand{false};              ///< used to signal that async compaction is in state
+                                                                 ///  that prohibits expansion of bluefs log
+  /*
+   * There are up to 3 block devices (BDEV_NEWWAL/BDEV_NEWDB ids also exist, presumably for migration):
+   *
+   *  BDEV_DB   db/      - the primary db device
+   *  BDEV_WAL  db.wal/  - a small, fast device, specifically for the WAL
+   *  BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
+   */
+  std::vector<BlockDevice*> bdev;                  ///< block devices we can use
+  std::vector<IOContext*> ioc;                     ///< IOContexts for bdevs
+  std::vector<uint64_t> block_reserved;            ///< starting reserve extent per device
+  std::vector<Allocator*> alloc;                   ///< allocators for bdevs
+  std::vector<uint64_t> alloc_size;                ///< alloc size for each device
+
+  //std::vector<interval_set<uint64_t>> block_unused_too_granular;
+
+  BlockDevice::aio_callback_t discard_cb[3]; // discard callbacks; NOTE(review): 3 slots vs MAX_BDEV == 5 - confirm it is never indexed with BDEV_NEWWAL/BDEV_NEWDB
+
+  std::unique_ptr<BlueFSVolumeSelector> vselector; ///< owned; replaced through set_volume_selector()
+  // shared_alloc_id starts as unsigned(-1), which equals no bdev id below MAX_BDEV, so is_shared_alloc() is false by default
+  bluefs_shared_alloc_context_t* shared_alloc = nullptr;
+  unsigned shared_alloc_id = unsigned(-1);
+  inline bool is_shared_alloc(unsigned id) const {
+    return id == shared_alloc_id;
+  }
+  std::atomic<int64_t> cooldown_deadline = 0;
+
+  class SocketHook;
+  SocketHook* asok_hook = nullptr;
+  // used to trigger zeros into read (debug / verify)
+  std::atomic<uint64_t> inject_read_zeros{0};
+
+  void _init_logger();
+  void _shutdown_logger();
+  void _update_logger_stats();
+
+  void _init_alloc();
+  void _stop_alloc();
+
+  ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
+  void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
+
+  uint64_t _get_used(unsigned id) const;
+  uint64_t _get_total(unsigned id) const;
+
+
+  FileRef _get_file(uint64_t ino);
+  void _drop_link_D(FileRef f);
+
+  unsigned _get_slow_device_id() {
+    return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;  // null BDEV_SLOW entry -> fall back to the db device
+  }
+  const char* get_device_name(unsigned id);
+  int _allocate(uint8_t bdev, uint64_t len,
+                uint64_t alloc_unit,
+		bluefs_fnode_t* node,
+                size_t alloc_attempts = 0,
+                bool permit_dev_fallback = true);
+
+  /* signal replay log to include h->file in nearest log flush */
+  int _signal_dirty_to_log_D(FileWriter *h);
+  int _flush_range_F(FileWriter *h, uint64_t offset, uint64_t length);
+  int _flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered);
+  int _flush_F(FileWriter *h, bool force, bool *flushed = nullptr);
+  uint64_t _flush_special(FileWriter *h);
+  int _fsync(FileWriter *h);
+
+#ifdef HAVE_LIBAIO
+  void _claim_completed_aios(FileWriter *h, std::list<aio_t> *ls);
+  void _wait_for_aio(FileWriter *h);  // safe to call without a lock
+#endif
+
+  int64_t _maybe_extend_log();
+  void _extend_log();
+  uint64_t _log_advance_seq();
+  void _consume_dirty(uint64_t seq);
+  void _clear_dirty_set_stable_D(uint64_t seq_stable);
+  void _release_pending_allocations(std::vector<interval_set<uint64_t>>& to_release);
+
+  void _flush_and_sync_log_core(int64_t available_runway);
+  int _flush_and_sync_log_jump_D(uint64_t jump_to,
+			       int64_t available_runway);
+  int _flush_and_sync_log_LD(uint64_t want_seq = 0);
+
+  uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
+  uint64_t _make_initial_transaction(uint64_t start_seq,
+                                     bluefs_fnode_t& fnode,
+                                     uint64_t expected_final_size,
+                                     bufferlist* out);
+  uint64_t _estimate_log_size_N();
+  bool _should_start_compact_log_L_N();
+
+  enum {
+    REMOVE_DB = 1,
+    REMOVE_WAL = 2,
+    RENAME_SLOW2DB = 4,
+    RENAME_DB2SLOW = 8,
+  };
+  void _compact_log_dump_metadata_NF(uint64_t start_seq,
+                                     bluefs_transaction_t *t,
+				     int flags,
+				     uint64_t capture_before_seq);
+
+  void _compact_log_sync_LNF_LD();
+  void _compact_log_async_LD_LNF_D();
+
+  void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
+				    int super_dev,
+				    int log_dev,
+				    int new_log_dev,
+				    int flags,
+				    std::optional<bluefs_layout_t> layout);
+
+  //void _aio_finish(void *priv);
+
+  void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
+  void _flush_bdev();  // this is safe to call without a lock
+  void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock
+
+  int _preallocate(FileRef f, uint64_t off, uint64_t len);
+  int _truncate(FileWriter *h, uint64_t off);
+
+  int64_t _read(
+    FileReader *h,   ///< [in] read from here
+    uint64_t offset, ///< [in] offset
+    size_t len,      ///< [in] this many bytes
+    ceph::buffer::list *outbl,   ///< [out] optional: reference the result here
+    char *out);      ///< [out] optional: or copy it here
+  int64_t _read_random(
+    FileReader *h,   ///< [in] read from here
+    uint64_t offset, ///< [in] offset
+    uint64_t len,    ///< [in] this many bytes
+    char *out);      ///< [out] optional: or copy it here
+
+  int _open_super();
+  int _write_super(int dev);
+  int _check_allocations(const bluefs_fnode_t& fnode,
+    boost::dynamic_bitset<uint64_t>* used_blocks,
+    bool is_alloc, //true when allocating, false when deallocating
+    const char* op_name);
+  int _verify_alloc_granularity(
+    __u8 id, uint64_t offset, uint64_t length,
+    uint64_t alloc_unit,
+    const char *op);
+  int _replay(bool noop, bool to_stdout = false); ///< replay journal
+
+  FileWriter *_create_writer(FileRef f);
+  void _drain_writer(FileWriter *h);
+  void _close_writer(FileWriter *h);
+
+  // always put the super in the second 4k block.  FIXME should this be
+  // block size independent?
+  unsigned get_super_offset() {
+    return 4096;  // start of the second 4k block
+  }
+  unsigned get_super_length() {
+    return 4096;  // fixed value, same as get_super_offset(): the super spans one 4k block
+  }
+  /// invoke _check_vselector_LNF() only when bluefs_check_volume_selector_often is set
+  void _maybe_check_vselector_LNF() {
+    if (!cct->_conf->bluefs_check_volume_selector_often) return;
+    _check_vselector_LNF();
+  }
+public:
+  BlueFS(CephContext* cct);
+  ~BlueFS();
+
+  // the super is always stored on bdev 0
+  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
+  int mount();
+  int maybe_verify_layout(const bluefs_layout_t& layout) const;
+  void umount(bool avoid_compact = false);
+  int prepare_new_device(int id, const bluefs_layout_t& layout);
+  
+  int log_dump();
+
+  void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
+  void get_devices(std::set<std::string> *ls);
+  uint64_t get_alloc_size(int id) {
+    return alloc_size[id];  // unchecked index into the per-device alloc_size vector
+  }
+  int fsck();
+
+  int device_migrate_to_new(
+    CephContext *cct,
+    const std::set<int>& devs_source,
+    int dev_target,
+    const bluefs_layout_t& layout);
+  int device_migrate_to_existing(
+    CephContext *cct,
+    const std::set<int>& devs_source,
+    int dev_target,
+    const bluefs_layout_t& layout);
+
+  uint64_t get_used();
+  uint64_t get_total(unsigned id);
+  uint64_t get_free(unsigned id);
+  uint64_t get_used(unsigned id);
+  void dump_perf_counters(ceph::Formatter *f);
+
+  void dump_block_extents(std::ostream& out);
+
+  /// get current extents that we own for given block device
+  void foreach_block_extents(
+    unsigned id,
+    std::function<void(uint64_t, uint32_t)> cb);
+
+  int open_for_write(
+    std::string_view dir,
+    std::string_view file,
+    FileWriter **h,
+    bool overwrite);
+
+  int open_for_read(
+    std::string_view dir,
+    std::string_view file,
+    FileReader **h,
+    bool random = false);
+
+  // data added after last fsync() is lost
+  void close_writer(FileWriter *h);
+
+  int rename(std::string_view old_dir, std::string_view old_file,
+	     std::string_view new_dir, std::string_view new_file);
+
+  int readdir(std::string_view dirname, std::vector<std::string> *ls);
+
+  int unlink(std::string_view dirname, std::string_view filename);
+  int mkdir(std::string_view dirname);
+  int rmdir(std::string_view dirname);
+  bool wal_is_rotational();
+  bool db_is_rotational();
+
+  bool dir_exists(std::string_view dirname);
+  int stat(std::string_view dirname, std::string_view filename,
+	   uint64_t *size, utime_t *mtime);
+
+  int lock_file(std::string_view dirname, std::string_view filename, FileLock **p);
+  int unlock_file(FileLock *l);
+
+  void compact_log();
+
+  /// sync any uncommitted state to disk
+  void sync_metadata(bool avoid_compact);
+
+  void set_volume_selector(BlueFSVolumeSelector* s) {
+    vselector.reset(s);  // vselector is a unique_ptr: it takes ownership of s and destroys the previous selector
+  }
+  void dump_volume_selector(std::ostream& sout) {
+    vselector->dump(sout);  // dereferenced without a null check
+  }
+  void get_vselector_paths(const std::string& base,
+                           BlueFSVolumeSelector::paths& res) const {
+    vselector->get_paths(base, res);  // pure delegation; get_paths() returns void
+  }
+
+  int add_block_device(unsigned bdev, const std::string& path, bool trim,
+                       uint64_t reserved,
+		       bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
+  bool bdev_support_label(unsigned id);
+  uint64_t get_block_device_size(unsigned bdev) const;
+
+  // handler for discard event
+  void handle_discard(unsigned dev, interval_set<uint64_t>& to_release);
+
+  void flush(FileWriter *h, bool force = false);
+
+  void append_try_flush(FileWriter *h, const char* buf, size_t len);
+  void flush_range(FileWriter *h, uint64_t offset, uint64_t length);
+  int fsync(FileWriter *h);
+  int64_t read(FileReader *h, uint64_t offset, size_t len,
+	   ceph::buffer::list *outbl, char *out) {
+    // Thin wrapper around _read(); outbl and out are both optional outputs.
+    // No need to hold the global lock here; we only touch h and h->file, and
+    // read vs write or delete is already protected (via atomics and asserts).
+    return _read(h, offset, len, outbl, out);
+  }
+  int64_t read_random(FileReader *h, uint64_t offset, size_t len,
+		  char *out) {
+    // Thin wrapper around _read_random(); len is widened from size_t to uint64_t.
+    // No need to hold the global lock here; we only touch h and h->file, and
+    // read vs write or delete is already protected (via atomics and asserts).
+    return _read_random(h, offset, len, out);
+  }
+  void invalidate_cache(FileRef f, uint64_t offset, uint64_t len);
+  int preallocate(FileRef f, uint64_t offset, uint64_t len);
+  int truncate(FileWriter *h, uint64_t offset);
+
+  size_t probe_alloc_avail(int dev, uint64_t alloc_size);
+
+  /// test purpose methods
+  const PerfCounters* get_perf_counters() const {
+    return logger;  // logger defaults to nullptr, so callers may get a null pointer
+  }
+  uint64_t debug_get_dirty_seq(FileWriter *h);
+  bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);
+
+private:
+  // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
+  // They are used for checking if read values are all 0, and reread if so.
+  int _read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
+	   ceph::buffer::list *pbl, IOContext *ioc, bool buffered);
+  int _read_random_and_check(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered);
+
+  int _bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
+    ceph::buffer::list* pbl, IOContext* ioc, bool buffered);
+  int _bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len, char* buf, bool buffered);
+
+  /// test and compact log, if necessary
+  void _maybe_compact_log_LNF_NF_LD_D();
+  int _do_replay_recovery_read(FileReader *log,
+			       size_t log_pos,
+			       size_t read_offset,
+			       size_t read_len,
+			       bufferlist* bl);
+  void _check_vselector_LNF();
+};
+
+/// Volume selector that chooses the device from the directory-name suffix
+/// (".slow" / ".wal", see get_hint_by_dir) and keeps no usage statistics.
+class OriginalVolumeSelector : public BlueFSVolumeSelector {
+  // Totals supplied at construction.  A zero wal/slow total makes
+  // get_hint_by_dir() ignore the matching suffix.
+  uint64_t wal_total;
+  uint64_t db_total;
+  uint64_t slow_total;
+
+public:
+  OriginalVolumeSelector(uint64_t _wal_total,
+                         uint64_t _db_total,
+                         uint64_t _slow_total)
+    : wal_total(_wal_total),
+      db_total(_db_total),
+      slow_total(_slow_total)
+  {
+  }
+
+  // A hint is a bdev id stored in a void*; select_prefer_bdev() unpacks it.
+  void* get_hint_for_log() const override;
+  void* get_hint_by_dir(std::string_view dirname) const override;
+
+  // Usage accounting is a no-op for this selector: all four overloads
+  // accept the call and change nothing.
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {}
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {}
+  void add_usage(void* hint, uint64_t fsize) override {}
+  void sub_usage(void* hint, uint64_t fsize) override {}
+
+  uint8_t select_prefer_bdev(void* hint) override;
+  /// emits "<base>" and "<base>.slow"
+  void get_paths(const std::string& base, paths& res) const override;
+  /// writes the three totals on one line
+  void dump(std::ostream& sout) override;
+};
+
+/// OriginalVolumeSelector variant that differs only in get_paths(): it
+/// reports a single db path instead of a db/db.slow pair.
+class FitToFastVolumeSelector : public OriginalVolumeSelector {
+public:
+  FitToFastVolumeSelector(uint64_t _wal_total, uint64_t _db_total,
+                          uint64_t _slow_total)
+    : OriginalVolumeSelector(_wal_total, _db_total, _slow_total) {}
+
+  void get_paths(const std::string& base, paths& res) const override;
+};
+/**
+ * Directed graph of locks.
+ * Vertices - locks. Edges (directed) - locking progression.
+ * Edge A->B exists if the last lock taken was A and the next lock taken is B.
+ *
+ * Row represents the last lock taken.
+ * Column represents the next lock taken.
+ *
+ *     >        | W | L | N | D | F
+ * -------------|---|---|---|---|---
+ * FileWriter W |   | > | > | > | >
+ * log        L |   |   | > | > | >
+ * nodes      N |   |   |   | > | >
+ * dirty      D |   |   |   |   | >
+ * File       F |   |   |   |   |
+ *
+ * Claim: a deadlock is possible if and only if the graph contains a cycle.
+ */
+#endif
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
new file mode 100644
index 000000000..68040af42
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -0,0 +1,596 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BlueRocksEnv.h"
+#include "BlueFS.h"
+#include "include/stringify.h"
+#include "kv/RocksDBStore.h"
+#include "string.h"
+
+using std::string_view;
+
+namespace {
+
+// Translate a negative-errno result from BlueFS into a rocksdb::Status.
+rocksdb::Status err_to_status(int r)
+{
+  switch (r) {
+  case 0:
+    return rocksdb::Status::OK();
+  case -ENOENT:
+    return rocksdb::Status::NotFound(rocksdb::Status::kNone);
+  case -EINVAL:
+    return rocksdb::Status::InvalidArgument(rocksdb::Status::kNone);
+  case -EIO:
+  case -EEXIST:
+    return rocksdb::Status::IOError(rocksdb::Status::kNone);
+  case -ENOLCK:  // r is negative; strerror() needs the positive errno value
+    return rocksdb::Status::IOError(strerror(-r));
+  default:  // FIXME: every other code (e.g. -ENOSPC) aborts the process
+    ceph_abort_msg("unrecognized error code");
+    return rocksdb::Status::NotSupported(rocksdb::Status::kNone);
+  }
+}
+
+// Split "dir/file" at the last '/' into (dir, file) views that borrow from fn;
+// trailing slashes are trimmed from dir, and a slash-less fn yields an empty dir.
+std::pair<std::string_view, std::string_view> split(const std::string &fn)
+{
+  const size_t slash = fn.rfind('/');
+  if (slash == fn.npos)  // was assert-only: UB once NDEBUG compiles it out
+    return {std::string_view(), std::string_view(fn)};
+  size_t dir_len = slash;
+  while (dir_len && fn[dir_len - 1] == '/')
+    --dir_len;
+  return {std::string_view(fn.data(), dir_len), std::string_view(fn).substr(slash + 1)};
+}
+
+}
+
+// Forward-only reader over a BlueFS file, handed to RocksDB as a
+// SequentialFile.
+class BlueRocksSequentialFile : public rocksdb::SequentialFile {
+  BlueFS *fs;
+  BlueFS::FileReader *h;  // released by the destructor
+ public:
+  BlueRocksSequentialFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+  ~BlueRocksSequentialFile() override { delete h; }
+
+  // Reads at most "n" bytes, starting at the reader's current buffer
+  // position (h->buf.pos), into "scratch" and points "*result" at whatever
+  // was obtained; a short count is not an error.  "scratch[0..n-1]" must
+  // therefore stay alive for as long as "*result" is in use.
+  //
+  // A negative result from BlueFS::read() trips ceph_assert, so the only
+  // status ever returned is OK.
+  //
+  // REQUIRES: External synchronization
+  rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override {
+    const int64_t got = fs->read(h, h->buf.pos, n, nullptr, scratch);
+    ceph_assert(got >= 0);
+    *result = rocksdb::Slice(scratch, got);
+    return rocksdb::Status::OK();
+  }
+
+  // Skips "n" bytes without returning any data; the work is delegated to
+  // the reader buffer's skip().
+  //
+  // RocksDB expects skipping past end of file to stop at the end and still
+  // report OK; this method always reports OK.
+  //
+  // REQUIRES: External synchronization
+  rocksdb::Status Skip(uint64_t n) override {
+    h->buf.skip(n);
+    return rocksdb::Status::OK();
+  }
+
+  // Drops cached data for [offset, offset+length): first from this reader's
+  // own buffer, then through BlueFS for the underlying file.  Per the
+  // RocksDB contract a length of 0 refers to the end of file.
+  rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+    h->buf.invalidate_cache(offset, length);
+    fs->invalidate_cache(h->file, offset, length);
+    return rocksdb::Status::OK();
+  }
+};
+
+// Random-access (positional) reader over a BlueFS file for RocksDB.
+class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile {
+  BlueFS *fs;
+  BlueFS::FileReader *h;  // released by the destructor
+ public:
+  BlueRocksRandomAccessFile(BlueFS *fs, BlueFS::FileReader *h) : fs(fs), h(h) {}
+  ~BlueRocksRandomAccessFile() override {
+    delete h;
+  }
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read), so "scratch[0..n-1]" must be live when
+  // "*result" is used.  A negative result from BlueFS::read_random()
+  // trips ceph_assert rather than being reported as a non-OK status.
+  //
+  // Safe for concurrent use by multiple threads.
+  rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
+                       char* scratch) const override {
+    const int64_t got = fs->read_random(h, offset, n, scratch);
+    ceph_assert(got >= 0);
+    *result = rocksdb::Slice(scratch, got);
+    return rocksdb::Status::OK();
+  }
+
+  // Writes the file's inode number as 16 zero-padded hex digits (plus a
+  // terminating NUL) into "id" and returns the ID length, 16.
+  //
+  // RocksDB's contract is that 0 is returned when no ID can be produced
+  // within "max_size" bytes.  snprintf() reports the length it *wanted* to
+  // write, so its result is only passed on when the whole ID actually fit;
+  // previously a too-small buffer yielded a truncated ID with length 16.
+  //
+  // The fixed width means no ID is a prefix of another.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    const int len = snprintf(id, max_size, "%016llx",
+                             static_cast<unsigned long long>(h->file->fnode.ino));
+    return (len > 0 && static_cast<size_t>(len) < max_size) ? len : 0;
+  }
+
+  // Readahead the file starting from offset by n bytes for caching.
+  // No output buffer is supplied to the read and its result is not
+  // checked, so this always reports OK.
+  rocksdb::Status Prefetch(uint64_t offset, size_t n) override {
+    fs->read(h, offset, n, nullptr, nullptr);
+    return rocksdb::Status::OK();
+  }
+
+  // Adjusts the reader's prefetch limit: RANDOM caps it at 4096 bytes,
+  // SEQUENTIAL sets it to the configured bluefs_max_prefetch, and every
+  // other pattern (NORMAL, WILLNEED, DONTNEED) leaves it untouched.
+  void Hint(AccessPattern pattern) override {
+    if (pattern == RANDOM)
+      h->buf.max_prefetch = 4096;
+    else if (pattern == SEQUENTIAL)
+      h->buf.max_prefetch = fs->cct->_conf->bluefs_max_prefetch;
+  }
+
+  // Direct I/O is reported whenever bluefs_buffered_io is disabled.
+  bool use_direct_io() const override {
+    return !fs->cct->_conf->bluefs_buffered_io;
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+    h->buf.invalidate_cache(offset, length);
+    fs->invalidate_cache(h->file, offset, length);
+    return rocksdb::Status::OK();
+  }
+};
+
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class BlueRocksWritableFile : public rocksdb::WritableFile {
+  BlueFS *fs;
+  BlueFS::FileWriter *h;  // handed to BlueFS::close_writer() on destruction
+ public:
+  BlueRocksWritableFile(BlueFS *fs, BlueFS::FileWriter *h) : fs(fs), h(h) {}
+  ~BlueRocksWritableFile() override {
+    fs->close_writer(h);
+  }
+
+  // Indicates if the class makes use of unbuffered I/O
+  /*bool UseOSBuffer() const {
+    return true;
+    }*/
+
+  // This is needed when you want to allocate
+  // AlignedBuffer for use with file I/O classes
+  // Used for unbuffered file I/O when UseOSBuffer() returns false
+  /*size_t GetRequiredBufferAlignment() const {
+    return c_DefaultPageSize;
+    }*/
+
+  // Hands "data" to BlueFS::append_try_flush(); always reports OK.
+  rocksdb::Status Append(const rocksdb::Slice& data) override {
+    fs->append_try_flush(h, data.data(), data.size());
+    return rocksdb::Status::OK();
+  }
+
+  // Positioned writes are not implemented; callers always get NotSupported.
+  rocksdb::Status PositionedAppend(
+    const rocksdb::Slice& /* data */,
+    uint64_t /* offset */) override {
+    return rocksdb::Status::NotSupported();
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole pages writes. The behavior is undefined if called
+  // with other writes to follow.
+  //
+  // Deliberately a no-op that ignores "size": we mirror the posix env, which
+  // does nothing here and instead truncates to the final size in Close().
+  // (A direct fs->truncate(h, size) call is intentionally left disabled.)
+  rocksdb::Status Truncate(uint64_t size) override {
+    return rocksdb::Status::OK();
+  }
+
+  // Syncs the file, then (mimicking the posix env) truncates it to the writer
+  // position when GetPreallocationStatus() reports a non-zero last block.
+  rocksdb::Status Close() override {
+    fs->fsync(h);
+    size_t block_size;
+    size_t last_allocated_block;
+    GetPreallocationStatus(&block_size, &last_allocated_block);
+    if (last_allocated_block > 0) {
+      int r = fs->truncate(h, h->pos);
+      if (r < 0)
+        return err_to_status(r);
+    }
+
+    return rocksdb::Status::OK();
+  }
+
+  rocksdb::Status Flush() override {
+    fs->flush(h);
+    return rocksdb::Status::OK();
+  }
+
+  rocksdb::Status Sync() override { // sync data
+    fs->fsync(h);
+    return rocksdb::Status::OK();
+  }
+
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  bool IsSyncThreadSafe() const override {
+    return true;
+  }
+
+  // Reports that this file does not use direct I/O.  NOTE(review): not marked
+  // override -- confirm rocksdb::WritableFile still has a hook by this name.
+  bool UseDirectIO() const {
+    return false;
+  }
+
+  void SetWriteLifeTimeHint(rocksdb::Env::WriteLifeTimeHint hint) override {
+    h->write_hint = static_cast<int>(hint);
+  }
+
+  // Get the size of valid data in the file: the size recorded in the fnode
+  // plus the length reported by the writer's get_buffer_length()
+  // (presumably data appended but not yet flushed -- confirm in BlueFS).
+  uint64_t GetFileSize() override {
+    return h->file->fnode.size + h->get_buffer_length();
+  }
+
+  // Same scheme as the random-access reader: the inode number as 16 hex
+  // digits.  Returns 0 (RocksDB's "no ID" answer) when the ID plus its NUL
+  // does not fit in "max_size", instead of snprintf()'s would-be length.
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    const int len = snprintf(id, max_size, "%016llx",
+                             static_cast<unsigned long long>(h->file->fnode.ino));
+    return (len > 0 && static_cast<size_t>(len) < max_size) ? len : 0;
+  }
+
+  // Syncs the file first, then drops cached data for [offset, offset+length)
+  // of the underlying file; a length of 0 refers to the end of file.
+  // This call has no effect on dirty pages in the cache.
+  rocksdb::Status InvalidateCache(size_t offset, size_t length) override {
+    fs->fsync(h);
+    fs->invalidate_cache(h->file, offset, length);
+    return rocksdb::Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  rocksdb::Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+    // align to 4 KiB pages: move offset down, then drop a trailing partial page
+    int partial = offset & 4095;
+    offset -= partial;
+    nbytes += partial;
+    nbytes &= ~4095;
+    if (nbytes)
+      fs->flush_range(h, offset, nbytes);
+    return rocksdb::Status::OK();
+  }
+
+ protected:
+  // Pre-allocate space for a file.
+  rocksdb::Status Allocate(uint64_t offset, uint64_t len) override {
+    int r = fs->preallocate(h->file, offset, len);
+    return err_to_status(r);
+  }
+};
+
+
+// Directory handle returned to RocksDB.  It keeps no per-directory state,
+// only the BlueFS instance whose metadata Fsync() has to sync.
+class BlueRocksDirectory : public rocksdb::Directory {
+  BlueFS *fs;
+ public:
+  explicit BlueRocksDirectory(BlueFS *f) : fs{f} {}
+
+  // Fsync directory. Can be called concurrently from multiple threads.
+  // Flushing the BlueFS metadata log is all that is required.
+  rocksdb::Status Fsync() override {
+    fs->sync_metadata(false);
+    return rocksdb::Status::OK();
+  }
+};
+
+// Identifies a locked file: pairs the BlueFS instance with the BlueFS-level
+// lock handle.  The destructor does not release or free "lock".
+class BlueRocksFileLock : public rocksdb::FileLock {
+ public:
+  BlueFS *fs;
+  BlueFS::FileLock *lock;
+  BlueRocksFileLock(BlueFS *fs, BlueFS::FileLock *l) : fs(fs), lock(l) { }
+  ~BlueRocksFileLock() override = default;
+};
+
+
+// --------------------
+// --- BlueRocksEnv ---
+// --------------------
+
+// Anything not overridden by BlueRocksEnv is forwarded to the default
+// (POSIX) Env through EnvWrapper; "f" is kept in the fs member.
+BlueRocksEnv::BlueRocksEnv(BlueFS *f)
+  : EnvWrapper(Env::Default()),
+    fs(f)
+{}
+
+// Absolute paths go to the wrapped Env; all other names are opened in BlueFS.
+rocksdb::Status BlueRocksEnv::NewSequentialFile(
+  const std::string& fname,
+  std::unique_ptr<rocksdb::SequentialFile>* result,
+  const rocksdb::EnvOptions& options)
+{
+  if (fname[0] == '/')
+    return target()->NewSequentialFile(fname, result, options);
+  const auto [dir, file] = split(fname);
+  BlueFS::FileReader *reader;
+  if (int r = fs->open_for_read(dir, file, &reader, false); r < 0)
+    return err_to_status(r);
+  result->reset(new BlueRocksSequentialFile(fs, reader));
+  return rocksdb::Status::OK();
+}
+
+// Opens "fname" in BlueFS and wraps the reader for positional reads.
+rocksdb::Status BlueRocksEnv::NewRandomAccessFile(
+  const std::string& fname,
+  std::unique_ptr<rocksdb::RandomAccessFile>* result,
+  const rocksdb::EnvOptions& options)
+{
+  const auto [dir, file] = split(fname);
+  BlueFS::FileReader *reader;
+  if (int r = fs->open_for_read(dir, file, &reader, true); r < 0)
+    return err_to_status(r);
+  result->reset(new BlueRocksRandomAccessFile(fs, reader));
+  return rocksdb::Status::OK();
+}
+
+// Opens "fname" for writing in BlueFS and wraps the writer for RocksDB.
+rocksdb::Status BlueRocksEnv::NewWritableFile(
+  const std::string& fname,
+  std::unique_ptr<rocksdb::WritableFile>* result,
+  const rocksdb::EnvOptions& options)
+{
+  const auto [dir, file] = split(fname);
+  BlueFS::FileWriter *writer;
+  if (int r = fs->open_for_write(dir, file, &writer, false); r < 0)
+    return err_to_status(r);
+  result->reset(new BlueRocksWritableFile(fs, writer));
+  return rocksdb::Status::OK();
+}
+
+// Renames old_fname to new_fname inside BlueFS, reopens it for writing
+// (last open_for_write flag set) and then syncs the BlueFS metadata.
+rocksdb::Status BlueRocksEnv::ReuseWritableFile(
+  const std::string& new_fname,
+  const std::string& old_fname,
+  std::unique_ptr<rocksdb::WritableFile>* result,
+  const rocksdb::EnvOptions& options)
+{
+  const auto [old_dir, old_file] = split(old_fname);
+  const auto [new_dir, new_file] = split(new_fname);
+
+  if (int r = fs->rename(old_dir, old_file, new_dir, new_file); r < 0)
+    return err_to_status(r);
+
+  BlueFS::FileWriter *writer;
+  if (int r = fs->open_for_write(new_dir, new_file, &writer, true); r < 0)
+    return err_to_status(r);
+  result->reset(new BlueRocksWritableFile(fs, writer));
+  fs->sync_metadata(false);
+  return rocksdb::Status::OK();
+}
+
+// Succeeds only when BlueFS already has a directory called "name".
+rocksdb::Status BlueRocksEnv::NewDirectory(const std::string& name,
+  std::unique_ptr<rocksdb::Directory>* result)
+{
+  if (!fs->dir_exists(name))
+    return rocksdb::Status::NotFound(name, strerror(ENOENT));
+  result->reset(new BlueRocksDirectory(fs));
+  return rocksdb::Status::OK();
+}
+
+// Absolute paths are checked in the wrapped Env; any BlueFS stat failure maps to -ENOENT.
+rocksdb::Status BlueRocksEnv::FileExists(const std::string& fname)
+{
+  if (fname[0] == '/')
+    return target()->FileExists(fname);
+  const auto [dir, file] = split(fname);
+  const bool found = fs->stat(dir, file, nullptr, nullptr) == 0;
+  return found ? rocksdb::Status::OK() : err_to_status(-ENOENT);
+}
+
+// Replaces *result with BlueFS's listing of "dir"; any readdir failure becomes NotFound.
+rocksdb::Status BlueRocksEnv::GetChildren(
+  const std::string& dir,
+  std::vector<std::string>* result)
+{
+  result->clear();
+  if (fs->readdir(dir, result) < 0)
+    return rocksdb::Status::NotFound(dir, strerror(ENOENT));
+  return rocksdb::Status::OK();
+}
+
+// Unlinks "fname" in BlueFS and, on success, syncs the BlueFS metadata.
+rocksdb::Status BlueRocksEnv::DeleteFile(const std::string& fname)
+{
+  const auto [dir, file] = split(fname);
+  if (int r = fs->unlink(dir, file); r < 0)
+    return err_to_status(r);
+  fs->sync_metadata(false);
+  return rocksdb::Status::OK();
+}
+
+// Creates "dirname" in BlueFS; a negative mkdir result is mapped via err_to_status().
+rocksdb::Status BlueRocksEnv::CreateDir(const std::string& dirname)
+{
+  if (int r = fs->mkdir(dirname); r < 0)
+    return err_to_status(r);
+  return rocksdb::Status::OK();
+}
+
+// Like CreateDir(), except that -EEXIST from mkdir counts as success.
+rocksdb::Status BlueRocksEnv::CreateDirIfMissing(const std::string& dirname)
+{
+  const int r = fs->mkdir(dirname);
+  const bool failed = r < 0 && r != -EEXIST;
+  return failed ? err_to_status(r) : rocksdb::Status::OK();
+}
+
+// Removes "dirname" from BlueFS; a negative rmdir result is mapped via err_to_status().
+rocksdb::Status BlueRocksEnv::DeleteDir(const std::string& dirname)
+{
+  if (int r = fs->rmdir(dirname); r < 0)
+    return err_to_status(r);
+  return rocksdb::Status::OK();
+}
+
+// Stores the size BlueFS reports for "fname" in *file_size.
+rocksdb::Status BlueRocksEnv::GetFileSize(
+  const std::string& fname,
+  uint64_t* file_size)
+{
+  const auto [dir, file] = split(fname);
+  if (int r = fs->stat(dir, file, file_size, nullptr); r < 0)
+    return err_to_status(r);
+  return rocksdb::Status::OK();
+}
+
+// Stores the whole-second part of the mtime BlueFS reports in *file_mtime.
+rocksdb::Status BlueRocksEnv::GetFileModificationTime(const std::string& fname,
+                                                      uint64_t* file_mtime)
+{
+  const auto [dir, file] = split(fname);
+  utime_t mtime;
+  if (int r = fs->stat(dir, file, nullptr, &mtime); r < 0)
+    return err_to_status(r);
+  *file_mtime = mtime.sec();
+  return rocksdb::Status::OK();
+}
+
+// Renames src to target inside BlueFS and, on success, syncs the metadata.
+rocksdb::Status BlueRocksEnv::RenameFile(
+  const std::string& src,
+  const std::string& target)
+{
+  const auto [old_dir, old_file] = split(src);
+  const auto [new_dir, new_file] = split(target);
+
+  if (int r = fs->rename(old_dir, old_file, new_dir, new_file); r < 0)
+    return err_to_status(r);
+  fs->sync_metadata(false);
+  return rocksdb::Status::OK();
+}
+
+// Hard links are not implemented here: calling this aborts the process.
+rocksdb::Status BlueRocksEnv::LinkFile(const std::string& src,
+                                       const std::string& target)
+{
+  ceph_abort();
+}
+
+// Both paths must name an existing BlueFS directory or file; "same" is then
+// decided purely by comparing the two path strings.
+rocksdb::Status BlueRocksEnv::AreFilesSame(
+  const std::string& first,
+  const std::string& second, bool* res)
+{
+  for (auto& path : {first, second}) {
+    if (fs->dir_exists(path))
+      continue;
+    const auto [dir, file] = split(path);
+    switch (const int r = fs->stat(dir, file, nullptr, nullptr); r) {
+    case 0: break;
+    case -ENOENT:
+      return rocksdb::Status::NotFound("AreFilesSame", path);
+    default:
+      return err_to_status(r);
+    }
+  }
+  *res = (first == second);
+  return rocksdb::Status::OK();
+}
+
+// Takes the BlueFS lock on "fname".  On failure *lock is set to nullptr, as
+// the Env contract promises, instead of being left with its previous value.
+rocksdb::Status BlueRocksEnv::LockFile(const std::string& fname, rocksdb::FileLock** lock)
+{
+  *lock = nullptr;
+  const auto [dir, file] = split(fname);
+  BlueFS::FileLock *l = nullptr;
+  if (int r = fs->lock_file(dir, file, &l); r < 0)
+    return err_to_status(r);
+  *lock = new BlueRocksFileLock(fs, l);
+  return rocksdb::Status::OK();
+}
+
+// Releases the BlueFS lock wrapped by "lock" and, on success, frees the
+// wrapper.  If unlock_file() fails the wrapper is left allocated.
+rocksdb::Status BlueRocksEnv::UnlockFile(rocksdb::FileLock* lock)
+{
+  auto *wrapper = static_cast<BlueRocksFileLock*>(lock);
+  if (int r = fs->unlock_file(wrapper->lock); r < 0)
+    return err_to_status(r);
+  delete lock;  // the caller's pointer is a by-value copy; nothing to reset
+  return rocksdb::Status::OK();
+}
+
+// Not a real absolute path ("this is a lie"): a '/' is simply prepended
+// to db_path.
+rocksdb::Status BlueRocksEnv::GetAbsolutePath(const std::string& db_path,
+                                              std::string* output_path)
+{
+  *output_path = "/" + db_path;
+  return rocksdb::Status::OK();
+}
+
+// "fname" is ignored: the logger always comes from
+// create_rocksdb_ceph_logger().
+rocksdb::Status BlueRocksEnv::NewLogger(const std::string& fname,
+                                        std::shared_ptr<rocksdb::Logger>* result)
+{
+  result->reset(create_rocksdb_ceph_logger());
+  return rocksdb::Status::OK();
+}
+
+rocksdb::Status BlueRocksEnv::GetTestDirectory(std::string* path)
+{
+  static int next_id = 0;  // bumped on every call: temp_1, temp_2, ...
+  *path = "temp_" + stringify(++next_id);
+  return rocksdb::Status::OK();
+}
diff --git a/src/os/bluestore/BlueRocksEnv.h b/src/os/bluestore/BlueRocksEnv.h
new file mode 100644
index 000000000..62bcddcf6
--- /dev/null
+++ b/src/os/bluestore/BlueRocksEnv.h
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+#define CEPH_OS_BLUESTORE_BLUEROCKSENV_H
+
+#include <memory>
+#include <string>
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/env_mirror.h"
+
+#include "include/ceph_assert.h"
+#include "kv/RocksDBStore.h"
+
+class BlueFS;
+
+class BlueRocksEnv : public rocksdb::EnvWrapper {
+public:
+  // Open the named file for sequential reading.  Names beginning with '/'
+  // are forwarded to the wrapped Env; all others are opened in BlueFS.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // If BlueFS cannot open the file, returns non-OK and leaves *result as is.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  rocksdb::Status NewSequentialFile(
+    const std::string& fname,
+    std::unique_ptr<rocksdb::SequentialFile>* result,
+    const rocksdb::EnvOptions& options) override;
+
+  // Open the named BlueFS file for random-access reads.  On success,
+  // stores a pointer to the new file in *result and returns OK.  On
+  // failure (for example, the file does not exist) returns a non-OK
+  // status derived from the BlueFS error code and leaves *result
+  // untouched.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  rocksdb::Status NewRandomAccessFile(
+    const std::string& fname,
+    std::unique_ptr<rocksdb::RandomAccessFile>* result,
+    const rocksdb::EnvOptions& options) override;
+
+  // Create an object that writes to the named BlueFS file.  On
+  // success, stores a pointer to the new file in *result and returns
+  // OK.  On failure, returns non-OK and leaves *result untouched.
+  // NOTE(review): whether an existing file of the same name is replaced
+  // depends on BlueFS::open_for_write(); confirm there.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  rocksdb::Status NewWritableFile(
+    const std::string& fname,
+    std::unique_ptr<rocksdb::WritableFile>* result,
+    const rocksdb::EnvOptions& options) override;
+
+  // Reuse an existing file by renaming it and opening it as writable.
+  rocksdb::Status ReuseWritableFile(
+    const std::string& fname,
+    const std::string& old_fname,
+    std::unique_ptr<rocksdb::WritableFile>* result,
+    const rocksdb::EnvOptions& options) override;
+
+  // Create an object that represents a directory. Will fail if directory
+  // doesn't exist in BlueFS. If the directory exists, a new Directory
+  // object is created for it.
+  //
+  // On success, stores a pointer to the new Directory in
+  // *result and returns OK. On failure returns NotFound and leaves
+  // *result untouched.
+  rocksdb::Status NewDirectory(
+    const std::string& name,
+    std::unique_ptr<rocksdb::Directory>* result) override;
+
+  // Returns OK if the named file exists.
+  //         NotFound if BlueFS cannot stat the named file, whatever
+  //                  the underlying reason.
+  // Names beginning with '/' are answered by the wrapped Env instead of
+  // BlueFS.
+  rocksdb::Status FileExists(const std::string& fname) override;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  rocksdb::Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) override;
+
+  // Delete the named file.
+  rocksdb::Status DeleteFile(const std::string& fname) override;
+
+  // Create the specified directory. Returns error if directory exists.
+  rocksdb::Status CreateDir(const std::string& dirname) override;
+
+  // Create directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  rocksdb::Status CreateDirIfMissing(const std::string& dirname) override;
+
+  // Delete the specified directory.
+  rocksdb::Status DeleteDir(const std::string& dirname) override;
+
+  // Store the size of fname in *file_size.
+  rocksdb::Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
+
+  // Store the last modification time of fname in *file_mtime.
+  rocksdb::Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) override;
+  // Rename file src to target.
+  rocksdb::Status RenameFile(const std::string& src,
+                            const std::string& target) override;
+  // Hard Link file src to target.  Not supported: the implementation aborts.
+  rocksdb::Status LinkFile(const std::string& src, const std::string& target) override;
+
+  // Tell if two existing paths are the same file (compares the path strings).
+  rocksdb::Status AreFilesSame(const std::string& first,
+			       const std::string& second, bool* res) override;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  rocksdb::Status LockFile(const std::string& fname, rocksdb::FileLock** lock) override;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  rocksdb::Status UnlockFile(rocksdb::FileLock* lock) override;
+
+  // *path is set to the name of a temporary directory that can be used for
+  // testing.  The directory itself is not created here.  Each call returns
+  // a fresh name ("temp_1", "temp_2", ...) taken from a per-process
+  // counter, so successive calls do not return the same directory.
+  rocksdb::Status GetTestDirectory(std::string* path) override;
+
+  // Return a logger for informational messages; fname is ignored.
+  rocksdb::Status NewLogger(
+    const std::string& fname,
+    std::shared_ptr<rocksdb::Logger>* result) override;
+
+  // Get full directory name for this db (db_path with '/' prepended).
+  rocksdb::Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) override;
+
+  explicit BlueRocksEnv(BlueFS *f);
+private:
+  BlueFS *fs;
+};
+
+#endif
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
new file mode 100644
index 000000000..aa14d0204
--- /dev/null
+++ b/src/os/bluestore/BlueStore.cc
@@ -0,0 +1,19631 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <bit>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <algorithm>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_real.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "BlueStore.h"
+#include "bluestore_common.h"
+#include "simple_bitmap.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/intarith.h"
+#include "include/stringify.h"
+#include "include/str_map.h"
+#include "include/util.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/PriorityCache.h"
+#include "common/url_escape.h"
+#include "Allocator.h"
+#include "FreelistManager.h"
+#include "BlueFS.h"
+#include "BlueRocksEnv.h"
+#include "auth/Crypto.h"
+#include "common/EventTrace.h"
+#include "perfglue/heap_profiler.h"
+#include "common/blkdev.h"
+#include "common/numa.h"
+#include "common/pretty_binary.h"
+#include "kv/KeyValueHistogram.h"
+
+#ifdef HAVE_LIBZBD
+#include "ZonedAllocator.h"
+#include "ZonedFreelistManager.h"
+#endif
+
+#if defined(WITH_LTTNG)
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/bluestore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+// Alias for whatever type BlueStore::Blob::id is declared with.
+using bid_t = decltype(BlueStore::Blob::id);
+
+// Mempool object factories for the BlueStore in-memory types.  The macro is
+// defined outside this chunk; each use names the C++ type first, followed by
+// two identifiers (presumably the factory name and the mempool the objects
+// are accounted to -- confirm against the macro definition).
+
+// bluestore_cache_onode
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
+			      bluestore_cache_onode);
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
+			      bluestore_cache_buffer);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
+			      bluestore_extent);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
+			      bluestore_blob);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
+			      bluestore_shared_blob);
+
+// bluestore_txc
+MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
+			      bluestore_txc);
+using std::byte;
+using std::deque;
+using std::min;
+using std::make_pair;
+using std::numeric_limits;
+using std::pair;
+using std::less;
+using std::list;
+using std::make_unique;
+using std::map;
+using std::max;
+using std::ostream;
+using std::ostringstream;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::unique_ptr;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::coarse_mono_clock;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+using ceph::JSONFormatter;
+using ceph::make_timespan;
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan_str;
+
+// kv store prefixes
+//
+// Each prefix is a one-character string.  The trailing comment on every line
+// gives the "key -> value" layout of the records kept under that prefix.
+const string PREFIX_SUPER = "S";       // field -> value
+const string PREFIX_STAT = "T";        // field -> value(int64 array)
+const string PREFIX_COLL = "C";        // collection name -> cnode_t
+const string PREFIX_OBJ = "O";         // object name -> onode_t
+const string PREFIX_OMAP = "M";        // u64 + keyname -> value
+const string PREFIX_PGMETA_OMAP = "P"; // u64 + keyname -> value(for meta coll)
+const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
+const string PREFIX_PERPG_OMAP = "p";   // u64(pool) + u32(hash) + u64(id) + keyname -> value
+const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
+const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
+const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
+const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
+
+// Prefixes that only exist in builds with zoned block device support.
+#ifdef HAVE_LIBZBD
+const string PREFIX_ZONED_FM_META = "Z";  // (see ZonedFreelistManager)
+const string PREFIX_ZONED_FM_INFO = "z";  // (see ZonedFreelistManager)
+const string PREFIX_ZONED_CL_INFO = "G";  // (per-zone cleaner metadata)
+#endif
+
+// Key name for the store-wide statfs record (presumably stored under
+// PREFIX_STAT -- confirm against the code that reads/writes it).
+const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
+
+// write a label in the first block.  always use this size.  note that
+// bluefs makes a matching assumption about the location of its
+// superblock (always the second block of the device).
+#define BDEV_LABEL_BLOCK_SIZE  4096
+
+// reserve: label (4k) + bluefs super (4k), which means we start at 8k.
+#define SUPER_RESERVED  8192
+
+#define OBJECT_MAX_SIZE 0xffffffff // 32 bits
+
+
+/*
+ * extent map blob encoding
+ *
+ * we use the low bits of the blobid field to indicate some common scenarios
+ * and spanning vs local ids.  See ExtentMap::{encode,decode}_some().
+ */
+#define BLOBID_FLAG_CONTIGUOUS 0x1  // this extent starts at end of previous
+#define BLOBID_FLAG_ZEROOFFSET 0x2  // blob_offset is 0
+#define BLOBID_FLAG_SAMELENGTH 0x4  // length matches previous extent
+#define BLOBID_FLAG_SPANNING   0x8  // has spanning blob id
+#define BLOBID_SHIFT_BITS        4
+
+/*
+ * object name key structure
+ *
+ * encoded u8: shard + 2^7 (so that it sorts properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * escaped string: namespace
+ *
+ * escaped string: key or object name
+ * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
+ *         we are done.  otherwise, we are followed by the object name.
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ * 'o'
+ */
+#define ONODE_KEY_SUFFIX 'o'
+
+/*
+ * extent shard key
+ *
+ * object prefix key
+ * u32
+ * 'x'
+ */
+#define EXTENT_SHARD_KEY_SUFFIX 'x'
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to sort lexicographically the same way that
+ * ghobject_t does.  We do this by escaping any character <= '#' as '#'
+ * followed by its two-digit hex value, and any character >= '~' as '~'
+ * followed by its two-digit hex value.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ * NOTE: There is a bug in this implementation: due to implicit
+ * character type conversion in comparison it may produce unexpected
+ * ordering. Unfortunately fixing the bug would mean invalidating the
+ * keys in existing deployments. Instead we do additional sorting
+ * where it is needed.
+ */
+// Append the escaped form of `in` to *out, followed by the '!' terminator.
+//
+// Characters <= '#' become '#' + two lowercase hex digits, characters >= '~'
+// become '~' + two lowercase hex digits, everything else is copied verbatim.
+// The comparisons are deliberately done on plain `char`: for bytes > 0x7f the
+// outcome depends on the signedness of char (the known ordering bug described
+// above), and changing that would invalidate keys already written to disk.
+template<typename S>
+static void append_escaped(const string &in, S *out)
+{
+  static const char hex_digits[] = "0123456789abcdef";
+  // worst case every input byte expands to three, plus one for the terminator
+  char escaped[in.length() * 3 + 1];
+  char *cursor = escaped;
+  for (const char c : in) {
+    const bool low = (c <= '#');   // bug: unexpected result for c > 0x7f
+    if (low || c >= '~') {         // bug: unexpected result for c > 0x7f
+      *cursor++ = low ? '#' : '~';
+      *cursor++ = hex_digits[(c >> 4) & 0x0f];
+      *cursor++ = hex_digits[c & 0x0f];
+    } else {
+      *cursor++ = c;
+    }
+  }
+  *cursor++ = '!';
+  out->append(escaped, cursor - escaped);
+}
+
+// Convert one hex digit (either case) to its value 0..15.
+// Any other character yields 256, so that the result -- even when combined
+// with a valid nibble -- is always larger than 255 and can be rejected.
+inline unsigned h2i(char c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  if (c >= 'a' && c <= 'f')
+    return 10 + (c - 'a');
+  if (c >= 'A' && c <= 'F')
+    return 10 + (c - 'A');
+  return 256;
+}
+
+// Decode an escaped string starting at p and append the result to *out.
+//
+// Decoding stops at the first '!' or NUL, which is not consumed.  A '#' or
+// '~' must be followed by two hex digits, otherwise -EINVAL is returned
+// (data flushed to *out before the error stays there).  On success the
+// return value is the number of input bytes consumed.
+static int decode_escaped(const char *p, string *out)
+{
+  // decoded bytes are staged here and handed to *out in batches
+  char stage[256];
+  char *const flush_mark = &stage[252];
+  char *wr = stage;
+  const char *const begin = p;
+
+  for (; *p && *p != '!'; ) {
+    if (*p != '#' && *p != '~') {
+      *wr++ = *p++;
+    } else {
+      ++p;  // skip the escape marker
+      unsigned value = h2i(*p++) << 4;
+      if (value > 255) {
+        return -EINVAL;
+      }
+      value |= h2i(*p++);
+      if (value > 255) {
+        return -EINVAL;
+      }
+      *wr++ = value;
+    }
+    if (wr > flush_mark) {
+      out->append(stage, wr - stage);
+      wr = stage;
+    }
+  }
+
+  if (wr != stage) {
+    out->append(stage, wr - stage);
+  }
+  return p - begin;
+}
+
+// Append the one-byte shard encoding to *key: the shard id biased by 0x80
+// (see "object name key structure" above: shard + 2^7 so that it sorts
+// properly).
+template<typename T>
+static void _key_encode_shard(shard_id_t shard, T *key)
+{
+  const int biased = (uint8_t)shard.id + (uint8_t)0x80;
+  key->push_back((char)biased);
+}
+
+// Inverse of _key_encode_shard(): read one byte at key, remove the 0x80 bias
+// and store the result in pshard->id.  Returns the position after that byte.
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+  const int unbiased = (uint8_t)key[0] - (uint8_t)0x80;
+  pshard->id = unbiased;
+  return key + 1;
+}
+
+/*
+ * Fill in the ghobject_t bounds used to enumerate collection `cid`.
+ *
+ * Two pairs are produced: (start, end) and (temp_start, temp_end).  For a PG
+ * collection the temp pair uses pool id (-2 - pool); for any other
+ * collection there is no separate temp section and both temp bounds are set
+ * equal to *end.
+ *
+ * bits:   number of significant hash bits of the PG; the range covers
+ *         2^(32 - bits) hash values starting at the bit-reversed pgid.ps().
+ * legacy: selects which maximal namespace value ("\x7f" vs "\xff") is used
+ *         when the upper bound has to be clamped to the largest hash.
+ *
+ * NOTE: *start is read before being fully assigned (it is copied into
+ * *temp_start / *end), so fields not set here keep whatever the caller
+ * passed in.
+ */
+static void get_coll_range(const coll_t& cid, int bits,
+  ghobject_t *temp_start, ghobject_t *temp_end,
+  ghobject_t *start, ghobject_t *end, bool legacy)
+{
+  spg_t pgid;
+  constexpr uint32_t MAX_HASH = std::numeric_limits<uint32_t>::max();
+  // the maximal namespace differs between the legacy and the current scheme
+  // because the two schemes encode/sort keys differently when listing objects
+  const std::string_view MAX_NSPACE = legacy ? "\x7f" : "\xff";
+  if (cid.is_pg(&pgid)) {
+    start->shard_id = pgid.shard;
+    *temp_start = *start;
+
+    // regular objects live in the PG's pool, temp objects in pool (-2 - pool)
+    start->hobj.pool = pgid.pool();
+    temp_start->hobj.pool = -2ll - pgid.pool();
+
+    // the end bounds start out as copies; only the hash (and possibly the
+    // namespace) is changed below
+    *end = *start;
+    *temp_end = *temp_start;
+
+    uint32_t reverse_hash = hobject_t::_reverse_bits(pgid.ps());
+    start->hobj.set_bitwise_key_u32(reverse_hash);
+    temp_start->hobj.set_bitwise_key_u32(reverse_hash);
+
+    // computed in 64 bits so that running past the 32-bit hash space is
+    // detectable
+    uint64_t end_hash = reverse_hash  + (1ull << (32 - bits));
+    if (end_hash > MAX_HASH) {
+      // make sure end hobj is even greater than the maximum possible hobj
+      end->hobj.set_bitwise_key_u32(MAX_HASH);
+      temp_end->hobj.set_bitwise_key_u32(MAX_HASH);
+      end->hobj.nspace = MAX_NSPACE;
+    } else {
+      end->hobj.set_bitwise_key_u32(end_hash);
+      temp_end->hobj.set_bitwise_key_u32(end_hash);
+    }
+  } else {
+    // non-PG collection: no shard, pool -1, whole hash space
+    start->shard_id = shard_id_t::NO_SHARD;
+    start->hobj.pool = -1ull;
+
+    *end = *start;
+    start->hobj.set_bitwise_key_u32(0);
+    end->hobj.set_bitwise_key_u32(MAX_HASH);
+    end->hobj.nspace = MAX_NSPACE;
+    // no separate temp section
+    *temp_start = *end;
+    *temp_end = *end;
+  }
+
+  start->generation = 0;
+  end->generation = 0;
+  temp_start->generation = 0;
+  temp_end->generation = 0;
+}
+
+// Build the kv key of a shared blob: *key is reset and then holds only the
+// u64 encoding of sbid.
+static void get_shared_blob_key(uint64_t sbid, string *key)
+{
+  string &k = *key;
+  k.clear();
+  _key_encode_u64(sbid, &k);
+}
+
+// Inverse of get_shared_blob_key(): decode the leading u64 of key into *sbid.
+// Returns 0 on success, -1 if key is too short to hold a u64.
+static int get_key_shared_blob(const string& key, uint64_t *sbid)
+{
+  const bool too_short = key.length() < sizeof(uint64_t);
+  if (too_short) {
+    return -1;
+  }
+  _key_decode_u64(key.c_str(), sbid);
+  return 0;
+}
+
+// Append the fixed-size object key prefix to *key: shard byte, pool id biased
+// by 2^63, and the bitwise (bit-reversed) hash.
+template<typename S>
+static void _key_encode_prefix(const ghobject_t& oid, S *key)
+{
+  const auto &ho = oid.hobj;
+  const uint64_t biased_pool = ho.pool + 0x8000000000000000ull;
+
+  _key_encode_shard(oid.shard_id, key);
+  _key_encode_u64(biased_pool, key);
+  _key_encode_u32(ho.get_bitwise_key_u32(), key);
+}
+
+// Inverse of _key_encode_prefix(): decode shard, pool and hash from p into
+// *oid and return the position following the prefix.
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
+  uint64_t biased_pool;
+  unsigned hash;
+
+  // consume the three fixed-width fields in encoding order
+  p = _key_decode_shard(p, &oid->shard_id);
+  p = _key_decode_u64(p, &biased_pool);
+  p = _key_decode_u32(p, &hash);
+
+  // undo the 2^63 bias applied when encoding
+  oid->hobj.pool = biased_pool - 0x8000000000000000ull;
+  oid->hobj.set_bitwise_key_u32(hash);
+  return p;
+}
+
+
+#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
+
+/*
+ * Decode a complete onode key (layout described under "object name key
+ * structure" above) from the NUL-terminated buffer p into *oid.
+ *
+ * Returns 0 on success or a negative code identifying the failing stage:
+ *   -2  namespace is malformed or not terminated by '!'
+ *   -3  key/name string is malformed or not terminated by '!'
+ *   -5  object name (after '<' / '>') is malformed or not terminated
+ *   -6  the separator is none of '<', '=', '>'
+ *   -7  the trailing ONODE_KEY_SUFFIX is missing
+ *   -8  unexpected bytes follow the suffix
+ *
+ * The escaped strings are appended to the corresponding fields of *oid, so
+ * callers are expected to pass a freshly constructed ghobject_t.
+ *
+ * NOTE: the fixed-width fields (prefix, snap, generation) are decoded without
+ * a length check; the caller must guarantee the buffer is long enough for
+ * them.
+ */
+static int _get_key_object(const char *p, ghobject_t *oid)
+{
+  int r;
+
+  p = _key_decode_prefix(p, oid);
+
+  // decode_escaped() stops on '!' or on NUL.  Stopping on NUL means the key
+  // is truncated; skipping "the terminator" in that case would step past the
+  // end of the buffer, so treat it as a decode failure.
+  r = decode_escaped(p, &oid->hobj.nspace);
+  if (r < 0 || p[r] != '!')
+    return -2;
+  p += r + 1;
+
+  string k;
+  r = decode_escaped(p, &k);
+  if (r < 0 || p[r] != '!')
+    return -3;
+  p += r + 1;
+  if (*p == '=') {
+    // no key
+    ++p;
+    oid->hobj.oid.name = k;
+  } else if (*p == '<' || *p == '>') {
+    // key + name
+    ++p;
+    r = decode_escaped(p, &oid->hobj.oid.name);
+    if (r < 0 || p[r] != '!')
+      return -5;
+    p += r + 1;
+    oid->hobj.set_key(k);
+  } else {
+    // malformed
+    return -6;
+  }
+
+  p = _key_decode_u64(p, &oid->hobj.snap.val);
+  p = _key_decode_u64(p, &oid->generation);
+
+  if (*p != ONODE_KEY_SUFFIX) {
+    return -7;
+  }
+  p++;
+  if (*p) {
+    // if we get something other than a null terminator here,
+    // something goes wrong.
+    return -8;
+  }
+
+  return 0;
+}
+
+// Decode an onode key held in a string-like object into *oid.
+// Returns -1 if the key is shorter than the fixed prefix, -2 if it consists
+// of the prefix only, otherwise whatever _get_key_object() returns.
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+  const auto len = key.length();
+  if (len < ENCODED_KEY_PREFIX_LEN) {
+    return -1;
+  }
+  if (len == ENCODED_KEY_PREFIX_LEN) {
+    return -2;
+  }
+  return _get_key_object(key.c_str(), oid);
+}
+
+// Append the full onode key for oid to *key (which is NOT cleared first):
+// prefix, escaped namespace, escaped key/name with a '<' '=' '>' marker,
+// snap, generation and the trailing ONODE_KEY_SUFFIX.
+template<typename S>
+static void _get_object_key(const ghobject_t& oid, S *key)
+{
+  const auto &ho = oid.hobj;
+  const auto &okey = ho.get_key();
+  const auto &name = ho.oid.name;
+
+  // upper bound of an escaped string: 3 bytes per char plus the terminator
+  auto escaped_bound = [](const auto &s) -> size_t {
+    return s.length() * 3 + 1;
+  };
+  const size_t max_len = ENCODED_KEY_PREFIX_LEN +
+                         escaped_bound(ho.nspace) +
+                         escaped_bound(okey) +
+                         1 +        // for '<', '=', or '>'
+                         escaped_bound(name) +
+                         8 + 8 + 1; // snap, generation, suffix
+  key->reserve(max_len);
+
+  _key_encode_prefix(oid, key);
+  append_escaped(ho.nspace, key);
+
+  if (okey.empty()) {
+    // no key: the name takes the key's place
+    append_escaped(name, key);
+    key->append("=");
+  } else {
+    // is a key... could be < = or >.
+    // (ASCII chars < = and > sort in that order, yay)
+    append_escaped(okey, key);
+    const int order = okey.compare(name);
+    if (order == 0) {
+      // same as no key
+      key->append("=");
+    } else {
+      key->append(order > 0 ? ">" : "<");
+      append_escaped(name, key);
+    }
+  }
+
+  _key_encode_u64(ho.snap, key);
+  _key_encode_u64(oid.generation, key);
+  key->push_back(ONODE_KEY_SUFFIX);
+}
+
+// Build the onode key for oid into *key (replacing its previous content) and
+// verify that decoding the key yields oid again; a mismatch is logged and
+// then trips an assertion.
+template<typename S>
+static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key)
+{
+  key->clear();
+  _get_object_key(oid, key);
+
+  // sanity check: the key must round-trip
+  ghobject_t t;
+  const int r = get_key_object(*key, &t);
+  if (!(r || t != oid)) {
+    return;
+  }
+  derr << "  r " << r << dendl;
+  derr << "key " << pretty_binary_string(*key) << dendl;
+  derr << "oid " << oid << dendl;
+  derr << "  t " << t << dendl;
+  ceph_assert(r == 0 && t == oid);
+}
+
+// An extent shard key is the onode key followed by a u32 and a trailing 'x'.
+// The trailing character allows a cheap "is this a shard key?" test that does
+// not need to decode any of the prefix bytes.
+//
+// *key is replaced by the shard key for (onode_key, offset).
+template<typename S>
+static void get_extent_shard_key(const S& onode_key, uint32_t offset,
+				 string *key)
+{
+  const auto base_len = onode_key.length();
+  key->clear();
+  key->reserve(base_len + 4 + 1);  // onode key + u32 + suffix
+  key->append(onode_key.c_str(), onode_key.size());
+  _key_encode_u32(offset, key);
+  key->push_back(EXTENT_SHARD_KEY_SUFFIX);
+}
+
+// Overwrite, in place, the u32 offset of an existing extent shard key.
+// *key must already be a shard key (longer than the u32 + suffix tail and
+// ending in EXTENT_SHARD_KEY_SUFFIX); both conditions are asserted.
+static void rewrite_extent_shard_key(uint32_t offset, string *key)
+{
+  const size_t tail_len = sizeof(uint32_t) + 1;  // u32 + suffix char
+  ceph_assert(key->size() > tail_len);
+  ceph_assert(key->back() == EXTENT_SHARD_KEY_SUFFIX);
+  _key_encode_u32(offset, key->size() - tail_len, key);
+}
+
+// Produce the shard key for (onode_key, offset) in *key and pass it to
+// `apply`.  If *key already holds a shard key only its offset is rewritten,
+// which lets a caller reuse one buffer across several shards; otherwise the
+// full key is built from onode_key (which must then be non-empty).
+template<typename S>
+static void generate_extent_shard_key_and_apply(
+  const S& onode_key,
+  uint32_t offset,
+  string *key,
+  std::function<void(const string& final_key)> apply)
+{
+  if (!key->empty()) {
+    rewrite_extent_shard_key(offset, key);
+  } else {
+    // make full key
+    ceph_assert(!onode_key.empty());
+    get_extent_shard_key(onode_key, offset, key);
+  }
+  apply(*key);
+}
+
+// Split an extent shard key into the onode key it was derived from and the
+// u32 shard offset.  The key shape is asserted, so this always returns 0.
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset)
+{
+  const size_t tail_len = sizeof(uint32_t) + 1;  // u32 + suffix char
+  ceph_assert(key.size() > tail_len);
+  ceph_assert(key.back() == EXTENT_SHARD_KEY_SUFFIX);
+
+  const int okey_len = key.size() - tail_len;
+  *onode_key = key.substr(0, okey_len);
+  _key_decode_u32(key.data() + okey_len, offset);
+  return 0;
+}
+
+// Return true if key ends with EXTENT_SHARD_KEY_SUFFIX, i.e. looks like an
+// extent shard key rather than an onode key.  An empty key is never a shard
+// key (dereferencing rbegin() of an empty string would be undefined).
+static bool is_extent_shard_key(const string& key)
+{
+  return !key.empty() && key.back() == EXTENT_SHARD_KEY_SUFFIX;
+}
+
+// Encode the deferred-transaction sequence number as a u64 into *out.
+// NOTE: unlike get_shared_blob_key() / get_pool_stat_key(), *out is not
+// cleared first, so callers presumably pass an empty string -- confirm at
+// the call sites.
+static void get_deferred_key(uint64_t seq, string *out)
+{
+  _key_encode_u64(seq, out);
+}
+
+// Build the per-pool stat key: *key is reset and then holds only the u64
+// encoding of pool_id.
+static void get_pool_stat_key(int64_t pool_id, string *key)
+{
+  string &k = *key;
+  k.clear();
+  _key_encode_u64(pool_id, &k);
+}
+
+// Inverse of get_pool_stat_key(): decode the leading u64 of key into
+// *pool_id.  Returns 0 on success, -1 if key is too short to hold a u64.
+static int get_key_pool_stat(const string& key, uint64_t* pool_id)
+{
+  const bool too_short = key.length() < sizeof(uint64_t);
+  if (too_short) {
+    return -1;
+  }
+  _key_decode_u64(key.c_str(), pool_id);
+  return 0;
+}
+
+#ifdef HAVE_LIBZBD
+// Build the key "u32 zone + u64 offset + onode key of oid" into *key
+// (replacing its previous content).
+//
+// oid is taken by const reference: it is only read, and copying a
+// ghobject_t (which carries several strings) per call is wasted work.
+static void get_zone_offset_object_key(
+  uint32_t zone,
+  uint64_t offset,
+  const ghobject_t& oid,
+  std::string *key)
+{
+  key->clear();
+  _key_encode_u32(zone, key);
+  _key_encode_u64(offset, key);
+  _get_object_key(oid, key);
+}
+
+// Inverse of get_zone_offset_object_key(): decode zone, offset and the
+// embedded onode key.  Returns -1 if key is too short to contain the two
+// integers, the onode key prefix and at least one more byte; otherwise a
+// negative code from _get_key_object() on failure, or 0.
+static int get_key_zone_offset_object(
+  const string& key,
+  uint32_t *zone,
+  uint64_t *offset,
+  ghobject_t *oid)
+{
+  const size_t min_len =
+    sizeof(uint64_t) + sizeof(uint32_t) + ENCODED_KEY_PREFIX_LEN + 1;
+  if (key.length() < min_len) {
+    return -1;
+  }
+  const char *cursor = _key_decode_u32(key.c_str(), zone);
+  cursor = _key_decode_u64(cursor, offset);
+  const int r = _get_key_object(cursor, oid);
+  return r < 0 ? r : 0;
+}
+#endif
+
+// Log, at debug level LogLevelV, every shard and every extent of em.  For
+// each extent the blob checksums (if any) and the buffers cached for its
+// shared blob are logged too; the buffer map is walked under the owning
+// cache's lock.  Also asserts that extents are ordered and do not overlap.
+template <int LogLevelV>
+void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em)
+{
+  for (auto& shard : em.shards) {
+    dout(LogLevelV) << __func__ << "  shard " << *shard.shard_info
+		    << (shard.loaded ? " (loaded)" : "")
+		    << (shard.dirty ? " (dirty)" : "")
+		    << dendl;
+  }
+
+  // lowest logical offset the next extent may start at
+  uint64_t next_min_offset = 0;
+  for (auto& ext : em.extent_map) {
+    dout(LogLevelV) << __func__ << "  " << ext << dendl;
+    ceph_assert(ext.logical_offset >= next_min_offset);
+    next_min_offset = ext.logical_offset + ext.length;
+
+    const bluestore_blob_t& blob = ext.blob->get_blob();
+    if (blob.has_csum()) {
+      vector<uint64_t> csums;
+      const unsigned count = blob.get_csum_count();
+      for (unsigned idx = 0; idx != count; ++idx) {
+	csums.push_back(blob.get_csum_item(idx));
+      }
+      dout(LogLevelV) << __func__ << "      csum: " << std::hex << csums
+		      << std::dec << dendl;
+    }
+
+    std::lock_guard l(ext.blob->shared_blob->get_cache()->lock);
+    for (auto& [buf_off, buf] : ext.blob->shared_blob->bc.buffer_map) {
+      dout(LogLevelV) << __func__ << "       0x" << std::hex << buf_off
+		      << "~" << buf->length << std::dec
+		      << " " << *buf << dendl;
+    }
+  }
+}
+
+// Log, at debug level LogLevelV, a summary of onode o followed by its zone
+// references, attribute names/lengths and its extent map.  Does nothing when
+// the bluestore subsystem would not gather output at that level.
+template <int LogLevelV>
+void _dump_onode(CephContext *cct, const BlueStore::Onode& o)
+{
+  if (!cct->_conf->subsys.should_gather<ceph_subsys_bluestore, LogLevelV>())
+    return;
+
+  const auto& on = o.onode;
+  dout(LogLevelV) << __func__ << " " << &o << " " << o.oid
+		  << " nid " << on.nid
+		  << " size 0x" << std::hex << on.size
+		  << " (" << std::dec << on.size << ")"
+		  << " expected_object_size " << on.expected_object_size
+		  << " expected_write_size " << on.expected_write_size
+		  << " in " << on.extent_map_shards.size() << " shards"
+		  << ", " << o.extent_map.spanning_blob_map.size()
+		  << " spanning blobs"
+		  << dendl;
+
+  for (auto& [zone, offset] : on.zone_offset_refs) {
+    dout(LogLevelV) << __func__ << " zone ref 0x" << std::hex << zone
+		    << " offset 0x" << offset << std::dec << dendl;
+  }
+
+  for (auto& [attr_name, attr_value] : on.attrs) {
+    dout(LogLevelV) << __func__ << "  attr " << attr_name
+		    << " len " << attr_value.length() << dendl;
+  }
+
+  _dump_extent_map<LogLevelV>(cct, o.extent_map);
+}
+
+// Log, at debug level LogLevelV, a JSON dump of transaction t.
+//
+// NOTE: the first dout statement is intentionally left open (no dendl); the
+// formatter output is then written to *_dout, a name that comes from the
+// dout macro, and the entry is closed by the final dendl.  Keep these
+// statements together and in this order.
+template <int LogLevelV>
+void _dump_transaction(CephContext *cct, ObjectStore::Transaction *t)
+{
+  dout(LogLevelV) << __func__ << " transaction dump:\n";
+  // `true` is passed to the JSONFormatter constructor (presumably enabling
+  // pretty-printing -- confirm against JSONFormatter)
+  JSONFormatter f(true);
+  f.open_object_section("transaction");
+  t->dump(&f);
+  f.close_section();
+  f.flush(*_dout);
+  *_dout << dendl;
+}
+
+// Buffer
+
+// Print a Buffer as
+//   buffer(<addr> space <space> 0x<offset>~<length> <state>[ <flags>])
+// with offset and length in hex; the flag name is only shown when any flag
+// is set.
+ostream& operator<<(ostream& out, const BlueStore::Buffer& b)
+{
+  out << "buffer(" << &b << " space " << b.space;
+  out << " 0x" << std::hex << b.offset << "~" << b.length << std::dec;
+  out << " " << BlueStore::Buffer::get_state_name(b.state);
+  if (b.flags) {
+    out << " " << BlueStore::Buffer::get_flag_name(b.flags);
+  }
+  out << ")";
+  return out;
+}
+
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see a comment for append_escaped)
+ * the KeyValueDB iterator does not lexicographically sort the same
+ * way that ghobject_t does: objects with the same hash may have wrong order.
+ *
+ * This is the iterator wrapper that fixes the keys order.
+ */
+
+// Abstract interface for walking the objects of a collection on top of a
+// KeyValueDB iterator.  Implementations decide how positions are ordered and
+// compared; is_ge()/is_lt() are conveniences built on cmp().
+class CollectionListIterator {
+public:
+  CollectionListIterator(const KeyValueDB::Iterator &it)
+    : m_it(it) {}
+  virtual ~CollectionListIterator() = default;
+
+  // true while the iterator points at an object
+  virtual bool valid() const = 0;
+  // object at the current position
+  virtual const ghobject_t &oid() const = 0;
+  // positioning
+  virtual void lower_bound(const ghobject_t &oid) = 0;
+  virtual void upper_bound(const ghobject_t &oid) = 0;
+  virtual void next() = 0;
+
+  // <0, 0, >0 when the current position sorts before, at, or after oid
+  virtual int cmp(const ghobject_t &oid) const = 0;
+
+  bool is_ge(const ghobject_t &oid) const {
+    const int order = cmp(oid);
+    return order >= 0;
+  }
+
+  bool is_lt(const ghobject_t &oid) const {
+    const int order = cmp(oid);
+    return order < 0;
+  }
+
+protected:
+  KeyValueDB::Iterator m_it;
+};
+
+// CollectionListIterator that positions and compares purely on encoded
+// object keys, i.e. it follows the order of the underlying KeyValueDB
+// iterator.  Extent shard keys are skipped transparently.
+class SimpleCollectionListIterator : public CollectionListIterator {
+public:
+  SimpleCollectionListIterator(CephContext *cct, const KeyValueDB::Iterator &it)
+    : CollectionListIterator(it), m_cct(cct) {
+  }
+
+  bool valid() const override {
+    return m_it->valid();
+  }
+
+  const ghobject_t &oid() const override {
+    ceph_assert(valid());
+    return m_oid;
+  }
+
+  void lower_bound(const ghobject_t &oid) override {
+    const string key = encode_key(oid);
+    m_it->lower_bound(key);
+    get_oid();
+  }
+
+  void upper_bound(const ghobject_t &oid) override {
+    const string key = encode_key(oid);
+    m_it->upper_bound(key);
+    get_oid();
+  }
+
+  void next() override {
+    ceph_assert(valid());
+    m_it->next();
+    get_oid();
+  }
+
+  int cmp(const ghobject_t &oid) const override {
+    ceph_assert(valid());
+    const string key = encode_key(oid);
+    return m_it->key().compare(key);
+  }
+
+private:
+  CephContext *m_cct;
+  ghobject_t m_oid;   // object decoded from the current key
+
+  // encoded onode key for oid
+  string encode_key(const ghobject_t &oid) const {
+    string key;
+    get_object_key(m_cct, oid, &key);
+    return key;
+  }
+
+  // skip extent shard keys, then decode the current key into m_oid
+  // (m_oid is reset to a default object when the iterator runs out)
+  void get_oid() {
+    m_oid = ghobject_t();
+    for (; m_it->valid() && is_extent_shard_key(m_it->key()); m_it->next()) {
+    }
+    if (valid()) {
+      int r = get_key_object(m_it->key(), &m_oid);
+      ceph_assert(r == 0);
+    }
+  }
+};
+
+// CollectionListIterator that returns objects in ghobject_t order.
+//
+// Keys are read from the KeyValueDB iterator one "chunk" at a time, where a
+// chunk is the run of consecutive onode keys sharing the same shard, pool and
+// bitwise hash.  Each chunk is loaded into a std::map keyed by ghobject_t, so
+// iterating the map yields the objects of that chunk in ghobject_t order
+// regardless of how their encoded keys sort.
+class SortedCollectionListIterator : public CollectionListIterator {
+public:
+  SortedCollectionListIterator(const KeyValueDB::Iterator &it)
+    : CollectionListIterator(it), m_chunk_iter(m_chunk.end()) {
+  }
+
+  // valid as long as we point into the current chunk
+  bool valid() const override {
+    return m_chunk_iter != m_chunk.end();
+  }
+
+  const ghobject_t &oid() const override {
+    ceph_assert(valid());
+
+    return m_chunk_iter->first;
+  }
+
+  void lower_bound(const ghobject_t &oid) override {
+    // seek by the fixed prefix (shard, pool, hash) only; ordering inside
+    // the chunk is resolved in memory below
+    std::string key;
+    _key_encode_prefix(oid, &key);
+
+    m_it->lower_bound(key);
+    m_chunk_iter = m_chunk.end();
+    if (!get_next_chunk()) {
+      return;
+    }
+
+    // the loaded chunk has a different prefix than oid: its first object is
+    // the result
+    if (this->oid().shard_id != oid.shard_id ||
+        this->oid().hobj.pool != oid.hobj.pool ||
+        this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+      return;
+    }
+
+    // same prefix: search inside the chunk, falling through to the next
+    // chunk if every object in this one sorts before oid
+    m_chunk_iter = m_chunk.lower_bound(oid);
+    if (m_chunk_iter == m_chunk.end()) {
+      get_next_chunk();
+    }
+  }
+
+  void upper_bound(const ghobject_t &oid) override {
+    lower_bound(oid);
+
+    // step over an exact match
+    if (valid() && this->oid() == oid) {
+      next();
+    }
+  }
+
+  void next() override {
+    ceph_assert(valid());
+
+    m_chunk_iter++;
+    if (m_chunk_iter == m_chunk.end()) {
+      get_next_chunk();
+    }
+  }
+
+  // three-way comparison using ghobject_t ordering (not key ordering)
+  int cmp(const ghobject_t &oid) const override {
+    ceph_assert(valid());
+
+    if (this->oid() < oid) {
+      return -1;
+    }
+    if (this->oid() > oid) {
+      return 1;
+    }
+    return 0;
+  }
+
+private:
+  // objects of the current chunk, mapped to their encoded keys
+  std::map<ghobject_t, std::string> m_chunk;
+  // current position inside m_chunk; m_chunk.end() means "not valid"
+  std::map<ghobject_t, std::string>::iterator m_chunk_iter;
+
+  // Load the next chunk starting at the current m_it position.  Returns
+  // false (leaving m_chunk and m_chunk_iter untouched) when m_it has no more
+  // onode keys; otherwise replaces m_chunk, points m_chunk_iter at its first
+  // object and leaves m_it on the first key of the following chunk (or
+  // invalid).
+  bool get_next_chunk() {
+    // skip extent shard keys
+    while (m_it->valid() && is_extent_shard_key(m_it->key())) {
+      m_it->next();
+    }
+
+    if (!m_it->valid()) {
+      return false;
+    }
+
+    ghobject_t oid;
+    int r = get_key_object(m_it->key(), &oid);
+    ceph_assert(r == 0);
+
+    m_chunk.clear();
+    while (true) {
+      m_chunk.insert({oid, m_it->key()});
+
+      // advance to the next onode key
+      do {
+        m_it->next();
+      } while (m_it->valid() && is_extent_shard_key(m_it->key()));
+
+      if (!m_it->valid()) {
+        break;
+      }
+
+      ghobject_t next;
+      r = get_key_object(m_it->key(), &next);
+      ceph_assert(r == 0);
+      // a change in shard, pool or hash ends the chunk
+      if (next.shard_id != oid.shard_id ||
+          next.hobj.pool != oid.hobj.pool ||
+          next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+        break;
+      }
+      oid = next;
+    }
+
+    m_chunk_iter = m_chunk.begin();
+    return true;
+  }
+};
+
+} // anonymous namespace
+
+// Garbage Collector
+
+// Examine the extents of extent_map that fall into
+// [p2align(start_offset), round_up_to(end_offset)) (both aligned to
+// min_alloc_size) and work out which compressed blobs are worth collecting.
+//
+// [start_touch_offset, end_touch_offset) is the range covered by the current
+// write; it must lie within [start_offset, end_offset] (asserted).
+//
+// First pass: for every extent outside the touched range that belongs to a
+// compressed blob, record it in affected_blobs, reduce the blob's
+// referenced-bytes count by the extent length and add the allocation units
+// the extent's data would need once rewritten.  Uncompressed extents only
+// update the "last used allocation unit" tracking so that an allocation
+// unit shared with a neighbour is not counted twice.
+//
+// Second pass: every affected blob whose referenced bytes dropped to zero
+// and whose benefit (allocation units released minus units expected to be
+// allocated) reaches bluestore_gc_enable_blob_threshold gets its extents
+// added to extents_to_collect, and the totals are accumulated into
+// expected_for_release / expected_allocations.
+void BlueStore::GarbageCollector::process_protrusive_extents(
+  const BlueStore::ExtentMap& extent_map, 
+  uint64_t start_offset,
+  uint64_t end_offset,
+  uint64_t start_touch_offset,
+  uint64_t end_touch_offset,
+  uint64_t min_alloc_size)
+{
+  ceph_assert(start_offset <= start_touch_offset && end_offset>= end_touch_offset);
+
+  // widen the lookup range to allocation unit boundaries
+  uint64_t lookup_start_offset = p2align(start_offset, min_alloc_size);
+  uint64_t lookup_end_offset = round_up_to(end_offset, min_alloc_size);
+
+  dout(30) << __func__ << " (hex): [" << std::hex
+           << lookup_start_offset << ", " << lookup_end_offset 
+           << ")" << std::dec << dendl;
+
+  for (auto it = extent_map.seek_lextent(lookup_start_offset);
+       it != extent_map.extent_map.end() &&
+         it->logical_offset < lookup_end_offset;
+       ++it) {
+    // first and last allocation unit (by index) touched by this extent
+    uint64_t alloc_unit_start = it->logical_offset / min_alloc_size;
+    uint64_t alloc_unit_end = (it->logical_end() - 1) / min_alloc_size;
+
+    dout(30) << __func__ << " " << *it
+             << "alloc_units: " << alloc_unit_start << ".." << alloc_unit_end
+             << dendl;
+
+    Blob* b = it->blob.get();
+
+    if (it->logical_offset >=start_touch_offset &&
+        it->logical_end() <= end_touch_offset) {
+      // Process extents within the range affected by 
+      // the current write request.
+      // Need to take into account if existing extents
+      // can be merged with them (uncompressed case)
+      if (!b->get_blob().is_compressed()) {
+        if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+	  --blob_info_counted->expected_allocations; // don't need to allocate
+                                                     // new AU for compressed
+                                                     // data since another
+                                                     // collocated uncompressed
+                                                     // blob already exists
+          dout(30) << __func__  << " --expected:"
+                   << alloc_unit_start << dendl;
+        }
+        used_alloc_unit = alloc_unit_end;
+        blob_info_counted =  nullptr;
+      }
+    } else if (b->get_blob().is_compressed()) {
+
+      // additionally we take compressed blobs that were not impacted
+      // by the write into account too
+      // (emplace leaves an already existing entry for this blob untouched)
+      BlobInfo& bi =
+        affected_blobs.emplace(
+          b, BlobInfo(b->get_referenced_bytes())).first->second;
+
+      // the first allocation unit is not counted again when the previous
+      // extent already ended in it
+      int adjust =
+       (used_alloc_unit && used_alloc_unit == alloc_unit_start) ? 0 : 1;
+      bi.expected_allocations += alloc_unit_end - alloc_unit_start + adjust;
+      dout(30) << __func__  << " expected_allocations=" 
+               << bi.expected_allocations << " end_au:"
+               << alloc_unit_end << dendl;
+
+      blob_info_counted =  &bi;
+      used_alloc_unit = alloc_unit_end;
+
+      ceph_assert(it->length <= bi.referenced_bytes);
+       bi.referenced_bytes -= it->length;
+      dout(30) << __func__ << " affected_blob:" << *b
+               << " unref 0x" << std::hex << it->length
+               << " referenced = 0x" << bi.referenced_bytes
+               << std::dec << dendl;
+      // NOTE: we can't move specific blob to resulting GC list here
+      // when reference counter == 0 since subsequent extents might
+      // decrement its expected_allocation. 
+      // Hence need to enumerate all the extents first.
+      if (!bi.collect_candidate) {
+        bi.first_lextent = it;
+        bi.collect_candidate = true;
+      }
+      bi.last_lextent = it;
+    } else {
+      // uncompressed extent outside the touched range
+      if (blob_info_counted && used_alloc_unit == alloc_unit_start) {
+        // don't need to allocate new AU for compressed data since another
+        // collocated uncompressed blob already exists
+    	--blob_info_counted->expected_allocations;
+        dout(30) << __func__  << " --expected_allocations:"
+		 << alloc_unit_start << dendl;
+      }
+      used_alloc_unit = alloc_unit_end;
+      blob_info_counted = nullptr;
+    }
+  }
+
+  // second pass: decide which fully dereferenced blobs are worth collecting
+  for (auto b_it = affected_blobs.begin();
+       b_it != affected_blobs.end();
+       ++b_it) {
+    Blob* b = b_it->first;
+    BlobInfo& bi = b_it->second;
+    if (bi.referenced_bytes == 0) {
+      // allocation units freed when the blob goes away
+      uint64_t len_on_disk = b_it->first->get_blob().get_ondisk_length();
+      int64_t blob_expected_for_release =
+        round_up_to(len_on_disk, min_alloc_size) / min_alloc_size;
+
+      dout(30) << __func__ << " " << *(b_it->first)
+               << " expected4release=" << blob_expected_for_release
+               << " expected_allocations=" << bi.expected_allocations
+               << dendl;
+      int64_t benefit = blob_expected_for_release - bi.expected_allocations;
+      if (benefit >= g_conf()->bluestore_gc_enable_blob_threshold) {
+        if (bi.collect_candidate) {
+          // collect every extent of this blob between its first and last
+          // recorded lextent (inclusive)
+          auto it = bi.first_lextent;
+          bool bExit = false;
+          do {
+            if (it->blob.get() == b) {
+              extents_to_collect.insert(it->logical_offset, it->length);
+            }
+            bExit = it == bi.last_lextent;
+            ++it;
+          } while (!bExit);
+        }
+        expected_for_release += blob_expected_for_release;
+        expected_allocations += bi.expected_allocations;
+      }
+    }
+  }
+}
+
+// Estimate the benefit of garbage collecting around a write of `length`
+// bytes at `start_offset`.
+//
+// old_extents are the extents replaced by the write.  Every compressed blob
+// among them widens the GC range to the blob's full span and, if it is still
+// referenced, is recorded in affected_blobs.  When the resulting range
+// sticks out of the written range on either side, the surrounding extents
+// are analysed by process_protrusive_extents().
+//
+// Returns expected_for_release - expected_allocations.  Note that these two
+// counters are not reset here, unlike the other per-call state.
+int64_t BlueStore::GarbageCollector::estimate(
+  uint64_t start_offset,
+  uint64_t length,
+  const BlueStore::ExtentMap& extent_map,
+  const BlueStore::old_extent_map_t& old_extents,
+  uint64_t min_alloc_size)
+{
+  // reset per-call state
+  affected_blobs.clear();
+  extents_to_collect.clear();
+  used_alloc_unit = boost::optional<uint64_t >();
+  blob_info_counted = nullptr;
+
+  const uint64_t end_offset = start_offset + length;
+  uint64_t gc_start_offset = start_offset;
+  uint64_t gc_end_offset = end_offset;
+
+  for (const auto& old : old_extents) {
+    Blob* b = old.e.blob.get();
+    if (!b->get_blob().is_compressed()) {
+      continue;
+    }
+
+    // update gc_start_offset/gc_end_offset if needed
+    gc_start_offset = std::min(gc_start_offset, (uint64_t)old.e.blob_start());
+    gc_end_offset = std::max(gc_end_offset, (uint64_t)old.e.blob_end());
+
+    // micro optimization to bypass blobs that have no more references
+    const uint64_t ref_bytes = b->get_referenced_bytes();
+    if (ref_bytes == 0) {
+      continue;
+    }
+    dout(30) << __func__ << " affected_blob:" << *b
+             << " unref 0x" << std::hex << old.e.logical_offset
+             << "~" << old.e.length
+             << std::dec << dendl;
+    affected_blobs.emplace(b, BlobInfo(ref_bytes));
+  }
+  dout(30) << __func__ << " gc range(hex): [" << std::hex
+           << gc_start_offset << ", " << gc_end_offset
+           << ")" << std::dec << dendl;
+
+  // enumerate the extents around the write to check whether they reference
+  // affected blobs
+  const bool protrudes =
+    gc_start_offset < start_offset || gc_end_offset > end_offset;
+  if (protrudes) {
+    process_protrusive_extents(extent_map,
+                               gc_start_offset,
+                               gc_end_offset,
+                               start_offset,
+                               end_offset,
+                               min_alloc_size);
+  }
+  return expected_for_release - expected_allocations;
+}
+
+// LruOnodeCacheShard
+/// Onode cache shard backed by a single intrusive LRU list.
+///
+/// `num` counts every cached onode, pinned or not.  Only onodes linked
+/// through Onode::lru_item sit on `lru`, and only those can be trimmed.
+struct LruOnodeCacheShard : public BlueStore::OnodeCacheShard {
+  typedef boost::intrusive::list<
+    BlueStore::Onode,
+    boost::intrusive::member_hook<
+      BlueStore::Onode,
+      boost::intrusive::list_member_hook<>,
+      &BlueStore::Onode::lru_item> > list_t;
+
+  list_t lru;  ///< trimmable onodes; touched entries go to the front
+
+  explicit LruOnodeCacheShard(CephContext *cct) : BlueStore::OnodeCacheShard(cct) {}
+
+  /// Account a new onode.  It is linked on the LRU only when
+  /// pin_nref == 1; @p level > 0 selects the front, otherwise the back.
+  void _add(BlueStore::Onode* o, int level) override
+  {
+    o->set_cached();
+    if (o->pin_nref == 1) {
+      (level > 0) ? lru.push_front(*o) : lru.push_back(*o);
+      o->cache_age_bin = age_bins.front();
+      *(o->cache_age_bin) += 1;
+    }
+    ++num; // we count both pinned and unpinned entries
+    dout(20) << __func__ << " " << this << " " << o->oid << " added, num="
+             << num << dendl;
+  }
+
+  /// Drop an onode from this shard's accounting, unlinking it from the LRU
+  /// if it is currently linked.
+  void _rm(BlueStore::Onode* o) override
+  {
+    o->clear_cached();
+    if (o->lru_item.is_linked()) {
+      *(o->cache_age_bin) -= 1;
+      lru.erase(lru.iterator_to(*o));
+    }
+    ceph_assert(num);
+    --num;
+    dout(20) << __func__ << " " << this << " " << " " << o->oid << " removed, num=" << num << dendl;
+  }
+
+  /// Re-evaluate an onode whose pin count may have dropped: link it on the
+  /// LRU, refresh its LRU position, or drop it when it no longer exists.
+  ///
+  /// Takes the shard lock itself; the caller must not hold it.
+  void maybe_unpin(BlueStore::Onode* o) override
+  {
+    lock.lock();
+    // While we were waiting for the lock, split_cache may have moved the
+    // onode's collection to a different shard.  Everything below edits this
+    // shard's lru/age_bins/num, so it may only run on the shard that
+    // currently owns the onode.  Otherwise hand over to the owner, which
+    // repeats the same check under its own lock.
+    OnodeCacheShard* owner = o->c->get_onode_cache();
+    if (owner != this) {
+      lock.unlock();
+      owner->maybe_unpin(o);
+      return;
+    }
+    if (o->is_cached() && o->pin_nref == 1) {
+      if (!o->lru_item.is_linked()) {
+        if (o->exists) {
+          lru.push_front(*o);
+          o->cache_age_bin = age_bins.front();
+          *(o->cache_age_bin) += 1;
+          dout(20) << __func__ << " " << this << " " << o->oid << " unpinned"
+                   << dendl;
+        } else {
+          ceph_assert(num);
+          --num;
+          o->clear_cached();
+          dout(20) << __func__ << " " << this << " " << o->oid << " removed"
+                   << dendl;
+          // remove will also decrement nref
+          o->c->onode_space._remove(o->oid);
+        }
+      } else if (o->exists) {
+        // move onode within LRU
+        lru.erase(lru.iterator_to(*o));
+        lru.push_front(*o);
+        if (o->cache_age_bin != age_bins.front()) {
+          *(o->cache_age_bin) -= 1;
+          o->cache_age_bin = age_bins.front();
+          *(o->cache_age_bin) += 1;
+        }
+        dout(20) << __func__ << " " << this << " " << o->oid << " touched"
+                 << dendl;
+      }
+    }
+    lock.unlock();
+  }
+
+  /// Evict from the back of the LRU, aiming at @p new_size cached onodes.
+  void _trim_to(uint64_t new_size) override
+  {
+    if (new_size >= lru.size()) {
+      return; // don't even try
+    }
+    uint64_t n = num - new_size; // note: we might get empty LRU
+                                 // before n == 0 due to pinned
+                                 // entries. And hence being unable
+                                 // to reach new_size target.
+    while (n-- > 0 && lru.size() > 0) {
+      BlueStore::Onode *o = &lru.back();
+      lru.pop_back();
+
+      dout(20) << __func__ << "  rm " << o->oid << " "
+               << o->nref << " " << o->cached << dendl;
+
+      *(o->cache_age_bin) -= 1;
+      if (o->pin_nref > 1) {
+        // still pinned: it only leaves the LRU list and stays cached
+        dout(20) << __func__ << " " << this << " " << " " << " " << o->oid << dendl;
+      } else {
+        ceph_assert(num);
+        --num;
+        o->clear_cached();
+        o->c->onode_space._remove(o->oid);
+      }
+    }
+  }
+
+  /// Move an onode's accounting from this shard to @p to.
+  void _move_pinned(OnodeCacheShard *to, BlueStore::Onode *o) override
+  {
+    if (to == this) {
+      return;
+    }
+    _rm(o);
+    ceph_assert(o->nref > 1);
+    to->_add(o, 0);
+  }
+
+  /// Add this shard's totals to the counters; pinned = cached but not on
+  /// the LRU list.
+  void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) override
+  {
+    std::lock_guard l(lock);
+    *onodes += num;
+    *pinned_onodes += num - lru.size();
+  }
+};
+
+// OnodeCacheShard
+/// Factory for onode cache shards.
+///
+/// @param cct    context handed to the shard's constructor
+/// @param type   currently ignored: an LRU shard is always created
+/// @param logger perf counters stored on the new shard
+/// @return a newly allocated shard
+BlueStore::OnodeCacheShard *BlueStore::OnodeCacheShard::create(
+    CephContext* cct,
+    string type,
+    PerfCounters *logger)
+{
+  // LRU is the only onode cache policy implemented so far.
+  BlueStore::OnodeCacheShard *shard = new LruOnodeCacheShard(cct);
+  shard->logger = logger;
+  return shard;
+}
+
+// LruBufferCacheShard
+/// Buffer cache shard that keeps all of its buffers on one intrusive LRU
+/// list.  `buffer_bytes` and the per-buffer age bin are kept in step with
+/// the list by _add/_rm/_adjust_size.
+struct LruBufferCacheShard : public BlueStore::BufferCacheShard {
+  typedef boost::intrusive::list<
+    BlueStore::Buffer,
+    boost::intrusive::member_hook<
+      BlueStore::Buffer,
+      boost::intrusive::list_member_hook<>,
+      &BlueStore::Buffer::lru_item> > list_t;
+  list_t lru;  ///< touched buffers go to the front, trimming takes the back
+
+  explicit LruBufferCacheShard(CephContext *cct) : BlueStore::BufferCacheShard(cct) {}
+
+  /// Link @p b: right before @p near when given, else at the front
+  /// (level > 0) or the back, and account its length.
+  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override {
+    if (near) {
+      auto q = lru.iterator_to(*near);
+      lru.insert(q, *b);
+    } else if (level > 0) {
+      lru.push_front(*b);
+    } else {
+      lru.push_back(*b);
+    }
+    buffer_bytes += b->length;
+    b->cache_age_bin = age_bins.front();
+    *(b->cache_age_bin) += b->length;
+    num = lru.size();
+  }
+
+  /// Unlink @p b and remove its length from buffer_bytes and its age bin.
+  void _rm(BlueStore::Buffer *b) override {
+    ceph_assert(buffer_bytes >= b->length);
+    buffer_bytes -= b->length;
+    assert(*(b->cache_age_bin) >= b->length);
+    *(b->cache_age_bin) -= b->length;
+    auto q = lru.iterator_to(*b);
+    lru.erase(q);
+    num = lru.size();
+  }
+
+  /// Take @p b over from shard @p src; it lands at the back of this LRU.
+  void _move(BlueStore::BufferCacheShard *src, BlueStore::Buffer *b) override {
+    src->_rm(b);
+    _add(b, 0, nullptr);
+  }
+
+  /// Apply a size change of @p delta bytes for @p b to the byte counters.
+  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override {
+    ceph_assert((int64_t)buffer_bytes + delta >= 0);
+    buffer_bytes += delta;
+    assert(*(b->cache_age_bin) + delta >= 0);
+    *(b->cache_age_bin) += delta;
+  }
+
+  /// Move @p b to the front of the LRU and into the newest age bin.
+  void _touch(BlueStore::Buffer *b) override {
+    auto p = lru.iterator_to(*b);
+    lru.erase(p);
+    lru.push_front(*b);
+    *(b->cache_age_bin) -= b->length;
+    b->cache_age_bin = age_bins.front();
+    *(b->cache_age_bin) += b->length;
+    num = lru.size();
+    _audit("_touch_buffer end");
+  }
+
+  /// Evict from the back of the LRU until buffer_bytes <= @p max or the
+  /// list is empty.
+  void _trim_to(uint64_t max) override
+  {
+    while (buffer_bytes > max) {
+      auto i = lru.rbegin();
+      if (i == lru.rend()) {
+        // stop if lru is now empty
+        break;
+      }
+
+      BlueStore::Buffer *b = &*i;
+      ceph_assert(b->is_clean());
+      dout(20) << __func__ << " rm " << *b << dendl;
+      // No accounting here.  This loop relies on _rm_buffer() ending up in
+      // _rm() for the buffer (nothing else makes buffer_bytes shrink), and
+      // _rm() already subtracts b->length from both buffer_bytes and the
+      // buffer's age bin.  Subtracting from the age bin here as well would
+      // count the eviction twice.
+      b->space->_rm_buffer(this, b);
+    }
+    num = lru.size();
+  }
+
+  /// Add this shard's counters to the given totals.
+  void add_stats(uint64_t *extents,
+                 uint64_t *blobs,
+                 uint64_t *buffers,
+                 uint64_t *bytes) override {
+    *extents += num_extents;
+    *blobs += num_blobs;
+    *buffers += num;
+    *bytes += buffer_bytes;
+  }
+#ifdef DEBUG_CACHE
+  /// Debug-only check that buffer_bytes equals the sum of the lengths of
+  /// all buffers on the LRU; @p when labels the call site in the log.
+  void _audit(const char *when) override
+  {
+    dout(10) << __func__ << " " << when << " start" << dendl;
+    uint64_t s = 0;
+    for (auto i = lru.begin(); i != lru.end(); ++i) {
+      s += i->length;
+    }
+    if (s != buffer_bytes) {
+      derr << __func__ << " buffer_size " << buffer_bytes << " actual " << s
+           << dendl;
+      for (auto i = lru.begin(); i != lru.end(); ++i) {
+        derr << __func__ << " " << *i << dendl;
+      }
+      ceph_assert(s == buffer_bytes);
+    }
+    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+             << " ok" << dendl;
+  }
+#endif
+};
+
+// TwoQBufferCacheShard
+
+/// Buffer cache shard implementing a 2Q-style policy with three lists:
+/// `hot`, `warm_in` and `warm_out`.  Buffer::cache_private records which
+/// list a buffer is on; buffers on `warm_out` are empty (their data has
+/// been dropped) and do not count towards `num` or `buffer_bytes`.
+struct TwoQBufferCacheShard : public BlueStore::BufferCacheShard {
+  typedef boost::intrusive::list<
+    BlueStore::Buffer,
+    boost::intrusive::member_hook<
+      BlueStore::Buffer,
+      boost::intrusive::list_member_hook<>,
+      &BlueStore::Buffer::lru_item> > list_t;
+  list_t hot;      ///< "Am" hot buffers
+  list_t warm_in;  ///< "A1in" newly warm buffers
+  list_t warm_out; ///< "A1out" empty buffers we've evicted
+
+  enum {
+    BUFFER_NEW = 0,
+    BUFFER_WARM_IN,   ///< in warm_in
+    BUFFER_WARM_OUT,  ///< in warm_out
+    BUFFER_HOT,       ///< in hot
+    BUFFER_TYPE_MAX
+  };
+
+  uint64_t list_bytes[BUFFER_TYPE_MAX] = {0}; ///< bytes per type
+
+public:
+  explicit TwoQBufferCacheShard(CephContext *cct) : BufferCacheShard(cct) {}
+
+  /// Link @p b on the appropriate list and account its bytes.
+  ///
+  /// With @p near, @p b joins near's list right before it.  A brand new
+  /// buffer goes to warm_in (front for level > 0, else back).  Otherwise
+  /// cache_private carries a hint from an earlier discard: warm_in stays
+  /// warm_in, warm_out is promoted to hot, hot stays hot.
+  void _add(BlueStore::Buffer *b, int level, BlueStore::Buffer *near) override
+  {
+    dout(20) << __func__ << " level " << level << " near " << near
+             << " on " << *b
+             << " which has cache_private " << b->cache_private << dendl;
+    if (near) {
+      b->cache_private = near->cache_private;
+      switch (b->cache_private) {
+      case BUFFER_WARM_IN:
+        warm_in.insert(warm_in.iterator_to(*near), *b);
+        break;
+      case BUFFER_WARM_OUT:
+        ceph_assert(b->is_empty());
+        warm_out.insert(warm_out.iterator_to(*near), *b);
+        break;
+      case BUFFER_HOT:
+        hot.insert(hot.iterator_to(*near), *b);
+        break;
+      default:
+        ceph_abort_msg("bad cache_private");
+      }
+    } else if (b->cache_private == BUFFER_NEW) {
+      b->cache_private = BUFFER_WARM_IN;
+      if (level > 0) {
+        warm_in.push_front(*b);
+      } else {
+        // take caller hint to start at the back of the warm queue
+        warm_in.push_back(*b);
+      }
+    } else {
+      // we got a hint from discard
+      switch (b->cache_private) {
+      case BUFFER_WARM_IN:
+        // stay in warm_in.  move to front, even though 2Q doesn't actually
+        // do this.
+        dout(20) << __func__ << " move to front of warm " << *b << dendl;
+        warm_in.push_front(*b);
+        break;
+      case BUFFER_WARM_OUT:
+        b->cache_private = BUFFER_HOT;
+        // promoted: continue with the hot handling below
+        [[fallthrough]];
+      case BUFFER_HOT:
+        dout(20) << __func__ << " move to front of hot " << *b << dendl;
+        hot.push_front(*b);
+        break;
+      default:
+        ceph_abort_msg("bad cache_private");
+      }
+    }
+    b->cache_age_bin = age_bins.front();
+    if (!b->is_empty()) {
+      buffer_bytes += b->length;
+      list_bytes[b->cache_private] += b->length;
+      *(b->cache_age_bin) += b->length;
+    }
+    num = hot.size() + warm_in.size();
+  }
+
+  /// Unlink @p b from the list named by cache_private and, unless it is
+  /// empty, remove its bytes from all counters.
+  void _rm(BlueStore::Buffer *b) override
+  {
+    dout(20) << __func__ << " " << *b << dendl;
+    if (!b->is_empty()) {
+      ceph_assert(buffer_bytes >= b->length);
+      buffer_bytes -= b->length;
+      ceph_assert(list_bytes[b->cache_private] >= b->length);
+      list_bytes[b->cache_private] -= b->length;
+      assert(*(b->cache_age_bin) >= b->length);
+      *(b->cache_age_bin) -= b->length;
+    }
+    switch (b->cache_private) {
+    case BUFFER_WARM_IN:
+      warm_in.erase(warm_in.iterator_to(*b));
+      break;
+    case BUFFER_WARM_OUT:
+      warm_out.erase(warm_out.iterator_to(*b));
+      break;
+    case BUFFER_HOT:
+      hot.erase(hot.iterator_to(*b));
+      break;
+    default:
+      ceph_abort_msg("bad cache_private");
+    }
+    num = hot.size() + warm_in.size();
+  }
+
+  /// Take @p b over from shard @p srcc (which is treated as a
+  /// TwoQBufferCacheShard), keeping it on the same kind of list.
+  void _move(BlueStore::BufferCacheShard *srcc, BlueStore::Buffer *b) override
+  {
+    TwoQBufferCacheShard *src = static_cast<TwoQBufferCacheShard*>(srcc);
+    src->_rm(b);
+
+    // preserve which list we're on (even if we can't preserve the order!)
+    switch (b->cache_private) {
+    case BUFFER_WARM_IN:
+      ceph_assert(!b->is_empty());
+      warm_in.push_back(*b);
+      break;
+    case BUFFER_WARM_OUT:
+      ceph_assert(b->is_empty());
+      warm_out.push_back(*b);
+      break;
+    case BUFFER_HOT:
+      ceph_assert(!b->is_empty());
+      hot.push_back(*b);
+      break;
+    default:
+      ceph_abort_msg("bad cache_private");
+    }
+    if (!b->is_empty()) {
+      buffer_bytes += b->length;
+      list_bytes[b->cache_private] += b->length;
+      *(b->cache_age_bin) += b->length;
+    }
+    num = hot.size() + warm_in.size();
+  }
+
+  /// Apply a size change of @p delta bytes for a non-empty @p b.
+  void _adjust_size(BlueStore::Buffer *b, int64_t delta) override
+  {
+    dout(20) << __func__ << " delta " << delta << " on " << *b << dendl;
+    if (!b->is_empty()) {
+      ceph_assert((int64_t)buffer_bytes + delta >= 0);
+      buffer_bytes += delta;
+      ceph_assert((int64_t)list_bytes[b->cache_private] + delta >= 0);
+      list_bytes[b->cache_private] += delta;
+      assert(*(b->cache_age_bin) + delta >= 0);
+      *(b->cache_age_bin) += delta;
+    }
+  }
+
+  /// Register a hit on @p b: hot buffers move to the front of `hot`,
+  /// warm_in buffers keep their position.  The buffer is re-binned into
+  /// the newest age bin either way.
+  void _touch(BlueStore::Buffer *b) override {
+    switch (b->cache_private) {
+    case BUFFER_WARM_IN:
+      // do nothing (somewhat counter-intuitively!)
+      break;
+    case BUFFER_WARM_OUT:
+      // move from warm_out to hot LRU
+      ceph_abort_msg("this happens via discard hint");
+      break;
+    case BUFFER_HOT:
+      // move to front of hot LRU
+      hot.erase(hot.iterator_to(*b));
+      hot.push_front(*b);
+      break;
+    }
+    *(b->cache_age_bin) -= b->length;
+    b->cache_age_bin = age_bins.front();
+    *(b->cache_age_bin) += b->length;
+    num = hot.size() + warm_in.size();
+    _audit("_touch_buffer end");
+  }
+
+  /// Shrink the cache towards @p max bytes.
+  ///
+  /// The budget is split between warm_in and hot using
+  /// bluestore_2q_cache_kin_ratio, with slack handed to whichever list can
+  /// use it.  Buffers evicted from warm_in lose their data and move to
+  /// warm_out; buffers evicted from hot are removed; warm_out is then cut
+  /// to a length derived from bluestore_2q_cache_kout_ratio.
+  void _trim_to(uint64_t max) override
+  {
+    if (buffer_bytes > max) {
+      uint64_t kin = max * cct->_conf->bluestore_2q_cache_kin_ratio;
+      uint64_t khot = max - kin;
+
+      // pre-calculate kout based on average buffer size too,
+      // which is typical(the warm_in and hot lists may change later)
+      uint64_t kout = 0;
+      uint64_t buffer_num = hot.size() + warm_in.size();
+      if (buffer_num) {
+        uint64_t avg_size = buffer_bytes / buffer_num;
+        ceph_assert(avg_size);
+        uint64_t calculated_num = max / avg_size;
+        kout = calculated_num * cct->_conf->bluestore_2q_cache_kout_ratio;
+      }
+
+      if (list_bytes[BUFFER_HOT] < khot) {
+        // hot is small, give slack to warm_in
+        kin += khot - list_bytes[BUFFER_HOT];
+      } else if (list_bytes[BUFFER_WARM_IN] < kin) {
+        // warm_in is small, give slack to hot
+        khot += kin - list_bytes[BUFFER_WARM_IN];
+      }
+
+      // adjust warm_in list
+      int64_t to_evict_bytes = list_bytes[BUFFER_WARM_IN] - kin;
+      uint64_t evicted = 0;
+
+      while (to_evict_bytes > 0) {
+        auto p = warm_in.rbegin();
+        if (p == warm_in.rend()) {
+          // stop if warm_in list is now empty
+          break;
+        }
+
+        // The buffer only changes lists, so _rm() is not involved and the
+        // byte counters are adjusted by hand.
+        BlueStore::Buffer *b = &*p;
+        ceph_assert(b->is_clean());
+        dout(20) << __func__ << " buffer_warm_in -> out " << *b << dendl;
+        ceph_assert(buffer_bytes >= b->length);
+        buffer_bytes -= b->length;
+        ceph_assert(list_bytes[BUFFER_WARM_IN] >= b->length);
+        list_bytes[BUFFER_WARM_IN] -= b->length;
+        assert(*(b->cache_age_bin) >= b->length);
+        *(b->cache_age_bin) -= b->length;
+        to_evict_bytes -= b->length;
+        evicted += b->length;
+        b->state = BlueStore::Buffer::STATE_EMPTY;
+        b->data.clear();
+        warm_in.erase(warm_in.iterator_to(*b));
+        warm_out.push_front(*b);
+        b->cache_private = BUFFER_WARM_OUT;
+      }
+
+      if (evicted > 0) {
+        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+                 << " from warm_in list, done evicting warm_in buffers"
+                 << dendl;
+      }
+
+      // adjust hot list
+      to_evict_bytes = list_bytes[BUFFER_HOT] - khot;
+      evicted = 0;
+
+      while (to_evict_bytes > 0) {
+        auto p = hot.rbegin();
+        if (p == hot.rend()) {
+          // stop if hot list is now empty
+          break;
+        }
+
+        BlueStore::Buffer *b = &*p;
+        dout(20) << __func__ << " buffer_hot rm " << *b << dendl;
+        ceph_assert(b->is_clean());
+        // adjust evict size before buffer goes invalid
+        to_evict_bytes -= b->length;
+        evicted += b->length;
+        b->space->_rm_buffer(this, b);
+      }
+
+      if (evicted > 0) {
+        dout(20) << __func__ << " evicted " << byte_u_t(evicted)
+                 << " from hot list, done evicting hot buffers"
+                 << dendl;
+      }
+
+      // adjust warm out list too, if necessary
+      int64_t n = warm_out.size() - kout;
+      while (n-- > 0) {
+        BlueStore::Buffer *b = &*warm_out.rbegin();
+        ceph_assert(b->is_empty());
+        dout(20) << __func__ << " buffer_warm_out rm " << *b << dendl;
+        b->space->_rm_buffer(this, b);
+      }
+    }
+    num = hot.size() + warm_in.size();
+  }
+
+  /// Add this shard's counters to the given totals.
+  void add_stats(uint64_t *extents,
+                 uint64_t *blobs,
+                 uint64_t *buffers,
+                 uint64_t *bytes) override {
+    *extents += num_extents;
+    *blobs += num_blobs;
+    *buffers += num;
+    *bytes += buffer_bytes;
+  }
+
+#ifdef DEBUG_CACHE
+  /// Debug-only check that list_bytes[] and buffer_bytes match the lengths
+  /// of the buffers on hot and warm_in; @p when labels the call site.
+  void _audit(const char *when) override
+  {
+    dout(10) << __func__ << " " << when << " start" << dendl;
+    uint64_t s = 0;
+    for (auto i = hot.begin(); i != hot.end(); ++i) {
+      s += i->length;
+    }
+
+    uint64_t hot_bytes = s;
+    if (hot_bytes != list_bytes[BUFFER_HOT]) {
+      derr << __func__ << " hot_list_bytes "
+           << list_bytes[BUFFER_HOT]
+           << " != actual " << hot_bytes
+           << dendl;
+      ceph_assert(hot_bytes == list_bytes[BUFFER_HOT]);
+    }
+
+    for (auto i = warm_in.begin(); i != warm_in.end(); ++i) {
+      s += i->length;
+    }
+
+    uint64_t warm_in_bytes = s - hot_bytes;
+    if (warm_in_bytes != list_bytes[BUFFER_WARM_IN]) {
+      derr << __func__ << " warm_in_list_bytes "
+           << list_bytes[BUFFER_WARM_IN]
+           << " != actual " << warm_in_bytes
+           << dendl;
+      ceph_assert(warm_in_bytes == list_bytes[BUFFER_WARM_IN]);
+    }
+
+    if (s != buffer_bytes) {
+      derr << __func__ << " buffer_bytes " << buffer_bytes << " actual " << s
+           << dendl;
+      ceph_assert(s == buffer_bytes);
+    }
+
+    dout(20) << __func__ << " " << when << " buffer_bytes " << buffer_bytes
+             << " ok" << dendl;
+  }
+#endif
+};
+
+// BufferCacheShard
+
+/// Factory for buffer cache shards.
+///
+/// @param cct    context handed to the shard's constructor
+/// @param type   "lru" or "2q"; anything else aborts
+/// @param logger perf counters stored on the new shard
+/// @return a newly allocated shard of the requested kind
+BlueStore::BufferCacheShard *BlueStore::BufferCacheShard::create(
+    CephContext* cct,
+    string type,
+    PerfCounters *logger)
+{
+  BufferCacheShard *shard = nullptr;
+  if (type == "2q") {
+    shard = new TwoQBufferCacheShard(cct);
+  } else if (type == "lru") {
+    shard = new LruBufferCacheShard(cct);
+  } else {
+    ceph_abort_msg("unrecognized cache type");
+  }
+  shard->logger = logger;
+  return shard;
+}
+
+// BufferSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.BufferSpace(" << this << " in " << cache << ") "
+
+/// Remove every buffer of this space from @p cache and from buffer_map.
+/// The caller must already hold cache->lock.
+void BlueStore::BufferSpace::_clear(BufferCacheShard* cache)
+{
+  ldout(cache->cct, 20) << __func__ << dendl;
+  // always take the current head: _rm_buffer erases the entry it is given
+  for (auto head = buffer_map.begin();
+       head != buffer_map.end();
+       head = buffer_map.begin()) {
+    _rm_buffer(cache, head);
+  }
+}
+
+// Drop cached data overlapping [offset, offset + length).
+//
+// Buffers entirely inside the range are removed; buffers that stick out on
+// one or both sides are truncated and/or replaced by a new buffer holding
+// the part that survives.  The caller must already hold cache->lock.
+//
+// Returns the largest cache_private value seen on any buffer that
+// overlapped the range (0 if there was none).
+int BlueStore::BufferSpace::_discard(BufferCacheShard* cache, uint32_t offset, uint32_t length)
+{
+  // note: we already hold cache->lock
+  ldout(cache->cct, 20) << __func__ << std::hex << " 0x" << offset << "~" << length
+           << std::dec << dendl;
+  int cache_private = 0;
+  cache->_audit("discard start");
+  auto i = _data_lower_bound(offset);
+  uint32_t end = offset + length;
+  while (i != buffer_map.end()) {
+    Buffer *b = i->second.get();
+    if (b->offset >= end) {
+      break;
+    }
+    if (b->cache_private > cache_private) {
+      cache_private = b->cache_private;
+    }
+    if (b->offset < offset) {
+      // buffer starts before the range: its first `front` bytes survive
+      int64_t front = offset - b->offset;
+      if (b->end() > end) {
+	// drop middle (split)
+	// the last `tail` bytes survive as a new buffer starting at `end`
+	uint32_t tail = b->end() - end;
+	if (b->data.length()) {
+	  bufferlist bl;
+	  bl.substr_of(b->data, b->length - tail, tail);
+	  Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+	  nb->maybe_rebuild();
+	  _add_buffer(cache, nb, 0, b);
+	} else {
+	  _add_buffer(cache, new Buffer(this, b->state, b->seq, end, tail,
+                                        b->flags),
+	              0, b);
+	}
+	// size accounting is only adjusted for buffers that are not writing
+	if (!b->is_writing()) {
+	  cache->_adjust_size(b, front - (int64_t)b->length);
+	}
+	b->truncate(front);
+	b->maybe_rebuild();
+	cache->_audit("discard end 1");
+	break;
+      } else {
+	// drop tail
+	if (!b->is_writing()) {
+	  cache->_adjust_size(b, front - (int64_t)b->length);
+	}
+	b->truncate(front);
+	b->maybe_rebuild();
+	++i;
+	continue;
+      }
+    }
+    if (b->end() <= end) {
+      // drop entire buffer
+      // (advance the iterator before the entry it points at is removed)
+      _rm_buffer(cache, i++);
+      continue;
+    }
+    // drop front
+    // the last `keep` bytes survive as a new buffer starting at `end`
+    uint32_t keep = b->end() - end;
+    if (b->data.length()) {
+      bufferlist bl;
+      bl.substr_of(b->data, b->length - keep, keep);
+      Buffer *nb = new Buffer(this, b->state, b->seq, end, bl, b->flags);
+      nb->maybe_rebuild();
+      _add_buffer(cache, nb, 0, b);
+    } else {
+      _add_buffer(cache, new Buffer(this, b->state, b->seq, end, keep,
+                                    b->flags),
+                  0, b);
+    }
+    _rm_buffer(cache, i);
+    cache->_audit("discard end 2");
+    break;
+  }
+  return cache_private;
+}
+
+// Collect the cached parts of [offset, offset + length).
+//
+// res and res_intervals are cleared first.  For every usable buffer that
+// overlaps the range, res maps the starting offset of the overlap to the
+// cached data and res_intervals records the covered interval.  A buffer is
+// usable when it is writing, or when it is clean and BYPASS_CLEAN_CACHE is
+// not set in flags.  Buffers that are not writing are touched in the cache.
+// Hit and miss byte counts are reported to cache->logger.
+//
+// Takes cache->lock itself.
+void BlueStore::BufferSpace::read(
+  BufferCacheShard* cache, 
+  uint32_t offset,
+  uint32_t length,
+  BlueStore::ready_regions_t& res,
+  interval_set<uint32_t>& res_intervals,
+  int flags)
+{
+  res.clear();
+  res_intervals.clear();
+  uint32_t want_bytes = length;
+  uint32_t end = offset + length;
+
+  {
+    std::lock_guard l(cache->lock);
+    // offset/length are advanced as the range is consumed
+    for (auto i = _data_lower_bound(offset);
+         i != buffer_map.end() && offset < end && i->first < end;
+         ++i) {
+      Buffer *b = i->second.get();
+      ceph_assert(b->end() > offset);
+
+      bool val = false;
+      if (flags & BYPASS_CLEAN_CACHE)
+        val = b->is_writing();
+      else
+        val = b->is_writing() || b->is_clean();
+      if (val) {
+        if (b->offset < offset) {
+	  // buffer starts before the current offset: skip its leading part
+	  uint32_t skip = offset - b->offset;
+	  uint32_t l = min(length, b->length - skip);
+	  res[offset].substr_of(b->data, skip, l);
+	  res_intervals.insert(offset, l);
+	  offset += l;
+	  length -= l;
+	  if (!b->is_writing()) {
+	    cache->_touch(b);
+          }
+	  continue;
+        }
+        if (b->offset > offset) {
+	  // uncached gap before this buffer: step over it
+	  uint32_t gap = b->offset - offset;
+	  if (length <= gap) {
+	    break;
+	  }
+	  offset += gap;
+	  length -= gap;
+        }
+        if (!b->is_writing()) {
+	  cache->_touch(b);
+        }
+        if (b->length > length) {
+	  // buffer extends past the requested range: take only its head
+	  res[offset].substr_of(b->data, 0, length);
+	  res_intervals.insert(offset, length);
+          break;
+        } else {
+	  res[offset].append(b->data);
+	  res_intervals.insert(offset, b->length);
+          if (b->length == length)
+            break;
+	  offset += b->length;
+	  length -= b->length;
+        }
+      }
+    }
+  }
+
+  // everything recorded in res_intervals is a hit, the rest a miss
+  uint64_t hit_bytes = res_intervals.size();
+  ceph_assert(hit_bytes <= want_bytes);
+  uint64_t miss_bytes = want_bytes - hit_bytes;
+  cache->logger->inc(l_bluestore_buffer_hit_bytes, hit_bytes);
+  cache->logger->inc(l_bluestore_buffer_miss_bytes, miss_bytes);
+}
+
+// Complete the write with sequence number `seq`.
+//
+// Every buffer on the `writing` list whose seq equals `seq` is taken off
+// that list.  Buffers flagged FLAG_NOCACHE are erased from buffer_map; all
+// others become STATE_CLEAN and are added to the cache.  The cache is then
+// trimmed.
+//
+// NOTE(review): the scan stops at the first entry with a larger seq, so it
+// presumably relies on `writing` being ordered by seq — confirm.
+// NOTE(review): cache->lock is presumably held by the caller, as it is in
+// SharedBlob::finish_write — confirm for other callers.
+void BlueStore::BufferSpace::_finish_write(BufferCacheShard* cache, uint64_t seq)
+{
+  auto i = writing.begin();
+  while (i != writing.end()) {
+    if (i->seq > seq) {
+      break;
+    }
+    if (i->seq < seq) {
+      ++i;
+      continue;
+    }
+
+    Buffer *b = &*i;
+    ceph_assert(b->is_writing());
+
+    if (b->flags & Buffer::FLAG_NOCACHE) {
+      // advance before erasing so the iterator stays usable
+      writing.erase(i++);
+      ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl;
+      // b must not be used after this erase
+      buffer_map.erase(b->offset);
+    } else {
+      b->state = Buffer::STATE_CLEAN;
+      writing.erase(i++);
+      b->maybe_rebuild();
+      b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+      cache->_add(b, 1, nullptr);
+      ldout(cache->cct, 20) << __func__ << " added " << *b << dendl;
+    }
+  }
+  cache->_trim();
+  cache->_audit("finish_write end");
+}
+
+/// Move all cached data at or beyond @p pos into @p r.
+///
+/// Buffers are visited from the highest offset downwards.  A buffer lying
+/// entirely at or after @p pos is re-created in @p r with its offset
+/// rebased by @p pos and removed here.  A buffer straddling @p pos is cut:
+/// the right part is re-created in @p r at offset 0 and this buffer is
+/// truncated to the left part.  No buffer may be in the writing state when
+/// the split finishes.
+///
+/// Takes cache->lock itself.
+void BlueStore::BufferSpace::split(BufferCacheShard* cache, size_t pos, BlueStore::BufferSpace &r)
+{
+  std::lock_guard lk(cache->lock);
+  if (buffer_map.empty())
+    return;
+
+  auto p = --buffer_map.end();
+  while (true) {
+    if (p->second->end() <= pos)
+      break;
+
+    if (p->second->offset < pos) {
+      ldout(cache->cct, 30) << __func__ << " cut " << *p->second << dendl;
+      size_t left = pos - p->second->offset;
+      size_t right = p->second->length - left;
+      if (p->second->data.length()) {
+        bufferlist bl;
+        bl.substr_of(p->second->data, left, right);
+        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+                                        0, bl, p->second->flags),
+                      0, p->second.get());
+      } else {
+        r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+                                        0, right, p->second->flags),
+                      0, p->second.get());
+      }
+      // Negate in the signed domain.  Negating the unsigned size_t first
+      // only yields the intended negative delta where size_t is as wide as
+      // int64_t; with a 32-bit size_t it would become a large positive one.
+      cache->_adjust_size(p->second.get(), -static_cast<int64_t>(right));
+      p->second->truncate(left);
+      break;
+    }
+
+    ceph_assert(p->second->end() > pos);
+    ldout(cache->cct, 30) << __func__ << " move " << *p->second << dendl;
+    if (p->second->data.length()) {
+      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+                               p->second->offset - pos, p->second->data, p->second->flags),
+                    0, p->second.get());
+    } else {
+      r._add_buffer(cache, new Buffer(&r, p->second->state, p->second->seq,
+                               p->second->offset - pos, p->second->length, p->second->flags),
+                    0, p->second.get());
+    }
+    if (p == buffer_map.begin()) {
+      _rm_buffer(cache, p);
+      break;
+    } else {
+      // step back before the entry p points at is removed
+      _rm_buffer(cache, p--);
+    }
+  }
+  ceph_assert(writing.empty());
+  cache->_trim();
+}
+
+// OnodeSpace
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OnodeSpace(" << this << " in " << cache << ") "
+
+/// Register @p o under @p oid, or return the onode that is already there.
+///
+/// When the insert succeeds the onode is added to the cache, the cache is
+/// trimmed and @p o is returned.  When another onode is already registered
+/// for @p oid that existing onode is returned and @p o is left out of the
+/// map.  Takes cache->lock itself.
+BlueStore::OnodeRef BlueStore::OnodeSpace::add_onode(const ghobject_t& oid,
+  OnodeRef& o)
+{
+  std::lock_guard l(cache->lock);
+  auto inserted = onode_map.emplace(oid, o);
+  if (inserted.second) {
+    ldout(cache->cct, 20) << __func__ << " " << oid << " " << o << dendl;
+    cache->_add(o.get(), 1);
+    cache->_trim();
+    return o;
+  }
+
+  // somebody else registered this oid first
+  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
+                        << " raced, returning existing "
+                        << inserted.first->second
+                        << dendl;
+  return inserted.first->second;
+}
+
+/// Erase the entry for @p oid from onode_map.
+/// NOTE(review): no lock is taken here; callers presumably hold
+/// cache->lock — confirm.
+void BlueStore::OnodeSpace::_remove(const ghobject_t& oid)
+{
+  auto *cct = cache->cct;
+  ldout(cct, 20) << __func__ << " " << oid << " " << dendl;
+  onode_map.erase(oid);
+}
+
+/// Find the onode registered for @p oid.
+///
+/// Bumps the onode hit or miss counter on cache->logger.  Takes
+/// cache->lock itself.
+///
+/// @return a reference to the onode, or an empty OnodeRef on a miss
+BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
+{
+  ldout(cache->cct, 30) << __func__ << dendl;
+
+  std::lock_guard l(cache->lock);
+  auto found = onode_map.find(oid);
+  if (found == onode_map.end()) {
+    ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
+    cache->logger->inc(l_bluestore_onode_misses);
+    return OnodeRef();
+  }
+
+  ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << found->second
+                        << " " << found->second->nref
+                        << " " << found->second->cached
+                        << dendl;
+  // Taking the reference pins the onode; the cache is touched implicitly
+  // once the onode eventually becomes unpinned.
+  OnodeRef o = found->second;
+  cache->logger->inc(l_bluestore_onode_hits);
+  return o;
+}
+
+/// Remove every onode of this space from the cache and empty onode_map.
+/// Takes cache->lock itself.
+void BlueStore::OnodeSpace::clear()
+{
+  std::lock_guard l(cache->lock);
+  ldout(cache->cct, 10) << __func__ << " " << onode_map.size()<< dendl;
+  for (auto it = onode_map.begin(); it != onode_map.end(); ++it) {
+    cache->_rm(it->second.get());
+  }
+  onode_map.clear();
+}
+
+/// @return true when no onode is registered; checked under cache->lock.
+bool BlueStore::OnodeSpace::empty()
+{
+  std::lock_guard l(cache->lock);
+  const bool no_onodes = onode_map.empty();
+  return no_onodes;
+}
+
+// Re-register the onode found under old_oid as new_oid.
+//
+// Any onode already registered under new_oid is removed from the cache and
+// the map.  The onode from old_oid is inserted under new_oid and gets its
+// oid and key updated to new_oid / new_okey.  A freshly constructed onode
+// takes its place under old_oid, is added to the cache, and is handed back
+// to the caller through oldo.  The cache is trimmed at the end.
+//
+// Takes cache->lock itself.  old_oid must be present and must not refer to
+// the same map entry as new_oid.
+void BlueStore::OnodeSpace::rename(
+  OnodeRef& oldo,
+  const ghobject_t& old_oid,
+  const ghobject_t& new_oid,
+  const mempool::bluestore_cache_meta::string& new_okey)
+{
+  std::lock_guard l(cache->lock);
+  ldout(cache->cct, 30) << __func__ << " " << old_oid << " -> " << new_oid
+			<< dendl;
+  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+  po = onode_map.find(old_oid);
+  pn = onode_map.find(new_oid);
+  ceph_assert(po != pn);
+
+  ceph_assert(po != onode_map.end());
+  if (pn != onode_map.end()) {
+    ldout(cache->cct, 30) << __func__ << "  removing target " << pn->second
+			  << dendl;
+    cache->_rm(pn->second.get());
+    onode_map.erase(pn);
+  }
+  // keep our own reference: the map slot is overwritten just below
+  OnodeRef o = po->second;
+
+  // install a non-existent onode at old location
+  oldo.reset(new Onode(o->c, old_oid, o->key));
+  po->second = oldo;
+  cache->_add(oldo.get(), 1);
+  // add at new position and fix oid, key.
+  // This will pin 'o' and implicitly touch cache
+  // when it will eventually become unpinned
+  onode_map.insert(make_pair(new_oid, o));
+
+  o->oid = new_oid;
+  o->key = new_okey;
+  cache->_trim();
+}
+
+/// Call @p f on the registered onodes until it returns true for one.
+///
+/// Runs under cache->lock, so @p f is invoked with that lock held.
+///
+/// @return true if @p f returned true for some onode, false otherwise
+bool BlueStore::OnodeSpace::map_any(std::function<bool(Onode*)> f)
+{
+  std::lock_guard l(cache->lock);
+  ldout(cache->cct, 20) << __func__ << dendl;
+  bool matched = false;
+  for (auto it = onode_map.begin(); it != onode_map.end(); ++it) {
+    if (f(it->second.get())) {
+      matched = true;
+      break;
+    }
+  }
+  return matched;
+}
+
+/// Log every registered onode (oid, reference, nref, cached flag) at
+/// debug level LogLevelV.  No lock is taken here.
+template <int LogLevelV = 30>
+void BlueStore::OnodeSpace::dump(CephContext *cct)
+{
+  for (auto it = onode_map.begin(); it != onode_map.end(); ++it) {
+    const auto& onode = it->second;
+    ldout(cct, LogLevelV) << it->first << " : " << onode
+      << " " << onode->nref
+      << " " << onode->cached
+      << dendl;
+  }
+}
+
+// SharedBlob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblob(" << this << ") "
+#undef dout_context
+#define dout_context coll->store->cct
+
+/// Dump this shared blob to @p f: the "loaded" flag, followed by the
+/// persistent part when loaded or the unloaded sbid otherwise.
+void BlueStore::SharedBlob::dump(Formatter* f) const
+{
+  f->dump_bool("loaded", loaded);
+  if (!loaded) {
+    f->dump_unsigned("sbid_unloaded", sbid_unloaded);
+    return;
+  }
+  persistent->dump(f);
+}
+
+/// Stream a short description of @p sb: its address plus either the loaded
+/// persistent part or the unloaded sbid in hex.
+ostream& operator<<(ostream& out, const BlueStore::SharedBlob& sb)
+{
+  out << "SharedBlob(" << &sb;
+  if (!sb.loaded) {
+    out << " sbid 0x" << std::hex << sb.sbid_unloaded << std::dec;
+  } else {
+    out << " loaded " << *sb.persistent;
+  }
+  out << ")";
+  return out;
+}
+
+/// Construct an unloaded shared blob with id @p i (must be non-zero) that
+/// belongs to @p _coll, and count it on its cache shard if it has one.
+BlueStore::SharedBlob::SharedBlob(uint64_t i, Collection *_coll)
+  : coll(_coll), sbid_unloaded(i)
+{
+  ceph_assert(sbid_unloaded > 0);
+  auto *shard = get_cache();
+  if (shard) {
+    shard->add_blob();
+  }
+}
+
+/// Free the persistent part, which is only owned once the blob is loaded.
+BlueStore::SharedBlob::~SharedBlob()
+{
+  if (!loaded || !persistent) {
+    return;
+  }
+  delete persistent;
+}
+
+// Drop one reference.  When the count reaches zero the blob removes itself
+// from its collection's shared_blob_set (if it has a collection), clears
+// its buffers, and deletes itself.
+void BlueStore::SharedBlob::put()
+{
+  if (--nref == 0) {
+    dout(20) << __func__ << " " << this
+	     << " removing self from set " << get_parent()
+	     << dendl;
+  again:
+    // coll may change while we wait for the lock, so snapshot it, lock the
+    // snapshot's cache, and start over if it no longer matches
+    auto coll_snap = coll;
+    if (coll_snap) {
+      std::lock_guard l(coll_snap->cache->lock);
+      if (coll_snap != coll) {
+	// jumping out of this scope releases the lock before retrying
+	goto again;
+      }
+      if (!coll_snap->shared_blob_set.remove(this, true)) {
+	// race with lookup
+	// (the blob is left alone: no buffer clearing, no delete)
+	return;
+      }
+      bc._clear(coll_snap->cache);
+      coll_snap->cache->rm_blob();
+    }
+    // the lock taken above, if any, has been released by now
+    delete this;
+  }
+}
+
+/// Take a reference on [offset, offset + length) in the persistent
+/// ref_map.  The persistent part must be present.
+void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length)
+{
+  ceph_assert(persistent);
+  auto& refs = persistent->ref_map;
+  refs.get(offset, length);
+}
+
+/// Release a reference on [offset, offset + length) in the persistent
+/// ref_map.  The persistent part must be present.
+///
+/// @param r       passed through to ref_map.put()
+/// @param unshare passed through to ref_map.put() only when it is non-null
+///                and currently false; otherwise nullptr is passed
+void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length,
+				    PExtentVector *r,
+				    bool *unshare)
+{
+  ceph_assert(persistent);
+  bool *unshare_out = nullptr;
+  if (unshare && !*unshare) {
+    unshare_out = unshare;
+  }
+  persistent->ref_map.put(offset, length, r, unshare_out);
+}
+
+/// Complete write @p seq on this blob's buffer space.
+///
+/// The collection's cache shard can change while we wait for its lock, so
+/// the shard is re-checked after locking and the attempt is repeated until
+/// the locked shard is still the current one.
+void BlueStore::SharedBlob::finish_write(uint64_t seq)
+{
+  for (;;) {
+    BufferCacheShard *shard = coll->cache;
+    std::lock_guard guard(shard->lock);
+    if (coll->cache == shard) {
+      bc._finish_write(shard, seq);
+      return;
+    }
+    dout(20) << __func__
+             << " raced with sb cache update, was " << shard
+             << ", now " << coll->cache << ", retrying"
+             << dendl;
+  }
+}
+
+// SharedBlobSet
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.sharedblobset(" << this << ") "
+
+/// Log every entry of sb_map (key and shared blob) at debug level
+/// LogLevelV, holding the set's lock while doing so.
+template <int LogLevelV = 30>
+void BlueStore::SharedBlobSet::dump(CephContext *cct)
+{
+  std::lock_guard l(lock);
+  for (auto it = sb_map.begin(); it != sb_map.end(); ++it) {
+    ldout(cct, LogLevelV) << it->first << " : " << *it->second << dendl;
+  }
+}
+
+// Blob
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.blob(" << this << ") "
+
+/// Dump this blob to @p f: the spanning id when the blob is spanning, the
+/// blob itself, and the shared blob when one is attached.
+void BlueStore::Blob::dump(Formatter* f) const
+{
+  if (is_spanning()) {
+    f->dump_unsigned("spanning_id ", id);
+  }
+  blob.dump(f);
+  if (!shared_blob) {
+    return;
+  }
+  f->dump_object("shared", *shared_blob);
+}
+
+// Print a Blob as
+//   Blob(<addr>[ spanning <id>] <blob> <use tracker> <shared blob>)
+// where the shared blob part reads "(shared_blob=NULL)" when none is set.
+ostream& operator<<(ostream& out, const BlueStore::Blob& b)
+{
+  out << "Blob(" << &b;
+  if (b.is_spanning()) {
+    out << " spanning " << b.id;
+  }
+  out << " " << b.get_blob() << " " << b.get_blob_use_tracker();
+  if (!b.shared_blob) {
+    out << " (shared_blob=NULL)";
+    return out << ")";
+  }
+  out << " " << *b.shared_blob;
+  return out << ")";
+}
+
+// Discard cached buffers that sit on invalid physical extents of this
+// blob.  Shared blobs are left untouched.
+//  - uncompressed: each invalid extent's range is discarded from the
+//    buffer cache, then the blob tail is pruned if can_prune_tail() allows.
+//  - compressed: either all or none of the extents may be invalid
+//    (asserted); if invalid, the whole logical length is discarded.
+// coll is referenced by the logging macros.
+void BlueStore::Blob::discard_unallocated(Collection *coll)
+{
+  if (get_blob().is_shared()) {
+    return;
+  }
+
+  if (!get_blob().is_compressed()) {
+    size_t pos = 0;
+    for (const auto& e : get_blob().get_extents()) {
+      if (!e.is_valid()) {
+        dout(20) << __func__ << " 0x" << std::hex << pos
+                 << "~" << e.length
+                 << std::dec << dendl;
+        shared_blob->bc.discard(shared_blob->get_cache(), pos, e.length);
+      }
+      pos += e.length;
+    }
+    if (get_blob().can_prune_tail()) {
+      dirty_blob().prune_tail();
+      used_in_blob.prune_tail(get_blob().get_ondisk_length());
+      dout(20) << __func__ << " pruned tail, now " << get_blob() << dendl;
+    }
+    return;
+  }
+
+  // compressed blob
+  bool discard = false;      // at least one extent is invalid
+  bool all_invalid = true;   // no extent is valid
+  for (const auto& e : get_blob().get_extents()) {
+    const bool valid = e.is_valid();
+    discard = discard || !valid;
+    all_invalid = all_invalid && !valid;
+  }
+  // in case of compressed blob all or none pextents are invalid.
+  ceph_assert(discard == all_invalid);
+  if (discard) {
+    shared_blob->bc.discard(shared_blob->get_cache(), 0,
+                            get_blob().get_logical_length());
+  }
+}
+
+// Take a reference on (offset, length) in this blob's use tracker.
+// On first use the tracker is initialized from the blob's logical length
+// and its release size for the store's min_alloc_size.
+void BlueStore::Blob::get_ref(
+  Collection *coll,
+  uint32_t offset,
+  uint32_t length)
+{
+  // The caller must have set the blob's logical length before taking
+  // references: without it we can neither determine the number of counters
+  // needed for per-au tracking nor obtain min_release_size for single
+  // counter mode.
+  ceph_assert(get_blob().get_logical_length() != 0);
+  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << " " << *this << dendl;
+
+  if (used_in_blob.is_empty()) {
+    const uint64_t l = get_blob().get_logical_length();
+    const uint32_t min_release_size =
+      get_blob().get_release_size(coll->store->min_alloc_size);
+    dout(20) << __func__ << " init 0x" << std::hex << l << ", "
+             << min_release_size << std::dec << dendl;
+    used_in_blob.init(l, min_release_size);
+  }
+  used_in_blob.get(offset, length);
+}
+
+// Drop a reference on (offset, length) in the use tracker and release the
+// physical extents that became unused as a result.
+// *r is always cleared first and then receives whatever release_extents()
+// puts into it.  Returns false when the tracker is not empty and reported
+// nothing to release; otherwise returns the result of release_extents().
+bool BlueStore::Blob::put_ref(
+  Collection *coll,
+  uint32_t offset,
+  uint32_t length,
+  PExtentVector *r)
+{
+  PExtentVector logical;
+
+  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << " " << *this << dendl;
+
+  const bool empty = used_in_blob.put(offset, length, &logical);
+  r->clear();
+  if (empty || !logical.empty()) {
+    return dirty_blob().release_extents(empty, logical, r);
+  }
+  // nothing to release
+  return false;
+}
+
+// Decide whether new data at blob offset b_offset with length *length0 can
+// be placed into this existing blob, growing the blob's tail if needed.
+//
+// Returns false when the blob is not mutable, when the blob has csum and
+// the range is not aligned to the csum chunk size, when any part of the
+// range overlapping the current blob is already allocated, when the blob
+// would have to grow but has "unused" tracking, or when the data cannot be
+// trimmed to fit.  On success returns true; *length0 may have been reduced
+// so the blob does not exceed max(current logical length,
+// target_blob_size), and the blob and use-tracker tails have been extended
+// if the data reaches past the current logical length.
+bool BlueStore::Blob::can_reuse_blob(uint32_t min_alloc_size,
+                		     uint32_t target_blob_size,
+		                     uint32_t b_offset,
+		                     uint32_t *length0) {
+  ceph_assert(min_alloc_size);
+  ceph_assert(target_blob_size);
+  if (!get_blob().is_mutable()) {
+    return false;
+  }
+
+  uint32_t length = *length0;
+  uint32_t end = b_offset + length;
+
+  // Currently for the sake of simplicity we omit blob reuse if data is
+  // unaligned with csum chunk. Later we can perform padding if needed.
+  if (get_blob().has_csum() &&
+     ((b_offset % get_blob().get_csum_chunk_size()) != 0 ||
+      (end % get_blob().get_csum_chunk_size()) != 0)) {
+    return false;
+  }
+
+  auto blen = get_blob().get_logical_length();
+  uint32_t new_blen = blen;
+
+  // make sure target_blob_size isn't less than current blob len
+  target_blob_size = std::max(blen, target_blob_size);
+
+  if (b_offset >= blen) {
+    // new data totally stands out of the existing blob
+    new_blen = end;
+  } else {
+    // new data overlaps with the existing blob
+    new_blen = std::max(blen, end);
+
+    // length of the part of the new data that lies inside the current blob
+    uint32_t overlap = 0;
+    if (new_blen > blen) {
+      overlap = blen - b_offset;
+    } else {
+      overlap = length;
+    }
+
+    if (!get_blob().is_unallocated(b_offset, overlap)) {
+      // abort if any piece of the overlap has already been allocated
+      return false;
+    }
+  }
+
+  if (new_blen > blen) {
+    // how far the grown blob would exceed target_blob_size (may be <= 0)
+    int64_t overflow = int64_t(new_blen) - target_blob_size;
+    // Unable to decrease the provided length to fit into target_blob_size
+    if (overflow >= length) {
+      return false;
+    }
+
+    // FIXME: in some cases we could reduce unused resolution
+    if (get_blob().has_unused()) {
+      return false;
+    }
+
+    if (overflow > 0) {
+      // trim the write so the blob ends exactly at target_blob_size and
+      // report the shortened length back to the caller
+      new_blen -= overflow;
+      length -= overflow;
+      *length0 = length;
+    }
+
+    if (new_blen > blen) {
+      dirty_blob().add_tail(new_blen);
+      used_in_blob.add_tail(new_blen,
+                            get_blob().get_release_size(min_alloc_size));
+    }
+  }
+  return true;
+}
+
+// Split this blob at blob_offset, handing the right-hand part to *r.
+// The use tracker, the bluestore_blob_t and the buffer cache are each
+// split in turn.  Asserts that both the blob and its use tracker report
+// that they can be split.
+void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
+{
+  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+           << " start " << *this << dendl;
+  ceph_assert(blob.can_split());
+  ceph_assert(used_in_blob.can_split());
+  bluestore_blob_t &left = dirty_blob();
+  bluestore_blob_t &right = r->dirty_blob();
+
+  used_in_blob.split(blob_offset, &(r->used_in_blob));
+  left.split(blob_offset, right);
+  shared_blob->bc.split(shared_blob->get_cache(), blob_offset,
+                        r->shared_blob->bc);
+
+  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+           << " finish " << *this << dendl;
+  dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec
+           << "    and " << *r << dendl;
+}
+
+#ifndef CACHE_BLOB_BL
+// Decode a blob from p.
+//  - the bluestore_blob_t itself, followed by the shared blob id (written
+//    to *sbid) when the blob is flagged shared;
+//  - if include_ref_map: the use tracker for struct_v > 1, otherwise a
+//    legacy bluestore_extent_ref_map_t whose entries are replayed through
+//    get_ref() (only when coll is non-null).
+void BlueStore::Blob::decode(
+  bufferptr::const_iterator& p,
+  uint64_t struct_v,
+  uint64_t* sbid,
+  bool include_ref_map,
+  Collection *coll)
+{
+  denc(blob, p, struct_v);
+  if (blob.is_shared()) {
+    denc(*sbid, p);
+  }
+  if (!include_ref_map) {
+    return;
+  }
+  if (struct_v > 1) {
+    used_in_blob.decode(p);
+    return;
+  }
+
+  // legacy encoding: rebuild the use tracker from the old ref map
+  used_in_blob.clear();
+  bluestore_extent_ref_map_t legacy_ref_map;
+  legacy_ref_map.decode(p);
+  if (!coll) {
+    return;
+  }
+  for (auto r : legacy_ref_map.ref_map) {
+    const auto ref_offset = r.first;
+    const auto ref_length = r.second.refs * r.second.length;
+    get_ref(coll, ref_offset, ref_length);
+  }
+}
+#endif
+
+// Extent
+
+// Emit this extent to the formatter: logical offset, length, offset within
+// the blob, and the referenced blob as a nested object.
+void BlueStore::Extent::dump(Formatter* f) const
+{
+  f->dump_unsigned("logical_offset", logical_offset);
+  f->dump_unsigned("length", length);
+  f->dump_unsigned("blob_offset", blob_offset);
+  f->dump_object("blob", *blob);
+}
+
+// Print an Extent as
+//   0x<logical_offset>~<length>: 0x<blob_offset>~<length> <blob>
+// with the numeric fields in hex; the stream is switched back to decimal
+// before the blob is printed.
+ostream& operator<<(ostream& out, const BlueStore::Extent& e)
+{
+  out << std::hex << "0x" << e.logical_offset << "~" << e.length;
+  out << ": 0x" << e.blob_offset << "~" << e.length << std::dec;
+  out << " " << *e.blob;
+  return out;
+}
+
+// OldExtent
+// Allocate an OldExtent for (lo, o, l, b), drop the blob reference for
+// (o, l) -- collecting the released extents into the new object's r --
+// and record whether the blob is no longer referenced afterwards.
+// Returns the heap-allocated object.
+BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
+						   uint32_t lo,
+						   uint32_t o,
+						   uint32_t l,
+						   BlobRef& b) {
+  auto* old_extent = new OldExtent(lo, o, l, b);
+  auto* released = &(old_extent->r);
+  b->put_ref(c.get(), o, l, released);
+  old_extent->blob_empty = !b->is_referenced();
+  return old_extent;
+}
+
+// ExtentMap
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.extentmap(" << this << ") "
+#undef dout_context
+#define dout_context onode->c->store->cct
+
+// Construct the extent map for onode o; inline_shard_prealloc_size is
+// passed to the inline_bl constructor.
+BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
+  : onode(o),
+    inline_bl(inline_shard_prealloc_size) {
+}
+
+// Emit every extent of the map as an "extent" object inside an "extents"
+// array section.
+void BlueStore::ExtentMap::dump(Formatter* f) const
+{
+  f->open_array_section("extents");
+  for (const auto& extent : extent_map) {
+    f->dump_object("extent", extent);
+  }
+  f->close_section();
+}
+
+// Clone the extents of oldo that intersect [srcoff, srcoff + length) into
+// newo at dstoff.  Data is not copied: each source blob is made shared (or
+// its existing shared blob is loaded), duplicated once, and referenced by
+// the new extents.  Shared-blob refs, statfs deltas and onode writes are
+// recorded in txc; newo's size is grown if the destination range extends
+// past it, and the affected ranges of both extent maps are marked dirty.
+void BlueStore::ExtentMap::dup(BlueStore* b, TransContext* txc,
+  CollectionRef& c, OnodeRef& oldo, OnodeRef& newo, uint64_t& srcoff,
+  uint64_t& length, uint64_t& dstoff) {
+
+  auto cct = onode->c->store->cct;
+  bool inject_21040 =
+    cct->_conf->bluestore_debug_inject_bug21040;  
+  // maps a source blob's last_encoded_id to the duplicate created for it
+  vector<BlobRef> id_to_blob(oldo->extent_map.extent_map.size());
+  for (auto& e : oldo->extent_map.extent_map) {
+    e.blob->last_encoded_id = -1;
+  }
+
+  int n = 0;
+  uint64_t end = srcoff + length;
+  uint32_t dirty_range_begin = 0;
+  uint32_t dirty_range_end = 0;
+  bool src_dirty = false;
+  for (auto ep = oldo->extent_map.seek_lextent(srcoff);
+    ep != oldo->extent_map.extent_map.end();
+    ++ep) {
+    auto& e = *ep;
+    if (e.logical_offset >= end) {
+      break;
+    }
+    dout(20) << __func__ << "  src " << e << dendl;
+    BlobRef cb;
+    bool blob_duped = true;
+    if (e.blob->last_encoded_id >= 0) {
+      // this blob was already duplicated for an earlier extent; reuse it
+      cb = id_to_blob[e.blob->last_encoded_id];
+      blob_duped = false;
+    } else { 
+      // dup the blob
+      const bluestore_blob_t& blob = e.blob->get_blob();
+      // make sure it is shared
+      if (!blob.is_shared()) {
+        c->make_blob_shared(b->_assign_blobid(txc), e.blob);
+	// the source blob changed, so the source extent map range covering
+	// it has to be marked dirty below
+	if (!inject_21040 && !src_dirty) {
+          src_dirty = true;
+          dirty_range_begin = e.logical_offset;
+	} else if (inject_21040 &&
+	           dirty_range_begin == 0 && dirty_range_end == 0) {
+	  dirty_range_begin = e.logical_offset;
+	}        
+        ceph_assert(e.logical_end() > 0);
+        // -1 to exclude next potential shard
+        dirty_range_end = e.logical_end() - 1;
+      } else {
+        c->load_shared_blob(e.blob->shared_blob);
+      }
+      cb = new Blob();
+      e.blob->last_encoded_id = n;
+      id_to_blob[n] = cb;
+      e.blob->dup(*cb);
+      // bump the extent refs on the copied blob's extents
+      for (auto p : blob.get_extents()) {
+        if (p.is_valid()) {
+          e.blob->shared_blob->get_ref(p.offset, p.length);
+        }
+      }
+      txc->write_shared_blob(e.blob->shared_blob);
+      dout(20) << __func__ << "    new " << *cb << dendl;
+    }
+
+    // bytes of this source extent that fall outside [srcoff, end)
+    int skip_front, skip_back;
+    if (e.logical_offset < srcoff) {
+      skip_front = srcoff - e.logical_offset;
+    } else {
+      skip_front = 0;
+    }
+    if (e.logical_end() > end) {
+      skip_back = e.logical_end() - end;
+    } else {
+      skip_back = 0;
+    }
+
+    // new extent: source position translated by (dstoff - srcoff), trimmed
+    // to the requested range, pointing at the duplicated blob
+    Extent* ne = new Extent(e.logical_offset + skip_front + dstoff - srcoff,
+      e.blob_offset + skip_front, e.length - skip_front - skip_back, cb);
+    newo->extent_map.extent_map.insert(*ne);
+    ne->blob->get_ref(c.get(), ne->blob_offset, ne->length);
+    // fixme: we may leave parts of new blob unreferenced that could
+    // be freed (relative to the shared_blob).
+    txc->statfs_delta.stored() += ne->length;
+    if (e.blob->get_blob().is_compressed()) {
+      txc->statfs_delta.compressed_original() += ne->length;
+      // count the compressed payload once per duplicated blob only
+      if (blob_duped) {
+        txc->statfs_delta.compressed() +=
+          cb->get_blob().get_compressed_payload_length();
+      }
+    }
+    dout(20) << __func__ << "  dst " << *ne << dendl;
+    ++n;
+  }
+  if ((!inject_21040 && src_dirty) ||
+       (inject_21040 && dirty_range_end > dirty_range_begin)) {
+    oldo->extent_map.dirty_range(dirty_range_begin,
+      dirty_range_end - dirty_range_begin);
+    txc->write_onode(oldo);
+  }
+  txc->write_onode(newo);
+
+  if (dstoff + length > newo->onode.size) {
+    newo->onode.size = dstoff + length;
+  }
+  newo->extent_map.dirty_range(dstoff, length);
+}
+// Encode the pending extent map changes.
+//
+// Without shards, the whole map is encoded into inline_bl (only if
+// inline_bl is currently empty).  With shards, every dirty shard is
+// encoded and queued in t under PREFIX_OBJ, and its dirty flag and
+// recorded byte count are updated.
+//
+// Unless force is set, a reshard is requested instead (and nothing is
+// written to t) when an encoded shard exceeds
+// bluestore_extent_map_shard_max_size or a non-trailing shard is below
+// bluestore_extent_map_shard_min_size.  With force set, encode_some()
+// asking for a reshard is a fatal assertion.
+void BlueStore::ExtentMap::update(KeyValueDB::Transaction t,
+                                  bool force)
+{
+  auto cct = onode->c->store->cct; //used by dout
+  dout(20) << __func__ << " " << onode->oid << (force ? " force" : "") << dendl;
+  if (onode->onode.extent_map_shards.empty()) {
+    if (inline_bl.length() == 0) {
+      unsigned n;
+      // we need to encode inline_bl to measure encoded length
+      bool never_happen = encode_some(0, OBJECT_MAX_SIZE, inline_bl, &n);
+      inline_bl.reassign_to_mempool(mempool::mempool_bluestore_inline_bl);
+      ceph_assert(!never_happen);
+      size_t len = inline_bl.length();
+      dout(20) << __func__ << "  inline shard " << len << " bytes from " << n
+	       << " extents" << dendl;
+      if (!force && len > cct->_conf->bluestore_extent_map_shard_max_size) {
+	request_reshard(0, OBJECT_MAX_SIZE);
+	return;
+      }
+    }
+    // will persist in the onode key.
+  } else {
+    // pending shard update
+    struct dirty_shard_t {
+      Shard *shard;
+      bufferlist bl;
+      dirty_shard_t(Shard *s) : shard(s) {}
+    };
+    vector<dirty_shard_t> encoded_shards;
+    // allocate slots for all shards in a single call instead of
+    // doing multiple allocations - one per each dirty shard
+    encoded_shards.reserve(shards.size());
+
+    auto p = shards.begin();
+    auto prev_p = p;
+    while (p != shards.end()) {
+      ceph_assert(p->shard_info->offset >= prev_p->shard_info->offset);
+      auto n = p;
+      ++n;
+      if (p->dirty) {
+	// the shard covers [its offset, next shard's offset), or runs to
+	// OBJECT_MAX_SIZE for the last shard
+	uint32_t endoff;
+	if (n == shards.end()) {
+	  endoff = OBJECT_MAX_SIZE;
+	} else {
+	  endoff = n->shard_info->offset;
+	}
+	encoded_shards.emplace_back(dirty_shard_t(&(*p)));
+        bufferlist& bl = encoded_shards.back().bl;
+	if (encode_some(p->shard_info->offset, endoff - p->shard_info->offset,
+			bl, &p->extents)) {
+	  if (force) {
+	    derr << __func__ << "  encode_some needs reshard" << dendl;
+	    ceph_assert(!force);
+	  }
+	}
+        size_t len = bl.length();
+
+	dout(20) << __func__ << "  shard 0x" << std::hex
+		 << p->shard_info->offset << std::dec << " is " << len
+		 << " bytes (was " << p->shard_info->bytes << ") from "
+		 << p->extents << " extents" << dendl;
+
+        if (!force) {
+	  if (len > cct->_conf->bluestore_extent_map_shard_max_size) {
+	    // we are big; reshard ourselves
+	    request_reshard(p->shard_info->offset, endoff);
+	  }
+	  // avoid resharding the trailing shard, even if it is small
+	  // NOTE(review): this check reads g_conf() while the max-size check
+	  // above reads cct->_conf -- confirm whether that is intentional.
+	  else if (n != shards.end() &&
+		   len < g_conf()->bluestore_extent_map_shard_min_size) {
+            ceph_assert(endoff != OBJECT_MAX_SIZE);
+	    if (p == shards.begin()) {
+	      // we are the first shard, combine with next shard
+	      request_reshard(p->shard_info->offset, endoff + 1);
+	    } else {
+	      // combine either with the previous shard or the next,
+	      // whichever is smaller
+	      if (prev_p->shard_info->bytes > n->shard_info->bytes) {
+		request_reshard(p->shard_info->offset, endoff + 1);
+	      } else {
+		request_reshard(prev_p->shard_info->offset, endoff);
+	      }
+	    }
+	  }
+        }
+      }
+      prev_p = p;
+      p = n;
+    }
+    // a reshard was requested above: drop the encoded buffers without
+    // touching the transaction or the shards' dirty flags
+    if (needs_reshard()) {
+      return;
+    }
+
+    // schedule DB update for dirty shards
+    string key;
+    for (auto& it : encoded_shards) {
+      dout(20) << __func__ << "  encoding key for shard 0x" << std::hex
+	       << it.shard->shard_info->offset << std::dec << dendl;
+      it.shard->dirty = false;
+      it.shard->shard_info->bytes = it.bl.length();
+      generate_extent_shard_key_and_apply(
+	onode->key,
+	it.shard->shard_info->offset,
+	&key,
+        [&](const string& final_key) {
+          t->set(PREFIX_OBJ, final_key, it.bl);
+        }
+      );
+    }
+  }
+}
+
+// Pick an id for a new spanning blob that is not a key of
+// spanning_blob_map.
+// Returns 0 when the map is empty, otherwise (largest key + 1) if that is
+// non-negative.  Failing that, ids are probed upwards from a random
+// starting point, wrapping to 0 when the value turns negative.  Aborts
+// (after dumping the onode) if the probe returns to its starting point
+// without finding a free id.
+bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
+{
+  if (spanning_blob_map.empty())
+    return 0;
+  bid_t bid = spanning_blob_map.rbegin()->first + 1;
+  // bid is valid and available.
+  // (a negative value here presumably means the +1 wrapped past the
+  // maximum of bid_t -- TODO confirm)
+  if (bid >= 0)
+    return bid;
+  // Find next unused bid;
+  bid = rand() % (numeric_limits<bid_t>::max() + 1);
+  const auto begin_bid = bid;
+  do {
+    if (!spanning_blob_map.count(bid))
+      return bid;
+    else {
+      bid++;
+      if (bid < 0) bid = 0;
+    }
+  } while (bid != begin_bid);
+  auto cct = onode->c->store->cct; // used by dout
+  _dump_onode<0>(cct, *onode);
+  ceph_abort_msg("no available blob id");
+}
+
+// Re-partition the extent map shards covering the pending reshard range
+// [needs_reshard_begin, needs_reshard_end).
+//
+// Steps, in order:
+//  1. widen the range to whole existing shards and fault it in;
+//  2. queue removal of the old shard keys in t;
+//  3. choose new shard boundaries from the average encoded extent size,
+//     the configured target size and slop;
+//  4. splice the new shard_info entries into
+//     onode->onode.extent_map_shards and the parallel shards vector,
+//     marking the new shards loaded and dirty;
+//  5. for every extent in range whose blob crosses a shard boundary,
+//     either split the blob at the boundaries or mark it spanning; blobs
+//     that no longer cross a boundary are un-spanned;
+//  6. optionally dump the onode when the spanning blob count reaches
+//     bluestore_debug_too_many_blobs_threshold (rate limited per oid);
+//  7. clear the reshard request.
+void BlueStore::ExtentMap::reshard(
+  KeyValueDB *db,
+  KeyValueDB::Transaction t)
+{
+  auto cct = onode->c->store->cct; // used by dout
+
+  dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
+	   << needs_reshard_end << ")" << std::dec
+	   << " of " << onode->onode.extent_map_shards.size()
+	   << " shards on " << onode->oid << dendl;
+  for (auto& p : spanning_blob_map) {
+    dout(20) << __func__ << "   spanning blob " << p.first << " " << *p.second
+	     << dendl;
+  }
+  // determine shard index range
+  unsigned si_begin = 0, si_end = 0;
+  if (!shards.empty()) {
+    // si_begin: last shard starting at or before needs_reshard_begin
+    while (si_begin + 1 < shards.size() &&
+	   shards[si_begin + 1].shard_info->offset <= needs_reshard_begin) {
+      ++si_begin;
+    }
+    needs_reshard_begin = shards[si_begin].shard_info->offset;
+    // si_end: first shard starting at or after needs_reshard_end
+    for (si_end = si_begin; si_end < shards.size(); ++si_end) {
+      if (shards[si_end].shard_info->offset >= needs_reshard_end) {
+	needs_reshard_end = shards[si_end].shard_info->offset;
+	break;
+      }
+    }
+    if (si_end == shards.size()) {
+      needs_reshard_end = OBJECT_MAX_SIZE;
+    }
+    dout(20) << __func__ << "   shards [" << si_begin << "," << si_end << ")"
+	     << " over 0x[" << std::hex << needs_reshard_begin << ","
+	     << needs_reshard_end << ")" << std::dec << dendl;
+  }
+
+  fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
+
+  // we may need to fault in a larger interval later: we must have all
+  // referring extents for spanning blobs loaded in order to have
+  // accurate use_tracker values.
+  uint32_t spanning_scan_begin = needs_reshard_begin;
+  uint32_t spanning_scan_end = needs_reshard_end;
+
+  // remove old keys
+  string key;
+  for (unsigned i = si_begin; i < si_end; ++i) {
+    generate_extent_shard_key_and_apply(
+      onode->key, shards[i].shard_info->offset, &key,
+      [&](const string& final_key) {
+	t->rmkey(PREFIX_OBJ, final_key);
+      }
+      );
+  }
+
+  // calculate average extent size
+  unsigned bytes = 0;
+  unsigned extents = 0;
+  if (onode->onode.extent_map_shards.empty()) {
+    bytes = inline_bl.length();
+    extents = extent_map.size();
+  } else {
+    for (unsigned i = si_begin; i < si_end; ++i) {
+      bytes += shards[i].shard_info->bytes;
+      extents += shards[i].extents;
+    }
+  }
+  unsigned target = cct->_conf->bluestore_extent_map_shard_target_size;
+  unsigned slop = target *
+    cct->_conf->bluestore_extent_map_shard_target_size_slop;
+  unsigned extent_avg = bytes / std::max(1u, extents);
+  dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
+	   << ", slop " << slop << dendl;
+
+  // reshard
+  // estimate: running estimate of the encoded size of the shard being built
+  unsigned estimate = 0;
+  unsigned offset = needs_reshard_begin;
+  vector<bluestore_onode_t::shard_info> new_shard_info;
+  unsigned max_blob_end = 0;
+  Extent dummy(needs_reshard_begin);
+  for (auto e = extent_map.lower_bound(dummy);
+       e != extent_map.end();
+       ++e) {
+    if (e->logical_offset >= needs_reshard_end) {
+      break;
+    }
+    dout(30) << " extent " << *e << dendl;
+
+    // disfavor shard boundaries that span a blob
+    bool would_span = (e->logical_offset < max_blob_end) || e->blob_offset;
+    if (estimate &&
+	estimate + extent_avg > target + (would_span ? slop : 0)) {
+      // new shard
+      if (offset == needs_reshard_begin) {
+	// first boundary found: also emit the leading shard that starts at
+	// needs_reshard_begin
+	new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+	new_shard_info.back().offset = offset;
+	dout(20) << __func__ << "  new shard 0x" << std::hex << offset
+                 << std::dec << dendl;
+      }
+      offset = e->logical_offset;
+      new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+      new_shard_info.back().offset = offset;
+      dout(20) << __func__ << "  new shard 0x" << std::hex << offset
+	       << std::dec << dendl;
+      estimate = 0;
+    }
+    estimate += extent_avg;
+    // track the widest logical range touched by blobs of the extents in
+    // range; it is faulted in later for the spanning blob check
+    unsigned bs = e->blob_start();
+    if (bs < spanning_scan_begin) {
+      spanning_scan_begin = bs;
+    }
+    uint32_t be = e->blob_end();
+    if (be > max_blob_end) {
+      max_blob_end = be;
+    }
+    if (be > spanning_scan_end) {
+      spanning_scan_end = be;
+    }
+  }
+  if (new_shard_info.empty() && (si_begin > 0 ||
+				 si_end < shards.size())) {
+    // we resharded a partial range; we must produce at least one output
+    // shard
+    new_shard_info.emplace_back(bluestore_onode_t::shard_info());
+    new_shard_info.back().offset = needs_reshard_begin;
+    dout(20) << __func__ << "  new shard 0x" << std::hex << needs_reshard_begin
+	     << std::dec << " (singleton degenerate case)" << dendl;
+  }
+
+  auto& sv = onode->onode.extent_map_shards;
+  dout(20) << __func__ << "  new " << new_shard_info << dendl;
+  dout(20) << __func__ << "  old " << sv << dendl;
+  if (sv.empty()) {
+    // no old shards to keep
+    sv.swap(new_shard_info);
+    init_shards(true, true);
+  } else {
+    // splice in new shards
+    sv.erase(sv.begin() + si_begin, sv.begin() + si_end);
+    shards.erase(shards.begin() + si_begin, shards.begin() + si_end);
+    sv.insert(
+      sv.begin() + si_begin,
+      new_shard_info.begin(),
+      new_shard_info.end());
+    shards.insert(shards.begin() + si_begin, new_shard_info.size(), Shard());
+    si_end = si_begin + new_shard_info.size();
+
+    ceph_assert(sv.size() == shards.size());
+
+    // note that we need to update every shard_info of shards here,
+    // as sv might have been totally re-allocated above
+    for (unsigned i = 0; i < shards.size(); i++) {
+      shards[i].shard_info = &sv[i];
+    }
+
+    // mark newly added shards as dirty
+    for (unsigned i = si_begin; i < si_end; ++i) {
+      shards[i].loaded = true;
+      shards[i].dirty = true;
+    }
+  }
+  dout(20) << __func__ << "  fin " << sv << dendl;
+  inline_bl.clear();
+
+  if (sv.empty()) {
+    // no more shards; unspan all previously spanning blobs
+    auto p = spanning_blob_map.begin();
+    while (p != spanning_blob_map.end()) {
+      p->second->id = -1;
+      dout(30) << __func__ << " un-spanning " << *p->second << dendl;
+      p = spanning_blob_map.erase(p);
+    }
+  } else {
+    // identify new spanning blobs
+    dout(20) << __func__ << " checking spanning blobs 0x[" << std::hex
+	     << spanning_scan_begin << "," << spanning_scan_end << ")" << dendl;
+    if (spanning_scan_begin < needs_reshard_begin) {
+      fault_range(db, spanning_scan_begin,
+		  needs_reshard_begin - spanning_scan_begin);
+    }
+    if (spanning_scan_end > needs_reshard_end) {
+      fault_range(db, needs_reshard_end,
+		  spanning_scan_end - needs_reshard_end);
+    }
+    // [shard_start, shard_end) is the shard containing the extent being
+    // examined; sp points at the shard_info that follows it
+    auto sp = sv.begin() + si_begin;
+    auto esp = sv.end();
+    unsigned shard_start = sp->offset;
+    unsigned shard_end;
+    ++sp;
+    if (sp == esp) {
+      shard_end = OBJECT_MAX_SIZE;
+    } else {
+      shard_end = sp->offset;
+    }
+    // (shadows the identically-initialized 'dummy' declared earlier)
+    Extent dummy(needs_reshard_begin);
+
+    // bookkeeping for the "too many spanning blobs" debug dump:
+    // oid_slot is this onode's entry in dumped_onodes (if any), oldest_slot
+    // the entry with the smallest timestamp among those scanned
+    bool was_too_many_blobs_check = false;
+    auto too_many_blobs_threshold =
+      g_conf()->bluestore_debug_too_many_blobs_threshold;
+    auto& dumped_onodes = onode->c->onode_space.cache->dumped_onodes;
+    decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oid_slot = nullptr;
+    decltype(onode->c->onode_space.cache->dumped_onodes)::value_type* oldest_slot = nullptr;
+
+    for (auto e = extent_map.lower_bound(dummy); e != extent_map.end(); ++e) {
+      if (e->logical_offset >= needs_reshard_end) {
+	break;
+      }
+      dout(30) << " extent " << *e << dendl;
+      // advance to the shard that contains this extent
+      while (e->logical_offset >= shard_end) {
+	shard_start = shard_end;
+	ceph_assert(sp != esp);
+	++sp;
+	if (sp == esp) {
+	  shard_end = OBJECT_MAX_SIZE;
+	} else {
+	  shard_end = sp->offset;
+	}
+	dout(30) << __func__ << "  shard 0x" << std::hex << shard_start
+		 << " to 0x" << shard_end << std::dec << dendl;
+      }
+
+      if (e->blob_escapes_range(shard_start, shard_end - shard_start)) {
+	if (!e->blob->is_spanning()) {
+	  // We have two options: (1) split the blob into pieces at the
+	  // shard boundaries (and adjust extents accordingly), or (2)
+	  // mark it spanning.  We prefer to cut the blob if we can.  Note that
+	  // we may have to split it multiple times--potentially at every
+	  // shard boundary.
+	  bool must_span = false;
+	  BlobRef b = e->blob;
+	  if (b->can_split()) {
+	    uint32_t bstart = e->blob_start();
+	    uint32_t bend = e->blob_end();
+	    for (const auto& sh : shards) {
+	      // shard boundary strictly inside the blob's logical range
+	      if (bstart < sh.shard_info->offset &&
+		  bend > sh.shard_info->offset) {
+		uint32_t blob_offset = sh.shard_info->offset - bstart;
+		if (b->can_split_at(blob_offset)) {
+		  dout(20) << __func__ << "    splitting blob, bstart 0x"
+			   << std::hex << bstart << " blob_offset 0x"
+			   << blob_offset << std::dec << " " << *b << dendl;
+		  b = split_blob(b, blob_offset, sh.shard_info->offset);
+		  // switch b to the new right-hand side, in case it
+		  // *also* has to get split.
+		  bstart += blob_offset;
+		  onode->c->store->logger->inc(l_bluestore_blob_split);
+		} else {
+		  must_span = true;
+		  break;
+		}
+	      }
+	    }
+	  } else {
+	    must_span = true;
+	  }
+	  if (must_span) {
+            auto bid = allocate_spanning_blob_id();
+            b->id = bid;
+	    spanning_blob_map[b->id] = b;
+	    dout(20) << __func__ << "    adding spanning " << *b << dendl;
+	    // the first time the threshold is reached in this call, look up
+	    // the dump slot for this onode (done at most once per call)
+	    if (!was_too_many_blobs_check &&
+	      too_many_blobs_threshold &&
+	      spanning_blob_map.size() >= size_t(too_many_blobs_threshold)) {
+
+	      was_too_many_blobs_check = true;
+	      for (size_t i = 0; i < dumped_onodes.size(); ++i) {
+		if (dumped_onodes[i].first == onode->oid) {
+		  oid_slot = &dumped_onodes[i];
+		  break;
+		}
+		if (!oldest_slot || (oldest_slot &&
+		    dumped_onodes[i].second < oldest_slot->second)) {
+		  oldest_slot = &dumped_onodes[i];
+		}
+	      }
+	    }
+	  }
+	}
+      } else {
+	if (e->blob->is_spanning()) {
+	  spanning_blob_map.erase(e->blob->id);
+	  e->blob->id = -1;
+	  dout(30) << __func__ << "    un-spanning " << *e->blob << dendl;
+	}
+      }
+    }
+    // dump if the threshold was hit and this onode either has no slot yet
+    // or its slot's timestamp is at least 5 minutes old
+    bool do_dump = (!oid_slot && was_too_many_blobs_check) ||
+      (oid_slot &&
+	(mono_clock::now() - oid_slot->second >= make_timespan(5 * 60)));
+    if (do_dump) {
+      dout(0) << __func__
+	      << " spanning blob count exceeds threshold, "
+	      << spanning_blob_map.size() << " spanning blobs"
+	      << dendl;
+      _dump_onode<0>(cct, *onode);
+      if (oid_slot) {
+	oid_slot->second = mono_clock::now();
+      } else {
+	// no slot for this onode yet: take over the oldest one
+	ceph_assert(oldest_slot);
+	oldest_slot->first = onode->oid;
+	oldest_slot->second = mono_clock::now();
+      }
+    }
+  }
+
+  clear_needs_reshard();
+}
+
+// Encode the extents starting at the first one not ordered before
+// 'offset' and continuing while logical_offset < offset + length; the
+// result is appended to bl.
+//
+// Returns true -- without appending anything to bl -- if any extent in
+// range refers to a non-spanning blob that escapes [offset, offset+length);
+// in that case a reshard covering that blob is requested.  Returns false
+// on success.  If pn is non-null it receives the number of extents
+// encoded.
+//
+// The function makes two passes: the first computes an upper bound for the
+// encoded size (and detects the reshard condition), the second writes the
+// data through a contiguous appender of that size.
+bool BlueStore::ExtentMap::encode_some(
+  uint32_t offset,
+  uint32_t length,
+  bufferlist& bl,
+  unsigned *pn)
+{
+  Extent dummy(offset);
+  auto start = extent_map.lower_bound(dummy);
+  uint32_t end = offset + length;
+
+  __u8 struct_v = 2; // Version 2 differs from v1 in blob's ref_map
+                     // serialization only. Hence there is no specific
+                     // handling at ExtentMap level.
+
+  unsigned n = 0;
+  size_t bound = 0;
+  bool must_reshard = false;
+  // pass 1: size bound + reshard detection; also resets last_encoded_id on
+  // every blob in range so pass 2 can tell first use from reuse
+  for (auto p = start;
+       p != extent_map.end() && p->logical_offset < end;
+       ++p, ++n) {
+    ceph_assert(p->logical_offset >= offset);
+    p->blob->last_encoded_id = -1;
+    if (!p->blob->is_spanning() && p->blob_escapes_range(offset, length)) {
+      dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+	       << std::dec << " hit new spanning blob " << *p << dendl;
+      request_reshard(p->blob_start(), p->blob_end());
+      must_reshard = true;
+    }
+    if (!must_reshard) {
+      denc_varint(0, bound); // blobid
+      denc_varint(0, bound); // logical_offset
+      denc_varint(0, bound); // len
+      denc_varint(0, bound); // blob_offset
+
+      p->blob->bound_encode(
+        bound,
+        struct_v,
+        p->blob->shared_blob->get_sbid(),
+        false);
+    }
+  }
+  if (must_reshard) {
+    return true;
+  }
+
+  denc(struct_v, bound);
+  denc_varint(0, bound); // number of extents
+
+  {
+    // pass 2: actual encoding
+    auto app = bl.get_contiguous_appender(bound);
+    denc(struct_v, app);
+    denc_varint(n, app);
+    if (pn) {
+      *pn = n;
+    }
+
+    n = 0;
+    // pos: logical end of the previously encoded extent;
+    // prev_len: the most recently written extent length
+    uint64_t pos = 0;
+    uint64_t prev_len = 0;
+    for (auto p = start;
+	 p != extent_map.end() && p->logical_offset < end;
+	 ++p, ++n) {
+      // blobid carries flag bits in its low bits and, above
+      // BLOBID_SHIFT_BITS, either the spanning blob id or the 1-based
+      // index of the extent that first encoded this blob (0 = blob follows
+      // inline)
+      unsigned blobid;
+      bool include_blob = false;
+      if (p->blob->is_spanning()) {
+	blobid = p->blob->id << BLOBID_SHIFT_BITS;
+	blobid |= BLOBID_FLAG_SPANNING;
+      } else if (p->blob->last_encoded_id < 0) {
+	p->blob->last_encoded_id = n + 1;  // so it is always non-zero
+	include_blob = true;
+	blobid = 0;  // the decoder will infer the id from n
+      } else {
+	blobid = p->blob->last_encoded_id << BLOBID_SHIFT_BITS;
+      }
+      // each flag set below lets the corresponding field be omitted
+      if (p->logical_offset == pos) {
+	blobid |= BLOBID_FLAG_CONTIGUOUS;
+      }
+      if (p->blob_offset == 0) {
+	blobid |= BLOBID_FLAG_ZEROOFFSET;
+      }
+      if (p->length == prev_len) {
+	blobid |= BLOBID_FLAG_SAMELENGTH;
+      } else {
+	prev_len = p->length;
+      }
+      denc_varint(blobid, app);
+      if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+	denc_varint_lowz(p->logical_offset - pos, app);
+      }
+      if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+	denc_varint_lowz(p->blob_offset, app);
+      }
+      if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+	denc_varint_lowz(p->length, app);
+      }
+      pos = p->logical_end();
+      if (include_blob) {
+	p->blob->encode(app, struct_v, p->blob->shared_blob->get_sbid(), false);
+      }
+    }
+  }
+  /*derr << __func__ << bl << dendl;
+  derr << __func__ << ":";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+  */
+  return false;
+}
+
+/////////////////// BlueStore::ExtentMap::ExtentDecoder ///////////
+// Decode a single extent record from p into *le.
+//
+// The leading varint holds flag bits plus a blob reference.  Fields whose
+// flag is set are not present in the stream and are derived instead: the
+// logical offset continues at the decoder's running position 'pos', the
+// blob offset is 0, the length repeats 'prev_len'.
+// The blob is then resolved through one of the consume_* hooks: a spanning
+// blob id, an index of a blob decoded earlier (stored 1-based, hence the
+// -1), or -- when the reference is 0 -- a Blob decoded inline right here.
+// Advances pos by the extent length and increments extent_pos.
+void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
+  Extent* le,
+  __u8 struct_v,
+  bptr_c_it_t& p,
+  Collection* c)
+{
+  uint64_t blobid;
+  denc_varint(blobid, p);
+  if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+    uint64_t gap;
+    denc_varint_lowz(gap, p);
+    pos += gap;
+  }
+  le->logical_offset = pos;
+  if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+    denc_varint_lowz(le->blob_offset, p);
+  } else {
+    le->blob_offset = 0;
+  }
+  if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+    denc_varint_lowz(prev_len, p);
+  }
+  le->length = prev_len;
+  if (blobid & BLOBID_FLAG_SPANNING) {
+    consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
+  } else {
+    blobid >>= BLOBID_SHIFT_BITS;
+    if (blobid) {
+      consume_blobid(le, false, blobid - 1);
+    } else {
+      // blob is encoded inline; decoded without its ref map
+      Blob *b = new Blob();
+      uint64_t sbid = 0;
+      b->decode(p, struct_v, &sbid, false, c);
+      consume_blob(le, extent_pos, sbid, b);
+    }
+  }
+  pos += prev_len;
+  ++extent_pos;
+}
+
+// Decode an encoded run of extents from bl, which must consist of at most
+// one buffer.  Each extent is obtained from get_next_extent(), filled by
+// decode_extent() and handed to add_extent().  Asserts that the encoding
+// version is 1 or 2 and that the number of decoded extents matches the
+// count stored in the header.  Returns that count.
+unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
+  const bufferlist& bl, Collection* c)
+{
+  __u8 struct_v;
+  uint32_t num;
+
+  ceph_assert(bl.get_num_buffers() <= 1);
+  auto p = bl.front().begin_deep();
+  denc(struct_v, p);
+  // v2 only changes how a blob's ref_map is serialized, so nothing below
+  // needs to distinguish the two versions.
+  ceph_assert(struct_v == 1 || struct_v == 2);
+  denc_varint(num, p);
+
+  for (extent_pos = 0; !p.end(); ) {
+    Extent* le = get_next_extent();
+    decode_extent(le, struct_v, p, c);
+    add_extent(le);
+  }
+  ceph_assert(extent_pos == num);
+  return num;
+}
+
+// Decode the spanning blob section from p: a version byte (1 or 2,
+// asserted), a count, then for each blob its id followed by the blob
+// itself including its ref map.  Every decoded blob is passed to
+// consume_spanning_blob() together with its shared blob id.
+void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
+  bptr_c_it_t& p, Collection* c)
+{
+  __u8 struct_v;
+  denc(struct_v, p);
+  // v2 only changes how a blob's ref_map is serialized, so nothing at this
+  // level needs to distinguish the two versions.
+  ceph_assert(struct_v == 1 || struct_v == 2);
+
+  unsigned n;
+  denc_varint(n, p);
+  for (; n > 0; --n) {
+    BlueStore::BlobRef b(new Blob());
+    denc_varint(b->id, p);
+    uint64_t sbid = 0;
+    b->decode(p, struct_v, &sbid, true, c);
+    consume_spanning_blob(sbid, b);
+  }
+}
+
+/////////////////// BlueStore::ExtentMap::ExtentDecoderFull ///////////
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
+  BlueStore::Extent* le, bool spanning, uint64_t blobid) {
+  // Attaches an already known blob to `le`.  For spanning blobs `blobid`
+  // is the id used by the extent map's spanning blob lookup; otherwise it
+  // is an index into the `blobs` vector filled by consume_blob().
+  ceph_assert(le);
+  if (spanning) {
+    le->assign_blob(extent_map.get_spanning_blob(blobid));
+    return;
+  }
+
+  ceph_assert(blobid < blobs.size());
+  le->assign_blob(blobs[blobid]);
+  // ref_map of non-spanning blobs is built dynamically while decoding
+  le->blob->get_ref(extent_map.onode->c, le->blob_offset, le->length);
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
+  BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
+  // Registers a blob that was decoded inline with extent number
+  // `extent_no`, opens its shared blob and attaches it to `le`.
+  ceph_assert(le);
+
+  // remember the blob at the index of the extent that introduced it so a
+  // later consume_blobid() can find it
+  blobs.resize(extent_no + 1);
+  blobs.back() = b;
+
+  extent_map.onode->c->open_shared_blob(sbid, b);
+  le->assign_blob(b);
+  le->blob->get_ref(extent_map.onode->c, le->blob_offset, le->length);
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
+  uint64_t sbid, BlueStore::BlobRef b) {
+  // Stores the blob in the spanning blob map under its own id, then opens
+  // the shared blob identified by `sbid` for it.
+  auto& slot = extent_map.spanning_blob_map[b->id];
+  slot = b;
+  extent_map.onode->c->open_shared_blob(sbid, b);
+}
+
+// Supplies a freshly heap-allocated, default-constructed Extent.
+// ExtentDecoder::decode_some() requests one per extent it decodes and later
+// passes the same pointer to add_extent().
+BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
+{
+  return new Extent();
+}
+
+// Inserts the decoded extent `le` into the extent_map container of the
+// ExtentMap this decoder was constructed for.
+void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
+{
+  extent_map.extent_map.insert(*le);
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+  // Decode `bl` straight into this extent map using the full decoder and
+  // report how many extents the buffer contained.
+  ExtentDecoderFull full_decoder(*this);
+  return full_decoder.decode_some(bl, onode->c);
+}
+
+void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
+{
+  // Accumulates into `p` the size bound for what encode_spanning_blobs()
+  // writes: version byte, blob count, and per blob an id plus its body.
+  //
+  // Version 2 differs from v1 in the blob's ref_map serialization only,
+  // so nothing here is version specific.
+  __u8 struct_v = 2;
+  denc(struct_v, p);
+  denc_varint((uint32_t)0, p);
+
+  // bound contributed by a single varint-encoded id, applied once per blob
+  size_t per_id_bound = 0;
+  denc_varint((uint32_t)0, per_id_bound);
+  p += spanning_blob_map.size() * per_id_bound;
+
+  for (const auto& entry : spanning_blob_map) {
+    const auto& blob = entry.second;
+    blob->bound_encode(p, struct_v, blob->shared_blob->get_sbid(), true);
+  }
+}
+
+void BlueStore::ExtentMap::encode_spanning_blobs(
+  bufferlist::contiguous_appender& p)
+{
+  // Writes: version byte, varint blob count, then for every spanning blob
+  // its varint id followed by the encoded blob.
+  //
+  // Version 2 differs from v1 in the blob's ref_map serialization only,
+  // so nothing here is version specific.
+  __u8 struct_v = 2;
+  denc(struct_v, p);
+  denc_varint(spanning_blob_map.size(), p);
+
+  for (auto& entry : spanning_blob_map) {
+    auto& blob = entry.second;
+    denc_varint(blob->id, p);
+    blob->encode(p, struct_v, blob->shared_blob->get_sbid(), true);
+  }
+}
+
+void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
+{
+  // Rebuild `shards` so that it has one entry per persistent shard_info of
+  // the onode, each pointing at its shard_info and carrying the given
+  // loaded/dirty state.
+  auto& infos = onode->onode.extent_map_shards;
+  shards.resize(infos.size());
+
+  auto dst = shards.begin();
+  for (auto& info : infos) {
+    dst->shard_info = &info;
+    dst->loaded = loaded;
+    dst->dirty = dirty;
+    ++dst;
+  }
+}
+
+void BlueStore::ExtentMap::fault_range(
+  KeyValueDB *db,
+  uint32_t offset,
+  uint32_t length)
+{
+  // Makes sure every shard between the one containing `offset` and the one
+  // containing `offset + length` is loaded, reading and decoding missing
+  // shards from `db`.  Hit/miss perf counters are updated per shard.
+  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << dendl;
+  auto first = seek_shard(offset);
+  auto last = seek_shard(offset + length);
+  if (first < 0) {
+    return;
+  }
+  ceph_assert(last >= first);
+
+  string key;
+  for (auto idx = first; idx <= last; ++idx) {
+    ceph_assert((size_t)idx < shards.size());
+    auto shard = &shards[idx];
+    if (shard->loaded) {
+      onode->c->store->logger->inc(l_bluestore_onode_shard_hits);
+      continue;
+    }
+
+    dout(30) << __func__ << " opening shard 0x" << std::hex
+             << shard->shard_info->offset << std::dec << dendl;
+    bufferlist v;
+    generate_extent_shard_key_and_apply(
+      onode->key, shard->shard_info->offset, &key,
+      [&](const string& final_key) {
+        int r = db->get(PREFIX_OBJ, final_key, &v);
+        if (r < 0) {
+          derr << __func__ << " missing shard 0x" << std::hex
+               << shard->shard_info->offset << std::dec << " for " << onode->oid
+               << dendl;
+          ceph_assert(r >= 0);
+        }
+      }
+    );
+    shard->extents = decode_some(v);
+    shard->loaded = true;
+    dout(20) << __func__ << " open shard 0x" << std::hex
+             << shard->shard_info->offset
+             << " for range 0x" << offset << "~" << length << std::dec
+             << " (" << v.length() << " bytes)" << dendl;
+    // a shard that was never loaded cannot be dirty, and what we read must
+    // match the size recorded in the onode
+    ceph_assert(shard->dirty == false);
+    ceph_assert(v.length() == shard->shard_info->bytes);
+    onode->c->store->logger->inc(l_bluestore_onode_shard_misses);
+  }
+}
+
+// Marks every shard overlapping [offset, offset + length) dirty.
+//
+// With no shards the extent map is stored inline, so the cached inline
+// encoding (inline_bl) is dropped instead.  A zero `length` is treated as a
+// one byte range.  Aborts if an affected shard is not loaded, since an
+// unloaded shard cannot be marked dirty.
+void BlueStore::ExtentMap::dirty_range(
+  uint32_t offset,
+  uint32_t length)
+{
+  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << std::dec << dendl;
+  if (shards.empty()) {
+    dout(20) << __func__ << " mark inline shard dirty" << dendl;
+    inline_bl.clear();
+    return;
+  }
+  auto start = seek_shard(offset);
+  if (length == 0) {
+    length = 1;
+  }
+  // inclusive last byte of the range
+  auto last = seek_shard(offset + length - 1);
+  if (start < 0)
+    return;
+
+  ceph_assert(last >= start);
+  while (start <= last) {
+    ceph_assert((size_t)start < shards.size());
+    auto p = &shards[start];
+    if (!p->loaded) {
+      // leading space keeps the function name and the message apart
+      derr << __func__ << " on write 0x" << std::hex << offset
+           << "~" << length << " shard 0x" << p->shard_info->offset
+           << std::dec << " is not loaded, can't mark dirty" << dendl;
+      ceph_abort_msg("can't mark unloaded shard dirty");
+    }
+    if (!p->dirty) {
+      dout(20) << __func__ << " mark shard 0x" << std::hex
+               << p->shard_info->offset << std::dec << " dirty" << dendl;
+      p->dirty = true;
+    }
+    ++start;
+  }
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::find(
+  uint64_t offset)
+{
+  // Look up the extent that compares equal to a probe extent built from
+  // `offset`.
+  Extent probe(offset);
+  auto hit = extent_map.find(probe);
+  return hit;
+}
+
+BlueStore::extent_map_t::iterator BlueStore::ExtentMap::seek_lextent(
+  uint64_t offset)
+{
+  // Returns the extent preceding lower_bound(offset) when that extent's
+  // logical end lies beyond `offset`, otherwise lower_bound(offset) itself
+  // (which may be end()).
+  Extent probe(offset);
+  auto at_or_after = extent_map.lower_bound(probe);
+  if (at_or_after == extent_map.begin()) {
+    return at_or_after;
+  }
+  auto before = at_or_after;
+  --before;
+  if (before->logical_end() <= offset) {
+    return at_or_after;
+  }
+  return before;
+}
+
+BlueStore::extent_map_t::const_iterator BlueStore::ExtentMap::seek_lextent(
+  uint64_t offset) const
+{
+  // Const flavour: returns the extent preceding lower_bound(offset) when
+  // that extent's logical end lies beyond `offset`, otherwise
+  // lower_bound(offset) itself (which may be end()).
+  Extent probe(offset);
+  auto at_or_after = extent_map.lower_bound(probe);
+  if (at_or_after == extent_map.begin()) {
+    return at_or_after;
+  }
+  auto before = at_or_after;
+  --before;
+  if (before->logical_end() <= offset) {
+    return at_or_after;
+  }
+  return before;
+}
+
+bool BlueStore::ExtentMap::has_any_lextents(uint64_t offset, uint64_t length)
+{
+  // True when the first extent found by seek_lextent(offset) exists and
+  // starts before the end of [offset, offset + length).
+  auto first = seek_lextent(offset);
+  return first != extent_map.end() &&
+         first->logical_offset < offset + length;
+}
+
+// Merges adjacent logical extents around [offset, offset + length) that are
+// contiguous both logically and within the same blob, without merging across
+// a shard boundary.  Returns the number of extents removed by merging and
+// adds that number to the l_bluestore_extent_compress counter.
+int BlueStore::ExtentMap::compress_extent_map(
+  uint64_t offset,
+  uint64_t length)
+{
+  if (extent_map.empty())
+    return 0;
+  int removed = 0;
+  auto p = seek_lextent(offset);
+  if (p != extent_map.begin()) {
+    --p;  // start to the left of offset
+  }
+  // the caller should have just written to this region
+  ceph_assert(p != extent_map.end());
+
+  // identify the *next* shard
+  auto pshard = shards.begin();
+  while (pshard != shards.end() &&
+	 p->logical_offset >= pshard->shard_info->offset) {
+    ++pshard;
+  }
+  // shard_end is the start offset of the next shard, or OBJECT_MAX_SIZE
+  // when p lies in the last shard (or there are no shards)
+  uint64_t shard_end;
+  if (pshard != shards.end()) {
+    shard_end = pshard->shard_info->offset;
+  } else {
+    shard_end = OBJECT_MAX_SIZE;
+  }
+
+  // p is the candidate to grow, n is the extent right after it
+  auto n = p;
+  for (++n; n != extent_map.end(); p = n++) {
+    if (n->logical_offset > offset + length) {
+      break;  // stop after end
+    }
+    // absorb n into p for as long as n continues p in both logical and
+    // blob space, refers to the same blob, and starts before shard_end
+    while (n != extent_map.end() &&
+	   p->logical_end() == n->logical_offset &&
+	   p->blob == n->blob &&
+	   p->blob_offset + p->length == n->blob_offset &&
+	   n->logical_offset < shard_end) {
+      dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+	       << " next shard 0x" << shard_end << std::dec
+	       << " merging " << *p << " and " << *n << dendl;
+      p->length += n->length;
+      // advance n before the extent it pointed at is removed
+      rm(n++);
+      ++removed;
+    }
+    if (n == extent_map.end()) {
+      break;
+    }
+    // n crossed into the next shard: move shard_end one shard forward
+    if (n->logical_offset >= shard_end) {
+      ceph_assert(pshard != shards.end());
+      ++pshard;
+      if (pshard != shards.end()) {
+	shard_end = pshard->shard_info->offset;
+      } else {
+	shard_end = OBJECT_MAX_SIZE;
+      }
+    }
+  }
+  if (removed) {
+    onode->c->store->logger->inc(l_bluestore_extent_compress, removed);
+  }
+  return removed;
+}
+
+// Removes the logical range [offset, offset + length) from the extent map.
+// Extents fully inside the range are removed; extents straddling either
+// boundary are trimmed (or split when the range lies strictly inside one
+// extent).  Every removed piece is recorded as an OldExtent appended to
+// `old_extents`, which must not be null.
+void BlueStore::ExtentMap::punch_hole(
+  CollectionRef &c, 
+  uint64_t offset,
+  uint64_t length,
+  old_extent_map_t *old_extents)
+{
+  auto p = seek_lextent(offset);
+  uint64_t end = offset + length;
+  while (p != extent_map.end()) {
+    if (p->logical_offset >= end) {
+      break;
+    }
+    if (p->logical_offset < offset) {
+      if (p->logical_end() > end) {
+	// split and deref middle
+	uint64_t front = offset - p->logical_offset;
+	OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + front, 
+					  length, p->blob);
+	old_extents->push_back(*oe);
+	// re-add the part after the hole as a new extent on the same blob
+	add(end,
+	    p->blob_offset + front + length,
+	    p->length - front - length,
+	    p->blob);
+	p->length = front;
+	break;
+      } else {
+	// deref tail
+	ceph_assert(p->logical_end() > offset); // else seek_lextent bug
+	uint64_t keep = offset - p->logical_offset;
+	OldExtent* oe = OldExtent::create(c, offset, p->blob_offset + keep,
+					  p->length - keep, p->blob);
+	old_extents->push_back(*oe);
+	p->length = keep;
+	++p;
+	continue;
+      }
+    }
+    if (p->logical_offset + p->length <= end) {
+      // deref whole lextent
+      OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+				        p->length, p->blob);
+      old_extents->push_back(*oe);
+      // step to the next extent before this one is removed
+      rm(p++);
+      continue;
+    }
+    // deref head
+    uint64_t keep = p->logical_end() - end;
+    OldExtent* oe = OldExtent::create(c, p->logical_offset, p->blob_offset,
+				      p->length - keep, p->blob);
+    old_extents->push_back(*oe);
+
+    // the surviving tail starts at `end`, so it is re-added under its new
+    // logical offset and the original extent is removed
+    add(end, p->blob_offset + p->length - keep, keep, p->blob);
+    rm(p);
+    break;
+  }
+}
+
+BlueStore::Extent *BlueStore::ExtentMap::set_lextent(
+  CollectionRef &c,
+  uint64_t logical_offset,
+  uint64_t blob_offset, uint64_t length, BlobRef b,
+  old_extent_map_t *old_extents)
+{
+  // Installs a new logical extent mapping [logical_offset, +length) onto
+  // blob `b` at `blob_offset` and returns it.  When `old_extents` is
+  // given, whatever previously covered that range is punched out first
+  // and collected there.
+
+  // The blob must be completely initialized before its ref counters can
+  // be incremented.
+  ceph_assert(b->get_blob().get_logical_length() != 0);
+
+  // Take the reference before punching the hole; otherwise a reused blob
+  // that is overwritten totally could end up in the old_extents list.
+  // This might happen during WAL overwrite.
+  b->get_ref(onode->c, blob_offset, length);
+
+  if (old_extents != nullptr) {
+    punch_hole(c, logical_offset, length, old_extents);
+  }
+
+  Extent *fresh = new Extent(logical_offset, blob_offset, length, b);
+  extent_map.insert(*fresh);
+
+  const bool crosses_shard = spans_shard(logical_offset, length);
+  if (crosses_shard) {
+    request_reshard(logical_offset, logical_offset + length);
+  }
+  return fresh;
+}
+
+BlueStore::BlobRef BlueStore::ExtentMap::split_blob(
+  BlobRef lb,
+  uint32_t blob_offset,
+  uint32_t pos)
+{
+  // Splits blob `lb` at `blob_offset` (logical position `pos`) into `lb`
+  // and a newly created right-hand blob, then re-targets every extent of
+  // `lb` at or beyond `pos` to the new blob.  Returns the new blob.
+  uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset;
+  dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos
+           << " blob_offset 0x" << blob_offset << std::dec << " " << *lb
+           << dendl;
+  BlobRef rb = onode->c->new_blob();
+  lb->split(onode->c, blob_offset, rb.get());
+
+  for (auto it = seek_lextent(pos);
+       it != extent_map.end() && it->logical_offset < end_pos;
+       ++it) {
+    if (it->blob != lb) {
+      continue;
+    }
+    if (it->logical_offset >= pos) {
+      // extent lies entirely in the right half: switch blob
+      ceph_assert(it->blob_offset >= blob_offset);
+
+      it->blob = rb;
+      it->blob_offset -= blob_offset;
+      dout(30) << __func__ << "  adjusted " << *it << dendl;
+      continue;
+    }
+    // extent straddles the split point: keep the left part on lb and
+    // create a new extent for the right part at offset 0 of rb
+    size_t left = pos - it->logical_offset;
+    Extent *right = new Extent(pos, 0, it->length - left, rb);
+    extent_map.insert(*right);
+    it->length = left;
+    dout(30) << __func__ << "  split " << *it << dendl;
+    dout(30) << __func__ << "     to " << *right << dendl;
+  }
+  return rb;
+}
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.onode(" << this << ")." << __func__ << " "
+
+const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags)
+{
+  // Picks the key prefix for omap data according to the onode flags.
+  // The checks are ordered: pgmeta wins over per-pg, which wins over
+  // per-pool; with none of them set the plain omap prefix is used.
+  if (bluestore_onode_t::is_pgmeta_omap(flags)) {
+    return PREFIX_PGMETA_OMAP;
+  } else if (bluestore_onode_t::is_perpg_omap(flags)) {
+    return PREFIX_PERPG_OMAP;
+  } else if (bluestore_onode_t::is_perpool_omap(flags)) {
+    return PREFIX_PERPOOL_OMAP;
+  } else {
+    return PREFIX_OMAP;
+  }
+}
+
+// Key suffix bytes sort as '-' (omap header) < '.' (user keys) < '~' (tail)
+void BlueStore::Onode::calc_omap_header(
+  uint8_t flags,
+  const Onode* o,
+  std::string* out)
+{
+  // Appends the omap header key of `o` to `out`:
+  //   per-pg omap:   pool, bitwise hash key, nid, '-'
+  //   per-pool omap: pool, nid, '-'
+  //   otherwise (including pgmeta): nid, '-'
+  const bool pgmeta = bluestore_onode_t::is_pgmeta_omap(flags);
+  if (!pgmeta && bluestore_onode_t::is_perpg_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+    _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+  } else if (!pgmeta && bluestore_onode_t::is_perpool_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+  }
+  _key_encode_u64(o->onode.nid, out);
+  out->push_back('-');
+}
+
+void BlueStore::Onode::calc_omap_key(uint8_t flags,
+				    const Onode* o,
+				    const std::string& key,
+				    std::string* out)
+{
+  // Appends the full omap key for user key `key` of `o` to `out`:
+  //   per-pg omap:   pool, bitwise hash key, nid, '.', key
+  //   per-pool omap: pool, nid, '.', key
+  //   otherwise (including pgmeta): nid, '.', key
+  const bool pgmeta = bluestore_onode_t::is_pgmeta_omap(flags);
+  if (!pgmeta && bluestore_onode_t::is_perpg_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+    _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+  } else if (!pgmeta && bluestore_onode_t::is_perpool_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+  }
+  _key_encode_u64(o->onode.nid, out);
+  out->push_back('.');
+  out->append(key);
+}
+
+void BlueStore::Onode::calc_omap_tail(
+  uint8_t flags,
+  const Onode* o,
+  std::string* out)
+{
+  // Appends the omap tail key of `o` to `out`:
+  //   per-pg omap:   pool, bitwise hash key, nid, '~'
+  //   per-pool omap: pool, nid, '~'
+  //   otherwise (including pgmeta): nid, '~'
+  const bool pgmeta = bluestore_onode_t::is_pgmeta_omap(flags);
+  if (!pgmeta && bluestore_onode_t::is_perpg_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+    _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out);
+  } else if (!pgmeta && bluestore_onode_t::is_perpool_omap(flags)) {
+    _key_encode_u64(o->c->pool(), out);
+  }
+  _key_encode_u64(o->onode.nid, out);
+  out->push_back('~');
+}
+
+// Takes a reference: bumps both the object reference count (nref) and the
+// pin count (pin_nref).  put() is the counterpart.
+void BlueStore::Onode::get()
+{
+  ++nref;
+  ++pin_nref;
+}
+// Drops a reference taken with get().
+// When the pin count falls to exactly 1 the collection's onode cache is
+// asked to consider unpinning this onode; when the reference count reaches
+// zero the onode deletes itself.
+// NOTE(review): the remaining pin at 1 is presumably the cache's own
+// reference - confirm against the onode cache implementation.
+void BlueStore::Onode::put()
+{
+  if (--pin_nref == 1) {
+    c->get_onode_cache()->maybe_unpin(this);
+  }
+  // must stay last: nothing may touch `this` after the delete
+  if (--nref == 0) {
+    delete this;
+  }
+}
+
+void BlueStore::Onode::decode_raw(
+  BlueStore::Onode* on,
+  const bufferlist& v,
+  BlueStore::ExtentMap::ExtentDecoder& edecoder)
+{
+  // Decodes the stored onode value `v` into `on`: the onode body first,
+  // then the spanning blobs, and - only when the extent map has no shards -
+  // the inline extent map that follows.  Extent/blob handling is delegated
+  // to `edecoder`.
+  on->exists = true;
+  auto it = v.front().begin_deep();
+  on->onode.decode(it);
+
+  // initialize extent_map
+  edecoder.decode_spanning_blobs(it, on->c);
+  if (!on->onode.extent_map_shards.empty()) {
+    return;  // sharded extent map: nothing inline to decode here
+  }
+  denc(on->extent_map.inline_bl, it);
+  edecoder.decode_some(on->extent_map.inline_bl, on->c);
+}
+
+BlueStore::Onode* BlueStore::Onode::create_decode(
+  CollectionRef c,
+  const ghobject_t& oid,
+  const string& key,
+  const bufferlist& v,
+  bool allow_empty)
+{
+  // Creates a new Onode for `oid`/`key` in collection `c`.  A non-empty
+  // `v` is decoded into it; an empty `v` is only accepted when
+  // `allow_empty` is set and yields an undecoded onode.
+  ceph_assert(v.length() || allow_empty);
+  Onode* on = new Onode(c.get(), oid, key);
+  if (!v.length()) {
+    return on;
+  }
+
+  ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
+  decode_raw(on, v, edecoder);
+
+  // account decoded attrs to the cache-meta mempool
+  for (auto& attr : on->onode.attrs) {
+    attr.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+  }
+
+  // initialize extent_map
+  if (!on->onode.extent_map_shards.empty()) {
+    on->extent_map.init_shards(false, false);
+  } else {
+    on->extent_map.inline_bl.reassign_to_mempool(
+      mempool::mempool_bluestore_cache_data);
+  }
+  return on;
+}
+
+// Blocks the caller until flushing_count is zero.
+// While waiting, waiting_count is incremented so the number of waiters is
+// visible; the wait itself happens on flush_cond under flush_lock.
+void BlueStore::Onode::flush()
+{
+  // unlocked fast-path check; re-checked under the lock below
+  if (flushing_count.load()) {
+    ldout(c->store->cct, 20) << __func__ << " cnt:" << flushing_count << dendl;
+    waiting_count++;
+    std::unique_lock l(flush_lock);
+    while (flushing_count.load()) {
+      flush_cond.wait(l);
+    }
+    waiting_count--;
+  }
+  ldout(c->store->cct, 20) << __func__ << " done" << dendl;
+}
+
+// Dumps the persistent onode followed by the extent map to formatter `f`.
+void BlueStore::Onode::dump(Formatter* f) const
+{
+  onode.dump(f);
+  extent_map.dump(f);
+}
+
+void BlueStore::Onode::rewrite_omap_key(const string& old, string *out)
+{
+  // Builds into `out` a key whose prefix (pool / hash / nid, depending on
+  // the omap flavour of this onode) is regenerated from the current onode,
+  // followed by whatever `old` carries past that many bytes.
+  const bool pgmeta = onode.is_pgmeta_omap();
+  if (!pgmeta && onode.is_perpg_omap()) {
+    _key_encode_u64(c->pool(), out);
+    _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out);
+  } else if (!pgmeta && onode.is_perpool_omap()) {
+    _key_encode_u64(c->pool(), out);
+  }
+  _key_encode_u64(onode.nid, out);
+
+  // copy the remainder of the old key, skipping as many bytes as `out`
+  // holds at this point
+  const auto prefix_len = out->length();
+  out->append(old.c_str() + prefix_len, old.size() - prefix_len);
+}
+
+void BlueStore::Onode::decode_omap_key(const string& key, string *user_key)
+{
+  // Strips the fixed-size prefix from a stored omap key and returns the
+  // rest in `user_key`.  The prefix always holds a 64-bit value plus one
+  // separator byte; per-pg keys add a 64-bit and a 32-bit field in front,
+  // per-pool keys a 64-bit field.
+  size_t skip = sizeof(uint64_t) + 1;
+  const bool pgmeta = onode.is_pgmeta_omap();
+  if (!pgmeta && onode.is_perpg_omap()) {
+    skip += sizeof(uint64_t) + sizeof(uint32_t);
+  } else if (!pgmeta && onode.is_perpool_omap()) {
+    skip += sizeof(uint64_t);
+  }
+  *user_key = key.substr(skip);
+}
+
+// =======================================================
+// WriteContext
+ 
+/// Checks for writes to the same pextent within a blob.
+///
+/// Every write already queued in `writes` that targets blob `b` is widened
+/// to min_alloc_size boundaries and compared against [loffs, loffs_end).
+///
+/// @param b              blob the prospective write goes to
+/// @param loffs          start of the range; must be a multiple of min_alloc_size
+/// @param loffs_end      end of the range; must be a multiple of min_alloc_size
+/// @param min_alloc_size allocation unit used for aligning queued writes
+/// @return true if a queued write to the same blob overlaps the range
+bool BlueStore::WriteContext::has_conflict(
+  BlobRef b,
+  uint64_t loffs,
+  uint64_t loffs_end,
+  uint64_t min_alloc_size)
+{
+  ceph_assert((loffs % min_alloc_size) == 0);
+  ceph_assert((loffs_end % min_alloc_size) == 0);
+  // iterate by const reference: the items are only inspected, so copying
+  // each queued write (and its blob reference) per iteration is wasted work
+  for (const auto& w : writes) {
+    if (b == w.b) {
+      auto loffs2 = p2align(w.logical_offset, min_alloc_size);
+      auto loffs2_end = p2roundup(w.logical_offset + w.length0, min_alloc_size);
+      if ((loffs <= loffs2 && loffs_end > loffs2) ||
+          (loffs >= loffs2 && loffs < loffs2_end)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+ 
+// =======================================================
+
+// DeferredBatch
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.DeferredBatch(" << this << ") "
+#undef dout_context
+#define dout_context cct
+
+void BlueStore::DeferredBatch::prepare_write(
+  CephContext *cct,
+  uint64_t seq, uint64_t offset, uint64_t length,
+  bufferlist::const_iterator& blp)
+{
+  // Queues `length` bytes taken from `blp` as the deferred write at
+  // `offset` for sequence `seq`.  Anything previously queued for the same
+  // range is discarded first, so the insertion below must create a new
+  // map entry.
+  _discard(cct, offset, length);
+  auto res = iomap.insert(make_pair(offset, deferred_io()));
+  ceph_assert(res.second);  // this should be a new insertion
+
+  auto& io = res.first->second;
+  io.seq = seq;
+  blp.copy(length, io.bl);
+  io.bl.reassign_to_mempool(mempool::mempool_bluestore_writing_deferred);
+  dout(20) << __func__ << " seq " << seq
+           << " 0x" << std::hex << offset << "~" << length
+           << " crc " << io.bl.crc32c(-1)
+           << std::dec << dendl;
+  // per-sequence byte accounting
+  seq_bytes[seq] += length;
+#ifdef DEBUG_DEFERRED
+  _audit(cct);
+#endif
+}
+
+// Removes the byte range [offset, offset + length) from the queued deferred
+// writes in iomap.  Entries fully inside the range are erased; entries that
+// straddle a boundary are trimmed, keeping the part outside the range (a
+// kept tail is re-inserted keyed at offset + length).  seq_bytes is reduced
+// by the number of bytes dropped for each affected sequence.
+void BlueStore::DeferredBatch::_discard(
+  CephContext *cct, uint64_t offset, uint64_t length)
+{
+  generic_dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+		   << std::dec << dendl;
+  auto p = iomap.lower_bound(offset);
+  // first look at the entry starting before `offset`: it may reach into
+  // the discarded range
+  if (p != iomap.begin()) {
+    --p;
+    auto end = p->first + p->second.bl.length();
+    if (end > offset) {
+      bufferlist head;
+      head.substr_of(p->second.bl, 0, offset - p->first);
+      dout(20) << __func__ << "  keep head " << p->second.seq
+	       << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+	       << " -> 0x" << head.length() << std::dec << dendl;
+      auto i = seq_bytes.find(p->second.seq);
+      ceph_assert(i != seq_bytes.end());
+      if (end > offset + length) {
+	// the entry extends past both ends of the range: keep head and tail,
+	// dropping exactly `length` bytes from the middle
+	bufferlist tail;
+	tail.substr_of(p->second.bl, offset + length - p->first,
+		       end - (offset + length));
+	dout(20) << __func__ << "  keep tail " << p->second.seq
+		 << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+		 << " -> 0x" << tail.length() << std::dec << dendl;
+	auto &n = iomap[offset + length];
+	n.bl.swap(tail);
+	n.seq = p->second.seq;
+	i->second -= length;
+      } else {
+	i->second -= end - offset;
+      }
+      ceph_assert(i->second >= 0);
+      p->second.bl.swap(head);
+    }
+    ++p;
+  }
+  // then walk the entries starting at or after `offset`
+  while (p != iomap.end()) {
+    if (p->first >= offset + length) {
+      break;
+    }
+    auto i = seq_bytes.find(p->second.seq);
+    ceph_assert(i != seq_bytes.end());
+    auto end = p->first + p->second.bl.length();
+    if (end > offset + length) {
+      // entry reaches past the range: keep its tail under a new key
+      unsigned drop_front = offset + length - p->first;
+      unsigned keep_tail = end - (offset + length);
+      dout(20) << __func__ << "  truncate front " << p->second.seq
+	       << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+	       << " drop_front 0x" << drop_front << " keep_tail 0x" << keep_tail
+	       << " to 0x" << (offset + length) << "~" << keep_tail
+	       << std::dec << dendl;
+      auto &s = iomap[offset + length];
+      s.seq = p->second.seq;
+      s.bl.substr_of(p->second.bl, drop_front, keep_tail);
+      i->second -= drop_front;
+    } else {
+      dout(20) << __func__ << "  drop " << p->second.seq
+	       << " 0x" << std::hex << p->first << "~" << p->second.bl.length()
+	       << std::dec << dendl;
+      i->second -= p->second.bl.length();
+    }
+    ceph_assert(i->second >= 0);
+    // the original entry goes away in both cases
+    p = iomap.erase(p);
+  }
+}
+
+void BlueStore::DeferredBatch::_audit(CephContext *cct)
+{
+  // Consistency check: recompute the per-sequence byte totals from iomap
+  // and assert that they equal seq_bytes, while also asserting that no
+  // queued entry starts before the previous one ends.
+  map<uint64_t,int> tally;
+  for (const auto& kv : seq_bytes) {
+    tally[kv.first] = 0;  // make sure we have the same set of keys
+  }
+
+  uint64_t next_free = 0;
+  for (auto& kv : iomap) {
+    ceph_assert(kv.first >= next_free);
+    const auto len = kv.second.bl.length();
+    tally[kv.second.seq] += len;
+    next_free = kv.first + len;
+  }
+  ceph_assert(tally == seq_bytes);
+}
+
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << store->path << ").collection(" << cid << " " << this << ") "
+
+// Constructs a collection `cid` belonging to `store_`.
+// `oc` is handed to the collection's onode_space and `bc` becomes its
+// buffer cache shard; the collection starts out existing and without a
+// commit queue.
+BlueStore::Collection::Collection(BlueStore *store_, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t cid)
+  : CollectionImpl(store_->cct, cid),
+    store(store_),
+    cache(bc),
+    exists(true),
+    onode_space(oc),
+    commit_queue(nullptr)
+{
+}
+
+// Forwards to the collection's sequencer (osr) and returns its result.
+bool BlueStore::Collection::flush_commit(Context *c)
+{
+  return osr->flush_commit(c);
+}
+
+// Forwards to the collection's sequencer (osr).
+void BlueStore::Collection::flush()
+{
+  osr->flush();
+}
+
+// Forwards to the collection's sequencer (osr).
+void BlueStore::Collection::flush_all_but_last()
+{
+  osr->flush_all_but_last();
+}
+
+void BlueStore::Collection::open_shared_blob(uint64_t sbid, BlobRef b)
+{
+  // Gives `b` its SharedBlob.  A blob that is not flagged shared gets a
+  // private SharedBlob; a shared one reuses the instance registered under
+  // `sbid` in shared_blob_set, or creates and registers a new one.
+  ceph_assert(!b->shared_blob);
+  if (!b->get_blob().is_shared()) {
+    b->shared_blob = new SharedBlob(this);
+    return;
+  }
+
+  b->shared_blob = shared_blob_set.lookup(sbid);
+  if (!b->shared_blob) {
+    b->shared_blob = new SharedBlob(sbid, this);
+    shared_blob_set.add(this, b->shared_blob.get());
+    ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+                          << std::dec << " opened " << *b->shared_blob
+                          << dendl;
+    return;
+  }
+  ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+                        << std::dec << " had " << *b->shared_blob << dendl;
+}
+
+void BlueStore::Collection::load_shared_blob(SharedBlobRef sb)
+{
+  // Loads the persistent part of `sb` from the PREFIX_SHARED_BLOB
+  // namespace of the store's db unless it is loaded already.  A missing
+  // record aborts.
+  if (sb->is_loaded()) {
+    return;
+  }
+
+  auto sbid = sb->get_sbid();
+  string key;
+  get_shared_blob_key(sbid, &key);
+
+  bufferlist v;
+  int r = store->db->get(PREFIX_SHARED_BLOB, key, &v);
+  if (r < 0) {
+    lderr(store->cct) << __func__ << " sbid 0x" << std::hex << sbid
+                      << std::dec << " not found at key "
+                      << pretty_binary_string(key) << dendl;
+    ceph_abort_msg("uh oh, missing shared_blob");
+  }
+
+  sb->loaded = true;
+  sb->persistent = new bluestore_shared_blob_t(sbid);
+  auto it = v.cbegin();
+  decode(*(sb->persistent), it);
+  ldout(store->cct, 10) << __func__ << " sbid 0x" << std::hex << sbid
+                        << std::dec << " loaded shared_blob " << *sb << dendl;
+}
+
+void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b)
+{
+  // Turns `b` into a shared blob with id `sbid`: flags the blob as shared,
+  // gives its SharedBlob a fresh persistent record, registers it in
+  // shared_blob_set and takes a reference on every valid physical extent.
+  ldout(store->cct, 10) << __func__ << " " << *b << dendl;
+  ceph_assert(!b->shared_blob->is_loaded());
+
+  // update blob
+  bluestore_blob_t& dirty = b->dirty_blob();
+  dirty.set_flag(bluestore_blob_t::FLAG_SHARED);
+
+  // update shared blob
+  auto& shared = b->shared_blob;
+  shared->loaded = true;
+  shared->persistent = new bluestore_shared_blob_t(sbid);
+  shared_blob_set.add(this, shared.get());
+  for (auto pext : dirty.get_extents()) {
+    if (!pext.is_valid()) {
+      continue;
+    }
+    shared->get_ref(pext.offset, pext.length);
+  }
+  ldout(store->cct, 20) << __func__ << " now " << *b << dendl;
+}
+
+// Reverts a loaded shared blob to the unshared state: unregisters it from
+// shared_blob_set, frees its persistent record and clears its unloaded sbid.
+// Returns the sbid the blob had on entry.
+uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb)
+{
+  ldout(store->cct, 10) << __func__ << " " << *sb << dendl;
+  ceph_assert(sb->is_loaded());
+
+  // capture the id before any state is torn down
+  uint64_t sbid = sb->get_sbid();
+  shared_blob_set.remove(sb);
+  sb->loaded = false;
+  delete sb->persistent;
+  // NOTE(review): presumably sbid_unloaded shares storage with `persistent`,
+  // so this also overwrites the dangling pointer - confirm in SharedBlob
+  sb->sbid_unloaded = 0;
+  ldout(store->cct, 20) << __func__ << " now " << *sb << dendl;
+  return sbid;
+}
+
+BlueStore::OnodeRef BlueStore::Collection::get_onode(
+  const ghobject_t& oid,
+  bool create,
+  bool is_createop)
+{
+  // Returns the onode for `oid`: from the in-memory onode space if present,
+  // otherwise decoded from the db.  When nothing is stored, an empty
+  // OnodeRef is returned unless `create` is set, in which case a new onode
+  // is added.  With `is_createop` the db lookup is skipped altogether.
+  // Creating requires the collection lock held for write.
+  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
+
+  spg_t pgid;
+  if (cid.is_pg(&pgid) && !oid.match(cnode.bits, pgid.ps())) {
+    lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+                      << pgid << " bits " << cnode.bits << dendl;
+    ceph_abort();
+  }
+
+  OnodeRef o = onode_space.lookup(oid);
+  if (o) {
+    return o;
+  }
+
+  string key;
+  get_object_key(store->cct, oid, &key);
+
+  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+                        << pretty_binary_string(key) << dendl;
+
+  bufferlist v;
+  int r = -ENOENT;
+  if (!is_createop) {
+    r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
+    ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+  }
+  if (v.length() != 0) {
+    ceph_assert(r >= 0);
+  } else {
+    ceph_assert(r == -ENOENT);
+    if (!create) {
+      return OnodeRef();
+    }
+  }
+
+  // new object, load onode if available
+  o.reset(Onode::create_decode(this, oid, key, v, true));
+  return onode_space.add_onode(oid, o);
+}
+
+// Moves cached state that now belongs to `dest` out of this collection.
+// Every onode whose oid matches dest's pg bits is moved to dest's onode map
+// (and, if cached, to dest's onode cache shard), together with the shared
+// blobs it references and their buffers that are not being written.
+// `dest` must be a pg collection.  dest's buffer cache is trimmed at the end.
+void BlueStore::Collection::split_cache(
+  Collection *dest)
+{
+  ldout(store->cct, 10) << __func__ << " to " << dest << dendl;
+
+  auto *ocache = get_onode_cache();
+  auto *ocache_dest = dest->get_onode_cache();
+
+ // lock cache shards
+  // std::lock acquires all four without deadlocking; the guards below only
+  // adopt the already held locks for release at scope exit
+  std::lock(ocache->lock, ocache_dest->lock, cache->lock, dest->cache->lock);
+  std::lock_guard l(ocache->lock, std::adopt_lock);
+  std::lock_guard l2(ocache_dest->lock, std::adopt_lock);
+  std::lock_guard l3(cache->lock, std::adopt_lock);
+  std::lock_guard l4(dest->cache->lock, std::adopt_lock);
+
+  int destbits = dest->cnode.bits;
+  spg_t destpg;
+  bool is_pg = dest->cid.is_pg(&destpg);
+  ceph_assert(is_pg);
+
+  auto p = onode_space.onode_map.begin();
+  while (p != onode_space.onode_map.end()) {
+    OnodeRef o = p->second;
+    if (!p->second->oid.match(destbits, destpg.pgid.ps())) {
+      // onode does not belong to this child
+      ldout(store->cct, 20) << __func__ << " not moving " << o << " " << o->oid
+			    << dendl;
+      ++p;
+    } else {
+      ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid
+			    << dendl;
+
+      // ensuring that nref is always >= 2 and hence onode is pinned
+      OnodeRef o_pin = o;
+
+      // erase returns the following element, which keeps the walk valid
+      p = onode_space.onode_map.erase(p);
+      dest->onode_space.onode_map[o->oid] = o;
+      if (o->cached) {
+        get_onode_cache()->_move_pinned(dest->get_onode_cache(), o.get());
+      }
+      o->c = dest;
+
+      // move over shared blobs and buffers.  cover shared blobs from
+      // both extent map and spanning blob map (the full extent map
+      // may not be faulted in)
+      vector<SharedBlob*> sbvec;
+      for (auto& e : o->extent_map.extent_map) {
+	sbvec.push_back(e.blob->shared_blob.get());
+      }
+      for (auto& b : o->extent_map.spanning_blob_map) {
+	sbvec.push_back(b.second->shared_blob.get());
+      }
+      for (auto sb : sbvec) {
+	// the same shared blob can appear more than once in sbvec
+	if (sb->coll == dest) {
+	  ldout(store->cct, 20) << __func__ << "  already moved " << *sb
+				<< dendl;
+	  continue;
+	}
+	ldout(store->cct, 20) << __func__ << "  moving " << *sb << dendl;
+	// only blobs with a non-zero sbid have their set registration moved
+	if (sb->get_sbid()) {
+	  ldout(store->cct, 20) << __func__
+				<< "   moving registration " << *sb << dendl;
+	  shared_blob_set.remove(sb);
+	  dest->shared_blob_set.add(dest, sb);
+	}
+	sb->coll = dest;
+	// buffers only need to move when the two collections use different
+	// buffer cache shards; buffers being written are left where they are
+	if (dest->cache != cache) {
+	  for (auto& i : sb->bc.buffer_map) {
+	    if (!i.second->is_writing()) {
+	      ldout(store->cct, 20) << __func__ << "   moving " << *i.second
+				    << dendl;
+	      dest->cache->_move(cache, i.second.get());
+	    }
+	  }
+	}
+      }
+    }
+  }
+  dest->cache->_trim();
+}
+
+// =======================================================
+
+// MempoolThread
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.MempoolThread(" << this << ") "
+#undef dout_context
+#define dout_context store->cct
+
+// Thread body of the mempool/cache maintenance thread.
+// After one-time setup of the priority cache manager (only when cache
+// autotuning is enabled and the db exposes a priority cache) it loops until
+// `stop` is set.  Each pass runs whichever periodic tasks are due
+// (allocation stats dump, cache age-bin rotation, cache balancing, memory
+// tuning, forced deferred submit), resizes the cache shards, updates the
+// logger and then sleeps on `cond` for bluestore_cache_trim_interval.
+void *BlueStore::MempoolThread::entry()
+{
+  std::unique_lock l{lock};
+
+  uint32_t prev_config_change = store->config_changed.load();
+  uint64_t base = store->osd_memory_base;
+  double fragmentation = store->osd_memory_expected_fragmentation;
+  uint64_t target = store->osd_memory_target;
+  uint64_t min = store->osd_memory_cache_min;
+  uint64_t max = min;
+
+  // When setting the maximum amount of memory to use for cache, first 
+  // assume some base amount of memory for the OSD and then fudge in
+  // some overhead for fragmentation that scales with cache usage.
+  uint64_t ltarget = (1.0 - fragmentation) * target;
+  if (ltarget > base + min) {
+    max = ltarget - base;
+  }
+
+  binned_kv_cache = store->db->get_priority_cache();
+  binned_kv_onode_cache = store->db->get_priority_cache(PREFIX_OBJ);
+  if (store->cache_autotune && binned_kv_cache != nullptr) {
+    pcm = std::make_shared<PriorityCache::Manager>(
+        store->cct, min, max, target, true, "bluestore-pricache");
+    pcm->insert("kv", binned_kv_cache, true);
+    pcm->insert("meta", meta_cache, true);
+    pcm->insert("data", data_cache, true);
+    if (binned_kv_onode_cache != nullptr) {
+      pcm->insert("kv_onode", binned_kv_onode_cache, true);
+    }
+  }
+
+  // every periodic task is due immediately on the first pass
+  utime_t next_balance = ceph_clock_now();
+  utime_t next_resize = ceph_clock_now();
+  utime_t next_bin_rotation = ceph_clock_now();
+  utime_t next_deferred_force_submit = ceph_clock_now();
+  utime_t alloc_stats_dump_clock = ceph_clock_now();
+
+  bool interval_stats_trim = false;
+  while (!stop) {
+    // Update pcm cache settings if related configuration was changed
+    uint32_t cur_config_change = store->config_changed.load();
+    if (cur_config_change != prev_config_change) {
+      _update_cache_settings();
+      prev_config_change = cur_config_change;
+    }
+
+    // define various intervals for background work
+    // (re-read on every pass; an interval <= 0 disables its task)
+    double age_bin_interval = store->cache_age_bin_interval;
+    double autotune_interval = store->cache_autotune_interval;
+    double resize_interval = store->osd_memory_cache_resize_interval;
+    double max_defer_interval = store->max_defer_interval;
+    double alloc_stats_dump_interval =
+      store->cct->_conf->bluestore_alloc_stats_dump_interval;
+
+    // alloc stats dump
+    if (alloc_stats_dump_interval > 0 &&
+        alloc_stats_dump_clock + alloc_stats_dump_interval < ceph_clock_now()) {
+      store->_record_allocation_stats();
+      alloc_stats_dump_clock = ceph_clock_now();
+    }
+    // cache age binning
+    if (age_bin_interval > 0 && next_bin_rotation < ceph_clock_now()) {
+      if (binned_kv_cache != nullptr) {
+        binned_kv_cache->import_bins(store->kv_bins);
+      }
+      if (binned_kv_onode_cache != nullptr) {
+        binned_kv_onode_cache->import_bins(store->kv_onode_bins);
+      }
+      meta_cache->import_bins(store->meta_bins);
+      data_cache->import_bins(store->data_bins);
+
+      if (pcm != nullptr) {
+        pcm->shift_bins();
+      }
+      next_bin_rotation = ceph_clock_now();
+      next_bin_rotation += age_bin_interval;
+    }
+    // cache balancing
+    if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
+      if (binned_kv_cache != nullptr) {
+        binned_kv_cache->set_cache_ratio(store->cache_kv_ratio);
+      }
+      if (binned_kv_onode_cache != nullptr) {
+        binned_kv_onode_cache->set_cache_ratio(store->cache_kv_onode_ratio);
+      }
+      meta_cache->set_cache_ratio(store->cache_meta_ratio);
+      data_cache->set_cache_ratio(store->cache_data_ratio);
+
+      // Log events at 5 instead of 20 when balance happens.
+      interval_stats_trim = true;
+
+      if (pcm != nullptr) {
+        pcm->balance();
+      }
+
+      next_balance = ceph_clock_now();
+      next_balance += autotune_interval;
+    }
+    // memory resizing (ie autotuning)
+    if (resize_interval > 0 && next_resize < ceph_clock_now()) {
+      if (ceph_using_tcmalloc() && pcm != nullptr) {
+        pcm->tune_memory();
+      }
+      next_resize = ceph_clock_now();
+      next_resize += resize_interval;
+    }
+    // deferred force submit
+    if (max_defer_interval > 0 &&
+	next_deferred_force_submit < ceph_clock_now()) {
+      // only submit when nothing was submitted for a full max_defer_interval
+      if (store->get_deferred_last_submitted() + max_defer_interval <
+	  ceph_clock_now()) {
+	store->deferred_try_submit();
+      }
+      // the check itself is repeated three times per max_defer_interval
+      next_deferred_force_submit = ceph_clock_now();
+      next_deferred_force_submit += max_defer_interval/3;
+    }
+
+    // Now Resize the shards 
+    _resize_shards(interval_stats_trim);
+    interval_stats_trim = false;
+
+    store->_update_logger();
+    auto wait = ceph::make_timespan(
+      store->cct->_conf->bluestore_cache_trim_interval);
+    cond.wait_for(l, wait);
+  }
+  // do final dump
+  store->_record_allocation_stats();
+  // NOTE(review): resetting `stop` presumably lets the thread be started
+  // again later - confirm against the code that starts/stops this thread
+  stop = false;
+  pcm = nullptr;
+  return NULL;
+}
+
+// Recompute the per-shard limits of the onode and buffer caches and apply
+// them to every shard.
+//
+// Allocations default to the configured cache size split by the configured
+// ratios. When both pcm and binned_kv_cache are set, the committed sizes
+// reported by the caches are used instead.
+//
+// interval_stats: when true the summary is logged at level 5, otherwise at
+// level 20.
+void BlueStore::MempoolThread::_resize_shards(bool interval_stats)
+{
+  const size_t onode_shards = store->onode_cache_shards.size();
+  const size_t buffer_shards = store->buffer_cache_shards.size();
+
+  // current usage of each cache
+  const int64_t kv_used = store->db->get_cache_usage();
+  const int64_t kv_onode_used = store->db->get_cache_usage(PREFIX_OBJ);
+  const int64_t meta_used = meta_cache->_get_used_bytes();
+  const int64_t data_used = data_cache->_get_used_bytes();
+
+  // ratio-based allocations
+  uint64_t cache_size = store->cache_size;
+  int64_t kv_alloc = static_cast<int64_t>(store->cache_kv_ratio * cache_size);
+  int64_t kv_onode_alloc =
+    static_cast<int64_t>(store->cache_kv_onode_ratio * cache_size);
+  int64_t meta_alloc =
+    static_cast<int64_t>(store->cache_meta_ratio * cache_size);
+  int64_t data_alloc =
+    static_cast<int64_t>(store->cache_data_ratio * cache_size);
+
+  // prefer the committed sizes when the priority cache manager is in use
+  const bool use_committed = (pcm != nullptr && binned_kv_cache != nullptr);
+  if (use_committed) {
+    cache_size = pcm->get_tuned_mem();
+    kv_alloc = binned_kv_cache->get_committed_size();
+    meta_alloc = meta_cache->get_committed_size();
+    data_alloc = data_cache->get_committed_size();
+    if (binned_kv_onode_cache != nullptr) {
+      kv_onode_alloc = binned_kv_onode_cache->get_committed_size();
+    }
+  }
+
+  if (!interval_stats) {
+    dout(20) << __func__  << " cache_size: " << cache_size
+             << " kv_alloc: " << kv_alloc
+             << " kv_used: " << kv_used
+             << " kv_onode_alloc: " << kv_onode_alloc
+             << " kv_onode_used: " << kv_onode_used
+             << " meta_alloc: " << meta_alloc
+             << " meta_used: " << meta_used
+             << " data_alloc: " << data_alloc
+             << " data_used: " << data_used << dendl;
+  } else {
+    dout(5) << __func__  << " cache_size: " << cache_size
+            << " kv_alloc: " << kv_alloc
+            << " kv_used: " << kv_used
+            << " kv_onode_alloc: " << kv_onode_alloc
+            << " kv_onode_used: " << kv_onode_used
+            << " meta_alloc: " << meta_alloc
+            << " meta_used: " << meta_used
+            << " data_alloc: " << data_alloc
+            << " data_used: " << data_used << dendl;
+  }
+
+  // onode limit is a count: bytes per shard divided by bytes per onode
+  const double meta_bytes_per_shard = meta_alloc / (double) onode_shards;
+  const uint64_t max_shard_onodes = static_cast<uint64_t>(
+    meta_bytes_per_shard / meta_cache->get_bytes_per_onode());
+  // buffer limit is in bytes
+  const uint64_t max_shard_buffer =
+    static_cast<uint64_t>(data_alloc / buffer_shards);
+
+  dout(30) << __func__ << " max_shard_onodes: " << max_shard_onodes
+           << " max_shard_buffer: " << max_shard_buffer << dendl;
+
+  for (auto shard : store->onode_cache_shards) {
+    shard->set_max(max_shard_onodes);
+  }
+  for (auto shard : store->buffer_cache_shards) {
+    shard->set_max(max_shard_buffer);
+  }
+}
+
+// Push the store's current osd_memory_* values into the priority cache
+// manager (pcm). Does nothing when pcm is not in use.
+void BlueStore::MempoolThread::_update_cache_settings()
+{
+  if (pcm == nullptr) {
+    // no priority cache manager, nothing to update
+    return;
+  }
+
+  const uint64_t target = store->osd_memory_target;
+  const uint64_t base = store->osd_memory_base;
+  const uint64_t min = store->osd_memory_cache_min;
+  const double fragmentation = store->osd_memory_expected_fragmentation;
+
+  // Target with the expected fragmentation taken off; the cache maximum is
+  // what remains above the base, but never less than the minimum.
+  const uint64_t ltarget = (1.0 - fragmentation) * target;
+  const uint64_t max = (ltarget > base + min) ? (ltarget - base) : min;
+
+  // set pcm cache levels
+  pcm->set_target_memory(target);
+  pcm->set_min_memory(min);
+  pcm->set_max_memory(max);
+
+  dout(5) << __func__  << " updated pcm target: " << target
+          << " pcm min: " << min
+          << " pcm max: " << max
+          << dendl;
+}
+
+// =======================================================
+
+// OmapIteratorImpl
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore.OmapIteratorImpl(" << this << ") "
+
+// Build an omap iterator over onode `o` in collection `c`, backed by the
+// key/value iterator `it`. Bumps the live-iterator perf counter and, when the
+// onode has omap data, computes the head/tail keys and positions the
+// underlying iterator at the head.
+BlueStore::OmapIteratorImpl::OmapIteratorImpl(
+  PerfCounters* _logger, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it)
+  : logger(_logger), c(c), o(o), it(it)
+{
+  logger->inc(l_bluestore_omap_iterator_count);
+  std::shared_lock l(c->lock);
+  if (!o->onode.has_omap()) {
+    // no omap: leave head/tail empty and the iterator where it is
+    return;
+  }
+  o->get_omap_key(string(), &head);
+  o->get_omap_tail(&tail);
+  it->lower_bound(head);
+}
+// Drop this iterator from the live-iterator perf counter incremented by the
+// constructor.
+BlueStore::OmapIteratorImpl::~OmapIteratorImpl()
+{
+  logger->dec(l_bluestore_omap_iterator_count);
+}
+
+// Describe this iterator (collection id and object id) for latency log lines.
+string BlueStore::OmapIteratorImpl::_stringify() const
+{
+  std::ostringstream desc;
+  desc << " omap_iterator(cid = " << c->cid;
+  desc << ", oid = " << o->oid << ")";
+  return desc.str();
+}
+
+// Position the iterator at the first omap key of the object. If the object
+// has no omap, the underlying iterator is reset to an empty one.
+// Always returns 0; the elapsed time is reported via log_latency().
+int BlueStore::OmapIteratorImpl::seek_to_first()
+{
+  std::shared_lock l(c->lock);
+  const auto started = mono_clock::now();
+  if (!o->onode.has_omap()) {
+    it = KeyValueDB::Iterator();
+  } else {
+    it->lower_bound(head);
+  }
+  c->store->log_latency(
+    __func__,
+    l_bluestore_omap_seek_to_first_lat,
+    mono_clock::now() - started,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+  return 0;
+}
+
+// Seek via the underlying iterator's upper_bound() on the db key derived from
+// user key `after`. If the object has no omap, the underlying iterator is
+// reset to an empty one.
+// Always returns 0; the elapsed time is reported via log_latency_fn().
+int BlueStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+  std::shared_lock l(c->lock);
+  const auto started = mono_clock::now();
+  if (!o->onode.has_omap()) {
+    it = KeyValueDB::Iterator();
+  } else {
+    // translate the user key into the object's db key space
+    string db_key;
+    o->get_omap_key(after, &db_key);
+    ldout(c->store->cct,20) << __func__ << " after " << after << " key "
+                            << pretty_binary_string(db_key) << dendl;
+    it->upper_bound(db_key);
+  }
+  c->store->log_latency_fn(
+    __func__,
+    l_bluestore_omap_upper_bound_lat,
+    mono_clock::now() - started,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age,
+    [&] (const ceph::timespan&) {
+      return ", after = " + after + _stringify();
+    }
+  );
+  return 0;
+}
+
+// Seek via the underlying iterator's lower_bound() on the db key derived from
+// user key `to`. If the object has no omap, the underlying iterator is reset
+// to an empty one.
+// Always returns 0; the elapsed time is reported via log_latency_fn().
+int BlueStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+  std::shared_lock l(c->lock);
+  const auto started = mono_clock::now();
+  if (!o->onode.has_omap()) {
+    it = KeyValueDB::Iterator();
+  } else {
+    // translate the user key into the object's db key space
+    string db_key;
+    o->get_omap_key(to, &db_key);
+    ldout(c->store->cct,20) << __func__ << " to " << to << " key "
+                            << pretty_binary_string(db_key) << dendl;
+    it->lower_bound(db_key);
+  }
+  c->store->log_latency_fn(
+    __func__,
+    l_bluestore_omap_lower_bound_lat,
+    mono_clock::now() - started,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age,
+    [&] (const ceph::timespan&) {
+      return ", to = " + to + _stringify();
+    }
+  );
+  return 0;
+}
+
+// True when the object has omap data, the underlying iterator is set and
+// valid, and its current raw key sorts before this object's omap tail key.
+bool BlueStore::OmapIteratorImpl::valid()
+{
+  std::shared_lock l(c->lock);
+  const bool positioned = it && it->valid();
+  const bool in_range = o->onode.has_omap() && positioned &&
+    it->raw_key().second < tail;
+  if (positioned) {
+    ldout(c->store->cct,20) << __func__ << " is at "
+                            << pretty_binary_string(it->raw_key().second)
+                            << dendl;
+  }
+  return in_range;
+}
+
+// Advance the underlying iterator by one entry.
+// Returns 0 when the object has omap data (the iterator was advanced), and -1
+// otherwise. The elapsed time is reported via log_latency() in both cases.
+int BlueStore::OmapIteratorImpl::next()
+{
+  std::shared_lock l(c->lock);
+  const auto started = mono_clock::now();
+  const bool has_omap = o->onode.has_omap();
+  if (has_omap) {
+    it->next();
+  }
+  c->store->log_latency(
+    __func__,
+    l_bluestore_omap_next_lat,
+    mono_clock::now() - started,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+  return has_omap ? 0 : -1;
+}
+
+// Return the user-visible omap key at the current position, decoded from the
+// raw db key. Asserts that the underlying iterator is valid.
+string BlueStore::OmapIteratorImpl::key()
+{
+  std::shared_lock l(c->lock);
+  ceph_assert(it->valid());
+  const string raw = it->raw_key().second;
+  string decoded;
+  o->decode_omap_key(raw, &decoded);
+  return decoded;
+}
+
+// Return the value at the current position of the underlying iterator.
+// Asserts that the underlying iterator is valid.
+bufferlist BlueStore::OmapIteratorImpl::value()
+{
+  // hold the collection lock (shared) while touching the iterator
+  std::shared_lock l(c->lock);
+  ceph_assert(it->valid());
+  return it->value();
+}
+
+
+// =====================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore(" << path << ") "
+#undef dout_context
+#define dout_context cct
+
+
+// AIO completion trampoline: `priv` is the BlueStore, `priv2` the AioContext
+// whose aio_finish() is invoked with that store.
+static void aio_cb(void *priv, void *priv2)
+{
+  auto *bstore = static_cast<BlueStore*>(priv);
+  auto *ctx = static_cast<BlueStore::AioContext*>(priv2);
+  ctx->aio_finish(bstore);
+}
+
+// Discard completion trampoline: `priv` is the BlueStore, `priv2` the
+// interval_set of extents handed to BlueStore::handle_discard().
+static void discard_cb(void *priv, void *priv2)
+{
+  auto *bstore = static_cast<BlueStore*>(priv);
+  auto *extents = static_cast<interval_set<uint64_t>*>(priv2);
+  bstore->handle_discard(*extents);
+}
+
+// Hand the extents in `to_release` to the allocator's release().
+// Asserts that an allocator is present.
+void BlueStore::handle_discard(interval_set<uint64_t>& to_release)
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(alloc);
+  alloc->release(to_release);
+}
+
+// Convenience constructor: delegates to the three-argument constructor with
+// _min_alloc_size = 0.
+BlueStore::BlueStore(CephContext *cct, const string& path)
+  : BlueStore(cct, path, 0) {}
+
+// Construct a BlueStore rooted at `path`.
+//
+// _min_alloc_size seeds min_alloc_size, and min_alloc_size_order is set to
+// its count of trailing zero bits.
+// NOTE(review): the two-argument constructor passes 0 here; presumably the
+// real value is established later -- confirm.
+//
+// The body sets up the perf counters, registers this object as a config
+// observer and starts out with a single cache shard.
+BlueStore::BlueStore(CephContext *cct,
+  const string& path,
+  uint64_t _min_alloc_size)
+  : ObjectStore(cct, path),
+    throttle(cct),
+    finisher(cct, "commit_finisher", "cfin"),
+    kv_sync_thread(this),
+    kv_finalize_thread(this),
+#ifdef HAVE_LIBZBD
+    zoned_cleaner_thread(this),
+#endif
+    min_alloc_size(_min_alloc_size),
+    min_alloc_size_order(std::countr_zero(_min_alloc_size)),
+    mempool_thread(this)
+{
+  _init_logger();
+  cct->_conf.add_observer(this);
+  set_cache_shards(1);
+}
+
+// Tear down the store object: unregister the config observer, shut down the
+// perf counters, assert that the store is unmounted with db/bluefs/fds
+// released, then free every onode and buffer cache shard.
+BlueStore::~BlueStore()
+{
+  cct->_conf.remove_observer(this);
+  _shutdown_logger();
+
+  // the store must have been fully closed before destruction
+  ceph_assert(!mounted);
+  ceph_assert(db == nullptr);
+  ceph_assert(bluefs == nullptr);
+  ceph_assert(fsid_fd < 0);
+  ceph_assert(path_fd < 0);
+
+  for (auto shard : onode_cache_shards) {
+    delete shard;
+  }
+  onode_cache_shards.clear();
+
+  for (auto shard : buffer_cache_shards) {
+    delete shard;
+  }
+  buffer_cache_shards.clear();
+}
+
+// Return the NULL-terminated list of config option names this store observes;
+// handle_conf_change() is invoked when any of them changes.
+//
+// The age-bin option names match the ones read in _set_cache_sizes()
+// ("bluestore_cache_age_bins_*"), and "bluestore_throttle_trace_rate" is
+// listed because handle_conf_change() tests for it.
+const char **BlueStore::get_tracked_conf_keys() const
+{
+  static const char* KEYS[] = {
+    "bluestore_csum_type",
+    "bluestore_compression_mode",
+    "bluestore_compression_algorithm",
+    "bluestore_compression_min_blob_size",
+    "bluestore_compression_min_blob_size_ssd",
+    "bluestore_compression_min_blob_size_hdd",
+    "bluestore_compression_max_blob_size",
+    "bluestore_compression_max_blob_size_ssd",
+    "bluestore_compression_max_blob_size_hdd",
+    "bluestore_compression_required_ratio",
+    "bluestore_max_alloc_size",
+    "bluestore_prefer_deferred_size",
+    "bluestore_prefer_deferred_size_hdd",
+    "bluestore_prefer_deferred_size_ssd",
+    "bluestore_deferred_batch_ops",
+    "bluestore_deferred_batch_ops_hdd",
+    "bluestore_deferred_batch_ops_ssd",
+    "bluestore_throttle_bytes",
+    "bluestore_throttle_deferred_bytes",
+    "bluestore_throttle_trace_rate",
+    "bluestore_throttle_cost_per_io_hdd",
+    "bluestore_throttle_cost_per_io_ssd",
+    "bluestore_throttle_cost_per_io",
+    "bluestore_max_blob_size",
+    "bluestore_max_blob_size_ssd",
+    "bluestore_max_blob_size_hdd",
+    "osd_memory_target",
+    "osd_memory_target_cgroup_limit_ratio",
+    "osd_memory_base",
+    "osd_memory_cache_min",
+    "osd_memory_expected_fragmentation",
+    "bluestore_cache_autotune",
+    "bluestore_cache_autotune_interval",
+    "bluestore_cache_age_bin_interval",
+    "bluestore_cache_age_bins_kv",
+    "bluestore_cache_age_bins_kv_onode",
+    "bluestore_cache_age_bins_meta",
+    "bluestore_cache_age_bins_data",
+    "bluestore_warn_on_legacy_statfs",
+    "bluestore_warn_on_no_per_pool_omap",
+    "bluestore_warn_on_no_per_pg_omap",
+    "bluestore_max_defer_interval",
+    NULL
+  };
+  return KEYS;
+}
+
+// Config observer callback: re-derive the cached settings affected by the
+// option names in `changed`.
+//
+// Several setters pick an _hdd/_ssd default and assert that bdev is set, so
+// they are only re-run once bdev exists.
+//
+// The compression branch also reacts to the _ssd/_hdd blob-size variants:
+// _set_compression() reads them when the generic option is 0, so a runtime
+// change to either must re-run it.
+void BlueStore::handle_conf_change(const ConfigProxy& conf,
+				   const std::set<std::string> &changed)
+{
+  if (changed.count("bluestore_warn_on_legacy_statfs")) {
+    _check_legacy_statfs_alert();
+  }
+  if (changed.count("bluestore_warn_on_no_per_pool_omap") ||
+      changed.count("bluestore_warn_on_no_per_pg_omap")) {
+    _check_no_per_pg_or_pool_omap_alert();
+  }
+
+  if (changed.count("bluestore_csum_type")) {
+    _set_csum();
+  }
+  if (changed.count("bluestore_compression_mode") ||
+      changed.count("bluestore_compression_algorithm") ||
+      changed.count("bluestore_compression_min_blob_size") ||
+      changed.count("bluestore_compression_min_blob_size_ssd") ||
+      changed.count("bluestore_compression_min_blob_size_hdd") ||
+      changed.count("bluestore_compression_max_blob_size") ||
+      changed.count("bluestore_compression_max_blob_size_ssd") ||
+      changed.count("bluestore_compression_max_blob_size_hdd")) {
+    if (bdev) {
+      // only after startup
+      _set_compression();
+    }
+  }
+  if (changed.count("bluestore_max_blob_size") ||
+      changed.count("bluestore_max_blob_size_ssd") ||
+      changed.count("bluestore_max_blob_size_hdd")) {
+    if (bdev) {
+      // only after startup
+      _set_blob_size();
+    }
+  }
+  if (changed.count("bluestore_prefer_deferred_size") ||
+      changed.count("bluestore_prefer_deferred_size_hdd") ||
+      changed.count("bluestore_prefer_deferred_size_ssd") ||
+      changed.count("bluestore_max_alloc_size") ||
+      changed.count("bluestore_deferred_batch_ops") ||
+      changed.count("bluestore_deferred_batch_ops_hdd") ||
+      changed.count("bluestore_deferred_batch_ops_ssd")) {
+    if (bdev) {
+      // only after startup
+      _set_alloc_sizes();
+    }
+  }
+  if (changed.count("bluestore_throttle_cost_per_io") ||
+      changed.count("bluestore_throttle_cost_per_io_hdd") ||
+      changed.count("bluestore_throttle_cost_per_io_ssd")) {
+    if (bdev) {
+      // only after startup
+      _set_throttle_params();
+    }
+  }
+  if (changed.count("bluestore_throttle_bytes") ||
+      changed.count("bluestore_throttle_deferred_bytes") ||
+      changed.count("bluestore_throttle_trace_rate")) {
+    throttle.reset_throttle(conf);
+  }
+  if (changed.count("bluestore_max_defer_interval")) {
+    if (bdev) {
+      // only after startup
+      _set_max_defer_interval();
+    }
+  }
+  if (changed.count("osd_memory_target") ||
+      changed.count("osd_memory_base") ||
+      changed.count("osd_memory_cache_min") ||
+      changed.count("osd_memory_expected_fragmentation")) {
+    _update_osd_memory_options();
+  }
+}
+
+// Refresh the cached compression settings from the configuration:
+//  - comp_mode: parsed from bluestore_compression_mode; an unrecognized value
+//    falls back to COMP_NONE and raises a compression alert
+//  - comp_min_blob_size / comp_max_blob_size: the generic option when it is
+//    non-zero, otherwise the _hdd or _ssd variant chosen by
+//    _use_rotational_settings() (requires bdev)
+//  - compressor: created from bluestore_compression_algorithm when that is
+//    non-empty; left null, with an alert, if creation fails
+void BlueStore::_set_compression()
+{
+  const auto mode =
+    Compressor::get_comp_mode_type(cct->_conf->bluestore_compression_mode);
+  if (!mode) {
+    derr << __func__ << " unrecognized value '"
+         << cct->_conf->bluestore_compression_mode
+         << "' for bluestore_compression_mode, reverting to 'none'"
+         << dendl;
+    comp_mode = Compressor::COMP_NONE;
+    string reason("unknown mode: ");
+    reason += cct->_conf->bluestore_compression_mode;
+    _set_compression_alert(true, reason.c_str());
+  } else {
+    _clear_compression_alert();
+    comp_mode = *mode;
+  }
+
+  // drop the current compressor; it is re-created below if configured
+  compressor = nullptr;
+
+  if (!cct->_conf->bluestore_compression_min_blob_size) {
+    ceph_assert(bdev);
+    if (!_use_rotational_settings()) {
+      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_ssd;
+    } else {
+      comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size_hdd;
+    }
+  } else {
+    comp_min_blob_size = cct->_conf->bluestore_compression_min_blob_size;
+  }
+
+  if (!cct->_conf->bluestore_compression_max_blob_size) {
+    ceph_assert(bdev);
+    if (!_use_rotational_settings()) {
+      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_ssd;
+    } else {
+      comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size_hdd;
+    }
+  } else {
+    comp_max_blob_size = cct->_conf->bluestore_compression_max_blob_size;
+  }
+
+  auto& algorithm = cct->_conf->bluestore_compression_algorithm;
+  if (!algorithm.empty()) {
+    compressor = Compressor::create(cct, algorithm);
+    if (!compressor) {
+      derr << __func__ << " unable to initialize " << algorithm.c_str() << " compressor"
+           << dendl;
+      _set_compression_alert(false, algorithm.c_str());
+    }
+  }
+
+  dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
+           << " alg " << (compressor ? compressor->get_type_name() : "(none)")
+           << " min_blob " << comp_min_blob_size
+           << " max_blob " << comp_max_blob_size
+           << dendl;
+}
+
+// Refresh csum_type from bluestore_csum_type. The type is reset to CSUM_NONE
+// and only replaced when the configured string maps to a value greater than
+// CSUM_NONE.
+void BlueStore::_set_csum()
+{
+  const int requested =
+    Checksummer::get_csum_string_type(cct->_conf->bluestore_csum_type);
+
+  csum_type = Checksummer::CSUM_NONE;
+  if (requested > Checksummer::CSUM_NONE) {
+    csum_type = requested;
+  }
+
+  dout(10) << __func__ << " csum_type "
+           << Checksummer::get_csum_type_string(csum_type)
+           << dendl;
+}
+
+// Refresh throttle_cost_per_io: the generic option when it is non-zero,
+// otherwise the _hdd or _ssd variant chosen by _use_rotational_settings()
+// (requires bdev).
+void BlueStore::_set_throttle_params()
+{
+  if (!cct->_conf->bluestore_throttle_cost_per_io) {
+    ceph_assert(bdev);
+    if (!_use_rotational_settings()) {
+      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd;
+    } else {
+      throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd;
+    }
+  } else {
+    throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io;
+  }
+
+  dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io
+           << dendl;
+}
+// Refresh max_blob_size: the generic option when it is non-zero, otherwise
+// the _hdd or _ssd variant chosen by _use_rotational_settings() (requires
+// bdev).
+void BlueStore::_set_blob_size()
+{
+  if (!cct->_conf->bluestore_max_blob_size) {
+    ceph_assert(bdev);
+    if (!_use_rotational_settings()) {
+      max_blob_size = cct->_conf->bluestore_max_blob_size_ssd;
+    } else {
+      max_blob_size = cct->_conf->bluestore_max_blob_size_hdd;
+    }
+  } else {
+    max_blob_size = cct->_conf->bluestore_max_blob_size;
+  }
+  dout(10) << __func__ << " max_blob_size 0x" << std::hex << max_blob_size
+           << std::dec << dendl;
+}
+
+// Re-read the osd_memory_* options into the cached members and bump
+// config_changed, which the mempool thread compares against its last seen
+// value to decide when to refresh the pcm settings.
+void BlueStore::_update_osd_memory_options()
+{
+  auto& conf = cct->_conf;
+  osd_memory_target = conf.get_val<Option::size_t>("osd_memory_target");
+  osd_memory_base = conf.get_val<Option::size_t>("osd_memory_base");
+  osd_memory_expected_fragmentation =
+    conf.get_val<double>("osd_memory_expected_fragmentation");
+  osd_memory_cache_min = conf.get_val<Option::size_t>("osd_memory_cache_min");
+  ++config_changed;
+  dout(10) << __func__
+           << " osd_memory_target " << osd_memory_target
+           << " osd_memory_base " << osd_memory_base
+           << " osd_memory_expected_fragmentation " << osd_memory_expected_fragmentation
+           << " osd_memory_cache_min " << osd_memory_cache_min
+           << dendl;
+}
+
+// Load the cache configuration into the cached members: autotune settings,
+// age bins, osd_memory_* options, the global cache size (generic option, or
+// the _hdd/_ssd variant when it is 0) and the meta/kv/kv_onode/data ratios.
+//
+// Requires bdev. Returns 0 on success, or -EINVAL when a ratio is outside
+// [0,1.0] or when meta + kv exceeds 1.0.
+//
+// The age-bin vectors are rebuilt from scratch on every call, so calling this
+// more than once on the same object does not accumulate stale bins.
+int BlueStore::_set_cache_sizes()
+{
+  ceph_assert(bdev);
+  cache_autotune = cct->_conf.get_val<bool>("bluestore_cache_autotune");
+  cache_autotune_interval =
+      cct->_conf.get_val<double>("bluestore_cache_autotune_interval");
+  cache_age_bin_interval =
+      cct->_conf.get_val<double>("bluestore_cache_age_bin_interval");
+  // Parse a whitespace-separated list of integers from option `conf_name`
+  // into `intervals`, replacing whatever it held before.
+  auto _set_bin = [&](const std::string& conf_name,
+                      std::vector<uint64_t>* intervals)
+  {
+    std::string intervals_str = cct->_conf.get_val<std::string>(conf_name);
+    std::istringstream interval_stream(intervals_str);
+    intervals->clear();
+    std::copy(
+      std::istream_iterator<uint64_t>(interval_stream),
+      std::istream_iterator<uint64_t>(),
+      std::back_inserter(*intervals));
+  };
+  _set_bin("bluestore_cache_age_bins_kv", &kv_bins);
+  _set_bin("bluestore_cache_age_bins_kv_onode", &kv_onode_bins);
+  _set_bin("bluestore_cache_age_bins_meta", &meta_bins);
+  _set_bin("bluestore_cache_age_bins_data", &data_bins);
+
+  osd_memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+  osd_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
+  osd_memory_expected_fragmentation =
+      cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
+  osd_memory_cache_min = cct->_conf.get_val<Option::size_t>("osd_memory_cache_min");
+  osd_memory_cache_resize_interval =
+      cct->_conf.get_val<double>("osd_memory_cache_resize_interval");
+
+  if (cct->_conf->bluestore_cache_size) {
+    cache_size = cct->_conf->bluestore_cache_size;
+  } else {
+    // choose global cache size based on backend type
+    if (_use_rotational_settings()) {
+      cache_size = cct->_conf->bluestore_cache_size_hdd;
+    } else {
+      cache_size = cct->_conf->bluestore_cache_size_ssd;
+    }
+  }
+
+  cache_meta_ratio = cct->_conf.get_val<double>("bluestore_cache_meta_ratio");
+  if (cache_meta_ratio < 0 || cache_meta_ratio > 1.0) {
+    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+         << ") must be in range [0,1.0]" << dendl;
+    return -EINVAL;
+  }
+
+  cache_kv_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_ratio");
+  if (cache_kv_ratio < 0 || cache_kv_ratio > 1.0) {
+    derr << __func__ << " bluestore_cache_kv_ratio (" << cache_kv_ratio
+         << ") must be in range [0,1.0]" << dendl;
+    return -EINVAL;
+  }
+
+  cache_kv_onode_ratio = cct->_conf.get_val<double>("bluestore_cache_kv_onode_ratio");
+  if (cache_kv_onode_ratio < 0 || cache_kv_onode_ratio > 1.0) {
+    derr << __func__ << " bluestore_cache_kv_onode_ratio (" << cache_kv_onode_ratio
+         << ") must be in range [0,1.0]" << dendl;
+    return -EINVAL;
+  }
+
+  // NOTE(review): only meta + kv is validated against 1.0; kv_onode is not
+  // part of this check, so meta + kv + kv_onode may exceed 1.0, in which case
+  // the data ratio below is clamped to 0. Left as is so that configurations
+  // accepted today keep mounting.
+  if (cache_meta_ratio + cache_kv_ratio > 1.0) {
+    derr << __func__ << " bluestore_cache_meta_ratio (" << cache_meta_ratio
+         << ") + bluestore_cache_kv_ratio (" << cache_kv_ratio
+         << ") = " << cache_meta_ratio + cache_kv_ratio << "; must be <= 1.0"
+         << dendl;
+    return -EINVAL;
+  }
+
+  // whatever is left after meta, kv and kv_onode goes to data
+  cache_data_ratio = (double)1.0 -
+                     (double)cache_meta_ratio -
+                     (double)cache_kv_ratio -
+                     (double)cache_kv_onode_ratio;
+  if (cache_data_ratio < 0) {
+    // deal with floating point imprecision
+    cache_data_ratio = 0;
+  }
+
+  dout(1) << __func__ << " cache_size " << cache_size
+          << " meta " << cache_meta_ratio
+          << " kv " << cache_kv_ratio
+          << " kv_onode " << cache_kv_onode_ratio
+          << " data " << cache_data_ratio
+          << dendl;
+  return 0;
+}
+
+// Store a metadata key/value pair. If the label of "<path>/block" can be
+// read, the pair is added to the label and the label written back (the write
+// is asserted to succeed). In every case the pair is then also written
+// through ObjectStore::write_meta(), whose result is returned.
+int BlueStore::write_meta(const std::string& key, const std::string& value)
+{
+  bluestore_bdev_label_t label;
+  string label_path = path + "/block";
+  if (_read_bdev_label(cct, label_path, &label) >= 0) {
+    label.meta[key] = value;
+    int r = _write_bdev_label(cct, label_path, label);
+    ceph_assert(r == 0);
+  }
+  return ObjectStore::write_meta(key, value);
+}
+
+// Look up a metadata value. The label of "<path>/block" is consulted first;
+// if the label cannot be read or does not contain `key`, the lookup falls
+// back to ObjectStore::read_meta(). Returns 0 when the value came from the
+// label, otherwise the fallback's result.
+int BlueStore::read_meta(const std::string& key, std::string *value)
+{
+  bluestore_bdev_label_t label;
+  string label_path = path + "/block";
+  if (_read_bdev_label(cct, label_path, &label) >= 0) {
+    auto found = label.meta.find(key);
+    if (found != label.meta.end()) {
+      *value = found->second;
+      return 0;
+    }
+  }
+  return ObjectStore::read_meta(key, value);
+}
+
+void BlueStore::_init_logger()
+{
+  PerfCountersBuilder b(cct, "bluestore",
+                        l_bluestore_first, l_bluestore_last);
+
+  // space utilization stats
+  //****************************************
+  b.add_u64(l_bluestore_allocated, "allocated",
+	    "Sum for allocated bytes",
+	    "al_b",
+	    PerfCountersBuilder::PRIO_CRITICAL,
+	    unit_t(UNIT_BYTES));
+  b.add_u64(l_bluestore_stored, "stored",
+	    "Sum for stored bytes",
+	    "st_b",
+	    PerfCountersBuilder::PRIO_CRITICAL,
+	    unit_t(UNIT_BYTES));
+  b.add_u64(l_bluestore_fragmentation, "fragmentation_micros",
+            "How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
+  b.add_u64(l_bluestore_alloc_unit, "alloc_unit",
+	    "allocation unit size in bytes",
+	    "au_b",
+	    PerfCountersBuilder::PRIO_CRITICAL,
+	    unit_t(UNIT_BYTES));
+  //****************************************
+
+  // Update op processing state latencies
+  //****************************************
+  b.add_time_avg(l_bluestore_state_prepare_lat, "state_prepare_lat",
+		 "Average prepare state latency",
+		 "sprl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_aio_wait_lat, "state_aio_wait_lat",
+		 "Average aio_wait state latency",
+		 "sawl", PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_time_avg(l_bluestore_state_io_done_lat, "state_io_done_lat",
+		 "Average io_done state latency",
+		 "sidl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_kv_queued_lat, "state_kv_queued_lat",
+		"Average kv_queued state latency",
+		"skql", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_kv_committing_lat, "state_kv_commiting_lat",
+		 "Average kv_commiting state latency",
+		 "skcl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_kv_done_lat, "state_kv_done_lat",
+		 "Average kv_done state latency",
+		 "skdl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_finishing_lat, "state_finishing_lat",
+		 "Average finishing state latency",
+		 "sfnl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_done_lat, "state_done_lat",
+		 "Average done state latency",
+		 "sdnl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_deferred_queued_lat, "state_deferred_queued_lat",
+		 "Average deferred_queued state latency",
+		 "sdql", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_deferred_aio_wait_lat, "state_deferred_aio_wait_lat",
+		 "Average aio_wait state latency",
+		 "sdal", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_state_deferred_cleanup_lat, "state_deferred_cleanup_lat",
+		 "Average cleanup state latency",
+		 "sdcl", PerfCountersBuilder::PRIO_USEFUL);
+  //****************************************
+
+  // Update Transaction stats
+  //****************************************
+  b.add_time_avg(l_bluestore_throttle_lat, "txc_throttle_lat",
+		 "Average submit throttle latency",
+		 "th_l", PerfCountersBuilder::PRIO_CRITICAL);
+  b.add_time_avg(l_bluestore_submit_lat, "txc_submit_lat",
+		 "Average submit latency",
+		 "s_l", PerfCountersBuilder::PRIO_CRITICAL);
+  b.add_time_avg(l_bluestore_commit_lat, "txc_commit_lat",
+		 "Average commit latency",
+		 "c_l", PerfCountersBuilder::PRIO_CRITICAL);
+  b.add_u64_counter(l_bluestore_txc, "txc_count", "Transactions committed");
+  //****************************************
+
+  // Read op stats
+  //****************************************
+  b.add_time_avg(l_bluestore_read_onode_meta_lat, "read_onode_meta_lat",
+		 "Average read onode metadata latency",
+		 "roml", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_read_wait_aio_lat, "read_wait_aio_lat",
+		 "Average read I/O waiting latency",
+		 "rwal", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_csum_lat, "csum_lat",
+		 "Average checksum latency",
+		 "csml", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluestore_read_eio, "read_eio",
+                    "Read EIO errors propagated to high level callers");
+  b.add_u64_counter(l_bluestore_reads_with_retries, "reads_with_retries",
+                    "Read operations that required at least one retry due to failed checksum validation",
+		    "rd_r", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_read_lat, "read_lat",
+		 "Average read latency",
+		 "r_l", PerfCountersBuilder::PRIO_CRITICAL);
+  //****************************************
+
+  // kv_thread latencies
+  //****************************************
+  b.add_time_avg(l_bluestore_kv_flush_lat, "kv_flush_lat",
+		 "Average kv_thread flush latency",
+		 "kfsl", PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_time_avg(l_bluestore_kv_commit_lat, "kv_commit_lat",
+		 "Average kv_thread commit latency",
+		 "kcol", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_kv_sync_lat, "kv_sync_lat",
+		 "Average kv_sync thread latency",
+		 "kscl", PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_time_avg(l_bluestore_kv_final_lat, "kv_final_lat",
+		 "Average kv_finalize thread latency",
+		 "kfll", PerfCountersBuilder::PRIO_INTERESTING);
+  //****************************************
+
+  // write op stats
+  //****************************************
+  b.add_u64_counter(l_bluestore_write_big, "write_big",
+		    "Large aligned writes into fresh blobs");
+  b.add_u64_counter(l_bluestore_write_big_bytes, "write_big_bytes",
+		    "Large aligned writes into fresh blobs (bytes)",
+		    NULL,
+		    PerfCountersBuilder::PRIO_DEBUGONLY,
+		    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_write_big_blobs, "write_big_blobs",
+		    "Large aligned writes into fresh blobs (blobs)");
+  b.add_u64_counter(l_bluestore_write_big_deferred,
+		    "write_big_deferred",
+		    "Big overwrites using deferred");
+
+  b.add_u64_counter(l_bluestore_write_small, "write_small",
+		    "Small writes into existing or sparse small blobs");
+  b.add_u64_counter(l_bluestore_write_small_bytes, "write_small_bytes",
+		    "Small writes into existing or sparse small blobs (bytes)",
+		    NULL,
+		    PerfCountersBuilder::PRIO_DEBUGONLY,
+		    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_write_small_unused,
+		    "write_small_unused",
+		    "Small writes into unused portion of existing blob");
+  b.add_u64_counter(l_bluestore_write_small_pre_read,
+		    "write_small_pre_read",
+		    "Small writes that required we read some data (possibly "
+		    "cached) to fill out the block");
+
+  b.add_u64_counter(l_bluestore_write_pad_bytes, "write_pad_bytes",
+		    "Sum for write-op padded bytes",
+		    NULL,
+		    PerfCountersBuilder::PRIO_DEBUGONLY,
+		    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_write_penalty_read_ops, "write_penalty_read_ops",
+		    "Sum for write penalty read ops");
+  b.add_u64_counter(l_bluestore_write_new, "write_new",
+		    "Write into new blob");
+
+  b.add_u64_counter(l_bluestore_issued_deferred_writes,
+		    "issued_deferred_writes",
+		    "Total deferred writes issued");
+  b.add_u64_counter(l_bluestore_issued_deferred_write_bytes,
+		    "issued_deferred_write_bytes",
+		    "Total bytes in issued deferred writes",
+		    NULL,
+		    PerfCountersBuilder::PRIO_DEBUGONLY,
+		    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_submitted_deferred_writes,
+		    "submitted_deferred_writes",
+		    "Total deferred writes submitted to disk");
+  b.add_u64_counter(l_bluestore_submitted_deferred_write_bytes,
+		    "submitted_deferred_write_bytes",
+		    "Total bytes submitted to disk by deferred writes",
+		    NULL,
+		    PerfCountersBuilder::PRIO_DEBUGONLY,
+		    unit_t(UNIT_BYTES));
+
+  b.add_u64_counter(l_bluestore_write_big_skipped_blobs,
+      "write_big_skipped_blobs",
+      "Large aligned writes into fresh blobs skipped due to zero detection (blobs)");
+  b.add_u64_counter(l_bluestore_write_big_skipped_bytes,
+      "write_big_skipped_bytes",
+      "Large aligned writes into fresh blobs skipped due to zero detection (bytes)");
+  b.add_u64_counter(l_bluestore_write_small_skipped,
+      "write_small_skipped",
+      "Small writes into existing or sparse small blobs skipped due to zero detection");
+  b.add_u64_counter(l_bluestore_write_small_skipped_bytes,
+      "write_small_skipped_bytes",
+      "Small writes into existing or sparse small blobs skipped due to zero detection (bytes)");
+  //****************************************
+
+  // compressions stats
+  //****************************************
+  b.add_u64(l_bluestore_compressed, "compressed",
+	    "Sum for stored compressed bytes",
+	    "c", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluestore_compressed_allocated, "compressed_allocated",
+	    "Sum for bytes allocated for compressed data",
+	    "c_a", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_u64(l_bluestore_compressed_original, "compressed_original",
+	    "Sum for original bytes that were compressed",
+	    "c_o", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+  b.add_time_avg(l_bluestore_compress_lat, "compress_lat",
+	    "Average compress latency",
+	    "_cpl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_decompress_lat, "decompress_lat",
+	    "Average decompress latency",
+	    "dcpl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluestore_compress_success_count, "compress_success_count",
+	    "Sum for beneficial compress ops");
+  b.add_u64_counter(l_bluestore_compress_rejected_count, "compress_rejected_count",
+	    "Sum for compress ops rejected due to low net gain of space");
+  //****************************************
+
+  // onode cache stats
+  //****************************************
+  b.add_u64(l_bluestore_onodes, "onodes",
+	    "Number of onodes in cache");
+  b.add_u64(l_bluestore_pinned_onodes, "onodes_pinned",
+            "Number of pinned onodes in cache");
+  b.add_u64_counter(l_bluestore_onode_hits, "onode_hits",
+		    "Count of onode cache lookup hits",
+		    "o_ht", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluestore_onode_misses, "onode_misses",
+		    "Count of onode cache lookup misses",
+		    "o_ms", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluestore_onode_shard_hits, "onode_shard_hits",
+		    "Count of onode shard cache lookups hits");
+  b.add_u64_counter(l_bluestore_onode_shard_misses,
+		    "onode_shard_misses",
+		    "Count of onode shard cache lookups misses");
+  b.add_u64(l_bluestore_extents, "onode_extents",
+	    "Number of extents in cache");
+  b.add_u64(l_bluestore_blobs, "onode_blobs",
+	    "Number of blobs in cache");
+  //****************************************
+
+  // buffer cache stats
+  //****************************************
+  b.add_u64(l_bluestore_buffers, "buffers",
+	    "Number of buffers in cache");
+  b.add_u64(l_bluestore_buffer_bytes, "buffer_bytes",
+	    "Number of buffer bytes in cache",
+	     NULL,
+	     PerfCountersBuilder::PRIO_DEBUGONLY,
+	     unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_buffer_hit_bytes, "buffer_hit_bytes",
+	    "Sum for bytes of read hit in the cache",
+	    NULL,
+	    PerfCountersBuilder::PRIO_DEBUGONLY,
+	    unit_t(UNIT_BYTES));
+  b.add_u64_counter(l_bluestore_buffer_miss_bytes, "buffer_miss_bytes",
+	    "Sum for bytes of read missed in the cache",
+	    NULL,
+	    PerfCountersBuilder::PRIO_DEBUGONLY,
+	    unit_t(UNIT_BYTES));
+  //****************************************
+
+  // internal stats
+  //****************************************
+  b.add_u64_counter(l_bluestore_onode_reshard, "onode_reshard",
+		    "Onode extent map reshard events");
+  b.add_u64_counter(l_bluestore_blob_split, "blob_split",
+		    "Sum for blob splitting due to resharding");
+  b.add_u64_counter(l_bluestore_extent_compress, "extent_compress",
+		    "Sum for extents that have been removed due to compression");
+  b.add_u64_counter(l_bluestore_gc_merged, "gc_merged",
+		    "Sum for extents that have been merged due to garbage "
+		    "collection");
+  //****************************************
+  // misc
+  //****************************************
+  b.add_u64_counter(l_bluestore_omap_iterator_count, "omap_iterator_count",
+    "Open omap iterators count");
+  b.add_u64_counter(l_bluestore_omap_rmkeys_count, "omap_rmkeys_count",
+    "amount of omap keys removed via rmkeys");
+  b.add_u64_counter(l_bluestore_omap_rmkey_ranges_count, "omap_rmkey_range_count",
+    "amount of omap key ranges removed via rmkeys");
+  //****************************************
+  // other client ops latencies
+  //****************************************
+  b.add_time_avg(l_bluestore_omap_seek_to_first_lat, "omap_seek_to_first_lat",
+    "Average omap iterator seek_to_first call latency",
+    "osfl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_upper_bound_lat, "omap_upper_bound_lat",
+    "Average omap iterator upper_bound call latency",
+    "oubl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_lower_bound_lat, "omap_lower_bound_lat",
+    "Average omap iterator lower_bound call latency",
+    "olbl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_next_lat, "omap_next_lat",
+    "Average omap iterator next call latency",
+    "onxl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_get_keys_lat, "omap_get_keys_lat",
+    "Average omap get_keys call latency",
+    "ogkl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_get_values_lat, "omap_get_values_lat",
+    "Average omap get_values call latency",
+    "ogvl", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_omap_clear_lat, "omap_clear_lat",
+    "Average omap clear call latency");
+  b.add_time_avg(l_bluestore_clist_lat, "clist_lat",
+    "Average collection listing latency",
+    "cl_l", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_remove_lat, "remove_lat",
+    "Average removal latency",
+    "rm_l", PerfCountersBuilder::PRIO_USEFUL);
+  b.add_time_avg(l_bluestore_truncate_lat, "truncate_lat",
+    "Average truncate latency",
+    "tr_l", PerfCountersBuilder::PRIO_USEFUL);
+  //****************************************
+
+  // Resulting size axis configuration for op histograms, values are in bytes
+  PerfHistogramCommon::axis_config_d alloc_hist_x_axis_config{
+    "Given size (bytes)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+    0,                               ///< Start at 0
+    4096,                            ///< Quantization unit
+    13,                               ///< Enough to cover 4+M requests
+  };
+  // Req size axis configuration for op histograms, values are in bytes
+  PerfHistogramCommon::axis_config_d alloc_hist_y_axis_config{
+    "Request size (bytes)",
+    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
+    0,                               ///< Start at 0
+    4096,                            ///< Quantization unit
+    13,                               ///< Enough to cover 4+M requests
+  };
+  b.add_u64_counter_histogram(
+    l_bluestore_allocate_hist, "allocate_histogram",
+    alloc_hist_x_axis_config, alloc_hist_y_axis_config,
+    "Histogram of requested block allocations vs. given ones");
+
+  logger = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+}
+
+// Refresh the space-usage perf counters from a fresh statfs() snapshot.
+// Returns whatever statfs() returned; the counters are only updated when
+// that value is non-negative.
+int BlueStore::_reload_logger()
+{
+  struct store_statfs_t stats;
+  const int rc = statfs(&stats);
+  if (rc < 0) {
+    return rc;
+  }
+  logger->set(l_bluestore_allocated, stats.allocated);
+  logger->set(l_bluestore_stored, stats.data_stored);
+  logger->set(l_bluestore_compressed, stats.data_compressed);
+  logger->set(l_bluestore_compressed_allocated, stats.data_compressed_allocated);
+  logger->set(l_bluestore_compressed_original, stats.data_compressed_original);
+  return rc;
+}
+
+// Unregister the perf counters from the context's collection, then free them.
+void BlueStore::_shutdown_logger()
+{
+  auto* collection = cct->get_perfcounters_collection();
+  collection->remove(logger);
+  delete logger;
+}
+
+// Read the bdev label stored at `path` and hand back its osd_uuid via *fsid.
+// Returns 0 on success, or the negative error reported by _read_bdev_label()
+// (in which case *fsid is left untouched).
+int BlueStore::get_block_device_fsid(CephContext* cct, const string& path,
+				     uuid_d *fsid)
+{
+  bluestore_bdev_label_t bdev_label;
+  const int rc = _read_bdev_label(cct, path, &bdev_label);
+  if (rc >= 0) {
+    *fsid = bdev_label.osd_uuid;
+    return 0;
+  }
+  return rc;
+}
+
+// Open the store directory (`path`) and keep the descriptor in path_fd.
+// Returns 0 on success or -errno if the directory cannot be opened.
+int BlueStore::_open_path()
+{
+  // the directory must not already be open
+  ceph_assert(path_fd < 0);
+  path_fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_DIRECTORY|O_CLOEXEC));
+  if (path_fd >= 0) {
+    return 0;
+  }
+  int err = -errno;
+  derr << __func__ << " unable to open " << path << ": " << cpp_strerror(err)
+       << dendl;
+  return err;
+}
+
+// Close the store directory descriptor and mark it as not open (-1).
+void BlueStore::_close_path()
+{
+  const int fd = path_fd;
+  path_fd = -1;
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+// Serialize `label`, append its crc32c, zero-pad the result to
+// BDEV_LABEL_BLOCK_SIZE and write it to `path`, followed by an fsync.
+//
+// The file is opened with O_DIRECT, hence the buffer is rebuilt with
+// BDEV_LABEL_BLOCK_SIZE size/memory alignment before writing.
+//
+// Returns 0 on success or a negative errno value describing the failed
+// open/write/fsync step.
+int BlueStore::_write_bdev_label(CephContext *cct,
+				 const string &path, bluestore_bdev_label_t label)
+{
+  dout(10) << __func__ << " path " << path << " label " << label << dendl;
+  bufferlist bl;
+  encode(label, bl);
+  // the checksum covers the encoded label only and is stored right after it
+  uint32_t crc = bl.crc32c(-1);
+  encode(crc, bl);
+  ceph_assert(bl.length() <= BDEV_LABEL_BLOCK_SIZE);
+  // pad with zeros up to the full label block
+  bufferptr z(BDEV_LABEL_BLOCK_SIZE - bl.length());
+  z.zero();
+  bl.append(std::move(z));
+
+  int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_WRONLY|O_CLOEXEC|O_DIRECT));
+  if (fd < 0) {
+    fd = -errno;
+    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(fd)
+	 << dendl;
+    return fd;
+  }
+  bl.rebuild_aligned_size_and_memory(BDEV_LABEL_BLOCK_SIZE, BDEV_LABEL_BLOCK_SIZE, IOV_MAX);
+  int r = bl.write_fd(fd);
+  if (r < 0) {
+    derr << __func__ << " failed to write to " << path
+	 << ": " << cpp_strerror(r) << dendl;
+  } else {
+    r = ::fsync(fd);
+    if (r < 0) {
+      // fsync() only returns -1; the actual cause is in errno
+      r = -errno;
+      derr << __func__ << " failed to fsync " << path
+	   << ": " << cpp_strerror(r) << dendl;
+    }
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return r;
+}
+
+// Read the first BDEV_LABEL_BLOCK_SIZE bytes of `path`, decode them into
+// *label and verify the crc32c that follows the encoded label.
+//
+// Returns 0 on success, -errno / the read error if the file cannot be
+// opened or read, -ENOENT if the label cannot be decoded and -EIO on a
+// checksum mismatch.
+int BlueStore::_read_bdev_label(CephContext* cct, const string &path,
+				bluestore_bdev_label_t *label)
+{
+  dout(10) << __func__ << dendl;
+  const int fd = TEMP_FAILURE_RETRY(::open(path.c_str(), O_RDONLY|O_CLOEXEC));
+  if (fd < 0) {
+    const int err = -errno;
+    derr << __func__ << " failed to open " << path << ": " << cpp_strerror(err)
+	 << dendl;
+    return err;
+  }
+
+  bufferlist raw;
+  const int rc = raw.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
+  // the descriptor is no longer needed whether or not the read worked
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  if (rc < 0) {
+    derr << __func__ << " failed to read from " << path
+	 << ": " << cpp_strerror(rc) << dendl;
+    return rc;
+  }
+
+  uint32_t actual_crc, stored_crc;
+  auto it = raw.cbegin();
+  try {
+    decode(*label, it);
+    // checksum exactly the bytes consumed by the label itself
+    bufferlist covered;
+    covered.substr_of(raw, 0, it.get_off());
+    actual_crc = covered.crc32c(-1);
+    decode(stored_crc, it);
+  }
+  catch (ceph::buffer::error& e) {
+    derr << __func__ << " unable to decode label at offset " << it.get_off()
+	 << ": " << e.what()
+	 << dendl;
+    return -ENOENT;
+  }
+  if (actual_crc != stored_crc) {
+    derr << __func__ << " bad crc on label, expected " << stored_crc
+	 << " != actual " << actual_crc << dendl;
+    return -EIO;
+  }
+  dout(10) << __func__ << " got " << *label << dendl;
+  return 0;
+}
+
+// In create mode: write a fresh label (our fsid, `size`, current time,
+// `desc`) to the device at `path`.
+// Otherwise: read the existing label and make sure its osd_uuid matches
+// our fsid, unless bluestore_debug_permit_any_bdev_label bypasses the check.
+//
+// Returns 0 on success, the read/write error, or -EIO on fsid mismatch.
+int BlueStore::_check_or_set_bdev_label(
+  string path, uint64_t size, string desc, bool create)
+{
+  bluestore_bdev_label_t label;
+
+  if (!create) {
+    const int rc = _read_bdev_label(cct, path, &label);
+    if (rc < 0) {
+      return rc;
+    }
+    if (cct->_conf->bluestore_debug_permit_any_bdev_label) {
+      dout(20) << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+	   << " and fsid " << fsid << " check bypassed" << dendl;
+      return 0;
+    }
+    if (label.osd_uuid != fsid) {
+      derr << __func__ << " bdev " << path << " fsid " << label.osd_uuid
+	   << " does not match our fsid " << fsid << dendl;
+      return -EIO;
+    }
+    return 0;
+  }
+
+  label.osd_uuid = fsid;
+  label.size = size;
+  label.btime = ceph_clock_now();
+  label.description = desc;
+  const int rc = _write_bdev_label(cct, path, label);
+  return rc < 0 ? rc : 0;
+}
+
+// Derive max_alloc_size, prefer_deferred_size and deferred_batch_ops from
+// the configuration.  For the latter two an explicit (non-zero) generic
+// setting wins; otherwise the hdd/ssd specific value is picked according to
+// _use_rotational_settings().  With HAVE_LIBZBD, SMR devices always get
+// prefer_deferred_size = 0.
+void BlueStore::_set_alloc_sizes(void)
+{
+  max_alloc_size = cct->_conf->bluestore_max_alloc_size;
+
+  bool smr_device = false;
+#ifdef HAVE_LIBZBD
+  ceph_assert(bdev);
+  smr_device = bdev->is_smr();
+#endif
+
+  if (smr_device) {
+    prefer_deferred_size = 0;
+  } else if (cct->_conf->bluestore_prefer_deferred_size) {
+    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size;
+  } else if (_use_rotational_settings()) {
+    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_hdd;
+  } else {
+    prefer_deferred_size = cct->_conf->bluestore_prefer_deferred_size_ssd;
+  }
+
+  if (cct->_conf->bluestore_deferred_batch_ops) {
+    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops;
+  } else if (_use_rotational_settings()) {
+    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_hdd;
+  } else {
+    deferred_batch_ops = cct->_conf->bluestore_deferred_batch_ops_ssd;
+  }
+
+  dout(10) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+	   << std::dec << " order " << (int)min_alloc_size_order
+	   << " max_alloc_size 0x" << std::hex << max_alloc_size
+	   << " prefer_deferred_size 0x" << prefer_deferred_size
+	   << std::dec
+	   << " deferred_batch_ops " << deferred_batch_ops
+	   << dendl;
+}
+
+// Create and open the main block device (<path>/block), optionally discard
+// it when creating a new store, check or write its label, and initialize
+// the block-size related members, cache sizes and optimal io size.
+//
+// Returns 0 on success.  On any failure the device is closed (if it was
+// opened), deleted, bdev is reset to NULL and the negative error returned.
+int BlueStore::_open_bdev(bool create)
+{
+  ceph_assert(bdev == NULL);
+  const string dev_path = path + "/block";
+  bdev = BlockDevice::create(cct, dev_path, aio_cb, static_cast<void*>(this), discard_cb, static_cast<void*>(this));
+
+  // shared failure epilogue: tear the device down and propagate the error
+  auto bail = [this](int err, bool opened) {
+    if (opened) {
+      bdev->close();
+    }
+    delete bdev;
+    bdev = NULL;
+    return err;
+  };
+
+  int r = bdev->open(dev_path);
+  if (r < 0) {
+    return bail(r, false);
+  }
+
+  if (create && cct->_conf->bdev_enable_discard) {
+    interval_set<uint64_t> whole_device;
+    whole_device.insert(0, bdev->get_size());
+    bdev->try_discard(whole_device, false);
+  }
+
+  if (bdev->supported_bdev_label()) {
+    r = _check_or_set_bdev_label(dev_path, bdev->get_size(), "main", create);
+    if (r < 0) {
+      return bail(r, true);
+    }
+  }
+
+  // initialize global block parameters
+  block_size = bdev->get_block_size();
+  block_mask = ~(block_size - 1);
+  block_size_order = std::countr_zero(block_size);
+  // block size is required to be a power of two
+  ceph_assert(block_size == 1u << block_size_order);
+  _set_max_defer_interval();
+
+  // and set cache_size based on device type
+  r = _set_cache_sizes();
+  if (r < 0) {
+    return bail(r, true);
+  }
+
+  // get block dev optimal io size
+  optimal_io_size = bdev->get_optimal_io_size();
+  return 0;
+}
+
+// Sanity check: the opened device must be larger than the on-disk
+// reserved area.
+void BlueStore::_validate_bdev()
+{
+  ceph_assert(bdev);
+  const uint64_t reserved = _get_ondisk_reserved();
+  ceph_assert(bdev->get_size() > reserved);
+}
+
+// Close and destroy the main block device; bdev is NULL afterwards.
+void BlueStore::_close_bdev()
+{
+  ceph_assert(bdev);
+  auto* dev = bdev;
+  dev->close();
+  delete dev;
+  bdev = NULL;
+}
+
+// Create the freelist manager (fm) and either initialize a brand new
+// freelist (when a transaction `t` is supplied) or load the existing one.
+//
+//  t          - non-null selects "create" mode: the freelist type is
+//               recorded under PREFIX_SUPER and the initial allocations are
+//               staged in this transaction.
+//  read_only  - passed through to fm->init(); also disables the null
+//               freelist manager.
+//  db_avail   - must be true for the null freelist manager to be used.
+//  fm_restore - transition from null-fm to a real fm: the whole device is
+//               marked allocated here (per the comment below, free space is
+//               added later from the allocator file).
+//
+// Returns 0 on success, -EINVAL on an SMR / freelist-type mismatch, or the
+// error returned by fm->init().  On those failures fm is deleted and reset
+// to NULL so that the function may be called again.
+int BlueStore::_open_fm(KeyValueDB::Transaction t,
+                        bool read_only,
+                        bool db_avail,
+                        bool fm_restore)
+{
+  int r;
+
+  dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
+  ceph_assert(fm == NULL);
+  // fm_restore means we are transitioning from null-fm to bitmap-fm
+  ceph_assert(!fm_restore || (freelist_type != "null"));
+  // fm restore must pass in a valid transaction
+  ceph_assert(!fm_restore || (t != nullptr));
+
+  // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+  bool can_have_null_fm = !is_db_rotational() &&
+                          !read_only &&
+                          db_avail &&
+                          cct->_conf->bluestore_allocation_from_file &&
+                          !bdev->is_smr();
+
+  // When allocation-info is stored in a single file we set freelist_type to "null"
+  if (can_have_null_fm) {
+    freelist_type = "null";
+    need_to_destage_allocation_file = true;
+  }
+  fm = FreelistManager::create(cct, freelist_type, PREFIX_ALLOC);
+  ceph_assert(fm);
+  if (t) {
+    // create mode. initialize freespace
+    dout(20) << __func__ << " initializing freespace" << dendl;
+    {
+      bufferlist bl;
+      bl.append(freelist_type);
+      t->set(PREFIX_SUPER, "freelist_type", bl);
+    }
+    // being able to allocate in units less than bdev block size
+    // seems to be a bad idea.
+    ceph_assert(cct->_conf->bdev_block_size <= min_alloc_size);
+
+    uint64_t alloc_size = min_alloc_size;
+    if (bdev->is_smr() && freelist_type != "zoned") {
+      derr << "SMR device but freelist_type = " << freelist_type << " (not zoned)"
+           << dendl;
+      // do not leave a half-initialized manager behind
+      delete fm;
+      fm = NULL;
+      return -EINVAL;
+    }
+    if (!bdev->is_smr() && freelist_type == "zoned") {
+      derr << "non-SMR device (or SMR support not built-in) but freelist_type = zoned"
+           << dendl;
+      // do not leave a half-initialized manager behind
+      delete fm;
+      fm = NULL;
+      return -EINVAL;
+    }
+
+    fm->create(bdev->get_size(), alloc_size,
+               zone_size, first_sequential_zone,
+               t);
+
+    auto reserved = _get_ondisk_reserved();
+    if (fm_restore) {
+      // we need to allocate the full space in restore case
+      // as later we will add free-space marked in the allocator file
+      fm->allocate(0, bdev->get_size(), t);
+    } else {
+      // allocate superblock reserved space.  note that we do not mark
+      // bluefs space as allocated in the freelist; we instead rely on
+      // bluefs doing that itself.
+      fm->allocate(0, reserved, t);
+    }
+    // debug code - not needed for NULL FM
+    if (cct->_conf->bluestore_debug_prefill > 0) {
+      uint64_t end = bdev->get_size() - reserved;
+      dout(1) << __func__ << " pre-fragmenting freespace, using "
+              << cct->_conf->bluestore_debug_prefill << " with max free extent "
+              << cct->_conf->bluestore_debug_prefragment_max << dendl;
+      uint64_t start = p2roundup(reserved, min_alloc_size);
+      uint64_t max_b = cct->_conf->bluestore_debug_prefragment_max / min_alloc_size;
+      // used:free ratio derived from the requested prefill fraction;
+      // deliberately a distinct name so it does not shadow the status `r`
+      float fill_ratio = cct->_conf->bluestore_debug_prefill;
+      fill_ratio /= 1.0 - fill_ratio;
+      bool stop = false;
+
+      // alternate a random-length free extent (l) with a used extent (u)
+      // sized proportionally to it, until the device end is reached
+      while (!stop && start < end) {
+        uint64_t l = (rand() % max_b + 1) * min_alloc_size;
+        if (start + l > end) {
+          l = end - start;
+          l = p2align(l, min_alloc_size);
+        }
+        ceph_assert(start + l <= end);
+
+        uint64_t u = 1 + (uint64_t)(fill_ratio * (double)l);
+        u = p2roundup(u, min_alloc_size);
+        if (start + l + u > end) {
+          u = end - (start + l);
+          // trim to align so we don't overflow again
+          u = p2align(u, min_alloc_size);
+          stop = true;
+        }
+        ceph_assert(start + l + u <= end);
+
+        dout(20) << __func__ << " free 0x" << std::hex << start << "~" << l
+                 << " use 0x" << u << std::dec << dendl;
+
+        if (u == 0) {
+          // break if u has been trimmed to nothing
+          break;
+        }
+
+        fm->allocate(start + l, u, t);
+        start += l + u;
+      }
+    }
+    r = _write_out_fm_meta(0);
+    ceph_assert(r == 0);
+  } else {
+    if (can_have_null_fm) {
+      commit_to_null_manager();
+    }
+    r = fm->init(db, read_only,
+      [&](const std::string& key, std::string* result) {
+        return read_meta(key, result);
+    });
+    if (r < 0) {
+      derr << __func__ << " failed: " << cpp_strerror(r) << dendl;
+      delete fm;
+      fm = NULL;
+      return r;
+    }
+  }
+  // if space size tracked by free list manager is that higher than actual
+  // dev size one can hit out-of-space allocation which will result
+  // in data loss and/or assertions
+  // Probably user altered the device size somehow.
+  // The only fix for now is to redeploy OSD.
+  if (fm->get_size() >= bdev->get_size() + min_alloc_size) {
+    ostringstream ss;
+    ss << "slow device size mismatch detected, "
+       << " fm size(" << fm->get_size()
+       << ") > slow device size(" << bdev->get_size()
+       << "), Please stop using this OSD as it might cause data loss.";
+    _set_disk_size_mismatch_alert(ss.str());
+  }
+  return 0;
+}
+
+// Shut down and destroy the freelist manager; fm is NULL afterwards.
+void BlueStore::_close_fm()
+{
+  dout(10) << __func__ << dendl;
+  ceph_assert(fm);
+  auto* manager = fm;
+  manager->shutdown();
+  delete manager;
+  fm = NULL;
+}
+
+// Ask the freelist manager for its metadata key/value pairs (for the given
+// target_size) and persist each of them via write_meta().
+//
+// Every write is asserted to succeed, so the function returns 0 (also when
+// there is nothing to write).
+int BlueStore::_write_out_fm_meta(uint64_t target_size)
+{
+  int r = 0;
+
+  std::vector<std::pair<string, string>> fm_meta;
+  fm->get_meta(target_size, &fm_meta);
+
+  for (const auto& [key, value] : fm_meta) {
+    r = write_meta(key, value);
+    ceph_assert(r == 0);
+  }
+  return r;
+}
+
+// Instantiate the main allocator (alloc) and the allocator shared with
+// BlueFS (shared_alloc).
+//
+// The allocator type comes from bluestore_allocator, except that with
+// HAVE_LIBZBD a "zoned" freelist forces the "zoned" allocator; in that case
+// a second allocator of the configured type is created for the device's
+// conventional region and used as the shared one.  Otherwise the main
+// allocator itself is shared.
+//
+// Returns 0 on success or -EINVAL if an allocator could not be created;
+// on failure alloc is left NULL.
+int BlueStore::_create_alloc()
+{
+  ceph_assert(alloc == NULL);
+  ceph_assert(shared_alloc.a == NULL);
+  ceph_assert(bdev->get_size());
+
+  uint64_t alloc_size = min_alloc_size;
+
+  std::string allocator_type = cct->_conf->bluestore_allocator;
+
+#ifdef HAVE_LIBZBD
+  if (freelist_type == "zoned") {
+    allocator_type = "zoned";
+  }
+#endif
+
+  alloc = Allocator::create(
+    cct, allocator_type,
+    bdev->get_size(),
+    alloc_size,
+    zone_size,
+    first_sequential_zone,
+    "block");
+  if (!alloc) {
+    lderr(cct) << __func__ << " failed to create " << allocator_type << " allocator"
+	       << dendl;
+    return -EINVAL;
+  }
+
+#ifdef HAVE_LIBZBD
+  if (freelist_type == "zoned") {
+    Allocator *a = Allocator::create(
+      cct, cct->_conf->bluestore_allocator,
+      bdev->get_conventional_region_size(),
+      alloc_size,
+      0, 0,
+      "zoned_block");
+    if (!a) {
+      lderr(cct) << __func__ << " failed to create " << cct->_conf->bluestore_allocator
+		 << " allocator" << dendl;
+      delete alloc;
+      // never leave the member pointing at freed memory
+      alloc = nullptr;
+      return -EINVAL;
+    }
+    shared_alloc.set(a, alloc_size);
+  } else
+#endif
+  {
+    // BlueFS will share the same allocator
+    shared_alloc.set(alloc, alloc_size);
+  }
+
+  return 0;
+}
+
+// Create the allocator(s) via _create_alloc() and populate them with the
+// current free space.
+//
+// Three sources are handled:
+//  - SMR devices (HAVE_LIBZBD only): zone states kept by the zoned freelist
+//    manager are reconciled with the device write pointers.  Where the
+//    device is ahead, the gap is recorded in *zone_adjustments (keyed by the
+//    old bluestore write pointer, value = size of the gap) and accounted as
+//    dead bytes.  zone_adjustments is only dereferenced on this path.
+//  - a regular freelist manager: free extents are enumerated from the DB.
+//  - the null freelist manager: the allocation map is restored from the
+//    allocator file, falling back to a full recovery from onodes.
+//
+// Returns 0 on success, the _create_alloc() error, -ENOTSUP if the null
+// manager is active while bluestore_allocation_from_file is off, or
+// -ENOTRECOVERABLE if the fallback recovery fails.
+int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
+{
+  int r = _create_alloc();
+  if (r < 0) {
+    return r;
+  }
+  ceph_assert(alloc != NULL);
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    auto a = dynamic_cast<ZonedAllocator*>(alloc);
+    ceph_assert(a);
+    auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+    ceph_assert(f);
+    // wp: write pointers reported by the device; zones: state stored by bluestore
+    vector<uint64_t> wp = bdev->get_zones();
+    vector<zone_state_t> zones = f->get_zone_states(db);
+    ceph_assert(wp.size() == zones.size());
+
+    // reconcile zone state
+    auto num_zones = bdev->get_size() / zone_size;
+    for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+      ceph_assert(wp[i] >= i * zone_size);
+      ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
+      // p: device write pointer as an offset within zone i
+      uint64_t p = wp[i] - i * zone_size;
+      if (zones[i].write_pointer > p) {
+	// only reported; the stored state is left as is
+	derr << __func__ << " zone 0x" << std::hex << i
+	     << " bluestore write pointer 0x" << zones[i].write_pointer
+	     << " > device write pointer 0x" << p
+	     << std::dec << " -- VERY SUSPICIOUS!" << dendl;
+      } else if (zones[i].write_pointer < p) {
+	// this is "normal" in that it can happen after any crash (if we have a
+	// write in flight but did not manage to commit the transaction)
+	auto delta = p - zones[i].write_pointer;
+	dout(1) << __func__ << " zone 0x" << std::hex << i
+		 << " device write pointer 0x" << p
+		 << " > bluestore pointer 0x" << zones[i].write_pointer
+		 << ", advancing 0x" << delta << std::dec << dendl;
+	(*zone_adjustments)[zones[i].write_pointer] = delta;
+	zones[i].num_dead_bytes += delta;
+	zones[i].write_pointer = p;
+      }
+    }
+
+    // start with conventional zone "free" (bluefs may adjust this when it starts up)
+    auto reserved = _get_ondisk_reserved();
+    // for now we require a conventional zone
+    ceph_assert(bdev->get_conventional_region_size());
+    ceph_assert(shared_alloc.a != alloc);  // zoned allocator doesn't use conventional region
+    shared_alloc.a->init_add_free(
+      reserved,
+      p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved);
+
+    // init sequential zone based on the device's write pointers
+    a->init_from_zone_pointers(std::move(zones));
+    dout(1) << __func__
+	    << " loaded zone pointers: "
+	    << std::hex
+	    << ", allocator type " << alloc->get_type()
+	    << ", capacity 0x" << alloc->get_capacity()
+	    << ", block size 0x" << alloc->get_block_size()
+	    << ", free 0x" << alloc->get_free()
+	    << ", fragmentation " << alloc->get_fragmentation()
+	    << std::dec << dendl;
+
+    return 0;
+  }
+#endif
+
+  // num/bytes: count and total length of the free extents fed to the allocator
+  uint64_t num = 0, bytes = 0;
+  utime_t start_time = ceph_clock_now();
+  if (!fm->is_null_manager()) {
+    // This is the original path - loading allocation map from RocksDB and feeding into the allocator
+    dout(5) << __func__ << "::NCB::loading allocation from FM -> alloc" << dendl;
+    // initialize from freelist
+    fm->enumerate_reset();
+    uint64_t offset, length;
+    while (fm->enumerate_next(db, &offset, &length)) {
+      alloc->init_add_free(offset, length);
+      ++num;
+      bytes += length;
+    }
+    fm->enumerate_reset();
+
+    utime_t duration = ceph_clock_now() - start_time;
+    dout(5) << __func__ << "::num_entries=" << num << " free_size=" << bytes << " alloc_size=" <<
+      alloc->get_capacity() - bytes << " time=" << duration << " seconds" << dendl;
+  } else {
+    // This is the new path reading the allocation map from a flat bluefs file and feeding them into the allocator
+
+    if (!cct->_conf->bluestore_allocation_from_file) {
+      derr << __func__ << "::NCB::cct->_conf->bluestore_allocation_from_file is set to FALSE with an active NULL-FM" << dendl;
+      derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
+      return -ENOTSUP; // Operation not supported
+    }
+    if (restore_allocator(alloc, &num, &bytes) == 0) {
+      dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
+    } else {
+      // This must mean that we had an unplanned shutdown and didn't manage to destage the allocator
+      dout(0) << __func__ << "::NCB::restore_allocator() failed! Run Full Recovery from ONodes (might take a while) ..." << dendl;
+      // if failed must recover from on-disk ONode internal state
+      if (read_allocation_from_drive_on_startup() != 0) {
+	derr << __func__ << "::NCB::Failed Recovery" << dendl;
+	derr << __func__ << "::NCB::Ceph-OSD won't start, make sure your drives are connected and readable" << dendl;
+	derr << __func__ << "::NCB::If no HW fault is found, please report failure and consider redeploying OSD" << dendl;
+	return -ENOTRECOVERABLE;
+      }
+    }
+  }
+  dout(1) << __func__
+          << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
+          << std::hex
+          << ", allocator type " << alloc->get_type()
+          << ", capacity 0x" << alloc->get_capacity()
+          << ", block size 0x" << alloc->get_block_size()
+          << ", free 0x" << alloc->get_free()
+          << ", fragmentation " << alloc->get_fragmentation()
+          << std::dec << dendl;
+
+  return 0;
+}
+
+// Follow-up work once the allocator has been loaded.
+//  - SMR devices (HAVE_LIBZBD only): apply the given zone adjustments to the
+//    zoned freelist in one synchronous transaction; nothing is done if the
+//    map is empty.
+//  - null freelist manager: invalidate the allocation file on bluefs.
+// Any failure trips an assertion.
+void BlueStore::_post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments)
+{
+  int rc = 0;
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    if (zone_adjustments.empty()) {
+      return;
+    }
+    dout(1) << __func__ << " adjusting freelist based on device write pointers" << dendl;
+    auto zoned_fm = dynamic_cast<ZonedFreelistManager*>(fm);
+    ceph_assert(zoned_fm);
+    KeyValueDB::Transaction txn = db->get_transaction();
+    for (const auto& [offset, length] : zone_adjustments) {
+      // allocate AND release since this gap is now dead space
+      // note that the offset is imprecise, but only need to select the zone
+      zoned_fm->allocate(offset, length, txn);
+      zoned_fm->release(offset, length, txn);
+    }
+    rc = db->submit_transaction_sync(txn);
+  } else
+#endif
+  if (fm->is_null_manager()) {
+    // The allocation map is now loaded; later alloc/release operations are
+    // not written to the file inline, they are only stored on umount().
+    // The existing file therefore must not be trusted after an unplanned
+    // shutdown - recovery has to come from RocksDB::ONodes instead - so
+    // invalidate it right away.
+    rc = invalidate_allocation_file_on_bluefs();
+  }
+  ceph_assert(rc >= 0);
+}
+
+// Drain pending discards, then shut down and destroy the main allocator
+// and - if it is a distinct object - the allocator shared with BlueFS.
+// Afterwards shared_alloc is reset and alloc is nullptr.
+void BlueStore::_close_alloc()
+{
+  ceph_assert(bdev);
+  bdev->discard_drain();
+
+  ceph_assert(alloc);
+  // decide this while `alloc` is still a valid pointer; comparing it after
+  // the delete below would read an invalid pointer value
+  const bool separate_shared_alloc = (alloc != shared_alloc.a);
+  alloc->shutdown();
+  delete alloc;
+
+  ceph_assert(shared_alloc.a);
+  if (separate_shared_alloc) {
+    shared_alloc.a->shutdown();
+    delete shared_alloc.a;
+  }
+
+  shared_alloc.reset();
+  alloc = nullptr;
+}
+
+// Open the "fsid" file inside the store directory (relative to path_fd)
+// read-write, creating it with mode 0644 when `create` is set.
+// Returns 0 on success or -errno.
+int BlueStore::_open_fsid(bool create)
+{
+  ceph_assert(fsid_fd < 0);
+  const int flags = create ? (O_RDWR|O_CLOEXEC|O_CREAT) : (O_RDWR|O_CLOEXEC);
+  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+  if (fsid_fd >= 0) {
+    return 0;
+  }
+  const int err = -errno;
+  derr << __func__ << " " << cpp_strerror(err) << dendl;
+  return err;
+}
+
+// Read the textual uuid from the already opened fsid file and parse it
+// into *uuid.  Returns 0 on success, the read error, or -EINVAL if the
+// content is not a valid uuid.
+int BlueStore::_read_fsid(uuid_d *uuid)
+{
+  char text[40] = {0};
+  const int nread = safe_read(fsid_fd, text, sizeof(text));
+  if (nread < 0) {
+    derr << __func__ << " failed: " << cpp_strerror(nread) << dendl;
+    return nread;
+  }
+  // keep at most the first 36 characters; anything after them is cut off
+  const int terminator = (nread > 36) ? 36 : nread;
+  text[terminator] = 0;
+  if (uuid->parse(text)) {
+    return 0;
+  }
+  derr << __func__ << " unparsable uuid " << text << dendl;
+  return -EINVAL;
+}
+
+// Replace the content of the fsid file with our fsid (plus a newline) and
+// fsync it.  Returns 0 on success or the negative error of the failing
+// truncate/write/fsync step.
+int BlueStore::_write_fsid()
+{
+  if (::ftruncate(fsid_fd, 0) < 0) {
+    const int err = -errno;
+    derr << __func__ << " fsid truncate failed: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+
+  const string text = stringify(fsid) + "\n";
+  const int wr = safe_write(fsid_fd, text.c_str(), text.length());
+  if (wr < 0) {
+    derr << __func__ << " fsid write failed: " << cpp_strerror(wr) << dendl;
+    return wr;
+  }
+
+  if (::fsync(fsid_fd) < 0) {
+    const int err = -errno;
+    derr << __func__ << " fsid fsync failed: " << cpp_strerror(err) << dendl;
+    return err;
+  }
+  return 0;
+}
+
+// Close the fsid file descriptor and mark it as not open (-1).
+void BlueStore::_close_fsid()
+{
+  const int fd = fsid_fd;
+  fsid_fd = -1;
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+}
+
+// Try to take a non-blocking write lock (fcntl F_SETLK) on the fsid file.
+// Returns 0 if the lock was acquired, -errno otherwise.
+int BlueStore::_lock_fsid()
+{
+  struct flock lock_req;
+  memset(&lock_req, 0, sizeof(lock_req));
+  lock_req.l_type = F_WRLCK;
+  lock_req.l_whence = SEEK_SET;
+  if (::fcntl(fsid_fd, F_SETLK, &lock_req) >= 0) {
+    return 0;
+  }
+  const int err = errno;
+  derr << __func__ << " failed to lock " << path << "/fsid"
+       << " (is another ceph-osd still running?)"
+       << cpp_strerror(err) << dendl;
+  return -err;
+}
+
+// Report whether the main block device is rotational.
+//
+// If the device is already open it is asked directly.  Otherwise the store
+// is opened just far enough (path, fsid, fsid lock, bdev) to ask the device
+// and everything is closed again.  If any of these steps fails the answer
+// defaults to true.  Note that this reads the on-disk fsid into the `fsid`
+// member.
+bool BlueStore::is_rotational()
+{
+  if (bdev) {
+    return bdev->is_rotational();
+  }
+
+  bool rotational = true;
+  if (_open_path() < 0) {
+    return rotational;
+  }
+  if (_open_fsid(false) >= 0) {
+    // short-circuit evaluation keeps the read -> lock -> open order
+    if (_read_fsid(&fsid) >= 0 &&
+        _lock_fsid() >= 0 &&
+        _open_bdev(false) >= 0) {
+      rotational = bdev->is_rotational();
+      _close_bdev();
+    }
+    _close_fsid();
+  }
+  _close_path();
+  return rotational;
+}
+
+// Report whether the WAL device is rotational, as told by bluefs; without
+// bluefs fall back to the main store media type.
+bool BlueStore::is_journal_rotational()
+{
+  if (bluefs) {
+    dout(10) << __func__ << " " << (int)bluefs->wal_is_rotational() << dendl;
+    return bluefs->wal_is_rotational();
+  }
+  dout(5) << __func__ << " bluefs disabled, default to store media type"
+          << dendl;
+  return is_rotational();
+}
+
+// Report whether the DB device is rotational, as told by bluefs; without
+// bluefs fall back to the main store media type.
+bool BlueStore::is_db_rotational()
+{
+  if (bluefs) {
+    dout(10) << __func__ << " " << (int)bluefs->db_is_rotational() << dendl;
+    return bluefs->db_is_rotational();
+  }
+  dout(5) << __func__ << " bluefs disabled, default to store media type"
+          << dendl;
+  return is_rotational();
+}
+
+// Decide whether hdd (true) or ssd (false) tunables apply.  The
+// bluestore_debug_enforce_settings option ("hdd" / "ssd") overrides the
+// device's own answer.
+bool BlueStore::_use_rotational_settings()
+{
+  const auto& enforced = cct->_conf->bluestore_debug_enforce_settings;
+  if (enforced == "hdd") {
+    return true;
+  } else if (enforced == "ssd") {
+    return false;
+  } else {
+    return bdev->is_rotational();
+  }
+}
+
+// statfs is considered recoverable exactly when the null freelist manager
+// is in use (the fm state doubles as the indicator for now).
+bool BlueStore::is_statfs_recoverable() const
+{
+  const bool recoverable = has_null_manager();
+  return recoverable;
+}
+
+// Probe whether the store is in use by somebody else.
+//
+// Most error conditions mean the mount is not in use (e.g. because it does
+// not exist); only a failure to take the fsid lock is reported as "in use".
+bool BlueStore::test_mount_in_use()
+{
+  if (_open_path() < 0) {
+    return false;
+  }
+
+  bool in_use = false;
+  if (_open_fsid(false) >= 0) {
+    // if we can't lock, it is in use
+    in_use = (_lock_fsid() < 0);
+    _close_fsid();
+  }
+  _close_path();
+  return in_use;
+}
+
+// Allocate the BlueFS instance and attach the block devices found under
+// `path` ("block.db", "block" and "block.wal"). BlueFS is not mounted here.
+//
+// bluefs_layout is filled in along the way:
+//  - block.db present  -> dedicated_db = true, shared device is BDEV_SLOW
+//  - block.db absent   -> shared device is BDEV_DB
+//  - block.wal present -> dedicated_wal = true
+//
+// A block.db / block.wal path that lstat() can see but stat() cannot
+// (a symlink whose target is unusable) is treated as an error.
+//
+// @param create  forwarded to _check_or_set_bdev_label(); combined with
+//                bdev_enable_discard when adding the dedicated devices.
+// @return 0 on success; a negative errno otherwise, in which case bluefs
+//         has been deleted and reset to nullptr.
+int BlueStore::_minimal_open_bluefs(bool create)
+{
+  int r;
+  bluefs = new BlueFS(cct);
+
+  string bfn;
+  struct stat st;
+
+  bfn = path + "/block.db";
+  if (::stat(bfn.c_str(), &st) == 0) {
+    r = bluefs->add_block_device(
+      BlueFS::BDEV_DB, bfn,
+      create && cct->_conf->bdev_enable_discard,
+      SUPER_RESERVED);
+    if (r < 0) {
+      derr << __func__ << " add block device(" << bfn << ") returned: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
+      r = _check_or_set_bdev_label(
+        bfn,
+        bluefs->get_block_device_size(BlueFS::BDEV_DB),
+        "bluefs db", create);
+      if (r < 0) {
+        derr << __func__
+             << " check block device(" << bfn << ") label returned: "
+             << cpp_strerror(r) << dendl;
+        goto free_bluefs;
+      }
+    }
+    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+    bluefs_layout.dedicated_db = true;
+  } else {
+    // remember why stat() failed before lstat() touches errno
+    r = -errno;
+    if (::lstat(bfn.c_str(), &st) == -1) {
+      // nothing at that path at all: no dedicated db device
+      r = 0;
+      bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+    } else {
+      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+  }
+
+  // shared device
+  bfn = path + "/block";
+  // never trim here
+  r = bluefs->add_block_device(bluefs_layout.shared_bdev, bfn, false,
+                               0, // no need to provide valid 'reserved' for shared dev
+                               &shared_alloc);
+  if (r < 0) {
+    derr << __func__ << " add block device(" << bfn << ") returned: "
+         << cpp_strerror(r) << dendl;
+    goto free_bluefs;
+  }
+
+  bfn = path + "/block.wal";
+  if (::stat(bfn.c_str(), &st) == 0) {
+    r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn,
+                                 create && cct->_conf->bdev_enable_discard,
+                                 BDEV_LABEL_BLOCK_SIZE);
+    if (r < 0) {
+      derr << __func__ << " add block device(" << bfn << ") returned: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
+      r = _check_or_set_bdev_label(
+        bfn,
+        bluefs->get_block_device_size(BlueFS::BDEV_WAL),
+        "bluefs wal", create);
+      if (r < 0) {
+        derr << __func__ << " check block device(" << bfn
+             << ") label returned: " << cpp_strerror(r) << dendl;
+        goto free_bluefs;
+      }
+    }
+
+    bluefs_layout.dedicated_wal = true;
+  } else {
+    // Capture stat()'s errno right away: reading errno after a successful
+    // lstat() would yield an unspecified value.
+    r = -errno;
+    if (::lstat(bfn.c_str(), &st) != -1) {
+      derr << __func__ << " " << bfn << " symlink exists but target unusable: "
+           << cpp_strerror(r) << dendl;
+      goto free_bluefs;
+    }
+    // nothing at that path at all: no dedicated wal device
+    r = 0;
+  }
+  return 0;
+
+free_bluefs:
+  ceph_assert(bluefs);
+  delete bluefs;
+  bluefs = nullptr;
+  return r;
+}
+
+// Open BlueFS: attach its devices via _minimal_open_bluefs(), optionally
+// build a volume selector, run BlueFS mkfs when creating, then mount.
+//
+// A volume selector is built when a dedicated DB device exists (shared
+// device is BDEV_SLOW) or when bluestore_volume_selection_policy is
+// "use_some_extra_enforced" or "fit_to_fast"; otherwise nullptr is handed
+// to BlueFS::set_volume_selector().
+//
+// @param create     run bluefs->mkfs() before mounting.
+// @param read_only  not referenced by this function.
+// @return 0 on success, negative error code otherwise. When opening the
+//         devices or parsing the RocksDB options fails, bluefs is released
+//         and left as nullptr.
+int BlueStore::_open_bluefs(bool create, bool read_only)
+{
+  int r = _minimal_open_bluefs(create);
+  if (r < 0) {
+    return r;
+  }
+  BlueFSVolumeSelector* vselector = nullptr;
+  if (bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW ||
+      cct->_conf->bluestore_volume_selection_policy == "use_some_extra_enforced" ||
+      cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
+
+    // rocksdb options = base options, optionally followed by the annex
+    string options = cct->_conf->bluestore_rocksdb_options;
+    string options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+    if (!options_annex.empty()) {
+      if (!options.empty() &&
+          *options.rbegin() != ',') {
+        options += ',';
+      }
+      options += options_annex;
+    }
+
+    rocksdb::Options rocks_opts;
+    r = RocksDBStore::ParseOptionsFromStringStatic(
+      cct,
+      options,
+      rocks_opts,
+      nullptr);
+    if (r < 0) {
+      // BlueFS was allocated (but not mounted) by _minimal_open_bluefs();
+      // release it so it is neither leaked nor mistaken for a usable
+      // instance by later cleanup code.
+      _minimal_close_bluefs();
+      return r;
+    }
+    if (cct->_conf->bluestore_volume_selection_policy == "fit_to_fast") {
+      vselector = new FitToFastVolumeSelector(
+        bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+        bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+        bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100);
+    } else {
+      double reserved_factor = cct->_conf->bluestore_volume_selection_reserved_factor;
+      vselector =
+        new RocksDBBlueFSVolumeSelector(
+          bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
+          bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
+          bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
+          1024 * 1024 * 1024, //FIXME: set expected l0 size here
+          rocks_opts.max_bytes_for_level_base,
+          rocks_opts.max_bytes_for_level_multiplier,
+          reserved_factor,
+          cct->_conf->bluestore_volume_selection_reserved,
+          cct->_conf->bluestore_volume_selection_policy.find("use_some_extra")
+             == 0);
+    }
+  }
+  if (create) {
+    // NOTE(review): the result of bluefs->mkfs() is not checked here.
+    bluefs->mkfs(fsid, bluefs_layout);
+  }
+  bluefs->set_volume_selector(vselector);
+  r = bluefs->mount();
+  if (r < 0) {
+    // NOTE(review): bluefs stays allocated after a failed mount; whether
+    // it can be safely deleted at this point should be confirmed.
+    derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
+  }
+  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
+  return r;
+}
+
+// Unmount BlueFS (passing along db_was_opened_read_only) and then
+// destroy the instance via _minimal_close_bluefs().
+// bluefs is dereferenced unconditionally, so it must be non-null.
+void BlueStore::_close_bluefs()
+{
+  bluefs->umount(db_was_opened_read_only);
+  _minimal_close_bluefs();
+}
+
+// Destroy the BlueFS instance without unmounting it and clear the pointer.
+void BlueStore::_minimal_close_bluefs()
+{
+  delete bluefs;
+  bluefs = nullptr;
+}
+
+// Determine whether this store uses BlueFS.
+//
+// @param create  when true the answer comes from the bluestore_bluefs
+//                config option; otherwise from the "bluefs" meta value,
+//                which must be exactly "0" or "1".
+// @param ret     receives the answer on success.
+// @return 0 on success, -EIO if the meta value cannot be read or is
+//         neither "0" nor "1".
+int BlueStore::_is_bluefs(bool create, bool* ret)
+{
+  if (create) {
+    *ret = cct->_conf->bluestore_bluefs;
+    return 0;
+  }
+
+  string s;
+  if (read_meta("bluefs", &s) < 0) {
+    derr << __func__ << " unable to read 'bluefs' meta" << dendl;
+    return -EIO;
+  }
+  if (s != "1" && s != "0") {
+    derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+         << dendl;
+    return -EIO;
+  }
+  *ret = (s == "1");
+  return 0;
+}
+
+/*
+* Opens the DB together with the pieces that depend on it (super meta,
+* FreelistManager and allocator) in the proper order.
+*
+* The DB is opened twice: first read-only, so the freelist can be read and
+* the allocator initialised, then closed and re-opened in the requested
+* mode.
+*
+* read_only: mode for the second (final) DB open.
+* to_repair: forwarded to the second _open_db() call as to_repair_db.
+*
+* Returns 0 on success. On failure returns a negative error code after
+* releasing, in reverse order, everything that had been opened so far.
+*/
+int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
+{
+  dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
+  {
+    string type;
+    int r = read_meta("type", &type);
+    if (r < 0) {
+      derr << __func__ << " failed to load os-type: " << cpp_strerror(r)
+        << dendl;
+      return r;
+    }
+
+    if (type != "bluestore") {
+      derr << __func__ << " expected bluestore, but type is " << type << dendl;
+      return -EIO;
+    }
+  }
+
+  // SMR devices may require a freelist adjustment, but that can only happen after
+  // the db is read-write. we'll stash pending changes here.
+  std::map<uint64_t, uint64_t> zone_adjustments;
+
+  int r = _open_path();
+  if (r < 0)
+    return r;
+  r = _open_fsid(false);
+  if (r < 0)
+    goto out_path;
+
+  r = _read_fsid(&fsid);
+  if (r < 0)
+    goto out_fsid;
+
+  r = _lock_fsid();
+  if (r < 0)
+    goto out_fsid;
+
+  r = _open_bdev(false);
+  if (r < 0)
+    goto out_fsid;
+
+  // GBH: can probably skip open_db step in Read-Only mode when operating in NULL-FM mode
+  // (might need to open if failed to restore from file)
+
+  // open in read-only first to read FM list and init allocator
+  // as they might be needed for some BlueFS procedures
+  r = _open_db(false, false, true);
+  if (r < 0)
+    goto out_bdev;
+
+  r = _open_super_meta();
+  if (r < 0) {
+    goto out_db;
+  }
+
+  r = _open_fm(nullptr, true, false);
+  if (r < 0)
+    goto out_db;
+
+  r = _init_alloc(&zone_adjustments);
+  if (r < 0)
+    goto out_fm;
+
+  // Re-open in the proper mode(s).
+
+  // Can't simply bypass second open for read-only mode as we need to
+  // load allocated extents from bluefs into allocator.
+  // And now it's time to do that
+  //
+  _close_db();
+  r = _open_db(false, to_repair, read_only);
+  if (r < 0) {
+    // db is nullptr at this point: it was closed just above and every
+    // failure path of _open_db() leaves it unset.
+    goto out_alloc;
+  }
+
+  if (!read_only) {
+    _post_init_alloc(zone_adjustments);
+  }
+
+  // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+  // we can't change bluestore allocation so no need to invalidate allocation-file
+  if (fm->is_null_manager() && !read_only && !to_repair) {
+    // Now that we load the allocation map we need to invalidate the file as new allocation won't be reflected
+    // Changes to the allocation map (alloc/release) are not updated inline and will only be stored on umount()
+    // This means that we should not use the existing file on failure case (unplanned shutdown) and must resort
+    //  to recovery from RocksDB::ONodes
+    r = invalidate_allocation_file_on_bluefs();
+    if (r != 0) {
+      derr << __func__ << "::NCB::invalidate_allocation_file_on_bluefs() failed!" << dendl;
+      goto out_alloc;
+    }
+  }
+
+  // when function is called in repair mode (to_repair=true) we skip db->open()/create()
+  if (!is_db_rotational() && !read_only && !to_repair && cct->_conf->bluestore_allocation_from_file
+#ifdef HAVE_LIBZBD
+      && !bdev->is_smr()
+#endif
+    ) {
+    dout(5) << __func__ << "::NCB::Commit to Null-Manager" << dendl;
+    commit_to_null_manager();
+    need_to_destage_allocation_file = true;
+    dout(10) << __func__ << "::NCB::need_to_destage_allocation_file was set" << dendl;
+  }
+
+  return 0;
+
+out_alloc:
+  _close_alloc();
+out_fm:
+  _close_fm();
+ out_db:
+  // _close_db() asserts that db is set; after a failed re-open there is
+  // nothing left to close, so skip it in that case.
+  if (db) {
+    _close_db();
+  }
+ out_bdev:
+  _close_bdev();
+ out_fsid:
+  _close_fsid();
+ out_path:
+  _close_path();
+  return r;
+}
+
+// Tear down what _open_db_and_around() set up: close the DB if it is
+// open, then release everything around it via _close_around_db().
+void BlueStore::_close_db_and_around()
+{
+  // _close_db() asserts that db is set, hence the guard
+  if (db) {
+    _close_db();
+  }
+  _close_around_db();
+}
+
+// Release the resources surrounding the DB: BlueFS (only if still set),
+// the freelist manager, the allocator, the block device, the fsid file
+// and the path handle, in that order. The DB itself is not touched.
+void BlueStore::_close_around_db()
+{
+  if (bluefs) {
+    _close_bluefs();
+  }
+  _close_fm();
+  _close_alloc();
+  _close_bdev();
+  _close_fsid();
+  _close_path();
+}
+
+// Open the store far enough to expose its key/value DB to the caller.
+//
+// Sets _kv_only and opens the DB in non-read-only mode through
+// _open_db_and_around().
+//
+// @param pdb        receives the DB pointer on success, nullptr on failure.
+// @param to_repair  forwarded to _open_db_and_around().
+// @return the result of _open_db_and_around().
+int BlueStore::open_db_environment(KeyValueDB **pdb, bool to_repair)
+{
+  _kv_only = true;
+  const int r = _open_db_and_around(false, to_repair);
+  *pdb = (r == 0) ? db : nullptr;
+  return r;
+}
+
+// Counterpart of open_db_environment(): destroy the DB object directly
+// (without going through _close_db()) and release everything around it.
+// Always returns 0.
+int BlueStore::close_db_environment()
+{
+  // deleting a null pointer is a no-op, so no explicit check is needed
+  delete db;
+  db = nullptr;
+  _close_around_db();
+  return 0;
+}
+
+/*
+ * Gets access to the BlueFS instance supporting RocksDB.
+ * Returns the raw member pointer; it is null whenever BlueFS is not open
+ * (the close helpers reset it).
+ */
+BlueFS* BlueStore::get_bluefs() {
+  return bluefs;
+}
+
+// Create the KeyValueDB object (stored in `db`) together with the
+// environment it runs on, without opening the database itself.
+//
+// With BlueFS enabled this opens BlueFS, wraps it in a BlueRocksEnv
+// (optionally mirrored onto the default Env) and derives "db_paths" from
+// the BlueFS volume selector. Without BlueFS the db and wal directories
+// live under `path` on the regular filesystem.
+//
+// @param create       true when creating a new store: backend and bluefs
+//                     choice come from config and directories are created;
+//                     otherwise both are read from the store's meta.
+// @param read_only    forwarded to _open_bluefs().
+// @param _fn          out: directory name handed to the KV backend.
+// @param _kv_backend  out: name of the KV backend in use.
+// @return 0 on success, negative error code otherwise.
+int BlueStore::_prepare_db_environment(bool create, bool read_only,
+                                       std::string* _fn, std::string* _kv_backend)
+{
+  int r;
+  ceph_assert(!db);
+  std::string& fn = *_fn;
+  std::string& kv_backend = *_kv_backend;
+  fn = path + "/db";
+  std::shared_ptr<Int64ArrayMergeOperator> merge_op(new Int64ArrayMergeOperator);
+
+  if (create) {
+    kv_backend = cct->_conf->bluestore_kvbackend;
+  } else {
+    r = read_meta("kv_backend", &kv_backend);
+    if (r < 0) {
+      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
+      return -EIO;
+    }
+  }
+  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+  bool do_bluefs;
+  r = _is_bluefs(create, &do_bluefs);
+  if (r < 0) {
+    return r;
+  }
+  dout(10) << __func__ << " do_bluefs = " << do_bluefs << dendl;
+
+  map<string,string> kv_options;
+  // force separate wal dir for all new deployments.
+  // The option is erased again below when an existing store has no wal
+  // dir, so its presence is what matters. Store a real string: assigning
+  // the integer 1 would pick std::string::operator=(char) and store the
+  // control character '\x01'.
+  kv_options["separate_wal_dir"] = "1";
+  rocksdb::Env *env = nullptr;
+  if (do_bluefs) {
+    dout(10) << __func__ << " initializing bluefs" << dendl;
+    if (kv_backend != "rocksdb") {
+      derr << " backend must be rocksdb to use bluefs" << dendl;
+      return -EINVAL;
+    }
+
+    r = _open_bluefs(create, read_only);
+    if (r < 0) {
+      return r;
+    }
+
+    if (cct->_conf->bluestore_bluefs_env_mirror) {
+      rocksdb::Env* a = new BlueRocksEnv(bluefs);
+      rocksdb::Env* b = rocksdb::Env::Default();
+      if (create) {
+        // NOTE(review): the command line is assembled from `path` without
+        // any quoting and run through the shell; a path containing spaces
+        // or shell metacharacters would misbehave. The exit status is
+        // deliberately ignored.
+        string cmd = "rm -rf " + path + "/db " +
+          path + "/db.slow " +
+          path + "/db.wal";
+        int rc = system(cmd.c_str());
+        (void)rc;
+      }
+      env = new rocksdb::EnvMirror(b, a, false, true);
+    } else {
+      env = new BlueRocksEnv(bluefs);
+
+      // simplify the dir names, too, as "seen" by rocksdb
+      fn = "db";
+    }
+    BlueFSVolumeSelector::paths paths;
+    bluefs->get_vselector_paths(fn, paths);
+
+    {
+      // db_paths is a space separated list of "<first>,<second>" entries
+      ostringstream db_paths;
+      bool first = true;
+      for (auto& p : paths) {
+        if (!first) {
+          db_paths << " ";
+        }
+        first = false;
+        db_paths << p.first << "," << p.second;
+
+      }
+      kv_options["db_paths"] = db_paths.str();
+      dout(1) << __func__ << " set db_paths to " << db_paths.str() << dendl;
+    }
+
+    if (create) {
+      for (auto& p : paths) {
+        env->CreateDir(p.first);
+      }
+      // Selectors don't provide wal path so far hence create explicitly
+      env->CreateDir(fn + ".wal");
+    } else {
+      std::vector<std::string> res;
+      // check for dir presence
+      auto status = env->GetChildren(fn+".wal", &res);
+      if (status.IsNotFound()) {
+        kv_options.erase("separate_wal_dir");
+      }
+    }
+  } else {
+    string walfn = path + "/db.wal";
+
+    if (create) {
+      r = ::mkdir(fn.c_str(), 0755);
+      if (r < 0)
+        r = -errno;
+      // an already existing directory is fine
+      if (r < 0 && r != -EEXIST) {
+        derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+             << dendl;
+        return r;
+      }
+
+      // wal_dir, too!
+      r = ::mkdir(walfn.c_str(), 0755);
+      if (r < 0)
+        r = -errno;
+      if (r < 0 && r != -EEXIST) {
+        derr << __func__ << " failed to create " << walfn
+          << ": " << cpp_strerror(r)
+          << dendl;
+        return r;
+      }
+    } else {
+      struct stat st;
+      r = ::stat(walfn.c_str(), &st);
+      if (r < 0 && errno == ENOENT) {
+        kv_options.erase("separate_wal_dir");
+      }
+    }
+  }
+
+
+  db = KeyValueDB::create(cct,
+                          kv_backend,
+                          fn,
+                          kv_options,
+                          static_cast<void*>(env));
+  if (!db) {
+    derr << __func__ << " error creating db" << dendl;
+    if (bluefs) {
+      _close_bluefs();
+    }
+    // delete env manually here since we can't depend on db to do this
+    // under this case
+    delete env;
+    env = nullptr;
+    return -EIO;
+  }
+
+  FreelistManager::setup_merge_operators(db, freelist_type);
+  db->set_merge_operator(PREFIX_STAT, merge_op);
+  db->set_cache_size(cache_kv_ratio * cache_size);
+  return 0;
+}
+
+// Prepare the DB environment and open (or create) the key/value database.
+//
+// @param create        create a new database; must not be combined with
+//                      read_only.
+// @param to_repair_db  stop after db->init(): the database object exists
+//                      but is neither opened nor created.
+// @param read_only     open the existing database read-only.
+// @return 0 on success, -EIO on any failure. When opening/creating fails
+//         the database is closed again via _close_db().
+int BlueStore::_open_db(bool create, bool to_repair_db, bool read_only)
+{
+  int r;
+  ceph_assert(!(create && read_only));
+  string options;
+  string options_annex;
+  stringstream err;
+  string kv_dir_fn;
+  string kv_backend;
+  std::string sharding_def;
+  // prevent write attempts to BlueFS in case we failed before BlueFS was opened
+  db_was_opened_read_only = true;
+  r = _prepare_db_environment(create, read_only, &kv_dir_fn, &kv_backend);
+  if (r < 0) {
+    // `err` has not been written to yet, so report the error code itself
+    derr << __func__ << " failed to prepare db environment: "
+         << cpp_strerror(r) << dendl;
+    return -EIO;
+  }
+  // if reached here then BlueFS is already opened
+  db_was_opened_read_only = read_only;
+  dout(10) << __func__ << "::db_was_opened_read_only was set to " << read_only << dendl;
+  if (kv_backend == "rocksdb") {
+    // rocksdb options = base options, optionally followed by the annex
+    options = cct->_conf->bluestore_rocksdb_options;
+    options_annex = cct->_conf->bluestore_rocksdb_options_annex;
+    if (!options_annex.empty()) {
+      if (!options.empty() &&
+          *options.rbegin() != ',') {
+        options += ',';
+      }
+      options += options_annex;
+    }
+
+    if (cct->_conf.get_val<bool>("bluestore_rocksdb_cf")) {
+      sharding_def = cct->_conf.get_val<std::string>("bluestore_rocksdb_cfs");
+    }
+  }
+
+  // NOTE(review): the result of db->init() is not checked here.
+  db->init(options);
+  if (to_repair_db)
+    return 0;
+  if (create) {
+    r = db->create_and_open(err, sharding_def);
+  } else {
+    // we pass in cf list here, but it is only used if the db already has
+    // column families created.
+    r = read_only ?
+      db->open_read_only(err, sharding_def) :
+      db->open(err, sharding_def);
+  }
+  if (r) {
+    derr << __func__ << " erroring opening db: " << err.str() << dendl;
+    _close_db();
+    return -EIO;
+  }
+  dout(1) << __func__ << " opened " << kv_backend
+          << " path " << kv_dir_fn << " options " << options << dendl;
+  return 0;
+}
+
+// Close the key/value database and, if it is still set, BlueFS.
+//
+// When the DB was opened writable and need_to_destage_allocation_file is
+// set ("destage"):
+//  - if is_statfs_recoverable(), the in-memory statfs counters are written
+//    to PREFIX_STAT in one synchronous transaction (per-pool keys are
+//    wiped and rewritten, or the single global key is set);
+//  - after the DB object is destroyed, the allocator state is stored via
+//    store_allocator() when fm is a null manager.
+//
+// db must be set on entry.
+void BlueStore::_close_db()
+{
+  dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
+           << " fm=" << fm
+           << " destage_alloc_file=" << need_to_destage_allocation_file
+           << " per_pool=" << per_pool_stat_collection
+           << " pool stats=" << osd_pools.size()
+           << dendl;
+  // check before the first dereference below rather than after it
+  ceph_assert(db);
+  bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
+  if (do_destage && is_statfs_recoverable()) {
+    auto t = db->get_transaction();
+    store_statfs_t s;
+    if (per_pool_stat_collection) {
+      // drop every existing per-pool key, then write the current ones
+      KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+      uint64_t pool_id;
+      for (it->upper_bound(string()); it->valid(); it->next()) {
+        int r = get_key_pool_stat(it->key(), &pool_id);
+        if (r >= 0) {
+          dout(10) << __func__ << " wiping statfs for: " << pool_id << dendl;
+        } else {
+          derr << __func__ << " wiping invalid statfs key: " << it->key() << dendl;
+        }
+        t->rmkey(PREFIX_STAT, it->key());
+      }
+
+      std::lock_guard l(vstatfs_lock);
+      for(auto &p : osd_pools) {
+        string key;
+        get_pool_stat_key(p.first, &key);
+        bufferlist bl;
+        // pools with empty stats are not written back
+        if (!p.second.is_empty()) {
+          p.second.encode(bl);
+          p.second.publish(&s);
+          t->set(PREFIX_STAT, key, bl);
+          dout(10) << __func__ << " persisting: "
+                   << p.first << "->"  << s
+                   << dendl;
+        }
+      }
+    } else {
+      bufferlist bl;
+      {
+        std::lock_guard l(vstatfs_lock);
+        vstatfs.encode(bl);
+        vstatfs.publish(&s);
+      }
+      t->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, bl);
+      dout(10) << __func__ << "persisting: " << s << dendl;
+    }
+    int r = db->submit_transaction_sync(t);
+    ceph_assert(r >= 0);
+    // only claim success once the commit result has been verified
+    dout(10) << __func__ << " statfs persisted." << dendl;
+  }
+  delete db;
+  db = nullptr;
+
+  if (do_destage && fm && fm->is_null_manager()) {
+    int ret = store_allocator(alloc);
+    if (ret != 0) {
+      derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
+    }
+  }
+
+  if (bluefs) {
+    _close_bluefs();
+  }
+}
+
+// Dump the shared allocator state, rate limited by
+// bluestore_bluefs_alloc_failure_dump_interval: nothing is dumped unless
+// the interval is positive and the previously scheduled dump time has
+// been reached. After a dump the next one is scheduled one interval from
+// now.
+void BlueStore::_dump_alloc_on_failure()
+{
+  auto interval = cct->_conf->bluestore_bluefs_alloc_failure_dump_interval;
+  const bool due = interval > 0 &&
+    next_dump_on_bluefs_alloc_failure <= ceph_clock_now();
+  if (!due) {
+    return;
+  }
+  shared_alloc.a->dump();
+  next_dump_on_bluefs_alloc_failure = ceph_clock_now();
+  next_dump_on_bluefs_alloc_failure += interval;
+}
+
+// Load every collection recorded under PREFIX_COLL into coll_map.
+//
+// Does nothing if coll_map is already populated. Keys that do not parse
+// as a collection id are logged and remembered in collections_had_errors;
+// a collection whose cnode fails to decode aborts loading.
+//
+// @return 0 on success, -EIO on a cnode decode failure.
+int BlueStore::_open_collections()
+{
+  if (!coll_map.empty()) {
+    // could be opened from another path
+    dout(20) << __func__ << "::NCB::collections are already opened, nothing to do" << dendl;
+    return 0;
+  }
+
+  dout(10) << __func__ << dendl;
+  collections_had_errors = false;
+  size_t loaded = 0;
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+  for (it->upper_bound(string()); it->valid(); it->next()) {
+    coll_t cid;
+    if (!cid.parse(it->key())) {
+      derr << __func__ << " unrecognized collection " << it->key() << dendl;
+      collections_had_errors = true;
+      continue;
+    }
+
+    // each collection is bound to cache shards selected by hashing its id
+    auto coll = ceph::make_ref<Collection>(
+      this,
+      onode_cache_shards[cid.hash_to_shard(onode_cache_shards.size())],
+      buffer_cache_shards[cid.hash_to_shard(buffer_cache_shards.size())],
+      cid);
+    bufferlist value = it->value();
+    auto bp = value.cbegin();
+    try {
+      decode(coll->cnode, bp);
+    } catch (const ceph::buffer::error&) {
+      derr << __func__ << " failed to decode cnode, key:"
+           << pretty_binary_string(it->key()) << dendl;
+      return -EIO;
+    }
+    dout(20) << __func__ << " opened " << cid << " " << coll
+             << " " << coll->cnode << dendl;
+    _osr_attach(coll.get());
+    coll_map[cid] = coll;
+    ++loaded;
+  }
+  dout(10) << __func__ << " collections loaded: " << loaded
+           <<  dendl;
+  return 0;
+}
+
+// Re-scan PREFIX_COLL and count keys that do not parse as a collection id.
+// The scan only runs when collections_had_errors is set.
+//
+// @param errors  optional counter, incremented once per unrecognized key.
+void BlueStore::_fsck_collections(int64_t* errors)
+{
+  if (!collections_had_errors) {
+    return;
+  }
+  dout(10) << __func__ << dendl;
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL, KeyValueDB::ITERATOR_NOCACHE);
+  for (it->upper_bound(string()); it->valid(); it->next()) {
+    coll_t cid;
+    if (cid.parse(it->key())) {
+      continue;
+    }
+    derr << __func__ << " unrecognized collection " << it->key() << dendl;
+    if (errors) {
+      ++(*errors);
+    }
+  }
+}
+
+// Initialise per_pool_omap from the "per_pool_omap" key under
+// PREFIX_SUPER. The mode defaults to OMAP_BULK when the key is absent or
+// empty; a present value must match one of the known modes (asserted).
+// Finishes by calling _check_no_per_pg_or_pool_omap_alert().
+void BlueStore::_set_per_pool_omap()
+{
+  per_pool_omap = OMAP_BULK;
+  bufferlist bl;
+  db->get(PREFIX_SUPER, "per_pool_omap", &bl);
+  if (!bl.length()) {
+    dout(10) << __func__ << " per_pool_omap not present" << dendl;
+  } else {
+    auto s = bl.to_str();
+    if (s == stringify(OMAP_PER_POOL)) {
+      per_pool_omap = OMAP_PER_POOL;
+    } else if (s == stringify(OMAP_PER_PG)) {
+      per_pool_omap = OMAP_PER_PG;
+    } else {
+      // the only remaining legal value is the default chosen above
+      ceph_assert(s == stringify(OMAP_BULK));
+    }
+    dout(10) << __func__ << " per_pool_omap = " << per_pool_omap << dendl;
+  }
+  _check_no_per_pg_or_pool_omap_alert();
+}
+
+// Load statfs counters from the DB into vstatfs / osd_pools.
+//
+// If the global statfs key can be read, per-pool collection is turned off
+// and the global record is decoded (a record shorter than vstatfs.values
+// is treated as corrupt and leaves the counters empty). Otherwise
+// per-pool collection is turned on and every record under PREFIX_STAT is
+// decoded into osd_pools and summed into vstatfs; records that fail to
+// decode are logged and skipped.
+void BlueStore::_open_statfs()
+{
+  osd_pools.clear();
+  vstatfs.reset();
+
+  bufferlist global_bl;
+  const int got = db->get(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, &global_bl);
+  per_pool_stat_collection = (got < 0);
+
+  if (per_pool_stat_collection) {
+    dout(10) << __func__ << " per-pool statfs is enabled" << dendl;
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_STAT, KeyValueDB::ITERATOR_NOCACHE);
+    for (it->upper_bound(string()); it->valid(); it->next()) {
+      uint64_t pool_id;
+      int r = get_key_pool_stat(it->key(), &pool_id);
+      ceph_assert(r == 0);
+
+      bufferlist pool_bl = it->value();
+      auto p = pool_bl.cbegin();
+      // creates the per-pool entry if it does not exist yet
+      auto& st = osd_pools[pool_id];
+      try {
+        st.decode(p);
+        vstatfs += st;
+
+        dout(10) << __func__ << " pool " << std::hex << pool_id
+                 << " statfs(hex) " << st
+                 << std::dec << dendl;
+      } catch (const ceph::buffer::error&) {
+        derr << __func__ << " failed to decode pool stats, key:"
+             << pretty_binary_string(it->key()) << dendl;
+      }
+    }
+  } else {
+    if (size_t(global_bl.length()) < sizeof(vstatfs.values)) {
+      dout(10) << __func__ << " store_statfs is corrupt, using empty" << dendl;
+    } else {
+      auto it = global_bl.cbegin();
+      vstatfs.decode(it);
+      dout(10) << __func__ << " store_statfs is found" << dendl;
+    }
+    _check_legacy_statfs_alert();
+  }
+  dout(10) << __func__ << " statfs " << std::hex
+           << vstatfs  << std::dec << dendl;
+
+}
+
+// Set up one block device entry (`name`) inside the store directory.
+//
+// @param name    entry name relative to path_fd (e.g. "block").
+// @param epath   if non-empty, `name` is created as a symlink to it. When
+//                it starts with SPDK_PREFIX, a file named `epath` is
+//                additionally opened relative to path_fd and the text
+//                after the prefix (the NVMe transport id) is written to it.
+// @param size    if non-zero, `name` is opened and, when it is an empty
+//                regular file, extended to `size` bytes (and preallocated
+//                if bluestore_block_preallocate_file is set).
+// @param create  adds O_CREAT to the open flags.
+// @return 0 on success or when `name` does not exist; negative errno
+//         otherwise.
+int BlueStore::_setup_block_symlink_or_file(
+  string name,
+  string epath,
+  uint64_t size,
+  bool create)
+{
+  dout(20) << __func__ << " name " << name << " path " << epath
+           << " size " << size << " create=" << (int)create << dendl;
+  const int flags = O_RDWR | O_CLOEXEC | (create ? O_CREAT : 0);
+
+  if (!epath.empty()) {
+    if (::symlinkat(epath.c_str(), path_fd, name.c_str()) < 0) {
+      const int err = -errno;
+      derr << __func__ << " failed to create " << name << " symlink to "
+           << epath << ": " << cpp_strerror(err) << dendl;
+      return err;
+    }
+
+    const size_t prefix_len = strlen(SPDK_PREFIX);
+    if (epath.compare(0, prefix_len, SPDK_PREFIX) == 0) {
+      int fd = ::openat(path_fd, epath.c_str(), flags, 0644);
+      if (fd < 0) {
+        const int err = -errno;
+        derr << __func__ << " failed to open " << epath << " file: "
+             << cpp_strerror(err) << dendl;
+        return err;
+      }
+      // write the Transport ID of the NVMe device
+      // a transport id for PCIe looks like: "trtype:PCIe traddr:0000:02:00.0"
+      // where "0000:02:00.0" is the selector of a PCI device, see
+      // the first column of "lspci -mm -n -D"
+      // a transport id for tcp looks like: "trtype:TCP adrfam:IPv4 traddr:172.31.89.152 trsvcid:4420"
+      string trid = epath.substr(prefix_len);
+      int r = ::write(fd, trid.c_str(), trid.size());
+      ceph_assert(r == static_cast<int>(trid.size()));
+      dout(1) << __func__ << " created " << name << " symlink to "
+              << epath << dendl;
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+    }
+  }
+
+  if (!size) {
+    return 0;
+  }
+
+  int fd = ::openat(path_fd, name.c_str(), flags, 0644);
+  if (fd < 0) {
+    const int err = -errno;
+    // a missing entry is not an error here
+    if (err != -ENOENT) {
+      derr << __func__ << " failed to open " << name << " file: "
+           << cpp_strerror(err) << dendl;
+      return err;
+    }
+    return 0;
+  }
+
+  // block file is present
+  struct stat st;
+  int rc = ::fstat(fd, &st);
+  const bool empty_regular_file =
+    rc == 0 && S_ISREG(st.st_mode) && st.st_size == 0;
+  if (empty_regular_file) {
+    rc = ::ftruncate(fd, size);
+    if (rc < 0) {
+      rc = -errno;
+      derr << __func__ << " failed to resize " << name << " file to "
+           << size << ": " << cpp_strerror(rc) << dendl;
+      VOID_TEMP_FAILURE_RETRY(::close(fd));
+      return rc;
+    }
+
+    if (cct->_conf->bluestore_block_preallocate_file) {
+      // a positive result is an error number, hence the negated return
+      rc = ::ceph_posix_fallocate(fd, 0, size);
+      if (rc > 0) {
+        derr << __func__ << " failed to prefallocate " << name << " file to "
+          << size << ": " << cpp_strerror(rc) << dendl;
+        VOID_TEMP_FAILURE_RETRY(::close(fd));
+        return -rc;
+      }
+    }
+    dout(1) << __func__ << " resized " << name << " file to "
+            << byte_u_t(size) << dendl;
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return 0;
+}
+
+// Create a new BlueStore instance under `path`.
+//
+// The call is idempotent: if the "mkfs_done" meta already exists nothing
+// is created (only the optional fsck runs). Otherwise it
+//  1. records/validates the "type" meta,
+//  2. opens and locks the fsid file and settles on an fsid,
+//  3. sets up the block / block.wal / block.db entries,
+//  4. opens the main device, picks freelist type and min_alloc_size,
+//  5. creates the allocator and the DB, and writes the initial
+//     superblock keys in one synchronous transaction,
+//  6. writes the "kv_backend" and "bluefs" metas and the fsid,
+//  7. closes everything again, optionally runs fsck, and finally writes
+//     "mkfs_done".
+//
+// @return 0 on success, negative error code otherwise.
+int BlueStore::mkfs()
+{
+  dout(1) << __func__ << " path " << path << dendl;
+  int r;
+  uuid_d old_fsid;
+  uint64_t reserved;
+  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+    derr << __func__ << " osd_max_object_size "
+         << cct->_conf->osd_max_object_size << " > bluestore max "
+         << OBJECT_MAX_SIZE << dendl;
+    return -EINVAL;
+  }
+
+  {
+    string done;
+    r = read_meta("mkfs_done", &done);
+    if (r == 0) {
+      dout(1) << __func__ << " already created" << dendl;
+      if (cct->_conf->bluestore_fsck_on_mkfs) {
+        r = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+        if (r < 0) {
+          derr << __func__ << " fsck found fatal error: " << cpp_strerror(r)
+               << dendl;
+          return r;
+        }
+        if (r > 0) {
+          derr << __func__ << " fsck found " << r << " errors" << dendl;
+          r = -EIO;
+        }
+      }
+      return r; // idempotent
+    }
+  }
+
+  {
+    string type;
+    r = read_meta("type", &type);
+    if (r == 0) {
+      if (type != "bluestore") {
+        derr << __func__ << " expected bluestore, but type is " << type << dendl;
+        return -EIO;
+      }
+    } else {
+      r = write_meta("type", "bluestore");
+      if (r < 0)
+        return r;
+    }
+  }
+
+  r = _open_path();
+  if (r < 0)
+    return r;
+
+  r = _open_fsid(true);
+  if (r < 0)
+    goto out_path_fd;
+
+  r = _lock_fsid();
+  if (r < 0)
+    goto out_close_fsid;
+
+  r = _read_fsid(&old_fsid);
+  if (r < 0 || old_fsid.is_zero()) {
+    // no usable on-disk fsid: take the provided one or generate one
+    if (fsid.is_zero()) {
+      fsid.generate_random();
+      dout(1) << __func__ << " generated fsid " << fsid << dendl;
+    } else {
+      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+    }
+    // we'll write it later.
+  } else {
+    if (!fsid.is_zero() && fsid != old_fsid) {
+      derr << __func__ << " on-disk fsid " << old_fsid
+           << " != provided " << fsid << dendl;
+      r = -EINVAL;
+      goto out_close_fsid;
+    }
+    fsid = old_fsid;
+  }
+
+  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
+                                   cct->_conf->bluestore_block_size,
+                                   cct->_conf->bluestore_block_create);
+  if (r < 0)
+    goto out_close_fsid;
+  if (cct->_conf->bluestore_bluefs) {
+    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
+        cct->_conf->bluestore_block_wal_size,
+        cct->_conf->bluestore_block_wal_create);
+    if (r < 0)
+      goto out_close_fsid;
+    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
+        cct->_conf->bluestore_block_db_size,
+        cct->_conf->bluestore_block_db_create);
+    if (r < 0)
+      goto out_close_fsid;
+  }
+
+  r = _open_bdev(true);
+  if (r < 0)
+    goto out_close_fsid;
+
+  // choose freelist manager
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    freelist_type = "zoned";
+    zone_size = bdev->get_zone_size();
+    first_sequential_zone = bdev->get_conventional_region_size() / zone_size;
+    bdev->reset_all_zones();
+  } else
+#endif
+  {
+    freelist_type = "bitmap";
+  }
+  dout(10) << " freelist_type " << freelist_type << dendl;
+
+  // choose min_alloc_size
+  // precedence: optimal_io_size (if enabled and known), then the explicit
+  // bluestore_min_alloc_size, then the hdd/ssd specific default
+  dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
+          << " block_size: 0x" << block_size << std::dec << dendl;
+  if ((cct->_conf->bluestore_use_optimal_io_size_for_min_alloc_size) && (optimal_io_size != 0)) {
+    dout(5) << __func__ << " optimal_io_size 0x" << std::hex << optimal_io_size
+            << " for min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+    min_alloc_size = optimal_io_size;
+  }
+  else if (cct->_conf->bluestore_min_alloc_size) {
+    min_alloc_size = cct->_conf->bluestore_min_alloc_size;
+  } else {
+    ceph_assert(bdev);
+    if (_use_rotational_settings()) {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_hdd;
+    } else {
+      min_alloc_size = cct->_conf->bluestore_min_alloc_size_ssd;
+    }
+  }
+  _validate_bdev();
+
+  // make sure min_alloc_size is power of 2 aligned.
+  if (!std::has_single_bit(min_alloc_size)) {
+    derr << __func__ << " min_alloc_size 0x"
+         << std::hex << min_alloc_size << std::dec
+         << " is not power of 2 aligned!"
+         << dendl;
+    r = -EINVAL;
+    goto out_close_bdev;
+  }
+
+  // make sure min_alloc_size is >= and aligned with block size
+  if (min_alloc_size % block_size != 0) {
+    derr << __func__ << " min_alloc_size 0x"
+         << std::hex << min_alloc_size
+         << " is less or not aligned with block_size: 0x"
+         << block_size << std::dec <<  dendl;
+    r = -EINVAL;
+    goto out_close_bdev;
+  }
+
+  r = _create_alloc();
+  if (r < 0) {
+    goto out_close_bdev;
+  }
+
+  // everything past the reserved area, rounded down to min_alloc_size,
+  // starts out free
+  reserved = _get_ondisk_reserved();
+  alloc->init_add_free(reserved,
+    p2align(bdev->get_size(), min_alloc_size) - reserved);
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr() && alloc != shared_alloc.a) {
+    shared_alloc.a->init_add_free(reserved,
+                                  p2align(bdev->get_conventional_region_size(),
+                                          min_alloc_size) - reserved);
+  }
+#endif
+
+  r = _open_db(true);
+  if (r < 0)
+    goto out_close_alloc;
+
+  {
+    KeyValueDB::Transaction t = db->get_transaction();
+    r = _open_fm(t, false, true);
+    if (r < 0)
+      goto out_close_db;
+    {
+      bufferlist bl;
+      encode((uint64_t)0, bl);
+      t->set(PREFIX_SUPER, "nid_max", bl);
+      t->set(PREFIX_SUPER, "blobid_max", bl);
+    }
+
+    {
+      bufferlist bl;
+      encode((uint64_t)min_alloc_size, bl);
+      t->set(PREFIX_SUPER, "min_alloc_size", bl);
+    }
+    {
+      bufferlist bl;
+      if (cct->_conf.get_val<bool>("bluestore_debug_legacy_omap")) {
+        bl.append(stringify(OMAP_BULK));
+      } else {
+        bl.append(stringify(OMAP_PER_PG));
+      }
+      t->set(PREFIX_SUPER, "per_pool_omap", bl);
+    }
+
+#ifdef HAVE_LIBZBD
+    if (bdev->is_smr()) {
+      {
+        bufferlist bl;
+        encode((uint64_t)zone_size, bl);
+        t->set(PREFIX_SUPER, "zone_size", bl);
+      }
+      {
+        bufferlist bl;
+        encode((uint64_t)first_sequential_zone, bl);
+        t->set(PREFIX_SUPER, "first_sequential_zone", bl);
+      }
+    }
+#endif
+
+    ondisk_format = latest_ondisk_format;
+    _prepare_ondisk_format_super(t);
+    // Do not carry on (and eventually write "mkfs_done") if the initial
+    // superblock keys could not be committed.
+    r = db->submit_transaction_sync(t);
+    if (r < 0) {
+      derr << __func__ << " failed to commit initial metadata: "
+           << cpp_strerror(r) << dendl;
+      goto out_close_fm;
+    }
+  }
+
+  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
+  if (r < 0)
+    goto out_close_fm;
+
+  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));
+  if (r < 0)
+    goto out_close_fm;
+
+  if (fsid != old_fsid) {
+    r = _write_fsid();
+    if (r < 0) {
+      derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+      goto out_close_fm;
+    }
+  }
+
+  // The success path falls through the cleanup labels as well: mkfs
+  // always leaves the store closed.
+ out_close_fm:
+  _close_fm();
+ out_close_db:
+  _close_db();
+ out_close_alloc:
+  _close_alloc();
+ out_close_bdev:
+  _close_bdev();
+ out_close_fsid:
+  _close_fsid();
+ out_path_fd:
+  _close_path();
+
+  if (r == 0 &&
+      cct->_conf->bluestore_fsck_on_mkfs) {
+    int rc = fsck(cct->_conf->bluestore_fsck_on_mkfs_deep);
+    if (rc < 0)
+      return rc;
+    if (rc > 0) {
+      derr << __func__ << " fsck found " << rc << " errors" << dendl;
+      r = -EIO;
+    }
+  }
+
+  if (r == 0) {
+    // indicate success by writing the 'mkfs_done' file
+    r = write_meta("mkfs_done", "yes");
+  }
+
+  if (r < 0) {
+    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+  } else {
+    dout(0) << __func__ << " success" << dendl;
+  }
+  return r;
+}
+
+// Attach a new dedicated WAL or DB volume to an existing store.
+//
+// id must be BlueFS::BDEV_NEWWAL or BlueFS::BDEV_NEWDB (asserted) and
+// dev_path names the device/file backing the new volume.
+//
+// Returns 0 on success, -EIO when bluefs is disabled in the configuration,
+// or the negative error reported while opening the store.  A failure in any
+// of the individual setup steps trips an assertion instead of being returned.
+int BlueStore::add_new_bluefs_device(int id, const string& dev_path)
+{
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  dout(5) << __func__ << "::NCB::calling open_db_and_around(read-only)" << dendl;
+  int r = _open_db_and_around(true);
+  if (r < 0) {
+    return r;
+  }
+
+  if (id == BlueFS::BDEV_NEWWAL) {
+    const string wal_path = path + "/block.wal";
+
+    // create the block.wal entry in the store directory
+    r = _setup_block_symlink_or_file(
+      "block.wal",
+      dev_path,
+      cct->_conf->bluestore_block_wal_size,
+      true);
+    ceph_assert(r == 0);
+
+    // hand the volume over to BlueFS
+    r = bluefs->add_block_device(
+      BlueFS::BDEV_NEWWAL,
+      wal_path,
+      cct->_conf->bdev_enable_discard,
+      BDEV_LABEL_BLOCK_SIZE);
+    ceph_assert(r == 0);
+
+    // label it when the device type supports labels
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+        wal_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+        true);
+      ceph_assert(r == 0);
+    }
+
+    bluefs_layout.dedicated_wal = true;
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    const string db_path = path + "/block.db";
+
+    // create the block.db entry in the store directory
+    r = _setup_block_symlink_or_file(
+      "block.db",
+      dev_path,
+      cct->_conf->bluestore_block_db_size,
+      true);
+    ceph_assert(r == 0);
+
+    // hand the volume over to BlueFS
+    r = bluefs->add_block_device(
+      BlueFS::BDEV_NEWDB,
+      db_path,
+      cct->_conf->bdev_enable_discard,
+      SUPER_RESERVED);
+    ceph_assert(r == 0);
+
+    // label it when the device type supports labels
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+        db_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+        true);
+      ceph_assert(r == 0);
+    }
+
+    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+    bluefs_layout.dedicated_db = true;
+  }
+
+  // remount BlueFS now that the extra device has been registered
+  bluefs->umount();
+  bluefs->mount();
+
+  r = bluefs->prepare_new_device(id, bluefs_layout);
+  ceph_assert(r == 0);
+
+  // NOTE(review): the failure arm cannot be reached after the assertion above
+  if (r >= 0) {
+    dout(0) << __func__ << " success" << dendl;
+  } else {
+    derr << __func__ << " failed, " << cpp_strerror(r) << dendl;
+  }
+
+  _close_db_and_around();
+  return r;
+}
+
+// Move BlueFS data from the volumes listed in devs_source onto an already
+// attached volume.
+//
+// id is the target and must be BlueFS::BDEV_SLOW or BlueFS::BDEV_DB
+// (asserted).
+//
+// Returns 0 on success, -EIO when bluefs is disabled in the configuration,
+// -ENOSPC when the target's free space is smaller than the space used on the
+// sources, or the negative error from opening the store / from BlueFS
+// migration.  The store is closed again on every exit path after the open.
+int BlueStore::migrate_to_existing_bluefs_device(const set<int>& devs_source,
+  int id)
+{
+  dout(10) << __func__ << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_SLOW || id == BlueFS::BDEV_DB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  int r = _open_db_and_around(true);
+  if (r < 0) {
+    return r;
+  }
+  auto close_db = make_scope_guard([&] {
+    _close_db_and_around();
+  });
+
+  const bool moving_db = devs_source.count(BlueFS::BDEV_DB) != 0;
+  const bool moving_wal = devs_source.count(BlueFS::BDEV_WAL) != 0;
+
+  // everything currently used on the sources has to fit on the target
+  uint64_t bytes_needed = 0;
+  for (auto src : devs_source) {
+    bytes_needed += bluefs->get_used(src);
+  }
+  const uint64_t bytes_available = bluefs->get_free(id);
+  if (bytes_available < bytes_needed) {
+    derr << __func__
+         << " can't migrate, free space at target: " << bytes_available
+         << " is less than required space: " << bytes_needed
+         << dendl;
+    return -ENOSPC;
+  }
+
+  // describe the layout as it will look once the sources are gone
+  if (moving_db) {
+    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+    bluefs_layout.dedicated_db = false;
+  }
+  if (moving_wal) {
+    bluefs_layout.dedicated_wal = false;
+  }
+
+  r = bluefs->device_migrate_to_existing(cct, devs_source, id, bluefs_layout);
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // drop the directory entries of the volumes that were migrated away
+  if (moving_db) {
+    const string db_link = path + "/block.db";
+    r = unlink(db_link.c_str());
+    ceph_assert(r == 0);
+  }
+  if (moving_wal) {
+    const string wal_link = path + "/block.wal";
+    r = unlink(wal_link.c_str());
+    ceph_assert(r == 0);
+  }
+  return r;
+}
+
+// Move BlueFS data from the volumes listed in devs_source onto a volume that
+// is not yet part of the store.
+//
+// id must be BlueFS::BDEV_NEWWAL or BlueFS::BDEV_NEWDB (asserted); dev_path
+// names the device/file that backs the new volume.
+//
+// Returns 0 on success, -EIO when bluefs is disabled in the configuration,
+// or the negative error from opening the store / from BlueFS migration.
+// Failures while attaching or labelling the new device, or while fixing up
+// the directory entries afterwards, trip an assertion.
+int BlueStore::migrate_to_new_bluefs_device(const set<int>& devs_source,
+  int id,
+  const string& dev_path)
+{
+  dout(10) << __func__ << " path " << dev_path << " id:" << id << dendl;
+  ceph_assert(path_fd < 0);
+  ceph_assert(id == BlueFS::BDEV_NEWWAL || id == BlueFS::BDEV_NEWDB);
+
+  if (!cct->_conf->bluestore_bluefs) {
+    derr << __func__ << " bluefs isn't configured, can't add new device " << dendl;
+    return -EIO;
+  }
+
+  int r = _open_db_and_around(true);
+  if (r < 0) {
+    return r;
+  }
+  auto close_db = make_scope_guard([&] {
+    _close_db_and_around();
+  });
+
+  // directory entries of source volumes to remove after a successful move;
+  // an empty string means "nothing to remove"
+  string stale_db_link;
+  string stale_wal_link;
+  if (devs_source.count(BlueFS::BDEV_DB) &&
+      bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+    stale_db_link = path + "/block.db";
+    bluefs_layout.shared_bdev = BlueFS::BDEV_DB;
+    bluefs_layout.dedicated_db = false;
+  }
+  if (devs_source.count(BlueFS::BDEV_WAL)) {
+    stale_wal_link = path + "/block.wal";
+    bluefs_layout.dedicated_wal = false;
+  }
+
+  // name and size of the directory entry to create for the new volume
+  string target_name;
+  size_t target_size = 0;
+  if (id == BlueFS::BDEV_NEWWAL) {
+    target_name = "block.wal";
+    target_size = cct->_conf->bluestore_block_wal_size;
+    bluefs_layout.dedicated_wal = true;
+
+    r = bluefs->add_block_device(
+      BlueFS::BDEV_NEWWAL,
+      dev_path,
+      cct->_conf->bdev_enable_discard,
+      BDEV_LABEL_BLOCK_SIZE);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWWAL)) {
+      r = _check_or_set_bdev_label(
+        dev_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWWAL),
+        "bluefs wal",
+        true);
+      ceph_assert(r == 0);
+    }
+  } else if (id == BlueFS::BDEV_NEWDB) {
+    target_name = "block.db";
+    target_size = cct->_conf->bluestore_block_db_size;
+    bluefs_layout.shared_bdev = BlueFS::BDEV_SLOW;
+    bluefs_layout.dedicated_db = true;
+
+    r = bluefs->add_block_device(
+      BlueFS::BDEV_NEWDB,
+      dev_path,
+      cct->_conf->bdev_enable_discard,
+      SUPER_RESERVED);
+    ceph_assert(r == 0);
+
+    if (bluefs->bdev_support_label(BlueFS::BDEV_NEWDB)) {
+      r = _check_or_set_bdev_label(
+        dev_path,
+        bluefs->get_block_device_size(BlueFS::BDEV_NEWDB),
+        "bluefs db",
+        true);
+      ceph_assert(r == 0);
+    }
+  }
+
+  // remount BlueFS now that the extra device has been registered
+  bluefs->umount();
+  bluefs->mount();
+
+  r = bluefs->device_migrate_to_new(cct, devs_source, id, bluefs_layout);
+  if (r < 0) {
+    derr << __func__ << " failed during BlueFS migration, " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  // the data has moved: remove the old entries, then publish the new one
+  if (!stale_db_link.empty()) {
+    r = unlink(stale_db_link.c_str());
+    ceph_assert(r == 0);
+  }
+  if (!stale_wal_link.empty()) {
+    r = unlink(stale_wal_link.c_str());
+    ceph_assert(r == 0);
+  }
+  r = _setup_block_symlink_or_file(target_name, dev_path, target_size, true);
+  ceph_assert(r == 0);
+  dout(0) << __func__ << " success" << dendl;
+
+  return r;
+}
+
+// Map a BlueFS device id to the corresponding entry inside the store
+// directory.  Returns an empty string for ids at or above BlueFS::MAX_BDEV
+// and for ids other than BDEV_WAL, BDEV_DB and BDEV_SLOW.
+string BlueStore::get_device_path(unsigned id)
+{
+  if (id >= BlueFS::MAX_BDEV) {
+    return string();
+  }
+  if (id == BlueFS::BDEV_WAL) {
+    return path + "/block.wal";
+  }
+  if (id == BlueFS::BDEV_SLOW) {
+    return path + "/block";
+  }
+  if (id == BlueFS::BDEV_DB) {
+    // when DB is the shared device it is reached through the "block" entry
+    const bool db_is_shared = (id == bluefs_layout.shared_bdev);
+    return path + (db_is_shared ? "/block" : "/block.db");
+  }
+  return string();
+}
+
+// Rewrite the size field of the bdev label stored at `path`.
+// Reads the current label, replaces label.size with `size` and writes the
+// label back.  Returns the (negative) result of the failing read or write,
+// otherwise the result of the write.
+int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
+{
+  bluestore_bdev_label_t label;
+
+  int r = _read_bdev_label(cct, path, &label);
+  if (r < 0) {
+    derr << "unable to read label for " << path << ": "
+         << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  label.size = size;
+  r = _write_bdev_label(cct, path, label);
+  if (r < 0) {
+    derr << "unable to write label for " << path << ": "
+         << cpp_strerror(r) << dendl;
+  }
+  return r;
+}
+
+// Propagate a grown underlying device into the store's metadata.
+//
+// For the dedicated WAL/DB volumes the size recorded in the bdev label is
+// refreshed.  For the main device, when the block device reports a larger
+// size than the freelist manager, the freelist metadata and the bdev label
+// are updated and the store is mounted and unmounted once to sync the change.
+// Progress is reported on `out`.
+//
+// Opening the store and the sync mount are asserted to succeed; the value
+// returned is the result of the last of those operations.
+int BlueStore::expand_devices(ostream& out)
+{
+  int r = _open_db_and_around(true);
+  ceph_assert(r == 0);
+  bluefs->dump_block_extents(out);
+  out << "Expanding DB/WAL..." << std::endl;
+  for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
+    if (devid == bluefs_layout.shared_bdev ) {
+      // the shared (main) device is handled below
+      continue;
+    }
+    uint64_t size = bluefs->get_block_device_size(devid);
+    if (size == 0) {
+      // no bdev
+      continue;
+    }
+
+    // the value follows a "0x" prefix, so it has to be printed in hex
+    out << devid
+        << " : expanding " << " to 0x" << std::hex << size << std::dec
+        << std::endl;
+    // c_str() never yields a null pointer; an unknown device id is signalled
+    // by get_device_path() returning an empty string
+    const string dev_path = get_device_path(devid);
+    if (dev_path.empty()) {
+      derr << devid
+           << ": can't find device path " << dendl;
+      continue;
+    }
+    if (bluefs->bdev_support_label(devid)) {
+      if (_set_bdev_label_size(dev_path, size) >= 0) {
+        out << devid
+          << " : size label updated to " << size
+          << std::endl;
+      }
+    }
+  }
+  uint64_t size0 = fm->get_size();
+  uint64_t size = bdev->get_size();
+  if (size0 < size) {
+    out << bluefs_layout.shared_bdev
+      << " : expanding " << " from 0x" << std::hex
+      << size0 << " to 0x" << size << std::dec << std::endl;
+    _write_out_fm_meta(size);
+    if (bdev->supported_bdev_label()) {
+      // the label lives on the main block device, not on the store
+      // directory that the `path` member names
+      const string main_dev_path = path + "/block";
+      if (_set_bdev_label_size(main_dev_path, size) >= 0) {
+        out << bluefs_layout.shared_bdev
+          << " : size label updated to " << size
+          << std::endl;
+      }
+    }
+
+    if (fm && fm->is_null_manager()) {
+      // we grow the allocation range, must reflect it in the allocation file
+      alloc->init_add_free(size0, size - size0);
+      need_to_destage_allocation_file = true;
+    }
+    _close_db_and_around();
+
+    // mount in read/write to sync expansion changes
+    r = _mount();
+    ceph_assert(r == 0);
+    umount();
+  } else {
+    _close_db_and_around();
+  }
+  return r;
+}
+
+// Open the store, let BlueFS dump its block extents to `out`, and close the
+// store again.  The open is asserted to succeed, so the value returned is
+// that (zero) result.
+int BlueStore::dump_bluefs_sizes(ostream& out)
+{
+  const int rc = _open_db_and_around(true);
+  ceph_assert(rc == 0);
+
+  bluefs->dump_block_extents(out);
+
+  _close_db_and_around();
+  return rc;
+}
+
+// Grow the onode and buffer cache shard arrays to `num` entries, creating a
+// cache shard of the configured bluestore_cache_type for every new slot.
+// Shrinking is not supported: `num` below either current size is asserted.
+void BlueStore::set_cache_shards(unsigned num)
+{
+  dout(10) << __func__ << " " << num << dendl;
+
+  const size_t prev_onode_shards = onode_cache_shards.size();
+  const size_t prev_buffer_shards = buffer_cache_shards.size();
+  ceph_assert(num >= prev_onode_shards && num >= prev_buffer_shards);
+
+  onode_cache_shards.resize(num);
+  buffer_cache_shards.resize(num);
+
+  // only the freshly added slots need a shard object
+  for (size_t idx = prev_onode_shards; idx < num; ++idx) {
+    onode_cache_shards[idx] = OnodeCacheShard::create(
+      cct, cct->_conf->bluestore_cache_type, logger);
+  }
+  for (size_t idx = prev_buffer_shards; idx < num; ++idx) {
+    buffer_cache_shards[idx] = BufferCacheShard::create(
+      cct, cct->_conf->bluestore_cache_type, logger);
+  }
+}
+
+//---------------------------------------------
+// True when a freelist manager exists and reports itself as a null manager.
+bool BlueStore::has_null_manager() const
+{
+  if (!fm) {
+    return false;
+  }
+  return fm->is_null_manager();
+}
+
+// Bring the store into the mounted state.
+//
+// Steps, in order: optional fsck (bluestore_fsck_on_mount), sanity check of
+// osd_max_object_size, opening the DB and surrounding components, super
+// upgrade, opening collections, logger reload, starting the kv machinery,
+// deferred replay, starting the zoned cleaner on SMR devices (HAVE_LIBZBD
+// builds only), mempool thread init and an optional quick-fix fsck.
+//
+// Returns 0 on success.  Returns -EIO if the initial fsck reports errors,
+// -EINVAL if osd_max_object_size exceeds OBJECT_MAX_SIZE, or the negative
+// result of whichever step failed.  Each scope guard below undoes its step
+// when the function leaves with `mounted` still false.
+int BlueStore::_mount()
+{
+  dout(5) << __func__ << "NCB:: path " << path << dendl;
+
+  _kv_only = false;
+  if (cct->_conf->bluestore_fsck_on_mount) {
+    dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
+    int rc = fsck(cct->_conf->bluestore_fsck_on_mount_deep);
+    if (rc < 0)
+      return rc;
+    // a positive result is the number of errors fsck found
+    if (rc > 0) {
+      derr << __func__ << " fsck found " << rc << " errors" << dendl;
+      return -EIO;
+    }
+  }
+
+  if (cct->_conf->osd_max_object_size > OBJECT_MAX_SIZE) {
+    derr << __func__ << " osd_max_object_size "
+	 << cct->_conf->osd_max_object_size << " > bluestore max "
+	 << OBJECT_MAX_SIZE << dendl;
+    return -EINVAL;
+  }
+
+  dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
+  int r = _open_db_and_around(false);
+  if (r < 0) {
+    return r;
+  }
+  // guards run in reverse order of declaration, so this one fires last
+  auto close_db = make_scope_guard([&] {
+    if (!mounted) {
+      _close_db_and_around();
+    }
+  });
+
+  r = _upgrade_super();
+  if (r < 0) {
+    return r;
+  }
+
+  // The recovery process for allocation-map needs to open collection early
+  r = _open_collections();
+  if (r < 0) {
+    return r;
+  }
+  auto shutdown_cache = make_scope_guard([&] {
+    if (!mounted) {
+      _shutdown_cache();
+    }
+  });
+
+  r = _reload_logger();
+  if (r < 0) {
+    return r;
+  }
+
+  _kv_start();
+  auto stop_kv = make_scope_guard([&] {
+    if (!mounted) {
+      _kv_stop();
+    }
+  });
+
+  r = _deferred_replay();
+  if (r < 0) {
+    return r;
+  }
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    _zoned_cleaner_start();
+  }
+#endif
+
+  mempool_thread.init();
+
+  // quick-fix only when the store still uses a legacy stats or omap layout
+  // and the operator enabled it
+  if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
+    cct->_conf->bluestore_fsck_quick_fix_on_mount == true) {
+
+    auto was_per_pool_omap = per_pool_omap;
+
+    dout(1) << __func__ << " quick-fix on mount" << dendl;
+    // NOTE(review): the result of the quick-fix run is not checked here
+    _fsck_on_open(FSCK_SHALLOW, true);
+
+    //set again as hopefully it has been fixed
+    if (was_per_pool_omap != OMAP_PER_PG) {
+      _set_per_pool_omap();
+    }
+  }
+
+  // from here on the scope guards above become no-ops
+  mounted = true;
+  return 0;
+}
+
+// Take the store out of the mounted (or kv-only) state.
+//
+// Drains the op sequencers, stops the background machinery that a full
+// mount started, closes the DB and surrounding components, and finally runs
+// fsck when bluestore_fsck_on_umount is set and this is not a fast shutdown.
+//
+// Returns 0 on success, the negative fsck result, or -EIO when fsck reports
+// errors.
+int BlueStore::umount()
+{
+  ceph_assert(_kv_only || mounted);
+  _osr_drain_all();
+
+  mounted = false;
+
+  ceph_assert(alloc);
+
+  const bool was_full_mount = !_kv_only;
+  if (was_full_mount) {
+    mempool_thread.shutdown();
+#ifdef HAVE_LIBZBD
+    if (bdev->is_smr()) {
+      dout(20) << __func__ << " stopping zone cleaner thread" << dendl;
+      _zoned_cleaner_stop();
+    }
+#endif
+    dout(20) << __func__ << " stopping kv thread" << dendl;
+    _kv_stop();
+    // skip cache cleanup step on fast shutdown
+    if (likely(!m_fast_shutdown)) {
+      _shutdown_cache();
+    }
+    dout(20) << __func__ << " closing" << dendl;
+  }
+  _close_db_and_around();
+
+  // fsck is disabled on fast-shutdown
+  if (!cct->_conf->bluestore_fsck_on_umount || m_fast_shutdown) {
+    return 0;
+  }
+
+  const int fsck_rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
+  if (fsck_rc < 0) {
+    return fsck_rc;
+  }
+  if (fsck_rc > 0) {
+    derr << __func__ << " fsck found " << fsck_rc << " errors" << dendl;
+    return -EIO;
+  }
+  return 0;
+}
+
+// Open the DB and surrounding components without performing a full mount.
+// The `true` argument is the one other callers in this file log as
+// "read-only".  Returns the result of the open.
+int BlueStore::cold_open()
+{
+  const int rc = _open_db_and_around(true);
+  return rc;
+}
+
+// Counterpart of cold_open(): close the DB and surrounding components.
+// Always reports success.
+int BlueStore::cold_close()
+{
+  _close_db_and_around();
+
+  return 0;
+}
+
+// derr wrapper that limits enormous output to avoid log flooding.
+// For now it is only of use where such output is expected.
+//
+// fsck_derr and fsck_dendl must always be used as a pair:
+//   fsck_derr(err_cnt, threshold) << "message" << fsck_dendl;
+// fsck_derr opens an `if` block that fsck_dendl closes.  The message is
+// emitted only while err_cnt <= threshold; when err_cnt == threshold an
+// extra "more error lines skipped..." line follows it.
+#define fsck_derr(err_cnt, threshold) \
+  if (err_cnt <= threshold) {         \
+    bool need_skip_print = err_cnt == threshold; \
+    derr
+
+// Terminates a statement started with fsck_derr (see above).
+#define fsck_dendl \
+    dendl;          \
+    if (need_skip_print) \
+      derr << "more error lines skipped..." << dendl; \
+  }
+
+// Add the length of every valid extent to expected_statfs.allocated and,
+// for compressed blobs, to expected_statfs.data_compressed_allocated as
+// well.  No checks are performed, so the error count returned is always 0.
+int _fsck_sum_extents(
+  const PExtentVector& extents,
+  bool compressed,
+  store_statfs_t& expected_statfs)
+{
+  for (const auto& ext : extents) {
+    if (ext.is_valid()) {
+      expected_statfs.allocated += ext.length;
+      if (compressed) {
+        expected_statfs.data_compressed_allocated += ext.length;
+      }
+    }
+  }
+  return 0;
+}
+
+// Account for a blob's physical extents and, unless the check is shallow,
+// verify them against the bitmap of blocks already seen.
+//
+// ctx_descr       text identifying the owner, used in log messages
+// extents         physical extents to process; invalid ones are skipped
+// compressed      whether the lengths also count as compressed allocation
+// used_blocks     bitmap of blocks referenced so far; updated in place
+// granularity     block size that one bit of used_blocks stands for
+// repairer        optional; told about every doubly referenced block
+// expected_statfs accumulator for the allocation totals
+// depth           FSCK_SHALLOW restricts the work to the accounting
+//
+// Returns the number of errors found: one per extent that overlaps blocks
+// already marked, plus one per extent reaching past the end of the device.
+int BlueStore::_fsck_check_extents(
+  std::string_view ctx_descr,
+  const PExtentVector& extents,
+  bool compressed,
+  mempool_dynamic_bitset &used_blocks,
+  uint64_t granularity,
+  BlueStoreRepairer* repairer,
+  store_statfs_t& expected_statfs,
+  FSCKDepth depth)
+{
+  dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
+  const bool shallow = (depth == FSCK_SHALLOW);
+  int errors = 0;
+  for (auto e : extents) {
+    if (!e.is_valid()) {
+      continue;
+    }
+
+    expected_statfs.allocated += e.length;
+    if (compressed) {
+      expected_statfs.data_compressed_allocated += e.length;
+    }
+    if (shallow) {
+      continue;
+    }
+
+    // only the first clash within an extent is logged and counted
+    bool reported = false;
+    auto mark_block = [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+      if (!bs.test(pos)) {
+        bs.set(pos);
+        return;
+      }
+      if (repairer) {
+        // NOTE(review): positions are scaled by min_alloc_size here although
+        // the bitmap is walked with `granularity` - confirm they always match
+        repairer->note_misreference(
+          pos * min_alloc_size, min_alloc_size, !reported);
+      }
+      if (!reported) {
+        derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
+             << " or a subset is already allocated (misreferenced)" << dendl;
+        ++errors;
+        reported = true;
+      }
+    };
+    apply_for_bitset_range(
+      e.offset, e.length, granularity, used_blocks, mark_block);
+
+    if (e.end() > bdev->get_size()) {
+      derr << "fsck error:  " << ctx_descr << ", extent " << e
+           << " past end of block device" << dendl;
+      ++errors;
+    }
+  }
+  return errors;
+}
+
+// Compare the statfs values the store currently holds with the values fsck
+// computed, and optionally repair the differences.
+//
+// expected_statfs       store-wide totals computed by fsck; consulted only
+//                       when not repairing and per-pool collection is off
+// expected_pool_statfs  per-pool totals computed by fsck
+// errors                incremented once for every mismatch found
+// warnings              not touched by this function
+// repairer              when non-null, mismatches are corrected in memory
+//                       and the matching DB fixes are handed to it
+void BlueStore::_fsck_check_statfs(
+  const store_statfs_t& expected_statfs,
+  const per_pool_statfs& expected_pool_statfs,
+  int64_t& errors,
+  int64_t& warnings,
+  BlueStoreRepairer* repairer)
+{
+  string key;
+  // sum of the per-pool values that were matched (or repaired)
+  store_statfs_t actual_statfs;
+  store_statfs_t s;
+  {
+    // make a copy
+    // (entries are erased from it as they get matched, so whatever remains
+    // afterwards has no counterpart in osd_pools)
+    per_pool_statfs my_expected_pool_statfs(expected_pool_statfs);
+    auto op = osd_pools.begin();
+    while (op != osd_pools.end()) {
+      get_pool_stat_key(op->first, &key);
+      op->second.publish(&s);
+      auto it_expected = my_expected_pool_statfs.find(op->first);
+      if (it_expected == my_expected_pool_statfs.end()) {
+        // advance first: op0 may be erased below
+        auto op0 = op++;
+        if (op0->second.is_empty()) {
+          // It's OK to lack relevant empty statfs record
+          continue;
+        }
+        derr << __func__ << "::fsck error: " << std::hex
+             << "pool " << op0->first << " has got no statfs to match against: "
+             << s
+             << std::dec << dendl;
+        ++errors;
+        if (repairer) {
+          osd_pools.erase(op0);
+          repairer->remove_key(db, PREFIX_STAT, key);
+        }
+      } else {
+        if (!(s == it_expected->second)) {
+          derr << "fsck error: actual " << s
+	       << " != expected " << it_expected->second
+	       << " for pool "
+	       << std::hex << op->first << std::dec << dendl;
+	  ++errors;
+	  if (repairer) {
+	    // repair in-memory in a hope this would be flushed properly on shutdown
+	    s = it_expected->second;
+	    op->second = it_expected->second;
+	    repairer->fix_statfs(db, key, it_expected->second);
+	  }
+	}
+        actual_statfs.add(s);
+        my_expected_pool_statfs.erase(it_expected);
+        ++op;
+      }
+    }
+    // check stats that lack matching entities in osd_pools
+    for (auto &p : my_expected_pool_statfs) {
+      if (p.second.is_zero()) {
+        // It's OK to lack relevant empty statfs record
+        continue;
+      }
+      get_pool_stat_key(p.first, &key);
+      derr << __func__ << "::fsck error: " << std::hex
+           << "pool " << p.first << " has got no actual statfs: "
+           << std::dec << p.second
+           << dendl;
+      ++errors;
+      if (repairer) {
+	osd_pools[p.first] = p.second;
+        repairer->fix_statfs(db, key, p.second);
+        actual_statfs.add(p.second);
+      }
+    }
+  }
+  // process global statfs
+  if (repairer) {
+    if (!per_pool_stat_collection) {
+      // by virtue of running this method, we correct the top-level
+      // error of having global stats
+      repairer->remove_key(db, PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY);
+      per_pool_stat_collection = true;
+    }
+    // the in-memory global value becomes the sum of the per-pool values
+    vstatfs = actual_statfs;
+    dout(20) << __func__ << " setting vstatfs to " << actual_statfs << dendl;
+  } else if (!per_pool_stat_collection) {
+    // check global stats only if fscking (not repairing) w/o per-pool stats
+    vstatfs.publish(&s);
+    if (!(s == expected_statfs)) {
+      derr << "fsck error: actual " << s
+           << " != expected " << expected_statfs << dendl;
+      ++errors;
+    }
+  }
+}
+
+// Rebuild the reference maps of shared blobs whose reference counts did not
+// add up during fsck, and remove shared blob records nothing refers to.
+//
+// repairer       receives the per-record fixes and the final repair count
+// sb_ref_counts  tracker filled by the preceding fsck pass; a non-zero entry
+//                marks a shared blob range with mismatching references
+// sb_info        shared blob records seen by fsck; its stray entries are
+//                removed
+//
+// The object namespace is walked twice: first to find the ids of the broken
+// shared blobs, then to build a new reference map for each of them.  DB
+// updates are submitted synchronously in batches of up to 4096 records.
+void BlueStore::_fsck_repair_shared_blobs(
+  BlueStoreRepairer& repairer,
+  shared_blob_2hash_tracker_t& sb_ref_counts,
+  sb_info_space_efficient_map_t& sb_info)
+{
+  auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
+  dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
+          << sb_ref_mismatches << dendl;
+  if (!sb_ref_mismatches) // not expected to succeed, just in case
+    return;
+
+  // Invoke cb(collection id, object id, shared blob id, blob) once for every
+  // distinct shared blob of every onode stored under PREFIX_OBJ.
+  auto foreach_shared_blob = [&](std::function<
+    void (coll_t,
+          ghobject_t,
+          uint64_t,
+          const bluestore_blob_t&)> cb) {
+      auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+      if (it) {
+        CollectionRef c;
+        spg_t pgid;
+        for (it->lower_bound(string()); it->valid(); it->next()) {
+          dout(30) << __func__ << " key "
+                   << pretty_binary_string(it->key())
+                   << dendl;
+          if (is_extent_shard_key(it->key())) {
+            continue;
+          }
+
+          ghobject_t oid;
+          int r = get_key_object(it->key(), &oid);
+          if (r < 0) {
+            continue;
+          }
+
+          // look the collection up again when the cached one doesn't fit
+          if (!c ||
+              oid.shard_id != pgid.shard ||
+              oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+              !c->contains(oid)) {
+            c = nullptr;
+            for (auto& p : coll_map) {
+              if (p.second->contains(oid)) {
+                c = p.second;
+                break;
+              }
+            }
+            if (!c) {
+              continue;
+            }
+          }
+          dout(20) << __func__
+                   << " inspecting shared blob refs for col:" << c->cid
+                   << " obj:" << oid
+                   << dendl;
+
+          OnodeRef o;
+          o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+          o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+          _dump_onode<30>(cct, *o);
+
+          // report each shared blob of this onode only once
+          mempool::bluestore_fsck::set<BlobRef> passed_sbs;
+          for (auto& e : o->extent_map.extent_map) {
+            auto& b = e.blob->get_blob();
+            if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
+              auto sbid = e.blob->shared_blob->get_sbid();
+              cb(c->cid, oid, sbid, b);
+              passed_sbs.emplace(e.blob);
+            }
+          } // for ... extent_map
+        } // for ... it->valid
+      } //if (it(PREFIX_OBJ))
+    }; //foreach_shared_blob fn declaration
+
+  mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
+
+  // first iteration over objects to identify all the broken sbids
+  foreach_shared_blob( [&](coll_t cid,
+                           ghobject_t oid,
+                           uint64_t sbid,
+                           const bluestore_blob_t& b) {
+    auto it = refs_map.lower_bound(sbid);
+    if(it != refs_map.end() && it->first == sbid) {
+      // already known to be broken
+      return;
+    }
+    for (auto& p : b.get_extents()) {
+      if (p.is_valid() &&
+          !sb_ref_counts.test_all_zero_range(sbid,
+                                             p.offset,
+                                             p.length)) {
+        refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
+        dout(20) << __func__
+                 << " broken shared blob found for col:" << cid
+                 << " obj:" << oid
+                 << " sbid 0x " << std::hex << sbid << std::dec
+                 << dendl;
+        break;
+      }
+    }
+  });
+
+  // second iteration over objects to build new ref map for the broken sbids
+  foreach_shared_blob( [&](coll_t cid,
+                           ghobject_t oid,
+                           uint64_t sbid,
+                           const bluestore_blob_t& b) {
+    auto it = refs_map.find(sbid);
+    if(it == refs_map.end()) {
+      return;
+    }
+    // NOTE(review): only the first valid extent of each blob is referenced
+    // here - confirm that skipping the remaining extents is intended
+    for (auto& p : b.get_extents()) {
+      if (p.is_valid()) {
+        it->second.get(p.offset, p.length);
+        break;
+      }
+    }
+  });
+
+  // update shared blob records
+  auto ref_it = refs_map.begin();
+  while (ref_it != refs_map.end()) {
+    size_t cnt = 0;
+    const size_t max_transactions = 4096;
+    KeyValueDB::Transaction txn = db->get_transaction();
+    for (cnt = 0;
+      cnt < max_transactions && ref_it != refs_map.end();
+      ref_it++) {
+      auto sbid = ref_it->first;
+      dout(20) << __func__ << " repaired shared_blob 0x"
+        << std::hex << sbid << std::dec
+        << ref_it->second << dendl;
+      repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
+      cnt++;
+    }
+    if (cnt) {
+      db->submit_transaction_sync(txn);
+      cnt = 0;
+    }
+  }
+  // remove stray shared blob records
+  size_t cnt = 0;
+  const size_t max_transactions = 4096;
+  KeyValueDB::Transaction txn = db->get_transaction();
+  sb_info.foreach_stray([&](const sb_info_t& sbi) {
+    auto sbid = sbi.get_sbid();
+    dout(20) << __func__ << " removing stray shared_blob 0x"
+      << std::hex << sbid << std::dec
+      << dendl;
+    repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+    cnt++;
+    // submit only once a full batch has accumulated; doing it for every
+    // record would cost one synchronous DB transaction per stray blob
+    if (cnt >= max_transactions) {
+      db->submit_transaction_sync(txn);
+      txn = db->get_transaction();
+      cnt = 0;
+    }
+  });
+  // flush the last, partially filled batch
+  if (cnt > 0) {
+    db->submit_transaction_sync(txn);
+  }
+
+  // amount of repairs to report to be equal to previously
+  // determined error estimation, not the actual number of updated shared blobs
+  repairer.inc_repaired(sb_ref_mismatches);
+}
+
+BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
+  BlueStore::FSCKDepth depth,
+  int64_t pool_id,
+  BlueStore::CollectionRef c,
+  const ghobject_t& oid,
+  const string& key,
+  const bufferlist& value,
+  mempool::bluestore_fsck::list<string>* expecting_shards,
+  map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+  const BlueStore::FSCK_ObjectCtx& ctx)
+{
+  auto& errors = ctx.errors;
+  auto& num_objects = ctx.num_objects;
+  auto& num_extents = ctx.num_extents;
+  auto& num_blobs = ctx.num_blobs;
+  auto& num_sharded_objects = ctx.num_sharded_objects;
+  auto& num_spanning_blobs = ctx.num_spanning_blobs;
+  auto used_blocks = ctx.used_blocks;
+  auto sb_info_lock = ctx.sb_info_lock;
+  auto& sb_info = ctx.sb_info;
+  auto& sb_ref_counts = ctx.sb_ref_counts;
+  auto repairer = ctx.repairer;
+
+  store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
+    &ctx.expected_pool_statfs[pool_id] :
+    &ctx.expected_store_statfs;
+
+  map<uint32_t, uint64_t> zone_first_offsets;  // for zoned/smr devices
+
+  dout(10) << __func__ << "  " << oid << dendl;
+  OnodeRef o;
+  o.reset(Onode::create_decode(c, oid, key, value));
+  ++num_objects;
+
+  num_spanning_blobs += o->extent_map.spanning_blob_map.size();
+
+  o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+  _dump_onode<30>(cct, *o);
+  // shards
+  if (!o->extent_map.shards.empty()) {
+    ++num_sharded_objects;
+    if (depth != FSCK_SHALLOW) {
+      ceph_assert(expecting_shards);
+      for (auto& s : o->extent_map.shards) {
+        dout(20) << __func__ << "    shard " << *s.shard_info << dendl;
+        expecting_shards->push_back(string());
+        get_extent_shard_key(o->key, s.shard_info->offset,
+          &expecting_shards->back());
+        if (s.shard_info->offset >= o->onode.size) {
+          derr << "fsck error: " << oid << " shard 0x" << std::hex
+            << s.shard_info->offset << " past EOF at 0x" << o->onode.size
+            << std::dec << dendl;
+          ++errors;
+        }
+      }
+    }
+  }
+
+  // lextents
+  uint64_t pos = 0;
+  mempool::bluestore_fsck::map<BlobRef,
+    bluestore_blob_use_tracker_t> ref_map;
+  for (auto& l : o->extent_map.extent_map) {
+    dout(20) << __func__ << "    " << l << dendl;
+    if (l.logical_offset < pos) {
+      derr << "fsck error: " << oid << " lextent at 0x"
+        << std::hex << l.logical_offset
+        << " overlaps with the previous, which ends at 0x" << pos
+        << std::dec << dendl;
+      ++errors;
+    }
+    if (depth != FSCK_SHALLOW &&
+      o->extent_map.spans_shard(l.logical_offset, l.length)) {
+      derr << "fsck error: " << oid << " lextent at 0x"
+        << std::hex << l.logical_offset << "~" << l.length
+        << " spans a shard boundary"
+        << std::dec << dendl;
+      ++errors;
+    }
+    pos = l.logical_offset + l.length;
+    res_statfs->data_stored += l.length;
+    ceph_assert(l.blob);
+    const bluestore_blob_t& blob = l.blob->get_blob();
+
+#ifdef HAVE_LIBZBD
+    if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+      for (auto& e : blob.get_extents()) {
+	if (e.is_valid()) {
+	  uint32_t zone = e.offset / zone_size;
+	  uint64_t offset = e.offset % zone_size;
+	  auto p = zone_first_offsets.find(zone);
+	  if (p == zone_first_offsets.end() || p->second > offset) {
+	    // FIXME: use interator for guided insert?
+	    zone_first_offsets[zone] = offset;
+	  }
+	}
+      }
+    }
+#endif
+
+    auto& ref = ref_map[l.blob];
+    if (ref.is_empty()) {
+      uint32_t min_release_size = blob.get_release_size(min_alloc_size);
+      uint32_t l = blob.get_logical_length();
+      ref.init(l, min_release_size);
+    }
+    ref.get(
+      l.blob_offset,
+      l.length);
+    ++num_extents;
+    if (depth != FSCK_SHALLOW &&
+      blob.has_unused()) {
+      ceph_assert(referenced);
+      auto p = referenced->find(l.blob);
+      bluestore_blob_t::unused_t* pu;
+      if (p == referenced->end()) {
+        pu = &(*referenced)[l.blob];
+      }
+      else {
+        pu = &p->second;
+      }
+      uint64_t blob_len = blob.get_logical_length();
+      ceph_assert((blob_len % (sizeof(*pu) * 8)) == 0);
+      ceph_assert(l.blob_offset + l.length <= blob_len);
+      uint64_t chunk_size = blob_len / (sizeof(*pu) * 8);
+      uint64_t start = l.blob_offset / chunk_size;
+      uint64_t end =
+        round_up_to(l.blob_offset + l.length, chunk_size) / chunk_size;
+      for (auto i = start; i < end; ++i) {
+        (*pu) |= (1u << i);
+      }
+    }
+  } //for (auto& l : o->extent_map.extent_map)
+
+  for (auto& i : ref_map) {
+    ++num_blobs;
+    const bluestore_blob_t& blob = i.first->get_blob();
+    bool equal =
+      depth == FSCK_SHALLOW ? true :
+      i.first->get_blob_use_tracker().equal(i.second);
+    if (!equal) {
+      derr << "fsck error: " << oid << " blob " << *i.first
+        << " doesn't match expected ref_map " << i.second << dendl;
+      ++errors;
+    }
+    if (blob.is_compressed()) {
+      res_statfs->data_compressed += blob.get_compressed_payload_length();
+      res_statfs->data_compressed_original +=
+        i.first->get_referenced_bytes();
+    }
+    if (depth != FSCK_SHALLOW && repairer) {
+      for (auto e : blob.get_extents()) {
+	if (!e.is_valid())
+	  continue;
+	repairer->set_space_used(e.offset, e.length, c->cid, oid);
+      }
+    }
+    if (blob.is_shared()) {
+      if (i.first->shared_blob->get_sbid() > blobid_max) {
+        derr << "fsck error: " << oid << " blob " << blob
+          << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
+          << blobid_max << dendl;
+        ++errors;
+      } else if (i.first->shared_blob->get_sbid() == 0) {
+        derr << "fsck error: " << oid << " blob " << blob
+          << " marked as shared but has uninitialized sbid"
+          << dendl;
+        ++errors;
+      }
+      // the below lock is optional and provided in multithreading mode only
+      if (sb_info_lock) {
+        sb_info_lock->lock();
+      }
+      auto sbid = i.first->shared_blob->get_sbid();
+      sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
+      ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
+        sbi.pool_id == oid.hobj.get_logical_pool());
+      sbi.pool_id = oid.hobj.get_logical_pool();
+      bool compressed = blob.is_compressed();
+      for (auto e : blob.get_extents()) {
+        if (e.is_valid()) {
+	  if (compressed) {
+	    ceph_assert(sbi.allocated_chunks <= 0);
+	    sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
+	  } else {
+	    ceph_assert(sbi.allocated_chunks >= 0);
+	    sbi.allocated_chunks += (e.length >> min_alloc_size_order);
+	  }
+	  sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
+        }
+      }
+      if (sb_info_lock) {
+        sb_info_lock->unlock();
+      }
+    } else if (depth != FSCK_SHALLOW) {
+      ceph_assert(used_blocks);
+      string ctx_descr = " oid " + stringify(oid);
+      errors += _fsck_check_extents(ctx_descr,
+	blob.get_extents(),
+        blob.is_compressed(),
+        *used_blocks,
+        fm->get_alloc_size(),
+	repairer,
+        *res_statfs,
+        depth);
+    } else {
+      errors += _fsck_sum_extents(
+        blob.get_extents(),
+        blob.is_compressed(),
+        *res_statfs);
+    }
+  } // for (auto& i : ref_map)
+
+  {
+    auto &sbm = o->extent_map.spanning_blob_map;
+    size_t broken = 0;
+    BlobRef first_broken;
+    for (auto it = sbm.begin(); it != sbm.end();) {
+      auto it1 = it++;
+      if (ref_map.count(it1->second) == 0) {
+        if (!broken) {
+          first_broken = it1->second;
+          ++errors;
+          derr << "fsck error:" << " stray spanning blob found:" << it1->first
+               << dendl;
+        }
+        broken++;
+        if (repairer) {
+          sbm.erase(it1);
+        }
+      }
+    }
+
+#ifdef HAVE_LIBZBD
+    if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+      for (auto& [zone, first_offset] : zone_first_offsets) {
+	auto p = (*ctx.zone_refs)[zone].find(oid);
+	if (p != (*ctx.zone_refs)[zone].end()) {
+	  if (first_offset < p->second) {
+	    dout(20) << " slightly wonky zone ref 0x" << std::hex << zone
+		 << " offset 0x" << p->second
+		 << " but first offset is 0x" << first_offset
+		 << "; this can happen due to clone_range"
+		 << dendl;
+	  } else {
+	    dout(20) << " good zone ref 0x" << std::hex << zone << " offset 0x" << p->second
+		     << " <= first offset 0x" << first_offset
+		     << std::dec << dendl;
+	  }
+	  (*ctx.zone_refs)[zone].erase(p);
+	} else {
+	  derr << "fsck error: " << oid << " references zone 0x" << std::hex << zone
+	       << " but there is no zone ref" << std::dec << dendl;
+	  // FIXME: add repair
+	  ++errors;
+	}
+      }
+    }
+#endif
+
+    if (broken) {
+      derr << "fsck error: " << oid << " - " << broken
+           << " zombie spanning blob(s) found, the first one: "
+           << *first_broken << dendl;
+      if(repairer) {
+        repairer->fix_spanning_blobs(
+	  db,
+	  [&](KeyValueDB::Transaction txn) {
+	    _record_onode(o, txn);
+	  });
+      }
+    }
+  }
+
+  if (o->onode.has_omap()) {
+    _fsck_check_object_omap(depth, o, ctx);
+  }
+
+  return o;
+}
+
+#include "common/WorkQueue.h"
+
+// Thread pool used to run the shallow fsck object check on several threads.
+//
+// The pool overrides ThreadPool::worker() with a simple polling loop and
+// provides FSCKWorkQueue, a fixed-size set of batches that a single producer
+// fills via queue() and the worker threads drain via
+// _void_dequeue()/_void_process().
+class ShallowFSCKThreadPool : public ThreadPool
+{
+public:
+  ShallowFSCKThreadPool(CephContext* cct_, std::string nm, std::string tn, int n) :
+    ThreadPool(cct_, nm, tn, n) {
+  }
+  // Worker loop: visits the registered work queues in round-robin order until
+  // _stop is set. If a queue has nothing to hand out the loop immediately
+  // moves on to the next queue; there is no wait or sleep in this loop.
+  // Note that _void_process_finish() is never called from here.
+  void worker(ThreadPool::WorkThread* wt) override {
+    int next_wq = 0;
+    while (!_stop) {
+      next_wq %= work_queues.size();
+      WorkQueue_ *wq = work_queues[next_wq++];
+
+      void* item = wq->_void_dequeue();
+      if (item) {
+        processing++;
+        TPHandle tp_handle(cct, nullptr, wq->timeout_interval, wq->suicide_interval);
+        wq->_void_process(item, tp_handle);
+        processing--;
+      }
+    }
+  }
+  // Work queue holding objects to be checked by fsck_check_objects_shallow().
+  //
+  // BatchLen is the maximum number of entries a single batch can hold.
+  // Ownership of a batch is arbitrated through Batch::running: whoever
+  // observes 0 as the result of running.fetch_add(1) owns the batch, everybody
+  // else undoes the increment and looks at another batch.
+  template <size_t BatchLen>
+  struct FSCKWorkQueue : public ThreadPool::WorkQueue_
+  {
+    // One queued object: a copy of the arguments later passed to
+    // BlueStore::fsck_check_objects_shallow().
+    struct Entry {
+      int64_t pool_id;
+      BlueStore::CollectionRef c;
+      ghobject_t oid;
+      string key;
+      bufferlist value;
+    };
+    // A group of entries processed as a single work item, together with the
+    // statistics collected while processing it. The statistics are merged into
+    // the caller's context in finalize().
+    struct Batch {
+      // non-zero while the batch is held by the producer or by a worker
+      std::atomic<size_t> running = { 0 };
+      // number of valid elements at the beginning of 'entries'
+      size_t entry_count = 0;
+      std::array<Entry, BatchLen> entries;
+
+      int64_t errors = 0;
+      int64_t warnings = 0;
+      uint64_t num_objects = 0;
+      uint64_t num_extents = 0;
+      uint64_t num_blobs = 0;
+      uint64_t num_sharded_objects = 0;
+      uint64_t num_spanning_blobs = 0;
+      store_statfs_t expected_store_statfs;
+      BlueStore::per_pool_statfs expected_pool_statfs;
+    };
+
+    // number of elements in 'batches'
+    size_t batchCount;
+    BlueStore* store = nullptr;
+
+    // The pointers below are stored as-is and forwarded to every
+    // FSCK_ObjectCtx built in _void_process(); they are not owned here.
+    ceph::mutex* sb_info_lock = nullptr;
+    sb_info_space_efficient_map_t* sb_info = nullptr;
+    shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
+    BlueStoreRepairer* repairer = nullptr;
+
+    // array of batchCount batches, allocated in the constructor and released
+    // in the destructor
+    Batch* batches = nullptr;
+    // producer-side state: the batch queue() is currently filling (or will try
+    // first) and whether that batch is currently held by the producer
+    size_t last_batch_pos = 0;
+    bool batch_acquired = false;
+
+    // Both interval arguments passed to WorkQueue_ are zero.
+    FSCKWorkQueue(std::string n,
+                  size_t _batchCount,
+                  BlueStore* _store,
+                  ceph::mutex* _sb_info_lock,
+                  sb_info_space_efficient_map_t& _sb_info,
+		  shared_blob_2hash_tracker_t& _sb_ref_counts,
+                  BlueStoreRepairer* _repairer) :
+      WorkQueue_(n, ceph::timespan::zero(), ceph::timespan::zero()),
+      batchCount(_batchCount),
+      store(_store),
+      sb_info_lock(_sb_info_lock),
+      sb_info(&_sb_info),
+      sb_ref_counts(&_sb_ref_counts),
+      repairer(_repairer)
+    {
+      batches = new Batch[batchCount];
+    }
+    ~FSCKWorkQueue() {
+      delete[] batches;
+    }
+
+    /// Remove all work items from the queue.
+    void _clear() override {
+      //do nothing
+    }
+    /// Check whether there is anything to do.
+    /// Not expected to be called for this queue: asserts unconditionally.
+    bool _empty() override {
+      ceph_assert(false);
+    }
+
+    /// Get the next work item to process.
+    /// Starts at a random batch and scans every batch at most once. A batch is
+    /// returned only if the caller managed to claim it (running was 0) and it
+    /// holds at least one entry; otherwise the claim attempt is undone.
+    /// Returns nullptr when no batch could be taken.
+    void* _void_dequeue() override {
+      size_t pos = rand() % batchCount;
+      size_t pos0 = pos;
+      do {
+        auto& batch = batches[pos];
+        if (batch.running.fetch_add(1) == 0) {
+          if (batch.entry_count) {
+            // the batch stays claimed until _void_process() releases it
+            return &batch;
+          }
+        }
+        batch.running--;
+        pos++;
+        pos %= batchCount;
+      } while (pos != pos0);
+      return nullptr;
+    }
+    /** @brief Process the work item.
+     * This function will be called several times in parallel
+     * and must therefore be thread-safe.
+     * The item is a Batch claimed by the caller. Every entry is checked in
+     * FSCK_SHALLOW mode with a context that accumulates into the batch's own
+     * counters; afterwards the batch is emptied and released. */
+    void _void_process(void* item, TPHandle& handle) override {
+      Batch* batch = (Batch*)item;
+
+      BlueStore::FSCK_ObjectCtx ctx(
+        batch->errors,
+        batch->warnings,
+        batch->num_objects,
+        batch->num_extents,
+        batch->num_blobs,
+        batch->num_sharded_objects,
+        batch->num_spanning_blobs,
+        nullptr, // used_blocks
+        nullptr, //used_omap_head
+	nullptr,
+        sb_info_lock,
+        *sb_info,
+	*sb_ref_counts,
+        batch->expected_store_statfs,
+        batch->expected_pool_statfs,
+        repairer);
+
+      for (size_t i = 0; i < batch->entry_count; i++) {
+        auto& entry = batch->entries[i];
+
+        store->fsck_check_objects_shallow(
+          BlueStore::FSCK_SHALLOW,
+          entry.pool_id,
+          entry.c,
+          entry.oid,
+          entry.key,
+          entry.value,
+          nullptr, // expecting_shards - this will need a protection if passed
+          nullptr, // referenced
+          ctx);
+      }
+      // mark the batch empty first, then release it
+      batch->entry_count = 0;
+      batch->running--;
+    }
+    /** @brief Synchronously finish processing a work item.
+     * This function is called after _void_process with the global thread pool lock held,
+     * so at most one copy will execute simultaneously for a given thread pool.
+     * It can be used for non-thread-safe finalization.
+     * Not expected to be called for this queue: asserts unconditionally. */
+    void _void_process_finish(void*) override {
+      ceph_assert(false);
+    }
+
+    // Producer side: append one object to the batch currently being filled.
+    //
+    // If no batch is held yet, scans the batches starting at last_batch_pos
+    // for one that can be claimed and still has a free slot. The claimed batch
+    // stays held across calls until it becomes full, at which point it is
+    // released and the position advances to the next batch.
+    // Returns true if the object was stored, false if no batch was available.
+    // NOTE(review): producer state (last_batch_pos, batch_acquired) is plain,
+    // non-atomic data - presumably queue() is called from one thread only.
+    bool queue(
+      int64_t pool_id,
+      BlueStore::CollectionRef c,
+      const ghobject_t& oid,
+      const string& key,
+      const bufferlist& value) {
+      bool res = false;
+      size_t pos0 = last_batch_pos;
+      if (!batch_acquired) {
+        do {
+          auto& batch = batches[last_batch_pos];
+          if (batch.running.fetch_add(1) == 0) {
+            if (batch.entry_count < BatchLen) {
+              batch_acquired = true;
+              break;
+            }
+          }
+          // either held by somebody else or already full: undo the claim
+          batch.running.fetch_sub(1);
+          last_batch_pos++;
+          last_batch_pos %= batchCount;
+        } while (last_batch_pos != pos0);
+      }
+      if (batch_acquired) {
+        auto& batch = batches[last_batch_pos];
+        ceph_assert(batch.running);
+        ceph_assert(batch.entry_count < BatchLen);
+
+        auto& entry = batch.entries[batch.entry_count];
+        entry.pool_id = pool_id;
+        entry.c = c;
+        entry.oid = oid;
+        entry.key = key;
+        entry.value = value;
+
+        ++batch.entry_count;
+        if (batch.entry_count == BatchLen) {
+          // batch is full: release it so that it can be dequeued
+          batch_acquired = false;
+          batch.running.fetch_sub(1);
+          last_batch_pos++;
+          last_batch_pos %= batchCount;
+        }
+        res = true;
+      }
+      return res;
+    }
+
+    // Finish the run: release the batch still held by the producer (if any),
+    // stop the thread pool, process the entries left in the batches on the
+    // calling thread and add every batch's statistics to 'ctx'.
+    void finalize(ThreadPool& tp,
+                  BlueStore::FSCK_ObjectCtx& ctx) {
+      if (batch_acquired) {
+        auto& batch = batches[last_batch_pos];
+        ceph_assert(batch.running);
+        batch.running.fetch_sub(1);
+      }
+      tp.stop();
+
+      for (size_t i = 0; i < batchCount; i++) {
+        auto& batch = batches[i];
+
+        //process leftovers if any
+        if (batch.entry_count) {
+          TPHandle tp_handle(store->cct,
+            nullptr,
+            timeout_interval,
+            suicide_interval);
+          ceph_assert(batch.running == 0);
+
+          batch.running++; // just to be on-par with the regular call
+          _void_process(&batch, tp_handle);
+        }
+        ceph_assert(batch.entry_count == 0);
+
+        ctx.errors += batch.errors;
+        ctx.warnings += batch.warnings;
+        ctx.num_objects += batch.num_objects;
+        ctx.num_extents += batch.num_extents;
+        ctx.num_blobs += batch.num_blobs;
+        ctx.num_sharded_objects += batch.num_sharded_objects;
+        ctx.num_spanning_blobs += batch.num_spanning_blobs;
+
+        ctx.expected_store_statfs.add(batch.expected_store_statfs);
+
+        for (auto it = batch.expected_pool_statfs.begin();
+          it != batch.expected_pool_statfs.end();
+          it++) {
+          ctx.expected_pool_statfs[it->first].add(it->second);
+        }
+      }
+    }
+  };
+};
+
+// Check the omap format of an onode and, when a repairer is supplied,
+// convert legacy omap data to the per-pg layout.
+//
+// depth - fsck depth; not used by this function
+// o     - onode to check; must have omap (asserted)
+// ctx   - fsck context; errors/warnings counters are updated through it and
+//         ctx.repairer (may be null) enables the conversion
+//
+// Reporting: an onode whose omap is neither per-pool nor pgmeta, or neither
+// per-pg nor pgmeta, is reported either as an error or as a warning depending
+// on the store's per_pool_omap mode and the bluestore_fsck_error_on_no_per_*
+// options.
+// Repair: all omap records are copied under the new prefix/keys (submitted
+// in portions), then the old records are removed and the onode is re-recorded
+// with the PERPOOL/PERPG omap flags set.
+void BlueStore::_fsck_check_object_omap(FSCKDepth depth,
+  OnodeRef& o,
+  const BlueStore::FSCK_ObjectCtx& ctx)
+{
+  auto& errors = ctx.errors;
+  auto& warnings = ctx.warnings;
+  auto repairer = ctx.repairer;
+
+  ceph_assert(o->onode.has_omap());
+  if (!o->onode.is_perpool_omap() && !o->onode.is_pgmeta_omap()) {
+    if (per_pool_omap == OMAP_PER_POOL) {
+      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+        << "fsck error: " << o->oid
+        << " has omap that is not per-pool or pgmeta"
+        << fsck_dendl;
+      ++errors;
+    } else {
+      // severity is selected by configuration
+      const char* w;
+      int64_t num;
+      if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+        ++errors;
+        num = errors;
+        w = "error";
+      } else {
+        ++warnings;
+        num = warnings;
+        w = "warning";
+      }
+      fsck_derr(num, MAX_FSCK_ERROR_LINES)
+        << "fsck " << w << ": " << o->oid
+        << " has omap that is not per-pool or pgmeta"
+        << fsck_dendl;
+    }
+  } else if (!o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) {
+    if (per_pool_omap == OMAP_PER_PG) {
+      fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+        << "fsck error: " << o->oid
+        << " has omap that is not per-pg or pgmeta"
+        << fsck_dendl;
+      ++errors;
+    } else {
+      // severity is selected by configuration
+      const char* w;
+      int64_t num;
+      if (cct->_conf->bluestore_fsck_error_on_no_per_pg_omap) {
+        ++errors;
+        num = errors;
+        w = "error";
+      } else {
+        ++warnings;
+        num = warnings;
+        w = "warning";
+      }
+      fsck_derr(num, MAX_FSCK_ERROR_LINES)
+        << "fsck " << w << ": " << o->oid
+        << " has omap that is not per-pg or pgmeta"
+        << fsck_dendl;
+    }
+  }
+  if (repairer &&
+    !o->onode.is_perpg_omap() &&
+    !o->onode.is_pgmeta_omap()) {
+    dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl;
+    bufferlist header;
+    map<string, bufferlist> kv;
+    // step 1: copy header, tail and values under the new prefix/keys
+    {
+      KeyValueDB::Transaction txn = db->get_transaction();
+      // approximate size (keys + values) of what is pending in 'txn'
+      uint64_t txn_cost = 0;
+      const string& prefix = Onode::calc_omap_prefix(o->onode.flags);
+      uint8_t new_flags = o->onode.flags |
+	bluestore_onode_t::FLAG_PERPOOL_OMAP |
+	bluestore_onode_t::FLAG_PERPG_OMAP;
+      const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags);
+
+      KeyValueDB::Iterator it = db->get_iterator(prefix);
+      string head, tail;
+      o->get_omap_header(&head);
+      o->get_omap_tail(&tail);
+      it->lower_bound(head);
+      // head
+      if (it->valid() && it->key() == head) {
+	dout(30) << __func__ << "  got header" << dendl;
+	header = it->value();
+	if (header.length()) {
+	  string new_head;
+	  Onode::calc_omap_header(new_flags, o.get(), &new_head);
+	  txn->set(new_omap_prefix, new_head, header);
+	  txn_cost += new_head.length() + header.length();
+	}
+	it->next();
+      }
+      // tail
+      {
+	string new_tail;
+	Onode::calc_omap_tail(new_flags, o.get(), &new_tail);
+	bufferlist empty;
+	txn->set(new_omap_prefix, new_tail, empty);
+	// key + value, as for every other record; the value is empty here
+	txn_cost += new_tail.length() + empty.length();
+      }
+      // values
+      string final_key;
+      Onode::calc_omap_key(new_flags, o.get(), string(), &final_key);
+      // length of the key part shared by all the values; the user key is
+      // appended after it for every record
+      size_t base_key_len = final_key.size();
+      while (it->valid() && it->key() < tail) {
+	string user_key;
+	o->decode_omap_key(it->key(), &user_key);
+	dout(20) << __func__ << "  got " << pretty_binary_string(it->key())
+	  << " -> " << user_key << dendl;
+
+	final_key.resize(base_key_len);
+	final_key += user_key;
+	auto v = it->value();
+	txn->set(new_omap_prefix, final_key, v);
+	txn_cost += final_key.length() + v.length();
+
+	// submit a portion if cost exceeds 16MB
+	if (txn_cost >= 16 * (1 << 20) ) {
+	  db->submit_transaction_sync(txn);
+	  txn = db->get_transaction();
+	  txn_cost = 0;
+	}
+	it->next();
+      }
+      // submit whatever is still pending
+      if (txn_cost > 0) {
+	db->submit_transaction_sync(txn);
+      }
+    }
+    // finalize: remove legacy data
+    {
+      KeyValueDB::Transaction txn = db->get_transaction();
+      // remove old keys
+      // (prefix/head/tail are obtained before the flags are changed below)
+      const string& old_omap_prefix = o->get_omap_prefix();
+      string old_head, old_tail;
+      o->get_omap_header(&old_head);
+      o->get_omap_tail(&old_tail);
+      txn->rm_range_keys(old_omap_prefix, old_head, old_tail);
+      txn->rmkey(old_omap_prefix, old_tail);
+      // set flag
+      o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP);
+      _record_onode(o, txn);
+      db->submit_transaction_sync(txn);
+      repairer->inc_repaired();
+      repairer->request_compaction();
+    }
+  }
+}
+
+// Walk all keys under PREFIX_OBJ and check every onode found there.
+//
+// depth - fsck depth. In FSCK_SHALLOW mode extent shard keys are skipped and,
+//         if bluestore_fsck_quick_fix_threads is non-zero, objects are handed
+//         over to a ShallowFSCKThreadPool; an object that cannot be queued is
+//         processed by the calling thread. In the other modes everything is
+//         processed by the calling thread and additional per-object checks
+//         (nid, unused/csum consistency, omap head, and a full read in
+//         FSCK_DEEP mode) are performed.
+// ctx   - fsck context; counters and trackers are updated through it.
+void BlueStore::_fsck_check_objects(
+  FSCKDepth depth,
+  BlueStore::FSCK_ObjectCtx& ctx)
+{
+  auto& errors = ctx.errors;
+  auto sb_info_lock = ctx.sb_info_lock;
+  auto& sb_info = ctx.sb_info;
+  auto& sb_ref_counts = ctx.sb_ref_counts;
+  auto repairer = ctx.repairer;
+
+  uint64_t_btree_t used_nids;
+
+  // number of objects checked by this thread rather than by the thread pool
+  size_t processed_myself = 0;
+
+  auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  // extent shard keys announced by the most recent onode and not seen yet
+  mempool::bluestore_fsck::list<string> expecting_shards;
+  if (it) {
+    const size_t thread_count = cct->_conf->bluestore_fsck_quick_fix_threads;
+    typedef ShallowFSCKThreadPool::FSCKWorkQueue<256> WQ;
+    std::unique_ptr<WQ> wq(
+      new WQ(
+        "FSCKWorkQueue",
+        // 32 batches per thread, at least 32 in total
+        std::max<size_t>(thread_count, 1) * 32,
+        this,
+        sb_info_lock,
+        sb_info,
+	sb_ref_counts,
+        repairer));
+
+    ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
+
+    thread_pool.add_work_queue(wq.get());
+    if (depth == FSCK_SHALLOW && thread_count > 0) {
+      //not the best place but let's check anyway
+      ceph_assert(sb_info_lock);
+      thread_pool.start();
+    }
+
+    // fill global if not overridden below
+    CollectionRef c;
+    int64_t pool_id = -1;
+    spg_t pgid;
+    for (it->lower_bound(string()); it->valid(); it->next()) {
+      dout(30) << __func__ << " key "
+        << pretty_binary_string(it->key()) << dendl;
+      if (is_extent_shard_key(it->key())) {
+        if (depth == FSCK_SHALLOW) {
+          continue;
+        }
+        // expected shards sorting before this key are missing
+        while (!expecting_shards.empty() &&
+          expecting_shards.front() < it->key()) {
+          derr << "fsck error: missing shard key "
+            << pretty_binary_string(expecting_shards.front())
+            << dendl;
+          ++errors;
+          expecting_shards.pop_front();
+        }
+        if (!expecting_shards.empty() &&
+          expecting_shards.front() == it->key()) {
+          // all good
+          expecting_shards.pop_front();
+          continue;
+        }
+
+        // the shard key matches nothing we expect
+        uint32_t offset;
+        string okey;
+        get_key_extent_shard(it->key(), &okey, &offset);
+        derr << "fsck error: stray shard 0x" << std::hex << offset
+          << std::dec << dendl;
+        if (expecting_shards.empty()) {
+          derr << "fsck error: " << pretty_binary_string(it->key())
+            << " is unexpected" << dendl;
+          ++errors;
+          continue;
+        }
+        while (expecting_shards.front() > it->key()) {
+          derr << "fsck error:   saw " << pretty_binary_string(it->key())
+            << dendl;
+          derr << "fsck error:   exp "
+            << pretty_binary_string(expecting_shards.front()) << dendl;
+          ++errors;
+          expecting_shards.pop_front();
+          if (expecting_shards.empty()) {
+            break;
+          }
+        }
+        continue;
+      }
+
+      ghobject_t oid;
+      int r = get_key_object(it->key(), &oid);
+      if (r < 0) {
+        derr << "fsck error: bad object key "
+          << pretty_binary_string(it->key()) << dendl;
+        ++errors;
+        continue;
+      }
+      // reuse the collection of the previous object when it still matches,
+      // otherwise look the owning collection up in coll_map
+      if (!c ||
+        oid.shard_id != pgid.shard ||
+        oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+        !c->contains(oid)) {
+        c = nullptr;
+        for (auto& p : coll_map) {
+          if (p.second->contains(oid)) {
+            c = p.second;
+            break;
+          }
+        }
+        if (!c) {
+          derr << "fsck error: stray object " << oid
+            << " not owned by any collection" << dendl;
+          ++errors;
+          continue;
+        }
+        pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+        dout(20) << __func__ << "  collection " << c->cid << " " << c->cnode
+          << dendl;
+      }
+
+      // a new object starts: shards still expected for the previous one are
+      // missing (reported per key, counted as a single error)
+      if (depth != FSCK_SHALLOW &&
+        !expecting_shards.empty()) {
+        for (auto& k : expecting_shards) {
+          derr << "fsck error: missing shard key "
+            << pretty_binary_string(k) << dendl;
+        }
+        ++errors;
+        expecting_shards.clear();
+      }
+
+      bool queued = false;
+      if (depth == FSCK_SHALLOW && thread_count > 0) {
+        queued = wq->queue(
+          pool_id,
+          c,
+          oid,
+          it->key(),
+          it->value());
+      }
+      OnodeRef o;
+      map<BlobRef, bluestore_blob_t::unused_t> referenced;
+
+      if (!queued) {
+        ++processed_myself;
+         o = fsck_check_objects_shallow(
+          depth,
+          pool_id,
+          c,
+          oid,
+          it->key(),
+          it->value(),
+          &expecting_shards,
+          &referenced,
+          ctx);
+      }
+
+      if (depth != FSCK_SHALLOW) {
+        ceph_assert(o != nullptr);
+        if (o->onode.nid) {
+          if (o->onode.nid > nid_max) {
+            derr << "fsck error: " << oid << " nid " << o->onode.nid
+              << " > nid_max " << nid_max << dendl;
+            ++errors;
+          }
+          if (used_nids.count(o->onode.nid)) {
+            derr << "fsck error: " << oid << " nid " << o->onode.nid
+              << " already in use" << dendl;
+            ++errors;
+            continue; // go for next object
+          }
+          used_nids.insert(o->onode.nid);
+        }
+        for (auto& i : referenced) {
+          dout(20) << __func__ << "  referenced 0x" << std::hex << i.second
+            << std::dec << " for " << *i.first << dendl;
+          const bluestore_blob_t& blob = i.first->get_blob();
+          // a region referenced by extents must not be flagged as unused
+          if (i.second & blob.unused) {
+            derr << "fsck error: " << oid << " blob claims unused 0x"
+              << std::hex << blob.unused
+              << " but extents reference 0x" << i.second << std::dec
+              << " on blob " << *i.first << dendl;
+            ++errors;
+          }
+          if (blob.has_csum()) {
+            uint64_t blob_len = blob.get_logical_length();
+            // bytes covered by a single bit of blob.unused
+            // NOTE(review): this is 0 if blob_len is smaller than the number
+            // of bits in blob.unused and is used as a divisor below -
+            // presumably valid blobs are never that small; confirm.
+            uint64_t unused_chunk_size = blob_len / (sizeof(blob.unused) * 8);
+            unsigned csum_count = blob.get_csum_count();
+            unsigned csum_chunk_size = blob.get_csum_chunk_size();
+            for (unsigned p = 0; p < csum_count; ++p) {
+              unsigned pos = p * csum_chunk_size;
+              unsigned firstbit = pos / unused_chunk_size;    // [firstbit,lastbit]
+              unsigned lastbit = (pos + csum_chunk_size - 1) / unused_chunk_size;
+              unsigned mask = 1u << firstbit;
+              for (unsigned b = firstbit + 1; b <= lastbit; ++b) {
+                mask |= 1u << b;
+              }
+              if ((blob.unused & mask) == mask) {
+                // this csum chunk region is marked unused
+                if (blob.get_csum_item(p) != 0) {
+                  derr << "fsck error: " << oid
+                    << " blob claims csum chunk 0x" << std::hex << pos
+                    << "~" << csum_chunk_size
+                    << " is unused (mask 0x" << mask << " of unused 0x"
+                    << blob.unused << ") but csum is non-zero 0x"
+                    << blob.get_csum_item(p) << std::dec << " on blob "
+                    << *i.first << dendl;
+                  ++errors;
+                }
+              }
+            }
+          }
+        }
+        // omap
+        if (o->onode.has_omap()) {
+          ceph_assert(ctx.used_omap_head);
+          if (ctx.used_omap_head->count(o->onode.nid)) {
+            derr << "fsck error: " << o->oid << " omap_head " << o->onode.nid
+                 << " already in use" << dendl;
+            ++errors;
+          } else {
+            ctx.used_omap_head->insert(o->onode.nid);
+          }
+        } // if (o->onode.has_omap())
+        if (depth == FSCK_DEEP) {
+          // read the whole object in portions of at most
+          // bluestore_fsck_read_bytes_cap bytes; a failed read is an error
+          bufferlist bl;
+          uint64_t max_read_block = cct->_conf->bluestore_fsck_read_bytes_cap;
+          uint64_t offset = 0;
+          do {
+            uint64_t l = std::min(uint64_t(o->onode.size - offset), max_read_block);
+            int r = _do_read(c.get(), o, offset, l, bl,
+              CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+            if (r < 0) {
+              ++errors;
+              derr << "fsck error: " << oid << std::hex
+                << " error during read: "
+                << " " << offset << "~" << l
+                << " " << cpp_strerror(r) << std::dec
+                << dendl;
+              break;
+            }
+            offset += l;
+          } while (offset < o->onode.size);
+        } // deep
+      } //if (depth != FSCK_SHALLOW)
+    } // for (it->lower_bound(string()); it->valid(); it->next())
+    if (depth == FSCK_SHALLOW && thread_count > 0) {
+      // stop the pool, process leftovers and merge per-batch results into ctx
+      wq->finalize(thread_pool, ctx);
+      if (processed_myself) {
+        // may be needs more threads?
+        dout(0) << __func__ << " partial offload"
+                << ", done myself " << processed_myself
+                << " of " << ctx.num_objects
+                << " objects, threads " << thread_count
+                << dendl;
+      }
+    }
+  } // if (it)
+}
+/**
+An overview of the currently implemented repair logic,
+performed in fsck in two stages: detection (+preparation) and commit.
+Detection stage (in processing order):
+  (Issue -> Repair action to schedule)
+  - Detect undecodable keys for Shared Blobs -> Remove
+  - Detect undecodable records for Shared Blobs -> Remove 
+    (might trigger missed Shared Blob detection below)
+  - Detect stray records for Shared Blobs -> Remove
+  - Detect misreferenced pextents -> Fix
+    Prepare Bloom-like filter to track cid/oid -> pextent 
+    Prepare list of extents that are improperly referenced
+    Enumerate Onode records that might use 'misreferenced' pextents
+    (Bloom-like filter applied to reduce computation)
+      For each questionable Onode enumerate all blobs and identify broken ones
+      (i.e. blobs having 'misreferences')
+      Rewrite each broken blob's data by allocating new extents and
+      copying the data there
+      If blob is shared - unshare it and mark corresponding Shared Blob 
+      for removal
+      Release previously allocated space
+      Update Extent Map
+  - Detect missed Shared Blobs -> Recreate
+  - Detect undecodable deferred transaction -> Remove
+  - Detect Freelist Manager's 'false free' entries -> Mark as used
+  - Detect Freelist Manager's leaked entries -> Mark as free
+  - Detect statfs inconsistency -> Update
+  Commit stage (separate DB commit per each step):
+  - Apply leaked FM entries fix
+  - Apply 'false free' FM entries fix
+  - Apply 'Remove' actions
+  - Apply fix for misreferenced pextents
+  - Apply Shared Blob recreate
+    (can be merged with the step above if misreferences were detected)
+  - Apply StatFS update
+*/
+// Entry point of fsck/repair: opens the DB and the related components,
+// replays deferred ops when the store is opened writable and then runs
+// _fsck_on_open(). Everything opened here is closed again by the scope
+// guards on every return path.
+// Returns a negative error code from any of the preparation steps, otherwise
+// the result of _fsck_on_open().
+int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
+{
+  const char* depth_tag = " (regular)";
+  if (depth == FSCK_DEEP) {
+    depth_tag = " (deep)";
+  } else if (depth == FSCK_SHALLOW) {
+    depth_tag = " (shallow)";
+  }
+  dout(5) << __func__
+    << (repair ? " repair" : " check")
+    << depth_tag
+    << dendl;
+
+  // repair and deep modes need write access to be able to replay deferred
+  // ops; everything else opens the store read-only
+  const bool read_only = !repair && depth != FSCK_DEEP;
+
+  int rc = _open_db_and_around(read_only);
+  if (rc < 0) {
+    return rc;
+  }
+  auto close_db = make_scope_guard([this] {
+    _close_db_and_around();
+  });
+
+  if (!read_only) {
+    rc = _upgrade_super();
+    if (rc < 0) {
+      return rc;
+    }
+  }
+
+  // NullFreelistManager needs to open collection early
+  rc = _open_collections();
+  if (rc < 0) {
+    return rc;
+  }
+
+  mempool_thread.init();
+  auto stop_mempool = make_scope_guard([this] {
+    mempool_thread.shutdown();
+    _shutdown_cache();
+  });
+
+  // finisher and kv_{sync,finalize}_thread are needed *just* for the replay,
+  // hence they are started in repair or deep mode only
+  if (!read_only) {
+    _kv_start();
+    rc = _deferred_replay();
+    _kv_stop();
+    if (rc < 0) {
+      return rc;
+    }
+  }
+
+  return _fsck_on_open(depth, repair);
+}
+
+int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
+{
+  uint64_t sb_hash_size = uint64_t(
+    cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+    cct->_conf.get_val<double>(
+      "bluestore_fsck_shared_blob_tracker_size"));
+
+  dout(1) << __func__
+	  << " <<<START>>>"
+	  << (repair ? " repair" : " check")
+	  << (depth == FSCK_DEEP ? " (deep)" :
+                depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
+          << " start sb_tracker_hash_size:" << sb_hash_size
+          << dendl;
+  int64_t errors = 0;
+  int64_t warnings = 0;
+  unsigned repaired = 0;
+
+  uint64_t_btree_t used_omap_head;
+  uint64_t_btree_t used_sbids;
+
+  mempool_dynamic_bitset used_blocks, bluefs_used_blocks;
+  KeyValueDB::Iterator it;
+  store_statfs_t expected_store_statfs;
+  per_pool_statfs expected_pool_statfs;
+
+  sb_info_space_efficient_map_t sb_info;
+  shared_blob_2hash_tracker_t sb_ref_counts(
+    sb_hash_size,
+    min_alloc_size);
+  size_t sb_ref_mismatches = 0;
+
+  /// map of oid -> (first_)offset for each zone
+  std::vector<std::unordered_map<ghobject_t, uint64_t>> zone_refs;   // FIXME: this may be a lot of RAM!
+
+  uint64_t num_objects = 0;
+  uint64_t num_extents = 0;
+  uint64_t num_blobs = 0;
+  uint64_t num_spanning_blobs = 0;
+  uint64_t num_shared_blobs = 0;
+  uint64_t num_sharded_objects = 0;
+  BlueStoreRepairer repairer;
+
+  auto alloc_size = fm->get_alloc_size();
+
+  utime_t start = ceph_clock_now();
+
+  _fsck_collections(&errors);
+  used_blocks.resize(fm->get_alloc_units());
+
+  if (bluefs) {
+    interval_set<uint64_t> bluefs_extents;
+
+    bluefs->foreach_block_extents(
+      bluefs_layout.shared_bdev,
+      [&](uint64_t start, uint32_t len) {
+        apply_for_bitset_range(start, len, alloc_size, used_blocks,
+          [&](uint64_t pos, mempool_dynamic_bitset& bs) {
+            ceph_assert(pos < bs.size());
+            bs.set(pos);
+          }
+        );
+      }
+    );
+  }
+
+  bluefs_used_blocks = used_blocks;
+
+  apply_for_bitset_range(
+    0, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
+    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+      bs.set(pos);
+    }
+  );
+
+
+  if (repair) {
+    repairer.init_space_usage_tracker(
+      bdev->get_size(),
+      min_alloc_size);
+  }
+
+  if (bluefs) {
+    int r = bluefs->fsck();
+    if (r < 0) {
+      return r;
+    }
+    if (r > 0)
+      errors += r;
+  }
+
+  if (!per_pool_stat_collection) {
+    const char *w;
+    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_stats) {
+      w = "error";
+      ++errors;
+    } else {
+      w = "warning";
+      ++warnings;
+    }
+    derr << "fsck " << w << ": store not yet converted to per-pool stats"
+	 << dendl;
+  }
+  if (per_pool_omap != OMAP_PER_PG) {
+    const char *w;
+    if (cct->_conf->bluestore_fsck_error_on_no_per_pool_omap) {
+      w = "error";
+      ++errors;
+    } else {
+      w = "warning";
+      ++warnings;
+    }
+    derr << "fsck " << w << ": store not yet converted to per-pg omap"
+	 << dendl;
+  }
+
+  if (g_conf()->bluestore_debug_fsck_abort) {
+    dout(1) << __func__ << " debug abort" << dendl;
+    goto out_scan;
+  }
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    auto a = dynamic_cast<ZonedAllocator*>(alloc);
+    ceph_assert(a);
+    auto f = dynamic_cast<ZonedFreelistManager*>(fm);
+    ceph_assert(f);
+    vector<uint64_t> wp = bdev->get_zones();
+    vector<zone_state_t> zones = f->get_zone_states(db);
+    ceph_assert(wp.size() == zones.size());
+    auto num_zones = bdev->get_size() / zone_size;
+    for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+      uint64_t p = wp[i] == (i + 1) * zone_size ? zone_size : wp[i] % zone_size;
+      if (zones[i].write_pointer > p &&
+	  zones[i].num_dead_bytes < zones[i].write_pointer) {
+	derr << "fsck error: zone 0x" << std::hex << i
+	     << " bluestore write pointer 0x" << zones[i].write_pointer
+	     << " > device write pointer 0x" << p
+	     << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)"
+	     << std::dec << dendl;
+	++errors;
+      }
+    }
+
+    if (depth != FSCK_SHALLOW) {
+      // load zone refs
+      zone_refs.resize(bdev->get_size() / zone_size);
+      it = db->get_iterator(PREFIX_ZONED_CL_INFO, KeyValueDB::ITERATOR_NOCACHE);
+      if (it) {
+	for (it->lower_bound(string());
+	     it->valid();
+	     it->next()) {
+	  uint32_t zone = 0;
+	  uint64_t offset = 0;
+	  ghobject_t oid;
+	  string key = it->key();
+	  int r = get_key_zone_offset_object(key, &zone, &offset, &oid);
+	  if (r < 0) {
+	    derr << "fsck error: invalid zone ref key " << pretty_binary_string(key)
+		 << dendl;
+	    if (repair) {
+	      repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
+	    }
+	    ++errors;
+	    continue;
+	  }
+	  dout(30) << " zone ref 0x" << std::hex << zone << " offset 0x" << offset
+		   << " -> " << std::dec << oid << dendl;
+	  if (zone_refs[zone].count(oid)) {
+	    derr << "fsck error: second zone ref in zone 0x" << std::hex << zone
+		 << " offset 0x" << offset << std::dec << " for " << oid << dendl;
+	    if (repair) {
+	      repairer.remove_key(db, PREFIX_ZONED_CL_INFO, key);
+	    }
+	    ++errors;
+	    continue;
+	  }
+	  zone_refs[zone][oid] = offset;
+	}
+      }
+    }
+  }
+#endif
+
+  dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
+  it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+  if (it) {
+    for (it->lower_bound(string()); it->valid(); it->next()) {
+      string key = it->key();
+      uint64_t sbid;
+      if (get_key_shared_blob(key, &sbid) < 0) {
+        // Failed to parse the key.
+	// This gonna to be handled at the second stage
+	continue;
+      }
+      bluestore_shared_blob_t shared_blob(sbid);
+      bufferlist bl = it->value();
+      auto blp = bl.cbegin();
+      try {
+	decode(shared_blob, blp);
+      }
+      catch (ceph::buffer::error& e) {
+	// this gonna to be handled at the second stage
+	continue;
+      }
+      dout(20) << __func__ << "  " << shared_blob << dendl;
+      auto& sbi = sb_info.add_maybe_stray(sbid);
+
+      // primarily to silent the 'unused' warning
+      ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+
+      for (auto& r : shared_blob.ref_map.ref_map) {
+	sb_ref_counts.inc_range(
+	  sbid,
+	  r.first,
+	  r.second.length,
+	  -r.second.refs);
+      }
+    }
+  } // if (it) //checking shared_blobs (phase1)
+
+  // walk PREFIX_OBJ
+  {
+    dout(1) << __func__ << " walking object keyspace" << dendl;
+    ceph::mutex sb_info_lock =  ceph::make_mutex("BlueStore::fsck::sbinfo_lock");
+    BlueStore::FSCK_ObjectCtx ctx(
+      errors,
+      warnings,
+      num_objects,
+      num_extents,
+      num_blobs,
+      num_sharded_objects,
+      num_spanning_blobs,
+      &used_blocks,
+      &used_omap_head,
+      &zone_refs,
+      //no need for the below lock when in non-shallow mode as
+      // there is no multithreading in this case
+      depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
+      sb_info,
+      sb_ref_counts,
+      expected_store_statfs,
+      expected_pool_statfs,
+      repair ? &repairer : nullptr);
+
+    _fsck_check_objects(depth, ctx);
+  }
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr() && depth != FSCK_SHALLOW) {
+    dout(1) << __func__ << " checking for leaked zone refs" << dendl;
+    for (uint32_t zone = 0; zone < zone_refs.size(); ++zone) {
+      for (auto& [oid, offset] : zone_refs[zone]) {
+	derr << "fsck error: stray zone ref 0x" << std::hex << zone
+	     << " offset 0x" << offset << " -> " << std::dec << oid << dendl;
+	// FIXME: add repair
+	++errors;
+      }
+    }
+  }
+#endif
+
+  sb_ref_mismatches = sb_ref_counts.count_non_zero();
+  if (sb_ref_mismatches != 0) {
+    derr << "fsck error:" << "*" << sb_ref_mismatches
+         << " shared blob references aren't matching, at least "
+         << sb_ref_mismatches << " found" << dendl;
+    errors += sb_ref_mismatches;
+  }
+
+  if (depth != FSCK_SHALLOW && repair) {
+    _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
+  }
+  dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
+  it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+  if (it) {
+    // FIXME minor: perhaps simplify for shallow mode?
+    // fill global if not overriden below
+    auto expected_statfs = &expected_store_statfs;
+    for (it->lower_bound(string()); it->valid(); it->next()) {
+      string key = it->key();
+      uint64_t sbid;
+      if (get_key_shared_blob(key, &sbid)) {
+	derr << "fsck error: bad key '" << key
+	  << "' in shared blob namespace" << dendl;
+	if (repair) {
+	  repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+	}
+	++errors;
+	continue;
+      }
+      auto p = sb_info.find(sbid);
+      if (p == sb_info.end()) {
+        if (sb_ref_mismatches > 0) {
+	  // highly likely this has been already reported before, ignoring...
+	  dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
+	    << std::hex << sbid << std::dec << dendl;
+	} else {
+	  derr<< "fsck error: found stray shared blob data for sbid 0x"
+	    << std::hex << sbid << std::dec << dendl;
+	  ++errors;
+	  if (repair) {
+	    repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+	  }
+	}
+      } else {
+	++num_shared_blobs;
+	sb_info_t& sbi = *p;
+	bluestore_shared_blob_t shared_blob(sbid);
+	bufferlist bl = it->value();
+	auto blp = bl.cbegin();
+	try {
+	  decode(shared_blob, blp);
+	}
+	catch (ceph::buffer::error& e) {
+	  ++errors;
+
+	  derr << "fsck error: failed to decode Shared Blob"
+	    << pretty_binary_string(key) << dendl;
+	  if (repair) {
+	    dout(20) << __func__ << " undecodable Shared Blob, key:'"
+	      << pretty_binary_string(key)
+	      << "', removing" << dendl;
+	    repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+	  }
+	  continue;
+	}
+	dout(20) << __func__ << "  " << shared_blob << dendl;
+	PExtentVector extents;
+	for (auto& r : shared_blob.ref_map.ref_map) {
+	  extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
+	}
+	if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
+	    (per_pool_stat_collection || repair)) {
+	  expected_statfs = &expected_pool_statfs[sbi.pool_id];
+	}
+	std::stringstream ss;
+	ss << "sbid 0x" << std::hex << sbid << std::dec;
+	errors += _fsck_check_extents(ss.str(),
+	  extents,
+	  sbi.allocated_chunks < 0,
+	  used_blocks,
+	  fm->get_alloc_size(),
+	  repair ? &repairer : nullptr,
+	  *expected_statfs,
+	  depth);
+      }
+    }
+  } // if (it) /* checking shared_blobs (phase 2)*/
+
+  if (repair && repairer.preprocess_misreference(db)) {
+
+    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+    auto& misref_extents = repairer.get_misreferences();
+    interval_set<uint64_t> to_release;
+    it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      // fill global if not overriden below
+      auto expected_statfs = &expected_store_statfs;
+
+      CollectionRef c;
+      spg_t pgid;
+      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+      bool bypass_rest = false;
+      for (it->lower_bound(string()); it->valid() && !bypass_rest;
+	   it->next()) {
+	dout(30) << __func__ << " key "
+		 << pretty_binary_string(it->key()) << dendl;
+	if (is_extent_shard_key(it->key())) {
+	  continue;
+	}
+
+	ghobject_t oid;
+	int r = get_key_object(it->key(), &oid);
+	if (r < 0 || !repairer.is_used(oid)) {
+	  continue;
+	}
+
+	if (!c ||
+	    oid.shard_id != pgid.shard ||
+	    oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+	    !c->contains(oid)) {
+	  c = nullptr;
+	  for (auto& p : coll_map) {
+	    if (p.second->contains(oid)) {
+	      c = p.second;
+	      break;
+	    }
+	  }
+	  if (!c) {
+	    continue;
+	  }
+	  if (per_pool_stat_collection || repair) {
+	    auto pool_id = c->cid.is_pg(&pgid) ? pgid.pool() : META_POOL_ID;
+	    expected_statfs = &expected_pool_statfs[pool_id];
+	  }
+	}
+	if (!repairer.is_used(c->cid)) {
+	  continue;
+	}
+
+	dout(20) << __func__ << " check misreference for col:" << c->cid
+		  << " obj:" << oid << dendl;
+
+        OnodeRef o;
+        o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+	o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+	mempool::bluestore_fsck::set<BlobRef> blobs;
+
+	for (auto& e : o->extent_map.extent_map) {
+	  blobs.insert(e.blob);
+	}
+	bool need_onode_update = false;
+	bool first_dump = true;
+	for(auto b : blobs) {
+	  bool broken_blob = false;
+	  auto& pextents = b->dirty_blob().dirty_extents();
+	  for (auto& e : pextents) {
+	    if (!e.is_valid()) {
+	      continue;
+	    }
+	    // for the sake of simplicity and proper shared blob handling
+	    // always rewrite the whole blob even when it's partially
+	    // misreferenced.
+	    if (misref_extents.intersects(e.offset, e.length)) {
+	      if (first_dump) {
+		first_dump = false;
+		_dump_onode<10>(cct, *o);
+	      }
+	      broken_blob = true;
+	      break;
+	    }
+	  }
+	  if (!broken_blob)
+	    continue;
+	  bool compressed = b->get_blob().is_compressed();
+          need_onode_update = true;
+	  dout(10) << __func__
+		    << " fix misreferences in oid:" << oid
+		    << " " << *b << dendl;
+	  uint64_t b_off = 0;
+	  PExtentVector pext_to_release;
+	  pext_to_release.reserve(pextents.size());
+	  // rewriting all valid pextents
+	  for (auto e = pextents.begin(); e != pextents.end();
+	         e++) {
+	    auto b_off_cur = b_off;
+	    b_off += e->length;
+	    if (!e->is_valid()) {
+	      continue;
+	    }
+	    PExtentVector exts;
+	    dout(5) << __func__ << "::NCB::(F)alloc=" << alloc << ", length=" << e->length << dendl;
+	    int64_t alloc_len =
+              alloc->allocate(e->length, min_alloc_size,
+				       0, 0, &exts);
+	    if (alloc_len < 0 || alloc_len < (int64_t)e->length) {
+	      derr << __func__
+	           << " failed to allocate 0x" << std::hex << e->length
+		   << " allocated 0x " << (alloc_len < 0 ? 0 : alloc_len)
+		   << " min_alloc_size 0x" << min_alloc_size
+		   << " available 0x " << alloc->get_free()
+		   << std::dec << dendl;
+	      if (alloc_len > 0) {
+                alloc->release(exts);
+	      }
+	      bypass_rest = true;
+	      break;
+	    }
+            expected_statfs->allocated += e->length;
+	    if (compressed) {
+	      expected_statfs->data_compressed_allocated += e->length;
+	    }
+
+	    bufferlist bl;
+	    IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+	    r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+	    if (r < 0) {
+	      derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+		    <<"~" << e->length << std::dec << dendl;
+	      ceph_abort_msg("read failed, wtf");
+	    }
+	    pext_to_release.push_back(*e);
+	    e = pextents.erase(e);
+    	    e = pextents.insert(e, exts.begin(), exts.end());
+	    b->get_blob().map_bl(
+	      b_off_cur, bl,
+	      [&](uint64_t offset, bufferlist& t) {
+		int r = bdev->write(offset, t, false);
+		ceph_assert(r == 0);
+	      });
+	    e += exts.size() - 1;
+            for (auto& p : exts) {
+	      fm->allocate(p.offset, p.length, txn);
+	    }
+	  } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
+
+	  if (b->get_blob().is_shared()) {
+            b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+	    auto sbid = b->shared_blob->get_sbid();
+	    auto sb_it = sb_info.find(sbid);
+	    ceph_assert(sb_it != sb_info.end());
+	    sb_info_t& sbi = *sb_it;
+
+	    if (sbi.allocated_chunks < 0) {
+	      // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
+	      // as we originally used that value while accumulating
+	      // expected_statfs
+	      expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+	      expected_statfs->data_compressed_allocated -=
+		uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+	    } else {
+	      expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
+	    }
+	    sbi.allocated_chunks = 0;
+	    repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+
+	    // relying on blob's pextents to decide what to release.
+	    for (auto& p : pext_to_release) {
+	      to_release.union_insert(p.offset, p.length);
+	    }
+	  } else {
+	    for (auto& p : pext_to_release) {
+	      expected_statfs->allocated -= p.length;
+	      if (compressed) {
+		expected_statfs->data_compressed_allocated -= p.length;
+	      }
+	      to_release.union_insert(p.offset, p.length);
+	    }
+	  }
+	  if (bypass_rest) {
+	    break;
+	  }
+	} // for(auto b : blobs) 
+	if (need_onode_update) {
+	  o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+	  _record_onode(o, txn);
+	}
+      } // for (it->lower_bound(string()); it->valid(); it->next())
+
+      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+	dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+		 << "~" << it.get_len() << std::dec << dendl;
+	fm->release(it.get_start(), it.get_len(), txn);
+      }
+      alloc->release(to_release);
+      to_release.clear();
+    } // if (it) {
+  } //if (repair && repairer.preprocess_misreference()) {
+  sb_info.clear();
+  sb_ref_counts.reset();
+
+  dout(1) << __func__ << " checking pool_statfs" << dendl;
+  _fsck_check_statfs(expected_store_statfs, expected_pool_statfs,
+    errors, warnings, repair ? &repairer : nullptr);
+  if (depth != FSCK_SHALLOW) {
+    dout(1) << __func__ << " checking for stray omap data " << dendl;
+    it = db->get_iterator(PREFIX_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      uint64_t last_omap_head = 0;
+      for (it->lower_bound(string()); it->valid(); it->next()) {
+        uint64_t omap_head;
+
+        _key_decode_u64(it->key().c_str(), &omap_head);
+
+        if (used_omap_head.count(omap_head) == 0 &&
+           omap_head != last_omap_head) {
+          pair<string,string> rk = it->raw_key();
+          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+            << "fsck error: found stray omap data on omap_head "
+            << omap_head << " " << last_omap_head
+            << " prefix/key: " << url_escape(rk.first)
+            << " " << url_escape(rk.second)
+            << fsck_dendl;
+          ++errors;
+          last_omap_head = omap_head;
+        }
+      }
+    }
+    it = db->get_iterator(PREFIX_PGMETA_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      uint64_t last_omap_head = 0;
+      for (it->lower_bound(string()); it->valid(); it->next()) {
+        uint64_t omap_head;
+        _key_decode_u64(it->key().c_str(), &omap_head);
+        if (used_omap_head.count(omap_head) == 0 &&
+	    omap_head != last_omap_head) {
+          pair<string,string> rk = it->raw_key();
+          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+            << "fsck error: found stray (pgmeta) omap data on omap_head "
+            << omap_head << " " << last_omap_head
+            << " prefix/key: " << url_escape(rk.first)
+            << " " << url_escape(rk.second)
+            << fsck_dendl;
+          last_omap_head = omap_head;
+	  ++errors;
+        }
+      }
+    }
+    it = db->get_iterator(PREFIX_PERPOOL_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      uint64_t last_omap_head = 0;
+      for (it->lower_bound(string()); it->valid(); it->next()) {
+        uint64_t pool;
+        uint64_t omap_head;
+        string k = it->key();
+        const char *c = k.c_str();
+        c = _key_decode_u64(c, &pool);
+        c = _key_decode_u64(c, &omap_head);
+        if (used_omap_head.count(omap_head) == 0 &&
+          omap_head != last_omap_head) {
+          pair<string,string> rk = it->raw_key();
+          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+            << "fsck error: found stray (per-pool) omap data on omap_head "
+            << omap_head << " " << last_omap_head
+            << " prefix/key: " << url_escape(rk.first)
+            << " " << url_escape(rk.second)
+            << fsck_dendl;
+          ++errors;
+          last_omap_head = omap_head;
+        }
+      }
+    }
+    it = db->get_iterator(PREFIX_PERPG_OMAP, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      uint64_t last_omap_head = 0;
+      for (it->lower_bound(string()); it->valid(); it->next()) {
+        uint64_t pool;
+        uint32_t hash;
+        uint64_t omap_head;
+        string k = it->key();
+        const char* c = k.c_str();
+        c = _key_decode_u64(c, &pool);
+        c = _key_decode_u32(c, &hash);
+        c = _key_decode_u64(c, &omap_head);
+        if (used_omap_head.count(omap_head) == 0 &&
+          omap_head != last_omap_head) {
+          fsck_derr(errors, MAX_FSCK_ERROR_LINES)
+            << "fsck error: found stray (per-pg) omap data on omap_head "
+	    << " key " << pretty_binary_string(it->key())
+            << omap_head << " " << last_omap_head << " " << used_omap_head.count(omap_head) << fsck_dendl;
+          ++errors;
+          last_omap_head = omap_head;
+        }
+      }
+    }
+    dout(1) << __func__ << " checking deferred events" << dendl;
+    it = db->get_iterator(PREFIX_DEFERRED, KeyValueDB::ITERATOR_NOCACHE);
+    if (it) {
+      for (it->lower_bound(string()); it->valid(); it->next()) {
+        bufferlist bl = it->value();
+        auto p = bl.cbegin();
+        bluestore_deferred_transaction_t wt;
+        try {
+	  decode(wt, p);
+        } catch (ceph::buffer::error& e) {
+	  derr << "fsck error: failed to decode deferred txn "
+	       << pretty_binary_string(it->key()) << dendl;
+	  if (repair) {
+            dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+		     << pretty_binary_string(it->key())
+		     << "', removing" << dendl;
+	    repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+	  }
+	  continue;
+        }
+        dout(20) << __func__ << "  deferred " << wt.seq
+	         << " ops " << wt.ops.size()
+	         << " released 0x" << std::hex << wt.released << std::dec << dendl;
+        for (auto e = wt.released.begin(); e != wt.released.end(); ++e) {
+          apply_for_bitset_range(
+            e.get_start(), e.get_len(), alloc_size, used_blocks,
+            [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+              bs.set(pos);
+            }
+          );
+        }
+      }
+    }
+
+    // skip freelist vs allocated compare when we have Null fm
+    if (!fm->is_null_manager()) {
+      dout(1) << __func__ << " checking freelist vs allocated" << dendl;
+#ifdef HAVE_LIBZBD
+      if (freelist_type == "zoned") {
+	// verify per-zone state
+	//  - verify no allocations beyond write pointer
+	//  - verify num_dead_bytes count (neither allocated nor
+	//    free space past the write pointer)
+	auto a = dynamic_cast<ZonedAllocator*>(alloc);
+	auto num_zones = bdev->get_size() / zone_size;
+
+	// mark the free space past the write pointer
+	for (uint32_t zone = first_sequential_zone; zone < num_zones; ++zone) {
+	  auto wp = a->get_write_pointer(zone);
+	  uint64_t offset = zone_size * zone + wp;
+	  uint64_t length = zone_size - wp;
+	  if (!length) {
+	    continue;
+	  }
+	  bool intersects = false;
+	  dout(10) << "  marking zone 0x" << std::hex << zone
+		   << " region after wp 0x" << offset << "~" << length
+		   << std::dec << dendl;
+	  apply_for_bitset_range(
+	    offset, length, alloc_size, used_blocks,
+	    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	      if (bs.test(pos)) {
+		derr << "fsck error: zone 0x" << std::hex << zone
+		     << " has used space at 0x" << pos * alloc_size
+		     << " beyond write pointer 0x" << wp
+		     << std::dec << dendl;
+		intersects = true;
+	      } else {
+		bs.set(pos);
+	      }
+	    }
+	    );
+	  if (intersects) {
+	    ++errors;
+	  }
+	}
+
+	used_blocks.flip();
+
+	// skip conventional zones
+	uint64_t pos = (first_sequential_zone * zone_size) / min_alloc_size - 1;
+	pos = used_blocks.find_next(pos);
+
+	uint64_t zone_dead = 0;
+	for (uint32_t zone = first_sequential_zone;
+	     zone < num_zones;
+	     ++zone, zone_dead = 0) {
+	  while (pos != decltype(used_blocks)::npos &&
+		 (pos * min_alloc_size) / zone_size == zone) {
+	    dout(40) << " zone 0x" << std::hex << zone
+		     << " dead 0x" << (pos * min_alloc_size) << "~" << min_alloc_size
+		     << std::dec << dendl;
+	    zone_dead += min_alloc_size;
+	    pos = used_blocks.find_next(pos);
+	  }
+	  dout(20) << " zone 0x" << std::hex << zone << " dead is 0x" << zone_dead
+		   << std::dec << dendl;
+	  // cross-check dead bytes against zone state
+	  if (a->get_dead_bytes(zone) != zone_dead) {
+	    derr << "fsck error: zone 0x" << std::hex << zone << " has 0x" << zone_dead
+		 << " dead bytes but freelist says 0x" << a->get_dead_bytes(zone)
+		 << dendl;
+	    ++errors;
+	    // TODO: repair
+	  }
+	}
+	used_blocks.flip();
+      } else
+#endif
+      {
+	fm->enumerate_reset();
+	uint64_t offset, length;
+	while (fm->enumerate_next(db, &offset, &length)) {
+	  bool intersects = false;
+	  apply_for_bitset_range(
+	    offset, length, alloc_size, used_blocks,
+	    [&](uint64_t pos, mempool_dynamic_bitset &bs) {
+	      ceph_assert(pos < bs.size());
+	      if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
+		if (offset == SUPER_RESERVED &&
+		    length == min_alloc_size - SUPER_RESERVED) {
+		  // this is due to the change just after luminous to min_alloc_size
+		  // granularity allocations, and our baked in assumption at the top
+		  // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+		  // (vs luminous's round_up_to(SUPER_RESERVED,block_size)).  harmless,
+		  // since we will never allocate this region below min_alloc_size.
+		  dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+			   << " and min_alloc_size, 0x" << std::hex << offset << "~"
+			   << length << std::dec << dendl;
+		} else {
+		  intersects = true;
+		  if (repair) {
+		    repairer.fix_false_free(db, fm,
+					    pos * min_alloc_size,
+					    min_alloc_size);
+		  }
+		}
+	      } else {
+		bs.set(pos);
+	      }
+	    }
+	    );
+	  if (intersects) {
+	    derr << "fsck error: free extent 0x" << std::hex << offset
+		 << "~" << length << std::dec
+		 << " intersects allocated blocks" << dendl;
+	    ++errors;
+	  }
+	}
+	fm->enumerate_reset();
+
+	// check for leaked extents
+	size_t count = used_blocks.count();
+	if (used_blocks.size() != count) {
+	  ceph_assert(used_blocks.size() > count);
+	  used_blocks.flip();
+	  size_t start = used_blocks.find_first();
+	  while (start != decltype(used_blocks)::npos) {
+	    size_t cur = start;
+	    while (true) {
+	      size_t next = used_blocks.find_next(cur);
+	      if (next != cur + 1) {
+		++errors;
+		derr << "fsck error: leaked extent 0x" << std::hex
+		     << ((uint64_t)start * fm->get_alloc_size()) << "~"
+		     << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
+		     << dendl;
+		if (repair) {
+		  repairer.fix_leaked(db,
+				      fm,
+				      start * min_alloc_size,
+				      (cur + 1 - start) * min_alloc_size);
+		}
+		start = next;
+		break;
+	      }
+	      cur = next;
+	    }
+	  }
+	  used_blocks.flip();
+	}
+      }
+    }
+  }
+  if (repair) {
+    if (per_pool_omap != OMAP_PER_PG) {
+      dout(5) << __func__ << " fixing per_pg_omap" << dendl;
+      repairer.fix_per_pool_omap(db, OMAP_PER_PG);
+    }
+
+    dout(5) << __func__ << " applying repair results" << dendl;
+    repaired = repairer.apply(db);
+    dout(5) << __func__ << " repair applied" << dendl;
+  }
+
+out_scan:
+  dout(2) << __func__ << " " << num_objects << " objects, "
+	  << num_sharded_objects << " of them sharded.  "
+	  << dendl;
+  dout(2) << __func__ << " " << num_extents << " extents to "
+	  << num_blobs << " blobs, "
+	  << num_spanning_blobs << " spanning, "
+	  << num_shared_blobs << " shared."
+	  << dendl;
+
+  utime_t duration = ceph_clock_now() - start;
+  dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, "
+	  << warnings << " warnings, "
+	  << repaired << " repaired, "
+	  << (errors + warnings - (int)repaired) << " remaining in "
+	  << duration << " seconds" << dendl;
+
+  // In non-repair mode we should return error count only as
+  // it indicates if store status is OK.
+  // In repair mode both errors and warnings are taken into account
+  // since repaired counter relates to them both.
+  return repair ? errors + warnings - (int)repaired : errors;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+				  const bufferlist& bl)
+{
+  // Error-injection hook: write the caller-supplied payload verbatim under
+  // the given key in the shared-blob keyspace. Neither the key nor the
+  // value is validated here, and the write is submitted synchronously.
+  KeyValueDB::Transaction t = db->get_transaction();
+  t->set(PREFIX_SHARED_BLOB, key, bl);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_no_shared_blob_key()
+{
+  // Error-injection hook: remove the shared-blob record whose id equals
+  // blobid_last. Requires that at least one blob id has been handed out.
+  KeyValueDB::Transaction t = db->get_transaction();
+  ceph_assert(blobid_last > 0);
+
+  // Target the last used sbid. Because blob ids are preallocated this may,
+  // in rare cases, not be the right one; kept as-is for simplicity.
+  const uint64_t victim = blobid_last;
+  dout(5) << __func__<< " " << victim << dendl;
+
+  string victim_key;
+  get_shared_blob_key(victim, &victim_key);
+  t->rmkey(PREFIX_SHARED_BLOB, victim_key);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
+{
+  // Error-injection hook: persist a shared-blob record for `sbid` without
+  // touching any onode.
+  KeyValueDB::Transaction t = db->get_transaction();
+  dout(5) << __func__ << " " << sbid << dendl;
+
+  // The record carries a single reference of min_alloc_size bytes at the
+  // fixed offset 0xdead0000.
+  bluestore_shared_blob_t stray(sbid);
+  stray.ref_map.get(0xdead0000, min_alloc_size);
+
+  bufferlist payload;
+  encode(stray, payload);
+  dout(20) << __func__ << " sbid " << sbid
+    << " takes " << payload.length() << " bytes, updating"
+    << dendl;
+
+  string record_key;
+  get_shared_blob_key(sbid, &record_key);
+  t->set(PREFIX_SHARED_BLOB, record_key, payload);
+  db->submit_transaction_sync(t);
+}
+
+
+void BlueStore::inject_leaked(uint64_t len)
+{
+  // Error-injection hook: take `len` bytes from the allocator and mark them
+  // allocated in the freelist manager, without attaching them to any object.
+  PExtentVector grabbed;
+  const int64_t got = alloc->allocate(len, min_alloc_size,
+                                      min_alloc_size * 256, 0, &grabbed);
+
+  // With a null freelist manager nothing is written to the DB; the
+  // allocator-side allocation above is all that happens.
+  if (fm->is_null_manager()) {
+    return;
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+
+  ceph_assert(got >= static_cast<int64_t>(len));
+  for (const auto& ext : grabbed) {
+    fm->allocate(ext.offset, ext.length, t);
+  }
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
+  // Error-injection hook: release, in the freelist manager only, space that
+  // the given object still references. Needs a real freelist manager.
+  ceph_assert(!fm->is_null_manager());
+
+  CollectionRef c = _get_collection(cid);
+  ceph_assert(c);
+  OnodeRef o;
+  {
+    std::unique_lock l{c->lock}; // just to avoid internal asserts
+    o = c->get_onode(oid, false);
+    ceph_assert(o);
+    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+  }
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  auto& em = o->extent_map.extent_map;
+
+  // Candidates: the physical extents of the blob behind the first logical
+  // extent and, when there is more than one logical extent, of the blob
+  // behind the last one.
+  std::vector<const PExtentVector*> candidates;
+  if (!em.empty()) {
+    candidates.push_back(&em.begin()->blob->get_blob().get_extents());
+  }
+  if (em.size() > 1) {
+    auto last = em.end();
+    --last;
+    candidates.push_back(&(last->blob->get_blob().get_extents()));
+  }
+
+  // For each candidate release only its first valid physical extent.
+  bool injected = false;
+  for (const PExtentVector* pexts : candidates) {
+    for (const auto& pe : *pexts) {
+      if (!pe.is_valid()) {
+        continue;
+      }
+      dout(20) << __func__ << " release 0x" << std::hex << pe.offset
+               << "~" << pe.length << std::dec << dendl;
+      fm->release(pe.offset, pe.length, t);
+      injected = true;
+      break;
+    }
+  }
+
+  // At least one extent must have been released, otherwise the injection
+  // did nothing.
+  ceph_assert(injected);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_legacy_omap()
+{
+  dout(1) << __func__ << dendl;
+
+  // Switch the in-memory omap mode to bulk and delete the persisted
+  // "per_pool_omap" entry from the superblock keyspace.
+  per_pool_omap = OMAP_BULK;
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  t->rmkey(PREFIX_SUPER, "per_pool_omap");
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_legacy_omap(coll_t cid, ghobject_t oid)
+{
+  dout(1) << __func__ << " "
+          << cid << " " << oid
+          <<dendl;
+
+  // Look up the (existing) onode; it is not created if missing.
+  CollectionRef c = _get_collection(cid);
+  ceph_assert(c);
+  OnodeRef o;
+  {
+    std::unique_lock l{ c->lock }; // just to avoid internal asserts
+    o = c->get_onode(oid, false);
+    ceph_assert(o);
+  }
+
+  // Clear all three omap-layout flags on the onode and persist it.
+  const auto omap_layout_flags =
+    bluestore_onode_t::FLAG_PERPG_OMAP |
+    bluestore_onode_t::FLAG_PERPOOL_OMAP |
+    bluestore_onode_t::FLAG_PGMETA_OMAP;
+  o->onode.clear_flag(omap_layout_flags);
+
+  KeyValueDB::Transaction t = db->get_transaction();
+  _record_onode(o, t);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_stray_omap(uint64_t head, const string& name)
+{
+  dout(1) << __func__ << dendl;
+
+  // Key layout: the encoded omap head followed by the caller-supplied name.
+  string omap_key;
+  _key_encode_u64(head, &omap_key);
+  omap_key.append(name);
+
+  // Store an empty value under that key in the omap keyspace.
+  bufferlist empty_value;
+  KeyValueDB::Transaction t = db->get_transaction();
+  t->set(PREFIX_OMAP, omap_key, empty_value);
+
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_statfs(const string& key, const store_statfs_t& new_statfs)
+{
+  // Error-injection hook: use a throwaway repairer to queue a statfs
+  // overwrite for `key`, then apply it to the DB right away.
+  BlueStoreRepairer fixer;
+  fixer.fix_statfs(db, key, new_statfs);
+  fixer.apply(db);
+}
+
+void BlueStore::inject_global_statfs(const store_statfs_t& new_statfs)
+{
+  // Convert the supplied statfs into the volatile_statfs form and encode it.
+  volatile_statfs vstatfs;
+  vstatfs = new_statfs;
+  bufferlist encoded;
+  vstatfs.encode(encoded);
+
+  // Overwrite the global statfs record synchronously.
+  KeyValueDB::Transaction txn = db->get_transaction();
+  txn->set(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, encoded);
+  db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+				    coll_t cid2, ghobject_t oid2,
+				    uint64_t offset)
+{
+  // Fetch an existing onode from the given collection and fault in its
+  // extent map starting at `offset`.
+  auto load_onode = [&](const CollectionRef& c, const ghobject_t& oid) {
+    ceph_assert(c);
+    std::unique_lock l{c->lock}; // just to avoid internal asserts
+    OnodeRef o = c->get_onode(oid, false);
+    ceph_assert(o);
+    o->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+    return o;
+  };
+
+  CollectionRef c1 = _get_collection(cid1);
+  OnodeRef o1 = load_onode(c1, oid1);
+  CollectionRef c2 = _get_collection(cid2);
+  OnodeRef o2 = load_onode(c2, oid2);
+
+  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+  Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+  // require onode/extent layout to be the same (and simple)
+  // to make things easier
+  ceph_assert(o1->onode.extent_map_shards.empty());
+  ceph_assert(o2->onode.extent_map_shards.empty());
+  ceph_assert(o1->extent_map.spanning_blob_map.size() == 0);
+  ceph_assert(o2->extent_map.spanning_blob_map.size() == 0);
+  ceph_assert(e1.logical_offset == e2.logical_offset);
+  ceph_assert(e1.length == e2.length);
+  ceph_assert(e1.blob_offset == e2.blob_offset);
+
+  KeyValueDB::Transaction t = db->get_transaction();
+
+  // Overwrite the second object's blob with a copy of the first object's
+  // blob. Along with the misreference error this will also produce
+  // space-leak errors.
+  e2.blob->dirty_blob() = e1.blob->get_blob();
+  o2->extent_map.dirty_range(offset, e2.length);
+  o2->extent_map.update(t, false);
+
+  _record_onode(o2, t);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_zombie_spanning_blob(coll_t cid, ghobject_t oid,
+                                            int16_t blob_id)
+{
+  CollectionRef c = _get_collection(cid);
+  ceph_assert(c);
+
+  OnodeRef o;
+  {
+    std::unique_lock l{ c->lock }; // just to avoid internal asserts
+    o = c->get_onode(oid, false);
+    ceph_assert(o);
+    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+  }
+
+  // Register a freshly created blob under `blob_id` in the onode's spanning
+  // blob map; this function adds no extent that refers to it.
+  BlobRef zombie = c->new_blob();
+  zombie->id = blob_id;
+  o->extent_map.spanning_blob_map[blob_id] = zombie;
+
+  // Persist the modified onode.
+  KeyValueDB::Transaction t = db->get_transaction();
+  _record_onode(o, t);
+  db->submit_transaction_sync(t);
+}
+
+void BlueStore::inject_bluefs_file(std::string_view dir, std::string_view name, size_t new_size)
+{
+  ceph_assert(bluefs);
+
+  BlueFS::FileWriter* p_handle = nullptr;
+  auto ret = bluefs->open_for_write(dir, name, &p_handle, false);
+  ceph_assert(ret == 0);
+
+  std::string s('0', new_size);
+  bufferlist bl;
+  bl.append(s);
+  p_handle->append(bl);
+
+  bluefs->fsync(p_handle);
+  bluefs->close_writer(p_handle);
+}
+
+void BlueStore::collect_metadata(map<string,string> *pm)
+{
+  dout(10) << __func__ << dendl;
+  auto& md = *pm;
+
+  // Main block device entries, keyed with the "bluestore_bdev_" prefix.
+  bdev->collect_metadata("bluestore_bdev_", pm);
+
+  if (!bluefs) {
+    md["bluefs"] = "0";
+  } else {
+    md["bluefs"] = "1";
+    // this value is for backward compatibility only
+    md["bluefs_single_shared_device"] =
+      stringify(static_cast<int>(bluefs_layout.single_shared_device()));
+    md["bluefs_dedicated_db"] =
+      stringify(static_cast<int>(bluefs_layout.dedicated_db));
+    md["bluefs_dedicated_wal"] =
+      stringify(static_cast<int>(bluefs_layout.dedicated_wal));
+    bluefs->collect_metadata(pm, bluefs_layout.shared_bdev);
+  }
+
+  // report numa mapping for underlying devices
+  int numa_node = -1;
+  set<int> numa_nodes;
+  set<string> unknown_devs;
+  if (get_numa_node(&numa_node, &numa_nodes, &unknown_devs) >= 0) {
+    if (!unknown_devs.empty()) {
+      md["objectstore_numa_unknown_devices"] = stringify(unknown_devs);
+    }
+    if (!numa_nodes.empty()) {
+      dout(1) << __func__ << " devices span numa nodes " << numa_nodes << dendl;
+      md["objectstore_numa_nodes"] = stringify(numa_nodes);
+    }
+    // Only reported when get_numa_node() settled on a single node.
+    if (numa_node >= 0) {
+      md["objectstore_numa_node"] = stringify(numa_node);
+    }
+  }
+
+  md["bluestore_min_alloc_size"] = stringify(min_alloc_size);
+}
+
+int BlueStore::get_numa_node(
+  int *final_node,
+  set<int> *out_nodes,
+  set<string> *out_failed)
+{
+  // Enumerate the devices backing this store.
+  set<string> devices;
+  get_devices(&devices);
+
+  int first_node = -1;       // node of the first device that reported one
+  set<int> seen_nodes;       // every distinct node reported
+  set<string> undetected;    // devices whose node could not be determined
+  for (auto& devname : devices) {
+    int n;
+    BlkDev blkdev(devname);
+    if (blkdev.get_numa_node(&n) < 0) {
+      dout(10) << __func__ << " bdev " << devname << " can't detect numa_node"
+	       << dendl;
+      undetected.insert(devname);
+      continue;
+    }
+    dout(10) << __func__ << " bdev " << devname << " on numa_node " << n
+	     << dendl;
+    seen_nodes.insert(n);
+    if (first_node < 0) {
+      first_node = n;
+    }
+  }
+
+  // *final_node is written only when every device resolved to the same
+  // single node; otherwise the caller's value is left untouched.
+  const bool unambiguous =
+    first_node >= 0 && seen_nodes.size() == 1 && undetected.empty();
+  if (unambiguous) {
+    *final_node = first_node;
+  }
+
+  // The two set outputs are optional.
+  if (out_nodes) {
+    *out_nodes = seen_nodes;
+  }
+  if (out_failed) {
+    *out_failed = undetected;
+  }
+  return 0;
+}
+
+// Mark the store for fast shutdown by setting m_fast_shutdown; nothing else
+// is done here.
+void BlueStore::prepare_for_fast_shutdown()
+{
+  m_fast_shutdown = true;
+}
+
+// Add the names of the devices backing this store (main block device plus
+// any bluefs devices) to *ls.
+//
+// If the block device is already open it is queried directly.  Otherwise the
+// store is opened just far enough (path, fsid, fsid lock, bdev, minimal
+// bluefs) to ask for the devices and is closed again before returning.
+//
+// Returns 0 on success or the negative error of the first open step that
+// failed.
+int BlueStore::get_devices(set<string> *ls)
+{
+  const auto gather = [&] {
+    bdev->get_devices(ls);
+    if (bluefs) {
+      bluefs->get_devices(ls);
+    }
+  };
+
+  if (bdev) {
+    gather();
+    return 0;
+  }
+
+  // grumble, we haven't started up yet.
+  int r = _open_path();
+  if (r < 0) {
+    return r;
+  }
+  auto close_path = make_scope_guard([&] {
+    _close_path();
+  });
+
+  r = _open_fsid(false);
+  if (r < 0) {
+    return r;
+  }
+  auto close_fsid = make_scope_guard([&] {
+    _close_fsid();
+  });
+
+  r = _read_fsid(&fsid);
+  if (r < 0) {
+    return r;
+  }
+  r = _lock_fsid();
+  if (r < 0) {
+    return r;
+  }
+
+  r = _open_bdev(false);
+  if (r < 0) {
+    return r;
+  }
+  auto close_bdev = make_scope_guard([&] {
+    _close_bdev();
+  });
+
+  r = _minimal_open_bluefs(false);
+  if (r < 0) {
+    return r;
+  }
+  gather();
+  // bluefs has no scope guard; close it explicitly before the guards above
+  // unwind in reverse order.
+  _minimal_close_bluefs();
+  return 0;
+}
+
+// Fill *buf with store-wide figures: estimated omap size, total and
+// available space and, when bluefs is in use, the internal metadata size.
+// When the block device reports extended state (get_ebd_state() == 0) the
+// physical totals are used and buf->allocated is derived from them.
+void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
+{
+  buf->reset();
+
+  // pick the omap key prefix that matches the omap layout in use
+  const auto* omap_prefix = &PREFIX_PERPG_OMAP;
+  if (per_pool_omap == OMAP_BULK) {
+    omap_prefix = &PREFIX_OMAP;
+  } else if (per_pool_omap == OMAP_PER_POOL) {
+    omap_prefix = &PREFIX_PERPOOL_OMAP;
+  }
+  buf->omap_allocated = db->estimate_prefix_size(*omap_prefix, string());
+
+  uint64_t avail = alloc->get_free();
+
+  if (bluefs) {
+    buf->internally_reserved = 0;
+    // include dedicated db, too, if that isn't the shared device.
+    if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
+      buf->total += bluefs->get_total(BlueFS::BDEV_DB);
+    }
+    // call any non-omap bluefs space "internal metadata"
+    buf->internal_metadata = bluefs->get_used() - buf->omap_allocated;
+  }
+
+  ExtBlkDevState ebd_state;
+  if (bdev->get_ebd_state(ebd_state) != 0) {
+    buf->total += bdev->get_size();
+  } else {
+    buf->total += ebd_state.get_physical_total();
+
+    // we are limited by both the size of the virtual device and the
+    // underlying physical device.
+    avail = std::min(avail, ebd_state.get_physical_avail());
+
+    buf->allocated =
+      ebd_state.get_physical_total() - ebd_state.get_physical_avail();
+  }
+  buf->available = avail;
+}
+
+// Report store-wide usage in *buf.  When alerts is non-null it is cleared and
+// refilled through _log_alerts().  The allocation and compression counters
+// are taken from vstatfs under vstatfs_lock and overwrite whatever
+// _get_statfs_overall() put in buf->allocated.  Always returns 0.
+int BlueStore::statfs(struct store_statfs_t *buf,
+                      osd_alert_list_t* alerts)
+{
+  if (alerts != nullptr) {
+    alerts->clear();
+    _log_alerts(*alerts);
+  }
+
+  _get_statfs_overall(buf);
+
+  {
+    std::lock_guard guard(vstatfs_lock);
+    buf->allocated = vstatfs.allocated();
+    buf->data_stored = vstatfs.stored();
+    buf->data_compressed = vstatfs.compressed();
+    buf->data_compressed_original = vstatfs.compressed_original();
+    buf->data_compressed_allocated = vstatfs.compressed_allocated();
+  }
+
+  dout(20) << __func__ << " " << *buf << dendl;
+  return 0;
+}
+
+// Report usage for a single pool in *buf.
+//
+// Returns -ENOTSUP when per-pool stat collection is not enabled, 0 otherwise.
+// *out_per_pool_omap is set to whether omap is tracked per pool/pg (i.e. not
+// OMAP_BULK); in that case, and if the db is still open, buf->omap_allocated
+// is estimated from the pool's key prefix.
+int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+                           bool *out_per_pool_omap)
+{
+  dout(20) << __func__ << " pool " << pool_id<< dendl;
+
+  if (!per_pool_stat_collection) {
+    dout(20) << __func__ << " not supported in legacy mode " << dendl;
+    return -ENOTSUP;
+  }
+
+  buf->reset();
+  {
+    std::lock_guard guard(vstatfs_lock);
+    osd_pools[pool_id].publish(buf);
+  }
+
+  string pool_key;
+  _key_encode_u64(pool_id, &pool_key);
+
+  const bool omap_is_split = per_pool_omap != OMAP_BULK;
+  *out_per_pool_omap = omap_is_split;
+  // stop calls after db was closed
+  if (omap_is_split && db) {
+    const auto& omap_prefix = (per_pool_omap == OMAP_PER_POOL)
+      ? PREFIX_PERPOOL_OMAP
+      : PREFIX_PERPG_OMAP;
+    buf->omap_allocated = db->estimate_prefix_size(omap_prefix, pool_key);
+  }
+
+  dout(10) << __func__ << *buf << dendl;
+  return 0;
+}
+
+// Recompute legacy_statfs_alert: the warning text is set when per-pool stat
+// collection is off and bluestore_warn_on_legacy_statfs is enabled, and is
+// cleared otherwise.  The member is updated under qlock.
+void BlueStore::_check_legacy_statfs_alert()
+{
+  const bool warn =
+    !per_pool_stat_collection &&
+    cct->_conf->bluestore_warn_on_legacy_statfs;
+
+  string msg;
+  if (warn) {
+    msg = "legacy statfs reporting detected, "
+          "suggest to run store repair to get consistent statistic reports";
+  }
+
+  std::lock_guard guard(qlock);
+  legacy_statfs_alert = msg;
+}
+
+// Recompute the two omap-layout alerts.  The per-pg alert is raised when omap
+// is not OMAP_PER_PG and the per-pg warning is enabled; the per-pool alert is
+// raised when omap is neither OMAP_PER_PG nor OMAP_PER_POOL and the per-pool
+// warning is enabled.  Both members are updated under qlock.
+void BlueStore::_check_no_per_pg_or_pool_omap_alert()
+{
+  const bool not_per_pg = per_pool_omap != OMAP_PER_PG;
+  const bool not_per_pool = not_per_pg && per_pool_omap != OMAP_PER_POOL;
+
+  string pg_msg;
+  if (not_per_pg && cct->_conf->bluestore_warn_on_no_per_pg_omap) {
+    pg_msg = "legacy (not per-pg) omap detected, "
+      "suggest to run store repair to benefit from faster PG removal";
+  }
+
+  string pool_msg;
+  if (not_per_pool && cct->_conf->bluestore_warn_on_no_per_pool_omap) {
+    pool_msg = "legacy (not per-pool) omap detected, "
+      "suggest to run store repair to benefit from per-pool omap usage statistics";
+  }
+
+  std::lock_guard guard(qlock);
+  no_per_pg_omap_alert = pg_msg;
+  no_per_pool_omap_alert = pool_msg;
+}
+
+// ---------------
+// cache
+
+// Look up cid in coll_map under a shared coll_lock; returns an empty
+// CollectionRef when the collection is not present.
+BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid)
+{
+  std::shared_lock guard(coll_lock);
+  const auto it = coll_map.find(cid);
+  return it == coll_map.end() ? CollectionRef() : it->second;
+}
+
+// Return the first PG collection in coll_map whose contains(oid) is true, or
+// an empty CollectionRef if there is none.  Runs under a shared coll_lock.
+BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid)
+{
+  std::shared_lock guard(coll_lock);
+
+  // FIXME: we must replace this with something more efficient
+  // (this is a linear scan over every collection)
+  for (auto& [cid, coll] : coll_map) {
+    spg_t spgid;
+    if (!cid.is_pg(&spgid)) {
+      continue;
+    }
+    if (coll->contains(oid)) {
+      return coll;
+    }
+  }
+  return CollectionRef();
+}
+
+// Append collection c to removed_collections so that a later
+// _reap_collections() call can clear its onode space.
+void BlueStore::_queue_reap_collection(CollectionRef& c)
+{
+  dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+  // _reap_collections() and this function run in the same thread,
+  // so no lock is needed.
+  removed_collections.push_back(c);
+}
+
+// Try to release every collection queued by _queue_reap_collection().
+// A collection whose onode space still holds an onode with a non-zero
+// flushing_count is left alone and put back at the front of
+// removed_collections for a later attempt; all others get their onode space
+// cleared.
+void BlueStore::_reap_collections()
+{
+  // _queue_reap_collection and this in the same thread.
+  // So no need a lock.
+  if (removed_collections.empty()) {
+    return;
+  }
+  list<CollectionRef> pending;
+  pending.swap(removed_collections);
+
+  for (auto it = pending.begin(); it != pending.end(); ) {
+    CollectionRef c = *it;
+    dout(10) << __func__ << " " << c << " " << c->cid << dendl;
+    const bool still_flushing = c->onode_space.map_any([&](Onode* o) {
+      ceph_assert(!o->exists);
+      if (o->flushing_count.load()) {
+        dout(10) << __func__ << " " << c << " " << c->cid << " " << o->oid
+                 << " flush_txns " << o->flushing_count << dendl;
+        return true;
+      }
+      return false;
+    });
+    if (still_flushing) {
+      // keep it queued; try again on the next pass
+      ++it;
+      continue;
+    }
+    c->onode_space.clear();
+    it = pending.erase(it);
+    dout(10) << __func__ << " " << c << " " << c->cid << " done" << dendl;
+  }
+
+  if (!pending.empty()) {
+    removed_collections.splice(removed_collections.begin(), pending);
+  } else {
+    dout(10) << __func__ << " all reaped" << dendl;
+  }
+}
+
+// Sum the statistics of all onode and buffer cache shards and publish the
+// totals through the perf counters.
+void BlueStore::_update_logger()
+{
+  // onode cache totals
+  uint64_t onodes = 0;
+  uint64_t pinned_onodes = 0;
+  for (const auto& shard : onode_cache_shards) {
+    shard->add_stats(&onodes, &pinned_onodes);
+  }
+
+  // buffer cache totals
+  uint64_t extents = 0;
+  uint64_t blobs = 0;
+  uint64_t buffers = 0;
+  uint64_t buffer_bytes = 0;
+  for (const auto& shard : buffer_cache_shards) {
+    shard->add_stats(&extents, &blobs, &buffers, &buffer_bytes);
+  }
+
+  logger->set(l_bluestore_onodes, onodes);
+  logger->set(l_bluestore_pinned_onodes, pinned_onodes);
+  logger->set(l_bluestore_extents, extents);
+  logger->set(l_bluestore_blobs, blobs);
+  logger->set(l_bluestore_buffers, buffers);
+  logger->set(l_bluestore_buffer_bytes, buffer_bytes);
+}
+
+// ---------------
+// read operations
+
+// Return a handle to the collection cid by delegating to _get_collection();
+// the handle is whatever that lookup yields for an unknown cid.
+ObjectStore::CollectionHandle BlueStore::open_collection(const coll_t& cid)
+{
+  return _get_collection(cid);
+}
+
+// Build a Collection object for cid, bound to the onode and buffer cache
+// shards selected by cid.hash_to_shard(), record it in new_coll_map and
+// attach it via _osr_attach().  Runs under an exclusive coll_lock.
+ObjectStore::CollectionHandle BlueStore::create_new_collection(
+  const coll_t& cid)
+{
+  std::unique_lock guard{coll_lock};
+  const auto onode_idx = cid.hash_to_shard(onode_cache_shards.size());
+  const auto buffer_idx = cid.hash_to_shard(buffer_cache_shards.size());
+  auto coll = ceph::make_ref<Collection>(
+    this,
+    onode_cache_shards[onode_idx],
+    buffer_cache_shards[buffer_idx],
+    cid);
+  new_coll_map[cid] = coll;
+  _osr_attach(coll.get());
+  return coll;
+}
+
+// Install commit_queue on the collection cid, looking first in coll_map and
+// then in new_coll_map.  A null commit_queue or an unknown cid is a no-op.
+void BlueStore::set_collection_commit_queue(
+    const coll_t& cid,
+    ContextQueue *commit_queue)
+{
+  if (!commit_queue) {
+    return;
+  }
+  std::shared_lock guard(coll_lock);
+  if (auto it = coll_map.find(cid); it != coll_map.end()) {
+    it->second->commit_queue = commit_queue;
+  } else if (auto nit = new_coll_map.find(cid); nit != new_coll_map.end()) {
+    nit->second->commit_queue = commit_queue;
+  }
+}
+
+
+// Return true iff the collection exists and holds an onode for oid that is
+// itself marked as existing.  The onode lookup runs under a shared
+// collection lock and does not create the onode.
+bool BlueStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+  auto *coll = static_cast<Collection *>(c_.get());
+  dout(10) << __func__ << " " << coll->cid << " " << oid << dendl;
+  if (!coll->exists) {
+    return false;
+  }
+
+  std::shared_lock guard(coll->lock);
+  OnodeRef o = coll->get_onode(oid, false);
+  return o && o->exists;
+}
+
+// Fill *st for object oid: st_size from the onode, a fixed 4096-byte
+// st_blksize, st_blocks as the size rounded up to whole blocks, st_nlink 1.
+//
+// Returns -ENOENT if the collection or the object does not exist, -EIO when
+// _debug_mdata_eio() requests an injected error for oid, 0 otherwise.
+// allow_eio is not consulted here.
+int BlueStore::stat(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  struct stat *st,
+  bool allow_eio)
+{
+  auto *coll = static_cast<Collection *>(c_.get());
+  if (!coll->exists) {
+    return -ENOENT;
+  }
+  dout(10) << __func__ << " " << coll->get_cid() << " " << oid << dendl;
+
+  {
+    std::shared_lock guard(coll->lock);
+    OnodeRef o = coll->get_onode(oid, false);
+    if (!(o && o->exists)) {
+      return -ENOENT;
+    }
+    st->st_size = o->onode.size;
+    st->st_blksize = 4096;
+    st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+    st->st_nlink = 1;
+  }
+
+  if (!_debug_mdata_eio(oid)) {
+    return 0;
+  }
+  derr << __func__ << " " << coll->cid << " " << oid << " INJECT EIO" << dendl;
+  return -EIO;
+}
+// Replace the pool options stored on the collection behind ch.
+// Returns -ENOENT if the collection no longer exists, 0 otherwise.
+int BlueStore::set_collection_opts(
+  CollectionHandle& ch,
+  const pool_opts_t& opts)
+{
+  auto *coll = static_cast<Collection *>(ch.get());
+  dout(15) << __func__ << " " << ch->cid << " options " << opts << dendl;
+  if (!coll->exists) {
+    return -ENOENT;
+  }
+
+  // pool_opts is updated under the exclusive collection lock
+  std::unique_lock guard{coll->lock};
+  coll->pool_opts = opts;
+  return 0;
+}
+
+// Read up to length bytes at offset from object oid into bl.
+//
+// offset == 0 && length == 0 means "read the whole object".  bl is cleared
+// first.  The onode lookup and the read run under a shared collection lock.
+//
+// Returns the value produced by _do_read() on success, -ENOENT if the
+// collection or object does not exist, or -EIO (real, or injected through
+// the debug hooks below).
+int BlueStore::read(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t length,
+  bufferlist& bl,
+  uint32_t op_flags)
+{
+  auto start = mono_clock::now();
+  Collection *c = static_cast<Collection *>(c_.get());
+  const coll_t &cid = c->get_cid();
+  dout(15) << __func__ << " " << cid << " " << oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << dendl;
+  if (!c->exists)
+    return -ENOENT;
+
+  bl.clear();
+  int r;
+  {
+    std::shared_lock l(c->lock);
+    auto start1 = mono_clock::now();
+    OnodeRef o = c->get_onode(oid, false);
+    log_latency("get_onode@read",
+      l_bluestore_read_onode_meta_lat,
+      mono_clock::now() - start1,
+      cct->_conf->bluestore_log_op_age);
+    if (!o || !o->exists) {
+      r = -ENOENT;
+      goto out;
+    }
+
+    // 0~0 is shorthand for the whole object
+    if (offset == length && offset == 0)
+      length = o->onode.size;
+
+    r = _do_read(c, o, offset, length, bl, op_flags);
+    if (r == -EIO) {
+      logger->inc(l_bluestore_read_eio);
+    }
+  }
+
+ out:
+  if (r >= 0 && _debug_data_eio(oid)) {
+    r = -EIO;
+    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
+             cct->_conf->bluestore_debug_random_read_err) {
+    // Inject an error on roughly one read in (option * 100).  A non-zero
+    // option with magnitude below 0.01 truncates to 0 here, which would make
+    // the modulo below a division by zero; treat it as "every read".
+    int one_in = (int)(cct->_conf->bluestore_debug_random_read_err * 100.0);
+    if (one_in == 0) {
+      one_in = 1;
+    }
+    if ((rand() % one_in) == 0) {
+      dout(0) << __func__ << ": inject random EIO" << dendl;
+      r = -EIO;
+    }
+  }
+  dout(10) << __func__ << " " << cid << " " << oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << " = " << r << dendl;
+  log_latency(__func__,
+    l_bluestore_read_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  return r;
+}
+
+// Split the logical range offset~length of onode o into data that the buffer
+// cache can serve right away and data that has to be read from disk.
+//
+// ready_regions  receives cached buffers, keyed by logical offset.
+// blobs2read     receives, per blob, the list of chunk-aligned disk reads
+//                still needed; each read carries the logical regions it has
+//                to satisfy.
+// read_cache_policy is passed through to the buffer cache lookup.
+//
+// Holes between lextents are skipped and produce no entry in either output.
+void BlueStore::_read_cache(
+  OnodeRef& o,
+  uint64_t offset,
+  size_t length,
+  int read_cache_policy,
+  ready_regions_t& ready_regions,
+  blobs2read_t& blobs2read)
+{
+  // build a blob-wise list of the stuff to read (that isn't cached)
+  unsigned left = length;
+  uint64_t pos = offset;
+  auto lp = o->extent_map.seek_lextent(offset);
+  while (left > 0 && lp != o->extent_map.extent_map.end()) {
+    if (pos < lp->logical_offset) {
+      // gap before the next lextent; stop if it covers the rest of the range
+      unsigned hole = lp->logical_offset - pos;
+      if (hole >= left) {
+        break;
+      }
+      dout(30) << __func__ << "  hole 0x" << std::hex << pos << "~" << hole
+               << std::dec << dendl;
+      pos += hole;
+      left -= hole;
+    }
+    BlobRef& bptr = lp->blob;
+    // l_off: offset within the lextent; b_off/b_len: range within the blob
+    unsigned l_off = pos - lp->logical_offset;
+    unsigned b_off = l_off + lp->blob_offset;
+    unsigned b_len = std::min(left, lp->length - l_off);
+
+    ready_regions_t cache_res;
+    interval_set<uint32_t> cache_interval;
+    bptr->shared_blob->bc.read(
+      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
+      read_cache_policy);
+    dout(20) << __func__ << "  blob " << *bptr << std::hex
+             << " need 0x" << b_off << "~" << b_len
+             << " cache has 0x" << cache_interval
+             << std::dec << dendl;
+
+    // walk the blob range, alternating between cached and uncached pieces
+    auto pc = cache_res.begin();
+    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
+    while (b_len > 0) {
+      unsigned l;
+      if (pc != cache_res.end() &&
+          pc->first == b_off) {
+        // cached piece starts exactly here: hand the buffer over
+        l = pc->second.length();
+        ready_regions[pos] = std::move(pc->second);
+        dout(30) << __func__ << "    use cache 0x" << std::hex << pos << ": 0x"
+                 << b_off << "~" << l << std::dec << dendl;
+        ++pc;
+      } else {
+        // uncached piece: runs up to the next cached piece or the range end
+        l = b_len;
+        if (pc != cache_res.end()) {
+          ceph_assert(pc->first > b_off);
+          l = pc->first - b_off;
+        }
+        dout(30) << __func__ << "    will read 0x" << std::hex << pos << ": 0x"
+                 << b_off << "~" << l << std::dec << dendl;
+        // merge regions
+        {
+          // widen the read to chunk_size boundaries on both ends; front is
+          // the number of extra leading bytes
+          uint64_t r_off = b_off;
+          uint64_t r_len = l;
+          uint64_t front = r_off % chunk_size;
+          if (front) {
+            r_off -= front;
+            r_len += front;
+          }
+          unsigned tail = r_len % chunk_size;
+          if (tail) {
+            r_len += chunk_size - tail;
+          }
+          bool merged = false;
+          regions2read_t& r2r = blobs2read[bptr];
+          if (r2r.size()) {
+            // extend the previous request when this one touches or overlaps
+            // it; front is then rebased to the start of that request
+            read_req_t& pre = r2r.back();
+            if (r_off <= (pre.r_off + pre.r_len)) {
+              front += (r_off - pre.r_off);
+              pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+              pre.regs.emplace_back(region_t(pos, b_off, l, front));
+              merged = true;
+            }
+          }
+          if (!merged) {
+            read_req_t req(r_off, r_len);
+            req.regs.emplace_back(region_t(pos, b_off, l, front));
+            r2r.emplace_back(std::move(req));
+          }
+        }
+      }
+      pos += l;
+      b_off += l;
+      left -= l;
+      b_len -= l;
+    }
+    ++lp;
+  }
+}
+
+// Queue the aio reads described by blobs2read on ioc.
+//
+// For a compressed blob the whole on-disk blob is read into a new bufferlist
+// appended to *compressed_blob_bls (one entry per compressed blob, in
+// iteration order).  For any other blob each request's r_off~r_len is read
+// into that request's own bufferlist.
+//
+// Returns 0 on success or -EIO if queueing a read failed with -EIO; any
+// other failure trips an assertion.
+int BlueStore::_prepare_read_ioc(
+  blobs2read_t& blobs2read,
+  vector<bufferlist>* compressed_blob_bls,
+  IOContext* ioc)
+{
+  for (auto& p : blobs2read) {
+    const BlobRef& bptr = p.first;
+    regions2read_t& r2r = p.second;
+    dout(20) << __func__ << "  blob " << *bptr << " need "
+             << r2r << dendl;
+    if (bptr->get_blob().is_compressed()) {
+      // read the whole thing
+      if (compressed_blob_bls->empty()) {
+        // ensure we avoid any reallocation on subsequent blobs
+        compressed_blob_bls->reserve(blobs2read.size());
+      }
+      compressed_blob_bls->push_back(bufferlist());
+      bufferlist& bl = compressed_blob_bls->back();
+      auto r = bptr->get_blob().map(
+        0, bptr->get_blob().get_ondisk_length(),
+        [&](uint64_t offset, uint64_t length) {
+          int r = bdev->aio_read(offset, length, &bl, ioc);
+          if (r < 0)
+            return r;
+          return 0;
+        });
+      if (r < 0) {
+        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
+        if (r == -EIO) {
+          // propagate EIO to caller
+          return r;
+        }
+        // r < 0 here, so this assert always fires: non-EIO errors are fatal
+        ceph_assert(r == 0);
+      }
+    } else {
+      // read the pieces
+      for (auto& req : r2r) {
+        dout(20) << __func__ << "    region 0x" << std::hex
+                 << req.regs.front().logical_offset
+                 << ": 0x" << req.regs.front().blob_xoffset
+                 << " reading 0x" << req.r_off
+                 << "~" << req.r_len << std::dec
+                 << dendl;
+
+        // read it
+        auto r = bptr->get_blob().map(
+          req.r_off, req.r_len,
+          [&](uint64_t offset, uint64_t length) {
+            int r = bdev->aio_read(offset, length, &req.bl, ioc);
+            if (r < 0)
+              return r;
+            return 0;
+          });
+        if (r < 0) {
+          derr << __func__ << " bdev-read failed: " << cpp_strerror(r)
+               << dendl;
+          if (r == -EIO) {
+            // propagate EIO to caller
+            return r;
+          }
+          // r < 0 here, so this assert always fires: non-EIO errors are fatal
+          ceph_assert(r == 0);
+        }
+        ceph_assert(req.bl.length() == req.r_len);
+      }
+    }
+  }
+  return 0;
+}
+
+// Turn the raw data read for blobs2read into the final result buffer bl.
+//
+// Step 1: for every blob, verify checksums, decompress if needed, optionally
+//         hand the data to the buffer cache (buffered), and slice out the
+//         requested regions into ready_regions.
+// Step 2: walk offset~length and append to bl either the ready region that
+//         starts at the current position or zeros up to the next region.
+//
+// compressed_blob_bls must hold one entry per compressed blob, in the same
+// order in which blobs2read is iterated.
+//
+// Returns 0 on success.  On a checksum failure *csum_error is set to true and
+// -EIO is returned.  A negative result from _decompress() is returned as-is
+// without touching *csum_error.
+int BlueStore::_generate_read_result_bl(
+  OnodeRef& o,
+  uint64_t offset,
+  size_t length,
+  ready_regions_t& ready_regions,
+  vector<bufferlist>& compressed_blob_bls,
+  blobs2read_t& blobs2read,
+  bool buffered,
+  bool* csum_error,
+  bufferlist& bl)
+{
+ // enumerate and decompress desired blobs
+  auto p = compressed_blob_bls.begin();
+  blobs2read_t::iterator b2r_it = blobs2read.begin();
+  while (b2r_it != blobs2read.end()) {
+    const BlobRef& bptr = b2r_it->first;
+    regions2read_t& r2r = b2r_it->second;
+    dout(20) << __func__ << "  blob " << *bptr << " need "
+             << r2r << dendl;
+    if (bptr->get_blob().is_compressed()) {
+      ceph_assert(p != compressed_blob_bls.end());
+      bufferlist& compressed_bl = *p++;
+      // the checksum covers the compressed bytes, starting at blob offset 0
+      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
+                       r2r.front().regs.front().logical_offset) < 0) {
+        *csum_error = true;
+        return -EIO;
+      }
+      bufferlist raw_bl;
+      auto r = _decompress(compressed_bl, &raw_bl);
+      if (r < 0)
+        return r;
+      if (buffered) {
+        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
+                                       raw_bl);
+      }
+      // regions index into the decompressed blob by blob offset
+      for (auto& req : r2r) {
+        for (auto& r : req.regs) {
+          ready_regions[r.logical_offset].substr_of(
+            raw_bl, r.blob_xoffset, r.length);
+        }
+      }
+    } else {
+      for (auto& req : r2r) {
+        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+                         req.regs.front().logical_offset) < 0) {
+          *csum_error = true;
+          return -EIO;
+        }
+        if (buffered) {
+          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
+                                         req.r_off, req.bl);
+        }
+
+        // prune and keep result
+        // (r.front is the region's offset inside req.bl)
+        for (const auto& r : req.regs) {
+          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+        }
+      }
+    }
+    ++b2r_it;
+  }
+
+  // generate a resulting buffer
+  auto pr = ready_regions.begin();
+  auto pr_end = ready_regions.end();
+  // pos is relative to offset
+  uint64_t pos = 0;
+  while (pos < length) {
+    if (pr != pr_end && pr->first == pos + offset) {
+      dout(30) << __func__ << " assemble 0x" << std::hex << pos
+               << ": data from 0x" << pr->first << "~" << pr->second.length()
+               << std::dec << dendl;
+      pos += pr->second.length();
+      bl.claim_append(pr->second);
+      ++pr;
+    } else {
+      // no data here: zero-fill up to the next region or the end
+      uint64_t l = length - pos;
+      if (pr != pr_end) {
+        ceph_assert(pr->first > pos + offset);
+        l = pr->first - (pos + offset);
+      }
+      dout(30) << __func__ << " assemble 0x" << std::hex << pos
+               << ": zeros for 0x" << (pos + offset) << "~" << l
+               << std::dec << dendl;
+      bl.append_zero(l);
+      pos += l;
+    }
+  }
+  ceph_assert(bl.length() == length);
+  ceph_assert(pos == length);
+  ceph_assert(pr == pr_end);
+  return 0;
+}
+
+// Read offset~length of onode o into bl.
+//
+// The range is clipped to the object size; a read starting at or beyond the
+// object size returns 0 with an empty bl.  Data comes from the buffer cache
+// where possible and from the block device (via aio) otherwise.
+//
+// op_flags decide whether the data read from disk is kept in the buffer
+// cache and whether the clean cache is bypassed.
+//
+// On a checksum failure the read is retried (by recursion, retry_count
+// counting the attempts) up to bluestore_retry_disk_reads times.
+//
+// Returns the number of bytes placed in bl, or a negative error: -EIO for
+// read/checksum failures, or whatever negative value
+// _generate_read_result_bl() reports.
+int BlueStore::_do_read(
+  Collection *c,
+  OnodeRef& o,
+  uint64_t offset,
+  size_t length,
+  bufferlist& bl,
+  uint32_t op_flags,
+  uint64_t retry_count)
+{
+  FUNCTRACE(cct);
+  int r = 0;
+  int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << " size 0x" << o->onode.size << " (" << std::dec
+           << o->onode.size << ")" << dendl;
+  bl.clear();
+
+  if (offset >= o->onode.size) {
+    return r;
+  }
+
+  // generally, don't buffer anything, unless the client explicitly requests
+  // it.
+  bool buffered = false;
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+    dout(20) << __func__ << " will do buffered read" << dendl;
+    buffered = true;
+  } else if (cct->_conf->bluestore_default_buffered_read &&
+             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+    dout(20) << __func__ << " defaulting to buffered read" << dendl;
+    buffered = true;
+  }
+
+  // clip the request to the end of the object
+  if (offset + length > o->onode.size) {
+    length = o->onode.size - offset;
+  }
+
+  auto start = mono_clock::now();
+  o->extent_map.fault_range(db, offset, length);
+  log_latency(__func__,
+    l_bluestore_read_onode_meta_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  _dump_onode<30>(cct, *o);
+
+  // for deep-scrub, we only read dirty cache and bypass clean cache in
+  // order to read underlying block device in case there are silent disk errors.
+  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
+    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
+    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
+  }
+
+  // build a blob-wise list of the stuff to read (that isn't cached)
+  ready_regions_t ready_regions;
+  blobs2read_t blobs2read;
+  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
+
+
+  // read raw blob data.
+  start = mono_clock::now(); // for the sake of simplicity
+                             // measure the whole block below.
+                             // The error isn't that much...
+  vector<bufferlist> compressed_blob_bls;
+  IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
+  // we always issue aio for reading, so errors other than EIO are not allowed
+  if (r < 0)
+    return r;
+
+  int64_t num_ios = blobs2read.size();
+  if (ioc.has_pending_aios()) {
+    num_ios = ioc.get_num_ios();
+    bdev->aio_submit(&ioc);
+    dout(20) << __func__ << " waiting for aio" << dendl;
+    ioc.aio_wait();
+    r = ioc.get_return_value();
+    if (r < 0) {
+      ceph_assert(r == -EIO); // no other errors allowed
+      return -EIO;
+    }
+  }
+  log_latency_fn(__func__,
+    l_bluestore_read_wait_aio_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age,
+    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+  );
+
+  bool csum_error = false;
+  r = _generate_read_result_bl(o, offset, length, ready_regions,
+                              compressed_blob_bls, blobs2read,
+                              buffered && !ioc.skip_cache(),
+                              &csum_error, bl);
+  if (csum_error) {
+    // Handles spurious read errors caused by a kernel bug.
+    // We sometimes get all-zero pages as a result of the read under
+    // high memory pressure. Retrying the failing read succeeds in most
+    // cases.
+    // See also: http://tracker.ceph.com/issues/22464
+    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+      return -EIO;
+    }
+    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+  }
+  if (r < 0) {
+    // A failure that is not a checksum error (the decompression path) must
+    // reach the caller; falling through would report it as a successful
+    // read of bl.length() bytes.
+    return r;
+  }
+  r = bl.length();
+  if (retry_count) {
+    logger->inc(l_bluestore_reads_with_retries);
+    dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+            << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+    stringstream s;
+    s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
+    _set_spurious_read_errors_alert(s.str());
+  }
+  return r;
+}
+
+// Verify the checksum of bl, which holds blob data starting at blob offset
+// blob_xoffset, and log a detailed error on mismatch.
+//
+// logical_offset is the object offset that corresponds to blob_xoffset; it is
+// only used to report the affected logical extent.
+//
+// A failure can also be injected with probability
+// bluestore_debug_inject_csum_err_probability.
+//
+// Returns the result of blob->verify_csum() (or -1 for an injected failure),
+// except that 0 is always returned when bluestore_ignore_data_csum is set.
+int BlueStore::_verify_csum(OnodeRef& o,
+			    const bluestore_blob_t* blob, uint64_t blob_xoffset,
+			    const bufferlist& bl,
+			    uint64_t logical_offset) const
+{
+  // bad / bad_csum are outputs of verify_csum(); they are only read below
+  // when r == -1
+  int bad;
+  uint64_t bad_csum;
+  auto start = mono_clock::now();
+  int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+  if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+      (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+    derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+    bad = blob_xoffset;
+    r = -1;
+    bad_csum = 0xDEADBEEF;
+  }
+  if (r < 0) {
+    if (r == -1) {
+      // checksum mismatch: collect the physical extents of the bad csum chunk
+      // so the log line can point at the device location
+      PExtentVector pex;
+      blob->map(
+	bad,
+	blob->get_csum_chunk_size(),
+	[&](uint64_t offset, uint64_t length) {
+	  pex.emplace_back(bluestore_pextent_t(offset, length));
+          return 0;
+	});
+      derr << __func__ << " bad "
+	   << Checksummer::get_csum_type_string(blob->csum_type)
+	   << "/0x" << std::hex << blob->get_csum_chunk_size()
+	   << " checksum at blob offset 0x" << bad
+	   << ", got 0x" << bad_csum << ", expected 0x"
+	   << blob->get_csum_item(bad / blob->get_csum_chunk_size()) << std::dec
+	   << ", device location " << pex
+	   << ", logical extent 0x" << std::hex
+	   << (logical_offset + bad - blob_xoffset) << "~"
+	   << blob->get_csum_chunk_size() << std::dec
+	   << ", object " << o->oid
+	   << dendl;
+    } else {
+      derr << __func__ << " failed with exit code: " << cpp_strerror(r) << dendl;
+    }
+  }
+  log_latency(__func__,
+    l_bluestore_csum_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  // the mismatch is still logged above, but not reported to the caller
+  if (cct->_conf->bluestore_ignore_data_csum) {
+    return 0;
+  }
+  return r;
+}
+
+// Decompress source into *result.
+//
+// source starts with a bluestore_compression_header_t naming the algorithm
+// and the compressed length.  The store's current compressor is reused when
+// its type matches; otherwise one is created for the algorithm in the header.
+//
+// Returns the decompressor's non-negative result on success, or -EIO when
+// no decompressor is available or decompression fails.
+int BlueStore::_decompress(bufferlist& source, bufferlist* result)
+{
+  const auto t0 = mono_clock::now();
+
+  auto it = source.cbegin();
+  bluestore_compression_header_t header;
+  decode(header, it);
+  const int alg = int(header.type);
+
+  CompressorRef decomp = compressor;
+  if (!decomp || (int)decomp->get_type() != alg) {
+    decomp = Compressor::create(cct, alg);
+  }
+
+  int r = 0;
+  if (decomp.get()) {
+    r = decomp->decompress(it, header.length, *result,
+                           header.compressor_message);
+    if (r < 0) {
+      derr << __func__ << " decompression failed with exit code " << r << dendl;
+      r = -EIO;
+    }
+  } else {
+    // if compressor isn't available - error, because cannot return
+    // decompressed data?
+    const char* alg_name = Compressor::get_comp_alg_name(alg);
+    derr << __func__ << " can't load decompressor " << alg_name << dendl;
+    _set_compression_alert(false, alg_name);
+    r = -EIO;
+  }
+
+  log_latency(__func__,
+    l_bluestore_decompress_lat,
+    mono_clock::now() - t0,
+    cct->_conf->bluestore_log_op_age);
+  return r;
+}
+
+// this stores fiemap into interval_set, other variations
+// use it internally
+//
+// Adds to destset every sub-range of offset~length (clipped to the object
+// size) that is covered by a logical extent of object oid.  Gaps between
+// extents are skipped.
+//
+// Returns -ENOENT if the collection or object does not exist, 0 otherwise
+// (including when offset is at or beyond the object size, in which case
+// destset is left untouched).
+int BlueStore::_fiemap(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t length,
+  interval_set<uint64_t>& destset)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  if (!c->exists)
+    return -ENOENT;
+  {
+    std::shared_lock l(c->lock);
+
+    OnodeRef o = c->get_onode(oid, false);
+    if (!o || !o->exists) {
+      return -ENOENT;
+    }
+    _dump_onode<30>(cct, *o);
+
+    dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+	     << " size 0x" << o->onode.size << std::dec << dendl;
+
+    // declared ahead of the goto below
+    boost::intrusive::set<Extent>::iterator ep, eend;
+    if (offset >= o->onode.size)
+      goto out;
+
+    if (offset + length > o->onode.size) {
+      length = o->onode.size - offset;
+    }
+
+    o->extent_map.fault_range(db, offset, length);
+    eend = o->extent_map.extent_map.end();
+    ep = o->extent_map.seek_lextent(offset);
+    while (length > 0) {
+      dout(20) << __func__ << " offset " << offset << dendl;
+      // extent ends at or before the current offset: move to the next one
+      if (ep != eend && ep->logical_offset + ep->length <= offset) {
+        ++ep;
+        continue;
+      }
+
+      uint64_t x_len = length;
+      // current offset falls inside this extent: record the covered part
+      if (ep != eend && ep->logical_offset <= offset) {
+        uint64_t x_off = offset - ep->logical_offset;
+        x_len = std::min(x_len, ep->length - x_off);
+        dout(30) << __func__ << " lextent 0x" << std::hex << offset << "~"
+	         << x_len << std::dec << " blob " << ep->blob << dendl;
+        destset.insert(offset, x_len);
+        length -= x_len;
+        offset += x_len;
+        if (x_off + x_len == ep->length)
+	  ++ep;
+        continue;
+      }
+      // in a hole: skip to the start of the next extent, or to the end of
+      // the range if no extent starts within it
+      if (ep != eend &&
+	  ep->logical_offset > offset &&
+	  ep->logical_offset - offset < x_len) {
+        x_len = ep->logical_offset - offset;
+      }
+      offset += x_len;
+      length -= x_len;
+    }
+  }
+
+ out:
+  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+	   << " size = 0x(" << destset << ")" << std::dec << dendl;
+  return 0;
+}
+
+// fiemap variant that encodes the extent set into bl.
+// bl is only written when _fiemap() succeeds; its result is returned as-is.
+int BlueStore::fiemap(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t length,
+  bufferlist& bl)
+{
+  interval_set<uint64_t> extents;
+  const int r = _fiemap(c_, oid, offset, length, extents);
+  if (r < 0) {
+    return r;
+  }
+  encode(extents, bl);
+  return r;
+}
+
+// fiemap variant that hands the extent set back as an offset -> length map.
+// destmap is only written when _fiemap() succeeds; its result is returned
+// as-is.
+int BlueStore::fiemap(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t length,
+  map<uint64_t, uint64_t>& destmap)
+{
+  interval_set<uint64_t> extents;
+  const int r = _fiemap(c_, oid, offset, length, extents);
+  if (r < 0) {
+    return r;
+  }
+  destmap = std::move(extents).detach();
+  return r;
+}
+
+// Read the extents listed in m from object oid into bl.
+//
+// bl is cleared first.  An empty m yields 0 without reading anything.  The
+// onode lookup and the read run under a shared collection lock.
+//
+// Returns the value produced by _do_readv() on success, -ENOENT if the
+// collection or object does not exist, or -EIO (real, or injected through
+// the debug hooks below).
+int BlueStore::readv(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  interval_set<uint64_t>& m,
+  bufferlist& bl,
+  uint32_t op_flags)
+{
+  auto start = mono_clock::now();
+  Collection *c = static_cast<Collection *>(c_.get());
+  const coll_t &cid = c->get_cid();
+  dout(15) << __func__ << " " << cid << " " << oid
+           << " fiemap " << m
+           << dendl;
+  if (!c->exists)
+    return -ENOENT;
+
+  bl.clear();
+  int r;
+  {
+    std::shared_lock l(c->lock);
+    auto start1 = mono_clock::now();
+    OnodeRef o = c->get_onode(oid, false);
+    log_latency("get_onode@read",
+      l_bluestore_read_onode_meta_lat,
+      mono_clock::now() - start1,
+      cct->_conf->bluestore_log_op_age);
+    if (!o || !o->exists) {
+      r = -ENOENT;
+      goto out;
+    }
+
+    if (m.empty()) {
+      r = 0;
+      goto out;
+    }
+
+    r = _do_readv(c, o, m, bl, op_flags);
+    if (r == -EIO) {
+      logger->inc(l_bluestore_read_eio);
+    }
+  }
+
+ out:
+  if (r >= 0 && _debug_data_eio(oid)) {
+    r = -EIO;
+    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+  } else if (oid.hobj.pool > 0 &&  /* FIXME, see #23029 */
+             cct->_conf->bluestore_debug_random_read_err) {
+    // Inject an error on roughly one read in (option * 100).  A non-zero
+    // option with magnitude below 0.01 truncates to 0 here, which would make
+    // the modulo below a division by zero; treat it as "every read".
+    int one_in = (int)(cct->_conf->bluestore_debug_random_read_err * 100.0);
+    if (one_in == 0) {
+      one_in = 1;
+    }
+    if ((rand() % one_in) == 0) {
+      dout(0) << __func__ << ": inject random EIO" << dendl;
+      r = -EIO;
+    }
+  }
+  dout(10) << __func__ << " " << cid << " " << oid
+           << " fiemap " << m << std::dec
+           << " = " << r << dendl;
+  log_latency(__func__,
+    l_bluestore_read_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  return r;
+}
+
+/// Read every interval of @p m from onode @p o and append the data to @p bl.
+///
+/// Each interval is first served from the cache (_read_cache), the remaining
+/// blobs are queued on a single IOContext, the aios are submitted and waited
+/// for, and finally the per-interval results are assembled in order.
+///
+/// @param c           collection of the object (only forwarded on retry)
+/// @param o           onode to read from
+/// @param m           intervals to read; both range_start() and range_end()
+///                    are asserted to be <= the onode size
+/// @param bl          [out] cleared on entry, receives the concatenated data
+/// @param op_flags    FADVISE flags used to decide whether to buffer the read
+/// @param retry_count number of previous attempts that hit a checksum error
+/// @return bl.length() on success, a negative value from _prepare_read_ioc(),
+///         or -EIO on aio failure or when the checksum retries are exhausted
+int BlueStore::_do_readv(
+  Collection *c,
+  OnodeRef& o,
+  const interval_set<uint64_t>& m,
+  bufferlist& bl,
+  uint32_t op_flags,
+  uint64_t retry_count)
+{
+  FUNCTRACE(cct);
+  int r = 0;
+  int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+  dout(20) << __func__ << " fiemap " << m << std::hex
+           << " size 0x" << o->onode.size << " (" << std::dec
+           << o->onode.size << ")" << dendl;
+
+  // generally, don't buffer anything, unless the client explicitly requests
+  // it.
+  bool buffered = false;
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+    dout(20) << __func__ << " will do buffered read" << dendl;
+    buffered = true;
+  } else if (cct->_conf->bluestore_default_buffered_read &&
+             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+    dout(20) << __func__ << " defaulting to buffered read" << dendl;
+    buffered = true;
+  }
+  // this method must be idempotent since we may call it several times
+  // before we finally read the expected result.
+  bl.clear();
+
+  // call fiemap first!
+  ceph_assert(m.range_start() <= o->onode.size);
+  ceph_assert(m.range_end() <= o->onode.size);
+  auto start = mono_clock::now();
+  // load the extent map shards covering the whole span of the request
+  o->extent_map.fault_range(db, m.range_start(), m.range_end() - m.range_start());
+  log_latency(__func__,
+    l_bluestore_read_onode_meta_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  _dump_onode<30>(cct, *o);
+
+  IOContext ioc(cct, NULL, !cct->_conf->bluestore_fail_eio);
+  // one entry per interval: <0> regions already available from the cache,
+  // <1> raw buffers filled by the aios, <2> blobs that must be read from disk
+  vector<std::tuple<ready_regions_t, vector<bufferlist>, blobs2read_t>> raw_results;
+  raw_results.reserve(m.num_intervals());
+  int i = 0;
+  for (auto p = m.begin(); p != m.end(); p++, i++) {
+    raw_results.push_back({});
+    _read_cache(o, p.get_start(), p.get_len(), read_cache_policy,
+                std::get<0>(raw_results[i]), std::get<2>(raw_results[i]));
+    r = _prepare_read_ioc(std::get<2>(raw_results[i]), &std::get<1>(raw_results[i]), &ioc);
+    // we always issue aio for reading, so errors other than EIO are not allowed
+    if (r < 0)
+      return r;
+  }
+
+  // NOTE(review): num_ios is only used in the latency log line below; when
+  // no aio is pending it keeps the value of m.size(), which may not be an
+  // I/O count -- confirm what interval_set::size() returns.
+  auto num_ios = m.size();
+  if (ioc.has_pending_aios()) {
+    num_ios = ioc.get_num_ios();
+    bdev->aio_submit(&ioc);
+    dout(20) << __func__ << " waiting for aio" << dendl;
+    ioc.aio_wait();
+    r = ioc.get_return_value();
+    if (r < 0) {
+      ceph_assert(r == -EIO); // no other errors allowed
+      return -EIO;
+    }
+  }
+  // note: `start` was taken before fault_range(), so this latency covers
+  // everything since then, not just the aio wait
+  log_latency_fn(__func__,
+    l_bluestore_read_wait_aio_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age,
+    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+  );
+
+  ceph_assert(raw_results.size() == (size_t)m.num_intervals());
+  i = 0;
+  // assemble the results interval by interval, in the order of `m`
+  for (auto p = m.begin(); p != m.end(); p++, i++) {
+    bool csum_error = false;
+    bufferlist t;
+    r = _generate_read_result_bl(o, p.get_start(), p.get_len(),
+                                 std::get<0>(raw_results[i]),
+                                 std::get<1>(raw_results[i]),
+                                 std::get<2>(raw_results[i]),
+                                 buffered, &csum_error, t);
+    if (csum_error) {
+      // Handles spurious read errors caused by a kernel bug.
+      // We sometimes get all-zero pages as a result of the read under
+      // high memory pressure. Retrying the failing read succeeds in most
+      // cases.
+      // See also: http://tracker.ceph.com/issues/22464
+      if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+        return -EIO;
+      }
+      // restart the whole request; the callee clears `bl` again on entry
+      return _do_readv(c, o, m, bl, op_flags, retry_count + 1);
+    }
+    bl.claim_append(t);
+  }
+  if (retry_count) {
+    logger->inc(l_bluestore_reads_with_retries);
+    dout(5) << __func__ << " read fiemap " << m
+            << " failed " << retry_count << " times before succeeding"
+            << dendl;
+  }
+  return bl.length();
+}
+
+/// Dump the onode of @p oid into formatter @p f as an object section named
+/// @p section_name.
+///
+/// @return 0 on success, -ENOENT if the collection or the object is missing
+int BlueStore::dump_onode(CollectionHandle &c_,
+  const ghobject_t& oid,
+  const string& section_name,
+  Formatter *f)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+
+  int ret = -ENOENT;
+  {
+    std::shared_lock rlock(c->lock);
+
+    OnodeRef o = c->get_onode(oid, false);
+    if (o && o->exists) {
+      // FIXME minor: actually the next line isn't enough to
+      // load shared blobs. Leaving as is for now..
+      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+      _dump_onode<0>(cct, *o);
+      f->open_object_section(section_name.c_str());
+      o->dump(f);
+      f->close_section();
+      ret = 0;
+    }
+  }
+  dout(10) << __func__ << " " << c->cid << " " << oid
+           << " = " << ret << dendl;
+  return ret;
+}
+
+/// Fetch a single extended attribute of an object.
+///
+/// @param c_    handle of the collection holding the object
+/// @param oid   object to query
+/// @param name  attribute name
+/// @param value [out] receives the attribute value on success
+/// @return 0 on success, -ENOENT if the collection or object is missing,
+///         -ENODATA if the attribute is not set, -EIO when injected by
+///         _debug_mdata_eio()
+int BlueStore::getattr(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  const char *name,
+  bufferptr& value)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->cid << " " << oid << " " << name << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+
+  int ret;
+  {
+    std::shared_lock rlock(c->lock);
+    mempool::bluestore_cache_meta::string key(name);
+
+    OnodeRef o = c->get_onode(oid, false);
+    if (!o || !o->exists) {
+      ret = -ENOENT;
+    } else if (auto found = o->onode.attrs.find(key);
+               found == o->onode.attrs.end()) {
+      ret = -ENODATA;
+    } else {
+      value = found->second;
+      ret = 0;
+    }
+  }
+
+  if (ret == 0 && _debug_mdata_eio(oid)) {
+    ret = -EIO;
+    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+  }
+  dout(10) << __func__ << " " << c->cid << " " << oid << " " << name
+           << " = " << ret << dendl;
+  return ret;
+}
+
+/// Copy all extended attributes of an object into @p aset.
+///
+/// Existing entries of @p aset are kept; emplace() does not overwrite them.
+///
+/// @return 0 on success, -ENOENT if the collection or object is missing,
+///         -EIO when injected by _debug_mdata_eio()
+int BlueStore::getattrs(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  map<string,bufferptr,less<>>& aset)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+
+  int ret = -ENOENT;
+  {
+    std::shared_lock rlock(c->lock);
+
+    OnodeRef o = c->get_onode(oid, false);
+    if (o && o->exists) {
+      for (auto& [attr_name, attr_value] : o->onode.attrs) {
+        aset.emplace(attr_name.c_str(), attr_value);
+      }
+      ret = 0;
+    }
+  }
+
+  if (ret == 0 && _debug_mdata_eio(oid)) {
+    ret = -EIO;
+    derr << __func__ << " " << c->cid << " " << oid << " INJECT EIO" << dendl;
+  }
+  dout(10) << __func__ << " " << c->cid << " " << oid
+           << " = " << ret << dendl;
+  return ret;
+}
+
+/// Append the id of every known collection to @p ls.
+///
+/// @return always 0
+int BlueStore::list_collections(vector<coll_t>& ls)
+{
+  std::shared_lock rlock(coll_lock);
+  ls.reserve(coll_map.size());
+  for (const auto& entry : coll_map) {
+    ls.push_back(entry.first);
+  }
+  return 0;
+}
+
+/// Tell whether collection @p c is present in coll_map.
+bool BlueStore::collection_exists(const coll_t& c)
+{
+  std::shared_lock rlock(coll_lock);
+  return coll_map.find(c) != coll_map.end();
+}
+
+/// Determine whether a collection holds no objects by listing at most one
+/// entry of it.
+///
+/// @param ch    collection to inspect
+/// @param empty [out] set to true when the listing returned nothing
+/// @return 0 on success, otherwise the error returned by collection_list()
+int BlueStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+  dout(15) << __func__ << " " << ch->cid << dendl;
+  vector<ghobject_t> first_entry;
+  ghobject_t cursor;
+  const int ret = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+                                  &first_entry, &cursor);
+  if (ret >= 0) {
+    *empty = first_entry.empty();
+    dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+    return 0;
+  }
+  derr << __func__ << " collection_list returned: " << cpp_strerror(ret)
+       << dendl;
+  return ret;
+}
+
+/// Return the `bits` value stored in the collection's cnode, read under the
+/// collection's shared lock.
+int BlueStore::collection_bits(CollectionHandle& ch)
+{
+  dout(15) << __func__ << " " << ch->cid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock rlock(coll->lock);
+  const auto bits = coll->cnode.bits;
+  dout(10) << __func__ << " " << ch->cid << " = " << bits << dendl;
+  return bits;
+}
+
+/// List objects of a collection in [start, end), returning at most @p max
+/// entries.
+///
+/// The collection is flushed first; the listing itself is done by
+/// _collection_list() in non-legacy mode under the collection's shared lock.
+///
+/// @param ls    [out] receives the object ids (must not be null)
+/// @param pnext [out, optional] receives the position to continue from
+/// @return the result of _collection_list()
+int BlueStore::collection_list(
+  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+  vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  c->flush();
+  dout(15) << __func__ << " " << c->cid
+           << " start " << start << " end " << end << " max " << max << dendl;
+
+  // hold the shared lock only for the duration of the listing itself
+  const int ret = [&] {
+    std::shared_lock rlock(c->lock);
+    return _collection_list(c, start, end, max, false, ls, pnext);
+  }();
+
+  dout(10) << __func__ << " " << c->cid
+           << " start " << start << " end " << end << " max " << max
+           << " = " << ret << ", ls.size() = " << ls->size()
+           << ", next = " << (pnext ? *pnext : ghobject_t())  << dendl;
+  return ret;
+}
+
+/// Same as collection_list(), but asks _collection_list() for the legacy
+/// listing mode.
+///
+/// @param ls    [out] receives the object ids (must not be null)
+/// @param pnext [out, optional] receives the position to continue from
+/// @return the result of _collection_list()
+int BlueStore::collection_list_legacy(
+  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+  vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  c->flush();
+  dout(15) << __func__ << " " << c->cid
+           << " start " << start << " end " << end << " max " << max << dendl;
+
+  // hold the shared lock only for the duration of the listing itself
+  const int ret = [&] {
+    std::shared_lock rlock(c->lock);
+    return _collection_list(c, start, end, max, true, ls, pnext);
+  }();
+
+  dout(10) << __func__ << " " << c->cid
+           << " start " << start << " end " << end << " max " << max
+           << " = " << ret << ", ls.size() = " << ls->size()
+           << ", next = " << (pnext ? *pnext : ghobject_t())  << dendl;
+  return ret;
+}
+
+/// Worker behind collection_list() / collection_list_legacy().
+///
+/// Walks the PREFIX_OBJ keys of collection @p c, first through the temp
+/// object range and then through the regular object range, appending object
+/// ids to @p ls until @p end is reached or @p ls holds @p max entries.
+///
+/// @param c      collection to list
+/// @param start  first object to consider
+/// @param end    listing stops before this object
+/// @param max    maximum size of @p ls (compared as unsigned)
+/// @param legacy selects SimpleCollectionListIterator instead of
+///               SortedCollectionListIterator and is forwarded to
+///               get_coll_range()
+/// @param ls     [out] receives the object ids
+/// @param pnext  [out, optional] position to resume from, or
+///               ghobject_t::get_max() when the listing is complete
+/// @return 0 on success, -ENOENT if the collection does not exist
+int BlueStore::_collection_list(
+  Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
+  bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+
+  if (!c->exists)
+    return -ENOENT;
+
+  ghobject_t static_next;
+  std::unique_ptr<CollectionListIterator> it;
+  ghobject_t coll_range_temp_start, coll_range_temp_end;
+  ghobject_t coll_range_start, coll_range_end;
+  ghobject_t pend;
+  bool temp;
+
+  // let the rest of the function write *pnext unconditionally
+  if (!pnext)
+    pnext = &static_next;
+
+  // logs the latency of this call on every exit path
+  auto log_latency = make_scope_guard(
+    [&, start_time = mono_clock::now(), func_name = __func__] {
+    log_latency_fn(
+      func_name,
+      l_bluestore_clist_lat,
+      mono_clock::now() - start_time,
+      cct->_conf->bluestore_log_collection_list_age,
+      [&](const ceph::timespan& lat) {
+	ostringstream ostr;
+	ostr << ", lat = " << timespan_str(lat)
+	     << " cid =" << c->cid
+	     << " start " << start << " end " << end
+	     << " max " << max;
+	return ostr.str();
+      });
+  });
+
+  // nothing can follow a max start position
+  if (start.is_max() || start.hobj.is_max()) {
+    *pnext = ghobject_t::get_max();
+    return 0;
+  }
+  get_coll_range(c->cid, c->cnode.bits, &coll_range_temp_start,
+                 &coll_range_temp_end, &coll_range_start, &coll_range_end, legacy);
+  dout(20) << __func__
+    << " range " << coll_range_temp_start
+    << " to " << coll_range_temp_end
+    << " and " << coll_range_start
+    << " to " << coll_range_end
+    << " start " << start << dendl;
+  if (legacy) {
+    it = std::make_unique<SimpleCollectionListIterator>(
+      cct, db->get_iterator(PREFIX_OBJ));
+  } else {
+    it = std::make_unique<SortedCollectionListIterator>(
+      db->get_iterator(PREFIX_OBJ));
+  }
+  // position the iterator: a default/minimum start begins with the temp
+  // range, otherwise start in whichever range `start` belongs to
+  if (start == ghobject_t() ||
+    start.hobj == hobject_t() ||
+    start == c->cid.get_min_hobj()) {
+    it->upper_bound(coll_range_temp_start);
+    temp = true;
+  } else {
+    if (start.hobj.is_temp()) {
+      temp = true;
+      ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
+    } else {
+      temp = false;
+      ceph_assert(start >= coll_range_start && start < coll_range_end);
+    }
+    dout(20) << __func__ << " temp=" << (int)temp << dendl;
+    it->lower_bound(start);
+  }
+  // pend is the end bound of the range currently being walked
+  if (end.hobj.is_max()) {
+    pend = temp ? coll_range_temp_end : coll_range_end;
+  } else {
+    if (end.hobj.is_temp()) {
+      if (temp) {
+        pend = end;
+      } else {
+        // non-temp start with a temp end: nothing to list
+        *pnext = ghobject_t::get_max();
+        return 0;
+      }
+    } else {
+      pend = temp ? coll_range_temp_end : end;
+    }
+  }
+  dout(20) << __func__ << " pend " << pend << dendl;
+  while (true) {
+    if (!it->valid() || it->is_ge(pend)) {
+      if (!it->valid())
+	dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+      else
+	dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
+      if (temp) {
+	if (end.hobj.is_temp()) {
+          // stopped at a temp end: report the next temp object, if any
+          if (it->valid() && it->is_lt(coll_range_temp_end)) {
+            *pnext = it->oid();
+            return 0;
+          }
+	  break;
+	}
+	// temp range exhausted: continue with the regular range
+	dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+	temp = false;
+	it->upper_bound(coll_range_start);
+        if (end.hobj.is_max())
+          pend = coll_range_end;
+        else
+          pend = end;
+	dout(30) << __func__ << " pend " << pend << dendl;
+	continue;
+      }
+      // stopped at `end` inside the regular range: report where to resume
+      if (it->valid() && it->is_lt(coll_range_end)) {
+        *pnext = it->oid();
+        return 0;
+      }
+      break;
+    }
+    dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
+    if (ls->size() >= (unsigned)max) {
+      dout(20) << __func__ << " reached max " << max << dendl;
+      *pnext = it->oid();
+      return 0;
+    }
+    ls->push_back(it->oid());
+    it->next();
+  }
+  // both ranges exhausted
+  *pnext = ghobject_t::get_max();
+  return 0;
+}
+
+/// Public entry point: fetch the omap header and all omap key/value pairs of
+/// an object by delegating to _omap_get().
+int BlueStore::omap_get(
+  CollectionHandle &c_,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  map<string, bufferlist> *out /// < [out] Key to value map
+  )
+{
+  return _omap_get(static_cast<Collection *>(c_.get()), oid, header, out);
+}
+
+/// Look up the onode of @p oid under the collection's shared lock and read
+/// its omap header and entries through _onode_omap_get().
+///
+/// @return -ENOENT if the collection or object is missing, otherwise the
+///         result of _onode_omap_get()
+int BlueStore::_omap_get(
+  Collection *c,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  map<string, bufferlist> *out /// < [out] Key to value map
+  )
+{
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+  std::shared_lock rlock(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  int ret = -ENOENT;
+  if (o && o->exists) {
+    ret = _onode_omap_get(o, header, out);
+  }
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+/// Read the omap header and every omap entry of onode @p o.
+///
+/// Iterates the key range [header key, tail key) of the onode's omap prefix:
+/// the header key fills @p header, every other key is decoded to its user key
+/// and stored in @p out.
+///
+/// @return 0 on success (also when the onode has no omap), -ENOENT if the
+///         onode is null or does not exist
+int BlueStore::_onode_omap_get(
+  const OnodeRef &o,           ///< [in] Object containing omap
+  bufferlist *header,          ///< [out] omap header
+  map<string, bufferlist> *out /// < [out] Key to value map
+)
+{
+  if (!o || !o->exists) {
+    return -ENOENT;
+  }
+  if (!o->onode.has_omap()) {
+    return 0;
+  }
+  o->flush();
+
+  const string& prefix = o->get_omap_prefix();
+  string head, tail;
+  o->get_omap_header(&head);
+  o->get_omap_tail(&tail);
+  KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+  for (it->lower_bound(head); it->valid(); it->next()) {
+    const string raw_key = it->key();
+    if (raw_key == head) {
+      dout(30) << __func__ << "  got header" << dendl;
+      *header = it->value();
+      continue;
+    }
+    if (raw_key >= tail) {
+      dout(30) << __func__ << "  reached tail" << dendl;
+      break;
+    }
+    string user_key;
+    o->decode_omap_key(raw_key, &user_key);
+    dout(20) << __func__ << "  got " << pretty_binary_string(raw_key)
+      << " -> " << user_key << dendl;
+    (*out)[user_key] = it->value();
+  }
+  return 0;
+}
+
+/// Fetch only the omap header of an object.
+///
+/// A failed header lookup in the DB is only logged; @p header is then left
+/// as the DB call left it and 0 is still returned.
+///
+/// @return 0 on success (also when the object has no omap), -ENOENT if the
+///         collection or object is missing
+int BlueStore::omap_get_header(
+  CollectionHandle &c_,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  bool allow_eio ///< [in] don't assert on eio
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+  std::shared_lock rlock(c->lock);
+  int ret = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    ret = -ENOENT;
+  } else if (o->onode.has_omap()) {
+    o->flush();
+    string head;
+    o->get_omap_header(&head);
+    const bool found = db->get(o->get_omap_prefix(), head, header) >= 0;
+    if (found) {
+      dout(30) << __func__ << "  got header" << dendl;
+    } else {
+      dout(30) << __func__ << "  no header" << dendl;
+    }
+  }
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+/// Collect the user-visible omap keys of an object into @p keys.
+///
+/// The latency logged at the end is measured from before the collection lock
+/// is taken.
+///
+/// @return 0 on success (also when the object has no omap), -ENOENT if the
+///         collection or object is missing
+int BlueStore::omap_get_keys(
+  CollectionHandle &c_,              ///< [in] Collection containing oid
+  const ghobject_t &oid, ///< [in] Object containing omap
+  set<string> *keys      ///< [out] Keys defined on oid
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+  auto begin_ts = mono_clock::now();
+  std::shared_lock rlock(c->lock);
+  int ret = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    ret = -ENOENT;
+  } else if (o->onode.has_omap()) {
+    o->flush();
+    const string& prefix = o->get_omap_prefix();
+    string head, tail;
+    o->get_omap_key(string(), &head);
+    o->get_omap_tail(&tail);
+    KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+    for (it->lower_bound(head); it->valid(); it->next()) {
+      const string raw_key = it->key();
+      if (raw_key >= tail) {
+        dout(30) << __func__ << "  reached tail" << dendl;
+        break;
+      }
+      string user_key;
+      o->decode_omap_key(raw_key, &user_key);
+      dout(20) << __func__ << "  got " << pretty_binary_string(raw_key)
+               << " -> " << user_key << dendl;
+      keys->insert(user_key);
+    }
+  }
+
+  c->store->log_latency(
+    __func__,
+    l_bluestore_omap_get_keys_lat,
+    mono_clock::now() - begin_ts,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+/// Look up the omap values of the requested @p keys of an object.
+///
+/// Keys that cannot be read from the DB are silently skipped; entries already
+/// present in @p out are not overwritten (map::insert).
+///
+/// @return 0 on success (also when the object has no omap), -ENOENT if the
+///         collection or object is missing
+int BlueStore::omap_get_values(
+  CollectionHandle &c_,        ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const set<string> &keys,     ///< [in] Keys to get
+  map<string, bufferlist> *out ///< [out] Returned keys and values
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+  std::shared_lock rlock(c->lock);
+  auto begin_ts = mono_clock::now();
+  int ret = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    ret = -ENOENT;
+  } else if (o->onode.has_omap()) {
+    o->flush();
+    const string& prefix = o->get_omap_prefix();
+    // DB key = per-object key prefix + user key; build the prefix once
+    string db_key;
+    o->get_omap_key(string(), &db_key);
+    const size_t base_len = db_key.size();
+    for (const auto& user_key : keys) {
+      db_key.resize(base_len); // keep prefix
+      db_key += user_key;
+      bufferlist val;
+      if (db->get(prefix, db_key, &val) >= 0) {
+        dout(30) << __func__ << "  got " << pretty_binary_string(db_key)
+                 << " -> " << user_key << dendl;
+        out->insert(make_pair(user_key, val));
+      }
+    }
+  }
+
+  c->store->log_latency(
+    __func__,
+    l_bluestore_omap_get_values_lat,
+    mono_clock::now() - begin_ts,
+    c->store->cct->_conf->bluestore_log_omap_iterator_age);
+
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+#ifdef WITH_SEASTAR
+/// List the omap entries of an object that sort after @p start_after.
+///
+/// When @p start_after holds no value the listing starts at the first omap
+/// key.  Entries already present in @p output are not overwritten
+/// (map::insert).
+///
+/// @return 0 on success (also when the object has no omap), -ENOENT if the
+///         collection or object is missing or no iterator could be created
+int BlueStore::omap_get_values(
+  CollectionHandle &c_,        ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const std::optional<string> &start_after,     ///< [in] List keys after this one, if set
+  map<string, bufferlist> *output ///< [out] Returned keys and values
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists)
+    return -ENOENT;
+  std::shared_lock l(c->lock);
+  int r = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    r = -ENOENT;
+    goto out;
+  }
+  if (!o->onode.has_omap()) {
+    goto out;
+  }
+  o->flush();
+  {
+    ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid);
+    if (!iter) {
+      r = -ENOENT;
+      goto out;
+    }
+    // Dereferencing an empty optional is undefined behaviour, so only use
+    // the value when the caller actually supplied a start key.
+    if (start_after) {
+      iter->upper_bound(*start_after);
+    } else {
+      iter->seek_to_first();
+    }
+    for (; iter->valid(); iter->next()) {
+      output->insert(make_pair(iter->key(), iter->value()));
+    }
+  }
+
+out:
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r
+          << dendl;
+  return r;
+}
+#endif
+
+/// Report which of the requested omap @p keys exist on an object.
+///
+/// @return 0 on success (also when the object has no omap), -ENOENT if the
+///         collection or object is missing
+int BlueStore::omap_check_keys(
+  CollectionHandle &c_,    ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  const set<string> &keys, ///< [in] Keys to check
+  set<string> *out         ///< [out] Subset of keys defined on oid
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl;
+  if (!c->exists) {
+    return -ENOENT;
+  }
+  std::shared_lock rlock(c->lock);
+  int ret = 0;
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    ret = -ENOENT;
+  } else if (o->onode.has_omap()) {
+    o->flush();
+    const string& prefix = o->get_omap_prefix();
+    // DB key = per-object key prefix + user key; build the prefix once
+    string db_key;
+    o->get_omap_key(string(), &db_key);
+    const size_t base_len = db_key.size();
+    for (const auto& user_key : keys) {
+      db_key.resize(base_len); // keep prefix
+      db_key += user_key;
+      bufferlist val;
+      if (db->get(prefix, db_key, &val) >= 0) {
+        dout(30) << __func__ << "  have " << pretty_binary_string(db_key)
+                 << " -> " << user_key << dendl;
+        out->insert(user_key);
+      } else {
+        dout(30) << __func__ << "  miss " << pretty_binary_string(db_key)
+                 << " -> " << user_key << dendl;
+      }
+    }
+  }
+  dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+/// Create an iterator over the omap entries of an object.
+///
+/// When the onode has an omap, the underlying DB iterator is bounded to the
+/// key range between the object's first omap key and its omap tail key;
+/// otherwise default (empty) bounds are used.
+///
+/// @return an OmapIteratorImpl wrapped in ObjectMapIterator, or an empty
+///         ObjectMapIterator if the collection or object does not exist
+ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
+  CollectionHandle &c_,              ///< [in] collection
+  const ghobject_t &oid  ///< [in] object
+  )
+{
+  Collection *c = static_cast<Collection *>(c_.get());
+  dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl;
+  if (!c->exists) {
+    return ObjectMap::ObjectMapIterator();
+  }
+  std::shared_lock l(c->lock);
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    // keep a space between the oid and the text so the log line is readable
+    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+  o->flush();
+  dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl;
+  auto bounds = KeyValueDB::IteratorBounds();
+  if (o->onode.has_omap()) {
+    std::string lower_bound, upper_bound;
+    o->get_omap_key(string(), &lower_bound);
+    o->get_omap_tail(&upper_bound);
+    bounds.lower_bound = std::move(lower_bound);
+    bounds.upper_bound = std::move(upper_bound);
+  }
+  KeyValueDB::Iterator it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds));
+  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it));
+}
+
+// -----------------
+// write helpers
+
+/// Size of the reserved area: the larger of SUPER_RESERVED and
+/// min_alloc_size, rounded up to a multiple of min_alloc_size.
+/// min_alloc_size must already be set (asserted non-zero).
+uint64_t BlueStore::_get_ondisk_reserved() const {
+  ceph_assert(min_alloc_size);
+  const uint64_t reserved =
+    std::max<uint64_t>(SUPER_RESERVED, min_alloc_size);
+  return round_up_to(reserved, min_alloc_size);
+}
+
+/// Queue the "ondisk_format" and "min_compat_ondisk_format" super keys on
+/// transaction @p t.  ondisk_format must already equal latest_ondisk_format.
+void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
+{
+  dout(10) << __func__ << " ondisk_format " << ondisk_format
+           << " min_compat_ondisk_format " << min_compat_ondisk_format
+           << dendl;
+  ceph_assert(ondisk_format == latest_ondisk_format);
+
+  // encode one value into its own bufferlist and store it under PREFIX_SUPER
+  auto put_super = [&t](const char *key, const auto& value) {
+    bufferlist encoded;
+    encode(value, encoded);
+    t->set(PREFIX_SUPER, key, encoded);
+  };
+  put_super("ondisk_format", ondisk_format);
+  put_super("min_compat_ondisk_format", min_compat_ondisk_format);
+}
+
+/// Load the store-wide settings kept under PREFIX_SUPER and apply them.
+///
+/// Reads nid_max, blobid_max, freelist_type, the on-disk format versions,
+/// min_alloc_size and the SMR zone fields, then runs the _set_*() helpers
+/// that derive runtime configuration from them.
+///
+/// @return 0 on success, -EIO if a stored value cannot be decoded, -EPERM if
+///         the stored min_compat_ondisk_format is newer than
+///         latest_ondisk_format
+int BlueStore::_open_super_meta()
+{
+  // nid
+  {
+    nid_max = 0;
+    bufferlist bl;
+    db->get(PREFIX_SUPER, "nid_max", &bl);
+    auto p = bl.cbegin();
+    try {
+      uint64_t v;
+      decode(v, p);
+      nid_max = v;
+    } catch (ceph::buffer::error& e) {
+      derr << __func__ << " unable to read nid_max" << dendl;
+      return -EIO;
+    }
+    dout(1) << __func__ << " old nid_max " << nid_max << dendl;
+    nid_last = nid_max.load();
+  }
+
+  // blobid
+  {
+    blobid_max = 0;
+    bufferlist bl;
+    db->get(PREFIX_SUPER, "blobid_max", &bl);
+    auto p = bl.cbegin();
+    try {
+      uint64_t v;
+      decode(v, p);
+      blobid_max = v;
+    } catch (ceph::buffer::error& e) {
+      derr << __func__ << " unable to read blobid_max" << dendl;
+      return -EIO;
+    }
+    dout(1) << __func__ << " old blobid_max " << blobid_max << dendl;
+    blobid_last = blobid_max.load();
+  }
+
+  // freelist
+  {
+    bufferlist bl;
+    db->get(PREFIX_SUPER, "freelist_type", &bl);
+    if (bl.length()) {
+      freelist_type = std::string(bl.c_str(), bl.length());
+    } else {
+      ceph_abort_msg("Not Support extent freelist manager");
+    }
+    dout(5) << __func__ << "::NCB::freelist_type=" << freelist_type << dendl;
+  }
+  // ondisk format
+  int32_t compat_ondisk_format = 0;
+  {
+    bufferlist bl;
+    int r = db->get(PREFIX_SUPER, "ondisk_format", &bl);
+    if (r < 0) {
+      // base case: kraken bluestore is v1 and readable by v1
+      dout(20) << __func__ << " missing ondisk_format; assuming kraken"
+	       << dendl;
+      ondisk_format = 1;
+      compat_ondisk_format = 1;
+    } else {
+      auto p = bl.cbegin();
+      try {
+	decode(ondisk_format, p);
+      } catch (ceph::buffer::error& e) {
+	derr << __func__ << " unable to read ondisk_format" << dendl;
+	return -EIO;
+      }
+      bl.clear();
+      {
+	r = db->get(PREFIX_SUPER, "min_compat_ondisk_format", &bl);
+	ceph_assert(!r);
+	auto p = bl.cbegin();
+	try {
+	  decode(compat_ondisk_format, p);
+	} catch (ceph::buffer::error& e) {
+	  derr << __func__ << " unable to read compat_ondisk_format" << dendl;
+	  return -EIO;
+	}
+      }
+    }
+    dout(1) << __func__ << " ondisk_format " << ondisk_format
+	     << " compat_ondisk_format " << compat_ondisk_format
+	     << dendl;
+  }
+
+  if (latest_ondisk_format < compat_ondisk_format) {
+    derr << __func__ << " compat_ondisk_format is "
+	 << compat_ondisk_format << " but we only understand version "
+	 << latest_ondisk_format << dendl;
+    return -EPERM;
+  }
+
+  {
+    bufferlist bl;
+    db->get(PREFIX_SUPER, "min_alloc_size", &bl);
+    auto p = bl.cbegin();
+    try {
+      uint64_t val;
+      decode(val, p);
+      min_alloc_size = val;
+      min_alloc_size_order = std::countr_zero(val);
+      min_alloc_size_mask  = min_alloc_size - 1;
+
+      // min_alloc_size must be a non-zero power of two.  Checked on the
+      // 64-bit value directly: shifting a 32-bit `1u` by the order is
+      // undefined once the order reaches 32 (countr_zero(0) is 64).
+      ceph_assert(std::has_single_bit(val));
+    } catch (ceph::buffer::error& e) {
+      derr << __func__ << " unable to read min_alloc_size" << dendl;
+      return -EIO;
+    }
+    dout(1) << __func__ << " min_alloc_size 0x" << std::hex << min_alloc_size
+	     << std::dec << dendl;
+    logger->set(l_bluestore_alloc_unit, min_alloc_size);
+  }
+
+  // smr fields
+  // (decode failures are reported as -EIO like every other field above
+  // instead of letting the buffer exception escape)
+  {
+    bufferlist bl;
+    int r = db->get(PREFIX_SUPER, "zone_size", &bl);
+    if (r >= 0) {
+      auto p = bl.cbegin();
+      try {
+	decode(zone_size, p);
+      } catch (ceph::buffer::error& e) {
+	derr << __func__ << " unable to read zone_size" << dendl;
+	return -EIO;
+      }
+      dout(1) << __func__ << " zone_size 0x" << std::hex << zone_size << std::dec << dendl;
+      ceph_assert(bdev->is_smr());
+    } else {
+      ceph_assert(!bdev->is_smr());
+    }
+  }
+  {
+    bufferlist bl;
+    int r = db->get(PREFIX_SUPER, "first_sequential_zone", &bl);
+    if (r >= 0) {
+      auto p = bl.cbegin();
+      try {
+	decode(first_sequential_zone, p);
+      } catch (ceph::buffer::error& e) {
+	derr << __func__ << " unable to read first_sequential_zone" << dendl;
+	return -EIO;
+      }
+      dout(1) << __func__ << " first_sequential_zone 0x" << std::hex
+	      << first_sequential_zone << std::dec << dendl;
+      ceph_assert(bdev->is_smr());
+    } else {
+      ceph_assert(!bdev->is_smr());
+    }
+  }
+
+  _set_per_pool_omap();
+
+  _open_statfs();
+  _set_alloc_sizes();
+  _set_throttle_params();
+
+  _set_csum();
+  _set_compression();
+  _set_blob_size();
+
+  _validate_bdev();
+  return 0;
+}
+
+/// Bring the on-disk format up to latest_ondisk_format, one version step at
+/// a time, and persist the new format keys synchronously.
+///
+/// Does nothing besides logging when ondisk_format is already the latest.
+///
+/// @return 0 on success, -EIO if the legacy min_min_alloc_size value cannot
+///         be decoded during the v1 -> v2 step
+int BlueStore::_upgrade_super()
+{
+  dout(1) << __func__ << " from " << ondisk_format << ", latest "
+	  << latest_ondisk_format << dendl;
+  if (ondisk_format < latest_ondisk_format) {
+    ceph_assert(ondisk_format > 0);
+    ceph_assert(ondisk_format < latest_ondisk_format);
+
+    // all super-key changes below are collected in one transaction
+    KeyValueDB::Transaction t = db->get_transaction();
+    if (ondisk_format == 1) {
+      // changes:
+      // - super: added ondisk_format
+      // - super: added min_readable_ondisk_format
+      // - super: added min_compat_ondisk_format
+      // - super: added min_alloc_size
+      // - super: removed min_min_alloc_size
+      {
+	bufferlist bl;
+	db->get(PREFIX_SUPER, "min_min_alloc_size", &bl);
+	auto p = bl.cbegin();
+	try {
+	  uint64_t val;
+	  decode(val, p);
+	  min_alloc_size = val;
+	} catch (ceph::buffer::error& e) {
+	  derr << __func__ << " failed to read min_min_alloc_size" << dendl;
+	  return -EIO;
+	}
+	// carry the old value over under the new key name
+	t->set(PREFIX_SUPER, "min_alloc_size", bl);
+	t->rmkey(PREFIX_SUPER, "min_min_alloc_size");
+      }
+      ondisk_format = 2;
+    }
+    if (ondisk_format == 2) {
+      // changes:
+      // - onode has FLAG_PERPOOL_OMAP.  Note that we do not know that *all*
+      //   onodes are using the per-pool prefix until a repair is run; at that
+      //   point the per_pool_omap=1 key will be set.
+      // - super: added per_pool_omap key, which indicates that *all* objects
+      //   are using the new prefix and key format
+      ondisk_format = 3;
+    }
+    if (ondisk_format == 3) {
+      // changes:
+      // - FreelistManager keeps meta within bdev label
+      int r = _write_out_fm_meta(0);
+      ceph_assert(r == 0);
+      ondisk_format = 4;
+    }
+    // This to be the last operation
+    _prepare_ondisk_format_super(t);
+    int r = db->submit_transaction_sync(t);
+    ceph_assert(r == 0);
+  }
+  // done
+  dout(1) << __func__ << " done" << dendl;
+  return 0;
+}
+
+/// Give onode @p o a fresh nid if it does not have one yet, record it as the
+/// transaction's last_nid and mark the onode as existing.  An onode that
+/// already has a nid is asserted to exist and left untouched.
+void BlueStore::_assign_nid(TransContext *txc, OnodeRef& o)
+{
+  if (o->onode.nid != 0) {
+    ceph_assert(o->exists);
+    return;
+  }
+  const uint64_t fresh_nid = ++nid_last;
+  dout(20) << __func__ << " " << fresh_nid << dendl;
+  txc->last_nid = fresh_nid;
+  o->onode.nid = fresh_nid;
+  o->exists = true;
+}
+
+/// Allocate the next blob id and record it as the transaction's last_blobid.
+///
+/// @return the newly allocated blob id
+uint64_t BlueStore::_assign_blobid(TransContext *txc)
+{
+  const uint64_t fresh_bid = ++blobid_last;
+  txc->last_blobid = fresh_bid;
+  dout(20) << __func__ << " " << fresh_bid << dendl;
+  return fresh_bid;
+}
+
+/// Forward a statistics dump request to the key/value database.
+void BlueStore::get_db_statistics(Formatter *f)
+{
+  this->db->get_statistics(f);
+}
+
+/// Allocate a TransContext for collection @p c, give it a fresh KV
+/// transaction and queue it on sequencer @p osr.
+///
+/// With WITH_BLKIN, tracing is initialised when @p osd_op carries a pg_trace.
+///
+/// @return the new, heap-allocated TransContext
+BlueStore::TransContext *BlueStore::_txc_create(
+  Collection *c, OpSequencer *osr,
+  list<Context*> *on_commits,
+  TrackedOpRef osd_op)
+{
+  auto *new_txc = new TransContext(cct, c, osr, on_commits);
+  new_txc->t = db->get_transaction();
+
+#ifdef WITH_BLKIN
+  if (osd_op && osd_op->pg_trace) {
+    new_txc->trace.init("TransContext", &trace_endpoint,
+                        &osd_op->pg_trace);
+    new_txc->trace.event("txc create");
+    new_txc->trace.keyval("txc seq", new_txc->seq);
+  }
+#endif
+
+  osr->queue_new(new_txc);
+  dout(20) << __func__ << " osr " << osr << " = " << new_txc
+           << " seq " << new_txc->seq << dendl;
+  return new_txc;
+}
+
+/// Compute the throttle cost of a transaction:
+/// (pending ios + 1 for the kv commit) * throttle_cost_per_io + bytes.
+/// Stores the result in txc->cost and the io count in txc->ios.
+void BlueStore::_txc_calc_cost(TransContext *txc)
+{
+  // one "io" for the kv commit
+  const auto io_count = 1 + txc->ioc.get_num_ios();
+  const auto per_io_cost = throttle_cost_per_io.load();
+  txc->ios = io_count;
+  txc->cost = io_count * per_io_cost + txc->bytes;
+  dout(10) << __func__ << " " << txc << " cost " << txc->cost << " ("
+           << io_count << " ios * " << per_io_cost << " + " << txc->bytes
+           << " bytes)" << dendl;
+}
+
+/// Fold the transaction's statfs delta into the perf counters, the persisted
+/// statistics and the in-memory totals, then reset the delta.
+///
+/// The delta is merged into the KV transaction only when
+/// is_statfs_recoverable() is false: under the per-pool key when
+/// per_pool_stat_collection is set, otherwise under the global key.
+void BlueStore::_txc_update_store_statfs(TransContext *txc)
+{
+  auto& delta = txc->statfs_delta;
+  if (delta.is_empty()) {
+    return;
+  }
+
+  logger->inc(l_bluestore_allocated, delta.allocated());
+  logger->inc(l_bluestore_stored, delta.stored());
+  logger->inc(l_bluestore_compressed, delta.compressed());
+  logger->inc(l_bluestore_compressed_allocated, delta.compressed_allocated());
+  logger->inc(l_bluestore_compressed_original, delta.compressed_original());
+
+  // persist the delta before touching the in-memory totals
+  if (!is_statfs_recoverable()) {
+    bufferlist encoded;
+    delta.encode(encoded);
+    if (per_pool_stat_collection) {
+      string pool_key;
+      get_pool_stat_key(txc->osd_pool_id, &pool_key);
+      txc->t->merge(PREFIX_STAT, pool_key, encoded);
+    } else {
+      txc->t->merge(PREFIX_STAT, BLUESTORE_GLOBAL_STATFS_KEY, encoded);
+    }
+  }
+
+  {
+    std::lock_guard guard(vstatfs_lock);
+    if (per_pool_stat_collection) {
+      osd_pools[txc->osd_pool_id] += delta;
+    }
+    vstatfs += delta; // non-persistent in per-pool mode
+  }
+  delta.reset();
+}
+
+/// Drive a TransContext through its state machine.
+///
+/// Switches on txc->get_state() and performs the work for that state.  Most
+/// cases return once the txc has been handed to another stage (aio submit,
+/// _txc_finish_io, the kv queue, the deferred queue or _txc_finish).  Only
+/// the KV_SUBMITTED/KV_DONE path without deferred work sets FINISHING and
+/// breaks out of the switch, so the enclosing loop dispatches the new state.
+/// An unexpected state is logged and aborts.
+void BlueStore::_txc_state_proc(TransContext *txc)
+{
+  while (true) {
+    dout(10) << __func__ << " txc " << txc
+	     << " " << txc->get_state_name() << dendl;
+    switch (txc->get_state()) {
+    case TransContext::STATE_PREPARE:
+      throttle.log_state_latency(*txc, logger, l_bluestore_state_prepare_lat);
+      if (txc->ioc.has_pending_aios()) {
+	txc->set_state(TransContext::STATE_AIO_WAIT);
+#ifdef WITH_BLKIN
+        if (txc->trace) {
+          txc->trace.keyval("pending aios", txc->ioc.num_pending.load());
+        }
+#endif
+	txc->had_ios = true;
+	_txc_aio_submit(txc);
+	return;
+      }
+      // ** fall-thru **
+
+    case TransContext::STATE_AIO_WAIT:
+      {
+	mono_clock::duration lat = throttle.log_state_latency(
+	  *txc, logger, l_bluestore_state_aio_wait_lat);
+	// complain loudly when the aio wait exceeded bluestore_log_op_age
+	if (ceph::to_seconds<double>(lat) >= cct->_conf->bluestore_log_op_age) {
+	  dout(0) << __func__ << " slow aio_wait, txc = " << txc
+		  << ", latency = " << lat
+		  << dendl;
+	}
+      }
+
+      _txc_finish_io(txc);  // may trigger blocked txc's too
+      return;
+
+    case TransContext::STATE_IO_DONE:
+      ceph_assert(ceph_mutex_is_locked(txc->osr->qlock));  // see _txc_finish_io
+      if (txc->had_ios) {
+	++txc->osr->txc_with_unstable_io;
+      }
+      throttle.log_state_latency(*txc, logger, l_bluestore_state_io_done_lat);
+      txc->set_state(TransContext::STATE_KV_QUEUED);
+      // with bluestore_sync_submit_transaction the kv transaction is
+      // submitted right here unless one of the conditions below applies
+      if (cct->_conf->bluestore_sync_submit_transaction) {
+	if (txc->last_nid >= nid_max ||
+	    txc->last_blobid >= blobid_max) {
+	  dout(20) << __func__
+		   << " last_{nid,blobid} exceeds max, submit via kv thread"
+		   << dendl;
+	} else if (txc->osr->kv_committing_serially) {
+	  dout(20) << __func__ << " prior txc submitted via kv thread, us too"
+		   << dendl;
+	  // note: this is starvation-prone.  once we have a txc in a busy
+	  // sequencer that is committing serially it is possible to keep
+	  // submitting new transactions fast enough that we get stuck doing
+	  // so.  the alternative is to block here... fixme?
+	} else if (txc->osr->txc_with_unstable_io) {
+	  dout(20) << __func__ << " prior txc(s) with unstable ios "
+		   << txc->osr->txc_with_unstable_io.load() << dendl;
+	} else if (cct->_conf->bluestore_debug_randomize_serial_transaction &&
+		   rand() % cct->_conf->bluestore_debug_randomize_serial_transaction
+		   == 0) {
+	  dout(20) << __func__ << " DEBUG randomly forcing submit via kv thread"
+		   << dendl;
+	} else {
+	  _txc_apply_kv(txc, true);
+	}
+      }
+      {
+	std::lock_guard l(kv_lock);
+	kv_queue.push_back(txc);
+	if (!kv_sync_in_progress) {
+	  kv_sync_in_progress = true;
+	  kv_cond.notify_one();
+	}
+	// a txc that was not submitted above is also tracked as unsubmitted
+	if (txc->get_state() != TransContext::STATE_KV_SUBMITTED) {
+	  kv_queue_unsubmitted.push_back(txc);
+	  ++txc->osr->kv_committing_serially;
+	}
+	if (txc->had_ios)
+	  kv_ios++;
+	kv_throttle_costs += txc->cost;
+      }
+      return;
+    case TransContext::STATE_KV_SUBMITTED:
+      _txc_committed_kv(txc);
+      // ** fall-thru **
+
+    case TransContext::STATE_KV_DONE:
+      throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat);
+      if (txc->deferred_txn) {
+	txc->set_state(TransContext::STATE_DEFERRED_QUEUED);
+	_deferred_queue(txc);
+	return;
+      }
+      txc->set_state(TransContext::STATE_FINISHING);
+      // leave the switch; the loop re-dispatches on STATE_FINISHING
+      break;
+
+    case TransContext::STATE_DEFERRED_CLEANUP:
+      throttle.log_state_latency(*txc, logger, l_bluestore_state_deferred_cleanup_lat);
+      txc->set_state(TransContext::STATE_FINISHING);
+      // ** fall-thru **
+
+    case TransContext::STATE_FINISHING:
+      throttle.log_state_latency(*txc, logger, l_bluestore_state_finishing_lat);
+      _txc_finish(txc);
+      return;
+
+    default:
+      derr << __func__ << " unexpected txc " << txc
+	   << " state " << txc->get_state_name() << dendl;
+      ceph_abort_msg("unexpected txc state");
+      return;
+    }
+  }
+}
+
+/// Mark a txc's io as done and advance every txc that is now unblocked.
+///
+/// kv transactions must be processed in sequencer order even though aio
+/// completes in any order.  Under osr->qlock the txc is set to
+/// STATE_IO_DONE; if any earlier txc in the queue has not reached IO_DONE
+/// yet we stop here and leave the work to that txc.  Otherwise every
+/// consecutive IO_DONE txc, starting at the oldest one, is passed to
+/// _txc_state_proc().
+void BlueStore::_txc_finish_io(TransContext *txc)
+{
+  dout(20) << __func__ << " " << txc << dendl;
+
+  OpSequencer *sequencer = txc->osr.get();
+  std::lock_guard l(sequencer->qlock);
+  txc->set_state(TransContext::STATE_IO_DONE);
+  txc->ioc.release_running_aios();
+
+  // walk backwards to the oldest txc that is still sitting in IO_DONE
+  OpSequencer::q_list_t::iterator cursor = sequencer->q.iterator_to(*txc);
+  for (; cursor != sequencer->q.begin(); ) {
+    --cursor;
+    if (cursor->get_state() < TransContext::STATE_IO_DONE) {
+      dout(20) << __func__ << " " << txc << " blocked by " << &*cursor << " "
+               << cursor->get_state_name() << dendl;
+      return;
+    }
+    if (cursor->get_state() > TransContext::STATE_IO_DONE) {
+      // this one already moved on; start with its successor
+      ++cursor;
+      break;
+    }
+  }
+
+  // advance before processing: the iterator must already point past the
+  // txc when _txc_state_proc() is called on it
+  do {
+    TransContext *ready = &*cursor;
+    ++cursor;
+    _txc_state_proc(ready);
+  } while (cursor != sequencer->q.end() &&
+           cursor->get_state() == TransContext::STATE_IO_DONE);
+
+  if (sequencer->kv_submitted_waiters) {
+    sequencer->qcond.notify_all();
+  }
+}
+
+/// Write the txc's onodes and shared blobs into kv transaction t.
+///
+/// Every onode in txc->onodes is recorded via _record_onode() and gets its
+/// flushing_count bumped.  Objects in txc->modified_objects that are not in
+/// txc->onodes get their flushing_count bumped as well; duplicates are
+/// erased from modified_objects.  Each shared blob is either removed from
+/// PREFIX_SHARED_BLOB (when its persistent part is empty) or re-encoded and
+/// stored there.
+void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
+{
+  dout(20) << __func__ << " txc " << txc
+           << " onodes " << txc->onodes
+           << " shared_blobs " << txc->shared_blobs
+           << dendl;
+
+  // finalize onodes
+  for (auto onode : txc->onodes) {
+    _record_onode(onode, t);
+    onode->flushing_count++;
+  }
+
+  // objects we modified but didn't affect the onode
+  for (auto it = txc->modified_objects.begin();
+       it != txc->modified_objects.end(); ) {
+    if (txc->onodes.count(*it) != 0) {
+      // remove dups with onodes list to avoid problems in _txc_finish
+      it = txc->modified_objects.erase(it);
+      continue;
+    }
+    (*it)->flushing_count++;
+    ++it;
+  }
+
+  // finalize shared_blobs
+  for (auto shared : txc->shared_blobs) {
+    string sb_key;
+    auto sbid = shared->get_sbid();
+    get_shared_blob_key(sbid, &sb_key);
+
+    if (shared->persistent->empty()) {
+      dout(20) << __func__ << " shared_blob 0x"
+               << std::hex << sbid << std::dec
+               << " is empty" << dendl;
+      t->rmkey(PREFIX_SHARED_BLOB, sb_key);
+      continue;
+    }
+
+    bufferlist encoded;
+    encode(*(shared->persistent), encoded);
+    dout(20) << __func__ << " shared_blob 0x"
+             << std::hex << sbid << std::dec
+             << " is " << encoded.length() << " " << *shared << dendl;
+    t->set(PREFIX_SHARED_BLOB, sb_key, encoded);
+  }
+}
+
+/// Refresh the commit and apply latency trackers from the perf counters.
+///
+/// Both trackers are fed from l_bluestore_commit_lat.
+/// NOTE(review): presumably apply latency is intentionally reported as
+/// commit latency here -- confirm.
+void BlueStore::BSPerfTracker::update_from_perfcounters(
+  PerfCounters &logger)
+{
+  const auto commit_sample = logger.get_tavg_ns(l_bluestore_commit_lat);
+  os_commit_latency_ns.consume_next(commit_sample);
+
+  const auto apply_sample = logger.get_tavg_ns(l_bluestore_commit_lat);
+  os_apply_latency_ns.consume_next(apply_sample);
+}
+
+/// Add the txc's allocation bookkeeping to kv transaction t.
+///
+/// Unless fm is a null manager, txc->allocated and txc->released are applied
+/// to the freelist manager, after removing any extents that appear in both
+/// sets.  With HAVE_LIBZBD on an SMR device the PREFIX_ZONED_CL_INFO
+/// zone-reference keys are updated too.  Finishes by calling
+/// _txc_update_store_statfs().
+///
+/// NOTE(review): the zoned section writes through txc->t rather than the t
+/// parameter; presumably callers always pass txc->t -- confirm.
+void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
+{
+  dout(20) << __func__ << " txc " << txc << std::hex
+	   << " allocated 0x" << txc->allocated
+	   << " released 0x" << txc->released
+	   << std::dec << dendl;
+
+  if (!fm->is_null_manager())
+  {
+    // We have to handle the case where we allocate *and* deallocate the
+    // same region in this transaction.  The freelist doesn't like that.
+    // (Actually, the only thing that cares is the BitmapFreelistManager
+    // debug check. But that's important.)
+    interval_set<uint64_t> tmp_allocated, tmp_released;
+    // by default point at the txc's own sets; switched to the tmp_* copies
+    // below only when an overlap has to be subtracted
+    interval_set<uint64_t> *pallocated = &txc->allocated;
+    interval_set<uint64_t> *preleased = &txc->released;
+    if (!txc->allocated.empty() && !txc->released.empty()) {
+      interval_set<uint64_t> overlap;
+      overlap.intersection_of(txc->allocated, txc->released);
+      if (!overlap.empty()) {
+	tmp_allocated = txc->allocated;
+	tmp_allocated.subtract(overlap);
+	tmp_released = txc->released;
+	tmp_released.subtract(overlap);
+	dout(20) << __func__ << "  overlap 0x" << std::hex << overlap
+		 << ", new allocated 0x" << tmp_allocated
+		 << " released 0x" << tmp_released << std::dec
+		 << dendl;
+	pallocated = &tmp_allocated;
+	preleased = &tmp_released;
+      }
+    }
+
+    // update freelist with non-overlap sets
+    for (interval_set<uint64_t>::iterator p = pallocated->begin();
+	 p != pallocated->end();
+	 ++p) {
+      fm->allocate(p.get_start(), p.get_len(), t);
+    }
+    for (interval_set<uint64_t>::iterator p = preleased->begin();
+	 p != preleased->end();
+	 ++p) {
+      dout(20) << __func__ << " release 0x" << std::hex << p.get_start()
+	       << "~" << p.get_len() << std::dec << dendl;
+      fm->release(p.get_start(), p.get_len(), t);
+    }
+  }
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    // drop the keys for references recorded in old_zone_offset_refs
+    for (auto& i : txc->old_zone_offset_refs) {
+      dout(20) << __func__ << " rm ref zone 0x" << std::hex << i.first.second
+	       << " offset 0x" << i.second << std::dec
+	       << " -> " << i.first.first->oid << dendl;
+      string key;
+      get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
+      txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
+    }
+    for (auto& i : txc->new_zone_offset_refs) {
+      // (zone, offset) -> oid
+      dout(20) << __func__ << " add ref zone 0x" << std::hex << i.first.second
+	       << " offset 0x" << i.second << std::dec
+	       << " -> " << i.first.first->oid << dendl;
+      string key;
+      get_zone_offset_object_key(i.first.second, i.second, i.first.first->oid, &key);
+      // the key alone carries the information; the value is left empty
+      bufferlist v;
+      txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
+    }
+  }
+#endif
+
+  _txc_update_store_statfs(txc);
+}
+
+/// Submit the txc's kv transaction (without syncing) and wake flush waiters.
+///
+/// The txc must be in STATE_KV_QUEUED.  Unless bluestore_debug_omit_kv_commit
+/// is set, txc->t is handed to db->submit_transaction(); the txc then moves
+/// to STATE_KV_SUBMITTED and sequencer waiters are notified.  Afterwards the
+/// flushing_count of every onode / modified object is decremented, waking
+/// flush waiters when it reaches zero.
+///
+/// sync_submit_transaction is only reported in the LTTng tracepoint.
+void BlueStore::_txc_apply_kv(TransContext *txc, bool sync_submit_transaction)
+{
+  ceph_assert(txc->get_state() == TransContext::STATE_KV_QUEUED);
+  {
+#if defined(WITH_LTTNG)
+    auto submit_begin = mono_clock::now();
+#endif
+
+#ifdef WITH_BLKIN
+    if (txc->trace) {
+      txc->trace.event("db async submit");
+    }
+#endif
+
+    int r = 0;
+    if (!cct->_conf->bluestore_debug_omit_kv_commit) {
+      r = db->submit_transaction(txc->t);
+    }
+    ceph_assert(r == 0);
+    txc->set_state(TransContext::STATE_KV_SUBMITTED);
+    if (txc->osr->kv_submitted_waiters) {
+      std::lock_guard l(txc->osr->qlock);
+      txc->osr->qcond.notify_all();
+    }
+
+#if defined(WITH_LTTNG)
+    if (txc->tracing) {
+      tracepoint(
+        bluestore,
+        transaction_kv_submit_latency,
+        txc->osr->get_sequencer_id(),
+        txc->seq,
+        sync_submit_transaction,
+        ceph::to_seconds<double>(mono_clock::now() - submit_begin));
+    }
+#endif
+  }
+
+  for (auto onode_set : { &txc->onodes, &txc->modified_objects }) {
+    for (auto& onode : *onode_set) {
+      dout(20) << __func__ << " onode " << onode << " had "
+               << onode->flushing_count << dendl;
+      const bool last_flush = (--onode->flushing_count == 0);
+      if (last_flush && onode->waiting_count.load()) {
+        std::lock_guard l(onode->flush_lock);
+        onode->flush_cond.notify_all();
+      }
+    }
+  }
+}
+
+/// Handle a txc whose kv transaction has been committed.
+///
+/// Calls throttle.complete_kv(), then under osr->qlock moves the txc to
+/// STATE_KV_DONE and queues its oncommits (on the collection's commit_queue
+/// when there is one, otherwise on the finisher).  Finally records the
+/// kv-committing state latency and the commit latency measured from
+/// txc->start.
+void BlueStore::_txc_committed_kv(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << dendl;
+  throttle.complete_kv(*txc);
+  {
+    std::lock_guard l(txc->osr->qlock);
+    txc->set_state(TransContext::STATE_KV_DONE);
+    if (!txc->ch->commit_queue) {
+      finisher.queue(txc->oncommits);
+    } else {
+      txc->ch->commit_queue->queue(txc->oncommits);
+    }
+  }
+  throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_committing_lat);
+
+  const auto commit_latency = mono_clock::now() - txc->start;
+  // suffix appended to the slow-op log line
+  auto describe_txc = [&](auto lat) {
+    return ", txc = " + stringify(txc);
+  };
+  log_latency_fn(
+    __func__,
+    l_bluestore_commit_lat,
+    commit_latency,
+    cct->_conf->bluestore_log_op_age,
+    describe_txc);
+}
+
+/// Final stage for a txc in STATE_FINISHING.
+///
+/// Finishes shared-blob writes and queues removed collections for reaping.
+/// Then, under osr->qlock, marks the txc DONE and pops every DONE txc from
+/// the head of the sequencer queue.  The popped txcs have their allocations
+/// released and are deleted after the lock is dropped.  May also call
+/// deferred_try_submit() and remove an empty zombie sequencer from
+/// zombie_osr_set.
+void BlueStore::_txc_finish(TransContext *txc)
+{
+  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+  ceph_assert(txc->get_state() == TransContext::STATE_FINISHING);
+
+  for (auto& sb : txc->shared_blobs_written) {
+    sb->finish_write(txc->seq);
+  }
+  txc->shared_blobs_written.clear();
+
+  while (!txc->removed_collections.empty()) {
+    _queue_reap_collection(txc->removed_collections.front());
+    txc->removed_collections.pop_front();
+  }
+
+  // hold our own reference to the sequencer; txc is deleted further down
+  OpSequencerRef osr = txc->osr;
+  bool empty = false;
+  bool submit_deferred = false;
+  OpSequencer::q_list_t releasing_txc;
+  {
+    std::lock_guard l(osr->qlock);
+    txc->set_state(TransContext::STATE_DONE);
+    bool notify = false;
+    while (!osr->q.empty()) {
+      // NOTE: this local shadows the txc parameter; it is the queue head
+      TransContext *txc = &osr->q.front();
+      dout(20) << __func__ << "  txc " << txc << " " << txc->get_state_name()
+	       << dendl;
+      if (txc->get_state() != TransContext::STATE_DONE) {
+	if (txc->get_state() == TransContext::STATE_PREPARE &&
+	  deferred_aggressive) {
+	  // for _osr_drain_preceding()
+          notify = true;
+	}
+	if (txc->get_state() == TransContext::STATE_DEFERRED_QUEUED &&
+	    osr->q.size() > g_conf()->bluestore_max_deferred_txc) {
+	  submit_deferred = true;
+	}
+        break;
+      }
+
+      osr->q.pop_front();
+      releasing_txc.push_back(*txc);
+    }
+
+    if (osr->q.empty()) {
+      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+      empty = true;
+    }
+
+    // only drain()/drain_preceding() need wakeup,
+    // other cases use kv_submitted_waiters
+    if (notify || empty) {
+      osr->qcond.notify_all();
+    }
+  }
+
+  while (!releasing_txc.empty()) {
+    // release to allocator only after all preceding txc's have also
+    // finished any deferred writes that potentially land in these
+    // blocks
+    // NOTE: shadows the txc parameter again; this is the txc being freed
+    auto txc = &releasing_txc.front();
+    _txc_release_alloc(txc);
+    releasing_txc.pop_front();
+    throttle.log_state_latency(*txc, logger, l_bluestore_state_done_lat);
+    throttle.complete(*txc);
+    delete txc;
+  }
+
+  if (submit_deferred) {
+    // we're pinning memory; flush!  we could be more fine-grained here but
+    // i'm not sure it's worth the bother.
+    deferred_try_submit();
+  }
+
+  if (empty && osr->zombie) {
+    std::lock_guard l(zombie_osr_lock);
+    if (zombie_osr_set.erase(osr->cid)) {
+      dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+    } else {
+      dout(10) << __func__ << " empty zombie osr " << osr << " already reaped"
+	       << dendl;
+    }
+  }
+}
+
+/// Give the txc's released extents back and clear its extent sets.
+///
+/// Unless bluestore_debug_no_reuse_blocks is set, the released extents are
+/// offered to bdev->try_discard().  If that returns false they are released
+/// to the allocator right here; otherwise the release is expected to happen
+/// from the discard callback.  In every case txc->allocated and
+/// txc->released are cleared.
+///
+/// NOTE(review): the original comment stated that the caller is expected to
+/// hold lazy_release_lock; nothing in this function checks that -- confirm.
+void BlueStore::_txc_release_alloc(TransContext *txc)
+{
+  if (!unlikely(cct->_conf->bluestore_debug_no_reuse_blocks)) {
+    const bool discard_queued = bdev->try_discard(txc->released);
+    // if async discard succeeded, will do alloc->release when discard callback
+    // else we should release here
+    if (!discard_queued) {
+      dout(10) << __func__ << "(sync) " << txc << " " << std::hex
+               << txc->released << std::dec << dendl;
+      alloc->release(txc->released);
+    }
+  }
+
+  txc->allocated.clear();
+  txc->released.clear();
+}
+
+/// Attach an OpSequencer to collection c.
+///
+/// Preference order: the sequencer of an existing collection with the same
+/// cid in coll_map, then a zombie sequencer registered for that cid (which
+/// is removed from zombie_osr_set and un-zombied), and only then a freshly
+/// created one.
+void BlueStore::_osr_attach(Collection *c)
+{
+  // note: caller has coll_lock
+  auto existing = coll_map.find(c->cid);
+  if (existing != coll_map.end()) {
+    c->osr = existing->second->osr;
+    ldout(cct, 10) << __func__ << " " << c->cid
+                   << " reusing osr " << c->osr << " from existing coll "
+                   << existing->second << dendl;
+    return;
+  }
+
+  std::lock_guard l(zombie_osr_lock);
+  auto zombie = zombie_osr_set.find(c->cid);
+  if (zombie != zombie_osr_set.end()) {
+    c->osr = zombie->second;
+    zombie_osr_set.erase(zombie);
+    ldout(cct, 10) << __func__ << " " << c->cid
+                   << " resurrecting zombie osr " << c->osr << dendl;
+    c->osr->zombie = false;
+    return;
+  }
+
+  c->osr = ceph::make_ref<OpSequencer>(this, next_sequencer_id++, c->cid);
+  ldout(cct, 10) << __func__ << " " << c->cid
+                 << " fresh osr " << c->osr << dendl;
+}
+
+/// Flag osr as a zombie and record it in zombie_osr_set under its cid.
+///
+/// Asserts that the cid was either not registered yet or already maps to
+/// this very sequencer.
+void BlueStore::_osr_register_zombie(OpSequencer *osr)
+{
+  std::lock_guard l(zombie_osr_lock);
+  dout(10) << __func__ << " " << osr << " " << osr->cid << dendl;
+  osr->zombie = true;
+  auto [slot, inserted] = zombie_osr_set.emplace(osr->cid, osr);
+  // this is either a new insertion or the same osr is already there
+  ceph_assert(inserted || slot->second == osr);
+}
+
+/// Wait for everything queued ahead of txc on its sequencer.
+///
+/// Raises deferred_aggressive for the duration, submits pending deferred
+/// work on the sequencer if none is running, wakes the kv sync thread and
+/// then calls osr->drain_preceding(txc).
+void BlueStore::_osr_drain_preceding(TransContext *txc)
+{
+  OpSequencer *sequencer = txc->osr.get();
+  dout(10) << __func__ << " " << txc << " osr " << sequencer << dendl;
+  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+
+  // submit anything pending; _deferred_submit_unlock() drops deferred_lock
+  sequencer->deferred_lock.lock();
+  if (!sequencer->deferred_pending || sequencer->deferred_running) {
+    sequencer->deferred_lock.unlock();
+  } else {
+    _deferred_submit_unlock(sequencer);
+  }
+
+  {
+    // wake up any previously finished deferred events
+    std::lock_guard l(kv_lock);
+    if (!kv_sync_in_progress) {
+      kv_sync_in_progress = true;
+      kv_cond.notify_one();
+    }
+  }
+
+  sequencer->drain_preceding(txc);
+  --deferred_aggressive;
+  dout(10) << __func__ << " " << sequencer << " done" << dendl;
+}
+
+/// Wait for all transactions queued on a sequencer.
+///
+/// Raises deferred_aggressive for the duration, submits pending deferred
+/// work on the sequencer if none is running, wakes the kv sync thread and
+/// then calls osr->drain().
+void BlueStore::_osr_drain(OpSequencer *osr)
+{
+  dout(10) << __func__ << " " << osr << dendl;
+  ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
+
+  // submit anything pending; _deferred_submit_unlock() drops deferred_lock
+  osr->deferred_lock.lock();
+  if (!osr->deferred_pending || osr->deferred_running) {
+    osr->deferred_lock.unlock();
+  } else {
+    _deferred_submit_unlock(osr);
+  }
+
+  {
+    // wake up any previously finished deferred events
+    std::lock_guard l(kv_lock);
+    if (!kv_sync_in_progress) {
+      kv_sync_in_progress = true;
+      kv_cond.notify_one();
+    }
+  }
+
+  osr->drain();
+  --deferred_aggressive;
+  dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
+/// Drain every sequencer known to the store.
+///
+/// Collects the sequencers of all collections in coll_map plus all zombie
+/// sequencers, nudges the deferred, kv sync and kv finalize machinery, calls
+/// drain() on each sequencer, and finally removes the zombies seen at the
+/// start from zombie_osr_set (unless they were already reaped or
+/// resurrected in the meantime).
+void BlueStore::_osr_drain_all()
+{
+  dout(10) << __func__ << dendl;
+
+  // snapshot the sequencers first so no map lock is held while draining
+  set<OpSequencerRef> s;
+  vector<OpSequencerRef> zombies;
+  {
+    std::shared_lock l(coll_lock);
+    for (auto& i : coll_map) {
+      s.insert(i.second->osr);
+    }
+  }
+  {
+    std::lock_guard l(zombie_osr_lock);
+    for (auto& i : zombie_osr_set) {
+      s.insert(i.second);
+      zombies.push_back(i.second);
+    }
+  }
+  dout(20) << __func__ << " osr_set " << s << dendl;
+
+  ++deferred_aggressive;
+  {
+    // submit anything pending
+    deferred_try_submit();
+  }
+  {
+    // wake up any previously finished deferred events
+    std::lock_guard l(kv_lock);
+    kv_cond.notify_one();
+  }
+  {
+    std::lock_guard l(kv_finalize_lock);
+    kv_finalize_cond.notify_one();
+  }
+  for (auto osr : s) {
+    dout(20) << __func__ << " drain " << osr << dendl;
+    osr->drain();
+  }
+  --deferred_aggressive;
+
+  {
+    // clean up the zombies captured above; each may have been reaped or
+    // resurrected by someone else while we were draining
+    std::lock_guard l(zombie_osr_lock);
+    for (auto& osr : zombies) {
+      if (zombie_osr_set.erase(osr->cid)) {
+	dout(10) << __func__ << " reaping empty zombie osr " << osr << dendl;
+	ceph_assert(osr->q.empty());
+      } else if (osr->zombie) {
+	dout(10) << __func__ << " empty zombie osr " << osr
+		 << " already reaped" << dendl;
+	ceph_assert(osr->q.empty());
+      } else {
+	dout(10) << __func__ << " empty zombie osr " << osr
+		 << " resurrected" << dendl;
+      }
+    }
+  }
+
+  dout(10) << __func__ << " done" << dendl;
+}
+
+
+/// Start the finisher, then create the kv sync thread ("bstore_kv_sync")
+/// and the kv finalize thread ("bstore_kv_final").
+void BlueStore::_kv_start()
+{
+  dout(10) << __func__ << dendl;
+
+  finisher.start();
+  kv_sync_thread.create("bstore_kv_sync");
+  kv_finalize_thread.create("bstore_kv_final");
+}
+
+/// Stop the kv sync and kv finalize threads and the finisher.
+///
+/// For each thread: wait until it has reported itself started, set its stop
+/// flag and wake it.  After both threads are joined the stop flags are
+/// cleared again, and the finisher is drained and stopped.
+void BlueStore::_kv_stop()
+{
+  dout(10) << __func__ << dendl;
+  {
+    std::unique_lock l{kv_lock};
+    // do not request a stop before the thread has actually come up
+    kv_cond.wait(l, [this] { return kv_sync_started; });
+    kv_stop = true;
+    kv_cond.notify_all();
+  }
+  {
+    std::unique_lock l{kv_finalize_lock};
+    kv_finalize_cond.wait(l, [this] { return kv_finalize_started; });
+    kv_finalize_stop = true;
+    kv_finalize_cond.notify_all();
+  }
+
+  kv_sync_thread.join();
+  kv_finalize_thread.join();
+  ceph_assert(removed_collections.empty());
+
+  // clear the stop flags now that both threads are gone
+  {
+    std::lock_guard l(kv_lock);
+    kv_stop = false;
+  }
+  {
+    std::lock_guard l(kv_finalize_lock);
+    kv_finalize_stop = false;
+  }
+
+  dout(10) << __func__ << " stopping finishers" << dendl;
+  finisher.wait_for_empty();
+  finisher.stop();
+  dout(10) << __func__ << " stopped" << dendl;
+}
+
+/// Body of the kv sync thread.
+///
+/// Loops until kv_stop is set and there is nothing left to do.  Each busy
+/// iteration takes the queued txcs and deferred batches (under kv_lock),
+/// optionally flushes the block device, submits any txc still in
+/// STATE_KV_QUEUED, commits one final synchronous transaction (synct) and
+/// hands the results to the kv finalize thread.  kv_lock is held while
+/// sleeping/inspecting the queues and dropped while doing the actual work.
+void BlueStore::_kv_sync_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
+  std::unique_lock l{kv_lock};
+  ceph_assert(!kv_sync_started);
+  kv_sync_started = true;
+  kv_cond.notify_all();
+
+  // bookkeeping for the periodic utilization log line
+  auto t0 = mono_clock::now();
+  timespan twait = ceph::make_timespan(0);
+  size_t kv_submitted = 0;
+
+  while (true) {
+    auto period = cct->_conf->bluestore_kv_sync_util_logging_s;
+    auto observation_period =
+      ceph::make_timespan(period);
+    auto elapsed = mono_clock::now() - t0;
+    if (period && elapsed >= observation_period) {
+      dout(5) << __func__ << " utilization: idle "
+	      << twait << " of " << elapsed
+	      << ", submitted: " << kv_submitted
+	      <<dendl;
+      t0 = mono_clock::now();
+      twait = ceph::make_timespan(0);
+      kv_submitted = 0;
+    }
+    ceph_assert(kv_committing.empty());
+    // sleep unless there are txcs queued, or deferred batches to process
+    // while deferred_aggressive is set
+    if (kv_queue.empty() &&
+	((deferred_done_queue.empty() && deferred_stable_queue.empty()) ||
+	 !deferred_aggressive)) {
+      if (kv_stop)
+	break;
+      dout(20) << __func__ << " sleep" << dendl;
+      auto t = mono_clock::now();
+      kv_sync_in_progress = false;
+      kv_cond.wait(l);
+      twait += mono_clock::now() - t;
+
+      dout(20) << __func__ << " wake" << dendl;
+    } else {
+      deque<TransContext*> kv_submitting;
+      deque<DeferredBatch*> deferred_done, deferred_stable;
+      uint64_t aios = 0, costs = 0;
+
+      dout(20) << __func__ << " committing " << kv_queue.size()
+	       << " submitting " << kv_queue_unsubmitted.size()
+	       << " deferred done " << deferred_done_queue.size()
+	       << " stable " << deferred_stable_queue.size()
+	       << dendl;
+      // grab all queued work so kv_lock can be dropped while we process it
+      kv_committing.swap(kv_queue);
+      kv_submitting.swap(kv_queue_unsubmitted);
+      deferred_done.swap(deferred_done_queue);
+      deferred_stable.swap(deferred_stable_queue);
+      aios = kv_ios;
+      costs = kv_throttle_costs;
+      kv_ios = 0;
+      kv_throttle_costs = 0;
+      l.unlock();
+
+      dout(30) << __func__ << " committing " << kv_committing << dendl;
+      dout(30) << __func__ << " submitting " << kv_submitting << dendl;
+      dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
+      dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+      auto start = mono_clock::now();
+
+      bool force_flush = false;
+      // if bluefs is sharing the same device as data (only), then we
+      // can rely on the bluefs commit to flush the device and make
+      // deferred aios stable.  that means that if we do have done deferred
+      // txcs AND we are not on a single device, we need to force a flush.
+      if (bluefs && bluefs_layout.single_shared_device()) {
+	if (aios) {
+	  force_flush = true;
+	} else if (kv_committing.empty() && deferred_stable.empty()) {
+	  force_flush = true;  // there's nothing else to commit!
+	} else if (deferred_aggressive) {
+	  force_flush = true;
+	}
+      } else {
+      	if (aios || !deferred_done.empty()) {
+	  force_flush = true;
+      	} else {
+	  dout(20) << __func__ << " skipping flush (no aios, no deferred_done)" << dendl;
+      	}
+      }
+
+      if (force_flush) {
+	dout(20) << __func__ << " num_aios=" << aios
+		 << " force_flush=" << (int)force_flush
+		 << ", flushing, deferred done->stable" << dendl;
+	// flush/barrier on block device
+	bdev->flush();
+
+        // if we flush then deferred done are now deferred stable
+        if (deferred_stable.empty()) {
+          deferred_stable.swap(deferred_done);
+        } else {
+          deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
+                                 deferred_done.end());
+          deferred_done.clear();
+        }
+      }
+      auto after_flush = mono_clock::now();
+
+      // we will use one final transaction to force a sync
+      KeyValueDB::Transaction synct = db->get_transaction();
+
+      // increase {nid,blobid}_max?  note that this covers both the
+      // case where we are approaching the max and the case we passed
+      // it.  in either case, we increase the max in the earlier txn
+      // we submit.
+      uint64_t new_nid_max = 0, new_blobid_max = 0;
+      if (nid_last + cct->_conf->bluestore_nid_prealloc/2 > nid_max) {
+	KeyValueDB::Transaction t =
+	  kv_submitting.empty() ? synct : kv_submitting.front()->t;
+	new_nid_max = nid_last + cct->_conf->bluestore_nid_prealloc;
+	bufferlist bl;
+	encode(new_nid_max, bl);
+	t->set(PREFIX_SUPER, "nid_max", bl);
+	dout(10) << __func__ << " new_nid_max " << new_nid_max << dendl;
+      }
+      if (blobid_last + cct->_conf->bluestore_blobid_prealloc/2 > blobid_max) {
+	KeyValueDB::Transaction t =
+	  kv_submitting.empty() ? synct : kv_submitting.front()->t;
+	new_blobid_max = blobid_last + cct->_conf->bluestore_blobid_prealloc;
+	bufferlist bl;
+	encode(new_blobid_max, bl);
+	t->set(PREFIX_SUPER, "blobid_max", bl);
+	dout(10) << __func__ << " new_blobid_max " << new_blobid_max << dendl;
+      }
+
+      // submit every txc that was not already submitted by its own thread
+      for (auto txc : kv_committing) {
+	throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_queued_lat);
+	if (txc->get_state() == TransContext::STATE_KV_QUEUED) {
+	  ++kv_submitted;
+	  _txc_apply_kv(txc, false);
+	  --txc->osr->kv_committing_serially;
+	} else {
+	  ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+	}
+	if (txc->had_ios) {
+	  --txc->osr->txc_with_unstable_io;
+	}
+      }
+
+      // release throttle *before* we commit.  this allows new ops
+      // to be prepared and enter pipeline while we are waiting on
+      // the kv commit sync/flush.  then hopefully on the next
+      // iteration there will already be ops awake.  otherwise, we
+      // end up going to sleep, and then wake up when the very first
+      // transaction is ready for commit.
+      throttle.release_kv_throttle(costs);
+
+      // cleanup sync deferred keys
+      for (auto b : deferred_stable) {
+	for (auto& txc : b->txcs) {
+	  bluestore_deferred_transaction_t& wt = *txc.deferred_txn;
+	  ceph_assert(wt.released.empty()); // only kraken did this
+	  string key;
+	  get_deferred_key(wt.seq, &key);
+	  synct->rm_single_key(PREFIX_DEFERRED, key);
+	}
+      }
+
+#if defined(WITH_LTTNG)
+      auto sync_start = mono_clock::now();
+#endif
+      // submit synct synchronously (block and wait for it to commit)
+      int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct);
+      ceph_assert(r == 0);
+
+#ifdef WITH_BLKIN
+      for (auto txc : kv_committing) {
+        if (txc->trace) {
+          txc->trace.event("db sync submit");
+          txc->trace.keyval("kv_committing size", kv_committing.size());
+        }
+      }
+#endif
+
+      // remember the sizes for the log line below; the queues are handed
+      // off (and emptied) in the next block
+      int committing_size = kv_committing.size();
+      int deferred_size = deferred_stable.size();
+
+#if defined(WITH_LTTNG)
+      double sync_latency = ceph::to_seconds<double>(mono_clock::now() - sync_start);
+      for (auto txc: kv_committing) {
+	if (txc->tracing) {
+	  tracepoint(
+	    bluestore,
+	    transaction_kv_sync_latency,
+	    txc->osr->get_sequencer_id(),
+	    txc->seq,
+	    kv_committing.size(),
+	    deferred_done.size(),
+	    deferred_stable.size(),
+	    sync_latency);
+	}
+      }
+#endif
+
+      {
+	// hand the committed txcs and stable deferred batches to the
+	// finalize thread, appending if it still has unprocessed entries
+	std::unique_lock m{kv_finalize_lock};
+	if (kv_committing_to_finalize.empty()) {
+	  kv_committing_to_finalize.swap(kv_committing);
+	} else {
+	  kv_committing_to_finalize.insert(
+	      kv_committing_to_finalize.end(),
+	      kv_committing.begin(),
+	      kv_committing.end());
+	  kv_committing.clear();
+	}
+	if (deferred_stable_to_finalize.empty()) {
+	  deferred_stable_to_finalize.swap(deferred_stable);
+	} else {
+	  deferred_stable_to_finalize.insert(
+	      deferred_stable_to_finalize.end(),
+	      deferred_stable.begin(),
+	      deferred_stable.end());
+	  deferred_stable.clear();
+	}
+	if (!kv_finalize_in_progress) {
+	  kv_finalize_in_progress = true;
+	  kv_finalize_cond.notify_one();
+	}
+      }
+
+      // the new maxima take effect only after the sync commit above
+      if (new_nid_max) {
+	nid_max = new_nid_max;
+	dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+      }
+      if (new_blobid_max) {
+	blobid_max = new_blobid_max;
+	dout(10) << __func__ << " blobid_max now " << blobid_max << dendl;
+      }
+
+      {
+	auto finish = mono_clock::now();
+	ceph::timespan dur_flush = after_flush - start;
+	ceph::timespan dur_kv = finish - after_flush;
+	ceph::timespan dur = finish - start;
+	dout(20) << __func__ << " committed " << committing_size
+	  << " cleaned " << deferred_size
+	  << " in " << dur
+	  << " (" << dur_flush << " flush + " << dur_kv << " kv commit)"
+	  << dendl;
+	log_latency("kv_flush",
+	  l_bluestore_kv_flush_lat,
+	  dur_flush,
+	  cct->_conf->bluestore_log_op_age);
+	log_latency("kv_commit",
+	  l_bluestore_kv_commit_lat,
+	  dur_kv,
+	  cct->_conf->bluestore_log_op_age);
+	log_latency("kv_sync",
+	  l_bluestore_kv_sync_lat,
+	  dur,
+	  cct->_conf->bluestore_log_op_age);
+      }
+
+      l.lock();
+      // previously deferred "done" are now "stable" by virtue of this
+      // commit cycle.
+      deferred_stable_queue.swap(deferred_done);
+    }
+  }
+  dout(10) << __func__ << " finish" << dendl;
+  kv_sync_started = false;
+}
+
+/// Body of the kv finalize thread.
+///
+/// Waits on kv_finalize_cond for work handed over in
+/// kv_committing_to_finalize / deferred_stable_to_finalize.  With
+/// kv_finalize_lock dropped it then advances each txc via
+/// _txc_state_proc(), deletes the processed deferred batches, possibly
+/// submits more deferred work, reaps collections and updates the
+/// fragmentation perf counter.  Exits when kv_finalize_stop is set and both
+/// queues are empty.
+void BlueStore::_kv_finalize_thread()
+{
+  deque<TransContext*> kv_committed;
+  deque<DeferredBatch*> deferred_stable;
+  dout(10) << __func__ << " start" << dendl;
+  std::unique_lock l(kv_finalize_lock);
+  ceph_assert(!kv_finalize_started);
+  kv_finalize_started = true;
+  kv_finalize_cond.notify_all();
+  while (true) {
+    ceph_assert(kv_committed.empty());
+    ceph_assert(deferred_stable.empty());
+    if (kv_committing_to_finalize.empty() &&
+	deferred_stable_to_finalize.empty()) {
+      if (kv_finalize_stop)
+	break;
+      dout(20) << __func__ << " sleep" << dendl;
+      kv_finalize_in_progress = false;
+      kv_finalize_cond.wait(l);
+      dout(20) << __func__ << " wake" << dendl;
+    } else {
+      // take the pending work and process it without holding the lock
+      kv_committed.swap(kv_committing_to_finalize);
+      deferred_stable.swap(deferred_stable_to_finalize);
+      l.unlock();
+      dout(20) << __func__ << " kv_committed " << kv_committed << dendl;
+      dout(20) << __func__ << " deferred_stable " << deferred_stable << dendl;
+
+      auto start = mono_clock::now();
+
+      while (!kv_committed.empty()) {
+	TransContext *txc = kv_committed.front();
+	ceph_assert(txc->get_state() == TransContext::STATE_KV_SUBMITTED);
+	_txc_state_proc(txc);
+	kv_committed.pop_front();
+      }
+
+      for (auto b : deferred_stable) {
+	auto p = b->txcs.begin();
+	while (p != b->txcs.end()) {
+	  TransContext *txc = &*p;
+	  p = b->txcs.erase(p); // unlink here because
+	  _txc_state_proc(txc); // this may destroy txc
+	}
+	delete b;
+      }
+      deferred_stable.clear();
+
+      // when deferred_aggressive is set, skip the batch-size / throttle
+      // driven deferred submission
+      if (!deferred_aggressive) {
+	if (deferred_queue_size >= deferred_batch_ops.load() ||
+	    throttle.should_submit_deferred()) {
+	  deferred_try_submit();
+	}
+      }
+
+      // this is as good a place as any ...
+      _reap_collections();
+
+      // the counter stores get_fragmentation() scaled by 1000
+      logger->set(l_bluestore_fragmentation,
+	  (uint64_t)(alloc->get_fragmentation() * 1000));
+
+      log_latency("kv_final",
+	l_bluestore_kv_final_lat,
+	mono_clock::now() - start,
+	cct->_conf->bluestore_log_op_age);
+
+      l.lock();
+    }
+  }
+  dout(10) << __func__ << " finish" << dendl;
+  kv_finalize_started = false;
+}
+
+#ifdef HAVE_LIBZBD
+/// Create the zoned cleaner thread ("bstore_zcleaner").
+void BlueStore::_zoned_cleaner_start()
+{
+  dout(10) << __func__ << dendl;
+  zoned_cleaner_thread.create("bstore_zcleaner");
+}
+
+/// Stop the zoned cleaner thread.
+///
+/// Waits until the thread has reported itself started, sets the stop flag,
+/// wakes the thread, joins it and then clears the stop flag again.
+void BlueStore::_zoned_cleaner_stop()
+{
+  dout(10) << __func__ << dendl;
+  {
+    std::unique_lock l{zoned_cleaner_lock};
+    // do not request a stop before the thread has actually come up
+    zoned_cleaner_cond.wait(l, [this] { return zoned_cleaner_started; });
+    zoned_cleaner_stop = true;
+    zoned_cleaner_cond.notify_all();
+  }
+
+  zoned_cleaner_thread.join();
+
+  {
+    std::lock_guard l{zoned_cleaner_lock};
+    zoned_cleaner_stop = false;
+  }
+  dout(10) << __func__ << " done" << dendl;
+}
+
+// Main loop of the zoned cleaner thread.
+//
+// Announces itself via zoned_cleaner_started, then repeatedly asks the
+// ZonedAllocator for a zone worth cleaning.  A picked zone is cleaned with
+// zoned_cleaner_lock dropped; when no zone qualifies the thread either
+// exits (if zoned_cleaner_stop is set) or sleeps for
+// bluestore_cleaner_sleep_interval on zoned_cleaner_cond.
+void BlueStore::_zoned_cleaner_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  std::unique_lock locker{zoned_cleaner_lock};
+  ceph_assert(!zoned_cleaner_started);
+  zoned_cleaner_started = true;
+  zoned_cleaner_cond.notify_all();
+  auto zalloc = dynamic_cast<ZonedAllocator*>(alloc);
+  ceph_assert(zalloc);
+  auto zfm = dynamic_cast<ZonedFreelistManager*>(fm);
+  ceph_assert(zfm);
+  for (;;) {
+    // thresholds to trigger cleaning
+    // FIXME
+    float min_score = .05;                // score: bytes saved / bytes moved
+    uint64_t min_saved = zone_size / 32;  // min bytes saved to consider cleaning
+    auto victim = zalloc->pick_zone_to_clean(min_score, min_saved);
+    if (victim >= 0) {
+      // clean without holding the cleaner lock
+      locker.unlock();
+      zalloc->set_cleaning_zone(victim);
+      _zoned_clean_zone(victim, zalloc, zfm);
+      zalloc->clear_cleaning_zone(victim);
+      locker.lock();
+      continue;
+    }
+
+    // nothing to clean right now
+    if (zoned_cleaner_stop) {
+      break;
+    }
+    auto period = ceph::make_timespan(cct->_conf->bluestore_cleaner_sleep_interval);
+    dout(20) << __func__ << " sleep for " << period << dendl;
+    zoned_cleaner_cond.wait_for(locker, period);
+    dout(20) << __func__ << " wake" << dendl;
+  }
+  dout(10) << __func__ << " finish" << dendl;
+  zoned_cleaner_started = false;
+}
+
+// Clean a single zone.
+//
+// Walks the PREFIX_ZONED_CL_INFO keys that refer to `zone`, calling
+// _clean_some() for every referenced object.  If the allocator then
+// reports no live bytes in the zone, all sequencers are drained, the
+// device zone is reset, the freelist manager is told the zone is free and
+// the allocator is finally allowed to reuse it.  If live bytes remain the
+// function logs an error and returns without resetting anything.
+void BlueStore::_zoned_clean_zone(
+  uint64_t zone,
+  ZonedAllocator *a,
+  ZonedFreelistManager *f
+  )
+{
+  dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl;
+
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO);
+  std::string first_key;
+  get_zone_offset_object_key(zone, 0, ghobject_t(), &first_key);
+  for (it->lower_bound(first_key); it->valid(); it->next()) {
+    string raw_key = it->key();
+    uint32_t ref_zone;
+    uint64_t ref_offset;
+    ghobject_t ref_oid;
+    if (get_key_zone_offset_object(raw_key, &ref_zone, &ref_offset, &ref_oid) < 0) {
+      // undecodable key: report it and move on to the next one
+      derr << __func__ << " failed to decode zone ref " << pretty_binary_string(raw_key)
+           << dendl;
+      continue;
+    }
+    if (ref_zone != zone) {
+      dout(10) << __func__ << " reached end of zone refs" << dendl;
+      break;
+    }
+    dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << ref_offset
+             << std::dec << " " << ref_oid << dendl;
+    _clean_some(ref_oid, zone);
+  }
+
+  if (a->get_live_bytes(zone) > 0) {
+    derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone)
+         << " live bytes" << std::dec << dendl;
+    // should we do something else here to avoid a live-lock in the event of a problem?
+    return;
+  }
+
+  // everything that was rewritten must be committed before the old copy
+  // in the cleaned zone is destroyed
+  _osr_drain_all();
+
+  // reset the device zone
+  dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl;
+  bdev->reset_zone(zone);
+
+  // persist that the zone is writable again
+  f->mark_zone_to_clean_free(zone, db);
+  bdev->flush();
+
+  // and only then hand it back to the allocator
+  dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec
+           << dendl;
+  a->reset_zone(zone);
+}
+
+// Rewrite every logical extent of `oid` whose blob has at least one valid
+// physical extent inside `zone`.
+//
+// The object's collection is looked up from the oid; the function returns
+// quietly if the collection or the onode cannot be found, or if the object
+// does not reference the zone.  Otherwise the affected ranges are read
+// back and written again through a freshly created transaction context.
+void BlueStore::_clean_some(ghobject_t oid, uint32_t zone)
+{
+  dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec
+           << dendl;
+
+  CollectionRef cref = _get_collection_by_oid(oid);
+  if (!cref) {
+    dout(10) << __func__ << " can't find collection for " << oid << dendl;
+    return;
+  }
+  Collection *coll = cref.get();
+
+  // serialize io dispatch vs other transactions
+  std::lock_guard submit_locker(atomic_alloc_and_submit_lock);
+  std::unique_lock coll_locker(coll->lock);
+
+  auto onode = coll->get_onode(oid, false);
+  if (!onode) {
+    dout(10) << __func__ << " can't find " << oid << dendl;
+    return;
+  }
+
+  onode->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+  _dump_onode<30>(cct, *onode);
+
+  // NOTE: This rewrite strategy is naive.  A shared blob is duplicated
+  // once per object that references it, so cloned/snapshotted objects
+  // blow up their utilization.  RGW workloads do not care, but for RBD
+  // and CephFS this is unacceptable, and "archival" workloads on SMR are
+  // perfectly reasonable for CephFS and (possibly/probably) RBD.
+  //
+  // Eventually this has to be replaced by something smarter that moves
+  // a shared blob exactly once and repoints every referencing object.
+
+  // true if any valid physical extent of the lextent's blob lies in `zone`
+  auto blob_in_zone = [&](const auto& lextent) -> bool {
+    for (auto& pext : lextent.blob->get_blob().get_extents()) {
+      if (!pext.is_valid()) {
+        continue;
+      }
+      uint32_t pext_zone = pext.offset / zone_size;
+      if (pext_zone == zone) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // logical offset -> length of every range that has to be rewritten
+  map<uint32_t, uint32_t> to_move;
+  for (auto& lextent : onode->extent_map.extent_map) {
+    if (blob_in_zone(lextent)) {
+      to_move[lextent.logical_offset] = lextent.length;
+    }
+  }
+  if (to_move.empty()) {
+    dout(10) << __func__ << " no references to zone 0x" << std::hex << zone
+             << std::dec << " from " << oid << dendl;
+    return;
+  }
+
+  dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move
+           << std::dec << dendl;
+  OpSequencer *osr = coll->osr.get();
+  TransContext *txc = _txc_create(coll, osr, nullptr);
+
+  spg_t pgid;
+  if (coll->cid.is_pg(&pgid)) {
+    txc->osd_pool_id = pgid.pool();
+  }
+
+  for (auto& [offset, length] : to_move) {
+    bufferlist data;
+    int r = _do_read(coll, onode, offset, length, data, 0);
+    ceph_assert(r == (int)length);
+
+    r = _do_write(txc, cref, onode, offset, length, data, 0);
+    ceph_assert(r >= 0);
+  }
+  txc->write_onode(onode);
+
+  _txc_write_nodes(txc, txc->t);
+  _txc_finalize_kv(txc, txc->t);
+  _txc_state_proc(txc);
+}
+#endif
+
+// Append a fresh deferred op to txc's deferred transaction and return a
+// pointer to it.  The deferred transaction is allocated on first use.
+// `len` is only used to account the issued deferred write bytes.
+bluestore_deferred_op_t *BlueStore::_get_deferred_op(
+  TransContext *txc, uint64_t len)
+{
+  if (!txc->deferred_txn) {
+    txc->deferred_txn = new bluestore_deferred_transaction_t;
+  }
+  auto& pending_ops = txc->deferred_txn->ops;
+  pending_ops.emplace_back();
+  logger->inc(l_bluestore_issued_deferred_writes);
+  logger->inc(l_bluestore_issued_deferred_write_bytes, len);
+  return &pending_ops.back();
+}
+
+// Add txc's deferred writes to the pending DeferredBatch of its sequencer.
+//
+// A new batch is created if the sequencer has none pending.  The
+// sequencer is linked into deferred_queue when this txc is the first one
+// of the batch and nothing is currently running.  In aggressive mode the
+// batch is submitted right away (which also releases the sequencer's
+// deferred_lock); otherwise the lock is released here.
+void BlueStore::_deferred_queue(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
+
+  OpSequencer *seq = txc->osr.get();
+  seq->deferred_lock.lock();
+
+  DeferredBatch *batch = seq->deferred_pending;
+  if (!batch) {
+    batch = new DeferredBatch(cct, seq);
+  }
+
+  batch->txcs.push_back(*txc);
+  bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
+  for (const auto& op : wt.ops) {
+    ceph_assert(op.op == bluestore_deferred_op_t::OP_WRITE);
+    // a single cursor walks op.data across all of the op's extents
+    bufferlist::const_iterator p = op.data.begin();
+    for (auto e : op.extents) {
+      batch->prepare_write(cct, wt.seq, e.offset, e.length, p);
+    }
+  }
+
+  ++deferred_queue_size;
+  seq->deferred_pending = batch;
+  // exactly one txc in the batch means deferred_pending was empty before,
+  // so the sequencer still has to be put on deferred_queue.
+  if (!seq->deferred_running && (batch->txcs.size() == 1)) {
+    deferred_lock.lock();
+    deferred_queue.push_back(*seq);
+    deferred_lock.unlock();
+  }
+
+  if (deferred_aggressive && !seq->deferred_running) {
+    _deferred_submit_unlock(seq);
+  } else {
+    seq->deferred_lock.unlock();
+  }
+}
+
+// Submit the pending deferred batch of every queued sequencer that has one
+// and is not already running a batch, then record the submission time.
+//
+// The queue is snapshotted into a vector of references under
+// deferred_lock so that the per-sequencer work happens without it.
+void BlueStore::deferred_try_submit()
+{
+  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
+           << deferred_queue_size << " txcs" << dendl;
+  vector<OpSequencerRef> osrs;
+
+  {
+    std::lock_guard l(deferred_lock);
+    osrs.reserve(deferred_queue.size());
+    for (auto& queued : deferred_queue) {
+      osrs.push_back(&queued);
+    }
+  }
+
+  for (auto& osr : osrs) {
+    osr->deferred_lock.lock();
+    if (!osr->deferred_pending) {
+      osr->deferred_lock.unlock();
+      dout(20) << __func__ << "  osr " << osr << " has no pending" << dendl;
+      continue;
+    }
+    if (osr->deferred_running) {
+      osr->deferred_lock.unlock();
+      dout(20) << __func__ << "  osr " << osr << " already has running"
+               << dendl;
+      continue;
+    }
+    // releases osr->deferred_lock
+    _deferred_submit_unlock(osr.get());
+  }
+
+  {
+    std::lock_guard l(deferred_lock);
+    deferred_last_submitted = ceph_clock_now();
+  }
+}
+
+// Promote osr's pending deferred batch to "running" and submit its writes.
+//
+// Expects osr->deferred_lock to be held on entry (callers in this file
+// lock it before calling); the lock is released here before any I/O is
+// issued.  Requires a pending batch and no running batch.
+//
+// The batch's iomap is walked in key order and entries whose offsets are
+// contiguous are merged into a single aio_write.
+void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
+{
+  // check the preconditions before the log line dereferences
+  // deferred_pending, so a violation trips the assert, not a null deref
+  ceph_assert(osr->deferred_pending);
+  ceph_assert(!osr->deferred_running);
+  dout(10) << __func__ << " osr " << osr
+	   << " " << osr->deferred_pending->iomap.size() << " ios pending "
+	   << dendl;
+
+  auto b = osr->deferred_pending;
+  deferred_queue_size -= b->seq_bytes.size();
+  ceph_assert(deferred_queue_size >= 0);
+
+  osr->deferred_running = osr->deferred_pending;
+  osr->deferred_pending = nullptr;
+
+  osr->deferred_lock.unlock();
+
+  for (auto& txc : b->txcs) {
+    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_queued_lat);
+  }
+  // [start, pos) is the contiguous run accumulated in bl so far
+  uint64_t start = 0, pos = 0;
+  bufferlist bl;
+  auto i = b->iomap.begin();
+  while (true) {
+    // flush the accumulated run at the end of the map or at a gap
+    if (i == b->iomap.end() || i->first != pos) {
+      if (bl.length()) {
+	dout(20) << __func__ << " write 0x" << std::hex
+		 << start << "~" << bl.length()
+		 << " crc " << bl.crc32c(-1) << std::dec << dendl;
+	if (!g_conf()->bluestore_debug_omit_block_device_write) {
+	  logger->inc(l_bluestore_submitted_deferred_writes);
+	  logger->inc(l_bluestore_submitted_deferred_write_bytes, bl.length());
+	  int r = bdev->aio_write(start, bl, &b->ioc, false);
+	  ceph_assert(r == 0);
+	}
+      }
+      if (i == b->iomap.end()) {
+	break;
+      }
+      // begin a new run at this entry's offset
+      start = 0;
+      pos = i->first;
+      bl.clear();
+    }
+    dout(20) << __func__ << "   seq " << i->second.seq << " 0x"
+	     << std::hex << pos << "~" << i->second.bl.length() << std::dec
+	     << dendl;
+    if (!bl.length()) {
+      start = pos;
+    }
+    pos += i->second.bl.length();
+    bl.claim_append(i->second.bl);
+    ++i;
+  }
+
+  bdev->aio_submit(&b->ioc);
+}
+
+// Completion context that calls BlueStore::deferred_try_submit() on the
+// given store when finished; the completion code is ignored.
+struct C_DeferredTrySubmit : public Context {
+  BlueStore *store;
+  explicit C_DeferredTrySubmit(BlueStore *s) : store(s) {}
+  void finish(int r) override {
+    store->deferred_try_submit();
+  }
+};
+
+// Handle completion of osr's running deferred batch.
+//
+// Clears deferred_running; if nothing else is pending the sequencer is
+// removed from deferred_queue, otherwise (in aggressive mode) another
+// deferred_try_submit() is queued on the finisher.  Every txc of the batch
+// moves to STATE_DEFERRED_CLEANUP, its cost is released from the deferred
+// throttle, and the batch is appended to deferred_done_queue.
+void BlueStore::_deferred_aio_finish(OpSequencer *osr)
+{
+  dout(10) << __func__ << " osr " << osr << dendl;
+  ceph_assert(osr->deferred_running);
+  DeferredBatch *batch = osr->deferred_running;
+
+  osr->deferred_lock.lock();
+  ceph_assert(osr->deferred_running == batch);
+  osr->deferred_running = nullptr;
+  if (osr->deferred_pending) {
+    osr->deferred_lock.unlock();
+    if (deferred_aggressive) {
+      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
+      finisher.queue(new C_DeferredTrySubmit(this));
+    } else {
+      dout(20) << __func__ << " leaving queued, more pending" << dendl;
+    }
+  } else {
+    dout(20) << __func__ << " dequeueing" << dendl;
+    {
+      std::lock_guard queue_locker(deferred_lock);
+      deferred_queue.erase(deferred_queue.iterator_to(*osr));
+    }
+    osr->deferred_lock.unlock();
+  }
+
+  // advance the txcs and give their cost back to the deferred throttle
+  uint64_t costs = 0;
+  for (auto& txc : batch->txcs) {
+    throttle.log_state_latency(txc, logger, l_bluestore_state_deferred_aio_wait_lat);
+    txc.set_state(TransContext::STATE_DEFERRED_CLEANUP);
+    costs += txc.cost;
+  }
+  throttle.release_deferred_throttle(costs);
+
+  {
+    std::lock_guard l(kv_lock);
+    deferred_done_queue.emplace_back(batch);
+
+    // normally the kv thread is left alone; it picks this up on its next
+    // commit anyway.
+    if (deferred_aggressive && !kv_sync_in_progress) {
+      kv_sync_in_progress = true;
+      kv_cond.notify_one();
+    }
+  }
+}
+
+// Replay every deferred transaction stored under PREFIX_DEFERRED.
+//
+// Each record is decoded, stripped of extents that now belong to BlueFS
+// (see _eliminate_outdated_deferred) and, if anything is left, pushed
+// through the txc state machine starting at STATE_KV_DONE.  A temporary
+// meta collection is used when the real one does not exist yet.
+//
+// Returns 0 on success or -EIO if a record fails to decode; in both cases
+// the sequencer is drained before returning.
+int BlueStore::_deferred_replay()
+{
+  dout(10) << __func__ << " start" << dendl;
+  int count = 0;
+  int r = 0;
+  interval_set<uint64_t> bluefs_extents;
+  if (bluefs) {
+    bluefs->foreach_block_extents(
+      bluefs_layout.shared_bdev,
+      [&] (uint64_t start, uint32_t len) {
+        bluefs_extents.insert(start, len);
+      }
+    );
+  }
+  CollectionRef ch = _get_collection(coll_t::meta());
+  const bool fake_ch = !ch;
+  if (fake_ch) {
+    // hmm, replaying initial mkfs?
+    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
+  }
+  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
+  for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
+    dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
+             << dendl;
+    auto *deferred_txn = new bluestore_deferred_transaction_t;
+    bufferlist bl = it->value();
+    auto p = bl.cbegin();
+    try {
+      decode(*deferred_txn, p);
+    } catch (ceph::buffer::error& e) {
+      derr << __func__ << " failed to decode deferred txn "
+           << pretty_binary_string(it->key()) << dendl;
+      delete deferred_txn;
+      r = -EIO;
+      // stop replaying; the drain below still runs
+      break;
+    }
+    if (!_eliminate_outdated_deferred(deferred_txn, bluefs_extents)) {
+      // nothing left to write for this record
+      delete deferred_txn;
+      continue;
+    }
+    TransContext *txc = _txc_create(ch.get(), osr,  nullptr);
+    txc->deferred_txn = deferred_txn;
+    txc->set_state(TransContext::STATE_KV_DONE);
+    _txc_state_proc(txc);
+  }
+
+  dout(20) << __func__ << " draining osr" << dendl;
+  _osr_register_zombie(osr);
+  _osr_drain_all();
+  if (fake_ch) {
+    new_coll_map.clear();
+  }
+  dout(10) << __func__ << " completed " << count << " events" << dendl;
+  return r;
+}
+
+// Remove from every op of `deferred_txn` the parts that overlap
+// `bluefs_extents`, trimming the op's data to match.  Ops left without
+// any extent are erased.
+//
+// Returns true if at least one op with at least one extent remains.
+bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
+					     interval_set<uint64_t>& bluefs_extents)
+{
+  bool has_some = false;
+  dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl;
+  for (auto op = deferred_txn->ops.begin(); op != deferred_txn->ops.end(); ) {
+    // Each op is a pair of data/extents.  Extents that belong to bluefs
+    // are dropped together with the matching slice of data, e.g.:
+    // +------------+---------------+---------------+---------------+
+    // | data       | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff |
+    // | extent     | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 |
+    // | in bluefs? |       no      |      yes      |       no      |
+    // +------------+---------------+---------------+---------------+
+    // becomes:
+    // +------------+---------------+---------------+
+    // | data       | aaaaaaaabbbbb | ddddeeeeeefff |
+    // | extent     | 40000 - 44000 | 58000 - 60000 |
+    // +------------+---------------+---------------+
+    PExtentVector kept_extents;
+    ceph::buffer::list kept_data;
+    uint32_t consumed = 0; // where the current extent starts inside op->data
+    dout(30) << __func__ << " input extents: " << op->extents << dendl;
+    for (auto& ext : op->extents) {
+      interval_set<uint64_t> remaining;
+      remaining.insert(ext.offset, ext.length);
+
+      // first bluefs interval that may overlap ext
+      auto bfs = bluefs_extents.lower_bound(ext.offset);
+      if (bfs != bluefs_extents.begin()) {
+        --bfs;
+        if (bfs.get_end() <= ext.offset) {
+          ++bfs;
+        }
+      }
+      for (; bfs != bluefs_extents.end() &&
+             bfs.get_start() < ext.offset + ext.length; ++bfs) {
+        // interval_set asserts when asked to erase something that is not
+        // there, so (remaining - bfs) is done as ((remaining + bfs) - bfs).
+        remaining.union_insert(bfs.get_start(), bfs.get_len());
+        remaining.erase(bfs.get_start(), bfs.get_len());
+      }
+      // 'remaining' now holds the parts of ext that bluefs does not use;
+      // copy the matching slices of op->data over
+      for (auto keep = remaining.begin(); keep != remaining.end(); ++keep) {
+        ceph::buffer::list piece;
+        // 'consumed' is ext's position in op->data and
+        // 'keep.get_start() - ext.offset' the position of keep inside ext
+        piece.substr_of(op->data,
+                        consumed + (keep.get_start() - ext.offset),
+                        keep.get_len());
+        kept_data.claim_append(piece);
+        kept_extents.emplace_back(
+          bluestore_pextent_t(keep.get_start(), keep.get_len()));
+      }
+      consumed += ext.length;
+    }
+    dout(30) << __func__ << " output extents: " << kept_extents << dendl;
+    if (op->data.length() != kept_data.length()) {
+      dout(10) << __func__ << " trimmed deferred extents: " << op->extents << "->" << kept_extents << dendl;
+    }
+    if (kept_extents.empty()) {
+      op = deferred_txn->ops.erase(op);
+      continue;
+    }
+    has_some = true;
+    std::swap(op->extents, kept_extents);
+    std::swap(op->data, kept_data);
+    ++op;
+  }
+  return has_some;
+}
+
+// ---------------------------
+// transactions
+
+// Queue a list of transactions against collection `ch`.
+//
+// Builds one TransContext out of all transactions in `tls`, journals any
+// deferred writes, passes the throttle (switching to aggressive deferred
+// submission if it would otherwise block on it) and starts the txc state
+// machine.  on_applied_sync contexts are completed inline; on_applied
+// contexts go to the collection's commit_queue if it has one, else to the
+// finisher.  Always returns 0.
+int BlueStore::queue_transactions(
+  CollectionHandle& ch,
+  vector<Transaction>& tls,
+  TrackedOpRef op,
+  ThreadPool::TPHandle *handle)
+{
+  FUNCTRACE(cct);
+  list<Context *> on_applied, on_commit, on_applied_sync;
+  ObjectStore::Transaction::collect_contexts(
+    tls, &on_applied, &on_commit, &on_applied_sync);
+
+  auto start = mono_clock::now();
+
+  Collection *c = static_cast<Collection*>(ch.get());
+  OpSequencer *osr = c->osr.get();
+  dout(10) << __func__ << " ch " << c << " " << c->cid << dendl;
+
+  // On HM-SMR drives (and ZNS SSDs) allocation and submission have to
+  // happen atomically: if I/O is submitted in a different order than it
+  // was allocated, the drive sees non-sequential writes.  This is a
+  // stop-gap until ZONE APPEND support matures in the kernel, see
+  // https://www.usenix.org/conference/vault20/presentation/bjorling
+  if (bdev->is_smr()) {
+    atomic_alloc_and_submit_lock.lock();
+  }
+
+  // prepare
+  TransContext *txc = _txc_create(c, osr, &on_commit, op);
+
+  for (auto& t : tls) {
+    txc->bytes += t.get_num_bytes();
+    _txc_add_transaction(txc, &t);
+  }
+  _txc_calc_cost(txc);
+
+  _txc_write_nodes(txc, txc->t);
+
+  // journal deferred items
+  if (txc->deferred_txn) {
+    txc->deferred_txn->seq = ++deferred_seq;
+    bufferlist encoded;
+    encode(*txc->deferred_txn, encoded);
+    string key;
+    get_deferred_key(txc->deferred_txn->seq, &key);
+    txc->t->set(PREFIX_DEFERRED, key, encoded);
+  }
+
+  _txc_finalize_kv(txc, txc->t);
+
+#ifdef WITH_BLKIN
+  if (txc->trace) {
+    txc->trace.event("txc encode finished");
+  }
+#endif
+
+  if (handle) {
+    handle->suspend_tp_timeout();
+  }
+
+  auto tstart = mono_clock::now();
+
+  if (!throttle.try_start_transaction(*db, *txc, tstart)) {
+    // ensure we do not block here because of deferred writes
+    dout(10) << __func__ << " failed get throttle_deferred_bytes, aggressive"
+             << dendl;
+    ++deferred_aggressive;
+    deferred_try_submit();
+    {
+      // wake up any previously finished deferred events
+      std::lock_guard l(kv_lock);
+      if (!kv_sync_in_progress) {
+        kv_sync_in_progress = true;
+        kv_cond.notify_one();
+      }
+    }
+    throttle.finish_start_transaction(*db, *txc, tstart);
+    --deferred_aggressive;
+  }
+  auto tend = mono_clock::now();
+
+  if (handle) {
+    handle->reset_tp_timeout();
+  }
+
+  logger->inc(l_bluestore_txc);
+
+  // execute (start)
+  _txc_state_proc(txc);
+
+  if (bdev->is_smr()) {
+    atomic_alloc_and_submit_lock.unlock();
+  }
+
+  // we're immediately readable (unlike FileStore)
+  for (auto ctx : on_applied_sync) {
+    ctx->complete(0);
+  }
+  if (!on_applied.empty()) {
+    if (c->commit_queue) {
+      c->commit_queue->queue(on_applied);
+    } else {
+      finisher.queue(on_applied);
+    }
+  }
+
+#ifdef WITH_BLKIN
+  if (txc->trace) {
+    txc->trace.event("txc applied");
+  }
+#endif
+
+  log_latency("submit_transact",
+    l_bluestore_submit_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  log_latency("throttle_transact",
+    l_bluestore_throttle_lat,
+    tend - tstart,
+    cct->_conf->bluestore_log_op_age);
+  return 0;
+}
+
+// Hand the txc's I/O context (txc->ioc) to the block device for submission.
+void BlueStore::_txc_aio_submit(TransContext *txc)
+{
+  dout(10) << __func__ << " txc " << txc << dendl;
+  bdev->aio_submit(&txc->ioc);
+}
+
+// Decode every op of transaction `t` and apply it to `txc`.
+//
+// Each op is dispatched in two stages: collection-level ops are handled by
+// the first switch (successful ones `continue` to the next op), everything
+// else is treated as an object op and handled by the second switch with
+// the collection lock held.  Any error that is not explicitly tolerated
+// dumps the transaction and aborts the process.
+void BlueStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+  Transaction::iterator i = t->begin();
+
+  _dump_transaction<30>(cct, t);
+
+  // resolve every collection the transaction refers to up front; an entry
+  // may be null (OP_MKCOLL asserts that it is)
+  vector<CollectionRef> cvec(i.colls.size());
+  unsigned j = 0;
+  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+       ++p, ++j) {
+    cvec[j] = _get_collection(*p);
+  }
+  
+  // onodes are looked up lazily, on first use by an op, and cached here
+  vector<OnodeRef> ovec(i.objects.size());
+
+  for (int pos = 0; i.have_op(); ++pos) {
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+
+    // no coll or obj
+    if (op->op == Transaction::OP_NOP)
+      continue;
+
+
+    // collection operations
+    CollectionRef &c = cvec[op->cid];
+
+    // initialize osd_pool_id and do a smoke test that all collections belong
+    // to the same pool
+    spg_t pgid;
+    if (!!c ? c->cid.is_pg(&pgid) : false) {
+      ceph_assert(txc->osd_pool_id == META_POOL_ID ||
+                  txc->osd_pool_id == pgid.pool());
+      txc->osd_pool_id = pgid.pool();
+    }
+
+    // collection-level ops: on success (r == 0) move on to the next op;
+    // a negative r is fatal (see the check right after the switch)
+    switch (op->op) {
+    case Transaction::OP_RMCOLL:
+      {
+        const coll_t &cid = i.get_cid(op->cid);
+	r = _remove_collection(txc, cid, &c);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+	ceph_assert(!c);
+	const coll_t &cid = i.get_cid(op->cid);
+	r = _create_collection(txc, cid, op->split_bits, &c);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        uint32_t bits = op->split_bits;
+	r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        // the hint payload is decoded (and logged) but otherwise ignored
+        uint32_t type = op->hint;
+        bufferlist hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          dout(10) << __func__ << " collection hint objects is a no-op, "
+		   << " pg_num " << pg_num << " num_objects " << num_objs
+		   << dendl;
+        } else {
+          // Ignore the hint
+          dout(10) << __func__ << " unknown collection hint " << type << dendl;
+        }
+	continue;
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      ceph_abort_msg("not implemented");
+      break;
+    }
+    // any failure of a collection op is fatal
+    if (r < 0) {
+      derr << __func__ << " error " << cpp_strerror(r)
+           << " not handled on operation " << op->op
+           << " (op " << pos << ", counting from 0)" << dendl;
+      _dump_transaction<0>(cct, t);
+      ceph_abort_msg("unexpected error");
+    }
+
+    // these operations implicitly create the object
+    bool create = false;
+    if (op->op == Transaction::OP_TOUCH ||
+	op->op == Transaction::OP_CREATE ||
+	op->op == Transaction::OP_WRITE ||
+	op->op == Transaction::OP_ZERO) {
+      create = true;
+    }
+
+    // object operations
+    std::unique_lock l(c->lock);
+    OnodeRef &o = ovec[op->oid];
+    if (!o) {
+      ghobject_t oid = i.get_oid(op->oid);
+      o = c->get_onode(oid, create, op->op == Transaction::OP_CREATE);
+    }
+    // a non-creating op on a missing object yields ENOENT; whether that is
+    // tolerated is decided at endop
+    if (!create && (!o || !o->exists)) {
+      dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+	       << i.get_oid(op->oid) << dendl;
+      r = -ENOENT;
+      goto endop;
+    }
+
+    switch (op->op) {
+    case Transaction::OP_CREATE:
+    case Transaction::OP_TOUCH:
+      r = _touch(txc, c, o);
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	uint32_t fadvise_flags = i.get_fadvise_flags();
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _write(txc, c, o, off, len, bl, fadvise_flags);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	r = _zero(txc, c, o, off, len);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        uint64_t off = op->off;
+	r = _truncate(txc, c, o, off);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+	r = _remove(txc, c, o);
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        string name = i.decode_string();
+        bufferptr bp;
+        i.decode_bp(bp);
+	r = _setattr(txc, c, o, name, bp);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        map<string, bufferptr> aset;
+        i.decode_attrset(aset);
+	r = _setattrs(txc, c, o, aset);
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+	string name = i.decode_string();
+	r = _rmattr(txc, c, o, name);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+	r = _rmattrs(txc, c, o);
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+	// the clone destination is created if it does not exist yet
+	OnodeRef& no = ovec[op->dest_oid];
+	if (!no) {
+          const ghobject_t& noid = i.get_oid(op->dest_oid);
+	  no = c->get_onode(noid, true);
+	}
+	r = _clone(txc, c, o, no);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+	OnodeRef& no = ovec[op->dest_oid];
+	if (!no) {
+	  const ghobject_t& noid = i.get_oid(op->dest_oid);
+	  no = c->get_onode(noid, true);
+	}
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      ceph_abort_msg("not implemented");
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+      ceph_abort_msg("not implemented");
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+    case Transaction::OP_TRY_RENAME:
+      {
+	// renames are only supported within a single collection; the
+	// destination onode is looked up without creating it
+	ceph_assert(op->cid == op->dest_cid);
+	const ghobject_t& noid = i.get_oid(op->dest_oid);
+	OnodeRef& no = ovec[op->dest_oid];
+	if (!no) {
+	  no = c->get_onode(noid, false);
+	}
+	r = _rename(txc, c, o, no, noid);
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+	r = _omap_clear(txc, c, o);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+	bufferlist aset_bl;
+        i.decode_attrset_bl(&aset_bl);
+	r = _omap_setkeys(txc, c, o, aset_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+	bufferlist keys_bl;
+        i.decode_keyset_bl(&keys_bl);
+	r = _omap_rmkeys(txc, c, o, keys_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+	r = _omap_rmkey_range(txc, c, o, first, last);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _omap_setheader(txc, c, o, bl);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+	r = _set_alloc_hint(txc, c, o,
+			    op->expected_object_size,
+			    op->expected_write_size,
+			    op->hint);
+      }
+      break;
+
+    default:
+      derr << __func__ << " bad op " << op->op << dendl;
+      ceph_abort();
+    }
+
+  endop:
+    // error policy: -ENODATA is always tolerated, -ENOENT is tolerated
+    // except for the ops listed below; everything else aborts
+    if (r < 0) {
+      bool ok = false;
+
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+			    op->op == Transaction::OP_CLONE ||
+			    op->op == Transaction::OP_CLONERANGE2 ||
+			    op->op == Transaction::OP_COLL_ADD ||
+			    op->op == Transaction::OP_SETATTR ||
+			    op->op == Transaction::OP_SETATTRS ||
+			    op->op == Transaction::OP_RMATTR ||
+			    op->op == Transaction::OP_OMAP_SETKEYS ||
+			    op->op == Transaction::OP_OMAP_RMKEYS ||
+			    op->op == Transaction::OP_OMAP_RMKEYRANGE ||
+			    op->op == Transaction::OP_OMAP_SETHEADER))
+	// -ENOENT is usually okay
+	ok = true;
+      if (r == -ENODATA)
+	ok = true;
+
+      if (!ok) {
+	// pick the most specific diagnostic for the log before aborting
+	const char *msg = "unexpected error code";
+
+	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+			     op->op == Transaction::OP_CLONE ||
+			     op->op == Transaction::OP_CLONERANGE2))
+	  msg = "ENOENT on clone suggests osd bug";
+
+	if (r == -ENOSPC)
+	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+	  // by partially applying transactions.
+	  msg = "ENOSPC from bluestore, misconfigured cluster";
+
+	if (r == -ENOTEMPTY) {
+	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
+	}
+
+        derr << __func__ << " error " << cpp_strerror(r)
+             << " not handled on operation " << op->op
+             << " (op " << pos << ", counting from 0)"
+             << dendl;
+        derr << msg << dendl;
+        _dump_transaction<0>(cct, t);
+	ceph_abort_msg("unexpected error");
+      }
+    }
+  }
+}
+
+
+
+// -----------------
+// write operations
+
+// Touch object `o`: assign it a nid via _assign_nid() and record the onode
+// as dirty in `txc`.  Always returns 0.
+int BlueStore::_touch(TransContext *txc,
+		      CollectionRef& c,
+		      OnodeRef& o)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  _assign_nid(txc, o);
+  txc->write_onode(o);
+  const int r = 0;  // there is no failure path here
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Zero-pad *bl in place so that the range it covers starts and ends on a
+// chunk_size boundary.
+//
+// @param bl          data to pad; replaced by the padded buffer
+// @param offset      in/out: offset of the data; lowered by the amount of
+//                    front padding that was added
+// @param chunk_size  alignment unit
+//
+// The total number of padding bytes is added to
+// l_bluestore_write_pad_bytes.
+void BlueStore::_pad_zeros(
+  bufferlist *bl, uint64_t *offset,
+  uint64_t chunk_size)
+{
+  auto length = bl->length();
+  dout(30) << __func__ << " 0x" << std::hex << *offset << "~" << length
+	   << " chunk_size 0x" << chunk_size << std::dec << dendl;
+  dout(40) << "before:\n";
+  bl->hexdump(*_dout);
+  *_dout << dendl;
+  // front
+  size_t front_pad = *offset % chunk_size;
+  size_t back_pad = 0;
+  size_t pad_count = 0;
+  if (front_pad) {
+    // build the first chunk: front_pad zero bytes followed by as much of
+    // the data as fits into the rest of the chunk
+    size_t front_copy = std::min<uint64_t>(chunk_size - front_pad, length);
+    bufferptr z = ceph::buffer::create_small_page_aligned(chunk_size);
+    z.zero(0, front_pad, false);
+    pad_count += front_pad;
+    bl->begin().copy(front_copy, z.c_str() + front_pad);
+    if (front_copy + front_pad < chunk_size) {
+      // the data ends inside this first chunk, so the tail of the same
+      // chunk is zeroed here as well
+      back_pad = chunk_size - (length + front_pad);
+      z.zero(front_pad + length, back_pad, false);
+      pad_count += back_pad;
+    }
+    // new buffer = padded first chunk + whatever data did not fit into it
+    bufferlist old, t;
+    old.swap(*bl);
+    t.substr_of(old, front_copy, length - front_copy);
+    bl->append(z);
+    bl->claim_append(t);
+    *offset -= front_pad;
+    length += pad_count;
+  }
+
+  // back
+  uint64_t end = *offset + length;
+  unsigned back_copy = end % chunk_size;
+  if (back_copy) {
+    // replace the trailing partial chunk with a full chunk holding the
+    // last back_copy data bytes followed by zeros
+    ceph_assert(back_pad == 0);
+    back_pad = chunk_size - back_copy;
+    ceph_assert(back_copy <= length);
+    bufferptr tail(chunk_size);
+    bl->begin(length - back_copy).copy(back_copy, tail.c_str());
+    tail.zero(back_copy, back_pad, false);
+    bufferlist old;
+    old.swap(*bl);
+    bl->substr_of(old, 0, length - back_copy);
+    bl->append(tail);
+    length += back_pad;
+    pad_count += back_pad;
+  }
+  dout(20) << __func__ << " pad 0x" << std::hex << front_pad << " + 0x"
+	   << back_pad << " on front/back, now 0x" << *offset << "~"
+	   << length << std::dec << dendl;
+  dout(40) << "after:\n";
+  bl->hexdump(*_dout);
+  *_dout << dendl;
+  if (pad_count)
+    logger->inc(l_bluestore_write_pad_bytes, pad_count);
+  // the tracked length must agree with what the buffer now holds
+  ceph_assert(bl->length() == length);
+}
+
+void BlueStore::_do_write_small(  // place one write shorter than min_alloc_size
+    TransContext *txc,
+    CollectionRef &c,
+    OnodeRef& o,
+    uint64_t offset, uint64_t length,  // logical range being written
+    bufferlist::iterator& blp,  // source of the `length` payload bytes
+    WriteContext *wctx)  // receives old extents, GC ranges and wctx->write() requests
+{
+  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+	   << std::dec << dendl;
+  ceph_assert(length < min_alloc_size);  // this path only takes sub-allocation-unit writes
+
+  uint64_t end_offs = offset + length;
+
+  logger->inc(l_bluestore_write_small);
+  logger->inc(l_bluestore_write_small_bytes, length);
+
+  bufferlist bl;
+  blp.copy(length, bl);  // take this write's payload from the caller's iterator
+
+  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+  auto min_off = offset >= max_bsize ? offset - max_bsize : 0;  // search window start, clamped at 0
+  uint32_t alloc_len = min_alloc_size;
+  auto offset0 = p2align<uint64_t>(offset, alloc_len);  // offset aligned to alloc_len
+
+  bool any_change;  // set when either search direction advanced in the current round
+
+  // search suitable extent in both forward and reverse direction in
+  // [offset - target_max_blob_size, offset + target_max_blob_size] range
+  // then check if blob can be reused via can_reuse_blob func or apply
+  // direct/deferred write (the latter for extents including or higher
+  // than 'offset' only).
+  o->extent_map.fault_range(db, min_off, offset + max_bsize - min_off);
+
+#ifdef HAVE_LIBZBD
+  // On zoned devices, the first goal is to support non-overwrite workloads,
+  // such as RGW, with large, aligned objects.  Therefore, for user writes
+  // _do_write_small should not trigger.  OSDs, however, write and update a tiny
+  // amount of metadata, such as OSD maps, to disk.  For those cases, we
+  // temporarily just pad them to min_alloc_size and write them to a new place
+  // on every update.
+  if (bdev->is_smr()) {
+    uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);  // offset within its allocation unit
+    uint64_t b_off0 = b_off;  // _pad_zeros may move this one down
+    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+    // Zero detection -- small block
+    if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+      BlobRef b = c->new_blob();
+      _pad_zeros(&bl, &b_off0, min_alloc_size);
+      wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, false, true);
+    } else { // zero detection is on and bl.is_zero(): only counters are updated
+      dout(20) << __func__ << " skip small zero block " << std::hex
+        << " (0x" << b_off0 << "~" << bl.length() << ")"
+        << " (0x" << b_off << "~" << length << ")"
+        << std::dec << dendl;
+      logger->inc(l_bluestore_write_small_skipped);
+      logger->inc(l_bluestore_write_small_skipped_bytes, length);
+    }
+
+    return;
+  }
+#endif
+
+  // Look for an existing mutable blob we can use.
+  auto begin = o->extent_map.extent_map.begin();
+  auto end = o->extent_map.extent_map.end();
+  auto ep = o->extent_map.seek_lextent(offset);
+  if (ep != begin) {  // start one extent earlier if that extent's blob reaches past offset
+    --ep;
+    if (ep->blob_end() <= offset) {
+      ++ep;  // it does not: undo the step back
+    }
+  }
+  auto prev_ep = end;  // 'end' here means "no backward candidate"
+  if (ep != begin) {
+    prev_ep = ep;
+    --prev_ep;
+  }
+
+  boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
+  // We don't want to have more blobs than min alloc units fit
+  // into 2 max blobs
+  size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
+  bool above_blob_threshold = false;
+
+  inspected_blobs.reserve(blob_threshold);
+
+  uint64_t max_off = 0;  // only reported in the GC log message below
+  auto start_ep = ep;
+  auto end_ep = ep; // exclusively
+  do {  // each round looks at one extent forward (ep) and one backward (prev_ep)
+    any_change = false;
+
+    if (ep != end && ep->logical_offset < offset + max_bsize) {
+      BlobRef b = ep->blob;
+      if (!above_blob_threshold) {  // track distinct blobs until the threshold trips
+	inspected_blobs.insert(&b->get_blob());
+	above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+      }
+      max_off = ep->logical_end();
+      auto bstart = ep->blob_start();
+
+      dout(20) << __func__ << " considering " << *b
+	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+      if (bstart >= end_offs) {
+	dout(20) << __func__ << " ignoring distant " << *b << dendl;
+      } else if (!b->get_blob().is_mutable()) {
+	dout(20) << __func__ << " ignoring immutable " << *b << dendl;
+      } else if (ep->logical_offset % min_alloc_size !=
+		  ep->blob_offset % min_alloc_size) {
+	dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl;
+      } else {
+	uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+	// can we pad our head/tail out with zeros?
+	uint64_t head_pad, tail_pad;
+	head_pad = p2phase(offset, chunk_size);  // gap back to the previous chunk boundary
+	tail_pad = p2nphase(end_offs, chunk_size);  // gap forward to the next chunk boundary
+	if (head_pad || tail_pad) {
+	  o->extent_map.fault_range(db, offset - head_pad,
+				    end_offs - offset + head_pad + tail_pad);
+	}
+	if (head_pad &&
+	    o->extent_map.has_any_lextents(offset - head_pad, head_pad)) {
+	  head_pad = 0;  // the gap holds extents: it must not be zero-padded
+	}
+	if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) {
+	  tail_pad = 0;  // same for the gap after the write
+	}
+
+	uint64_t b_off = offset - head_pad - bstart;  // padded start, relative to the blob
+	uint64_t b_len = length + head_pad + tail_pad;
+
+	// direct write into unused blocks of an existing mutable blob?
+	if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) &&
+	    b->get_blob().get_ondisk_length() >= b_off + b_len &&
+	    b->get_blob().is_unused(b_off, b_len) &&
+	    b->get_blob().is_allocated(b_off, b_len)) {
+	  _apply_padding(head_pad, tail_pad, bl);
+
+	  dout(20) << __func__ << "  write to unused 0x" << std::hex
+		   << b_off << "~" << b_len
+		   << " pad 0x" << head_pad << " + 0x" << tail_pad
+		   << std::dec << " of mutable " << *b << dendl;
+	  _buffer_cache_write(txc, b, b_off, bl,
+			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+	  if (!g_conf()->bluestore_debug_omit_block_device_write) {
+	    if (b_len < prefer_deferred_size) {
+	      dout(20) << __func__ << " deferring small 0x" << std::hex
+		       << b_len << std::dec << " unused write via deferred" << dendl;
+	      bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+	      op->op = bluestore_deferred_op_t::OP_WRITE;
+	      b->get_blob().map(  // NOTE(review): result unchecked here, asserted in the overwrite path below
+		b_off, b_len,
+		[&](uint64_t offset, uint64_t length) {  // shadows the outer offset/length
+		  op->extents.emplace_back(bluestore_pextent_t(offset, length));
+		  return 0;
+		});
+	      op->data = bl;  // copy: bl is still needed for calc_csum below
+	    } else {
+	      b->get_blob().map_bl(
+		b_off, bl,
+		[&](uint64_t offset, bufferlist& t) {
+		  bdev->aio_write(offset, t,
+				  &txc->ioc, wctx->buffered);
+		});
+	    }
+	  }
+	  b->dirty_blob().calc_csum(b_off, bl);
+	  dout(20) << __func__ << "  lex old " << *ep << dendl;
+	  Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length,  // b_off + head_pad == offset - bstart
+						 b,
+						 &wctx->old_extents);
+	  b->dirty_blob().mark_used(le->blob_offset, le->length);
+
+	  txc->statfs_delta.stored() += le->length;
+	  dout(20) << __func__ << "  lex " << *le << dendl;
+	  logger->inc(l_bluestore_write_small_unused);
+	  return;
+	}
+	// read some data to fill out the chunk?
+	uint64_t head_read = p2phase(b_off, chunk_size);
+	uint64_t tail_read = p2nphase(b_off + b_len, chunk_size);
+	if ((head_read || tail_read) &&
+	    (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) &&
+	    head_read + tail_read < min_alloc_size) {
+	  b_off -= head_read;  // widen the blob range to whole chunks
+	  b_len += head_read + tail_read;
+
+	} else {
+	  head_read = tail_read = 0;  // not widening: nothing will be read back
+	}
+
+	// chunk-aligned deferred overwrite?
+	if (b->get_blob().get_ondisk_length() >= b_off + b_len &&
+	    b_off % chunk_size == 0 &&
+	    b_len % chunk_size == 0 &&
+	    b->get_blob().is_allocated(b_off, b_len)) {
+
+	  _apply_padding(head_pad, tail_pad, bl);
+
+	  dout(20) << __func__ << "  reading head 0x" << std::hex << head_read
+		   << " and tail 0x" << tail_read << std::dec << dendl;
+	  if (head_read) {
+	    bufferlist head_bl;
+	    int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read,
+			     head_bl, 0);
+	    ceph_assert(r >= 0 && r <= (int)head_read);
+	    size_t zlen = head_read - r;  // short read: the shortfall is filled with zeros
+	    if (zlen) {
+	      head_bl.append_zero(zlen);
+	      logger->inc(l_bluestore_write_pad_bytes, zlen);
+	    }
+	    head_bl.claim_append(bl);
+	    bl.swap(head_bl);  // bl = head data followed by the padded payload
+	    logger->inc(l_bluestore_write_penalty_read_ops);
+	  }
+	  if (tail_read) {
+	    bufferlist tail_bl;
+	    int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read,
+			     tail_bl, 0);
+	    ceph_assert(r >= 0 && r <= (int)tail_read);
+	    size_t zlen = tail_read - r;  // short read: the shortfall is filled with zeros
+	    if (zlen) {
+	      tail_bl.append_zero(zlen);
+	      logger->inc(l_bluestore_write_pad_bytes, zlen);
+	    }
+	    bl.claim_append(tail_bl);
+	    logger->inc(l_bluestore_write_penalty_read_ops);
+	  }
+          logger->inc(l_bluestore_write_small_pre_read);
+
+	  _buffer_cache_write(txc, b, b_off, bl,
+			      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+	  b->dirty_blob().calc_csum(b_off, bl);
+
+	  if (!g_conf()->bluestore_debug_omit_block_device_write) {
+	    bluestore_deferred_op_t *op = _get_deferred_op(txc, bl.length());
+	    op->op = bluestore_deferred_op_t::OP_WRITE;
+	    int r = b->get_blob().map(
+	      b_off, b_len,
+	      [&](uint64_t offset, uint64_t length) {  // shadows the outer offset/length
+		op->extents.emplace_back(bluestore_pextent_t(offset, length));
+		return 0;
+	      });
+	    ceph_assert(r == 0);
+	    op->data = std::move(bl);  // bl is not read again on this path
+	    dout(20) << __func__ << "  deferred write 0x" << std::hex << b_off << "~"
+		     << b_len << std::dec << " of mutable " << *b
+		     << " at " << op->extents << dendl;
+	  }
+
+	  Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length,
+						 b, &wctx->old_extents);
+	  b->dirty_blob().mark_used(le->blob_offset, le->length);
+	  txc->statfs_delta.stored() += le->length;
+	  dout(20) << __func__ << "  lex " << *le << dendl;
+	  return;
+	}
+	// try to reuse blob if we can
+	if (b->can_reuse_blob(min_alloc_size,
+			      max_bsize,
+			      offset0 - bstart,
+			      &alloc_len)) {
+	  ceph_assert(alloc_len == min_alloc_size); // expecting data always
+					       // fit into reused blob
+	  // We need to check for pending writes that want to reuse the same
+	  // pextent. The rationale is that during GC two chunks from garbage
+	  // blobs (compressed?) can share logical space within the same AU.
+	  // That in turn might be caused by an unaligned len in clone_range2.
+	  // Without this check the second write would fail when it tries to
+	  // reuse the blob in do_alloc_write().
+	  if (!wctx->has_conflict(b,
+				  offset0,
+				  offset0 + alloc_len, 
+				  min_alloc_size)) {
+
+	    // head_pad/tail_pad cannot be reused here: they may have been
+	    // zeroed above because of existing extents
+	    uint64_t b_off = offset - bstart;
+	    uint64_t b_off0 = b_off;  // _pad_zeros may move this one down
+	    o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+	    // Zero detection -- small block
+	    if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+	      _pad_zeros(&bl, &b_off0, chunk_size);
+
+	      dout(20) << __func__ << " reuse blob " << *b << std::hex
+		       << " (0x" << b_off0 << "~" << bl.length() << ")"
+		       << " (0x" << b_off << "~" << length << ")"
+		       << std::dec << dendl;
+
+	      wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+		  false, false);
+	      logger->inc(l_bluestore_write_small_unused);
+	    } else { // zero detection is on and bl.is_zero(): only counters are updated
+	      dout(20) << __func__ << " skip small zero block " << std::hex
+                << " (0x" << b_off0 << "~" << bl.length() << ")"
+                << " (0x" << b_off << "~" << length << ")"
+                << std::dec << dendl;
+	      logger->inc(l_bluestore_write_small_skipped);
+	      logger->inc(l_bluestore_write_small_skipped_bytes, length);
+	    }
+
+	    return;
+	  }
+	}
+      }
+      ++ep;  // this extent did not take the write: move forward
+      end_ep = ep;
+      any_change = true;
+    } // if (ep != end && ep->logical_offset < offset + max_bsize)
+
+    // check extent for reuse in reverse order
+    if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+      BlobRef b = prev_ep->blob;
+      if (!above_blob_threshold) {
+	inspected_blobs.insert(&b->get_blob());
+	above_blob_threshold = inspected_blobs.size() >= blob_threshold;
+      }
+      start_ep = prev_ep;  // earliest extent inspected so far
+      auto bstart = prev_ep->blob_start();
+      dout(20) << __func__ << " considering " << *b
+	       << " bstart 0x" << std::hex << bstart << std::dec << dendl;
+      if (b->can_reuse_blob(min_alloc_size,
+			    max_bsize,
+                            offset0 - bstart,
+                            &alloc_len)) {
+	ceph_assert(alloc_len == min_alloc_size); // expecting data always
+					     // fit into reused blob
+	// We need to check for pending writes that want to reuse the same
+	// pextent. The rationale is that during GC two chunks from garbage
+	// blobs (compressed?) can share logical space within the same AU.
+	// That in turn might be caused by an unaligned len in clone_range2.
+	// Without this check the second write would fail when it tries to
+	// reuse the blob in do_alloc_write().
+	if (!wctx->has_conflict(b,
+				offset0,
+				offset0 + alloc_len, 
+				min_alloc_size)) {
+
+	  uint64_t b_off = offset - bstart;
+	  uint64_t b_off0 = b_off;  // _pad_zeros may move this one down
+	  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+	  // Zero detection -- small block
+	  if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+	    uint64_t chunk_size = b->get_blob().get_chunk_size(block_size);
+	    _pad_zeros(&bl, &b_off0, chunk_size);
+
+	    dout(20) << __func__ << " reuse blob " << *b << std::hex
+	      << " (0x" << b_off0 << "~" << bl.length() << ")"
+	      << " (0x" << b_off << "~" << length << ")"
+	      << std::dec << dendl;
+
+	    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+		false, false);
+	    logger->inc(l_bluestore_write_small_unused);
+	  } else { // zero detection is on and bl.is_zero(): only counters are updated
+	    dout(20) << __func__ << " skip small zero block " << std::hex
+	      << " (0x" << b_off0 << "~" << bl.length() << ")"
+	      << " (0x" << b_off << "~" << length << ")"
+	      << std::dec << dendl;
+	    logger->inc(l_bluestore_write_small_skipped);
+	    logger->inc(l_bluestore_write_small_skipped_bytes, length);
+	  }
+
+	  return;
+	}
+      } 
+      if (prev_ep != begin) {
+	--prev_ep;
+	any_change = true;
+      } else {
+	prev_ep = end; // to avoid useless first extent re-check
+      }
+    } // if (prev_ep != end && prev_ep->logical_offset >= min_off)
+  } while (any_change);
+
+  if (above_blob_threshold) {  // too many distinct blobs nearby: nominate the inspected range for GC
+    dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
+            << " " << std::hex << min_off << "~" << max_off << std::dec
+	    << dendl;
+    ceph_assert(start_ep != end_ep);
+    for (auto ep = start_ep; ep != end_ep; ++ep) {
+      dout(20) << __func__ << " inserting for GC "
+              << std::hex << ep->logical_offset << "~" << ep->length
+	      << std::dec << dendl;
+
+      wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
+    }
+    // insert newly written extent to GC
+    wctx->extents_to_gc.union_insert(offset, length);
+      dout(20) << __func__ << " inserting (last) for GC "
+              << std::hex << offset << "~" << length
+	      << std::dec << dendl;
+  }
+  uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);  // no existing blob took the write
+  uint64_t b_off0 = b_off;  // _pad_zeros may move this one down
+  o->extent_map.punch_hole(c, offset, length, &wctx->old_extents);
+
+  // Zero detection -- small block
+  if (!cct->_conf->bluestore_zero_block_detection || !bl.is_zero()) {
+    // new blob.
+    BlobRef b = c->new_blob();
+    _pad_zeros(&bl, &b_off0, block_size);
+    wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length,
+	min_alloc_size != block_size, // use 'unused' bitmap when alloc granularity
+                                      // doesn't match disk one only
+	true);
+  } else { // zero detection is on and bl.is_zero(): only counters are updated
+    dout(20) << __func__ << " skip small zero block " << std::hex
+      << " (0x" << b_off0 << "~" << bl.length() << ")"
+      << " (0x" << b_off << "~" << length << ")"
+      << std::dec << dendl;
+    logger->inc(l_bluestore_write_small_skipped);
+    logger->inc(l_bluestore_write_small_skipped_bytes, length);
+  }
+
+  return;
+}
+
+// Check whether `l` bytes at logical `offset` can go to the blob behind
+// extent `ep` as a deferred overwrite.  Once the blob passes the first test,
+// the chunk-aligned geometry is stored in this context even on a false return.
+bool BlueStore::BigDeferredWriteContext::can_defer(
+    BlueStore::extent_map_t::iterator ep, uint64_t prefer_deferred_size,
+    uint64_t block_size, uint64_t offset, uint64_t l)
+{
+  auto& target = ep->blob->get_blob();
+  // only a mutable blob that starts at or before `offset` is a candidate
+  if (offset < ep->blob_start() || !target.is_mutable()) {
+    return false;
+  }
+  off = offset;
+  b_off = offset - ep->blob_start();
+  const uint64_t chunk = target.get_chunk_size(block_size);
+  const uint64_t disk_len = target.get_ondisk_length();
+  used = std::min(l, disk_len - b_off);
+
+  // existing bytes to read back so that the range covers whole chunks
+  head_read = p2phase<uint64_t>(b_off, chunk);
+  tail_read = p2nphase<uint64_t>(b_off + used, chunk);
+  b_off -= head_read;
+  ceph_assert(b_off % chunk == 0);
+  ceph_assert(blob_aligned_len() % chunk == 0);
+
+  // refuse unless short enough, inside the on-disk blob and fully allocated
+  if (blob_aligned_len() >= prefer_deferred_size ||
+      blob_aligned_len() > disk_len ||
+      !target.is_allocated(b_off, blob_aligned_len())) {
+    return false;
+  }
+  blob_ref = ep->blob;
+  blob_start = ep->blob_start();
+  return true;
+}
+
+// Walk the aligned blob range with map(), appending each piece to
+// res_extents; any negative map() result makes this return false.
+bool BlueStore::BigDeferredWriteContext::apply_defer()
+{
+  const int rc = blob_ref->get_blob().map(
+    b_off, blob_aligned_len(),
+    [&](const bluestore_pextent_t& pext, uint64_t offset, uint64_t length) {
+      // deferring is only applied when the overwrite splits a pextent
+      const bool covers_whole_pextent =
+        !(pext.offset < offset) && !(pext.end() > offset + length);
+      if (covers_whole_pextent) {
+        return -1;  // fall back to a regular write
+      }
+      res_extents.emplace_back(bluestore_pextent_t(offset, length));
+      return 0;
+    });
+  return rc >= 0;
+}
+
+// Apply one deferred piece of a big write described by `dctx`: build the
+// chunk-aligned buffer (head read + dctx.used bytes from `blp` + tail read),
+// update buffer cache, checksum and extent map, then fill in a deferred op.
+void BlueStore::_do_write_big_apply_deferred(
+    TransContext* txc, CollectionRef& c, OnodeRef& o,
+    BlueStore::BigDeferredWriteContext& dctx,
+    bufferlist::iterator& blp, WriteContext* wctx)
+{
+  bufferlist data;
+  dout(20) << __func__ << "  reading head 0x" << std::hex << dctx.head_read
+    << " and tail 0x" << dctx.tail_read << std::dec << dendl;
+  // existing bytes in front of the new data, zero-filled if the read is short
+  if (dctx.head_read) {
+    int got = _do_read(c.get(), o, dctx.off - dctx.head_read, dctx.head_read,
+                       data, 0);
+    ceph_assert(got >= 0 && got <= (int)dctx.head_read);
+    size_t missing = dctx.head_read - got;
+    if (missing) {
+      data.append_zero(missing);
+      logger->inc(l_bluestore_write_pad_bytes, missing);
+    }
+    logger->inc(l_bluestore_write_penalty_read_ops);
+  }
+
+  // the caller's payload for this piece
+  blp.copy(dctx.used, data);
+
+  // existing bytes behind the new data, zero-filled if the read is short
+  if (dctx.tail_read) {
+    bufferlist trailer;
+    int got = _do_read(c.get(), o, dctx.off + dctx.used, dctx.tail_read,
+                       trailer, 0);
+    ceph_assert(got >= 0 && got <= (int)dctx.tail_read);
+    size_t missing = dctx.tail_read - got;
+    if (missing) {
+      trailer.append_zero(missing);
+      logger->inc(l_bluestore_write_pad_bytes, missing);
+    }
+    data.claim_append(trailer);
+    logger->inc(l_bluestore_write_penalty_read_ops);
+  }
+
+  auto& blob = dctx.blob_ref;
+  _buffer_cache_write(txc, blob, dctx.b_off, data,
+                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+  blob->dirty_blob().calc_csum(dctx.b_off, data);
+  Extent* lext = o->extent_map.set_lextent(
+    c, dctx.off, dctx.off - dctx.blob_start, dctx.used, blob,
+    &wctx->old_extents);
+  // mark_used is in fact a no-op for big writes; it is kept for uniformity
+  // and so that it does not get lost in a later refactor
+  blob->dirty_blob().mark_used(lext->blob_offset, lext->length);
+  txc->statfs_delta.stored() += lext->length;
+
+  if (g_conf()->bluestore_debug_omit_block_device_write) {
+    return;
+  }
+  bluestore_deferred_op_t* dop = _get_deferred_op(txc, data.length());
+  dop->op = bluestore_deferred_op_t::OP_WRITE;
+  dop->extents.swap(dctx.res_extents);
+  dop->data = std::move(data);
+}
+
+void BlueStore::_do_write_big(  // consume a big write piece by piece (see the loop below)
+    TransContext *txc,
+    CollectionRef &c,
+    OnodeRef& o,
+    uint64_t offset, uint64_t length,  // logical range being written
+    bufferlist::iterator& blp,  // source of the payload bytes
+    WriteContext *wctx)  // receives old extents and wctx->write() requests
+{
+  dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
+	   << " target_blob_size 0x" << wctx->target_blob_size << std::dec
+	   << " compress " << (int)wctx->compress
+	   << dendl;
+  logger->inc(l_bluestore_write_big);
+  logger->inc(l_bluestore_write_big_bytes, length);
+  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+  uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load();  // sampled once for the whole call
+  while (length > 0) {  // each round handles `l` bytes at `offset`
+    bool new_blob = false;
+    BlobRef b;  // stays null until a reusable blob is found
+    uint32_t b_off = 0;
+    uint32_t l = 0;
+
+    //attempting to reuse existing blob
+    if (!wctx->compress) {
+      // enforce target blob alignment with max_bsize
+      l = max_bsize - p2phase(offset, max_bsize);  // bytes up to the next max_bsize boundary
+      l = std::min(uint64_t(l), length);
+
+      auto end = o->extent_map.extent_map.end();
+
+      dout(20) << __func__ << " may be defer: 0x" << std::hex
+	       << offset << "~" << l
+               << std::dec << dendl;
+
+      if (prefer_deferred_size_snapshot &&
+          l <= prefer_deferred_size_snapshot * 2) {
+        // Single write that spans two adjacent existing blobs can result
+        // in up to two deferred blocks of 'prefer_deferred_size'
+        // So we're trying to minimize the amount of resulting blobs
+        // and preserve 2 blobs rather than inserting one more in between
+        // E.g. write 0x10000~20000 over existing blobs
+        // (0x0~20000 and 0x20000~20000) is better (from subsequent reading
+        // performance point of view) to result in two deferred writes to
+        // existing blobs than having 3 blobs: 0x0~10000, 0x10000~20000, 0x30000~10000
+
+        // look for an existing mutable blob we can write into
+        auto ep = o->extent_map.seek_lextent(offset);
+        auto ep_next = end;
+        BigDeferredWriteContext head_info, tail_info;
+
+        bool will_defer = ep != end ?
+          head_info.can_defer(ep,
+            prefer_deferred_size_snapshot,
+            block_size,
+            offset,
+            l) :
+          false;
+        auto offset_next = offset + head_info.used;  // NOTE(review): relies on `used` being default-initialised when can_defer() did not set it -- confirm
+        auto remaining = l - head_info.used;
+        if (will_defer && remaining) {  // the head blob cannot take it all: try one more blob
+          will_defer = false;
+          if (remaining <= prefer_deferred_size_snapshot) {
+            ep_next = o->extent_map.seek_lextent(offset_next);
+            // check if we can defer remaining totally
+            will_defer = ep_next == end ?
+              false :
+              tail_info.can_defer(ep_next,
+                prefer_deferred_size_snapshot,
+                block_size,
+                offset_next,
+                remaining);
+            will_defer = will_defer && remaining == tail_info.used;  // the tail must absorb all that is left
+          }
+        }
+        if (will_defer) {
+          dout(20) << __func__ << " " << *(head_info.blob_ref)
+            << " deferring big " << std::hex
+            << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
+            << std::dec << " write via deferred"
+            << dendl;
+          if (remaining) {
+            dout(20) << __func__ << " " << *(tail_info.blob_ref)
+              << " deferring big " << std::hex
+              << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
+              << std::dec << " write via deferred"
+              << dendl;
+          }
+
+          will_defer = head_info.apply_defer();  // may still refuse, see the messages below
+          if (!will_defer) {
+            dout(20) << __func__
+              << " deferring big fell back, head isn't continuous"
+              << dendl;
+          } else if (remaining) {
+            will_defer = tail_info.apply_defer();
+            if (!will_defer) {
+              dout(20) << __func__
+                << " deferring big fell back, tail isn't continuous"
+                << dendl;
+            }
+          }
+        }
+        if (will_defer) {
+          _do_write_big_apply_deferred(txc, c, o, head_info, blp, wctx);
+          if (remaining) {
+            _do_write_big_apply_deferred(txc, c, o, tail_info,
+              blp, wctx);
+          }
+	  dout(20) << __func__ << " defer big: 0x" << std::hex
+		   << offset << "~" << l
+		   << std::dec << dendl;
+          offset += l;
+          length -= l;
+          logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
+          logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
+          continue;  // this piece is done; go on with the next one
+        }
+      }
+      dout(20) << __func__ << " lookup for blocks to reuse..." << dendl;
+
+      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+
+      // seek again as punch_hole could invalidate ep
+      auto ep = o->extent_map.seek_lextent(offset);
+      auto begin = o->extent_map.extent_map.begin();
+      auto prev_ep = end;  // 'end' here means "no backward candidate"
+      if (ep != begin) {
+        prev_ep = ep;
+        --prev_ep;
+      }
+
+      auto min_off = offset >= max_bsize ? offset - max_bsize : 0;  // search window start, clamped at 0
+      // search suitable extent in both forward and reverse direction in
+      // [offset - target_max_blob_size, offset + target_max_blob_size] range
+      // then check if blob can be reused via can_reuse_blob func.
+      bool any_change;  // set when either direction advanced in the current round
+      do {
+	any_change = false;
+	if (ep != end && ep->logical_offset < offset + max_bsize) {
+          dout(20) << __func__ << " considering " << *ep
+                   << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl;
+
+          if (offset >= ep->blob_start() &&
+              ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+	                               offset - ep->blob_start(),
+	                               &l)) {
+	    b = ep->blob;
+            b_off = offset - ep->blob_start();
+            prev_ep = end; // to avoid check below
+	    dout(20) << __func__ << " reuse blob " << *b << std::hex
+		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+	  } else {
+	    ++ep;
+	    any_change = true;
+	  }
+	}
+
+	if (prev_ep != end && prev_ep->logical_offset >= min_off) {
+          dout(20) << __func__ << " considering rev " << *prev_ep
+                   << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl;
+          if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize,
+                                    	    offset - prev_ep->blob_start(),
+                                    	    &l)) {
+	    b = prev_ep->blob;
+	    b_off = offset - prev_ep->blob_start();
+	    dout(20) << __func__ << " reuse blob " << *b << std::hex
+		     << " (0x" << b_off << "~" << l << ")" << std::dec << dendl;
+	  } else if (prev_ep != begin) {
+	    --prev_ep;
+	    any_change = true;
+	  } else {
+	    prev_ep = end; // to avoid useless first extent re-check
+	  }
+	}
+      } while (b == nullptr && any_change);  // stop once a blob is chosen or nothing moved
+    } else {
+      // with compression, use as long a chunk as permitted.
+      l = std::min(max_bsize, length);
+      o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
+    } // if (!wctx->compress)
+
+    if (b == nullptr) {  // nothing reusable: write into a fresh blob at offset 0
+      b = c->new_blob();
+      b_off = 0;
+      new_blob = true;
+    }
+    bufferlist t;
+    blp.copy(l, t);  // take this piece's payload from the caller's iterator
+
+    // Zero detection -- big block
+    if (!cct->_conf->bluestore_zero_block_detection || !t.is_zero()) {
+      wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob);
+
+      dout(20) << __func__ << " schedule write big: 0x"
+      << std::hex << offset << "~" << l << std::dec
+      << (new_blob ? " new " : " reuse ")
+      << *b << dendl;
+
+      logger->inc(l_bluestore_write_big_blobs);
+    } else { // zero detection is on and t.is_zero(): only counters are updated
+      dout(20) << __func__ << " skip big zero block " << std::hex
+        << " (0x" << b_off << "~" << t.length() << ")"
+        << " (0x" << b_off << "~" << l << ")"
+        << std::dec << dendl;
+      logger->inc(l_bluestore_write_big_skipped_blobs);
+      logger->inc(l_bluestore_write_big_skipped_bytes, l);
+    }
+
+    offset += l;
+    length -= l;
+  }
+}
+
+int BlueStore::_do_alloc_write(
+  TransContext *txc,
+  CollectionRef coll,
+  OnodeRef& o,
+  WriteContext *wctx)
+{
+  dout(20) << __func__ << " txc " << txc
+	   << " " << wctx->writes.size() << " blobs"
+	   << dendl;
+  if (wctx->writes.empty()) {
+    return 0;
+  }
+
+  CompressorRef c;
+  double crr = 0;
+  if (wctx->compress) {
+    c = select_option(
+      "compression_algorithm",
+      compressor,
+      [&]() {
+        string val;
+        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_ALGORITHM, &val)) {
+          CompressorRef cp = compressor;
+          if (!cp || cp->get_type_name() != val) {
+            cp = Compressor::create(cct, val);
+	    if (!cp) {
+	      if (_set_compression_alert(false, val.c_str())) {
+	        derr << __func__ << " unable to initialize " << val.c_str()
+		     << " compressor" << dendl;
+	      }
+	    }
+          }
+          return std::optional<CompressorRef>(cp);
+        }
+        return std::optional<CompressorRef>();
+      }
+    );
+
+    crr = select_option(
+      "compression_required_ratio",
+      cct->_conf->bluestore_compression_required_ratio,
+      [&]() {
+        double val;
+        if (coll->pool_opts.get(pool_opts_t::COMPRESSION_REQUIRED_RATIO, &val)) {
+          return std::optional<double>(val);
+        }
+        return std::optional<double>();
+      }
+    );
+  }
+
+  // checksum
+  int64_t csum = csum_type.load();
+  csum = select_option(
+    "csum_type",
+    csum,
+    [&]() {
+      int64_t val;
+      if (coll->pool_opts.get(pool_opts_t::CSUM_TYPE, &val)) {
+        return std::optional<int64_t>(val);
+      }
+      return std::optional<int64_t>();
+    }
+  );
+
+  // compress (as needed) and calc needed space
+  uint64_t need = 0;
+  uint64_t data_size = 0;
+  // 'need' is amount of space that must be provided by allocator.
+  // 'data_size' is a size of data that will be transferred to disk.
+  // Note that data_size is always <= need. This comes from:
+  // - write to blob was unaligned, and there is free space
+  // - data has been compressed
+  //
+  // We make one decision and apply it to all blobs.
+  // All blobs will be deferred or none will.
+  // We assume that allocator does its best to provide contiguous space,
+  // and the condition is : (data_size < deferred).
+
+  auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size);
+  for (auto& wi : wctx->writes) {
+    if (c && wi.blob_length > min_alloc_size) {
+      auto start = mono_clock::now();
+
+      // compress
+      ceph_assert(wi.b_off == 0);
+      ceph_assert(wi.blob_length == wi.bl.length());
+
+      // FIXME: memory alignment here is bad
+      bufferlist t;
+      std::optional<int32_t> compressor_message;
+      int r = c->compress(wi.bl, t, compressor_message);
+      uint64_t want_len_raw = wi.blob_length * crr;
+      uint64_t want_len = p2roundup(want_len_raw, min_alloc_size);
+      bool rejected = false;
+      uint64_t compressed_len = t.length();
+      // do an approximate (fast) estimation for resulting blob size
+      // that doesn't take header overhead  into account
+      uint64_t result_len = p2roundup(compressed_len, min_alloc_size);
+      if (r == 0 && result_len <= want_len && result_len < wi.blob_length) {
+	bluestore_compression_header_t chdr;
+	chdr.type = c->get_type();
+	chdr.length = t.length();
+	chdr.compressor_message = compressor_message;
+	encode(chdr, wi.compressed_bl);
+	wi.compressed_bl.claim_append(t);
+
+	compressed_len = wi.compressed_bl.length();
+	result_len = p2roundup(compressed_len, min_alloc_size);
+	if (result_len <= want_len && result_len < wi.blob_length) {
+	  // Cool. We compressed at least as much as we were hoping to.
+	  // pad out to min_alloc_size
+	  wi.compressed_bl.append_zero(result_len - compressed_len);
+	  wi.compressed_len = compressed_len;
+	  wi.compressed = true;
+	  logger->inc(l_bluestore_write_pad_bytes, result_len - compressed_len);
+	  dout(20) << __func__ << std::hex << "  compressed 0x" << wi.blob_length
+		   << " -> 0x" << compressed_len << " => 0x" << result_len
+		   << " with " << c->get_type()
+		   << std::dec << dendl;
+	  txc->statfs_delta.compressed() += compressed_len;
+	  txc->statfs_delta.compressed_original() += wi.blob_length;
+	  txc->statfs_delta.compressed_allocated() += result_len;
+	  logger->inc(l_bluestore_compress_success_count);
+	  need += result_len;
+	  data_size += result_len;
+	} else {
+	  rejected = true;
+	}
+      } else if (r != 0) {
+	dout(5) << __func__ << std::hex << "  0x" << wi.blob_length
+		 << " bytes compressed using " << c->get_type_name()
+		 << std::dec
+		 << " failed with errcode = " << r
+		 << ", leaving uncompressed"
+		 << dendl;
+	logger->inc(l_bluestore_compress_rejected_count);
+	need += wi.blob_length;
+	data_size += wi.bl.length();
+      } else {
+	rejected = true;
+      }
+
+      if (rejected) {
+	dout(20) << __func__ << std::hex << "  0x" << wi.blob_length
+		 << " compressed to 0x" << compressed_len << " -> 0x" << result_len
+		 << " with " << c->get_type()
+		 << ", which is more than required 0x" << want_len_raw
+		 << " -> 0x" << want_len
+		 << ", leaving uncompressed"
+		 << std::dec << dendl;
+	logger->inc(l_bluestore_compress_rejected_count);
+	need += wi.blob_length;
+	data_size += wi.bl.length();
+      }
+      log_latency("compress@_do_alloc_write",
+	l_bluestore_compress_lat,
+        mono_clock::now() - start,
+	cct->_conf->bluestore_log_op_age );
+    } else {
+      need += wi.blob_length;
+      data_size += wi.bl.length();
+    }
+  }
+  PExtentVector prealloc;
+  prealloc.reserve(2 * wctx->writes.size());
+  int64_t prealloc_left = 0;
+  prealloc_left = alloc->allocate(
+    need, min_alloc_size, need,
+    0, &prealloc);
+  if (prealloc_left < 0 || prealloc_left < (int64_t)need) {
+    derr << __func__ << " failed to allocate 0x" << std::hex << need
+         << " allocated 0x " << (prealloc_left < 0 ? 0 : prealloc_left)
+         << " min_alloc_size 0x" << min_alloc_size
+         << " available 0x " << alloc->get_free()
+         << std::dec << dendl;
+    if (prealloc.size()) {
+      alloc->release(prealloc);
+    }
+    return -ENOSPC;
+  }
+  _collect_allocation_stats(need, min_alloc_size, prealloc);
+
+  dout(20) << __func__ << std::hex << " need=0x" << need << " data=0x" << data_size
+	   << " prealloc " << prealloc << dendl;
+  auto prealloc_pos = prealloc.begin();
+  ceph_assert(prealloc_pos != prealloc.end());
+
+  for (auto& wi : wctx->writes) {
+    bluestore_blob_t& dblob = wi.b->dirty_blob();
+    uint64_t b_off = wi.b_off;
+    bufferlist *l = &wi.bl;
+    uint64_t final_length = wi.blob_length;
+    uint64_t csum_length = wi.blob_length;
+    if (wi.compressed) {
+      final_length = wi.compressed_bl.length();
+      csum_length = final_length;
+      unsigned csum_order = std::countr_zero(csum_length);
+      l = &wi.compressed_bl;
+      dblob.set_compressed(wi.blob_length, wi.compressed_len);
+      if (csum != Checksummer::CSUM_NONE) {
+        dout(20) << __func__
+		 << " initialize csum setting for compressed blob " << *wi.b
+                 << " csum_type " << Checksummer::get_csum_type_string(csum)
+                 << " csum_order " << csum_order
+                 << " csum_length 0x" << std::hex << csum_length
+                 << " blob_length 0x" << wi.blob_length
+                 << " compressed_length 0x" << wi.compressed_len << std::dec
+                 << dendl;
+        dblob.init_csum(csum, csum_order, csum_length);
+      }
+    } else if (wi.new_blob) {
+      unsigned csum_order;
+      // initialize newly created blob only
+      ceph_assert(dblob.is_mutable());
+      if (l->length() != wi.blob_length) {
+        // hrm, maybe we could do better here, but let's not bother.
+        dout(20) << __func__ << " forcing csum_order to block_size_order "
+                << block_size_order << dendl;
+	csum_order = block_size_order;
+      } else {
+        csum_order = std::min<unsigned>(wctx->csum_order, std::countr_zero(l->length()));
+      }
+      // try to align blob with max_blob_size to improve
+      // its reuse ratio, e.g. in case of reverse write
+      uint32_t suggested_boff =
+       (wi.logical_offset - (wi.b_off0 - wi.b_off)) % max_bsize;
+      if ((suggested_boff % (1 << csum_order)) == 0 &&
+           suggested_boff + final_length <= max_bsize &&
+           suggested_boff > b_off) {
+        dout(20) << __func__ << " forcing blob_offset to 0x"
+                 << std::hex << suggested_boff << std::dec << dendl;
+        ceph_assert(suggested_boff >= b_off);
+        csum_length += suggested_boff - b_off;
+        b_off = suggested_boff;
+      }
+      if (csum != Checksummer::CSUM_NONE) {
+        dout(20) << __func__
+		 << " initialize csum setting for new blob " << *wi.b
+                 << " csum_type " << Checksummer::get_csum_type_string(csum)
+                 << " csum_order " << csum_order
+                 << " csum_length 0x" << std::hex << csum_length << std::dec
+                 << dendl;
+        dblob.init_csum(csum, csum_order, csum_length);
+      }
+    }
+
+    PExtentVector extents;
+    int64_t left = final_length;
+    auto prefer_deferred_size_snapshot = prefer_deferred_size.load();
+    while (left > 0) {
+      ceph_assert(prealloc_left > 0);
+      if (prealloc_pos->length <= left) {
+	prealloc_left -= prealloc_pos->length;
+	left -= prealloc_pos->length;
+	txc->statfs_delta.allocated() += prealloc_pos->length;
+	extents.push_back(*prealloc_pos);
+	++prealloc_pos;
+      } else {
+	extents.emplace_back(prealloc_pos->offset, left);
+	prealloc_pos->offset += left;
+	prealloc_pos->length -= left;
+	prealloc_left -= left;
+	txc->statfs_delta.allocated() += left;
+	left = 0;
+	break;
+      }
+    }
+    for (auto& p : extents) {
+      txc->allocated.insert(p.offset, p.length);
+    }
+    dblob.allocated(p2align(b_off, min_alloc_size), final_length, extents);
+
+    dout(20) << __func__ << " blob " << *wi.b << dendl;
+    if (dblob.has_csum()) {
+      dblob.calc_csum(b_off, *l);
+    }
+
+    if (wi.mark_unused) {
+      ceph_assert(!dblob.is_compressed());
+      auto b_end = b_off + wi.bl.length();
+      if (b_off) {
+        dblob.add_unused(0, b_off);
+      }
+      uint64_t llen = dblob.get_logical_length();
+      if (b_end < llen) {
+        dblob.add_unused(b_end, llen - b_end);
+      }
+    }
+
+    Extent *le = o->extent_map.set_lextent(coll, wi.logical_offset,
+                                           b_off + (wi.b_off0 - wi.b_off),
+                                           wi.length0,
+                                           wi.b,
+                                           nullptr);
+    wi.b->dirty_blob().mark_used(le->blob_offset, le->length);
+    txc->statfs_delta.stored() += le->length;
+    dout(20) << __func__ << "  lex " << *le << dendl;
+    _buffer_cache_write(txc, wi.b, b_off, wi.bl,
+                        wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+    // queue io
+    if (!g_conf()->bluestore_debug_omit_block_device_write) {
+      if (data_size < prefer_deferred_size_snapshot) {
+	dout(20) << __func__ << " deferring 0x" << std::hex
+		 << l->length() << std::dec << " write via deferred" << dendl;
+	bluestore_deferred_op_t *op = _get_deferred_op(txc, l->length());
+	op->op = bluestore_deferred_op_t::OP_WRITE;
+	int r = wi.b->get_blob().map(
+	  b_off, l->length(),
+	  [&](uint64_t offset, uint64_t length) {
+	    op->extents.emplace_back(bluestore_pextent_t(offset, length));
+	    return 0;
+	  });
+        ceph_assert(r == 0);
+	op->data = *l;
+      } else {
+	wi.b->get_blob().map_bl(
+	  b_off, *l,
+	  [&](uint64_t offset, bufferlist& t) {
+	    bdev->aio_write(offset, t, &txc->ioc, false);
+	  });
+	logger->inc(l_bluestore_write_new);
+      }
+    }
+  }
+  ceph_assert(prealloc_pos == prealloc.end());
+  ceph_assert(prealloc_left == 0);
+  return 0;
+}
+
+void BlueStore::_wctx_finish(  // Drain wctx->old_extents: adjust statfs, drop shared refs, queue released space.
+  TransContext *txc,  // receives statfs deltas, released extents, shared-blob writes and zone notes
+  CollectionRef& c,  // passed to load_shared_blob() and discard_unallocated()
+  OnodeRef& o,  // onode whose spanning_blob_map (and, with ZBD, zone_offset_refs) is consulted
+  WriteContext *wctx,  // old_extents is emptied here and every entry is deleted
+  set<SharedBlob*> *maybe_unshared_blobs)  // may be null; otherwise collects shared blobs for which put_ref() set 'unshare'
+{
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    for (auto& w : wctx->writes) {
+      for (auto& e : w.b->get_blob().get_extents()) {
+	if (!e.is_valid()) {
+	  continue;
+	}
+	uint32_t zone = e.offset / zone_size;
+	if (!o->onode.zone_offset_refs.count(zone)) {  // only note zones this onode has no ref for yet
+	  uint64_t zoff = e.offset % zone_size;
+	  dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+		   << " offset 0x" << zoff << std::dec << dendl;
+	  txc->note_write_zone_offset(o, zone, zoff);
+	}
+      }
+    }
+  }
+  set<uint32_t> zones_with_releases;  // zones that had at least one valid extent released below
+#endif
+
+  auto oep = wctx->old_extents.begin();
+  while (oep != wctx->old_extents.end()) {
+    auto &lo = *oep;
+    oep = wctx->old_extents.erase(oep);  // unlinked here, but 'lo' is still used below and deleted at the end of this iteration
+    dout(20) << __func__ << " lex_old " << lo.e << dendl;
+    BlobRef b = lo.e.blob;
+    const bluestore_blob_t& blob = b->get_blob();
+    if (blob.is_compressed()) {
+      if (lo.blob_empty) {  // compressed payload is subtracted only when the entry is flagged blob_empty
+	txc->statfs_delta.compressed() -= blob.get_compressed_payload_length();
+      }
+      txc->statfs_delta.compressed_original() -= lo.e.length;
+    }
+    auto& r = lo.r;  // extents recorded for release by this old extent
+    txc->statfs_delta.stored() -= lo.e.length;
+    if (!r.empty()) {
+      dout(20) << __func__ << "  blob " << *b << " release " << r << dendl;
+      if (blob.is_shared()) {
+	PExtentVector final;
+        c->load_shared_blob(b->shared_blob);
+	bool unshare = false;
+	bool* unshare_ptr =
+	  !maybe_unshared_blobs || b->is_referenced() ? nullptr : &unshare;  // request the hint only if the caller wants it and the blob is unreferenced
+	for (auto e : r) {
+	  b->shared_blob->put_ref(
+	    e.offset, e.length, &final,
+	    unshare_ptr);
+#ifdef HAVE_LIBZBD
+	  // we also drop zone ref for shared blob extents
+	  if (bdev->is_smr() && e.is_valid()) {
+	    zones_with_releases.insert(e.offset / zone_size);
+	  }
+#endif
+	}
+	if (unshare) {
+	  ceph_assert(maybe_unshared_blobs);
+	  maybe_unshared_blobs->insert(b->shared_blob.get());
+	}
+	dout(20) << __func__ << "  shared_blob release " << final
+		 << " from " << *b->shared_blob << dendl;
+	txc->write_shared_blob(b->shared_blob);
+	r.clear();
+	r.swap(final);  // for shared blobs, release only what put_ref() reported in 'final'
+      }
+    }
+    // we can't invalidate our logical extents as we drop them because
+    // other lextents (either in our onode or others) may still
+    // reference them.  but we can throw out anything that is no
+    // longer allocated.  Note that this will leave behind edge bits
+    // that are no longer referenced but not deallocated (until they
+    // age out of the cache naturally).
+    b->discard_unallocated(c.get());
+    for (auto e : r) {
+      dout(20) << __func__ << "  release " << e << dendl;
+      txc->released.insert(e.offset, e.length);
+      txc->statfs_delta.allocated() -= e.length;
+      if (blob.is_compressed()) {
+        txc->statfs_delta.compressed_allocated() -= e.length;
+      }
+#ifdef HAVE_LIBZBD
+      if (bdev->is_smr() && e.is_valid()) {
+	zones_with_releases.insert(e.offset / zone_size);
+      }
+#endif
+    }
+
+    if (b->is_spanning() && !b->is_referenced() && lo.blob_empty) {
+      dout(20) << __func__ << "  spanning_blob_map removing empty " << *b
+	       << dendl;
+      o->extent_map.spanning_blob_map.erase(b->id);
+    }
+    delete &lo;  // the entry erased from old_extents above is freed here
+  }
+
+#ifdef HAVE_LIBZBD
+  if (!zones_with_releases.empty()) {
+    // we need to fault the entire extent range in here to determine if we've dropped
+    // all refs to a zone.
+    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+    for (auto& b : o->extent_map.extent_map) {
+      for (auto& e : b.blob->get_blob().get_extents()) {
+	if (e.is_valid()) {
+	  zones_with_releases.erase(e.offset / zone_size);  // zone is still referenced by a remaining extent
+	}
+      }
+    }
+    for (auto zone : zones_with_releases) {
+      auto p = o->onode.zone_offset_refs.find(zone);
+      if (p != o->onode.zone_offset_refs.end()) {
+	dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
+		 << " offset 0x" << p->second << std::dec << dendl;
+	txc->note_release_zone_offset(o, zone, p->second);
+      }
+    }
+  }
+#endif
+}
+
+void BlueStore::_do_write_data(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o,
+  uint64_t offset,
+  uint64_t length,
+  bufferlist& bl,
+  WriteContext *wctx)
+{
+  // Split the write [offset, offset + length) relative to min_alloc_size and
+  // dispatch the pieces to _do_write_small() / _do_write_big().  Every piece
+  // consumes its payload from one shared iterator over 'bl', so the calls
+  // below must stay in logical (head, middle, tail) order.
+  uint64_t end = offset + length;
+  bufferlist::iterator data_pos = bl.begin();
+
+  const bool one_alloc_unit =
+    (offset / min_alloc_size) == ((end - 1) / min_alloc_size);
+  if (one_alloc_unit && length != min_alloc_size) {
+    // the whole write falls within the same block
+    _do_write_small(txc, c, o, offset, length, data_pos, wctx);
+    return;
+  }
+
+  // head piece, handled as a small write when non-empty
+  uint64_t head_offset = offset;
+  uint64_t head_length = p2nphase(offset, min_alloc_size);
+
+  // tail piece, handled as a small write when non-empty
+  uint64_t tail_offset = p2align(end, min_alloc_size);
+  uint64_t tail_length = p2phase(end, min_alloc_size);
+
+  // everything in between goes through the big-write path
+  uint64_t middle_offset = head_offset + head_length;
+  uint64_t middle_length = length - head_length - tail_length;
+
+  if (head_length) {
+    _do_write_small(txc, c, o, head_offset, head_length, data_pos, wctx);
+  }
+
+  // called unconditionally, exactly as before (even for an empty middle)
+  _do_write_big(txc, c, o, middle_offset, middle_length, data_pos, wctx);
+
+  if (tail_length) {
+    _do_write_small(txc, c, o, tail_offset, tail_length, data_pos, wctx);
+  }
+}
+
+void BlueStore::_choose_write_options(
+   CollectionRef& c,
+   OnodeRef& o,
+   uint32_t fadvise_flags,
+   WriteContext *wctx)
+{
+  // Fill in the per-write policy fields of 'wctx': buffered, csum_order,
+  // compress and target_blob_size.  Inputs are the op's fadvise flags, the
+  // onode's allocation hints, store-wide settings and per-pool overrides.
+  using mode_opt_t = std::optional<Compressor::CompressionMode>;
+  using size_opt_t = std::optional<uint64_t>;
+
+  // -- buffering --
+  const uint32_t uncached_mask =
+    CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
+  if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+    dout(20) << __func__ << " will do buffered write" << dendl;
+    wctx->buffered = true;
+  } else if (cct->_conf->bluestore_default_buffered_write &&
+             (fadvise_flags & uncached_mask) == 0) {
+    dout(20) << __func__ << " defaulting to buffered write" << dendl;
+    wctx->buffered = true;
+  }
+
+  // -- checksum granularity: start from the basic csum block size --
+  wctx->csum_order = block_size_order;
+
+  // -- compression mode (a pool option, if set, is fed to select_option) --
+  unsigned hints = o->onode.alloc_hint_flags;
+  auto mode = select_option(
+    "compression_mode",
+    comp_mode.load(),
+    [&]() {
+      string pool_val;
+      if (c->pool_opts.get(pool_opts_t::COMPRESSION_MODE, &pool_val)) {
+        return mode_opt_t(Compressor::get_comp_mode_type(pool_val));
+      }
+      return mode_opt_t();
+    }
+  );
+
+  bool do_compress = false;
+  if (mode != Compressor::COMP_NONE) {
+    if (mode == Compressor::COMP_FORCE) {
+      do_compress = true;
+    } else if (mode == Compressor::COMP_AGGRESSIVE &&
+               (hints & CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE) == 0) {
+      do_compress = true;
+    } else if (mode == Compressor::COMP_PASSIVE &&
+               (hints & CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE)) {
+      do_compress = true;
+    }
+  }
+  wctx->compress = do_compress;
+
+  // -- blob / csum sizing driven by the allocation hints --
+  const bool sequential_read =
+    (hints & CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ) != 0;
+  const bool random_read =
+    (hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ) != 0;
+  const bool immutable_or_append =
+    (hints & (CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE |
+              CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY)) != 0;
+  const bool random_write =
+    (hints & CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE) != 0;
+
+  if (sequential_read && !random_read &&
+      immutable_or_append && !random_write) {
+
+    dout(20) << __func__ << " will prefer large blob and csum sizes" << dendl;
+
+    if (o->onode.expected_write_size) {
+      wctx->csum_order = std::max(
+        min_alloc_size_order,
+        static_cast<uint8_t>(std::countr_zero(o->onode.expected_write_size)));
+    } else {
+      wctx->csum_order = min_alloc_size_order;
+    }
+
+    if (wctx->compress) {
+      wctx->target_blob_size = select_option(
+        "compression_max_blob_size",
+        comp_max_blob_size.load(),
+        [&]() {
+          int64_t pool_val;
+          if (c->pool_opts.get(pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, &pool_val)) {
+            return size_opt_t((uint64_t)pool_val);
+          }
+          return size_opt_t();
+        }
+      );
+    }
+  } else if (wctx->compress) {
+    wctx->target_blob_size = select_option(
+      "compression_min_blob_size",
+      comp_min_blob_size.load(),
+      [&]() {
+        int64_t pool_val;
+        if (c->pool_opts.get(pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, &pool_val)) {
+          return size_opt_t((uint64_t)pool_val);
+        }
+        return size_opt_t();
+      }
+    );
+  }
+
+  // an unset (0) or too-large target falls back to max_blob_size
+  uint64_t blob_size_cap = max_blob_size.load();
+  if (wctx->target_blob_size == 0 || wctx->target_blob_size > blob_size_cap) {
+    wctx->target_blob_size = blob_size_cap;
+  }
+
+  // set the min blob size floor at 2x the min_alloc_size, or else we
+  // won't be able to allocate a smaller extent for the compressed
+  // data.
+  if (wctx->compress &&
+      wctx->target_blob_size < min_alloc_size * 2) {
+    wctx->target_blob_size = min_alloc_size * 2;
+  }
+
+  dout(20) << __func__ << " prefer csum_order " << wctx->csum_order
+           << " target_blob_size 0x" << std::hex << wctx->target_blob_size
+           << " compress=" << (int)wctx->compress
+           << " buffered=" << (int)wctx->buffered
+           << std::dec << dendl;
+}
+
+int BlueStore::_do_gc(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o,
+  const WriteContext& wctx,
+  uint64_t *dirty_start,
+  uint64_t *dirty_end)
+{
+  // Rewrite every range listed in wctx.extents_to_gc: read it back with
+  // _do_read() and push it through the regular write path using a forked
+  // write context.
+  //
+  // dirty_start / dirty_end are in-out: they are widened so that
+  // [*dirty_start, *dirty_end) also covers every collected range.
+  // Returns 0 on success or the negative error from _do_alloc_write().
+
+  bool dirty_range_updated = false;
+  WriteContext wctx_gc;
+  wctx_gc.fork(wctx); // make a clone for garbage collection
+
+  auto & extents_to_collect = wctx.extents_to_gc;
+  for (auto it = extents_to_collect.begin();
+       it != extents_to_collect.end();
+       ++it) {
+    bufferlist bl;
+    auto offset = (*it).first;
+    auto length = (*it).second;
+    dout(20) << __func__ << " processing " << std::hex
+            << offset << "~" << length << std::dec
+	    << dendl;
+    int r = _do_read(c.get(), o, offset, length, bl, 0);
+    ceph_assert(r == (int)length);
+
+    _do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
+    logger->inc(l_bluestore_gc_merged, length);
+
+    if (*dirty_start > offset) {
+      *dirty_start = offset;
+      dirty_range_updated = true;
+    }
+
+    if (*dirty_end < offset + length) {
+      *dirty_end = offset + length;
+      dirty_range_updated = true;
+    }
+  }
+  if (dirty_range_updated) {
+    // fault_range() takes (offset, length), as at its other call sites in
+    // this file; *dirty_end is an end offset, so convert it to a length
+    // instead of passing it through (which over-faulted the extent map).
+    o->extent_map.fault_range(db, *dirty_start, *dirty_end - *dirty_start);
+  }
+
+  dout(30) << __func__ << " alloc write" << dendl;
+  int r = _do_alloc_write(txc, c, o, &wctx_gc);
+  if (r < 0) {
+    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  _wctx_finish(txc, c, o, &wctx_gc);
+  return 0;
+}
+
+int BlueStore::_do_write(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o,
+  uint64_t offset,
+  uint64_t length,
+  bufferlist& bl,
+  uint32_t fadvise_flags)
+{
+  // Write 'length' bytes of 'bl' at 'offset' of object 'o': stage the data,
+  // allocate and queue it, retire the extents it replaced, grow the onode
+  // size if needed and optionally garbage-collect.  Returns 0 on success
+  // (a zero-length write succeeds immediately) or a negative error code.
+  dout(20) << __func__
+           << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length
+           << " - have 0x" << o->onode.size
+           << " (" << std::dec << o->onode.size << ")"
+           << " bytes" << std::hex
+           << " fadvise_flags 0x" << fadvise_flags
+           << " alloc_hint 0x" << o->onode.alloc_hint_flags
+           << " expected_object_size " << o->onode.expected_object_size
+           << " expected_write_size " << o->onode.expected_write_size
+           << std::dec
+           << dendl;
+  _dump_onode<30>(cct, *o);
+
+  if (length == 0) {
+    return 0;
+  }
+
+  const uint64_t end = offset + length;
+
+  GarbageCollector gc(c->store->cct);
+  int64_t benefit = 0;
+  uint64_t dirty_start = offset;
+  uint64_t dirty_end = end;
+
+  WriteContext wctx;
+  _choose_write_options(c, o, fadvise_flags, &wctx);
+  o->extent_map.fault_range(db, offset, length);
+  _do_write_data(txc, c, o, offset, length, bl, &wctx);
+
+  if (int r = _do_alloc_write(txc, c, o, &wctx); r < 0) {
+    derr << __func__ << " _do_alloc_write failed with " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  // Estimate the gc benefit unless the ranges already queued for gc span
+  // the whole write.
+  auto& gc_extents = wctx.extents_to_gc;
+  const bool gc_spans_write =
+    !gc_extents.empty() &&
+    gc_extents.range_start() <= offset &&
+    gc_extents.range_end() >= offset + length;
+  if (!gc_spans_write) {
+    benefit = gc.estimate(offset,
+                          length,
+                          o->extent_map,
+                          wctx.old_extents,
+                          min_alloc_size);
+  }
+
+  // NB: _wctx_finish() will empty old_extents
+  // so we must do gc estimation before that
+  _wctx_finish(txc, c, o, &wctx);
+  if (end > o->onode.size) {
+    dout(20) << __func__ << " extending size to 0x" << std::hex << end
+             << std::dec << dendl;
+    o->onode.size = end;
+  }
+
+  if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
+    gc_extents.union_of(gc.get_extents_to_collect());
+    dout(20) << __func__
+             << " perform garbage collection for compressed extents, "
+             << "expected benefit = " << benefit << " AUs" << dendl;
+  }
+  if (!gc_extents.empty()) {
+    dout(20) << __func__ << " perform garbage collection" << dendl;
+
+    if (int r = _do_gc(txc, c, o, wctx, &dirty_start, &dirty_end); r < 0) {
+      derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
+            << dendl;
+      return r;
+    }
+    dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
+            << "~" << dirty_end - dirty_start << std::dec << dendl;
+  }
+
+  const uint64_t dirty_length = dirty_end - dirty_start;
+  o->extent_map.compress_extent_map(dirty_start, dirty_length);
+  o->extent_map.dirty_range(dirty_start, dirty_length);
+
+  return 0;
+}
+
+int BlueStore::_write(TransContext *txc,
+                      CollectionRef& c,
+                      OnodeRef& o,
+                      uint64_t offset, size_t length,
+                      bufferlist& bl,
+                      uint32_t fadvise_flags)
+{
+  // Transaction-level write: rejects ranges reaching OBJECT_MAX_SIZE with
+  // -E2BIG, otherwise assigns a nid, runs _do_write() and records the onode
+  // in the transaction (also when _do_write() reports an error).
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << dendl;
+
+  int r;
+  const bool too_big = (offset + length >= OBJECT_MAX_SIZE);
+  if (!too_big) {
+    _assign_nid(txc, o);
+    r = _do_write(txc, c, o, offset, length, bl, fadvise_flags);
+    txc->write_onode(o);
+  } else {
+    r = -E2BIG;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_zero(TransContext *txc,
+                     CollectionRef& c,
+                     OnodeRef& o,
+                     uint64_t offset, size_t length)
+{
+  // Transaction-level zero: rejects ranges reaching OBJECT_MAX_SIZE with
+  // -E2BIG, otherwise assigns a nid and delegates to _do_zero().
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << dendl;
+
+  int r;
+  const bool too_big = (offset + length >= OBJECT_MAX_SIZE);
+  if (!too_big) {
+    _assign_nid(txc, o);
+    r = _do_zero(txc, c, o, offset, length);
+  } else {
+    r = -E2BIG;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_do_zero(TransContext *txc,
+                        CollectionRef& c,
+                        OnodeRef& o,
+                        uint64_t offset, size_t length)
+{
+  // Zero a range by punching a hole in the extent map; the extents that
+  // covered it are retired through _wctx_finish().  The object is extended
+  // when the range ends past the current size.  Always returns 0.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << dendl;
+  const int r = 0;
+
+  _dump_onode<30>(cct, *o);
+
+  WriteContext hole_ctx;
+  auto& emap = o->extent_map;
+  emap.fault_range(db, offset, length);
+  emap.punch_hole(c, offset, length, &hole_ctx.old_extents);
+  emap.dirty_range(offset, length);
+  _wctx_finish(txc, c, o, &hole_ctx);
+
+  const auto range_end = offset + length;
+  if (length > 0 && range_end > o->onode.size) {
+    o->onode.size = range_end;
+    dout(20) << __func__ << " extending size to " << offset + length
+             << dendl;
+  }
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << "~" << length << std::dec
+           << " = " << r << dendl;
+  return r;
+}
+
+void BlueStore::_do_truncate(
+  TransContext *txc, CollectionRef& c, OnodeRef& o, uint64_t offset,
+  set<SharedBlob*> *maybe_unshared_blobs)
+{
+  // Set the object size to 'offset'.  When shrinking, the range past the
+  // new end is punched out and its extents retired through _wctx_finish();
+  // when growing, only the recorded size changes.  Nothing is done (not
+  // even an onode write) if the size already matches.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << std::dec << dendl;
+
+  _dump_onode<30>(cct, *o);
+
+  if (offset == o->onode.size) {
+    return;
+  }
+
+  WriteContext trim_ctx;
+  if (offset < o->onode.size) {
+    auto& emap = o->extent_map;
+    uint64_t trimmed = o->onode.size - offset;
+    emap.fault_range(db, offset, trimmed);
+    emap.punch_hole(c, offset, trimmed, &trim_ctx.old_extents);
+    emap.dirty_range(offset, trimmed);
+
+    _wctx_finish(txc, c, o, &trim_ctx, maybe_unshared_blobs);
+
+    // if we have shards past EOF, ask for a reshard
+    const auto& shards = o->onode.extent_map_shards;
+    if (!shards.empty() && shards.back().offset >= offset) {
+      dout(10) << __func__ << "  request reshard past EOF" << dendl;
+      if (offset == 0) {
+        emap.request_reshard(0, trimmed);
+      } else {
+        emap.request_reshard(offset - 1, offset + trimmed);
+      }
+    }
+  }
+
+  o->onode.size = offset;
+
+  txc->write_onode(o);
+}
+
+int BlueStore::_truncate(TransContext *txc,
+                         CollectionRef& c,
+                         OnodeRef& o,
+                         uint64_t offset)
+{
+  // Transaction-level truncate: rejects offsets at or beyond
+  // OBJECT_MAX_SIZE with -E2BIG, otherwise delegates to _do_truncate().
+  // The elapsed time is reported via log_latency_fn() in both cases.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << std::dec
+           << dendl;
+
+  const auto t_begin = mono_clock::now();
+  const int r = (offset >= OBJECT_MAX_SIZE) ? -E2BIG : 0;
+  if (r == 0) {
+    _do_truncate(txc, c, o, offset);
+  }
+
+  auto describe = [&](const ceph::timespan& lat) {
+    ostringstream msg;
+    msg << ", lat = " << timespan_str(lat)
+        << " cid =" << c->cid
+        << " oid =" << o->oid;
+    return msg.str();
+  };
+  log_latency_fn(
+    __func__,
+    l_bluestore_truncate_lat,
+    mono_clock::now() - t_begin,
+    cct->_conf->bluestore_log_op_age,
+    describe);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " 0x" << std::hex << offset << std::dec
+           << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_do_remove(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o)
+{
+  // Remove object 'o': truncate it to zero, clear its omap, delete its
+  // extent-shard keys and onode key, and reset the in-memory onode.
+  // For a generation object, shared blobs flagged during the truncate are
+  // afterwards compared against the head (NO_GEN) object; blobs whose ref
+  // map equals the head's own references are unshared.  Always returns 0.
+  set<SharedBlob*> maybe_unshared_blobs;
+  const bool is_gen = !o->oid.is_no_gen();
+  _do_truncate(txc, c, o, 0, is_gen ? &maybe_unshared_blobs : nullptr);
+  if (o->onode.has_omap()) {
+    o->flush();
+    _do_omap_clear(txc, o);
+  }
+  o->exists = false;
+
+  // delete every extent map shard key, then the onode key itself
+  string shard_key;
+  for (auto& shard : o->extent_map.shards) {
+    dout(20) << __func__ << "  removing shard 0x" << std::hex
+             << shard.shard_info->offset << std::dec << dendl;
+    generate_extent_shard_key_and_apply(
+      o->key, shard.shard_info->offset, &shard_key,
+      [&](const string& final_key) {
+        txc->t->rmkey(PREFIX_OBJ, final_key);
+      }
+    );
+  }
+  txc->t->rmkey(PREFIX_OBJ, o->key.c_str(), o->key.size());
+  txc->note_removed_object(o);
+  o->extent_map.clear();
+  o->onode = bluestore_onode_t();
+  _debug_obj_on_delete(o->oid);
+
+  if (!is_gen || maybe_unshared_blobs.empty()) {
+    return 0;
+  }
+
+  // see if we can unshare blobs still referenced by the head
+  dout(10) << __func__ << " gen and maybe_unshared_blobs "
+           << maybe_unshared_blobs << dendl;
+  ghobject_t head_oid = o->oid;
+  head_oid.generation = ghobject_t::NO_GEN;
+  OnodeRef head = c->get_onode(head_oid, false);
+
+  if (!head || !head->exists) {
+    return 0;
+  }
+
+  dout(20) << __func__ << " checking for unshareable blobs on " << head
+           << " " << head->oid << dendl;
+
+  // pass 1: collect the references the head itself holds on each candidate
+  map<SharedBlob*,bluestore_extent_ref_map_t> head_refs;
+  for (auto& lext : head->extent_map.extent_map) {
+    const bluestore_blob_t& blob = lext.blob->get_blob();
+    SharedBlob *sb = lext.blob->shared_blob.get();
+    if (!blob.is_shared() ||
+        !sb->loaded ||
+        !maybe_unshared_blobs.count(sb)) {
+      continue;
+    }
+    if (blob.is_compressed()) {
+      head_refs[sb].get(0, blob.get_ondisk_length());
+    } else {
+      blob.map(lext.blob_offset, lext.length,
+               [&](uint64_t off, uint64_t len) {
+                 head_refs[sb].get(off, len);
+                 return 0;
+               });
+    }
+  }
+
+  // pass 2: unshare each blob whose persistent ref map equals what the
+  // head holds
+  vector<SharedBlob*> unshared_blobs;
+  unshared_blobs.reserve(maybe_unshared_blobs.size());
+  for (auto& candidate : head_refs) {
+    dout(20) << " ? " << *candidate.first << " vs " << candidate.second << dendl;
+    if (!(candidate.first->persistent->ref_map == candidate.second)) {
+      continue;
+    }
+    SharedBlob *sb = candidate.first;
+    dout(20) << __func__ << "  unsharing " << *sb << dendl;
+    unshared_blobs.push_back(sb);
+    txc->unshare_blob(sb);
+    uint64_t sbid = c->make_blob_unshared(sb);
+    string sb_key;
+    get_shared_blob_key(sbid, &sb_key);
+    txc->t->rmkey(PREFIX_SHARED_BLOB, sb_key);
+  }
+
+  if (unshared_blobs.empty()) {
+    return 0;
+  }
+
+  // pass 3: clear FLAG_SHARED on the head's blobs that were just unshared
+  for (auto& lext : head->extent_map.extent_map) {
+    const bluestore_blob_t& blob = lext.blob->get_blob();
+    SharedBlob *sb = lext.blob->shared_blob.get();
+    if (!blob.is_shared()) {
+      continue;
+    }
+    const bool was_unshared =
+      std::find(unshared_blobs.begin(), unshared_blobs.end(), sb) !=
+      unshared_blobs.end();
+    if (was_unshared) {
+      dout(20) << __func__ << "  unsharing " << lext << dendl;
+      bluestore_blob_t& dirty = lext.blob->dirty_blob();
+      dirty.clear_flag(bluestore_blob_t::FLAG_SHARED);
+      head->extent_map.dirty_range(lext.logical_offset, 1);
+    }
+  }
+  txc->write_onode(head);
+
+  return 0;
+}
+
+int BlueStore::_remove(TransContext *txc,
+                       CollectionRef& c,
+                       OnodeRef& o)
+{
+  // Transaction-level remove: delegates to _do_remove() and reports the
+  // elapsed time via log_latency_fn().
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " onode " << o.get()
+           << " txc "<< txc << dendl;
+
+  const auto t_begin = mono_clock::now();
+  const int r = _do_remove(txc, c, o);
+
+  auto describe = [&](const ceph::timespan& lat) {
+    ostringstream msg;
+    msg << ", lat = " << timespan_str(lat)
+        << " cid =" << c->cid
+        << " oid =" << o->oid;
+    return msg.str();
+  };
+  log_latency_fn(
+    __func__,
+    l_bluestore_remove_lat,
+    mono_clock::now() - t_begin,
+    cct->_conf->bluestore_log_op_age,
+    describe);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_setattr(TransContext *txc,
+                        CollectionRef& c,
+                        OnodeRef& o,
+                        const string& name,
+                        bufferptr& val)
+{
+  // Store attribute 'name' on the onode and record the onode in the
+  // transaction.  A partial bufferptr is rebuilt from its own bytes, any
+  // other value is assigned as is; either way the stored buffer is
+  // reassigned to the bluestore_cache_meta mempool.  Always returns 0.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " (" << val.length() << " bytes)"
+           << dendl;
+  const int r = 0;
+
+  auto& stored = o->onode.attrs[name.c_str()];
+  if (val.is_partial()) {
+    stored = bufferptr(val.c_str(), val.length());
+  } else {
+    stored = val;
+  }
+  stored.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+
+  txc->write_onode(o);
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " (" << val.length() << " bytes)"
+           << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_setattrs(TransContext *txc,
+                         CollectionRef& c,
+                         OnodeRef& o,
+                         const map<string,bufferptr>& aset)
+{
+  // Store every attribute of 'aset' on the onode, handling each value the
+  // same way _setattr() does, then record the onode in the transaction
+  // once.  Always returns 0.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << aset.size() << " keys"
+           << dendl;
+  const int r = 0;
+
+  for (const auto& [attr_name, attr_val] : aset) {
+    auto& stored = o->onode.attrs[attr_name.c_str()];
+    if (attr_val.is_partial()) {
+      stored = bufferptr(attr_val.c_str(), attr_val.length());
+    } else {
+      stored = attr_val;
+    }
+    stored.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
+  }
+
+  txc->write_onode(o);
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << aset.size() << " keys"
+           << " = " << r << dendl;
+  return r;
+}
+
+
+int BlueStore::_rmattr(TransContext *txc,
+                       CollectionRef& c,
+                       OnodeRef& o,
+                       const string& name)
+{
+  // Remove attribute 'name' from the onode.  The onode is recorded in the
+  // transaction only when the attribute existed.  Always returns 0.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << dendl;
+  const int r = 0;
+
+  auto& attrs = o->onode.attrs;
+  auto found = attrs.find(name.c_str());
+  if (found != attrs.end()) {
+    attrs.erase(found);
+    txc->write_onode(o);
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " = " << r << dendl;
+  return r;
+}
+
+int BlueStore::_rmattrs(TransContext *txc,
+                        CollectionRef& c,
+                        OnodeRef& o)
+{
+  // Remove all attributes from the onode.  The onode is recorded in the
+  // transaction only when there was something to clear.  Always returns 0.
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  const int r = 0;
+
+  auto& attrs = o->onode.attrs;
+  if (!attrs.empty()) {
+    attrs.clear();
+    txc->write_onode(o);
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Queue deletion of the whole omap key range of `o` (header through tail)
+// in `txc` and clear the omap flag on the onode.  The caller is responsible
+// for queueing the onode itself.
+void BlueStore::_do_omap_clear(TransContext *txc, OnodeRef& o)
+{
+  const string& omap_prefix = o->get_omap_prefix();
+  string range_begin;
+  string range_end;
+  // both bounds are computed before the omap flag is cleared below
+  o->get_omap_header(&range_begin);
+  o->get_omap_tail(&range_end);
+  txc->t->rm_range_keys(omap_prefix, range_begin, range_end);
+  // the tail key is removed explicitly in addition to the range delete
+  txc->t->rmkey(omap_prefix, range_end);
+  o->onode.clear_omap_flag();
+  dout(20) << __func__ << " remove range start: "
+           << pretty_binary_string(range_begin) << " end: "
+           << pretty_binary_string(range_end) << dendl;
+}
+
+// Remove every omap entry of `o`, if it has any, and account the elapsed
+// time in l_bluestore_omap_clear_lat.  Always returns 0.
+int BlueStore::_omap_clear(TransContext *txc,
+                           CollectionRef& c,
+                           OnodeRef& o)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  const auto started = mono_clock::now();
+
+  const int r = 0;
+  if (o->onode.has_omap()) {
+    o->flush();
+    _do_omap_clear(txc, o);
+    txc->write_onode(o);
+  }
+  logger->tinc(l_bluestore_omap_clear_lat, mono_clock::now() - started);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Decode a key count followed by that many (key, value) pairs from `bl` and
+// queue each pair as an omap entry of `o`.  On the first omap write for an
+// object the omap flags are set, the onode is queued and an empty tail key
+// is written.  Always returns 0 (decode failures propagate as thrown by
+// decode()).
+int BlueStore::_omap_setkeys(TransContext *txc,
+                             CollectionRef& c,
+                             OnodeRef& o,
+                             bufferlist &bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  if (o->onode.has_omap()) {
+    txc->note_modified_object(o);
+  } else {
+    // object had no omap yet: pick the omap flavour and persist the onode
+    if (o->oid.is_pgmeta()) {
+      o->onode.set_omap_flags_pgmeta();
+    } else {
+      o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+    }
+    txc->write_onode(o);
+
+    const string& tail_prefix = o->get_omap_prefix();
+    string tail_key;
+    bufferlist empty_value;
+    o->get_omap_tail(&tail_key);
+    txc->t->set(tail_prefix, tail_key, empty_value);
+  }
+
+  // the prefix is looked up only after the flags above are settled
+  const string& prefix = o->get_omap_prefix();
+  string full_key;
+  o->get_omap_key(string(), &full_key);
+  const size_t stem_len = full_key.size();
+
+  auto cursor = bl.cbegin();
+  __u32 remaining;
+  decode(remaining, cursor);
+  for (; remaining > 0; --remaining) {
+    string user_key;
+    bufferlist value;
+    decode(user_key, cursor);
+    decode(value, cursor);
+    full_key.resize(stem_len); // keep prefix
+    full_key += user_key;
+    dout(20) << __func__ << "  " << pretty_binary_string(full_key)
+             << " <- " << user_key << dendl;
+    txc->t->set(prefix, full_key, value);
+  }
+  const int r = 0;
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Store `bl` as the omap header of `o`.  On the first omap write for an
+// object the omap flags are set, the onode is queued and an empty tail key
+// is written.  Always returns 0.
+int BlueStore::_omap_setheader(TransContext *txc,
+                               CollectionRef& c,
+                               OnodeRef& o,
+                               bufferlist& bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  if (o->onode.has_omap()) {
+    txc->note_modified_object(o);
+  } else {
+    // object had no omap yet: pick the omap flavour and persist the onode
+    if (o->oid.is_pgmeta()) {
+      o->onode.set_omap_flags_pgmeta();
+    } else {
+      o->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+    }
+    txc->write_onode(o);
+
+    const string& tail_prefix = o->get_omap_prefix();
+    string tail_key;
+    bufferlist empty_value;
+    o->get_omap_tail(&tail_key);
+    txc->t->set(tail_prefix, tail_key, empty_value);
+  }
+
+  // the prefix is looked up only after the flags above are settled
+  const string& prefix = o->get_omap_prefix();
+  string header_key;
+  o->get_omap_header(&header_key);
+  txc->t->set(prefix, header_key, bl);
+  const int r = 0;
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Decode a key count followed by that many keys from `bl` and queue removal
+// of each one from the omap of `o`.  When the object has no omap, `bl` is not
+// decoded and nothing is queued.  Always returns 0.
+int BlueStore::_omap_rmkeys(TransContext *txc,
+                            CollectionRef& c,
+                            OnodeRef& o,
+                            bufferlist& bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  const int r = 0;
+  if (o->onode.has_omap()) {
+    const string& prefix = o->get_omap_prefix();
+    string full_key;
+    o->get_omap_key(string(), &full_key);
+    const size_t stem_len = full_key.size();
+
+    auto cursor = bl.cbegin();
+    __u32 remaining;
+    decode(remaining, cursor);
+    logger->inc(l_bluestore_omap_rmkeys_count, remaining);
+    for (; remaining > 0; --remaining) {
+      string user_key;
+      decode(user_key, cursor);
+      full_key.resize(stem_len); // keep prefix
+      full_key += user_key;
+      dout(20) << __func__ << "  rm " << pretty_binary_string(full_key)
+               << " <- " << user_key << dendl;
+      txc->t->rmkey(prefix, full_key);
+    }
+    txc->note_modified_object(o);
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Queue removal of the omap keys of `o` lying between `first` and `last`
+// (bounds are interpreted by the transaction's rm_range_keys()).  When the
+// object has no omap nothing is queued.  Always returns 0.
+int BlueStore::_omap_rmkey_range(TransContext *txc,
+                                 CollectionRef& c,
+                                 OnodeRef& o,
+                                 const string& first, const string& last)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  int r = 0;
+  if (o->onode.has_omap()) {
+    const string& prefix = o->get_omap_prefix();
+    string key_first, key_last;
+    o->flush();
+    o->get_omap_key(first, &key_first);
+    o->get_omap_key(last, &key_last);
+    logger->inc(l_bluestore_omap_rmkey_ranges_count);
+    txc->t->rm_range_keys(prefix, key_first, key_last);
+    dout(20) << __func__ << " remove range start: "
+             << pretty_binary_string(key_first) << " end: "
+             << pretty_binary_string(key_last) << dendl;
+    txc->note_modified_object(o);
+  }
+
+  // log the outcome at level 10, matching the other omap mutators
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+// Record the caller-supplied allocation hints (expected object size,
+// expected write size and hint flags) on the onode and queue it for rewrite.
+// Always returns 0.
+int BlueStore::_set_alloc_hint(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& o,
+  uint64_t expected_object_size,
+  uint64_t expected_write_size,
+  uint32_t flags)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " object_size " << expected_object_size
+           << " write_size " << expected_write_size
+           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+           << dendl;
+  const int r = 0;
+  auto& on = o->onode;
+  on.expected_object_size = expected_object_size;
+  on.expected_write_size = expected_write_size;
+  on.alloc_hint_flags = flags;
+  txc->write_onode(o);
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " object_size " << expected_object_size
+           << " write_size " << expected_write_size
+           << " flags " << ceph_osd_alloc_hint_flag_string(flags)
+           << " = " << r << dendl;
+  return r;
+}
+
+// Make newo a full copy of oldo: object data, attrs and omap.
+//
+// Both objects must have the same hobject hash; otherwise -EINVAL is
+// returned before anything is modified.  Returns 0 on success, or the
+// negative error from _do_read()/_do_write() when the data is copied by
+// value (bluestore_clone_cow disabled).
+int BlueStore::_clone(TransContext *txc,
+		      CollectionRef& c,
+		      OnodeRef& oldo,
+		      OnodeRef& newo)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+	   << newo->oid << dendl;
+  int r = 0;
+  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+    derr << __func__ << " mismatched hash on " << oldo->oid
+	 << " and " << newo->oid << dendl;
+    return -EINVAL;
+  }
+
+  _assign_nid(txc, newo);
+
+  // clone data
+  oldo->flush();
+  // discard whatever newo held before copying the source in
+  _do_truncate(txc, c, newo, 0);
+  if (cct->_conf->bluestore_clone_cow) {
+    _do_clone_range(txc, c, oldo, newo, 0, oldo->onode.size, 0);
+  } else {
+    // COW disabled: copy by reading the whole source and writing it back
+    bufferlist bl;
+    r = _do_read(c.get(), oldo, 0, oldo->onode.size, bl, 0);
+    if (r < 0)
+      goto out;
+    r = _do_write(txc, c, newo, 0, oldo->onode.size, bl, 0);
+    if (r < 0)
+      goto out;
+  }
+
+  // clone attrs
+  newo->onode.attrs = oldo->onode.attrs;
+
+  // clone omap
+  if (newo->onode.has_omap()) {
+    dout(20) << __func__ << " clearing old omap data" << dendl;
+    newo->flush();
+    _do_omap_clear(txc, newo);
+  }
+  if (oldo->onode.has_omap()) {
+    dout(20) << __func__ << " copying omap data" << dendl;
+    if (newo->oid.is_pgmeta()) {
+      newo->onode.set_omap_flags_pgmeta();
+    } else {
+      newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK);
+    }
+    // check if prefix for omap key is exactly the same size for both objects
+    // otherwise rewrite_omap_key will corrupt data
+    ceph_assert(oldo->onode.flags == newo->onode.flags);
+    const string& prefix = newo->get_omap_prefix();
+    string head, tail;
+    oldo->get_omap_header(&head);
+    oldo->get_omap_tail(&tail);
+    // walk the source's keys from its header up to (not including) its tail
+    KeyValueDB::Iterator it = db->get_iterator(prefix, 0, KeyValueDB::IteratorBounds{head, tail});
+    it->lower_bound(head);
+    while (it->valid()) {
+      if (it->key() >= tail) {
+	dout(30) << __func__ << "  reached tail" << dendl;
+	break;
+      } else {
+	dout(30) << __func__ << "  got header/data "
+		 << pretty_binary_string(it->key()) << dendl;
+        string key;
+	newo->rewrite_omap_key(it->key(), &key);
+	txc->t->set(prefix, key, it->value());
+      }
+      it->next();
+    }
+    // write newo's own (empty) tail key
+    string new_tail;
+    bufferlist new_tail_value;
+    newo->get_omap_tail(&new_tail);
+    txc->t->set(prefix, new_tail, new_tail_value);
+  }
+
+  txc->write_onode(newo);
+  r = 0;
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+	   << newo->oid << " = " << r << dendl;
+  return r;
+}
+
+// Map `length` bytes of oldo starting at `srcoff` into newo at `dstoff`
+// by calling ExtentMap::dup() on the source extent map.  The affected
+// ranges of both extent maps are faulted in first.  When built with
+// HAVE_LIBZBD and the device is SMR, zone offset refs are additionally
+// noted for newo.  Always returns 0.
+int BlueStore::_do_clone_range(
+  TransContext *txc,
+  CollectionRef& c,
+  OnodeRef& oldo,
+  OnodeRef& newo,
+  uint64_t srcoff,
+  uint64_t length,
+  uint64_t dstoff)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+	   << newo->oid
+	   << " 0x" << std::hex << srcoff << "~" << length << " -> "
+	   << " 0x" << dstoff << "~" << length << std::dec << dendl;
+  oldo->extent_map.fault_range(db, srcoff, length);
+  newo->extent_map.fault_range(db, dstoff, length);
+  // dump both onodes (debug level 30) before the change
+  _dump_onode<30>(cct, *oldo);
+  _dump_onode<30>(cct, *newo);
+
+  oldo->extent_map.dup(this, txc, c, oldo, newo, srcoff, length, dstoff);
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    // duplicate the refs for the shared region.
+    Extent dummy(dstoff);
+    for (auto e = newo->extent_map.extent_map.lower_bound(dummy);
+	 e != newo->extent_map.extent_map.end();
+	 ++e) {
+      if (e->logical_offset >= dstoff + length) {
+	break;
+      }
+      for (auto& ex : e->blob->get_blob().get_extents()) {
+	// note that we may introduce a new extent reference that is
+	// earlier than the first zone ref.  we allow this since it is
+	// a lot of work to avoid and has marginal impact on cleaning
+	// performance.
+	if (!ex.is_valid()) {
+	  continue;
+	}
+	uint32_t zone = ex.offset / zone_size;
+	// only zones newo does not reference yet get a new ref
+	if (!newo->onode.zone_offset_refs.count(zone)) {
+	  uint64_t zoff = ex.offset % zone_size;
+	  dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+		   << " offset 0x" << zoff << std::dec
+		   << " -> " << newo->oid << dendl;
+	  txc->note_write_zone_offset(newo, zone, zoff);
+	}
+      }
+    }
+  }
+#endif
+
+  // and again after the change
+  _dump_onode<30>(cct, *oldo);
+  _dump_onode<30>(cct, *newo);
+  return 0;
+}
+
+// Copy `length` bytes from oldo at `srcoff` into newo at `dstoff`.
+//
+// Returns -E2BIG when either range reaches OBJECT_MAX_SIZE, -EINVAL when the
+// source range extends past the end of oldo, a negative error from
+// _do_read()/_do_write() on the copy-by-value path, and 0 otherwise.
+int BlueStore::_clone_range(TransContext *txc,
+                            CollectionRef& c,
+                            OnodeRef& oldo,
+                            OnodeRef& newo,
+                            uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+           << " to offset 0x" << dstoff << std::dec << dendl;
+  int r = 0;
+
+  const bool too_big = srcoff + length >= OBJECT_MAX_SIZE ||
+                       dstoff + length >= OBJECT_MAX_SIZE;
+  if (too_big) {
+    r = -E2BIG;
+  } else if (srcoff + length > oldo->onode.size) {
+    r = -EINVAL;
+  } else {
+    _assign_nid(txc, newo);
+
+    if (length > 0) {
+      if (cct->_conf->bluestore_clone_cow) {
+        // clear the destination range, then map the source extents into it
+        _do_zero(txc, c, newo, dstoff, length);
+        _do_clone_range(txc, c, oldo, newo, srcoff, length, dstoff);
+      } else {
+        // COW disabled: read the source bytes and write them back out
+        bufferlist bl;
+        r = _do_read(c.get(), oldo, srcoff, length, bl, 0);
+        if (r >= 0) {
+          r = _do_write(txc, c, newo, dstoff, bl.length(), bl, 0);
+        }
+      }
+    }
+
+    // a failed read or write skips queueing the onode and keeps its error
+    if (r >= 0) {
+      txc->write_onode(newo);
+      r = 0;
+    }
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << " from 0x" << std::hex << srcoff << "~" << length
+           << " to offset 0x" << dstoff << std::dec
+           << " = " << r << dendl;
+  return r;
+}
+
+// Rename the object held by oldo to new_oid within collection c.
+//
+// Fails with -EEXIST when newo refers to an onode that already exists.
+// Otherwise the old onode key and all of its extent shard keys are queued
+// for removal, every shard is marked dirty, the onode is queued for write,
+// and the collection's onode_space is told about the rename.  On return
+// newo refers to the renamed onode.  Returns 0 on success.
+int BlueStore::_rename(TransContext *txc,
+		       CollectionRef& c,
+		       OnodeRef& oldo,
+		       OnodeRef& newo,
+		       const ghobject_t& new_oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+	   << new_oid << dendl;
+  int r;
+  // copy of the old name; oldo is repointed further down
+  ghobject_t old_oid = oldo->oid;
+  mempool::bluestore_cache_meta::string new_okey;
+
+  if (newo) {
+    if (newo->exists) {
+      r = -EEXIST;
+      goto out;
+    }
+    ceph_assert(txc->onodes.count(newo) == 0);
+  }
+
+  txc->t->rmkey(PREFIX_OBJ, oldo->key.c_str(), oldo->key.size());
+
+  // rewrite shards
+  {
+    oldo->extent_map.fault_range(db, 0, oldo->onode.size);
+    get_object_key(cct, new_oid, &new_okey);
+    string key;
+    for (auto &s : oldo->extent_map.shards) {
+      // drop the shard stored under the old object key ...
+      generate_extent_shard_key_and_apply(oldo->key, s.shard_info->offset, &key,
+        [&](const string& final_key) {
+          txc->t->rmkey(PREFIX_OBJ, final_key);
+        }
+      );
+      // ... and flag the shard dirty
+      s.dirty = true;
+    }
+  }
+
+  newo = oldo;
+  txc->write_onode(newo);
+
+  // this adjusts oldo->{oid,key}, and reset oldo to a fresh empty
+  // Onode in the old slot
+  c->onode_space.rename(oldo, old_oid, new_oid, new_okey);
+  r = 0;
+
+  // hold a ref to new Onode in old name position, to ensure we don't drop
+  // it from the cache before this txc commits (or else someone may come along
+  // and read newo's metadata via the old name).
+  txc->note_modified_object(oldo);
+
+#ifdef HAVE_LIBZBD
+  if (bdev->is_smr()) {
+    // adjust zone refs
+    for (auto& [zone, offset] : newo->onode.zone_offset_refs) {
+      dout(20) << __func__ << " rm ref zone 0x" << std::hex << zone
+	       << " offset 0x" << offset << std::dec
+	       << " -> " << oldo->oid << dendl;
+      string key;
+      get_zone_offset_object_key(zone, offset, oldo->oid, &key);
+      txc->t->rmkey(PREFIX_ZONED_CL_INFO, key);
+
+      dout(20) << __func__ << " add ref zone 0x" << std::hex << zone
+	       << " offset 0x" << offset << std::dec
+	       << " -> " << newo->oid << dendl;
+      get_zone_offset_object_key(zone, offset, newo->oid, &key);
+      bufferlist v;
+      txc->t->set(PREFIX_ZONED_CL_INFO, key, v);
+    }
+  }
+#endif
+
+ out:
+  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " = " << r << dendl;
+  return r;
+}
+
+// collections
+
+// Promote the pending collection `cid` from new_coll_map into coll_map,
+// set its bits, and queue its cnode under PREFIX_COLL.
+//
+// Returns -EEXIST when `*c` is already set; otherwise `*c` is pointed at the
+// collection and 0 is returned.  The collection must be present in
+// new_coll_map (asserted).
+int BlueStore::_create_collection(
+  TransContext *txc,
+  const coll_t &cid,
+  unsigned bits,
+  CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+  int r = 0;
+
+  {
+    std::unique_lock l(coll_lock);
+    if (*c) {
+      r = -EEXIST;
+    } else {
+      auto pending = new_coll_map.find(cid);
+      ceph_assert(pending != new_coll_map.end());
+      *c = pending->second;
+      (*c)->cnode.bits = bits;
+      coll_map[cid] = *c;
+      new_coll_map.erase(pending);
+    }
+  }
+
+  if (r == 0) {
+    // the cnode is encoded and queued after coll_lock has been released
+    bufferlist bl;
+    encode((*c)->cnode, bl);
+    txc->t->set(PREFIX_COLL, stringify(cid), bl);
+  }
+
+  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+// Remove collection `cid` (referenced by `*c`) provided it holds no live
+// objects.
+//
+// Returns -ENOENT when `*c` is null, -ENOTEMPTY when any onode in the cache
+// or in the database still exists, a negative error from _collection_list(),
+// and 0 once the collection has been removed (at which point `*c` has been
+// reset by _do_remove_collection()).
+int BlueStore::_remove_collection(TransContext *txc, const coll_t &cid,
+				  CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << dendl;
+  int r;
+
+  // A missing collection must be rejected before it is touched: the flush
+  // below dereferences *c.
+  if (!*c) {
+    r = -ENOENT;
+    goto out;
+  }
+
+  (*c)->flush_all_but_last();
+  {
+    std::unique_lock l(coll_lock);
+    size_t nonexistent_count = 0;
+    ceph_assert((*c)->exists);
+    // Scan the cached onodes: any that still exists makes the collection
+    // non-empty; the rest are counted for the database check below.
+    if ((*c)->onode_space.map_any([&](Onode* o) {
+      if (o->exists) {
+        dout(1) << __func__ << " " << o->oid << " " << o
+	        << " exists in onode_map" << dendl;
+          return true;
+      }
+      ++nonexistent_count;
+      return false;
+    })) {
+      r = -ENOTEMPTY;
+      goto out;
+    }
+    vector<ghobject_t> ls;
+    ghobject_t next;
+    // Enumerate onodes in db, up to nonexistent_count + 1
+    // then check if all of them are marked as non-existent.
+    // Bypass the check if (next != ghobject_t::get_max())
+    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+                         nonexistent_count + 1, false, &ls, &next);
+    if (r >= 0) {
+      // If true, the collection has more objects than nonexistent_count,
+      // so bypass the check.
+      bool exists = (!next.is_max());
+      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+        dout(10) << __func__ << " oid " << *it << dendl;
+        auto onode = (*c)->onode_space.lookup(*it);
+        // an object not found in the cache is treated as existing
+        exists = !onode || onode->exists;
+        if (exists) {
+          dout(1) << __func__ << " " << *it
+	  << " exists in db, "
+	  << (!onode ? "not present in ram" : "present in ram")
+	  << dendl;
+        }
+      }
+      if (!exists) {
+        _do_remove_collection(txc, c);
+        r = 0;
+      } else {
+        dout(10) << __func__ << " " << cid
+                 << " is non-empty" << dendl;
+	r = -ENOTEMPTY;
+      }
+    }
+  }
+out:
+  dout(10) << __func__ << " " << cid << " = " << r << dendl;
+  return r;
+}
+
+// Unregister collection `*c`: drop it from coll_map, hand a reference to
+// `txc` via removed_collections, mark it non-existent, register its
+// sequencer as a zombie, queue removal of its PREFIX_COLL key, and finally
+// reset the caller's reference.
+void BlueStore::_do_remove_collection(TransContext *txc,
+                                      CollectionRef *c)
+{
+  CollectionRef& coll = *c;
+  coll_map.erase(coll->cid);
+  txc->removed_collections.push_back(coll);
+  coll->exists = false;
+  _osr_register_zombie(coll->osr.get());
+  txc->t->rmkey(PREFIX_COLL, stringify(coll->cid));
+  // last step: the collection is not used through `coll` past this point
+  coll.reset();
+}
+
+// Split parent collection `c` into child `d` at `bits` bits: cached items
+// are moved with split_cache(), the parent's bits are updated and its cnode
+// is queued under PREFIX_COLL.  Both collections must be PGs and `d` must
+// start out empty with bits already equal to `bits` (all asserted).
+// Always returns 0.
+int BlueStore::_split_collection(TransContext *txc,
+                                CollectionRef& c,
+                                CollectionRef& d,
+                                unsigned bits, int rem)
+{
+  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+           << " bits " << bits << dendl;
+  std::unique_lock l(c->lock);
+  std::unique_lock l2(d->lock);
+
+  // flush all previous deferred writes on this sequencer.  this is a bit
+  // heavyweight, but we need to make sure all deferred writes complete
+  // before we split as the new collection's sequencer may need to order
+  // this after those writes, and we don't bother with the complexity of
+  // moving those TransContexts over to the new osr.
+  _osr_drain_preceding(txc);
+
+  // move any cached items (onodes and referenced shared blobs) that will
+  // belong to the child collection post-split.  leave everything else behind.
+  // this may include things that don't strictly belong to the now-smaller
+  // parent split, but the OSD will always send us a split for every new
+  // child.
+
+  spg_t pgid, dest_pgid;
+  const bool src_is_pg = c->cid.is_pg(&pgid);
+  ceph_assert(src_is_pg);
+  const bool dst_is_pg = d->cid.is_pg(&dest_pgid);
+  ceph_assert(dst_is_pg);
+
+  // the destination should initially be empty.
+  ceph_assert(d->onode_space.empty());
+  ceph_assert(d->shared_blob_set.empty());
+  ceph_assert(d->cnode.bits == bits);
+
+  c->split_cache(d.get());
+
+  // adjust bits.  note that this will be redundant for all but the first
+  // split call for this parent (first child).
+  c->cnode.bits = bits;
+  ceph_assert(d->cnode.bits == bits);
+  const int r = 0;
+
+  bufferlist parent_cnode_bl;
+  encode(c->cnode, parent_cnode_bl);
+  txc->t->set(PREFIX_COLL, stringify(c->cid), parent_cnode_bl);
+
+  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+           << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+// Merge source collection `*c` into target `d`, which ends up with `bits`
+// bits: the source sequencer is drained, cached items are moved with
+// split_cache(), the source collection is removed, and the target's cnode is
+// queued under PREFIX_COLL.  Both collections must be PGs (asserted).
+// `*c` is reset by _do_remove_collection().  Always returns 0.
+int BlueStore::_merge_collection(
+  TransContext *txc,
+  CollectionRef *c,
+  CollectionRef& d,
+  unsigned bits)
+{
+  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid
+           << " bits " << bits << dendl;
+  std::unique_lock l((*c)->lock);
+  std::unique_lock l2(d->lock);
+
+  // keep a copy of the source id: *c is reset before the final log line
+  coll_t cid = (*c)->cid;
+
+  // flush all previous deferred writes on the source collection to ensure
+  // that all deferred writes complete before we merge as the target collection's
+  // sequencer may need to order new ops after those writes.
+  _osr_drain((*c)->osr.get());
+
+  spg_t pgid, dest_pgid;
+  const bool src_is_pg = cid.is_pg(&pgid);
+  ceph_assert(src_is_pg);
+  const bool dst_is_pg = d->cid.is_pg(&dest_pgid);
+  ceph_assert(dst_is_pg);
+
+  // adjust bits.  note that this will be redundant for all but the first
+  // merge call for the parent/target.
+  d->cnode.bits = bits;
+
+  // move cached items (onodes and referenced shared blobs) from the source
+  // into the target.  behavior depends on target (d) bits, so this is done
+  // after they are updated.
+  (*c)->split_cache(d.get());
+
+  // remove source collection
+  {
+    std::unique_lock l3(coll_lock);
+    _do_remove_collection(txc, c);
+  }
+
+  const int r = 0;
+
+  bufferlist target_cnode_bl;
+  encode(d->cnode, target_cnode_bl);
+  txc->t->set(PREFIX_COLL, stringify(d->cid), target_cnode_bl);
+
+  dout(10) << __func__ << " " << cid << " to " << d->cid << " "
+           << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+// Account latency `l` in perf counter `idx` and, when `lat_threshold` is
+// positive and `l` reaches it, log a "slow operation" line at level 0 with
+// `info` appended.
+void BlueStore::log_latency(
+  const char* name,
+  int idx,
+  const ceph::timespan& l,
+  double lat_threshold,
+  const char* info) const
+{
+  logger->tinc(idx, l);
+  const bool slow =
+    lat_threshold > 0.0 && l >= make_timespan(lat_threshold);
+  if (!slow) {
+    return;
+  }
+  dout(0) << __func__ << " slow operation observed for " << name
+    << ", latency = " << l
+    << info
+    << dendl;
+}
+
+// Same as log_latency(), except that the text appended to the slow-operation
+// line is produced by `fn(l)`; `fn` is invoked only when the threshold is
+// reached.
+void BlueStore::log_latency_fn(
+  const char* name,
+  int idx,
+  const ceph::timespan& l,
+  double lat_threshold,
+  std::function<string (const ceph::timespan& lat)> fn) const
+{
+  logger->tinc(idx, l);
+  const bool slow =
+    lat_threshold > 0.0 && l >= make_timespan(lat_threshold);
+  if (!slow) {
+    return;
+  }
+  dout(0) << __func__ << " slow operation observed for " << name
+    << ", latency = " << l
+    << fn(l)
+    << dendl;
+}
+
+#if defined(WITH_LTTNG)
+// Account the transaction's I/Os in the pending counters and, when
+// should_trace() selects this transaction, mark it as traced and emit the
+// transaction_initial_state and transaction_initial_state_rocksdb
+// tracepoints.
+void BlueStore::BlueStoreThrottle::emit_initial_tracepoint(
+  KeyValueDB &db,
+  TransContext &txc,
+  mono_clock::time_point start_throttle_acquire)
+{
+  pending_kv_ios += txc.ios;
+  if (txc.deferred_txn) {
+    pending_deferred_ios += txc.ios;
+  }
+
+  uint64_t started = 0;
+  uint64_t completed = 0;
+  if (should_trace(&started, &completed)) {
+    txc.tracing = true;
+    // Zero-initialized: the get_property() results are not checked here, so
+    // a property the backend does not supply is reported as 0 instead of an
+    // indeterminate value.
+    uint64_t rocksdb_base_level = 0;
+    uint64_t rocksdb_estimate_pending_compaction_bytes = 0;
+    uint64_t rocksdb_cur_size_all_mem_tables = 0;
+    uint64_t rocksdb_compaction_pending = 0;
+    uint64_t rocksdb_mem_table_flush_pending = 0;
+    uint64_t rocksdb_num_running_compactions = 0;
+    uint64_t rocksdb_num_running_flushes = 0;
+    uint64_t rocksdb_actual_delayed_write_rate = 0;
+    db.get_property(
+      "rocksdb.base-level",
+      &rocksdb_base_level);
+    db.get_property(
+      "rocksdb.estimate-pending-compaction-bytes",
+      &rocksdb_estimate_pending_compaction_bytes);
+    db.get_property(
+      "rocksdb.cur-size-all-mem-tables",
+      &rocksdb_cur_size_all_mem_tables);
+    db.get_property(
+      "rocksdb.compaction-pending",
+      &rocksdb_compaction_pending);
+    db.get_property(
+      "rocksdb.mem-table-flush-pending",
+      &rocksdb_mem_table_flush_pending);
+    db.get_property(
+      "rocksdb.num-running-compactions",
+      &rocksdb_num_running_compactions);
+    db.get_property(
+      "rocksdb.num-running-flushes",
+      &rocksdb_num_running_flushes);
+    db.get_property(
+      "rocksdb.actual-delayed-write-rate",
+      &rocksdb_actual_delayed_write_rate);
+
+    // throttle and pending-I/O state at the time the transaction starts
+    tracepoint(
+      bluestore,
+      transaction_initial_state,
+      txc.osr->get_sequencer_id(),
+      txc.seq,
+      throttle_bytes.get_current(),
+      throttle_deferred_bytes.get_current(),
+      pending_kv_ios,
+      pending_deferred_ios,
+      started,
+      completed,
+      ceph::to_seconds<double>(mono_clock::now() - start_throttle_acquire));
+
+    // the rocksdb properties sampled above
+    tracepoint(
+      bluestore,
+      transaction_initial_state_rocksdb,
+      txc.osr->get_sequencer_id(),
+      txc.seq,
+      rocksdb_base_level,
+      rocksdb_estimate_pending_compaction_bytes,
+      rocksdb_cur_size_all_mem_tables,
+      rocksdb_compaction_pending,
+      rocksdb_mem_table_flush_pending,
+      rocksdb_num_running_compactions,
+      rocksdb_num_running_flushes,
+      rocksdb_actual_delayed_write_rate);
+  }
+}
+#endif
+
+// Account the time since txc.last_stamp in perf counter `state`, emit a
+// transaction_state_duration tracepoint for traced transactions (LTTng
+// builds only, and only for the prepare..done state counters), then advance
+// txc.last_stamp to now.  Returns the measured duration.
+mono_clock::duration BlueStore::BlueStoreThrottle::log_state_latency(
+  TransContext &txc, PerfCounters *logger, int state)
+{
+  const mono_clock::time_point stamp = mono_clock::now();
+  const mono_clock::duration elapsed = stamp - txc.last_stamp;
+  logger->tinc(state, elapsed);
+#if defined(WITH_LTTNG)
+  const bool state_is_traced =
+    state >= l_bluestore_state_prepare_lat &&
+    state <= l_bluestore_state_done_lat;
+  if (txc.tracing && state_is_traced) {
+    OID_ELAPSED("", elapsed.to_nsec() / 1000.0, txc.get_state_latency_name(state));
+    tracepoint(
+      bluestore,
+      transaction_state_duration,
+      txc.osr->get_sequencer_id(),
+      txc.seq,
+      state,
+      ceph::to_seconds<double>(elapsed));
+  }
+#endif
+  txc.last_stamp = stamp;
+  return elapsed;
+}
+
+// Take txc.cost from throttle_bytes and, for a deferred transaction, try to
+// take it from throttle_deferred_bytes as well.  Returns false when that
+// second acquisition fails (the caller is expected to finish the job with
+// finish_start_transaction()); otherwise emits the initial tracepoint and
+// returns true.
+bool BlueStore::BlueStoreThrottle::try_start_transaction(
+  KeyValueDB &db,
+  TransContext &txc,
+  mono_clock::time_point start_throttle_acquire)
+{
+  throttle_bytes.get(txc.cost);
+
+  // get_or_fail() is attempted only for deferred transactions
+  if (txc.deferred_txn && !throttle_deferred_bytes.get_or_fail(txc.cost)) {
+    return false;
+  }
+  emit_initial_tracepoint(db, txc, start_throttle_acquire);
+  return true;
+}
+
+// Second half of starting a deferred transaction: take txc.cost from
+// throttle_deferred_bytes with get() and emit the initial tracepoint.
+// Only valid for deferred transactions (asserted).
+void BlueStore::BlueStoreThrottle::finish_start_transaction(
+  KeyValueDB &db,
+  TransContext &txc,
+  mono_clock::time_point start_throttle_acquire)
+{
+  ceph_assert(txc.deferred_txn);
+
+  throttle_deferred_bytes.get(txc.cost);
+
+  emit_initial_tracepoint(db, txc, start_throttle_acquire);
+}
+
+#if defined(WITH_LTTNG)
+// Bookkeeping for a transaction whose kv commit has completed: update the
+// pending/completed counters and, for traced transactions, emit
+// transaction_commit_latency measured from txc.start.
+void BlueStore::BlueStoreThrottle::complete_kv(TransContext &txc)
+{
+  pending_kv_ios -= 1;
+  ++ios_completed_since_last_traced;
+  if (!txc.tracing) {
+    return;
+  }
+  tracepoint(
+    bluestore,
+    transaction_commit_latency,
+    txc.osr->get_sequencer_id(),
+    txc.seq,
+    ceph::to_seconds<double>(mono_clock::now() - txc.start));
+}
+#endif
+
+#if defined(WITH_LTTNG)
+// Bookkeeping for a fully completed transaction: update the pending deferred
+// counter for deferred transactions and, for traced transactions, emit
+// transaction_total_duration measured from txc.start.
+void BlueStore::BlueStoreThrottle::complete(TransContext &txc)
+{
+  if (txc.deferred_txn) {
+    pending_deferred_ios -= 1;
+  }
+  if (!txc.tracing) {
+    return;
+  }
+  const mono_clock::time_point finished = mono_clock::now();
+  const mono_clock::duration total = finished - txc.start;
+  tracepoint(
+    bluestore,
+    transaction_total_duration,
+    txc.osr->get_sequencer_id(),
+    txc.seq,
+    ceph::to_seconds<double>(total));
+}
+#endif
+
+// Histogram labels used by BlueStore::generate_db_histogram(): PREFIX_OBJ
+// keys ending in ONODE_KEY_SUFFIX, all other PREFIX_OBJ keys, and keys
+// whose prefix is not recognized, respectively.
+const string prefix_onode = "o";
+const string prefix_onode_shard = "x";
+const string prefix_other = "Z";
+// Iterates through the db and collects the stats
+// Walks every key/value pair in the database, classifies it by key prefix,
+// and dumps per-category counts plus key/value size totals and maxima into
+// the "rocksdb_key_value_stats" section of `f`, followed by the histogram
+// gathered in `hist`.
+void BlueStore::generate_db_histogram(Formatter *f)
+{
+  //globals
+  uint64_t num_onodes = 0;
+  uint64_t num_shards = 0;
+  uint64_t num_super = 0;
+  uint64_t num_coll = 0;
+  uint64_t num_omap = 0;
+  uint64_t num_pgmeta_omap = 0;
+  uint64_t num_deferred = 0;
+  uint64_t num_alloc = 0;
+  uint64_t num_stat = 0;
+  uint64_t num_others = 0;
+  uint64_t num_shared_shards = 0;
+  size_t max_key_size =0, max_value_size = 0;
+  uint64_t total_key_size = 0, total_value_size = 0;
+  size_t key_size = 0, value_size = 0;
+  KeyValueHistogram hist;
+
+  auto start = coarse_mono_clock::now();
+
+  KeyValueDB::WholeSpaceIterator iter = db->get_wholespace_iterator();
+  iter->seek_to_first();
+  while (iter->valid()) {
+    dout(30) << __func__ << " Key: " << iter->key() << dendl;
+    key_size = iter->key_size();
+    value_size = iter->value_size();
+    hist.value_hist[hist.get_value_slab(value_size)]++;
+    max_key_size = std::max(max_key_size, key_size);
+    max_value_size = std::max(max_value_size, value_size);
+    total_key_size += key_size;
+    total_value_size += value_size;
+
+    // raw_key() yields a pair; .first is compared against the PREFIX_*
+    // constants below and .second is the remainder of the key
+    pair<string,string> key(iter->raw_key());
+
+    if (key.first == PREFIX_SUPER) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_SUPER, key_size, value_size);
+	num_super++;
+    } else if (key.first == PREFIX_STAT) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_STAT, key_size, value_size);
+	num_stat++;
+    } else if (key.first == PREFIX_COLL) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_COLL, key_size, value_size);
+	num_coll++;
+    } else if (key.first == PREFIX_OBJ) {
+      // PREFIX_OBJ holds both onodes and extent shards; they are told apart
+      // by the last byte of the key.
+      // NOTE(review): back() assumes key.second is non-empty - confirm.
+      if (key.second.back() == ONODE_KEY_SUFFIX) {
+	hist.update_hist_entry(hist.key_hist, prefix_onode, key_size, value_size);
+	num_onodes++;
+      } else {
+	hist.update_hist_entry(hist.key_hist, prefix_onode_shard, key_size, value_size);
+	num_shards++;
+      }
+    } else if (key.first == PREFIX_OMAP) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_OMAP, key_size, value_size);
+	num_omap++;
+    } else if (key.first == PREFIX_PERPOOL_OMAP) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_PERPOOL_OMAP, key_size, value_size);
+	num_omap++;
+    } else if (key.first == PREFIX_PERPG_OMAP) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_PERPG_OMAP, key_size, value_size);
+	num_omap++;
+    } else if (key.first == PREFIX_PGMETA_OMAP) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_PGMETA_OMAP, key_size, value_size);
+	num_pgmeta_omap++;
+    } else if (key.first == PREFIX_DEFERRED) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_DEFERRED, key_size, value_size);
+	num_deferred++;
+    } else if (key.first == PREFIX_ALLOC || key.first == PREFIX_ALLOC_BITMAP) {
+	// both allocation prefixes are reported under PREFIX_ALLOC
+	hist.update_hist_entry(hist.key_hist, PREFIX_ALLOC, key_size, value_size);
+	num_alloc++;
+    } else if (key.first == PREFIX_SHARED_BLOB) {
+	hist.update_hist_entry(hist.key_hist, PREFIX_SHARED_BLOB, key_size, value_size);
+	num_shared_shards++;
+    } else {
+	hist.update_hist_entry(hist.key_hist, prefix_other, key_size, value_size);
+	num_others++;
+    }
+    iter->next();
+  }
+
+  // time spent on the scan above; the dump below is not included
+  ceph::timespan duration = coarse_mono_clock::now() - start;
+  f->open_object_section("rocksdb_key_value_stats");
+  f->dump_unsigned("num_onodes", num_onodes);
+  f->dump_unsigned("num_shards", num_shards);
+  f->dump_unsigned("num_super", num_super);
+  f->dump_unsigned("num_coll", num_coll);
+  f->dump_unsigned("num_omap", num_omap);
+  f->dump_unsigned("num_pgmeta_omap", num_pgmeta_omap);
+  f->dump_unsigned("num_deferred", num_deferred);
+  f->dump_unsigned("num_alloc", num_alloc);
+  f->dump_unsigned("num_stat", num_stat);
+  f->dump_unsigned("num_shared_shards", num_shared_shards);
+  f->dump_unsigned("num_others", num_others);
+  f->dump_unsigned("max_key_size", max_key_size);
+  f->dump_unsigned("max_value_size", max_value_size);
+  f->dump_unsigned("total_key_size", total_key_size);
+  f->dump_unsigned("total_value_size", total_value_size);
+  f->close_section();
+
+  hist.dump(f);
+
+  dout(20) << __func__ << " finished in " << duration << " seconds" << dendl;
+
+}
+
+// Tear down all caches: flush every buffer cache shard, clear the onode
+// space of every collection, drop coll_map, and assert that nothing is left
+// behind.  Stray shared blobs are logged and dumped before the assertion
+// fires.
+void BlueStore::_shutdown_cache()
+{
+  dout(10) << __func__ << dendl;
+  for (auto shard : buffer_cache_shards) {
+    shard->flush();
+    ceph_assert(shard->empty());
+  }
+  for (auto& [cid, coll] : coll_map) {
+    coll->onode_space.clear();
+    if (!coll->shared_blob_set.empty()) {
+      derr << __func__ << " stray shared blobs on " << cid << dendl;
+      coll->shared_blob_set.dump<0>(cct);
+    }
+    ceph_assert(coll->onode_space.empty());
+    ceph_assert(coll->shared_blob_set.empty());
+  }
+  coll_map.clear();
+  for (auto shard : onode_cache_shards) {
+    ceph_assert(shard->empty());
+  }
+}
+
+// For external caller.
+// We use a best-effort policy instead, e.g.,
+// we don't care if there are still some pinned onodes/data in the cache
+// after this command is completed.
+// Flush every onode cache shard, then every buffer cache shard.  `os` is
+// not used.  Always returns 0.
+int BlueStore::flush_cache(ostream *os)
+{
+  dout(10) << __func__ << dendl;
+  for (auto shard : onode_cache_shards) {
+    shard->flush();
+  }
+  for (auto shard : buffer_cache_shards) {
+    shard->flush();
+  }
+
+  return 0;
+}
+
+// Zero-pad `padded` with `head_pad` bytes in front and `tail_pad` bytes at
+// the end, and account the added bytes in l_bluestore_write_pad_bytes.
+// Does nothing when both pads are zero.
+void BlueStore::_apply_padding(uint64_t head_pad,
+                               uint64_t tail_pad,
+                               bufferlist& padded)
+{
+  if (!head_pad && !tail_pad) {
+    return;
+  }
+  if (head_pad) {
+    padded.prepend_zero(head_pad);
+  }
+  if (tail_pad) {
+    padded.append_zero(tail_pad);
+  }
+  dout(20) << __func__ << "  can pad head 0x" << std::hex << head_pad
+            << " tail 0x" << tail_pad << std::dec << dendl;
+  logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad);
+}
+
+// Finalize the extent map shards of `o` (resharding once if requested),
+// encode the onode together with its spanning blobs and - when the extent
+// map has no shards - its inline extents, and queue the result in `txn`
+// under PREFIX_OBJ at the onode's key.
+void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
+{
+  // finalize extent_map shards
+  o->extent_map.update(txn, false);
+  if (o->extent_map.needs_reshard()) {
+    o->extent_map.reshard(db, txn);
+    o->extent_map.update(txn, true);
+    // a second reshard request is not honoured: warn and clear the flag
+    if (o->extent_map.needs_reshard()) {
+      dout(20) << __func__ << " warning: still wants reshard, check options?"
+		<< dendl;
+      o->extent_map.clear_needs_reshard();
+    }
+    logger->inc(l_bluestore_onode_reshard);
+  }
+
+  // bound encode
+  size_t bound = 0;
+  denc(o->onode, bound);
+  o->extent_map.bound_encode_spanning_blobs(bound);
+  if (o->onode.extent_map_shards.empty()) {
+    denc(o->extent_map.inline_bl, bound);
+  }
+
+  // encode
+  // the same three parts, in the same order, as the bound computed above
+  bufferlist bl;
+  unsigned onode_part, blob_part, extent_part;
+  {
+    auto p = bl.get_contiguous_appender(bound, true);
+    denc(o->onode, p);
+    onode_part = p.get_logical_offset();
+    o->extent_map.encode_spanning_blobs(p);
+    blob_part = p.get_logical_offset() - onode_part;
+    if (o->onode.extent_map_shards.empty()) {
+      denc(o->extent_map.inline_bl, p);
+    }
+    extent_part = p.get_logical_offset() - onode_part - blob_part;
+  }
+
+  dout(20) << __func__  << " onode " << o->oid << " is " << bl.length()
+	    << " (" << onode_part << " bytes onode + "
+	    << blob_part << " bytes spanning blobs + "
+	    << extent_part << " bytes inline extents)"
+	    << dendl;
+
+
+  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
+// Refresh the bluefs spillover alert text and copy every alert that is
+// currently set into `alerts`, keyed by the alert name.  Runs under qlock.
+void BlueStore::_log_alerts(osd_alert_list_t& alerts)
+{
+  std::lock_guard l(qlock);
+
+  // bluefs usage of the slow device; only queried when bluefs exists and
+  // the slow device is the shared one
+  size_t slow_used = 0;
+  if (bluefs && bluefs_layout.shared_bdev == BlueFS::BDEV_SLOW) {
+    slow_used = bluefs->get_used(BlueFS::BDEV_SLOW);
+  }
+  if (slow_used > 0) {
+    const auto db_used = bluefs->get_used(BlueFS::BDEV_DB);
+    const auto db_total = bluefs->get_total(BlueFS::BDEV_DB);
+    ostringstream msg;
+    msg << "spilled over " << byte_u_t(slow_used)
+        << " metadata from 'db' device (" << byte_u_t(db_used)
+        << " used of " << byte_u_t(db_total) << ") to slow device";
+    spillover_alert = msg.str();
+  } else if (!spillover_alert.empty()) {
+    spillover_alert.clear();
+  }
+
+  // publish an alert whenever its message is non-empty
+  auto publish = [&alerts](const char* name, const string& text) {
+    if (!text.empty()) {
+      alerts.emplace(name, text);
+    }
+  };
+
+  // this one is additionally gated by a config switch
+  if (!spurious_read_errors_alert.empty() &&
+      cct->_conf->bluestore_warn_on_spurious_read_errors) {
+    alerts.emplace("BLUESTORE_SPURIOUS_READ_ERRORS",
+                   spurious_read_errors_alert);
+  }
+  publish("BLUESTORE_DISK_SIZE_MISMATCH", disk_size_mismatch_alert);
+  publish("BLUESTORE_LEGACY_STATFS", legacy_statfs_alert);
+  // so is the spillover alert
+  if (!spillover_alert.empty() &&
+      cct->_conf->bluestore_warn_on_bluefs_spillover) {
+    alerts.emplace("BLUEFS_SPILLOVER", spillover_alert);
+  }
+  publish("BLUESTORE_NO_PER_PG_OMAP", no_per_pg_omap_alert);
+  publish("BLUESTORE_NO_PER_POOL_OMAP", no_per_pool_omap_alert);
+
+  // The compression alert is only raised when at least one compressor is
+  // listed in failed_compressors; failed_cmode, when set, becomes the
+  // leading part of the message.
+  string compression_msg(failed_cmode);
+  if (!failed_compressors.empty()) {
+    if (!compression_msg.empty()) {
+      compression_msg += ", ";
+    }
+    compression_msg += "unable to load:";
+    const char* separator = "";
+    for (const auto& name : failed_compressors) {
+      compression_msg += separator;
+      compression_msg += name;
+      separator = ", ";
+    }
+    alerts.emplace("BLUESTORE_NO_COMPRESSION", compression_msg);
+  }
+}
+
+// Account one allocation request: bump the request counter, add the
+// number of returned extents and the requested size (`need`), and feed
+// each extent length into the allocation histogram.
+// `alloc_size` is not used by the body.
+void BlueStore::_collect_allocation_stats(uint64_t need, uint32_t alloc_size,
+                                          const PExtentVector& extents)
+{
+  ++alloc_stats_count;
+  alloc_stats_size += need;
+  alloc_stats_fragments += extents.size();
+
+  for (const auto& extent : extents) {
+    logger->hinc(l_bluestore_allocate_hist, extent.length, need);
+  }
+}
+
+// Take one "probe" of the allocation counters: read-and-reset them, log
+// the fresh sample together with the kept history, then fold the sample
+// into the history and advance probe_count.
+void BlueStore::_record_allocation_stats()
+{
+  // don't care about data consistency,
+  // fields can be partially modified while making the tuple
+  // (each counter is read and zeroed individually via exchange(0))
+  auto t0 = std::make_tuple(
+    alloc_stats_count.exchange(0),
+    alloc_stats_fragments.exchange(0),
+    alloc_stats_size.exchange(0));
+
+  dout(0) << " allocation stats probe "
+    << probe_count << ":"
+    << " cnt: " << std::get<0>(t0)
+    << " frags: " << std::get<1>(t0)
+    << " size: " << std::get<2>(t0)
+    << dendl;
+
+
+  //
+  // Keep the history for probes from the power-of-two sequence:
+  // -1, -2, -4, -8, -16
+  // 
+  // `base` doubles per history slot; the printed label is
+  // base + (probe_count % base)
+  size_t base = 1;
+  for (auto& t : alloc_stats_history) {
+    dout(0) << " probe -"
+      << base + (probe_count % base) << ": "
+      << std::get<0>(t)
+      << ",  " << std::get<1>(t)
+      << ", " << std::get<2>(t)
+      << dendl;
+    base <<= 1;
+  }
+  dout(0) << "------------" << dendl;
+
+  ++ probe_count;
+
+  // Walk from the last slot down to slot 1: slot i takes over the sample
+  // of slot i-1 whenever the (already incremented) probe_count is a
+  // multiple of 2^i.  Going backwards means a sample is copied onward
+  // before its own slot is overwritten.
+  for (ssize_t i = alloc_stats_history.size() - 1 ; i > 0 ; --i) {
+    if ((probe_count % (1 << i)) == 0) {
+      alloc_stats_history[i] = alloc_stats_history[i - 1];
+    }
+  }
+  // slot 0 always receives the sample taken above
+  alloc_stats_history[0].swap(t0);
+}
+
+// ===========================================
+// BlueStoreRepairer
+
+// Reduce the per-granule bloom filter vectors to the granules touched by
+// `extents`: for every non-empty extent the covered granule positions are
+// visited and each position whose filter has elements is moved into the
+// reduced vectors, which then replace the originals.
+// May be called only once per tracker (asserted).
+// Returns the number of filters kept.
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+  const interval_set<uint64_t>& extents)
+{
+  ceph_assert(granularity); // initialized
+  // can't call for the second time
+  ceph_assert(!was_filtered_out);
+  ceph_assert(collections_bfs.size() == objects_bfs.size());
+
+  // first position not yet visited by a previous extent
+  uint64_t prev_pos = 0;
+  // one past the last valid position
+  uint64_t npos = collections_bfs.size();
+
+  bloom_vector collections_reduced;
+  bloom_vector objects_reduced;
+
+  for (auto e : extents) {
+    // e.first is the extent offset, e.second its length
+    if (e.second == 0) {
+      continue;
+    }
+    // start at the extent's first granule, but never before prev_pos so
+    // that a granule already handled for an earlier extent is not
+    // visited (and moved from) again
+    uint64_t pos = max(e.first / granularity, prev_pos);
+    // one past the granule holding the extent's last byte
+    uint64_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+    while (pos != npos && pos < end_pos)  {
+        ceph_assert( collections_bfs[pos].element_count() ==
+          objects_bfs[pos].element_count());
+        // only filters that actually hold elements are kept
+        if (collections_bfs[pos].element_count()) {
+          collections_reduced.push_back(std::move(collections_bfs[pos]));
+          objects_reduced.push_back(std::move(objects_bfs[pos]));
+        }
+        ++pos;
+    }
+    prev_pos = end_pos;
+  }
+  // install the reduced vectors; the old (partly moved-from) ones are
+  // dropped when the locals go out of scope
+  collections_reduced.swap(collections_bfs);
+  objects_reduced.swap(objects_bfs);
+  was_filtered_out = true;
+  return collections_bfs.size();
+}
+
+// Queue removal of `prefix`/`key` in the shared remove-key transaction,
+// creating that transaction from `db` on first use, and count one repair.
+// Always returns true.
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+                                   const string& prefix,
+                                   const string& key)
+{
+  std::lock_guard guard(lock);
+  if (remove_key_txn == nullptr) {
+    remove_key_txn = db->get_transaction();
+  }
+  remove_key_txn->rmkey(prefix, key);
+  ++to_repair_cnt;
+  return true;
+}
+
+// Prepare a transaction that stores the stringified `val` under
+// PREFIX_SUPER / "per_pool_omap" and count one repair.  Asserts that no
+// such transaction has been prepared already.
+void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db, int val)
+{
+  std::lock_guard guard(lock); // possibly redundant
+  ceph_assert(fix_per_pool_omap_txn == nullptr);
+  fix_per_pool_omap_txn = db->get_transaction();
+
+  bufferlist value_bl;
+  value_bl.append(stringify(val));
+  fix_per_pool_omap_txn->set(PREFIX_SUPER, "per_pool_omap", value_bl);
+  ++to_repair_cnt;
+}
+
+// Queue a fix for shared blob `sbid` into the caller-supplied `txn`.
+// With a non-null `ref_map` the record is rewritten from that map (the
+// map is moved from); with a null `ref_map` the record is removed.
+// `repaired` is added to the repair counter.  Always returns true.
+bool BlueStoreRepairer::fix_shared_blob(
+  KeyValueDB::Transaction txn,
+  uint64_t sbid,
+  bluestore_extent_ref_map_t* ref_map,
+  size_t repaired)
+{
+  string sb_key;
+  get_shared_blob_key(sbid, &sb_key);
+
+  if (ref_map == nullptr) {
+    // no ref map supplied: drop the stored record
+    txn->rmkey(PREFIX_SHARED_BLOB, sb_key);
+  } else {
+    bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
+    bufferlist encoded;
+    encode(persistent, encoded);
+    txn->set(PREFIX_SHARED_BLOB, sb_key, encoded);
+  }
+  to_repair_cnt += repaired;
+  return true;
+}
+
+// Queue a rewrite of the statfs record `key` (under PREFIX_STAT) with the
+// values from `new_statfs`, creating the statfs transaction from `db` on
+// first use, and count one repair.  Always returns true.
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+                                   const string& key,
+                                   const store_statfs_t& new_statfs)
+{
+  std::lock_guard guard(lock);
+  if (fix_statfs_txn == nullptr) {
+    fix_statfs_txn = db->get_transaction();
+  }
+
+  // convert to the volatile representation and encode that
+  BlueStore::volatile_statfs vstatfs;
+  vstatfs = new_statfs;
+  bufferlist encoded;
+  vstatfs.encode(encoded);
+
+  fix_statfs_txn->set(PREFIX_STAT, key, encoded);
+  ++to_repair_cnt;
+  return true;
+}
+
+// Queue a freelist release of [offset, offset+len) into the leaked-space
+// transaction (created from `db` on first use) and count one repair.
+// `fm` must not be a null manager (asserted).  Always returns true.
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+                                   FreelistManager* fm,
+                                   uint64_t offset, uint64_t len)
+{
+  std::lock_guard guard(lock);
+  ceph_assert(!fm->is_null_manager());
+
+  if (fix_fm_leaked_txn == nullptr) {
+    fix_fm_leaked_txn = db->get_transaction();
+  }
+  fm->release(offset, len, fix_fm_leaked_txn);
+  ++to_repair_cnt;
+  return true;
+}
+// Queue a freelist allocation of [offset, offset+len) into the false-free
+// transaction (created from `db` on first use) and count one repair.
+// `fm` must not be a null manager (asserted).  Always returns true.
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+                                       FreelistManager* fm,
+                                       uint64_t offset, uint64_t len)
+{
+  std::lock_guard guard(lock);
+  ceph_assert(!fm->is_null_manager());
+
+  if (fix_fm_false_free_txn == nullptr) {
+    fix_fm_false_free_txn = db->get_transaction();
+  }
+  fm->allocate(offset, len, fix_fm_false_free_txn);
+  ++to_repair_cnt;
+  return true;
+}
+
+// Run the caller's fix-up `f` against the onode-fix transaction (created
+// from `db` on first use) while holding the repairer lock, then count one
+// repair.  Always returns true.
+bool BlueStoreRepairer::fix_spanning_blobs(
+  KeyValueDB* db,
+  std::function<void(KeyValueDB::Transaction)> f)
+{
+  std::lock_guard guard(lock);
+  if (fix_onode_txn == nullptr) {
+    fix_onode_txn = db->get_transaction();
+  }
+  f(fix_onode_txn);
+  ++to_repair_cnt;
+  return true;
+}
+
+// If any misreferenced extents were recorded, narrow the space usage
+// tracker down to them and make sure the misreference-fix transaction
+// exists.  Returns true in that case and false when there is nothing to
+// process.
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
+  //NB: not for use in multithreading mode!!!
+  if (!misreferenced_extents.size()) {
+    return false;
+  }
+
+  size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+  ceph_assert(n > 0);
+  if (fix_misreferences_txn == nullptr) {
+    fix_misreferences_txn = db->get_transaction();
+  }
+  return true;
+}
+
+// Synchronously submit every prepared repair transaction to `db` (in a
+// fixed order), compact the DB if that was requested, and return the
+// number of repairs counted so far while resetting that counter.
+// A failed submit trips an assertion.
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+  //NB: not for use in multithreading mode!!!
+
+  // submit `txn` if it was prepared and forget it afterwards
+  auto submit_pending = [db](KeyValueDB::Transaction& txn) {
+    if (!txn) {
+      return;
+    }
+    auto ok = db->submit_transaction_sync(txn) == 0;
+    ceph_assert(ok);
+    txn = nullptr;
+  };
+
+  // the order below is the order in which the fixes hit the DB
+  submit_pending(fix_per_pool_omap_txn);
+  submit_pending(fix_fm_leaked_txn);
+  submit_pending(fix_fm_false_free_txn);
+  submit_pending(remove_key_txn);
+  submit_pending(fix_misreferences_txn);
+  submit_pending(fix_onode_txn);
+  submit_pending(fix_shared_blob_txn);
+  submit_pending(fix_statfs_txn);
+
+  if (need_compact) {
+    db->compact();
+    need_compact = false;
+  }
+
+  const unsigned repaired = to_repair_cnt;
+  to_repair_cnt = 0;
+  return repaired;
+}
+
+// =======================================================
+// RocksDBBlueFSVolumeSelector
+
+// Map a level hint to the preferred BlueFS device.
+// `h` is not dereferenced: its numeric value is the LEVEL_* hint (it must
+// be non-null).  LOG and WAL hints map to BDEV_WAL, DB and unknown hints
+// to BDEV_DB.  A SLOW hint maps to BDEV_SLOW unless the estimate below
+// says there is spare room on the DB device, in which case BDEV_DB is
+// returned.
+uint8_t RocksDBBlueFSVolumeSelector::select_prefer_bdev(void* h) {
+  ceph_assert(h != nullptr);
+  uint64_t hint = reinterpret_cast<uint64_t>(h);
+  uint8_t res;
+  switch (hint) {
+  case LEVEL_SLOW:
+    res = BlueFS::BDEV_SLOW;
+    if (db_avail4slow > 0) {
+      // considering statically available db space vs.
+      // - observed maximums on DB dev for DB/WAL/UNSORTED data
+      // - observed maximum spillovers
+      uint64_t max_db_use = 0; // max db usage we potentially observed
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_LOG - LEVEL_FIRST);
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_WAL - LEVEL_FIRST);
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_DB, LEVEL_DB - LEVEL_FIRST);
+      // this could go to db hence using it in the estimation
+      max_db_use += per_level_per_dev_max.at(BlueFS::BDEV_SLOW, LEVEL_DB - LEVEL_FIRST);
+
+      // room left for SLOW data on the DB device: the DB total minus the
+      // observed maximum (clamped at 0), capped by db_avail4slow
+      auto db_total = l_totals[LEVEL_DB - LEVEL_FIRST];
+      uint64_t avail = min(
+        db_avail4slow,
+        max_db_use < db_total ? db_total - max_db_use : 0);
+
+      // considering current DB dev usage for SLOW data
+      if (avail > per_level_per_dev_usage.at(BlueFS::BDEV_DB, LEVEL_SLOW - LEVEL_FIRST)) {
+        res = BlueFS::BDEV_DB;
+      }
+    }
+    break;
+  case LEVEL_LOG:
+  case LEVEL_WAL:
+    res = BlueFS::BDEV_WAL;
+    break;
+  case LEVEL_DB:
+  default:
+    res = BlueFS::BDEV_DB;
+    break;
+  }
+  return res;
+}
+
+// Append two (path, size) entries to `res`: `base` with the DB total and
+// `base` + ".slow" with the SLOW total.  When the SLOW total is zero the
+// DB total is used for the second entry as well.
+void RocksDBBlueFSVolumeSelector::get_paths(const std::string& base, paths& res) const
+{
+  const auto db_size = l_totals[LEVEL_DB - LEVEL_FIRST];
+  const auto slow_total = l_totals[LEVEL_SLOW - LEVEL_FIRST];
+
+  res.emplace_back(base, db_size);
+  res.emplace_back(base + ".slow", slow_total != 0 ? slow_total : db_size);
+}
+
+// Derive the level hint from a directory name.  The result is the LEVEL_*
+// value packed into a pointer, not a real address: LEVEL_SLOW for names
+// ending in ".slow", LEVEL_WAL for names ending in ".wal", LEVEL_DB for
+// everything else (including any name of 5 characters or fewer).
+void* RocksDBBlueFSVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
+  // the "db.slow" and "db.wal" directory names are hard-coded to
+  // match up with bluestore.  the slow device is always the second
+  // one (when a dedicated block.db device is present and used at
+  // bdev 0).  the wal device is always last.
+  const bool long_enough = dirname.length() > 5;
+
+  uint8_t level = LEVEL_DB;
+  if (long_enough && boost::algorithm::ends_with(dirname, ".slow")) {
+    level = LEVEL_SLOW;
+  } else if (long_enough && boost::algorithm::ends_with(dirname, ".wal")) {
+    level = LEVEL_WAL;
+  }
+  return reinterpret_cast<void*>(level);
+}
+
+// Write a fixed-width text report to `sout`: the per-level/per-device
+// usage matrix (with a per-level file count column), the matrix of
+// observed maximums, and a final row with the WAL/DB/SLOW totals.
+void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
+  auto max_x = per_level_per_dev_usage.get_max_x();
+  auto max_y = per_level_per_dev_usage.get_max_y();
+  const size_t width = 12;
+
+  // prepare the stream for the next left-justified, fixed-width column
+  auto begin_cell = [&]() {
+    sout.setf(std::ios::left, std::ios::adjustfield);
+    sout.width(width);
+  };
+  // print the row label of level index `l`; unknown levels print nothing
+  auto put_level_name = [&](size_t l) {
+    switch (l + LEVEL_FIRST) {
+    case LEVEL_LOG:
+      sout << "LOG"; break;
+    case LEVEL_WAL:
+      sout << "WAL"; break;
+    case LEVEL_DB:
+      sout << "DB"; break;
+    case LEVEL_SLOW:
+      sout << "SLOW"; break;
+    case LEVEL_MAX:
+      sout << "TOTAL"; break;
+    }
+  };
+
+  // --- usage matrix ---
+  sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
+  constexpr std::array<const char*, 8> names{ {
+    "DEV/LEV",
+    "WAL",
+    "DB",
+    "SLOW",
+    "*",
+    "*",
+    "REAL",
+    "FILES",
+  } };
+  for (const char* title : names) {
+    begin_cell();
+    sout << title;
+  }
+  sout << std::endl;
+  for (size_t l = 0; l < max_y; ++l) {
+    begin_cell();
+    put_level_name(l);
+    for (size_t d = 0; d < max_x; ++d) {
+      begin_cell();
+      sout << stringify(byte_u_t(per_level_per_dev_usage.at(d, l)));
+    }
+    begin_cell();
+    sout << stringify(per_level_files[l]) << std::endl;
+  }
+
+  // --- observed maximums; same dimensions as the usage matrix ---
+  ceph_assert(max_x == per_level_per_dev_max.get_max_x());
+  ceph_assert(max_y == per_level_per_dev_max.get_max_y());
+  sout << "MAXIMUMS:" << std::endl;
+  for (size_t l = 0; l < max_y; ++l) {
+    begin_cell();
+    put_level_name(l);
+    for (size_t d = 0; d < max_x - 1; ++d) {
+      begin_cell();
+      sout << stringify(byte_u_t(per_level_per_dev_max.at(d, l)));
+    }
+    // the last column closes the row
+    begin_cell();
+    sout << stringify(byte_u_t(per_level_per_dev_max.at(max_x - 1, l)));
+    sout << std::endl;
+  }
+
+  // --- totals row ---
+  string sizes[] = {
+    ">> SIZE <<",
+    stringify(byte_u_t(l_totals[LEVEL_WAL - LEVEL_FIRST])),
+    stringify(byte_u_t(l_totals[LEVEL_DB - LEVEL_FIRST])),
+    stringify(byte_u_t(l_totals[LEVEL_SLOW - LEVEL_FIRST])),
+  };
+  for (const auto& cell : sizes) {
+    begin_cell();
+    sout << cell;
+  }
+  sout << std::endl;
+}
+
+// Return a newly allocated selector constructed with all-zero numeric
+// arguments and a final `false`; the caller receives the raw pointer.
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
+  return new RocksDBBlueFSVolumeSelector(0, 0, 0,
+                                         0, 0, 0,
+                                         0, 0, false);
+}
+
+// Compare this selector's accounting with `other`, which must be a
+// RocksDBBlueFSVolumeSelector (asserted).  Returns true when every cell
+// of the usage matrix and every per-level file count match.  All cells
+// are always visited; there is no early exit.
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
+  RocksDBBlueFSVolumeSelector* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+  ceph_assert(o);
+
+  bool same = true;
+  for (size_t dev = 0; dev < BlueFS::MAX_BDEV + 1; ++dev) {
+    for (size_t lvl = 0; lvl < LEVEL_MAX - LEVEL_FIRST + 1; ++lvl) {
+      if (!(per_level_per_dev_usage.at(dev, lvl) == o->per_level_per_dev_usage.at(dev, lvl))) {
+        same = false;
+      }
+    }
+  }
+  for (size_t lvl = 0; lvl < LEVEL_MAX - LEVEL_FIRST + 1; ++lvl) {
+    if (!(per_level_files[lvl] == o->per_level_files[lvl])) {
+      same = false;
+    }
+  }
+  return same;
+}
+
+// =======================================================
+
+//================================================================================================================
+// BlueStore commits all allocation information (alloc/release) into RocksDB before the client Write is performed.
+// This causes a delay in the write path and adds significant load to the CPU/Memory/Disk.
+// The reason for the RocksDB updates is that it allows Ceph to survive any failure without losing the allocation state.
+//
+// We changed the code to skip the RocksDB updates at allocation time and instead perform a full destage of the allocator object
+// with all the OSD allocation state in a single step during umount().
+// This change leads to a 25% increase in IOPS and reduced latency in small random-write workloads, but exposes the system
+// to losing allocation info in failure cases where we don't call umount.
+// We add code to perform a full allocation-map rebuild from information stored inside the ONode which is used in failure cases.
+// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file
+// where we store the allocation-map during umount().
+//================================================================================================================
+
+// From here on log lines are prefixed with "bluestore::NCB::<function>::".
+#undef dout_prefix
+#define dout_prefix *_dout << "bluestore::NCB::" << __func__ << "::"
+
+// names of the bluefs directory and file that hold the stored allocation map
+static const std::string allocator_dir    = "ALLOCATOR_NCB_DIR";
+static const std::string allocator_file   = "ALLOCATOR_NCB_FILE";
+static uint32_t    s_format_version = 0x01; // support future changes to allocator-map file
+static uint32_t    s_serial         = 0x01; // serial value written into the image header and trailer
+
+// Byte-order conversion helpers for the allocator image.  The active
+// branch maps them to the little-endian conversions; the disabled branch
+// swaps in the big-endian ones.
+#if 1
+#define CEPHTOH_32 le32toh
+#define CEPHTOH_64 le64toh
+#define HTOCEPH_32 htole32
+#define HTOCEPH_64 htole64
+#else
+// help debug the encode/decode by forcing alien format
+#define CEPHTOH_32 be32toh
+#define CEPHTOH_64 be64toh
+#define HTOCEPH_32 htobe32
+#define HTOCEPH_64 htobe64
+#endif
+
+// 48 Bytes header for on-disk allocator image
+// (the trailing hex comments on the fields are their byte offsets)
+const uint64_t ALLOCATOR_IMAGE_VALID_SIGNATURE = 0x1FACE0FF;
+struct allocator_image_header {
+  uint32_t format_version;	// 0x00
+  uint32_t valid_signature;	// 0x04
+  utime_t  timestamp;		// 0x08
+  uint32_t serial;		// 0x10
+  uint32_t pad[0x7];		// 0x14
+
+  // all-zero header (note: the signature is zero too, so verify() on a
+  // default-constructed header fails)
+  allocator_image_header() {
+    memset((char*)this, 0, sizeof(allocator_image_header));
+  }
+
+  // create header in CEPH format
+  // (sets the valid signature and zeroes the padding)
+  allocator_image_header(utime_t timestamp, uint32_t format_version, uint32_t serial) {
+    this->format_version  = format_version;
+    this->timestamp       = timestamp;
+    this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
+    this->serial          = serial;
+    memset(this->pad, 0, sizeof(this->pad));
+  }
+
+  // human readable dump, one field per line; pad words are printed only
+  // when they are non-zero
+  friend std::ostream& operator<<(std::ostream& out, const allocator_image_header& header) {
+    out << "format_version  = " << header.format_version << std::endl;
+    out << "valid_signature = " << header.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
+    out << "timestamp       = " << header.timestamp << std::endl;
+    out << "serial          = " << header.serial << std::endl;
+    for (unsigned i = 0; i < sizeof(header.pad)/sizeof(uint32_t); i++) {
+      if (header.pad[i]) {
+	out << "header.pad[" << i << "] = " << header.pad[i] << std::endl;
+      }
+    }
+    return out;
+  }
+
+  // encoding order: format_version, valid_signature, timestamp seconds,
+  // timestamp nanoseconds, serial, then every pad word
+  DENC(allocator_image_header, v, p) {
+    denc(v.format_version, p);
+    denc(v.valid_signature, p);
+    denc(v.timestamp.tv.tv_sec, p);
+    denc(v.timestamp.tv.tv_nsec, p);
+    denc(v.serial, p);
+    for (auto& pad: v.pad) {
+      denc(pad, p);
+    }
+  }
+
+
+  // Returns 0 when the signature matches and every pad word is zero,
+  // -1 (after logging) otherwise.  `path` is not used by the body.
+  int verify(CephContext* cct, const std::string &path) {
+    if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
+      for (unsigned i = 0; i < (sizeof(pad) / sizeof(uint32_t)); i++) {
+	if (this->pad[i]) {
+	  derr << "Illegal Header - pad[" << i << "]="<< pad[i] << dendl;
+	  return -1;
+	}
+      }
+      return 0;
+    }
+    else {
+      derr << "Illegal Header - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
+      return -1;
+    }
+  }
+};
+WRITE_CLASS_DENC(allocator_image_header)
+
+// 56 Bytes trailer for on-disk allocator image
+// (the trailing hex comments on the fields are their byte offsets)
+struct allocator_image_trailer {
+  extent_t null_extent;         // 0x00
+
+  uint32_t format_version;	// 0x10
+  uint32_t valid_signature;	// 0x14
+
+  utime_t  timestamp;		// 0x18
+
+  uint32_t serial;		// 0x20
+  uint32_t pad;		// 0x24
+  uint64_t entries_count;	// 0x28
+  uint64_t allocation_size;	// 0x30
+
+  // trailer is created in CEPH format
+  // (null_extent and pad are zeroed, the valid signature is set)
+  allocator_image_trailer(utime_t timestamp, uint32_t format_version, uint32_t serial, uint64_t entries_count, uint64_t allocation_size) {
+    memset((char*)&(this->null_extent), 0, sizeof(this->null_extent));
+    this->format_version  = format_version;
+    this->valid_signature = ALLOCATOR_IMAGE_VALID_SIGNATURE;
+    this->timestamp       = timestamp;
+    this->serial          = serial;
+    this->pad             = 0;
+    this->entries_count   = entries_count;
+    this->allocation_size = allocation_size;
+  }
+
+  // all-zero trailer (the signature is zero too, so verify() on a
+  // default-constructed trailer fails)
+  allocator_image_trailer() {
+    memset((char*)this, 0, sizeof(allocator_image_trailer));
+  }
+
+  // human readable dump, one field per line; null_extent and pad are
+  // printed only when they are non-zero
+  friend std::ostream& operator<<(std::ostream& out, const allocator_image_trailer& trailer) {
+    if (trailer.null_extent.offset || trailer.null_extent.length) {
+      out << "trailer.null_extent.offset = " << trailer.null_extent.offset << std::endl;
+      out << "trailer.null_extent.length = " << trailer.null_extent.length << std::endl;
+    }
+    out << "format_version  = " << trailer.format_version << std::endl;
+    out << "valid_signature = " << trailer.valid_signature << "/" << ALLOCATOR_IMAGE_VALID_SIGNATURE << std::endl;
+    out << "timestamp       = " << trailer.timestamp << std::endl;
+    out << "serial          = " << trailer.serial << std::endl;
+    if (trailer.pad) {
+      out << "trailer.pad= " << trailer.pad << std::endl;
+    }
+    out << "entries_count   = " << trailer.entries_count   << std::endl;
+    out << "allocation_size = " << trailer.allocation_size << std::endl;
+    return out;
+  }
+
+  // Check this trailer against the header `p_header` and against the
+  // `entries_count` / `allocation_size` values supplied by the caller.
+  // Returns 0 when the signature is valid, the null extent and pad are
+  // zero, serial/format_version/timestamp equal the header's and both
+  // counters match; otherwise logs the first mismatch and returns -1.
+  // `path` is not used by the body.
+  int verify(CephContext* cct, const std::string &path, const allocator_image_header *p_header, uint64_t entries_count, uint64_t allocation_size) {
+    if (valid_signature == ALLOCATOR_IMAGE_VALID_SIGNATURE) {
+
+      // trailer must start with a null extent (both fields set to zero) [no need to convert formats for zero]
+      if (null_extent.offset || null_extent.length) {
+	derr << "illegal trailer - null_extent = [" << null_extent.offset << "," << null_extent.length << "]"<< dendl;
+	return -1;
+      }
+
+      if (serial != p_header->serial) {
+	derr << "Illegal trailer: header->serial(" << p_header->serial << ") != trailer->serial(" << serial << ")" << dendl;
+	return -1;
+      }
+
+      if (format_version != p_header->format_version) {
+	derr << "Illegal trailer: header->format_version(" << p_header->format_version
+	     << ") != trailer->format_version(" << format_version << ")" << dendl;
+	return -1;
+      }
+
+      if (timestamp != p_header->timestamp) {
+	derr << "Illegal trailer: header->timestamp(" << p_header->timestamp
+	     << ") != trailer->timestamp(" << timestamp << ")" << dendl;
+	return -1;
+      }
+
+      // the parameters shadow the members of the same name, hence this->
+      if (this->entries_count != entries_count) {
+	derr << "Illegal trailer: entries_count(" << entries_count << ") != trailer->entries_count("
+	     << this->entries_count << ")" << dendl;
+	return -1;
+      }
+
+      if (this->allocation_size != allocation_size) {
+	derr << "Illegal trailer: allocation_size(" << allocation_size << ") != trailer->allocation_size("
+	     << this->allocation_size << ")" << dendl;
+	return -1;
+      }
+
+      if (pad) {
+	derr << "Illegal Trailer - pad="<< pad << dendl;
+	return -1;
+      }
+
+      // if arrived here -> trailer is valid !!
+      return 0;
+    } else {
+      derr << "Illegal Trailer - signature="<< valid_signature << "(" << ALLOCATOR_IMAGE_VALID_SIGNATURE << ")" << dendl;
+      return -1;
+    }
+  }
+
+  // encoding order follows the field layout above, with the timestamp
+  // written as seconds then nanoseconds
+  DENC(allocator_image_trailer, v, p) {
+    denc(v.null_extent.offset, p);
+    denc(v.null_extent.length, p);
+    denc(v.format_version, p);
+    denc(v.valid_signature, p);
+    denc(v.timestamp.tv.tv_sec, p);
+    denc(v.timestamp.tv.tv_nsec, p);
+    denc(v.serial, p);
+    denc(v.pad, p);
+    denc(v.entries_count, p);
+    denc(v.allocation_size, p);
+  }
+};
+WRITE_CLASS_DENC(allocator_image_trailer)
+
+
+//-------------------------------------------------------------------------------------
+// Invalidate the old allocation file (if it exists) by truncating it to
+// zero length, so that we go directly to recovery after a failure.
+// A missing directory or file is safely ignored.
+// Side effect: need_to_destage_allocation_file is set unconditionally.
+// Returns 0 on success or when there is nothing to invalidate, -1 when
+// the file could not be opened for write or truncated.
+int BlueStore::invalidate_allocation_file_on_bluefs()
+{
+  // mark that allocation-file was invalidated and we should destage a new copy when closing db
+  need_to_destage_allocation_file = true;
+  dout(10) << __func__ << " need_to_destage_allocation_file was set" << dendl;
+
+  if (!bluefs->dir_exists(allocator_dir)) {
+    dout(5) << "allocator_dir(" << allocator_dir << ") doesn't exist" << dendl;
+    // nothing to do -> return
+    return 0;
+  }
+
+  int ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
+  if (ret != 0) {
+    dout(5) << __func__ << " allocator_file(" << allocator_file << ") doesn't exist" << dendl;
+    // nothing to do -> return
+    return 0;
+  }
+
+  // the file exists: open it in overwrite mode and cut it down to nothing
+  BlueFS::FileWriter *p_handle = nullptr;
+  ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, true);
+  if (ret != 0) {
+    derr << __func__ << "::NCB:: Failed open_for_write with error-code "
+         << ret << dendl;
+    return -1;
+  }
+
+  dout(5) << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
+  ret = bluefs->truncate(p_handle, 0);
+  if (ret != 0) {
+    derr << __func__ << "::NCB:: Failed truncate with error-code "
+         << ret << dendl;
+    bluefs->close_writer(p_handle);
+    return -1;
+  }
+
+  // push the truncation out before releasing the writer
+  bluefs->fsync(p_handle);
+  bluefs->close_writer(p_handle);
+
+  return 0;
+}
+
+//-----------------------------------------------------------------------------------
+// Copy the extents reported by src_alloc->foreach() into dest_alloc via
+// init_add_free().  The extents are first staged in a temporary array so
+// that dest_alloc is only touched after the source walk has finished.
+// On success returns 0 and stores the number of copied extents in
+// *p_num_entries; returns -1 when the staging array cannot be allocated.
+int BlueStore::copy_allocator(Allocator* src_alloc, Allocator* dest_alloc, uint64_t* p_num_entries)
+{
+  // pass 1: count the entries so the staging array can be sized
+  *p_num_entries = 0;
+  src_alloc->foreach([&](uint64_t, uint64_t) {
+    ++(*p_num_entries);
+  });
+  dout(5) << "count num_entries=" << *p_num_entries << dendl;
+
+  // add 16K extra entries in case new allocation happened
+  *p_num_entries += 16*1024;
+  unique_ptr<extent_t[]> staged;
+  try {
+    staged = make_unique<extent_t[]>(*p_num_entries);
+  } catch (std::bad_alloc&) {
+    derr << "****Failed dynamic allocation, num_entries=" << *p_num_entries << dendl;
+    return -1;
+  }
+
+  // pass 2: stage every non-empty extent; idx keeps counting even past
+  // the array capacity so that an overflow can be detected afterwards
+  uint64_t idx = 0;
+  src_alloc->foreach([&](uint64_t extent_offset, uint64_t extent_length) {
+    if (extent_length == 0) {
+      derr << "zero length extent!!! offset=" << extent_offset << ", index=" << idx << dendl;
+      return;
+    }
+    if (idx < *p_num_entries) {
+      staged[idx] = {extent_offset, extent_length};
+    }
+    ++idx;
+  });
+  dout(5) << "copy num_entries=" << idx << dendl;
+
+  // more entries than capacity: log it and fail the assertion
+  if (idx > *p_num_entries) {
+    derr << "****spillover, num_entries=" << *p_num_entries << ", spillover=" << (idx - *p_num_entries) << dendl;
+    ceph_assert(idx <= *p_num_entries);
+  }
+
+  // report the real count and replay the staged extents into dest_alloc
+  *p_num_entries = idx;
+  for (uint64_t i = 0; i < idx; ++i) {
+    dest_alloc->init_add_free(staged[i].offset, staged[i].length);
+  }
+  return 0;
+}
+
+//-----------------------------------------------------------------------------------
+// Append the bytes in [buffer, p_curr) to `p_handle`, fold them into the
+// running `crc`, then append the updated crc (converted with HTOCEPH_32).
+// Returns the updated crc in host order so the caller can keep chaining.
+static uint32_t flush_extent_buffer_with_crc(BlueFS::FileWriter *p_handle, const char* buffer, const char *p_curr, uint32_t crc)
+{
+  const std::ptrdiff_t nbytes = p_curr - buffer;
+
+  // payload first ...
+  p_handle->append(buffer, nbytes);
+
+  // ... followed by the crc covering everything written so far
+  crc = ceph_crc32c(crc, (const uint8_t*)buffer, nbytes);
+  uint32_t wire_crc = HTOCEPH_32(crc);
+  p_handle->append((byte*)&wire_crc, sizeof(wire_crc));
+
+  return crc;
+}
+
+const unsigned MAX_EXTENTS_IN_BUFFER = 4 * 1024; // 4K extents = 64KB of data
+// write the allocator to a flat bluefs file - 4K extents at a time
+//
+// File layout as written below:
+//   header + crc of the header,
+//   extents in chunks of up to MAX_EXTENTS_IN_BUFFER entries, each chunk
+//   followed by the running crc over all extents written so far,
+//   trailer + crc of the trailer.
+// The db must be closed when this is called (asserted).
+// Returns 0 on success and clears need_to_destage_allocation_file;
+// returns -1 on failure (the file is truncated to zero when a
+// zero-length extent is met).
+//-----------------------------------------------------------------------------------
+int BlueStore::store_allocator(Allocator* src_allocator)
+{
+  // when storing allocations to file we must be sure there is no background compactions
+  // the easiest way to achieve it is to make sure db is closed
+  ceph_assert(db == nullptr);
+  utime_t  start_time = ceph_clock_now();
+  int ret = 0;
+
+  // create dir if doesn't exist already
+  if (!bluefs->dir_exists(allocator_dir) ) {
+    ret = bluefs->mkdir(allocator_dir);
+    if (ret != 0) {
+      derr << "Failed mkdir with error-code " << ret << dendl;
+      return -1;
+    }
+  }
+  bluefs->compact_log();
+  // reuse previous file-allocation if exists
+  ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
+  bool overwrite_file = (ret == 0);
+  BlueFS::FileWriter *p_handle = nullptr;
+  ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
+  if (ret != 0) {
+    // the log prefix already carries the function name
+    derr << "Failed open_for_write with error-code " << ret << dendl;
+    return -1;
+  }
+
+  uint64_t file_size = p_handle->file->fnode.size;
+  uint64_t allocated = p_handle->file->fnode.get_allocated();
+  dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
+
+  bluefs->sync_metadata(false);
+  unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
+  if (!allocator) {
+    bluefs->close_writer(p_handle);
+    return -1;
+  }
+
+  // store all extents (except for the bluefs extents we removed) in a single flat file
+  utime_t                 timestamp = ceph_clock_now();
+  uint32_t                crc       = -1;
+  {
+    // header, immediately followed by its own crc
+    allocator_image_header  header(timestamp, s_format_version, s_serial);
+    bufferlist              header_bl;
+    encode(header, header_bl);
+    crc = header_bl.crc32c(crc);
+    encode(crc, header_bl);
+    p_handle->append(header_bl);
+  }
+
+  crc = -1;					 // reset crc
+  extent_t        buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
+  extent_t       *p_curr          = buffer;
+  const extent_t *p_end           = buffer + MAX_EXTENTS_IN_BUFFER;
+  uint64_t        extent_count    = 0;
+  uint64_t        allocation_size = 0;
+  auto iterated_allocation = [&](uint64_t extent_offset, uint64_t extent_length) {
+    if (extent_length == 0) {
+      // remember the failure; it is acted upon once the walk is over
+      derr << "zero length extent!!! index=" << extent_count << "::[" << extent_offset << "," << extent_length << "]" << dendl;
+      ret = -1;
+      return;
+    }
+    p_curr->offset = HTOCEPH_64(extent_offset);
+    p_curr->length = HTOCEPH_64(extent_length);
+    extent_count++;
+    allocation_size += extent_length;
+    p_curr++;
+
+    // buffer is full -> write it out together with the running crc
+    if (p_curr == p_end) {
+      crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
+      p_curr = buffer; // recycle the buffer
+    }
+  };
+  allocator->foreach(iterated_allocation);
+  // if got null extent -> fail the operation
+  if (ret != 0) {
+    derr << "Illegal extent, fail store operation" << dendl;
+    derr << "invalidate using bluefs->truncate(p_handle, 0)" << dendl;
+    bluefs->truncate(p_handle, 0);
+    bluefs->close_writer(p_handle);
+    return -1;
+  }
+
+  // if we got any leftovers -> add crc and append to file
+  if (p_curr > buffer) {
+    crc = flush_extent_buffer_with_crc(p_handle, (const char*)buffer, (const char*)p_curr, crc);
+  }
+
+  {
+    // trailer, immediately followed by its own crc (independent of the
+    // running extents crc above)
+    allocator_image_trailer trailer(timestamp, s_format_version, s_serial, extent_count, allocation_size);
+    bufferlist trailer_bl;
+    encode(trailer, trailer_bl);
+    uint32_t trailer_crc = -1;
+    trailer_crc = trailer_bl.crc32c(trailer_crc);
+    encode(trailer_crc, trailer_bl);
+    p_handle->append(trailer_bl);
+  }
+
+  // flush, cut the file at the current write position, then flush again
+  bluefs->fsync(p_handle);
+  bluefs->truncate(p_handle, p_handle->pos);
+  bluefs->fsync(p_handle);
+
+  utime_t duration = ceph_clock_now() - start_time;
+  dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
+  dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
+
+  bluefs->close_writer(p_handle);
+  need_to_destage_allocation_file = false;
+  return 0;
+}
+
+//-----------------------------------------------------------------------------------
+// Create a "bitmap" allocator (named "recovery") spanning bdev_size bytes with
+// min_alloc_size as its allocation unit.
+// Returns the new allocator, or nullptr (after logging) when creation failed.
+Allocator* BlueStore::create_bitmap_allocator(uint64_t bdev_size) {
+  uint64_t unit_size = min_alloc_size;
+  Allocator* bitmap_alloc = Allocator::create(cct, "bitmap", bdev_size, unit_size,
+                                              zone_size, first_sequential_zone,
+                                              "recovery");
+  if (!bitmap_alloc) {
+    derr << "Failed Allocator Creation" << dendl;
+    return nullptr;
+  }
+  return bitmap_alloc;
+}
+
+//-----------------------------------------------------------------------------------
+// Returns the number of bytes taken by an encoded allocator_image_header plus
+// the 32-bit crc appended after it. The size is measured by encoding a
+// throw-away header built from the current time and the static format/serial.
+size_t calc_allocator_image_header_size()
+{
+  bufferlist             encoded;
+  utime_t                now = ceph_clock_now();
+  allocator_image_header sample(now, s_format_version, s_serial);
+  encode(sample, encoded);
+
+  // crc seed is all-ones, as everywhere else in the image format
+  uint32_t checksum = encoded.crc32c(static_cast<uint32_t>(-1));
+  encode(checksum, encoded);
+
+  return encoded.length();
+}
+
+//-----------------------------------------------------------------------------------
+// Returns the number of bytes taken by an encoded allocator_image_trailer plus
+// the 32-bit crc appended after it. The size is measured by encoding a
+// throw-away trailer whose counters are set to all-ones placeholders.
+int calc_allocator_image_trailer_size()
+{
+  utime_t  now                = ceph_clock_now();
+  uint64_t placeholder_count  = -1;
+  uint64_t placeholder_size   = -1;
+  allocator_image_trailer sample(now, s_format_version, s_serial, placeholder_count, placeholder_size);
+
+  bufferlist encoded;
+  encode(sample, encoded);
+
+  // crc seed is all-ones, matching the header encoding
+  uint32_t checksum = encoded.crc32c(static_cast<uint32_t>(-1));
+  encode(checksum, encoded);
+  return encoded.length();
+}
+
+//-----------------------------------------------------------------------------------
+// Load the allocation image kept in the bluefs file allocator_dir/allocator_file
+// and feed every stored extent to `allocator` via init_add_free().
+//
+// Layout as parsed here:
+//   [encoded header][crc32c]
+//   zero or more chunks of up to sizeof(buffer) bytes of raw extent_t,
+//   each chunk followed by a crc32c
+//   [encoded trailer][crc32c]
+//
+// allocator - receives the extents; extents are added before the chunk crc and
+//             the trailer are verified, so it may hold a partial set on failure
+// num       - out: number of extents loaded (written on success only)
+// bytes     - out: sum of the loaded extent lengths (written on success only)
+// returns 0 on success, -1 on any failure
+//
+// Side effect: s_serial is set to header.serial + 1 once the header passes its checks.
+// NOTE(review): the decode() calls below are not wrapped in try/catch in this
+// function - confirm how a decode exception is expected to be handled.
+int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes)
+{
+  // debug hook: fail randomly with the configured probability
+  if (cct->_conf->bluestore_debug_inject_allocation_from_file_failure > 0) {
+    boost::mt11213b rng(time(NULL));
+    boost::uniform_real<> ur(0, 1);
+    if (ur(rng) < cct->_conf->bluestore_debug_inject_allocation_from_file_failure) {
+      derr << __func__ << " failure injected." << dendl;
+      return -1;
+    }
+  }
+  utime_t start_time = ceph_clock_now();
+  BlueFS::FileReader *p_temp_handle = nullptr;
+  int ret = bluefs->open_for_read(allocator_dir, allocator_file, &p_temp_handle, false);
+  if (ret != 0) {
+    dout(1) << "Failed open_for_read with error-code " << ret << dendl;
+    return -1;
+  }
+  unique_ptr<BlueFS::FileReader> p_handle(p_temp_handle);
+  uint64_t read_alloc_size = 0;
+  uint64_t file_size = p_handle->file->fnode.size;
+  dout(5) << "file_size=" << file_size << ",sizeof(extent_t)=" << sizeof(extent_t) << dendl;
+
+  // make sure we were able to store a valid copy
+  if (file_size == 0) {
+    dout(1) << "No Valid allocation info on disk (empty file)" << dendl;
+    return -1;
+  }
+
+  // first read the header
+  size_t                 offset = 0;
+  allocator_image_header header;
+  int                    header_size = calc_allocator_image_header_size();
+  {
+    bufferlist header_bl,temp_bl;
+    int        read_bytes = bluefs->read(p_handle.get(), offset, header_size, &temp_bl, nullptr);
+    if (read_bytes != header_size) {
+      derr << "Failed bluefs->read() for header::read_bytes=" << read_bytes << ", req_bytes=" << header_size << dendl;
+      return -1;
+    }
+
+    offset += read_bytes;
+
+    header_bl.claim_append(temp_bl);
+    auto p = header_bl.cbegin();
+    decode(header, p);
+    if (header.verify(cct, path) != 0 ) {
+      derr << "header = \n" << header << dendl;
+      return -1;
+    }
+
+    uint32_t crc_calc = -1, crc;
+    crc_calc = header_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
+    decode(crc, p);
+    if (crc != crc_calc) {
+      derr << "crc mismatch!!! crc=" << crc << ", crc_calc=" << crc_calc << dendl;
+      derr << "header = \n" << header << dendl;
+      return -1;
+    }
+
+    // increment version for next store
+    s_serial = header.serial + 1;
+  }
+
+  // then read the payload (extents list) using a recycled buffer
+  extent_t        buffer[MAX_EXTENTS_IN_BUFFER]; // 64KB
+  uint32_t        crc                = -1;
+  int             trailer_size       = calc_allocator_image_trailer_size();
+  uint64_t        extent_count       = 0;
+
+  // Guard the unsigned subtraction below: a file without any extent chunk
+  // (header + trailer only) or a truncated file must not wrap around.
+  const uint64_t fixed_overhead = static_cast<uint64_t>(header_size) + static_cast<uint64_t>(trailer_size);
+  if (file_size < fixed_overhead) {
+    derr << "file too small for header and trailer::file_size=" << file_size
+	 << ", header_size=" << header_size << ", trailer_size=" << trailer_size << dendl;
+    return -1;
+  }
+  const uint64_t payload_bytes = file_size - fixed_overhead;
+  if (payload_bytes > 0 && payload_bytes < sizeof(crc)) {
+    derr << "payload too small to hold a crc::payload_bytes=" << payload_bytes << dendl;
+    return -1;
+  }
+  // The crc that follows the LAST chunk is excluded from the count up front;
+  // the crc of every earlier chunk is subtracted inside the loop.
+  uint64_t        extents_bytes_left = (payload_bytes > 0) ? payload_bytes - sizeof(crc) : 0;
+  while (extents_bytes_left) {
+    int req_bytes  = std::min(extents_bytes_left, static_cast<uint64_t>(sizeof(buffer)));
+    int read_bytes = bluefs->read(p_handle.get(), offset, req_bytes, nullptr, (char*)buffer);
+    if (read_bytes != req_bytes) {
+      derr << "Failed bluefs->read()::read_bytes=" << read_bytes << ", req_bytes=" << req_bytes << dendl;
+      return -1;
+    }
+
+    offset             += read_bytes;
+    extents_bytes_left -= read_bytes;
+
+    const unsigned  num_extent_in_buffer = read_bytes/sizeof(extent_t);
+    const extent_t *p_end                = buffer + num_extent_in_buffer;
+    for (const extent_t *p_ext = buffer; p_ext < p_end; p_ext++) {
+      // named ext_* so they do not shadow the file offset above
+      uint64_t ext_offset = CEPHTOH_64(p_ext->offset);
+      uint64_t ext_length = CEPHTOH_64(p_ext->length);
+      read_alloc_size += ext_length;
+
+      if (ext_length > 0) {
+	allocator->init_add_free(ext_offset, ext_length);
+	extent_count ++;
+      } else {
+	derr << "extent with zero length at idx=" << extent_count << dendl;
+	return -1;
+      }
+    }
+
+    // the crc is chained: each chunk is seeded with the crc stored after the previous chunk
+    uint32_t calc_crc = ceph_crc32c(crc, (const uint8_t*)buffer, read_bytes);
+    read_bytes        = bluefs->read(p_handle.get(), offset, sizeof(crc), nullptr, (char*)&crc);
+    if (read_bytes == sizeof(crc) ) {
+      crc     = CEPHTOH_32(crc);
+      if (crc != calc_crc) {
+	derr << "data crc mismatch!!! crc=" << crc << ", calc_crc=" << calc_crc << dendl;
+	derr << "extents_bytes_left=" << extents_bytes_left << ", offset=" << offset << ", extent_count=" << extent_count << dendl;
+	return -1;
+      }
+
+      offset += read_bytes;
+      // the last chunk's crc was already excluded when extents_bytes_left was computed
+      if (extents_bytes_left) {
+	extents_bytes_left -= read_bytes;
+      }
+    } else {
+      derr << "Failed bluefs->read() for crc::read_bytes=" << read_bytes << ", req_bytes=" << sizeof(crc) << dendl;
+      return -1;
+    }
+
+  }
+
+  // finally, read the trailer and verify it is in good shape and that we got all the extents
+  {
+    bufferlist trailer_bl,temp_bl;
+    int        read_bytes = bluefs->read(p_handle.get(), offset, trailer_size, &temp_bl, nullptr);
+    if (read_bytes != trailer_size) {
+      derr << "Failed bluefs->read() for trailer::read_bytes=" << read_bytes << ", req_bytes=" << trailer_size << dendl;
+      return -1;
+    }
+    offset += read_bytes;
+
+    trailer_bl.claim_append(temp_bl);
+    uint32_t crc_calc = -1;
+    uint32_t crc;
+    allocator_image_trailer trailer;
+    auto p = trailer_bl.cbegin();
+    decode(trailer, p);
+    if (trailer.verify(cct, path, &header, extent_count, read_alloc_size) != 0 ) {
+      derr << "trailer=\n" << trailer << dendl;
+      return -1;
+    }
+
+    crc_calc = trailer_bl.cbegin().crc32c(p.get_off(), crc_calc); //crc from begin to current pos
+    decode(crc, p);
+    if (crc != crc_calc) {
+      derr << "trailer crc mismatch!::crc=" << crc << ", crc_calc=" << crc_calc << dendl;
+      derr << "trailer=\n" << trailer << dendl;
+      return -1;
+    }
+  }
+
+  utime_t duration = ceph_clock_now() - start_time;
+  dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size=  "
+	    << read_alloc_size << ", file_size=" << file_size << dendl;
+  dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
+  *num   = extent_count;
+  *bytes = read_alloc_size;
+  return 0;
+}
+
+//-----------------------------------------------------------------------------------
+// Restore the allocation image from the allocator file into dest_allocator.
+// The image is first loaded into a temporary bitmap allocator and only copied
+// into dest_allocator once __restore_allocator() reports success.
+//
+// dest_allocator - allocator that receives the restored free extents
+// num, bytes     - out parameters filled by __restore_allocator() on success
+// returns 0 on success, non-zero on failure (dest_allocator is left untouched)
+int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint64_t *bytes)
+{
+  utime_t    start = ceph_clock_now();
+  auto temp_allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+  // create_bitmap_allocator() returns nullptr on failure; __restore_allocator()
+  // would dereference it, so bail out here
+  if (!temp_allocator) {
+    derr << "failed create_bitmap_allocator()" << dendl;
+    return -1;
+  }
+  int ret = __restore_allocator(temp_allocator.get(), num, bytes);
+  if (ret != 0) {
+    return ret;
+  }
+
+  uint64_t num_entries = 0;
+  dout(5) << " calling copy_allocator(bitmap_allocator -> shared_alloc.a)" << dendl;
+  copy_allocator(temp_allocator.get(), dest_allocator, &num_entries);
+  utime_t duration = ceph_clock_now() - start;
+  dout(5) << "restored in " << duration << " seconds, num_entries=" << num_entries << dendl;
+  return ret;
+}
+
+//-----------------------------------------------------------------------------------
+// Mark the byte range [offset, offset + length) as allocated in sbmap.
+// Both values must be multiples of min_alloc_size (asserted); the bitmap is
+// indexed in min_alloc_size units, hence the shift by min_alloc_size_order.
+void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
+{
+  dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length << " " << min_alloc_size_mask << dendl;
+
+  ceph_assert((offset & min_alloc_size_mask) == 0);
+  ceph_assert((length & min_alloc_size_mask) == 0);
+
+  const auto first_unit = offset >> min_alloc_size_order;
+  const auto unit_count = length >> min_alloc_size_order;
+  sbmap->set(first_unit, unit_count);
+}
+
+// Register a newly decoded blob and account for the space it uses.
+//
+// spanning  - true: store b in spanning_blobs under b->id;
+//             false: store b in blobs under extent_no
+// extent_no - key used for non-spanning blobs (ignored when spanning)
+// sbid      - shared-blob id, looked up in sb_info when the blob is shared
+// b         - the decoded blob
+//
+// Non-shared blobs have each valid physical extent marked in sbmap and added
+// to the per-pool statfs. Shared blobs were already marked in the bitmap when
+// the shared-blob records were scanned, so only the statfs is updated here,
+// once per shared blob (the first time its pool id is set).
+void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
+                                                        uint64_t extent_no,
+                                                        uint64_t sbid,
+                                                        BlobRef b)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  ceph_assert(per_pool_statfs);
+  ceph_assert(oid != ghobject_t());
+
+  auto &blob = b->get_blob();
+  if(spanning) {
+    dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
+    ceph_assert(b->id >= 0);
+    spanning_blobs[b->id] = b;
+    ++stats.spanning_blob_count;
+  } else {
+    dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
+    blobs[extent_no] = b;
+  }
+  bool compressed = blob.is_compressed();
+  if (!blob.is_shared()) {
+    for (auto& pe : blob.get_extents()) {
+      if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
+        ++stats.skipped_illegal_extent;
+        continue;
+      }
+      store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
+
+      per_pool_statfs->allocated() += pe.length;
+      if (compressed) {
+        per_pool_statfs->compressed_allocated() += pe.length;
+      }
+    }
+    if (compressed) {
+      per_pool_statfs->compressed() +=
+        blob.get_compressed_payload_length();
+      ++stats.compressed_blob_count;
+    }
+  } else {
+    auto it = sb_info.find(sbid);
+    if (it == sb_info.end()) {
+      // never dereference end(): log and skip the accounting that needs the record
+      derr << __func__ << " shared blob not found:" << sbid
+           << dendl;
+    } else {
+      auto &sbi = *it;
+      auto pool_id = oid.hobj.get_logical_pool();
+      if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
+        // first reference to this shared blob: charge its allocation to this pool
+        sbi.pool_id = pool_id;
+        size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
+        per_pool_statfs->allocated() += alloc_delta;
+        if (compressed) {
+          per_pool_statfs->compressed_allocated() += alloc_delta;
+          ++stats.compressed_blob_count;
+        }
+      }
+    }
+    if (compressed) {
+      per_pool_statfs->compressed() +=
+        blob.get_compressed_payload_length();
+    }
+  }
+}
+
+// Account for a logical extent that refers to an already registered blob.
+// blobid is looked up in spanning_blobs or blobs (chosen by `spanning`) and
+// must exist (asserted); le->length is added to the per-pool "stored" counter
+// and, for compressed blobs, to "compressed_original" as well.
+void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
+                                                     bool spanning,
+                                                     uint64_t blobid)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+
+  auto &map = spanning ? spanning_blobs : blobs;
+  auto it = map.find(blobid);
+  ceph_assert(it != map.end());
+
+  const auto extent_len = le->length;
+  per_pool_statfs->stored() += extent_len;
+  if (!it->second->get_blob().is_compressed()) {
+    return;
+  }
+  per_pool_statfs->compressed_original() += extent_len;
+}
+
+// Handle a logical extent that carries a new (non-spanning) blob: register the
+// blob under extent_no, then add le->length to the per-pool "stored" counter
+// and, for compressed blobs, to "compressed_original".
+void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
+                                                   uint64_t extent_no,
+                                                   uint64_t sbid,
+                                                   BlobRef b)
+{
+  constexpr bool is_spanning = false;
+  _consume_new_blob(is_spanning, extent_no, sbid, b);
+
+  const auto extent_len = le->length;
+  per_pool_statfs->stored() += extent_len;
+  if (!b->get_blob().is_compressed()) {
+    return;
+  }
+  per_pool_statfs->compressed_original() += extent_len;
+}
+
+// Register a spanning blob; it is keyed by b->id, so the extent number passed
+// to _consume_new_blob() is a don't-care value.
+void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
+                                                            BlobRef b)
+{
+  constexpr bool     is_spanning      = true;
+  constexpr uint64_t unused_extent_no = 0; // doesn't matter for spanning blobs
+  _consume_new_blob(is_spanning, unused_extent_no, sbid, b);
+}
+
+// Prepare the decoder for a new object: remember its id and the statfs record
+// to update, and drop every blob collected for the previous object.
+void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
+                                            volatile_statfs* _per_pool_statfs)
+{
+  per_pool_statfs = _per_pool_statfs;
+  oid = _oid;
+
+  // swap with fresh maps so the old contents are destroyed with the temporaries
+  blob_map_t old_blobs;
+  std::swap(blobs, old_blobs);
+  blob_map_t old_spanning_blobs;
+  std::swap(spanning_blobs, old_spanning_blobs);
+}
+
+// Rebuild the allocation bitmap and the statfs counters by scanning the DB.
+//
+// Pass 1 walks every PREFIX_SHARED_BLOB record, marks each ref_map extent as
+// allocated in sbmap and records the allocated chunk count per shared blob in
+// sb_info.
+// Pass 2 walks every PREFIX_OBJ record; onode keys and extent-shard keys are
+// decoded through an ExtentDecoderPartial, which marks non-shared blob extents
+// in sbmap and fills stats.actual_pool_vstatfs.
+// Finally, under vstatfs_lock, the collected per-pool stats are summed into
+// vstatfs (and copied into osd_pools when per_pool_stat_collection is set).
+//
+// sbmap - bitmap receiving the allocated ranges
+// stats - counters updated while scanning
+// returns 0 on success, -ENOENT if a DB iterator cannot be obtained,
+//         -EIO if an onode or extent-shard key cannot be decoded
+int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
+{
+  sb_info_space_efficient_map_t sb_info;
+  // iterate over all shared blobs
+  auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
+  if (!it) {
+    derr << "failed getting shared blob's iterator" << dendl;
+    return -ENOENT;
+  }
+  // always true at this point (the null case returned above)
+  if (it) {
+    for (it->lower_bound(string()); it->valid(); it->next()) {
+      const auto& key = it->key();
+      dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
+      uint64_t sbid = 0;
+      // NOTE(review): a bad key is only logged; processing continues with
+      // whatever sbid holds, unlike the decode failure below which skips the
+      // record - confirm this is intended.
+      if (get_key_shared_blob(key, &sbid) != 0) {
+	derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
+	     << "'" << dendl;
+      }
+      bluestore_shared_blob_t shared_blob(sbid);
+      bufferlist bl = it->value();
+      auto blp = bl.cbegin();
+      try {
+        decode(shared_blob, blp);
+      }
+      catch (ceph::buffer::error& e) {
+	derr << __func__ << " failed to decode Shared Blob"
+	     << pretty_binary_string(key) << dendl;
+	continue;
+      }
+      dout(20) << __func__ << "  " << shared_blob << dendl;
+      // mark every referenced extent as allocated and sum up their lengths
+      uint64_t allocated = 0;
+      for (auto& r : shared_blob.ref_map.ref_map) {
+        ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
+        set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
+        allocated += r.second.length;
+      }
+      auto &sbi = sb_info.add_or_adopt(sbid);
+      // the total must be a whole number of min_alloc_size units
+      ceph_assert(p2phase(allocated, min_alloc_size) == 0);
+      sbi.allocated_chunks += (allocated >> min_alloc_size_order);
+      ++stats.shared_blob_count;
+    }
+  }
+
+  it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  if (!it) {
+    derr << "failed getting onode's iterator" << dendl;
+    return -ENOENT;
+  }
+
+  uint64_t            kv_count       = 0;
+  uint64_t            count_interval = 1'000'000;
+  ExtentDecoderPartial edecoder(*this,
+                                stats,
+                                *sbmap,
+                                sb_info,
+                                min_alloc_size_order);
+
+  // iterate over all ONodes stored in RocksDB
+  for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
+    // trace an event after every million processed objects (typically every 5-10 seconds)
+    if (kv_count && (kv_count % count_interval == 0) ) {
+      dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+    }
+
+    auto key = it->key();
+    auto okey = key;
+    dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+    ghobject_t oid;
+    if (!is_extent_shard_key(it->key())) {
+      // plain onode key: start a new object in the decoder
+      int r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode key = "
+             << pretty_binary_string(okey) << dendl;
+        return -EIO;
+      }
+      edecoder.reset(oid,
+        &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+      Onode dummy_on(cct);
+      Onode::decode_raw(&dummy_on,
+        it->value(),
+        edecoder);
+      ++stats.onode_count;
+    } else {
+      // extent-shard key: recover the owning onode key and decode the shard
+      uint32_t offset;
+      int r = get_key_extent_shard(key, &okey, &offset);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode extent key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__
+             << " failed to decode onode key= " << pretty_binary_string(okey)
+             << " from extent key= " << pretty_binary_string(key)
+             << dendl;
+        return -EIO;
+      }
+      // a shard must belong to the onode the decoder was last reset for
+      ceph_assert(oid == edecoder.get_oid());
+      edecoder.decode_some(it->value(), nullptr);
+      ++stats.shard_count;
+    }
+  }
+
+  // publish the recovered statistics
+  std::lock_guard l(vstatfs_lock);
+  store_statfs_t s;
+  osd_pools.clear();
+  for (auto& p : stats.actual_pool_vstatfs) {
+    if (per_pool_stat_collection) {
+      osd_pools[p.first] = p.second;
+    }
+    stats.actual_store_vstatfs += p.second;
+    p.second.publish(&s);
+    dout(5) << __func__ << " recovered pool "
+            << std::hex
+            << p.first << "->" << s
+            << std::dec
+            << " per-pool:" << per_pool_stat_collection
+            << dendl;
+  }
+  vstatfs = stats.actual_store_vstatfs;
+  vstatfs.publish(&s);
+  dout(5) << __func__ << " recovered " << s
+          << dendl;
+  return 0;
+}
+
+//---------------------------------------------------------
+// Fill sbmap with every allocated range: first the superblock reservation at
+// offset 0, then everything read_allocation_from_onodes() finds.
+// Returns 0 on success or the negative error from read_allocation_from_onodes().
+int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
+{
+  // the superblock area is at least one allocation unit long
+  const uint64_t reserved_len = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
+  set_allocation_in_simple_bmap(sbmap, 0, reserved_len);
+  stats.extent_count++;
+
+  // space taken by objects
+  const int r = read_allocation_from_onodes(sbmap, stats);
+  if (r >= 0) {
+    return 0;
+  }
+
+  derr << "failed read_allocation_from_onodes()" << dendl;
+  return r;
+}
+
+//-----------------------------------------------------------------------------------
+// Add every clear (unset) run of sbmap to dest_alloc as free space.
+// The bitmap is indexed in alloc_size units, so offsets and lengths are shifted
+// left by countr_zero(alloc_size) to convert them back to bytes.
+static void copy_simple_bitmap_to_allocator(SimpleBitmap* sbmap, Allocator* dest_alloc, uint64_t alloc_size)
+{
+  const int unit_shift = std::countr_zero(alloc_size);
+  uint64_t  search_from = 0;
+  for (extent_t clr = sbmap->get_next_clr_extent(search_from);
+       clr.length != 0;
+       clr = sbmap->get_next_clr_extent(search_from)) {
+    dest_alloc->init_add_free(clr.offset << unit_shift, clr.length << unit_shift);
+    // continue right after the run just handled
+    search_from = clr.offset + clr.length;
+  }
+}
+
+//---------------------------------------------------------
+// Recover the allocation map at startup by scanning the stored metadata:
+// open the collections, rebuild a SimpleBitmap of allocated space and hand the
+// remaining free runs to `alloc`. The cache is shut down again on every exit
+// path taken after the collections were opened.
+// Returns 0 on success or a negative error code.
+int BlueStore::read_allocation_from_drive_on_startup()
+{
+  int ret = _open_collections();
+  if (ret < 0) {
+    return ret;
+  }
+  auto shutdown_cache = make_scope_guard([&] {
+    _shutdown_cache();
+  });
+
+  utime_t            began = ceph_clock_now();
+  read_alloc_stats_t stats = {};
+  // one bit per min_alloc_size unit of the device
+  SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
+  ret = reconstruct_allocations(&sbmap, stats);
+  if (ret != 0) {
+    return ret;
+  }
+
+  copy_simple_bitmap_to_allocator(&sbmap, alloc, min_alloc_size);
+
+  utime_t elapsed = ceph_clock_now() - began;
+  dout(1) << "::Allocation Recovery was completed in " << elapsed << " seconds, extent_count=" << stats.extent_count << dendl;
+  return ret;
+}
+
+
+
+
+// Only used for debugging purposes - we build a secondary allocator from the Onodes and compare it to the existing one
+// Not meant to be run by customers
+#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+#include <stdlib.h>
+#include <algorithm>
+//---------------------------------------------------------
+// qsort() comparator ordering extent_t records by ascending offset.
+// Returns 1, -1 or 0 when a's offset is greater than, less than or equal to b's.
+int cmpfunc (const void * a, const void * b)
+{
+  const auto lhs = ((extent_t*)a)->offset;
+  const auto rhs = ((extent_t*)b)->offset;
+  if (lhs == rhs) {
+    return 0;
+  }
+  return (lhs > rhs) ? 1 : -1;
+}
+
+// compare the allocator built from Onodes with the system allocator (CF-B)
+//---------------------------------------------------------
+// Compare the free extents reported by two allocators.
+// Each allocator is dumped into an array, both arrays are sorted by offset and
+// then compared. The arrays hold at most min(req_extent_count,
+// memory_target / 3 / sizeof(extent_t)) entries; extents beyond that are not
+// stored (a "spillover" message is logged once per allocator).
+// Returns 0 when the stored extents match, -1 on mismatch or allocation failure.
+int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target)
+{
+  uint64_t allocation_size = std::min((req_extent_count) * sizeof(extent_t), memory_target / 3);
+  uint64_t extent_count    = allocation_size/sizeof(extent_t);
+  dout(5) << "req_extent_count=" << req_extent_count << ", granted extent_count="<< extent_count << dendl;
+
+  unique_ptr<extent_t[]> arr1, arr2;
+  try {
+    arr1 = make_unique<extent_t[]>(extent_count);
+    arr2 = make_unique<extent_t[]>(extent_count);
+  } catch (std::bad_alloc&) {
+    derr << "****Failed dynamic allocation, extent_count=" << extent_count << dendl;
+    return -1;
+  }
+
+  // copy the extents from the allocators into simple arrays and then compare them
+  uint64_t size1 = 0, idx1 = 0;
+  uint64_t size2 = 0, idx2 = 0;
+
+  auto iterated_mapper1 = [&](uint64_t offset, uint64_t length) {
+    size1 += length;
+    if (idx1 > extent_count) {
+      return; // spillover was already reported
+    }
+    if (idx1 == extent_count) {
+      derr << "(2)compare_allocators:: spillover"  << dendl;
+      idx1 ++;
+      return;
+    }
+    arr1[idx1++] = {offset, length};
+  };
+
+  auto iterated_mapper2 = [&](uint64_t offset, uint64_t length) {
+    size2 += length;
+    if (idx2 > extent_count) {
+      return; // spillover was already reported
+    }
+    if (idx2 == extent_count) {
+      derr << "(2)compare_allocators:: spillover"  << dendl;
+      idx2 ++;
+      return;
+    }
+    arr2[idx2++] = {offset, length};
+  };
+
+  alloc1->foreach(iterated_mapper1);
+  alloc2->foreach(iterated_mapper2);
+
+  // number of entries actually stored in each array
+  const uint64_t stored1 = std::min(idx1, extent_count);
+  const uint64_t stored2 = std::min(idx2, extent_count);
+  qsort(arr1.get(), stored1, sizeof(extent_t), cmpfunc);
+  qsort(arr2.get(), stored2, sizeof(extent_t), cmpfunc);
+
+  if (idx1 != idx2) {
+    derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
+    return -1;
+  }
+
+  // equal counters imply stored1 == stored2
+  if (memcmp(arr1.get(), arr2.get(), sizeof(extent_t) * stored2) == 0) {
+    return 0;
+  }
+  derr << "Failed memcmp(arr1, arr2, sizeof(extent_t)*idx2)"  << dendl;
+  // report the first differing entry
+  for (uint64_t i = 0; i < stored1; i++) {
+    if (memcmp(arr1.get()+i, arr2.get()+i, sizeof(extent_t)) == 0) {
+      continue;
+    }
+    derr << "!!!![" << i << "] arr1::<" << arr1[i].offset << "," << arr1[i].length << ">" << dendl;
+    derr << "!!!![" << i << "] arr2::<" << arr2[i].offset << "," << arr2[i].length << ">" << dendl;
+    return -1;
+  }
+  return 0;
+}
+
+//---------------------------------------------------------
+// Remove the extents that bluefs occupies on the shared block device from the
+// free space of `allocator` (via init_rm_free) and count them in
+// stats.extent_count. Does nothing besides logging when bluefs is not set.
+// Always returns 0.
+int BlueStore::add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t &stats)
+{
+  // then add space used by bluefs to store rocksdb
+  unsigned extent_count = 0;
+  if (bluefs) {
+    bluefs->foreach_block_extents(
+      bluefs_layout.shared_bdev,
+      [&](uint64_t start, uint32_t len) {
+        allocator->init_rm_free(start, len);
+        stats.extent_count++;
+        // keep the local counter in step so the log line below is meaningful
+        extent_count++;
+      }
+    );
+  }
+
+  dout(5) << "bluefs extent_count=" << extent_count << dendl;
+  return 0;
+}
+
+//---------------------------------------------------------
+// Debug helper: rebuild an allocator from the stored metadata (onodes, shared
+// blobs and bluefs extents) and compare it with the active allocator `alloc`.
+// Opens the DB and the collections itself; once both are open a scope guard
+// shuts the cache down and closes the DB again on every exit path.
+// Returns 0 when both allocators match, a negative value otherwise.
+int BlueStore::read_allocation_from_drive_for_bluestore_tool()
+{
+  dout(5) << __func__ << dendl;
+  int ret = 0;
+  uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+  ret = _open_db_and_around(true, false);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = _open_collections();
+  if (ret < 0) {
+    _close_db_and_around();
+    return ret;
+  }
+
+  utime_t            duration;
+  read_alloc_stats_t stats = {};
+  utime_t            start = ceph_clock_now();
+
+  // runs on every return below; `duration` keeps its initial value when we
+  // leave before the recovery finished
+  auto shutdown_cache = make_scope_guard([&] {
+    dout(1) << "Allocation Recovery was completed in " << duration
+	    << " seconds; insert_count=" << stats.insert_count
+	    << "; extent_count=" << stats.extent_count << dendl;
+    _shutdown_cache();
+    _close_db_and_around();
+  });
+
+  {
+    auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
+    // create_bitmap_allocator() returns nullptr on failure; everything below
+    // dereferences the allocator
+    if (!allocator) {
+      derr << "****failed create_bitmap_allocator()" << dendl;
+      return -1;
+    }
+    //reconstruct allocations into a temp simple-bitmap and copy into allocator
+    {
+      SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
+      ret = reconstruct_allocations(&sbmap, stats);
+      if (ret != 0) {
+	return ret;
+      }
+      copy_simple_bitmap_to_allocator(&sbmap, allocator.get(), min_alloc_size);
+    }
+
+    // add allocation space used by the bluefs itself
+    ret = add_existing_bluefs_allocation(allocator.get(), stats);
+    if (ret < 0) {
+      return ret;
+    }
+
+    duration = ceph_clock_now() - start;
+    // count the free extents of the rebuilt allocator to size the comparison buffers
+    stats.insert_count = 0;
+    auto count_entries = [&](uint64_t extent_offset, uint64_t extent_length) {
+      stats.insert_count++;
+    };
+    allocator->foreach(count_entries);
+    ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
+    if (ret == 0) {
+      dout(5) << "Allocator drive - file integrity check OK" << dendl;
+    } else {
+      derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
+    }
+  }
+
+  dout(1) << stats << dendl;
+  return ret;
+}
+
+//---------------------------------------------------------
+// Build a new bitmap allocator holding the free space of src_allocator plus
+// the extents bluefs occupies on the shared block device (added as free).
+// Returns the new allocator, or nullptr when it could not be created.
+Allocator* BlueStore::clone_allocator_without_bluefs(Allocator *src_allocator)
+{
+  Allocator* clone = create_bitmap_allocator(bdev->get_size());
+  if (clone == nullptr) {
+    derr << "****failed create_bitmap_allocator()" << dendl;
+    return nullptr;
+  }
+  dout(5) << "bitmap-allocator=" << clone << dendl;
+
+  uint64_t copied_entries = 0;
+  copy_allocator(src_allocator, clone, &copied_entries);
+
+  // BlueFS stores its internal allocation outside RocksDB (FM), so it should
+  // not be destaged to the allocator-file; hide the bluefs allocation during
+  // allocator-destage as it is stored elsewhere
+  auto mark_free = [&] (uint64_t start, uint32_t len) {
+    clone->init_add_free(start, len);
+  };
+  bluefs->foreach_block_extents(bluefs_layout.shared_bdev, mark_free);
+
+  return clone;
+}
+
+//---------------------------------------------------------
+// Delete every key under PREFIX_ALLOC_BITMAP in a single synchronous transaction.
+// NOTE(review): cct and path are not referenced directly in the body; presumably
+// the logging macros use them - confirm.
+static void clear_allocation_objects_from_rocksdb(KeyValueDB *db, CephContext *cct, const std::string &path)
+{
+  dout(5) << "t->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP)" << dendl;
+
+  KeyValueDB::Transaction txn = db->get_transaction();
+  txn->rmkeys_by_prefix(PREFIX_ALLOC_BITMAP);
+  db->submit_transaction_sync(txn);
+}
+
+//---------------------------------------------------------
+// Release every free extent of `allocator` into real_fm, submitting the DB
+// transaction synchronously after each batch of 1024 extents and once more for
+// a trailing partial batch.
+void BlueStore::copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm)
+{
+  unsigned max_txn = 1024;
+  dout(5) << "max_transaction_submit=" << max_txn << dendl;
+
+  uint64_t idx  = 0; // extents released so far
+  uint64_t size = 0; // bytes released so far
+  KeyValueDB::Transaction txn = db->get_transaction();
+
+  auto iterated_insert = [&](uint64_t offset, uint64_t length) {
+    size += length;
+    real_fm->release(offset, length, txn);
+    ++idx;
+    const bool batch_full = (idx % max_txn) == 0;
+    if (batch_full) {
+      db->submit_transaction_sync(txn);
+      txn = db->get_transaction();
+    }
+  };
+  allocator->foreach(iterated_insert);
+
+  // flush whatever is left from the last, partially filled batch
+  const bool has_pending = (idx % max_txn) != 0;
+  if (has_pending) {
+    db->submit_transaction_sync(txn);
+  }
+  dout(5) << "size=" << size << ", num extents=" << idx  << dendl;
+}
+
+//---------------------------------------------------------
+// Create a bitmap allocator and fill it with every extent enumerated by
+// real_fm. The enumeration is reset before and after the walk.
+// Returns the new allocator, or nullptr when it could not be created.
+Allocator* BlueStore::initialize_allocator_from_freelist(FreelistManager *real_fm)
+{
+  dout(5) << "real_fm->enumerate_next" << dendl;
+  Allocator* allocator2 = create_bitmap_allocator(bdev->get_size());
+  if (allocator2 == nullptr) {
+    return nullptr;
+  }
+  dout(5) << "bitmap-allocator=" << allocator2 << dendl;
+
+  uint64_t idx2  = 0; // number of extents added
+  uint64_t size2 = 0; // total bytes added
+  uint64_t offset, length;
+  for (real_fm->enumerate_reset(); real_fm->enumerate_next(db, &offset, &length); ++idx2) {
+    allocator2->init_add_free(offset, length);
+    size2 += length;
+  }
+  real_fm->enumerate_reset();
+
+  dout(5) << "size2=" << size2 << ", num2=" << idx2 << dendl;
+  return allocator2;
+}
+
+//---------------------------------------------------------
+// Close the active fm and reopen it in a new mode, like makefs() does,
+// but mark the full device space as allocated.
+// Later we will mark all extents from the allocator as free.
+// Shut down and delete the current FreelistManager, switch freelist_type to
+// "bitmap" and reopen the fm through _open_fm() in restore mode.
+// Returns 0 on success, -1 when _open_fm() left fm unset.
+int BlueStore::reset_fm_for_restore()
+{
+  dout(5) << "<<==>> fm->clear_null_manager()" << dendl;
+
+  // drop the current manager
+  fm->shutdown();
+  delete fm;
+  fm = nullptr;
+
+  freelist_type = "bitmap";
+  KeyValueDB::Transaction txn = db->get_transaction();
+  // call _open_fm() with fm_restore set to TRUE
+  // this will mark the full device space as allocated (and not just the reserved space)
+  _open_fm(txn, true, true, true);
+  if (!fm) {
+    derr << "Failed _open_fm()" << dendl;
+    return -1;
+  }
+
+  db->submit_transaction_sync(txn);
+  ceph_assert(!fm->is_null_manager());
+  dout(5) << "fm was reactivated in full mode" << dendl;
+  return 0;
+}
+
+
+//---------------------------------------------------------
+// create a temp allocator filled with allocation state from the fm
+// and compare it to the base allocator passed in
+// Build a temporary allocator from the freelist manager `fm` and compare it
+// with `allocator`.
+// Returns 0 when both hold the same extents, -1 otherwise (including failure
+// to create the temporary allocator).
+int BlueStore::verify_rocksdb_allocations(Allocator *allocator)
+{
+  dout(5) << "verify that alloc content is identical to FM" << dendl;
+  // initialize from freelist
+  unique_ptr<Allocator> temp_allocator(initialize_allocator_from_freelist(fm));
+  if (!temp_allocator) {
+    return -1;
+  }
+
+  // count the extents first so compare_allocators() can size its buffers
+  uint64_t insert_count = 0;
+  temp_allocator->foreach([&](uint64_t extent_offset, uint64_t extent_length) {
+    insert_count++;
+  });
+  uint64_t memory_target = cct->_conf.get_val<Option::size_t>("osd_memory_target");
+  const int ret = compare_allocators(allocator, temp_allocator.get(), insert_count, memory_target);
+
+  // release the temporary allocator before reporting
+  temp_allocator.reset();
+
+  if (ret != 0) {
+    derr << "**** FAILURE compare(allocator, temp_allocator)::ret=" << ret << dendl;
+    return -1;
+  }
+  dout(5) << "SUCCESS!!! compare(allocator, temp_allocator)" << dendl;
+  return 0;
+}
+
+//---------------------------------------------------------
+// Shut down the cache and close the DB with its surroundings, then hand back
+// `ret` unchanged so callers can write `return db_cleanup(code);`.
+int BlueStore::db_cleanup(int ret)
+{
+  const int passthrough = ret;
+  _shutdown_cache();
+  _close_db_and_around();
+  return passthrough;
+}
+
+//---------------------------------------------------------
+// convert back the system from null-allocator to using rocksdb to store allocation
+// Move the allocation state from the allocator file back into RocksDB:
+//   1. clone the shared allocator with the bluefs extents added as free
+//   2. wipe PREFIX_ALLOC_BITMAP and reopen the fm with the whole device allocated
+//   3. release the clone's free extents into the fm and verify the result
+//   4. commit the "bitmap" freelist type, then reopen read-only and verify again
+// Refuses to run while bluestore_allocation_from_file is set.
+// Returns 0 on success (or when the fm is not a null manager), negative on failure.
+// NOTE(review): the result of commit_to_real_manager() is not checked here - confirm.
+int BlueStore::push_allocation_to_rocksdb()
+{
+  if (cct->_conf->bluestore_allocation_from_file) {
+    derr << "cct->_conf->bluestore_allocation_from_file must be cleared first" << dendl;
+    derr << "please change default to false in ceph.conf file>" << dendl;
+    return -1;
+  }
+
+  dout(5) << "calling open_db_and_around() in read/write mode" << dendl;
+  int ret = _open_db_and_around(false);
+  if (ret < 0) {
+    return ret;
+  }
+
+  const bool already_real_fm = !fm->is_null_manager();
+  if (already_real_fm) {
+    derr << "This is not a NULL-MANAGER -> nothing to do..." << dendl;
+    return db_cleanup(0);
+  }
+
+  // start by creating a clone copy of the shared-allocator
+  unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(alloc));
+  if (!allocator) {
+    return db_cleanup(-1);
+  }
+
+  // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
+  clear_allocation_objects_from_rocksdb(db, cct, path);
+
+  // then open fm in new mode with the full device marked as allocated
+  if (reset_fm_for_restore() != 0) {
+    return db_cleanup(-1);
+  }
+
+  // push the free-space from the allocator (shared-alloc without bfs) to rocksdb
+  copy_allocator_content_to_fm(allocator.get(), fm);
+
+  // compare the allocator info with the info stored in the fm/rocksdb
+  if (verify_rocksdb_allocations(allocator.get()) != 0) {
+    return db_cleanup(-1);
+  }
+  // all is good -> we can commit to rocksdb allocator
+  commit_to_real_manager();
+
+  // can't be too paranoid :-)
+  dout(5) << "Running full scale verification..." << dendl;
+  // close db/fm/allocator and start fresh
+  db_cleanup(0);
+  dout(5) << "calling open_db_and_around() in read-only mode" << dendl;
+  ret = _open_db_and_around(true);
+  if (ret < 0) {
+    return db_cleanup(ret);
+  }
+  ceph_assert(!fm->is_null_manager());
+  ceph_assert(verify_rocksdb_allocations(allocator.get()) == 0);
+
+  return db_cleanup(ret);
+}
+
+#endif // CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+//-------------------------------------------------------------------------------------
+// Persist the current freelist_type under PREFIX_SUPER / "freelist_type" with a
+// synchronous transaction.
+// Returns 0 on success, -1 when no transaction could be obtained, otherwise the
+// error returned by submit_transaction_sync().
+int BlueStore::commit_freelist_type()
+{
+  // When freelist_type is "bitmap" we store allocation in RocksDB.
+  // When allocation-info is stored in a single file we set freelist_type to "null".
+  // This directs the startup code to read allocation from file and not RocksDB.
+  KeyValueDB::Transaction txn = db->get_transaction();
+  if (txn == nullptr) {
+    derr << "db->get_transaction() failed!!!" << dendl;
+    return -1;
+  }
+
+  bufferlist type_bl;
+  type_bl.append(freelist_type);
+  txn->set(PREFIX_SUPER, "freelist_type", type_bl);
+
+  const int ret = db->submit_transaction_sync(txn);
+  if (ret == 0) {
+    return ret;
+  }
+  derr << "Failed db->submit_transaction_sync(t)" << dendl;
+  return ret;
+}
+
+//-------------------------------------------------------------------------------------
+// Switch the FreelistManager to the null manager and persist "null" as the
+// freelist type.
+// Returns the result of commit_freelist_type().
+int BlueStore::commit_to_null_manager()
+{
+  dout(5) << __func__ << " Set FreelistManager to NULL FM..." << dendl;
+  fm->set_null_manager();
+  freelist_type = "null";
+  int ret = commit_freelist_type();
+  // Optional cleanup, currently disabled. The function returns `ret` on every
+  // path, so enabling this block cannot leave it without a return value.
+#if 0
+  // should check how long this step take on a big configuration as deletes are expensive
+  if (ret == 0) {
+    // remove all objects of PREFIX_ALLOC_BITMAP from RocksDB to guarantee a clean start
+    clear_allocation_objects_from_rocksdb(db, cct, path);
+  }
+#endif
+  return ret;
+}
+
+
+//-------------------------------------------------------------------------------------
+// Persist "bitmap" as the freelist type and, when that succeeds, invalidate and
+// unlink the allocation file on bluefs.
+// Requires a non-null FreelistManager (asserted).
+// Returns the commit error if the commit failed, otherwise the unlink result.
+int BlueStore::commit_to_real_manager()
+{
+  dout(5) << "Set FreelistManager to Real FM..." << dendl;
+  ceph_assert(!fm->is_null_manager());
+  freelist_type = "bitmap";
+
+  int ret = commit_freelist_type();
+  if (ret != 0) {
+    return ret;
+  }
+
+  //remove the allocation_file
+  invalidate_allocation_file_on_bluefs();
+  ret = bluefs->unlink(allocator_dir, allocator_file);
+  bluefs->sync_metadata(false);
+  if (ret != 0) {
+    derr << "Remove Allocation File ret_code=" << ret << dendl;
+  } else {
+    dout(5) << "Remove Allocation File successfully" << dendl;
+  }
+
+  return ret;
+}
+
+//================================================================================================================
+//================================================================================================================
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
new file mode 100644
index 000000000..c3d014dc9
--- /dev/null
+++ b/src/os/bluestore/BlueStore.h
@@ -0,0 +1,4290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_H
+#define CEPH_OSD_BLUESTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <bit>
+#include <chrono>
+#include <ratio>
+#include <mutex>
+#include <condition_variable>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/unordered_set.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/functional/hash.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/circular_buffer.hpp>
+
+#include "include/cpp-btree/btree_set.h"
+
+#include "include/ceph_assert.h"
+#include "include/interval_set.h"
+#include "include/unordered_map.h"
+#include "include/mempool.h"
+#include "include/hash.h"
+#include "common/bloom_filter.hpp"
+#include "common/Finisher.h"
+#include "common/ceph_mutex.h"
+#include "common/Throttle.h"
+#include "common/perf_counters.h"
+#include "common/PriorityCache.h"
+#include "compressor/Compressor.h"
+#include "os/ObjectStore.h"
+
+#include "bluestore_types.h"
+#include "BlueFS.h"
+#include "common/EventTrace.h"
+
+#ifdef WITH_BLKIN
+#include "common/zipkin_trace.h"
+#endif
+
+class Allocator;
+class FreelistManager;
+class BlueStoreRepairer;
+class SimpleBitmap;
+//#define DEBUG_CACHE
+//#define DEBUG_DEFERRED
+
+
+
+// constants for Buffer::optimize()
+#define MAX_BUFFER_SLOP_RATIO_DEN  8  // so actually 1/N
+#define CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+
+// Identifiers for BlueStore's counters, grouped by topic.
+// l_bluestore_first and l_bluestore_last are the first and last enumerators;
+// every other value is assigned sequentially between them.
+// NOTE(review): presumably these index the perf counters declared via
+// common/perf_counters.h -- the registration code is not in this header.
+enum {
+  l_bluestore_first = 732430,
+  // space utilization stats
+  //****************************************
+  l_bluestore_allocated,
+  l_bluestore_stored,
+  l_bluestore_fragmentation,
+  l_bluestore_alloc_unit,
+  //****************************************
+
+  // Update op processing state latencies
+  //****************************************
+  l_bluestore_state_prepare_lat,
+  l_bluestore_state_aio_wait_lat,
+  l_bluestore_state_io_done_lat,
+  l_bluestore_state_kv_queued_lat,
+  l_bluestore_state_kv_committing_lat,
+  l_bluestore_state_kv_done_lat,
+  l_bluestore_state_finishing_lat,
+  l_bluestore_state_done_lat,
+
+  l_bluestore_state_deferred_queued_lat,
+  l_bluestore_state_deferred_aio_wait_lat,
+  l_bluestore_state_deferred_cleanup_lat,
+
+  l_bluestore_commit_lat,
+  //****************************************
+
+  // Update Transaction stats
+  //****************************************
+  l_bluestore_throttle_lat,
+  l_bluestore_submit_lat,
+  l_bluestore_txc,
+  //****************************************
+
+  // Read op stats
+  //****************************************
+  l_bluestore_read_onode_meta_lat,
+  l_bluestore_read_wait_aio_lat,
+  l_bluestore_csum_lat,
+  l_bluestore_read_eio,
+  l_bluestore_reads_with_retries,
+  l_bluestore_read_lat,
+  //****************************************
+
+  // kv_thread latencies
+  //****************************************
+  l_bluestore_kv_flush_lat,
+  l_bluestore_kv_commit_lat,
+  l_bluestore_kv_sync_lat,
+  l_bluestore_kv_final_lat,
+  //****************************************
+
+  // write op stats
+  //****************************************
+  l_bluestore_write_big,
+  l_bluestore_write_big_bytes,
+  l_bluestore_write_big_blobs,
+  l_bluestore_write_big_deferred,
+
+  l_bluestore_write_small,
+  l_bluestore_write_small_bytes,
+  l_bluestore_write_small_unused,
+  l_bluestore_write_small_pre_read,
+
+  l_bluestore_write_pad_bytes,
+  l_bluestore_write_penalty_read_ops,
+  l_bluestore_write_new,
+
+  l_bluestore_issued_deferred_writes,
+  l_bluestore_issued_deferred_write_bytes,
+  l_bluestore_submitted_deferred_writes,
+  l_bluestore_submitted_deferred_write_bytes,
+
+  l_bluestore_write_big_skipped_blobs,
+  l_bluestore_write_big_skipped_bytes,
+  l_bluestore_write_small_skipped,
+  l_bluestore_write_small_skipped_bytes,
+  //****************************************
+
+  // compression stats
+  //****************************************
+  l_bluestore_compressed,
+  l_bluestore_compressed_allocated,
+  l_bluestore_compressed_original,
+  l_bluestore_compress_lat,
+  l_bluestore_decompress_lat,
+  l_bluestore_compress_success_count,
+  l_bluestore_compress_rejected_count,
+  //****************************************
+
+  // onode cache stats
+  //****************************************
+  l_bluestore_onodes,
+  l_bluestore_pinned_onodes,
+  l_bluestore_onode_hits,
+  l_bluestore_onode_misses,
+  l_bluestore_onode_shard_hits,
+  l_bluestore_onode_shard_misses,
+  l_bluestore_extents,
+  l_bluestore_blobs,
+  //****************************************
+
+  // buffer cache stats
+  //****************************************
+  l_bluestore_buffers,
+  l_bluestore_buffer_bytes,
+  l_bluestore_buffer_hit_bytes,
+  l_bluestore_buffer_miss_bytes,
+  //****************************************
+
+  // internal stats
+  //****************************************
+  l_bluestore_onode_reshard,
+  l_bluestore_blob_split,
+  l_bluestore_extent_compress,
+  l_bluestore_gc_merged,
+  //****************************************
+
+  // misc
+  //****************************************
+  l_bluestore_omap_iterator_count,
+  l_bluestore_omap_rmkeys_count,
+  l_bluestore_omap_rmkey_ranges_count,
+  //****************************************
+
+  // other client ops latencies
+  //****************************************
+  l_bluestore_omap_seek_to_first_lat,
+  l_bluestore_omap_upper_bound_lat,
+  l_bluestore_omap_lower_bound_lat,
+  l_bluestore_omap_next_lat,
+  l_bluestore_omap_get_keys_lat,
+  l_bluestore_omap_get_values_lat,
+  l_bluestore_omap_clear_lat,
+  l_bluestore_clist_lat,
+  l_bluestore_remove_lat,
+  l_bluestore_truncate_lat,
+  //****************************************
+
+  // allocation stats
+  //****************************************
+  l_bluestore_allocate_hist,
+  //****************************************
+  l_bluestore_last
+};
+
+#define META_POOL_ID ((uint64_t)-1ull)
+using bptr_c_it_t = buffer::ptr::const_iterator;
+
+class BlueStore : public ObjectStore,
+		  public md_config_obs_t {
+  // -----------------------------------------------------
+  // types
+public:
+  // config observer
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set<std::string> &changed) override;
+
+  //handler for discard event
+  void handle_discard(interval_set<uint64_t>& to_release);
+
+  void _set_csum();
+  void _set_compression();
+  void _set_throttle_params();
+  int _set_cache_sizes();
+  /// Reload max_defer_interval from the "bluestore_max_defer_interval" option.
+  void _set_max_defer_interval() {
+    const auto configured =
+      cct->_conf.get_val<double>("bluestore_max_defer_interval");
+    max_defer_interval = configured;
+  }
+
+  struct TransContext;
+
+  typedef std::map<uint64_t, ceph::buffer::list> ready_regions_t;
+
+
+  struct BufferSpace;
+  struct Collection;
+  typedef boost::intrusive_ptr<Collection> CollectionRef;
+
+  /// Interface for objects that want a callback when their aio completes.
+  struct AioContext {
+    /// Completion hook; implementations receive the owning store.
+    virtual void aio_finish(BlueStore *store) = 0;
+    virtual ~AioContext() = default;
+  };
+
+  /// One cached run of data owned by a BufferSpace.
+  struct Buffer {
+    MEMPOOL_CLASS_HELPERS();
+
+    enum {
+      STATE_EMPTY,     ///< empty buffer -- used for cache history
+      STATE_CLEAN,     ///< clean data that is up to date
+      STATE_WRITING,   ///< data that is being written (io not yet complete)
+    };
+    /// Printable name for a STATE_* value ("???" when unrecognised).
+    static const char *get_state_name(int s) {
+      if (s == STATE_EMPTY) {
+        return "empty";
+      }
+      if (s == STATE_CLEAN) {
+        return "clean";
+      }
+      if (s == STATE_WRITING) {
+        return "writing";
+      }
+      return "???";
+    }
+    enum {
+      FLAG_NOCACHE = 1,  ///< trim when done WRITING (do not become CLEAN)
+      // NOTE: fix operator<< when you define a second flag
+    };
+    /// Printable name for a FLAG_* value ("???" when unrecognised).
+    static const char *get_flag_name(int s) {
+      return s == FLAG_NOCACHE ? "nocache" : "???";
+    }
+
+    BufferSpace *space;
+    uint16_t state;             ///< STATE_*
+    uint16_t cache_private = 0; ///< opaque (to us) value used by Cache impl
+    uint32_t flags;             ///< FLAG_*
+    uint64_t seq;
+    uint32_t offset, length;
+    ceph::buffer::list data;
+    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin
+
+    boost::intrusive::list_member_hook<> lru_item;
+    boost::intrusive::list_member_hook<> state_item;
+
+    /// Build a buffer that covers [o, o + l) but carries no data.
+    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, uint32_t l,
+           unsigned f = 0)
+      : space(space), state(s), flags(f), seq(q), offset(o), length(l) {}
+    /// Build a buffer holding b; its length is taken from b.length().
+    Buffer(BufferSpace *space, unsigned s, uint64_t q, uint32_t o, ceph::buffer::list& b,
+           unsigned f = 0)
+      : space(space), state(s), flags(f), seq(q), offset(o),
+        length(b.length()), data(b) {}
+
+    bool is_empty() const {
+      return STATE_EMPTY == state;
+    }
+    bool is_clean() const {
+      return STATE_CLEAN == state;
+    }
+    bool is_writing() const {
+      return STATE_WRITING == state;
+    }
+
+    /// Offset one past the last byte covered by this buffer.
+    uint32_t end() const {
+      return offset + length;
+    }
+
+    /// Shrink the buffer to newlen bytes; newlen must be strictly smaller
+    /// than the current length.
+    void truncate(uint32_t newlen) {
+      ceph_assert(newlen < length);
+      if (data.length() != 0) {
+        // keep only the leading newlen bytes of the payload
+        ceph::buffer::list head;
+        head.substr_of(data, 0, newlen);
+        data = std::move(head);
+      }
+      length = newlen;
+    }
+    /// Rebuild the payload when it is split across several buffers or its
+    /// first buffer wastes more than 1/MAX_BUFFER_SLOP_RATIO_DEN of the data
+    /// length.
+    void maybe_rebuild() {
+      if (!data.length()) {
+        return;
+      }
+      if (data.get_num_buffers() > 1 ||
+          data.front().wasted() > data.length() / MAX_BUFFER_SLOP_RATIO_DEN) {
+        data.rebuild();
+      }
+    }
+
+    /// Emit state, seq, offset, length and payload length into f.
+    void dump(ceph::Formatter *f) const {
+      const char *state_name = get_state_name(state);
+      f->dump_string("state", state_name);
+      f->dump_unsigned("seq", seq);
+      f->dump_unsigned("offset", offset);
+      f->dump_unsigned("length", length);
+      f->dump_unsigned("data_length", data.length());
+    }
+  };
+
+  struct BufferCacheShard;
+
+  /// map logical extent range (object) onto buffers
+  ///
+  /// Owns its Buffers through buffer_map (keyed by buffer offset) and keeps
+  /// the ones in STATE_WRITING on an intrusive list ordered by seq.
+  /// Methods whose name starts with '_' do not take cache->lock themselves;
+  /// the public wrappers visible here (discard, write, did_read, dump) take
+  /// it before calling them.
+  struct BufferSpace {
+    enum {
+      BYPASS_CLEAN_CACHE = 0x1,  // bypass clean cache
+    };
+
+    typedef boost::intrusive::list<
+      Buffer,
+      boost::intrusive::member_hook<
+        Buffer,
+	boost::intrusive::list_member_hook<>,
+	&Buffer::state_item> > state_list_t;
+
+    mempool::bluestore_cache_meta::map<uint32_t, std::unique_ptr<Buffer>>
+      buffer_map;
+
+    // we use a bare intrusive list here instead of std::map because
+    // it uses less memory and we expect this to be very small (very
+    // few IOs in flight to the same Blob at the same time).
+    state_list_t writing;   ///< writing buffers, sorted by seq, ascending
+
+    /// Both containers must already be empty when the space is destroyed.
+    ~BufferSpace() {
+      ceph_assert(buffer_map.empty());
+      ceph_assert(writing.empty());
+    }
+
+    /// Take ownership of b (stored in buffer_map under b->offset).
+    /// A writing buffer is linked into `writing` at the position that keeps
+    /// the list sorted by seq; any other buffer is handed to cache->_add()
+    /// together with level and near.
+    void _add_buffer(BufferCacheShard* cache, Buffer* b, int level, Buffer* near) {
+      cache->_audit("_add_buffer start");
+      buffer_map[b->offset].reset(b);
+      if (b->is_writing()) {
+        // we might get already cached data for which resetting mempool is inappropriate
+        // hence calling try_assign_to_mempool
+        b->data.try_assign_to_mempool(mempool::mempool_bluestore_writing);
+        if (writing.empty() || writing.rbegin()->seq <= b->seq) {
+          writing.push_back(*b);
+        } else {
+          // the last element has seq > b->seq here, so this scan always
+          // stops before running off the end of the list
+          auto it = writing.begin();
+          while (it->seq < b->seq) {
+            ++it;
+          }
+
+          ceph_assert(it->seq >= b->seq);
+          // note that this will insert b before it
+          // hence the order is maintained
+          writing.insert(it, *b);
+        }
+      } else {
+        b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
+        cache->_add(b, level, near);
+      }
+      cache->_audit("_add_buffer end");
+    }
+    /// Remove the buffer stored under b->offset.
+    void _rm_buffer(BufferCacheShard* cache, Buffer *b) {
+      _rm_buffer(cache, buffer_map.find(b->offset));
+    }
+    /// Remove the buffer at p: unlink it from `writing` or from the cache,
+    /// then erase the map entry (which destroys the Buffer).
+    /// p must not be buffer_map.end().
+    void _rm_buffer(BufferCacheShard* cache,
+		    std::map<uint32_t, std::unique_ptr<Buffer>>::iterator p) {
+      ceph_assert(p != buffer_map.end());
+      cache->_audit("_rm_buffer start");
+      if (p->second->is_writing()) {
+        writing.erase(writing.iterator_to(*p->second));
+      } else {
+	cache->_rm(p->second.get());
+      }
+      buffer_map.erase(p);
+      cache->_audit("_rm_buffer end");
+    }
+
+    /// Return the first buffer whose range reaches past offset: the
+    /// preceding buffer if it still covers offset, otherwise the first
+    /// buffer starting at or after offset (possibly end()).
+    std::map<uint32_t,std::unique_ptr<Buffer>>::iterator _data_lower_bound(
+      uint32_t offset) {
+      auto i = buffer_map.lower_bound(offset);
+      if (i != buffer_map.begin()) {
+	--i;
+	if (i->first + i->second->length <= offset)
+	  ++i;
+      }
+      return i;
+    }
+
+    // must be called under protection of the Cache lock
+    void _clear(BufferCacheShard* cache);
+
+    // return value is the highest cache_private of a trimmed buffer, or 0.
+    int discard(BufferCacheShard* cache, uint32_t offset, uint32_t length) {
+      std::lock_guard l(cache->lock);
+      int ret = _discard(cache, offset, length);
+      cache->_trim();
+      return ret;
+    }
+    int _discard(BufferCacheShard* cache, uint32_t offset, uint32_t length);
+
+    /// Record bl as a STATE_WRITING buffer at offset for write sequence seq.
+    /// Anything already cached in that range is discarded first; the value
+    /// returned by _discard() is kept in the new buffer's cache_private.
+    void write(BufferCacheShard* cache, uint64_t seq, uint32_t offset, ceph::buffer::list& bl,
+	       unsigned flags) {
+      std::lock_guard l(cache->lock);
+      Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl,
+			     flags);
+      b->cache_private = _discard(cache, offset, bl.length());
+      _add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
+      cache->_trim();
+    }
+    void _finish_write(BufferCacheShard* cache, uint64_t seq);
+    /// Cache bl as a STATE_CLEAN buffer at offset, replacing whatever was
+    /// cached in that range.
+    void did_read(BufferCacheShard* cache, uint32_t offset, ceph::buffer::list& bl) {
+      std::lock_guard l(cache->lock);
+      Buffer *b = new Buffer(this, Buffer::STATE_CLEAN, 0, offset, bl);
+      b->cache_private = _discard(cache, offset, bl.length());
+      _add_buffer(cache, b, 1, nullptr);
+      cache->_trim();
+    }
+
+    void read(BufferCacheShard* cache, uint32_t offset, uint32_t length,
+	      BlueStore::ready_regions_t& res,
+	      interval_set<uint32_t>& res_intervals,
+	      int flags = 0);
+
+    /// Discard everything from offset up to the largest uint32_t offset.
+    void truncate(BufferCacheShard* cache, uint32_t offset) {
+      discard(cache, offset, (uint32_t)-1 - offset);
+    }
+
+    void split(BufferCacheShard* cache, size_t pos, BufferSpace &r);
+
+    /// Dump every buffer as an entry of a "buffers" array, under cache->lock.
+    void dump(BufferCacheShard* cache, ceph::Formatter *f) const {
+      std::lock_guard l(cache->lock);
+      f->open_array_section("buffers");
+      for (auto& i : buffer_map) {
+	f->open_object_section("buffer");
+	ceph_assert(i.first == i.second->offset);
+	i.second->dump(f);
+	f->close_section();
+      }
+      f->close_section();
+    }
+  };
+
+  struct SharedBlobSet;
+
+  /// in-memory shared blob state (incl cached buffers)
+  ///
+  /// Reference counted through nref; boost::intrusive_ptr drives get()/put()
+  /// via the friend functions below.
+  struct SharedBlob {
+    MEMPOOL_CLASS_HELPERS();
+
+    std::atomic_int nref = {0}; ///< reference count
+    bool loaded = false;        ///< selects which union member below is valid
+
+    CollectionRef coll;
+    union {
+      uint64_t sbid_unloaded;              ///< sbid if persistent isn't loaded
+      bluestore_shared_blob_t *persistent; ///< persistent part of the shared blob if any
+    };
+    BufferSpace bc;             ///< buffer cache
+
+    /// Create an unloaded blob with sbid 0; calls add_blob() on the
+    /// collection's cache when there is one.
+    SharedBlob(Collection *_coll) : coll(_coll), sbid_unloaded(0) {
+      if (get_cache()) {
+	get_cache()->add_blob();
+      }
+    }
+    SharedBlob(uint64_t i, Collection *_coll);
+    ~SharedBlob();
+
+    /// sbid from the persistent part when loaded, otherwise the stored one.
+    uint64_t get_sbid() const {
+      return loaded ? persistent->sbid : sbid_unloaded;
+    }
+
+    friend void intrusive_ptr_add_ref(SharedBlob *b) { b->get(); }
+    friend void intrusive_ptr_release(SharedBlob *b) { b->put(); }
+
+    void dump(ceph::Formatter* f) const;
+    friend std::ostream& operator<<(std::ostream& out, const SharedBlob& sb);
+
+    /// Take one reference.
+    void get() {
+      ++nref;
+    }
+    void put();
+
+    /// get logical references
+    void get_ref(uint64_t offset, uint32_t length);
+
+    /// put logical references, and get back any released extents
+    void put_ref(uint64_t offset, uint32_t length,
+		 PExtentVector *r, bool *unshare);
+
+    void finish_write(uint64_t seq);
+
+    /// Two shared blobs compare equal when their sbids match.
+    friend bool operator==(const SharedBlob &l, const SharedBlob &r) {
+      return l.get_sbid() == r.get_sbid();
+    }
+    /// Cache shard of the owning collection, or nullptr without a collection.
+    inline BufferCacheShard* get_cache() {
+      return coll ? coll->cache : nullptr;
+    }
+    /// SharedBlobSet of the owning collection, or nullptr without a collection.
+    inline SharedBlobSet* get_parent() {
+      return coll ? &(coll->shared_blob_set) : nullptr;
+    }
+    inline bool is_loaded() const {
+      return loaded;
+    }
+
+  };
+  typedef boost::intrusive_ptr<SharedBlob> SharedBlobRef;
+
+  /// Index of SharedBlobs by sbid; every accessor serialises on `lock`.
+  struct SharedBlobSet {
+    /// protect lookup, insertion, removal
+    ceph::mutex lock = ceph::make_mutex("BlueStore::SharedBlobSet::lock");
+
+    // entries are bare pointers on purpose: the table must not contribute
+    // to a blob's reference count
+    mempool::bluestore_cache_meta::unordered_map<uint64_t,SharedBlob*> sb_map;
+
+    /// Return the blob registered under sbid, or nullptr when there is no
+    /// entry or the entry's reference count is already zero.
+    SharedBlobRef lookup(uint64_t sbid) {
+      std::lock_guard guard(lock);
+      auto found = sb_map.find(sbid);
+      if (found != sb_map.end() && found->second->nref != 0) {
+        return found->second;
+      }
+      return nullptr;
+    }
+
+    /// Register sb under its sbid (replacing any previous entry) and make
+    /// coll its collection.
+    void add(Collection* coll, SharedBlob *sb) {
+      std::lock_guard guard(lock);
+      sb_map[sb->get_sbid()] = sb;
+      sb->coll = coll;
+    }
+
+    /// Drop sb from the table if the entry for its sbid still points at it.
+    /// With verify_nref_is_zero set, nothing is removed and false is
+    /// returned while sb is still referenced; otherwise returns true.
+    bool remove(SharedBlob *sb, bool verify_nref_is_zero=false) {
+      std::lock_guard guard(lock);
+      ceph_assert(sb->get_parent() == this);
+      if (verify_nref_is_zero && sb->nref != 0) {
+        return false;
+      }
+      auto entry = sb_map.find(sb->get_sbid());
+      if (entry == sb_map.end() || entry->second != sb) {
+        // no entry, or the slot now belongs to a different blob
+        return true;
+      }
+      sb_map.erase(entry);
+      return true;
+    }
+
+    /// True when no blob is registered.
+    bool empty() {
+      std::lock_guard guard(lock);
+      return sb_map.empty();
+    }
+
+    template <int LogLevelV>
+    void dump(CephContext *cct);
+  };
+
+//#define CACHE_BLOB_BL  // not sure if this is a win yet or not... :/
+
+  /// in-memory blob metadata and associated cached buffers (if any)
+  ///
+  /// Reference counted through nref (see get()/put()); the object deletes
+  /// itself when the count drops to zero. Two encode/decode variants exist,
+  /// selected at compile time by CACHE_BLOB_BL.
+  struct Blob {
+    MEMPOOL_CLASS_HELPERS();
+
+    std::atomic_int nref = {0};     ///< reference count
+    int16_t id = -1;                ///< id, for spanning blobs only, >= 0
+    int16_t last_encoded_id = -1;   ///< (ephemeral) used during encoding only
+    SharedBlobRef shared_blob;      ///< shared blob state (if any)
+
+  private:
+    mutable bluestore_blob_t blob;  ///< decoded blob metadata
+#ifdef CACHE_BLOB_BL
+    mutable ceph::buffer::list blob_bl;     ///< cached encoded blob, blob is dirty if empty
+#endif
+    /// refs from this shard.  ephemeral if id<0, persisted if spanning.
+    bluestore_blob_use_tracker_t used_in_blob;
+
+  public:
+
+    friend void intrusive_ptr_add_ref(Blob *b) { b->get(); }
+    friend void intrusive_ptr_release(Blob *b) { b->put(); }
+
+    void dump(ceph::Formatter* f) const;
+    friend std::ostream& operator<<(std::ostream& out, const Blob &b);
+
+    const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
+      return used_in_blob;
+    }
+    /// True while the use tracker is not empty.
+    bool is_referenced() const {
+      return used_in_blob.is_not_empty();
+    }
+    uint32_t get_referenced_bytes() const {
+      return used_in_blob.get_referenced_bytes();
+    }
+
+    /// A blob is spanning exactly when it has been given an id >= 0.
+    bool is_spanning() const {
+      return id >= 0;
+    }
+
+    /// True when the blob can be split at all: no writing buffers are
+    /// pending and both the use tracker and the blob metadata allow it.
+    /// Takes the cache lock of the shared blob while checking.
+    bool can_split() const {
+      std::lock_guard l(shared_blob->get_cache()->lock);
+      // splitting a BufferSpace writing list is too hard; don't try.
+      return shared_blob->bc.writing.empty() &&
+             used_in_blob.can_split() &&
+             get_blob().can_split();
+    }
+
+    /// True when both the use tracker and the blob metadata accept a split
+    /// at blob_offset.
+    bool can_split_at(uint32_t blob_offset) const {
+      return used_in_blob.can_split_at(blob_offset) &&
+             get_blob().can_split_at(blob_offset);
+    }
+
+    bool can_reuse_blob(uint32_t min_alloc_size,
+			uint32_t target_blob_size,
+			uint32_t b_offset,
+			uint32_t *length0);
+
+    /// Copy this blob's shared_blob and metadata into o.
+    /// The use tracker (used_in_blob) is not copied.
+    void dup(Blob& o) {
+      o.shared_blob = shared_blob;
+      o.blob = blob;
+#ifdef CACHE_BLOB_BL
+      o.blob_bl = blob_bl;
+#endif
+    }
+
+    /// Read-only access to the decoded metadata.
+    inline const bluestore_blob_t& get_blob() const {
+      return blob;
+    }
+    /// Mutable access to the decoded metadata; with CACHE_BLOB_BL this also
+    /// drops the cached encoding so it gets regenerated.
+    inline bluestore_blob_t& dirty_blob() {
+#ifdef CACHE_BLOB_BL
+      blob_bl.clear();
+#endif
+      return blob;
+    }
+
+    /// discard buffers for unallocated regions
+    void discard_unallocated(Collection *coll);
+
+    /// get logical references
+    void get_ref(Collection *coll, uint32_t offset, uint32_t length);
+    /// put logical references, and get back any released extents
+    bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
+		 PExtentVector *r);
+
+    /// split the blob
+    void split(Collection *coll, uint32_t blob_offset, Blob *o);
+
+    /// Take one reference.
+    void get() {
+      ++nref;
+    }
+    /// Drop one reference; the blob deletes itself on the last one.
+    void put() {
+      if (--nref == 0)
+	delete this;
+    }
+
+
+#ifdef CACHE_BLOB_BL
+    /// Fill blob_bl with the encoded metadata if it is currently empty.
+    void _encode() const {
+      if (blob_bl.length() == 0 ) {
+	encode(blob, blob_bl);
+      } else {
+	ceph_assert(blob_bl.length());
+      }
+    }
+    /// Add the encoded size of the blob (and optionally of the ref map) to p.
+    void bound_encode(
+      size_t& p,
+      bool include_ref_map) const {
+      _encode();
+      p += blob_bl.length();
+      if (include_ref_map) {
+	used_in_blob.bound_encode(p);
+      }
+    }
+    /// Append the cached encoding (and optionally the ref map) to p.
+    void encode(
+      ceph::buffer::list::contiguous_appender& p,
+      bool include_ref_map) const {
+      _encode();
+      p.append(blob_bl);
+      if (include_ref_map) {
+	used_in_blob.encode(p);
+      }
+    }
+    /// Decode the metadata from p and keep the raw bytes that were consumed
+    /// as the cached encoding; optionally decode the ref map afterwards.
+    void decode(
+      ceph::buffer::ptr::const_iterator& p,
+      bool include_ref_map,
+      Collection */*coll*/) {
+      const char *start = p.get_pos();
+      denc(blob, p);
+      const char *end = p.get_pos();
+      blob_bl.clear();
+      blob_bl.append(start, end - start);
+      if (include_ref_map) {
+	used_in_blob.decode(p);
+      }
+    }
+#else
+    /// Add the encoded size of the blob to p; sbid is counted only for
+    /// shared blobs and the ref map only when include_ref_map is set.
+    void bound_encode(
+      size_t& p,
+      uint64_t struct_v,
+      uint64_t sbid,
+      bool include_ref_map) const {
+      denc(blob, p, struct_v);
+      if (blob.is_shared()) {
+        denc(sbid, p);
+      }
+      if (include_ref_map) {
+	used_in_blob.bound_encode(p);
+      }
+    }
+    /// Encode the blob into p using the same layout as bound_encode():
+    /// metadata, then sbid for shared blobs, then the optional ref map.
+    void encode(
+      ceph::buffer::list::contiguous_appender& p,
+      uint64_t struct_v,
+      uint64_t sbid,
+      bool include_ref_map) const {
+      denc(blob, p, struct_v);
+      if (blob.is_shared()) {
+        denc(sbid, p);
+      }
+      if (include_ref_map) {
+	used_in_blob.encode(p);
+      }
+    }
+    void decode(
+      ceph::buffer::ptr::const_iterator& p,
+      uint64_t struct_v,
+      uint64_t* sbid,
+      bool include_ref_map,
+      Collection *coll);
+#endif
+  };
+  typedef boost::intrusive_ptr<Blob> BlobRef;
+  typedef mempool::bluestore_cache_meta::map<int,BlobRef> blob_map_t;
+
+  /// a logical extent, pointing to (some portion of) a blob
+  typedef boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true> > ExtentBase; //making an alias to avoid build warnings
+  struct Extent : public ExtentBase {
+    MEMPOOL_CLASS_HELPERS();
+
+    uint32_t logical_offset = 0;      ///< logical offset
+    uint32_t blob_offset = 0;         ///< blob offset
+    uint32_t length = 0;              ///< length
+    BlobRef  blob;                    ///< the blob with our data
+
+    /// lookup-only key: carries nothing but a logical offset
+    explicit Extent(uint32_t lo) : ExtentBase(), logical_offset(lo) { }
+    /// empty extent that is filled in later (see decode_some())
+    explicit Extent() : ExtentBase() {
+    }
+    /// fully initialised extent; attaches b through assign_blob()
+    Extent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+      : ExtentBase(),
+        logical_offset(lo), blob_offset(o), length(l) {
+      assign_blob(b);
+    }
+    ~Extent() {
+      if (!blob) {
+        return;
+      }
+      // pairs with the add_extent() call made in assign_blob()
+      blob->shared_blob->get_cache()->rm_extent();
+    }
+
+    void dump(ceph::Formatter* f) const;
+
+    /// Attach b to an extent that has no blob yet and call add_extent() on
+    /// the cache of b's shared blob.
+    void assign_blob(const BlobRef& b) {
+      ceph_assert(!blob);
+      blob = b;
+      blob->shared_blob->get_cache()->add_extent();
+    }
+
+    // ordering used by the intrusive set: extents compare by logical_offset
+    friend bool operator<(const Extent &a, const Extent &b) {
+      return a.logical_offset < b.logical_offset;
+    }
+    friend bool operator>(const Extent &a, const Extent &b) {
+      return b.logical_offset < a.logical_offset;
+    }
+    friend bool operator==(const Extent &a, const Extent &b) {
+      return a.logical_offset == b.logical_offset;
+    }
+
+    /// logical offset at which the blob's byte 0 would sit
+    uint32_t blob_start() const {
+      return logical_offset - blob_offset;
+    }
+
+    /// blob_start() plus the blob's logical length
+    uint32_t blob_end() const {
+      const uint32_t blob_len = blob->get_blob().get_logical_length();
+      return blob_start() + blob_len;
+    }
+
+    /// one past the last logical offset covered by this extent
+    uint32_t logical_end() const {
+      return logical_offset + length;
+    }
+
+    /// True unless the whole blob lies inside [o, o + l), i.e. when it
+    /// starts before o or ends after o + l.
+    bool blob_escapes_range(uint32_t o, uint32_t l) const {
+      const bool contained = blob_start() >= o && blob_end() <= o + l;
+      return !contained;
+    }
+  };
+  typedef boost::intrusive::set<Extent> extent_map_t;
+
+
+  friend std::ostream& operator<<(std::ostream& out, const Extent& e);
+
+  /// An Extent that was replaced or punched out, kept on an intrusive list
+  /// together with the physical extents released for it.
+  struct OldExtent {
+    boost::intrusive::list_member_hook<> old_extent_item;
+    Extent e;
+    PExtentVector r;
+    // set for the last removed extent that leaves its blob empty; needed to
+    // update compression stats properly
+    bool blob_empty = false;
+    OldExtent(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b)
+      : e(lo, o, l, b) {
+    }
+    static OldExtent* create(CollectionRef c,
+                             uint32_t lo,
+                             uint32_t o,
+                             uint32_t l,
+                             BlobRef& b);
+  };
+  typedef boost::intrusive::list<
+      OldExtent,
+      boost::intrusive::member_hook<
+        OldExtent,
+    boost::intrusive::list_member_hook<>,
+    &OldExtent::old_extent_item> > old_extent_map_t;
+
+  struct Onode;
+
+  /// a sharded extent map, mapping offsets to lextents to blobs
+  ///
+  /// Owns the Extent objects linked into extent_map (they are deleted via
+  /// DeleteDisposer) and tracks per-shard load/dirty state plus the range
+  /// that has been requested for resharding.
+  struct ExtentMap {
+    Onode *onode;
+    extent_map_t extent_map;        ///< map of Extents to Blobs
+    blob_map_t spanning_blob_map;   ///< blobs that span shards
+    typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+    struct Shard {
+      bluestore_onode_t::shard_info *shard_info = nullptr;
+      unsigned extents = 0;  ///< count extents in this shard
+      bool loaded = false;   ///< true if shard is loaded
+      bool dirty = false;    ///< true if shard is dirty and needs reencoding
+    };
+
+    mempool::bluestore_cache_meta::vector<Shard> shards;    ///< shards
+
+    ceph::buffer::list inline_bl;    ///< cached encoded map, if unsharded; empty=>dirty
+
+    uint32_t needs_reshard_begin = 0;
+    uint32_t needs_reshard_end = 0;
+
+    void dup(BlueStore* b, TransContext*, CollectionRef&, OnodeRef&, OnodeRef&,
+      uint64_t&, uint64_t&, uint64_t&);
+
+    /// True when a non-empty reshard range is pending.
+    bool needs_reshard() const {
+      return needs_reshard_end > needs_reshard_begin;
+    }
+    /// Forget the pending reshard range (both bounds go back to 0).
+    void clear_needs_reshard() {
+      needs_reshard_begin = needs_reshard_end = 0;
+    }
+    /// Widen the pending reshard range so that it covers [begin, end).
+    /// NOTE(review): needs_reshard_begin starts at 0 and clear_needs_reshard()
+    /// resets it to 0, so unless it is assigned elsewhere the unsigned test
+    /// `begin < needs_reshard_begin` can never be true and the pending range
+    /// always starts at 0 -- confirm whether that is intended.
+    void request_reshard(uint32_t begin, uint32_t end) {
+      if (begin < needs_reshard_begin) {
+	needs_reshard_begin = begin;
+      }
+      if (end > needs_reshard_end) {
+	needs_reshard_end = end;
+      }
+    }
+
+    /// Disposer handed to the intrusive set so that erased Extents are freed.
+    struct DeleteDisposer {
+      void operator()(Extent *e) { delete e; }
+    };
+
+    ExtentMap(Onode *o, size_t inline_shard_prealloc_size);
+    ~ExtentMap() {
+      extent_map.clear_and_dispose(DeleteDisposer());
+    }
+
+    /// Delete all extents and reset shards, the inline encoding and the
+    /// pending reshard range. spanning_blob_map is left untouched.
+    void clear() {
+      extent_map.clear_and_dispose(DeleteDisposer());
+      shards.clear();
+      inline_bl.clear();
+      clear_needs_reshard();
+    }
+
+    void dump(ceph::Formatter* f) const;
+
+    bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl,
+		     unsigned *pn);
+
+    /// Decoder for encoded extent data; what happens to the decoded pieces
+    /// is left to the subclass through the pure virtual hooks below.
+    class ExtentDecoder {
+      uint64_t pos = 0;
+      uint64_t prev_len = 0;
+      uint64_t extent_pos = 0;
+    protected:
+      virtual void consume_blobid(Extent* le,
+                                  bool spanning,
+                                  uint64_t blobid) = 0;
+      virtual void consume_blob(Extent* le,
+                                uint64_t extent_no,
+                                uint64_t sbid,
+                                BlobRef b) = 0;
+      virtual void consume_spanning_blob(uint64_t sbid, BlobRef b) = 0;
+      virtual Extent* get_next_extent() = 0;
+      virtual void add_extent(Extent*) = 0;
+
+      void decode_extent(Extent* le,
+                         __u8 struct_v,
+                         bptr_c_it_t& p,
+                         Collection* c);
+    public:
+      virtual ~ExtentDecoder() {
+      }
+
+      unsigned decode_some(const ceph::buffer::list& bl, Collection* c);
+      void decode_spanning_blobs(bptr_c_it_t& p, Collection* c);
+    };
+
+    /// ExtentDecoder implementation bound to an ExtentMap.
+    class ExtentDecoderFull : public ExtentDecoder {
+      ExtentMap& extent_map;
+      std::vector<BlobRef> blobs;
+    protected:
+      void consume_blobid(Extent* le, bool spanning, uint64_t blobid) override;
+      void consume_blob(Extent* le,
+                        uint64_t extent_no,
+                        uint64_t sbid,
+                        BlobRef b) override;
+      void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+      Extent* get_next_extent() override;
+      void add_extent(Extent* ) override;
+    public:
+      ExtentDecoderFull (ExtentMap& _extent_map) : extent_map(_extent_map) {
+      }
+    };
+
+    unsigned decode_some(ceph::buffer::list& bl);
+
+    void bound_encode_spanning_blobs(size_t& p);
+    void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
+    /// Look up a spanning blob by id; the id must be present.
+    BlobRef get_spanning_blob(int id) {
+      auto p = spanning_blob_map.find(id);
+      ceph_assert(p != spanning_blob_map.end());
+      return p->second;
+    }
+
+    void update(KeyValueDB::Transaction t, bool force);
+    decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
+    void reshard(
+      KeyValueDB *db,
+      KeyValueDB::Transaction t);
+
+    /// initialize Shards from the onode
+    void init_shards(bool loaded, bool dirty);
+
+    /// return index of shard containing offset
+    /// or -1 if not found
+    ///
+    /// Binary search over shards by shard_info->offset: a shard matches
+    /// when offset is at or after its start and before the next shard's
+    /// start (or it is the last shard).
+    int seek_shard(uint32_t offset) {
+      size_t end = shards.size();
+      size_t mid, left = 0;
+      size_t right = end; // one past the right end
+
+      while (left < right) {
+        mid = left + (right - left) / 2;
+        if (offset >= shards[mid].shard_info->offset) {
+          size_t next = mid + 1;
+          if (next >= end || offset < shards[next].shard_info->offset)
+            return mid;
+          //continue to search forwards
+          left = next;
+        } else {
+          //continue to search backwards
+          right = mid;
+        }
+      }
+
+      return -1; // not found
+    }
+
+    /// check if a range spans a shard
+    ///
+    /// Returns false when there are no shards, when offset falls into the
+    /// last shard, or when the range ends at or before the start of the
+    /// following shard. offset must fall into some shard.
+    bool spans_shard(uint32_t offset, uint32_t length) {
+      if (shards.empty()) {
+	return false;
+      }
+      int s = seek_shard(offset);
+      ceph_assert(s >= 0);
+      if (s == (int)shards.size() - 1) {
+	return false; // last shard
+      }
+      if (offset + length <= shards[s+1].shard_info->offset) {
+	return false;
+      }
+      return true;
+    }
+
+    /// ensure that a range of the map is loaded
+    void fault_range(KeyValueDB *db,
+		     uint32_t offset, uint32_t length);
+
+    /// ensure a range of the map is marked dirty
+    void dirty_range(uint32_t offset, uint32_t length);
+
+    /// for seek_lextent test
+    extent_map_t::iterator find(uint64_t offset);
+
+    /// seek to the first lextent including or after offset
+    extent_map_t::iterator seek_lextent(uint64_t offset);
+    extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
+
+    /// add a new Extent
+    /// (heap allocated here; ownership passes to extent_map)
+    void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
+      extent_map.insert(*new Extent(lo, o, l, b));
+    }
+
+    /// remove (and delete) an Extent
+    void rm(extent_map_t::iterator p) {
+      extent_map.erase_and_dispose(p, DeleteDisposer());
+    }
+
+    bool has_any_lextents(uint64_t offset, uint64_t length);
+
+    /// consolidate adjacent lextents in extent_map
+    int compress_extent_map(uint64_t offset, uint64_t length);
+
+    /// punch a logical hole.  add lextents to deref to target list.
+    void punch_hole(CollectionRef &c,
+		    uint64_t offset, uint64_t length,
+		    old_extent_map_t *old_extents);
+
+    /// put new lextent into lextent_map overwriting existing ones if
+    /// any and update references accordingly
+    Extent *set_lextent(CollectionRef &c,
+			uint64_t logical_offset,
+			uint64_t offset, uint64_t length,
+                        BlobRef b,
+			old_extent_map_t *old_extents);
+
+    /// split a blob (and referring extents)
+    BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
+  };
+
+  /// Compressed Blob Garbage collector
+  /*
+  The primary idea of the collector is to estimate the difference between
+  the allocation units (AUs) currently used by compressed blobs and the new
+  AUs required to store that data uncompressed.
+  The estimation is performed for protrusive extents within a logical range
+  determined by the concatenation of the old_extents collection and the
+  specific (current) write request.
+  The reason old_extents is used is the need to handle blob ref counts
+  properly. Old extents still hold blob refs and hence we need to traverse
+  the collection to determine whether a blob is to be released.
+  Protrusive extents are extents that fit into the blob set in action
+  (ones that are below the logical range from above) but are not removed
+  totally by the current write.
+  E.g. for
+  extent1 <loffs = 100, boffs = 100, len  = 100> ->
+    blob1<compressed, len_on_disk=4096, logical_len=8192>
+  extent2 <loffs = 200, boffs = 200, len  = 100> ->
+    blob2<raw, len_on_disk=4096, llen=4096>
+  extent3 <loffs = 300, boffs = 300, len  = 100> ->
+    blob1<compressed, len_on_disk=4096, llen=8192>
+  extent4 <loffs = 4096, boffs = 0, len  = 100>  ->
+    blob3<raw, len_on_disk=4096, llen=4096>
+  write(300~100)
+  protrusive extents are within the following ranges <0~300, 400~8192-400>
+  In this case the existing AUs that might be removed due to GC (i.e. blob1)
+  use 2x4K bytes.
+  And the new AUs expected after GC = 0 since extent1 is to be merged into
+  blob2.
+  Hence we should do a collect.
+  */
+  class GarbageCollector
+  {
+  public:
+    /// return amount of allocation units that might be saved due to GC
+    int64_t estimate(
+      uint64_t offset,
+      uint64_t length,
+      const ExtentMap& extent_map,
+      const old_extent_map_t& old_extents,
+      uint64_t min_alloc_size);
+
+    /// return a collection of extents to perform GC on
+    const interval_set<uint64_t>& get_extents_to_collect() const {
+      return extents_to_collect;
+    }
+    /// remember the CephContext; all counters start at their defaults
+    GarbageCollector(CephContext* _cct) : cct(_cct) {}
+
+  private:
+    /// per-blob bookkeeping kept in affected_blobs
+    struct BlobInfo {
+      uint64_t referenced_bytes = 0;    ///< amount of bytes referenced in blob
+      int64_t expected_allocations = 0; ///< new alloc units required
+                                        ///< in case of gc fulfilled
+      bool collect_candidate = false;   ///< indicate if blob has any extents
+                                        ///< eligible for GC.
+      extent_map_t::const_iterator first_lextent; ///< points to the first
+                                                  ///< lextent referring to
+                                                  ///< the blob if any.
+                                                  ///< collect_candidate flag
+                                                  ///< determines the validity
+      extent_map_t::const_iterator last_lextent;  ///< points to the last
+                                                  ///< lextent referring to
+                                                  ///< the blob if any.
+
+      /// start tracking a blob with ref_bytes referenced bytes
+      BlobInfo(uint64_t ref_bytes) :
+        referenced_bytes(ref_bytes) {
+      }
+    };
+    CephContext* cct;
+    std::map<Blob*, BlobInfo> affected_blobs; ///< compressed blobs and their ref_map
+                                         ///< copies that are affected by the
+                                         ///< specific write
+
+    ///< protrusive extents that should be collected if GC takes place
+    interval_set<uint64_t> extents_to_collect;
+
+    boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
+                                                ///< unit when traversing
+                                                ///< protrusive extents.
+                                                ///< Other extents mapped to
+                                                ///< this AU to be ignored
+                                                ///< (except the case where
+                                                ///< uncompressed extent follows
+                                                ///< compressed one - see below).
+    BlobInfo* blob_info_counted = nullptr; ///< set if previous allocation unit
+                                           ///< caused expected_allocations
+                                           ///< counter increment at this blob.
+                                           ///< if uncompressed extent follows
+                                           ///< a decrement for the
+                                           ///< expected_allocations counter
+                                           ///< is needed
+    int64_t expected_allocations = 0;      ///< new alloc units required in case
+                                           ///< of gc fulfilled
+    int64_t expected_for_release = 0;      ///< alloc units currently used by
+                                           ///< compressed blobs that might
+                                           ///< be gone after GC
+
+  protected:
+    /// NOTE(review): presumably walks the protrusive extents between
+    /// start_offset and end_offset that lie outside the touched range
+    /// [start_touch_offset, end_touch_offset) -- confirm in the definition
+    void process_protrusive_extents(const BlueStore::ExtentMap& extent_map, 
+				    uint64_t start_offset,
+				    uint64_t end_offset,
+				    uint64_t start_touch_offset,
+				    uint64_t end_touch_offset,
+				    uint64_t min_alloc_size);
+  };
+
+  struct OnodeSpace;
+  struct OnodeCacheShard;
+  /// an in-memory object
+  ///
+  /// Holds the object id, its kv key, the bluestore_onode_t metadata and
+  /// the object's ExtentMap, together with reference counts and the state
+  /// used to wait for uncommitted transactions.
+  struct Onode {
+    MEMPOOL_CLASS_HELPERS();
+
+    std::atomic_int nref = 0;      ///< reference count
+    std::atomic_int pin_nref = 0;  ///< reference count replica to track pinning
+    Collection *c;
+    ghobject_t oid;
+
+    /// key under PREFIX_OBJ where we are stored
+    mempool::bluestore_cache_meta::string key;
+
+    boost::intrusive::list_member_hook<> lru_item;
+
+    bluestore_onode_t onode;  ///< metadata stored as value in kv store
+    bool exists;              ///< true if object logically exists
+    bool cached;              ///< Onode is logically in the cache
+                              /// (it can be pinned and hence physically out
+                              /// of it at the moment though)
+    ExtentMap extent_map;
+
+    // track txc's that have not been committed to kv store (and whose
+    // effects cannot be read via the kvdb read methods)
+    std::atomic<int> flushing_count = {0};
+    std::atomic<int> waiting_count = {0};
+    /// protect flush_txns
+    ceph::mutex flush_lock = ceph::make_mutex("BlueStore::Onode::flush_lock");
+    ceph::condition_variable flush_cond;   ///< wait here for uncommitted txns
+    std::shared_ptr<int64_t> cache_age_bin;  ///< cache age bin
+
+    /// Construct an onode for object o in collection c, stored under key k.
+    /// exists and cached start out false; the extent map is handed this
+    /// onode and the bluestore_extent_map_inline_shard_prealloc_size config
+    /// value.  c must not be null: it is dereferenced to reach the config.
+    Onode(Collection *c, const ghobject_t& o,
+	  const mempool::bluestore_cache_meta::string& k)
+      : c(c),
+	oid(o),
+	key(k),
+	exists(false),
+        cached(false),
+	extent_map(this,
+	  c->store->cct->_conf->
+	    bluestore_extent_map_inline_shard_prealloc_size) {
+    }
+    /// Same as above, with the key given as a std::string.
+    Onode(Collection* c, const ghobject_t& o,
+      const std::string& k)
+      : c(c),
+        oid(o),
+        key(k),
+        exists(false),
+        cached(false),
+        extent_map(this,
+	  c->store->cct->_conf->
+	    bluestore_extent_map_inline_shard_prealloc_size) {
+    }
+    /// Same as above, with the key given as a C string.
+    Onode(Collection* c, const ghobject_t& o,
+      const char* k)
+      : c(c),
+        oid(o),
+        key(k),
+        exists(false),
+        cached(false),
+        extent_map(this,
+	  c->store->cct->_conf->
+	    bluestore_extent_map_inline_shard_prealloc_size) {
+    }
+    /// Construct an onode that belongs to no collection (c is nullptr);
+    /// oid and key are default-constructed and the config is read from cct.
+    Onode(CephContext* cct)
+      : c(nullptr),
+        exists(false),
+        cached(false),
+        extent_map(this,
+	  cct->_conf->
+	    bluestore_extent_map_inline_shard_prealloc_size) {
+    }
+    /// NOTE(review): presumably decodes the encoded value v into *on using
+    /// the supplied extent decoder -- confirm in the definition
+    static void decode_raw(
+      BlueStore::Onode* on,
+      const bufferlist& v,
+      ExtentMap::ExtentDecoder& dencoder);
+
+    /// NOTE(review): presumably allocates an Onode for (c, oid, key) and
+    /// decodes v into it; the effect of allow_empty is defined by the
+    /// implementation -- confirm there
+    static Onode* create_decode(
+      CollectionRef c,
+      const ghobject_t& oid,
+      const std::string& key,
+      const ceph::buffer::list& v,
+      bool allow_empty = false);
+
+    void dump(ceph::Formatter* f) const;
+
+    void flush();
+    void get();
+    void put();
+
+    /// true if the onode is flagged as being in the cache
+    inline bool is_cached() const {
+      return cached;
+    }
+    /// flag the onode as cached; asserts it was not flagged already
+    inline void set_cached() {
+      ceph_assert(!cached);
+      cached = true;
+    }
+    /// clear the cached flag; asserts it was set
+    inline void clear_cached() {
+      ceph_assert(cached);
+      cached = false;
+    }
+
+    // omap key helpers, parameterized by the onode flags
+    static const std::string& calc_omap_prefix(uint8_t flags);
+    static void calc_omap_header(uint8_t flags, const Onode* o,
+      std::string* out);
+    static void calc_omap_key(uint8_t flags, const Onode* o,
+      const std::string& key, std::string* out);
+    static void calc_omap_tail(uint8_t flags, const Onode* o,
+      std::string* out);
+
+    // convenience wrappers that forward this onode and its onode.flags to
+    // the static calc_omap_* helpers above
+    const std::string& get_omap_prefix() {
+      return calc_omap_prefix(onode.flags);
+    }
+    void get_omap_header(std::string* out) {
+      calc_omap_header(onode.flags, this, out);
+    }
+    void get_omap_key(const std::string& key, std::string* out) {
+      calc_omap_key(onode.flags, this, key, out);
+    }
+    void get_omap_tail(std::string* out) {
+      calc_omap_tail(onode.flags, this, out);
+    }
+
+    void rewrite_omap_key(const std::string& old, std::string *out);
+    void decode_omap_key(const std::string& key, std::string *user_key);
+
+#ifdef HAVE_LIBZBD
+    // Return the offset of an object on disk.  This function is intended *only*
+    // for use with zoned storage devices because in these devices, the objects
+    // are laid out contiguously on disk, which is not the case in general.
+    // Also, it should always be called after calling extent_map.fault_range(),
+    // so that the extent map is loaded.
+    // NOTE(review): begin() is dereferenced without an emptiness check, so
+    // the extent map must hold at least one extent when this is called.
+    int64_t zoned_get_ondisk_starting_offset() const {
+      return extent_map.extent_map.begin()->blob->
+	  get_blob().calc_offset(0, nullptr);
+    }
+#endif
+private:
+    void _decode(const ceph::buffer::list& v);
+  };
+  typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+  /// A generic Cache Shard
+  ///
+  /// Common base of the cache shards: owns the shard lock, the current
+  /// item count (num) and its limit (max), and a circular buffer of
+  /// shared per-age counters (age_bins).
+  struct CacheShard {
+    CephContext *cct;
+    PerfCounters *logger;
+
+    /// protect lru and other structures
+    ceph::recursive_mutex lock = {
+      ceph::make_recursive_mutex("BlueStore::CacheShard::lock") };
+
+    std::atomic<uint64_t> max = {0};
+    std::atomic<uint64_t> num = {0};
+    boost::circular_buffer<std::shared_ptr<int64_t>> age_bins;
+
+    /// start with a single age bin and no logger
+    CacheShard(CephContext* cct) : cct(cct), logger(nullptr), age_bins(1) {
+      shift_bins();
+    }
+    virtual ~CacheShard() {}
+
+    /// set the limit that _trim() trims to
+    void set_max(uint64_t max_) {
+      max.store(max_);
+    }
+
+    /// current value of the item counter
+    uint64_t _get_num() {
+      return num.load();
+    }
+
+    virtual void _trim_to(uint64_t new_size) = 0;
+    /// trim down to max, unless objectstore_blackhole is set
+    void _trim() {
+      // do not trim if we are throwing away IOs a layer down
+      if (!cct->_conf->objectstore_blackhole) {
+        _trim_to(max);
+      }
+    }
+
+    /// locked wrapper around _trim()
+    void trim() {
+      std::lock_guard guard(lock);
+      _trim();
+    }
+    /// trim the shard down to zero under the lock
+    void flush() {
+      std::lock_guard guard(lock);
+      // we should not be shutting down after the blackhole is enabled
+      ceph_assert(!cct->_conf->objectstore_blackhole);
+      _trim_to(0);
+    }
+
+    /// push a new zeroed counter onto the front of age_bins
+    virtual void shift_bins() {
+      std::lock_guard guard(lock);
+      auto fresh_bin = std::make_shared<int64_t>(0);
+      age_bins.push_front(fresh_bin);
+    }
+    /// capacity of age_bins
+    virtual uint32_t get_bin_count() {
+      std::lock_guard guard(lock);
+      const uint32_t bin_capacity = age_bins.capacity();
+      return bin_capacity;
+    }
+    /// change the capacity of age_bins
+    virtual void set_bin_count(uint32_t count) {
+      std::lock_guard guard(lock);
+      age_bins.set_capacity(count);
+    }
+    /// sum the counters of bins [start, end), with end clamped to the
+    /// number of bins currently present; 0 if start is past that number
+    virtual uint64_t sum_bins(uint32_t start, uint32_t end) {
+      std::lock_guard guard(lock);
+      const auto filled = age_bins.size();
+      if (filled < start) {
+        return 0;
+      }
+      if (filled < end) {
+        end = filled;
+      }
+      uint64_t total = 0;
+      for (uint32_t idx = start; idx < end; ++idx) {
+        total += *age_bins[idx];
+      }
+      return total;
+    }
+
+#ifdef DEBUG_CACHE
+    virtual void _audit(const char *s) = 0;
+#else
+    void _audit(const char *s) { /* no-op */ }
+#endif
+  };
+
+  /// A Generic onode Cache Shard
+  ///
+  /// Abstract interface of an onode cache shard; concrete shards are
+  /// obtained through create().
+  struct OnodeCacheShard : public CacheShard {
+    /// fixed-size table of (object id, time point) pairs
+    /// (NOTE(review): presumably a record of recently dumped onodes --
+    /// confirm against its users)
+    std::array<std::pair<ghobject_t, ceph::mono_clock::time_point>, 64> dumped_onodes;
+
+  public:
+    OnodeCacheShard(CephContext* cct) : CacheShard(cct) {}
+    /// factory; type names the implementation to instantiate
+    static OnodeCacheShard *create(CephContext* cct, std::string type,
+                                   PerfCounters *logger);
+
+    //The following methods prefixed with '_' to be called under
+    // Shard's lock
+    virtual void _add(Onode* o, int level) = 0;
+    virtual void _rm(Onode* o) = 0;
+    virtual void _move_pinned(OnodeCacheShard *to, Onode *o) = 0;
+
+    virtual void maybe_unpin(Onode* o) = 0;
+    virtual void add_stats(uint64_t *onodes, uint64_t *pinned_onodes) = 0;
+    /// true when the item counter is zero (reads the atomic counter
+    /// without taking the shard lock)
+    bool empty() {
+      return _get_num() == 0;
+    }
+  };
+
+  /// A Generic buffer Cache Shard
+  ///
+  /// Abstract interface of a buffer cache shard plus the counters shared
+  /// by every implementation (extents, blobs, buffered bytes).
+  struct BufferCacheShard : public CacheShard {
+    std::atomic<uint64_t> num_extents = {0};
+    std::atomic<uint64_t> num_blobs = {0};
+    uint64_t buffer_bytes = 0;
+
+  public:
+    BufferCacheShard(CephContext* cct) : CacheShard(cct) {}
+    /// factory; type names the implementation to instantiate
+    static BufferCacheShard *create(CephContext* cct, std::string type,
+                                    PerfCounters *logger);
+    virtual void _add(Buffer *b, int level, Buffer *near) = 0;
+    virtual void _rm(Buffer *b) = 0;
+    virtual void _move(BufferCacheShard *src, Buffer *b) = 0;
+    virtual void _touch(Buffer *b) = 0;
+    virtual void _adjust_size(Buffer *b, int64_t delta) = 0;
+
+    /// current value of buffer_bytes (no locking done here)
+    uint64_t _get_bytes() {
+      return buffer_bytes;
+    }
+
+    /// bump the extent counter
+    void add_extent() {
+      num_extents++;
+    }
+    /// drop the extent counter by one
+    void rm_extent() {
+      num_extents--;
+    }
+
+    /// bump the blob counter
+    void add_blob() {
+      num_blobs++;
+    }
+    /// drop the blob counter by one
+    void rm_blob() {
+      num_blobs--;
+    }
+
+    virtual void add_stats(uint64_t *extents,
+                           uint64_t *blobs,
+                           uint64_t *buffers,
+                           uint64_t *bytes) = 0;
+
+    /// true when no bytes are buffered; takes the shard lock
+    bool empty() {
+      std::lock_guard guard(lock);
+      const bool no_bytes = (_get_bytes() == 0);
+      return no_bytes;
+    }
+  };
+
+  /// Map from object id to OnodeRef, tied to an onode cache shard.
+  struct OnodeSpace {
+    OnodeCacheShard *cache;
+
+  private:
+    /// forward lookups
+    mempool::bluestore_cache_meta::unordered_map<ghobject_t,OnodeRef> onode_map;
+
+    friend struct Collection; // for split_cache()
+    friend struct Onode; // for put()
+    friend struct LruOnodeCacheShard;
+    void _remove(const ghobject_t& oid);
+  public:
+    OnodeSpace(OnodeCacheShard *c) : cache(c) {}
+    /// calls clear() on destruction
+    ~OnodeSpace() {
+      clear();
+    }
+
+    /// NOTE(review): presumably registers o under oid and returns the
+    /// onode that ends up in the map -- confirm in the definition
+    OnodeRef add_onode(const ghobject_t& oid, OnodeRef& o);
+    OnodeRef lookup(const ghobject_t& o);
+    void rename(OnodeRef& o, const ghobject_t& old_oid,
+		const ghobject_t& new_oid,
+		const mempool::bluestore_cache_meta::string& new_okey);
+    void clear();
+    bool empty();
+
+    /// dump the contents; LogLevelV is a compile-time log level
+    template <int LogLevelV>
+    void dump(CephContext *cct);
+
+    /// return true if f true for any item
+    bool map_any(std::function<bool(Onode*)> f);
+  };
+
+  class OpSequencer;
+  using OpSequencerRef = ceph::ref_t<OpSequencer>;
+
+  /// BlueStore's collection object: ties a collection to its store, op
+  /// sequencer, buffer cache shard, open SharedBlobs and per-collection
+  /// onode map.
+  struct Collection : public CollectionImpl {
+    BlueStore *store;
+    OpSequencerRef osr;
+    BufferCacheShard *cache;       ///< our cache shard
+    bluestore_cnode_t cnode;
+    ceph::shared_mutex lock =
+      ceph::make_shared_mutex("BlueStore::Collection::lock", true, false);
+
+    bool exists;
+
+    SharedBlobSet shared_blob_set;      ///< open SharedBlobs
+
+    // cache onodes on a per-collection basis to avoid lock
+    // contention.
+    OnodeSpace onode_space;
+
+    //pool options
+    pool_opts_t pool_opts;
+    ContextQueue *commit_queue;
+
+    /// the onode cache shard behind onode_space
+    OnodeCacheShard* get_onode_cache() const {
+      OnodeCacheShard* shard = onode_space.cache;
+      return shard;
+    }
+    OnodeRef get_onode(const ghobject_t& oid, bool create, bool is_createop=false);
+
+    // the terminology is confusing here, sorry!
+    //
+    //  blob_t     shared_blob_t
+    //  !shared    unused                -> open
+    //  shared     !loaded               -> open + shared
+    //  shared     loaded                -> open + shared + loaded
+    //
+    // i.e.,
+    //  open = SharedBlob is instantiated
+    //  shared = blob_t shared flag is set; SharedBlob is hashed.
+    //  loaded = SharedBlob::shared_blob_t is loaded from kv store
+    void open_shared_blob(uint64_t sbid, BlobRef b);
+    void load_shared_blob(SharedBlobRef sb);
+    void make_blob_shared(uint64_t sbid, BlobRef b);
+    uint64_t make_blob_unshared(SharedBlob *sb);
+
+    /// allocate a Blob and give it a new SharedBlob bound to this collection
+    BlobRef new_blob() {
+      BlobRef blob(new Blob());
+      blob->shared_blob = new SharedBlob(this);
+      return blob;
+    }
+
+    /// whether oid belongs to this collection: the meta collection holds
+    /// objects whose pool is -1; a PG collection holds objects that its
+    /// pgid contains (using cnode.bits) and whose shard matches; any
+    /// other kind of collection contains nothing
+    bool contains(const ghobject_t& oid) {
+      if (cid.is_meta()) {
+        return oid.hobj.pool == -1;
+      }
+      spg_t pg;
+      if (!cid.is_pg(&pg)) {
+        return false;
+      }
+      if (!pg.pgid.contains(cnode.bits, oid)) {
+        return false;
+      }
+      return oid.shard_id == pg.shard;
+    }
+
+    /// pool id taken from the collection id
+    int64_t pool() const {
+      return cid.pool();
+    }
+
+    void split_cache(Collection *dest);
+
+    bool flush_commit(Context *c) override;
+    void flush() override;
+    void flush_all_but_last();
+
+    Collection(BlueStore *ns, OnodeCacheShard *oc, BufferCacheShard *bc, coll_t c);
+  };
+
+  /// ObjectMap iterator implementation over an onode's omap, built on a
+  /// KeyValueDB iterator.
+  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+
+    PerfCounters* logger = nullptr;
+    CollectionRef c;
+    OnodeRef o;
+    KeyValueDB::Iterator it;
+    // NOTE(review): presumably the kv keys delimiting this onode's omap
+    // range -- confirm in the constructor
+    std::string head, tail;
+
+    std::string _stringify() const;
+  public:
+    OmapIteratorImpl(PerfCounters* l, CollectionRef c, OnodeRef& o, KeyValueDB::Iterator it);
+    virtual ~OmapIteratorImpl();
+    int seek_to_first() override;
+    int upper_bound(const std::string &after) override;
+    int lower_bound(const std::string &to) override;
+    bool valid() override;
+    int next() override;
+    std::string key() override;
+    ceph::buffer::list value() override;
+    /// returns a copy of tail
+    std::string tail_key() override {
+      return tail;
+    }
+
+    /// always reports 0
+    int status() override {
+      return 0;
+    }
+  };
+
+  /// In-memory statfs counters.
+  ///
+  /// The counters live in a flat int64_t array indexed by the STATFS_*
+  /// enumerators, which makes element-wise operations (sum, encode,
+  /// decode) simple loops.  Named accessors expose the individual slots
+  /// and the struct converts to and from store_statfs_t.
+  struct volatile_statfs{
+    enum {
+      STATFS_ALLOCATED = 0,
+      STATFS_STORED,
+      STATFS_COMPRESSED_ORIGINAL,
+      STATFS_COMPRESSED,
+      STATFS_COMPRESSED_ALLOCATED,
+      STATFS_LAST
+    };
+    int64_t values[STATFS_LAST];
+    /// start with every counter at zero
+    volatile_statfs() : values{} {}
+    /// set every counter back to zero
+    void reset() {
+      *this = volatile_statfs();
+    }
+    /// true when every counter is zero
+    bool empty() const {
+      for (size_t i = 0; i < STATFS_LAST; ++i) {
+	if (values[i]) {
+	  return false;
+	}
+      }
+      return true;
+    }
+    /// copy the counters into the matching fields of *buf; other fields
+    /// of *buf are left untouched
+    void publish(store_statfs_t* buf) const {
+      buf->allocated = allocated();
+      buf->data_stored = stored();
+      buf->data_compressed = compressed();
+      buf->data_compressed_original = compressed_original();
+      buf->data_compressed_allocated = compressed_allocated();
+    }
+
+    /// element-wise accumulate
+    volatile_statfs& operator+=(const volatile_statfs& other) {
+      for (size_t i = 0; i < STATFS_LAST; ++i) {
+	values[i] += other.values[i];
+      }
+      return *this;
+    }
+    // mutable accessors for the individual counters
+    int64_t& allocated() {
+      return values[STATFS_ALLOCATED];
+    }
+    int64_t& stored() {
+      return values[STATFS_STORED];
+    }
+    int64_t& compressed_original() {
+      return values[STATFS_COMPRESSED_ORIGINAL];
+    }
+    int64_t& compressed() {
+      return values[STATFS_COMPRESSED];
+    }
+    int64_t& compressed_allocated() {
+      return values[STATFS_COMPRESSED_ALLOCATED];
+    }
+    // read-only accessors for the individual counters
+    int64_t allocated() const {
+      return values[STATFS_ALLOCATED];
+    }
+    int64_t stored() const {
+      return values[STATFS_STORED];
+    }
+    int64_t compressed_original() const {
+      return values[STATFS_COMPRESSED_ORIGINAL];
+    }
+    int64_t compressed() const {
+      return values[STATFS_COMPRESSED];
+    }
+    int64_t compressed_allocated() const {
+      return values[STATFS_COMPRESSED_ALLOCATED];
+    }
+    /// load the counters from the matching fields of st
+    volatile_statfs& operator=(const store_statfs_t& st) {
+      values[STATFS_ALLOCATED] = st.allocated;
+      values[STATFS_STORED] = st.data_stored;
+      values[STATFS_COMPRESSED_ORIGINAL] = st.data_compressed_original;
+      values[STATFS_COMPRESSED] = st.data_compressed;
+      values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
+      return *this;
+    }
+    /// same check as empty(): every slot of values is covered by the loop
+    /// there.  Kept for existing callers and const-qualified so it can be
+    /// used on const objects.
+    bool is_empty() const {
+      return empty();
+    }
+    /// decode the counters from it, in STATFS_* enumerator order
+    void decode(ceph::buffer::list::const_iterator& it) {
+      using ceph::decode;
+      for (size_t i = 0; i < STATFS_LAST; i++) {
+	decode(values[i], it);
+      }
+    }
+
+    /// append the counters to bl, in STATFS_* enumerator order; does not
+    /// modify the object, hence const
+    void encode(ceph::buffer::list& bl) const {
+      using ceph::encode;
+      for (size_t i = 0; i < STATFS_LAST; i++) {
+	encode(values[i], bl);
+      }
+    }
+  };
+
+  /// Per-transaction state: the collection and sequencer it runs on, the
+  /// onodes and shared blobs it touches, its kv transaction, its IO
+  /// context, and the state it is currently in.
+  struct TransContext final : public AioContext {
+    MEMPOOL_CLASS_HELPERS();
+
+    /// states of a transaction; OpSequencer compares them with >=, so the
+    /// declaration order is significant
+    typedef enum {
+      STATE_PREPARE,
+      STATE_AIO_WAIT,
+      STATE_IO_DONE,
+      STATE_KV_QUEUED,     // queued for kv_sync_thread submission
+      STATE_KV_SUBMITTED,  // submitted to kv; not yet synced
+      STATE_KV_DONE,
+      STATE_DEFERRED_QUEUED,    // in deferred_queue (pending or running)
+      STATE_DEFERRED_CLEANUP,   // remove deferred kv record
+      STATE_DEFERRED_DONE,
+      STATE_FINISHING,
+      STATE_DONE,
+    } state_t;
+
+    /// printable name of the current state; "???" for a value that is not
+    /// one of the enumerators
+    const char *get_state_name() {
+      switch (state) {
+      case STATE_PREPARE: return "prepare";
+      case STATE_AIO_WAIT: return "aio_wait";
+      case STATE_IO_DONE: return "io_done";
+      case STATE_KV_QUEUED: return "kv_queued";
+      case STATE_KV_SUBMITTED: return "kv_submitted";
+      case STATE_KV_DONE: return "kv_done";
+      case STATE_DEFERRED_QUEUED: return "deferred_queued";
+      case STATE_DEFERRED_CLEANUP: return "deferred_cleanup";
+      case STATE_DEFERRED_DONE: return "deferred_done";
+      case STATE_FINISHING: return "finishing";
+      case STATE_DONE: return "done";
+      }
+      return "???";
+    }
+
+#if defined(WITH_LTTNG)
+    /// printable name for an l_bluestore_state_*_lat counter id; "???" for
+    /// any other value.  Note the parameter shadows the state member.
+    const char *get_state_latency_name(int state) {
+      switch (state) {
+      case l_bluestore_state_prepare_lat: return "prepare";
+      case l_bluestore_state_aio_wait_lat: return "aio_wait";
+      case l_bluestore_state_io_done_lat: return "io_done";
+      case l_bluestore_state_kv_queued_lat: return "kv_queued";
+      case l_bluestore_state_kv_committing_lat: return "kv_committing";
+      case l_bluestore_state_kv_done_lat: return "kv_done";
+      case l_bluestore_state_deferred_queued_lat: return "deferred_queued";
+      case l_bluestore_state_deferred_cleanup_lat: return "deferred_cleanup";
+      case l_bluestore_state_finishing_lat: return "finishing";
+      case l_bluestore_state_done_lat: return "done";
+      }
+      return "???";
+    }
+#endif
+
+    /// switch to state s; with WITH_BLKIN, also emit the new state's name
+    /// as a trace event when trace is active
+    inline void set_state(state_t s) {
+       state = s;
+#ifdef WITH_BLKIN
+       if (trace) {
+         trace.event(get_state_name());
+       } 
+#endif
+    }
+    inline state_t get_state() {
+      return state;
+    }
+
+    CollectionRef ch;
+    OpSequencerRef osr;  // this should be ch->osr
+    boost::intrusive::list_member_hook<> sequencer_item;
+
+    uint64_t bytes = 0, ios = 0, cost = 0;
+
+    std::set<OnodeRef> onodes;     ///< these need to be updated/written
+    std::set<OnodeRef> modified_objects;  ///< objects we modified (and need a ref)
+
+#ifdef HAVE_LIBZBD
+    // zone refs to add/remove.  each zone ref is a (zone, offset) tuple.  The offset
+    // is the first offset in the zone that the onode touched; subsequent writes
+    // to that zone do not generate additional refs.  This is a bit imprecise but
+    // is sufficient to generate reasonably sequential reads when doing zone
+    // cleaning with less metadata than a ref for every extent.
+    std::map<std::pair<OnodeRef, uint32_t>, uint64_t> new_zone_offset_refs;
+    std::map<std::pair<OnodeRef, uint32_t>, uint64_t> old_zone_offset_refs;
+#endif
+    
+    std::set<SharedBlobRef> shared_blobs;  ///< these need to be updated/written
+    std::set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
+
+    KeyValueDB::Transaction t; ///< then we will commit this
+    std::list<Context*> oncommits;  ///< more commit completions
+    std::list<CollectionRef> removed_collections; ///< colls we removed
+
+    boost::intrusive::list_member_hook<> deferred_queue_item;
+    bluestore_deferred_transaction_t *deferred_txn = nullptr; ///< if any
+
+    interval_set<uint64_t> allocated, released;
+    volatile_statfs statfs_delta;	   ///< overall store statistics delta
+    uint64_t osd_pool_id = META_POOL_ID;    ///< osd pool id we're operating on
+
+    IOContext ioc;
+    bool had_ios = false;  ///< true if we submitted IOs before our kv txn
+
+    uint64_t seq = 0;
+    ceph::mono_clock::time_point start;
+    ceph::mono_clock::time_point last_stamp;
+
+    uint64_t last_nid = 0;     ///< if non-zero, highest new nid we allocated
+    uint64_t last_blobid = 0;  ///< if non-zero, highest new blobid we allocated
+
+#if defined(WITH_LTTNG)
+    bool tracing = false;
+#endif
+
+#ifdef WITH_BLKIN
+    ZTracer::Trace trace;
+#endif
+
+    /// Create a transaction context for collection c on sequencer o.
+    /// start and last_stamp are both set to the current monotonic time.
+    /// If on_commits is non-null its contents are moved into oncommits
+    /// (via swap, so the caller's list receives the previous, empty,
+    /// contents).
+    explicit TransContext(CephContext* cct, Collection *c, OpSequencer *o,
+			  std::list<Context*> *on_commits)
+      : ch(c),
+	osr(o),
+	ioc(cct, this),
+	start(ceph::mono_clock::now()) {
+      last_stamp = start;
+      if (on_commits) {
+	oncommits.swap(*on_commits);
+      }
+    }
+    /// deletes deferred_txn (owned by this context, may be null)
+    ~TransContext() {
+#ifdef WITH_BLKIN
+      if (trace) {
+        trace.event("txc destruct");
+      }
+#endif
+      delete deferred_txn;
+    }
+
+    /// remember that o needs to be updated/written
+    void write_onode(OnodeRef& o) {
+      onodes.insert(o);
+    }
+    /// remember that sb needs to be updated/written
+    void write_shared_blob(SharedBlobRef &sb) {
+      shared_blobs.insert(sb);
+    }
+    /// drop sb from the set of shared blobs to be updated/written
+    void unshare_blob(SharedBlob *sb) {
+      shared_blobs.erase(sb);
+    }
+
+    /// note we logically modified object (when onode itself is unmodified)
+    void note_modified_object(OnodeRef& o) {
+      // onode itself isn't written, though
+      modified_objects.insert(o);
+    }
+    /// note a removed object: keep a ref in modified_objects and make
+    /// sure its onode is no longer queued for writing
+    void note_removed_object(OnodeRef& o) {
+      modified_objects.insert(o);
+      onodes.erase(o);
+    }
+
+#ifdef HAVE_LIBZBD
+    /// record offset as the zone ref of (o, zone), both in the onode and
+    /// in new_zone_offset_refs
+    void note_write_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
+      o->onode.zone_offset_refs[zone] = offset;
+      new_zone_offset_refs[std::make_pair(o, zone)] = offset;
+    }
+    /// record the release of the zone ref of (o, zone) in
+    /// old_zone_offset_refs and erase it from the onode
+    void note_release_zone_offset(OnodeRef& o, uint32_t zone, uint64_t offset) {
+      old_zone_offset_refs[std::make_pair(o, zone)] = offset;
+      o->onode.zone_offset_refs.erase(zone);
+    }
+#endif
+
+    /// AioContext callback: hands this context to store->txc_aio_finish()
+    void aio_finish(BlueStore *store) override {
+      store->txc_aio_finish(this);
+    }
+  private:
+    state_t state = STATE_PREPARE;
+  };
+
+  /// Throttles for BlueStore transactions (bytes from submit to commit and
+  /// from submit to deferred completion) plus, with WITH_LTTNG, the
+  /// rate limiting of trace points.  The single instance is the
+  /// `throttle` member declared at the end of this class definition.
+  class BlueStoreThrottle {
+#if defined(WITH_LTTNG)
+    const std::chrono::time_point<ceph::mono_clock> time_base = ceph::mono_clock::now();
+
+    // Time of last chosen io (microseconds)
+    std::atomic<uint64_t> previous_emitted_tp_time_mono_mcs = {0};
+    std::atomic<uint64_t> ios_started_since_last_traced = {0};
+    std::atomic<uint64_t> ios_completed_since_last_traced = {0};
+
+    std::atomic_uint pending_kv_ios = {0};
+    std::atomic_uint pending_deferred_ios = {0};
+
+    // Min period between trace points (microseconds)
+    std::atomic<uint64_t> trace_period_mcs = {0};
+
+    /// Decide whether the current io should be traced.
+    ///
+    /// With a zero trace period every call returns true, reporting
+    /// *started = 1 and draining the completed counter into *completed.
+    /// Otherwise the started counter is bumped and the call returns true
+    /// only when more than the minimum period has passed since the last
+    /// emitted trace point and this caller wins the compare-exchange on
+    /// the timestamp; both counters are then drained into the outputs.
+    /// When false is returned the outputs are not written.
+    bool should_trace(
+      uint64_t *started,
+      uint64_t *completed) {
+      uint64_t min_period_mcs = trace_period_mcs.load(
+	std::memory_order_relaxed);
+
+      if (min_period_mcs == 0) {
+	*started = 1;
+	*completed = ios_completed_since_last_traced.exchange(0);
+	return true;
+      } else {
+	ios_started_since_last_traced++;
+	auto now_mcs = ceph::to_microseconds<uint64_t>(
+	  ceph::mono_clock::now() - time_base);
+	uint64_t previous_mcs = previous_emitted_tp_time_mono_mcs;
+	uint64_t period_mcs = now_mcs - previous_mcs;
+	if (period_mcs > min_period_mcs) {
+	  if (previous_emitted_tp_time_mono_mcs.compare_exchange_strong(
+		previous_mcs, now_mcs)) {
+	    // This would be racy at a sufficiently extreme trace rate, but isn't
+	    // worth the overhead of doing it more carefully.
+	    *started = ios_started_since_last_traced.exchange(0);
+	    *completed = ios_completed_since_last_traced.exchange(0);
+	    return true;
+	  }
+	}
+	return false;
+      }
+    }
+#endif
+
+    // emit_initial_tracepoint is a no-op when built without LTTng
+#if defined(WITH_LTTNG)
+    void emit_initial_tracepoint(
+      KeyValueDB &db,
+      TransContext &txc,
+      ceph::mono_clock::time_point);
+#else
+    void emit_initial_tracepoint(
+      KeyValueDB &db,
+      TransContext &txc,
+      ceph::mono_clock::time_point) {}
+#endif
+
+    Throttle throttle_bytes;           ///< submit to commit
+    Throttle throttle_deferred_bytes;  ///< submit to deferred complete
+
+  public:
+    /// both throttles are created with a limit of 0 and then sized from
+    /// the configuration by reset_throttle()
+    BlueStoreThrottle(CephContext *cct) :
+      throttle_bytes(cct, "bluestore_throttle_bytes", 0),
+      throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
+    {
+      reset_throttle(cct->_conf);
+    }
+
+    // complete_kv / complete are no-ops when built without LTTng
+#if defined(WITH_LTTNG)
+    void complete_kv(TransContext &txc);
+    void complete(TransContext &txc);
+#else
+    void complete_kv(TransContext &txc) {}
+    void complete(TransContext &txc) {}
+#endif
+
+    ceph::mono_clock::duration log_state_latency(
+      TransContext &txc, PerfCounters *logger, int state);
+    bool try_start_transaction(
+      KeyValueDB &db,
+      TransContext &txc,
+      ceph::mono_clock::time_point);
+    void finish_start_transaction(
+      KeyValueDB &db,
+      TransContext &txc,
+      ceph::mono_clock::time_point);
+    /// give cost back to the submit-to-commit throttle
+    void release_kv_throttle(uint64_t cost) {
+      throttle_bytes.put(cost);
+    }
+    /// give cost back to the submit-to-deferred-complete throttle
+    void release_deferred_throttle(uint64_t cost) {
+      throttle_deferred_bytes.put(cost);
+    }
+    /// true once the deferred throttle reports past_midpoint()
+    bool should_submit_deferred() {
+      return throttle_deferred_bytes.past_midpoint();
+    }
+    /// Re-read the limits from conf: the kv throttle gets
+    /// bluestore_throttle_bytes, the deferred throttle gets the sum of
+    /// bluestore_throttle_bytes and bluestore_throttle_deferred_bytes.
+    /// With WITH_LTTNG the trace period becomes floor(1e6 / rate)
+    /// microseconds for a positive bluestore_throttle_trace_rate, else 0.
+    void reset_throttle(const ConfigProxy &conf) {
+      throttle_bytes.reset_max(conf->bluestore_throttle_bytes);
+      throttle_deferred_bytes.reset_max(
+	conf->bluestore_throttle_bytes +
+	conf->bluestore_throttle_deferred_bytes);
+#if defined(WITH_LTTNG)
+      double rate = conf.get_val<double>("bluestore_throttle_trace_rate");
+      trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0;
+#endif
+    }
+  } throttle;
+
+  typedef boost::intrusive::list<
+    TransContext,
+    boost::intrusive::member_hook<
+      TransContext,
+      boost::intrusive::list_member_hook<>,
+      &TransContext::deferred_queue_item> > deferred_queue_t;
+
+  /// A batch of deferred ios belonging to one OpSequencer, together with
+  /// the transaction contexts they came from and the IOContext used to
+  /// issue them.
+  struct DeferredBatch final : public AioContext {
+    OpSequencer *osr;
+    struct deferred_io {
+      ceph::buffer::list bl;    ///< data
+      uint64_t seq;     ///< deferred transaction seq
+    };
+    std::map<uint64_t,deferred_io> iomap; ///< map of ios in this batch
+    deferred_queue_t txcs;           ///< txcs in this batch
+    IOContext ioc;                   ///< our aios
+    /// bytes of pending io for each deferred seq (may be 0)
+    std::map<uint64_t,int> seq_bytes;
+
+    void _discard(CephContext *cct, uint64_t offset, uint64_t length);
+    void _audit(CephContext *cct);
+
+    /// bind the batch to its sequencer; ioc reports completion back to
+    /// this object
+    DeferredBatch(CephContext *cct, OpSequencer *osr)
+      : osr(osr), ioc(cct, this) {}
+
+    /// prepare a write
+    ///
+    /// @param seq     deferred transaction seq the write belongs to
+    /// @param offset  offset of the write
+    /// @param length  length of the write
+    /// @param p       iterator positioned at the data
+    ///                (NOTE(review): presumably advanced by length bytes --
+    ///                confirm in the definition)
+    void prepare_write(CephContext *cct,
+		       uint64_t seq, uint64_t offset, uint64_t length,
+		       ceph::buffer::list::const_iterator& p);
+
+    /// AioContext callback: notifies the store that this sequencer's
+    /// deferred aio finished
+    void aio_finish(BlueStore *store) override {
+      store->_deferred_aio_finish(osr);
+    }
+  };
+
+  class OpSequencer : public RefCountedObject {  ///< ordered queue of TransContexts plus deferred-io state for one coll_t (cid)
+  public:
+    ceph::mutex qlock = ceph::make_mutex("BlueStore::OpSequencer::qlock");  ///< taken by every method below that touches q
+    ceph::condition_variable qcond;  ///< waited on (with qlock) by drain*/flush*
+    typedef boost::intrusive::list<
+      TransContext,
+      boost::intrusive::member_hook<
+        TransContext,
+	boost::intrusive::list_member_hook<>,
+	&TransContext::sequencer_item> > q_list_t;
+    q_list_t q;  ///< transactions
+
+    boost::intrusive::list_member_hook<> deferred_osr_queue_item;  ///< hook used by BlueStore::deferred_osr_queue_t
+
+    DeferredBatch *deferred_running = nullptr;
+    DeferredBatch *deferred_pending = nullptr;
+
+    ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::OpSequencer::deferred_lock");
+
+    BlueStore *store;
+    coll_t cid;
+
+    uint64_t last_seq = 0;  ///< last seq handed out by queue_new()
+
+    std::atomic_int txc_with_unstable_io = {0};  ///< num txcs with unstable io
+
+    std::atomic_int kv_committing_serially = {0};
+
+    std::atomic_int kv_submitted_waiters = {0};  ///< raised by flush()/flush_all_but_last() around their check and wait
+
+    std::atomic_bool zombie = {false};    ///< in zombie_osr_set (collection going away)
+
+    const uint32_t sequencer_id;
+
+    uint32_t get_sequencer_id() const {
+      return sequencer_id;
+    }
+
+    void queue_new(TransContext *txc) {  // under qlock: give txc the next seq and append it to q
+      std::lock_guard l(qlock);
+      txc->seq = ++last_seq;
+      q.push_back(*txc);
+    }
+
+    void drain() {  // block on qcond until q is empty
+      std::unique_lock l(qlock);
+      while (!q.empty())
+	qcond.wait(l);
+    }
+
+    void drain_preceding(TransContext *txc) {  // block on qcond until txc is the front of q
+      std::unique_lock l(qlock);
+      while (&q.front() != txc)
+	qcond.wait(l);
+    }
+
+    bool _is_all_kv_submitted() {  // true when the newest txc in q has reached STATE_KV_SUBMITTED
+      // caller must hold qlock, and q must not be empty
+      ceph_assert(!q.empty());
+      TransContext *txc = &q.back();
+      if (txc->get_state() >= TransContext::STATE_KV_SUBMITTED) {
+	return true;
+      }
+      return false;
+    }
+
+    void flush() {  // return once q is empty or its newest txc has reached STATE_KV_SUBMITTED
+      std::unique_lock l(qlock);
+      while (true) {
+	// set flag before the check because the condition
+	// may become true outside qlock, and we need to make
+	// sure those threads see waiters and signal qcond.
+	++kv_submitted_waiters;
+	if (q.empty() || _is_all_kv_submitted()) {
+	  --kv_submitted_waiters;
+	  return;
+	}
+	qcond.wait(l);
+	--kv_submitted_waiters;
+      }
+    }
+
+    void flush_all_but_last() {  // like flush(), but judged on the second-newest txc; q must be non-empty on entry
+      std::unique_lock l(qlock);
+      ceph_assert (q.size() >= 1);
+      while (true) {
+	// set flag before the check because the condition
+	// may become true outside qlock, and we need to make
+	// sure those threads see waiters and signal qcond.
+	++kv_submitted_waiters;
+	if (q.size() <= 1) {
+	  --kv_submitted_waiters;
+	  return;
+	} else {
+	  auto it = q.rbegin();
+	  it++;  // step from the newest txc to the one before it
+	  if (it->get_state() >= TransContext::STATE_KV_SUBMITTED) {
+	    --kv_submitted_waiters;
+	    return;
+          }
+	}
+	qcond.wait(l);
+	--kv_submitted_waiters;
+      }
+      }
+
+    bool flush_commit(Context *c) {  // true if nothing is pending; otherwise c is appended to the newest txc's oncommits and false is returned
+      std::lock_guard l(qlock);
+      if (q.empty()) {
+	return true;
+      }
+      TransContext *txc = &q.back();
+      if (txc->get_state() >= TransContext::STATE_KV_DONE) {
+	return true;
+      }
+      txc->oncommits.push_back(c);
+      return false;
+    }
+  private:
+    FRIEND_MAKE_REF(OpSequencer);
+    OpSequencer(BlueStore *store, uint32_t sequencer_id, const coll_t& c)
+      : RefCountedObject(store->cct),
+	store(store), cid(c), sequencer_id(sequencer_id) {
+    }
+    ~OpSequencer() {  // a sequencer must be empty when destroyed
+      ceph_assert(q.empty());
+    }
+  };
+
+  /// Intrusive list of OpSequencers linked through
+  /// OpSequencer::deferred_osr_queue_item.
+  using deferred_osr_queue_t = boost::intrusive::list<
+    OpSequencer,
+    boost::intrusive::member_hook<
+      OpSequencer,
+      boost::intrusive::list_member_hook<>,
+      &OpSequencer::deferred_osr_queue_item> >;
+
+  /// Thread whose body is BlueStore::_kv_sync_thread().
+  struct KVSyncThread : public Thread {
+    BlueStore *store;
+    explicit KVSyncThread(BlueStore *bs) : store(bs) {}
+    void *entry() override {
+      store->_kv_sync_thread();
+      return nullptr;
+    }
+  };
+  /// Thread whose body is BlueStore::_kv_finalize_thread().
+  struct KVFinalizeThread : public Thread {
+    BlueStore *store;
+    explicit KVFinalizeThread(BlueStore *bs) : store(bs) {}
+    void *entry() override {
+      store->_kv_finalize_thread();
+      return nullptr;
+    }
+  };
+
+#ifdef HAVE_LIBZBD
+  /// Thread whose body is BlueStore::_zoned_cleaner_thread().
+  struct ZonedCleanerThread : public Thread {
+    BlueStore *store;
+    explicit ZonedCleanerThread(BlueStore *bs) : store(bs) {}
+    void *entry() override {
+      store->_zoned_cleaner_thread();
+      return nullptr;
+    }
+  };
+#endif
+  
+  /// Working state for one candidate big deferred write; can_defer() and
+  /// apply_defer() are defined out of line.
+  struct BigDeferredWriteContext {
+    uint64_t off = 0;        ///< original logical offset
+    uint32_t b_off = 0;      ///< blob relative offset
+    uint32_t used = 0;
+    uint64_t head_read = 0;
+    uint64_t tail_read = 0;
+    BlobRef blob_ref;
+    uint64_t blob_start = 0;
+    PExtentVector res_extents;
+
+    /// used plus the head and tail read lengths
+    uint64_t blob_aligned_len() const {
+      return used + head_read + tail_read;
+    }
+
+    bool can_defer(
+      BlueStore::extent_map_t::iterator ep,
+      uint64_t prefer_deferred_size,
+      uint64_t block_size,
+      uint64_t offset,
+      uint64_t l);
+    bool apply_defer();
+  };
+
+  // --------------------------------------------------------
+  // members
+private:
+  BlueFS *bluefs = nullptr;
+  bluefs_layout_t bluefs_layout;
+  utime_t next_dump_on_bluefs_alloc_failure;
+
+  KeyValueDB *db = nullptr;
+  BlockDevice *bdev = nullptr;
+  std::string freelist_type;
+  FreelistManager *fm = nullptr;
+
+  Allocator *alloc = nullptr;   ///< allocator consumed by BlueStore
+  bluefs_shared_alloc_context_t shared_alloc; ///< consumed by BlueFS (may be == alloc)
+
+  uuid_d fsid;
+  int path_fd = -1;  ///< open handle to $path
+  int fsid_fd = -1;  ///< open handle (locked) to $path/fsid
+  bool mounted = false;
+
+  // store open_db options:
+  bool db_was_opened_read_only = true;
+  bool need_to_destage_allocation_file = false;
+
+  /// rwlock to protect coll_map/new_coll_map
+  ceph::shared_mutex coll_lock = ceph::make_shared_mutex("BlueStore::coll_lock");
+  mempool::bluestore_cache_other::unordered_map<coll_t, CollectionRef> coll_map;
+  bool collections_had_errors = false;
+  std::map<coll_t,CollectionRef> new_coll_map;
+
+  mempool::bluestore_cache_buffer::vector<BufferCacheShard*> buffer_cache_shards;  ///< walked by DataCache and dump_cache_stats()
+  mempool::bluestore_cache_onode::vector<OnodeCacheShard*> onode_cache_shards;  ///< walked by MetaCache and dump_cache_stats()
+
+  /// protects zombie_osr_set
+  ceph::mutex zombie_osr_lock = ceph::make_mutex("BlueStore::zombie_osr_lock");
+  uint32_t next_sequencer_id = 0;
+  std::map<coll_t,OpSequencerRef> zombie_osr_set; ///< set of OpSequencers for deleted collections
+
+  std::atomic<uint64_t> nid_last = {0};
+  std::atomic<uint64_t> nid_max = {0};
+  std::atomic<uint64_t> blobid_last = {0};
+  std::atomic<uint64_t> blobid_max = {0};
+
+  ceph::mutex deferred_lock = ceph::make_mutex("BlueStore::deferred_lock");  ///< held by get_deferred_last_submitted()
+  ceph::mutex atomic_alloc_and_submit_lock =
+      ceph::make_mutex("BlueStore::atomic_alloc_and_submit_lock");
+  std::atomic<uint64_t> deferred_seq = {0};
+  deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
+  std::atomic_int deferred_queue_size = {0};         ///< num txc's queued across all osrs
+  std::atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
+  Finisher  finisher;
+  utime_t  deferred_last_submitted = utime_t();  ///< read under deferred_lock
+
+  KVSyncThread kv_sync_thread;  ///< runs _kv_sync_thread()
+  ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
+  ceph::condition_variable kv_cond;
+  bool _kv_only = false;
+  bool kv_sync_started = false;
+  bool kv_stop = false;
+  bool kv_finalize_started = false;
+  bool kv_finalize_stop = false;
+  std::deque<TransContext*> kv_queue;             ///< ready, already submitted
+  std::deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
+  std::deque<TransContext*> kv_committing;        ///< currently syncing
+  std::deque<DeferredBatch*> deferred_done_queue;   ///< deferred ios done
+  bool kv_sync_in_progress = false;
+
+  KVFinalizeThread kv_finalize_thread;  ///< runs _kv_finalize_thread()
+  ceph::mutex kv_finalize_lock = ceph::make_mutex("BlueStore::kv_finalize_lock");
+  ceph::condition_variable kv_finalize_cond;
+  std::deque<TransContext*> kv_committing_to_finalize;   ///< pending finalization
+  std::deque<DeferredBatch*> deferred_stable_to_finalize; ///< pending finalization
+  bool kv_finalize_in_progress = false;
+
+#ifdef HAVE_LIBZBD
+  ZonedCleanerThread zoned_cleaner_thread;  ///< runs _zoned_cleaner_thread()
+  ceph::mutex zoned_cleaner_lock = ceph::make_mutex("BlueStore::zoned_cleaner_lock");
+  ceph::condition_variable zoned_cleaner_cond;
+  bool zoned_cleaner_started = false;
+  bool zoned_cleaner_stop = false;
+  std::deque<uint64_t> zoned_cleaner_queue;  // NOTE(review): presumably zone numbers awaiting cleaning — confirm
+#endif
+
+  PerfCounters *logger = nullptr;  ///< dumped by dump_perf_counters()
+
+  std::list<CollectionRef> removed_collections;
+
+  ceph::shared_mutex debug_read_error_lock =
+    ceph::make_shared_mutex("BlueStore::debug_read_error_lock");
+  std::set<ghobject_t> debug_data_error_objects;
+  std::set<ghobject_t> debug_mdata_error_objects;
+
+  std::atomic<int> csum_type = {Checksummer::CSUM_CRC32C};  ///< checksum type; starts out as CRC32C
+
+  uint64_t block_size = 0;     ///< block size of block device (power of 2)
+  uint64_t block_mask = 0;     ///< mask to get just the block offset
+  size_t block_size_order = 0; ///< bits to shift to get block size
+  uint64_t optimal_io_size = 0;///< best performance io size for block device
+
+  // min_alloc_size and min_alloc_size_mask are zero-initialized like their
+  // sibling members so that get_min_alloc_size() never reads an indeterminate
+  // value; a constructor mem-initializer still takes precedence over these
+  // defaults.
+  uint64_t min_alloc_size = 0; ///< minimum allocation unit (power of 2); 0 until assigned
+  uint8_t  min_alloc_size_order = 0;///< bits to shift to get min_alloc_size
+  uint64_t min_alloc_size_mask = 0;///< mask for fast checking of allocation alignment; 0 until assigned
+  static_assert(std::numeric_limits<uint8_t>::max() >
+		std::numeric_limits<decltype(min_alloc_size)>::digits,
+		"not enough bits for min_alloc_size");
+
+  // smr-only
+  uint64_t zone_size = 0;              ///< size of an SMR zone (NOTE(review): was documented as "number of SMR zones", which contradicts the name — confirm)
+  uint64_t first_sequential_zone = 0;  ///< first SMR zone that is sequential-only
+
+  enum {
+    // Please preserve the order since it's DB persistent
+    OMAP_BULK = 0,
+    OMAP_PER_POOL = 1,
+    OMAP_PER_PG = 2,
+    } per_pool_omap = OMAP_BULK;  ///< current omap layout; starts as OMAP_BULK
+
+  /// maximum allocation unit (power of 2)
+  std::atomic<uint64_t> max_alloc_size = {0};
+
+  /// number threshold for forced deferred writes
+  std::atomic<int> deferred_batch_ops = {0};
+
+  /// size threshold for forced deferred writes
+  std::atomic<uint64_t> prefer_deferred_size = {0};
+
+  /// approx cost per io, in bytes
+  std::atomic<uint64_t> throttle_cost_per_io = {0};
+
+  std::atomic<Compressor::CompressionMode> comp_mode =
+    {Compressor::COMP_NONE}; ///< compression mode
+  CompressorRef compressor;
+  std::atomic<uint64_t> comp_min_blob_size = {0};
+  std::atomic<uint64_t> comp_max_blob_size = {0};
+
+  std::atomic<uint64_t> max_blob_size = {0};  ///< maximum blob size
+
+  uint64_t kv_ios = 0;
+  uint64_t kv_throttle_costs = 0;
+
+  // cache trim control
+  uint64_t cache_size = 0;       ///< total cache size
+  double cache_meta_ratio = 0;   ///< cache ratio dedicated to metadata
+  double cache_kv_ratio = 0;     ///< cache ratio dedicated to kv (e.g., rocksdb)
+  double cache_kv_onode_ratio = 0; ///< cache ratio dedicated to kv onodes (e.g., rocksdb onode CF)
+  double cache_data_ratio = 0;   ///< cache ratio dedicated to object data
+  bool cache_autotune = false;   ///< cache autotune setting
+  double cache_age_bin_interval = 0; ///< time to wait between cache age bin rotations
+  double cache_autotune_interval = 0; ///< time to wait between cache rebalancing
+  std::vector<uint64_t> kv_bins; ///< kv autotune bins
+  std::vector<uint64_t> kv_onode_bins; ///< kv onode autotune bins
+  std::vector<uint64_t> meta_bins; ///< meta autotune bins
+  std::vector<uint64_t> data_bins; ///< data autotune bins
+  uint64_t osd_memory_target = 0;   ///< OSD memory target when autotuning cache
+  uint64_t osd_memory_base = 0;     ///< OSD base memory when autotuning cache
+  double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
+  uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
+  double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing 
+  double max_defer_interval = 0; ///< Time to wait between last deferred submit
+  std::atomic<uint32_t> config_changed = {0}; ///< Counter to determine if there is a configuration change.
+
+  typedef std::map<uint64_t, volatile_statfs> osd_pools_map;  // NOTE(review): key is presumably the pool id — confirm
+
+  ceph::mutex vstatfs_lock = ceph::make_mutex("BlueStore::vstatfs_lock");
+  volatile_statfs vstatfs;
+  osd_pools_map osd_pools; // protected by vstatfs_lock as well
+
+  bool per_pool_stat_collection = true;
+
+  /// Thread (created as "bstore_mempool") that holds the PriorityCache
+  /// adapters for the onode cache shards (MetaCache) and the buffer cache
+  /// shards (DataCache). entry() is defined out of line.
+  struct MempoolThread : public Thread {
+  public:
+    BlueStore *store;
+
+    ceph::condition_variable cond;
+    ceph::mutex lock = ceph::make_mutex("BlueStore::MempoolThread::lock");
+    bool stop = false;
+    std::shared_ptr<PriorityCache::PriCache> binned_kv_cache;
+    std::shared_ptr<PriorityCache::PriCache> binned_kv_onode_cache;
+    std::shared_ptr<PriorityCache::Manager> pcm;
+
+    /// PriCache bookkeeping shared by MetaCache and DataCache: per-priority
+    /// byte assignments, per-priority bin boundaries, committed size and ratio.
+    struct MempoolCache : public PriorityCache::PriCache {
+      BlueStore *store;
+      uint64_t bins[PriorityCache::Priority::LAST+1] = {};
+      int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {};
+      int64_t committed_bytes = 0;
+      double cache_ratio = 0;
+
+      MempoolCache(BlueStore *s) : store(s) {}
+
+      virtual uint64_t _get_used_bytes() const = 0;
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const = 0;
+
+      /// Bytes wanted at `pri` beyond what is already assigned to it (never
+      /// negative); -EOPNOTSUPP for PRI0.
+      virtual int64_t request_cache_bytes(
+          PriorityCache::Priority pri, uint64_t total_cache) const {
+        const int64_t assigned = get_cache_bytes(pri);
+
+        if (pri == PriorityCache::Priority::PRI0) {
+          // BlueStore caches currently don't put anything in PRI0
+          return -EOPNOTSUPP;
+        }
+
+        int64_t request = 0;
+        if (pri == PriorityCache::Priority::LAST) {
+          // used bytes that are not covered by any bin
+          const uint32_t bin_count = get_bin_count();
+          request = _get_used_bytes() - _sum_bins(0, bin_count);
+        } else {
+          ceph_assert(pri > 0 && pri < PriorityCache::Priority::LAST);
+          // bins between the previous priority's boundary and this one's
+          const auto prev_pri = static_cast<PriorityCache::Priority>(pri - 1);
+          const uint64_t start = get_bins(prev_pri);
+          const uint64_t end = get_bins(pri);
+          request = _sum_bins(start, end);
+        }
+        return (request > assigned) ? request - assigned : 0;
+      }
+
+      virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
+        return cache_bytes[pri];
+      }
+      /// Sum of the assignments over every priority, PRI0 through LAST.
+      virtual int64_t get_cache_bytes() const {
+        int64_t sum = 0;
+        for (int idx = 0; idx <= PriorityCache::Priority::LAST; ++idx) {
+          sum += get_cache_bytes(static_cast<PriorityCache::Priority>(idx));
+        }
+        return sum;
+      }
+      virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+        cache_bytes[pri] = bytes;
+      }
+      virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
+        cache_bytes[pri] += bytes;
+      }
+      virtual int64_t commit_cache_size(uint64_t total_cache) {
+        const int64_t assigned_total = get_cache_bytes();
+        committed_bytes = PriorityCache::get_chunk(assigned_total, total_cache);
+        return committed_bytes;
+      }
+      virtual int64_t get_committed_size() const {
+        return committed_bytes;
+      }
+      /// Bin boundary for `pri`; 0 for PRI0, LAST and anything outside them.
+      virtual uint64_t get_bins(PriorityCache::Priority pri) const {
+        const bool binned = pri > PriorityCache::Priority::PRI0 &&
+                            pri < PriorityCache::Priority::LAST;
+        return binned ? bins[pri] : 0;
+      }
+      /// Store the bin boundary for `pri` (ignored for PRI0/LAST), then push
+      /// the largest boundary over all binned priorities to set_bin_count().
+      virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
+        if (!(pri > PriorityCache::Priority::PRI0 &&
+              pri < PriorityCache::Priority::LAST)) {
+          return;
+        }
+        bins[pri] = end_bin;
+        uint64_t highest = 0;
+        for (int p = 1; p < PriorityCache::Priority::LAST; ++p) {
+          highest = (bins[p] > highest) ? bins[p] : highest;
+        }
+        set_bin_count(highest);
+      }
+      /// Load boundaries from bins_v (element i feeds priority i+1; missing
+      /// elements become 0), then push the largest one to set_bin_count().
+      virtual void import_bins(const std::vector<uint64_t> &bins_v) {
+        uint64_t highest = 0;
+        for (int p = 1; p < PriorityCache::Priority::LAST; ++p) {
+          const unsigned slot = (unsigned) p - 1;
+          bins[p] = (slot < bins_v.size()) ? bins_v[slot] : 0;
+          if (bins[p] > highest) {
+            highest = bins[p];
+          }
+        }
+        set_bin_count(highest);
+      }
+      virtual double get_cache_ratio() const {
+        return cache_ratio;
+      }
+      virtual void set_cache_ratio(double ratio) {
+        cache_ratio = ratio;
+      }
+      virtual std::string get_cache_name() const = 0;
+      virtual uint32_t get_bin_count() const = 0;
+      virtual void set_bin_count(uint32_t count) = 0;
+    };
+
+    /// MempoolCache backed by store->onode_cache_shards.
+    struct MetaCache : public MempoolCache {
+      MetaCache(BlueStore *s) : MempoolCache(s) {}
+
+      virtual uint32_t get_bin_count() const {
+        // set_bin_count() gives every shard the same count, so read shard 0
+        return store->onode_cache_shards[0]->get_bin_count();
+      }
+      virtual void set_bin_count(uint32_t count) {
+        for (auto *shard : store->onode_cache_shards) {
+          shard->set_bin_count(count);
+        }
+      }
+      /// Total bytes allocated across the bluestore metadata-related mempools.
+      virtual uint64_t _get_used_bytes() const {
+        const auto used =
+          mempool::bluestore_blob::allocated_bytes() +
+          mempool::bluestore_extent::allocated_bytes() +
+          mempool::bluestore_cache_buffer::allocated_bytes() +
+          mempool::bluestore_cache_meta::allocated_bytes() +
+          mempool::bluestore_cache_other::allocated_bytes() +
+          mempool::bluestore_cache_onode::allocated_bytes() +
+          mempool::bluestore_shared_blob::allocated_bytes() +
+          mempool::bluestore_inline_bl::allocated_bytes();
+        return used;
+      }
+      virtual void shift_bins() {
+        for (auto *shard : store->onode_cache_shards) {
+          shard->shift_bins();
+        }
+      }
+      /// Onodes counted in bins [start, end) over all shards, scaled by the
+      /// average bytes per onode.
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
+        uint64_t onodes = 0;
+        for (auto *shard : store->onode_cache_shards) {
+          onodes += shard->sum_bins(start, end);
+        }
+        return onodes*get_bytes_per_onode();
+      }
+      virtual std::string get_cache_name() const {
+        return "BlueStore Meta Cache";
+      }
+      /// Allocated onode items, clamped to a minimum of 2.
+      uint64_t _get_num_onodes() const {
+        const uint64_t onode_num =
+            mempool::bluestore_cache_onode::allocated_items();
+        return (onode_num < 2) ? 2 : onode_num;
+      }
+      double get_bytes_per_onode() const {
+        const double used = static_cast<double>(_get_used_bytes());
+        return used / static_cast<double>(_get_num_onodes());
+      }
+    };
+    std::shared_ptr<MetaCache> meta_cache;
+
+    /// MempoolCache backed by store->buffer_cache_shards.
+    struct DataCache : public MempoolCache {
+      DataCache(BlueStore *s) : MempoolCache(s) {}
+
+      virtual uint32_t get_bin_count() const {
+        // set_bin_count() gives every shard the same count, so read shard 0
+        return store->buffer_cache_shards[0]->get_bin_count();
+      }
+      virtual void set_bin_count(uint32_t count) {
+        for (auto *shard : store->buffer_cache_shards) {
+          shard->set_bin_count(count);
+        }
+      }
+      virtual uint64_t _get_used_bytes() const {
+        uint64_t total = 0;
+        for (auto *shard : store->buffer_cache_shards) {
+          total += shard->_get_bytes();
+        }
+        return total;
+      }
+      virtual void shift_bins() {
+        for (auto *shard : store->buffer_cache_shards) {
+          shard->shift_bins();
+        }
+      }
+      virtual uint64_t _sum_bins(uint32_t start, uint32_t end) const {
+        uint64_t total = 0;
+        for (auto *shard : store->buffer_cache_shards) {
+          total += shard->sum_bins(start, end);
+        }
+        return total;
+      }
+      virtual std::string get_cache_name() const {
+        return "BlueStore Data Cache";
+      }
+    };
+    std::shared_ptr<DataCache> data_cache;
+
+  public:
+    explicit MempoolThread(BlueStore *s)
+      : store(s),
+        meta_cache(new MetaCache(s)),
+        data_cache(new DataCache(s)) {}
+
+    void *entry() override;
+    /// Start the thread; it must not have been asked to stop.
+    void init() {
+      ceph_assert(stop == false);
+      create("bstore_mempool");
+    }
+    /// Raise `stop` and wake any waiter under `lock`, then join the thread.
+    void shutdown() {
+      {
+        std::lock_guard guard(lock);
+        stop = true;
+        cond.notify_all();
+      }
+      join();
+    }
+
+  private:
+    void _update_cache_settings();
+    void _resize_shards(bool interval_stats);
+  } mempool_thread;
+
+#ifdef WITH_BLKIN
+  ZTracer::Endpoint trace_endpoint {"0.0.0.0", 0, "BlueStore"};  ///< only present in WITH_BLKIN builds
+#endif
+
+  // --------------------------------------------------------
+  // private methods (declared here, defined out of line)
+
+  void _init_logger();
+  void _shutdown_logger();
+  int _reload_logger();
+
+  int _open_path();
+  void _close_path();
+  int _open_fsid(bool create);
+  int _lock_fsid();
+  int _read_fsid(uuid_d *f);
+  int _write_fsid();
+  void _close_fsid();
+  void _set_alloc_sizes();
+  void _set_blob_size();
+  void _set_finisher_num();
+  void _set_per_pool_omap();
+  void _update_osd_memory_options();
+
+  int _open_bdev(bool create);
+  // Verifies that disk space is sufficient for reserved + min bluefs
+  // and alters the latter if needed.
+  // Depends on min_alloc_size, hence should be called after
+  // its initialization (and outside of _open_bdev).
+  void _validate_bdev();
+  void _close_bdev();
+
+  int _minimal_open_bluefs(bool create);
+  void _minimal_close_bluefs();
+  int _open_bluefs(bool create, bool read_only);
+  void _close_bluefs();
+
+  int _is_bluefs(bool create, bool* ret);  // NOTE(review): the answer is presumably written to *ret and the int is an error code — confirm
+  /*
+  * opens the DB and the dependent super_meta, FreelistManager and allocator
+  * in the proper order
+  */
+  int _open_db_and_around(bool read_only, bool to_repair = false);
+  void _close_db_and_around();
+  void _close_around_db();
+
+  int _prepare_db_environment(bool create, bool read_only,
+			      std::string* kv_dir, std::string* kv_backend);
+
+  /*
+   * @warning to_repair_db means that we open this db in order to repair it,
+   * and will not hold rocksdb's file lock.
+   */
+  int _open_db(bool create,
+	       bool to_repair_db=false,
+	       bool read_only = false);
+  void _close_db();
+  int _open_fm(KeyValueDB::Transaction t,
+               bool read_only,
+               bool db_avail,
+               bool fm_restore = false);
+  void _close_fm();
+  int _write_out_fm_meta(uint64_t target_size);
+  int _create_alloc();
+  int _init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments);
+  void _post_init_alloc(const std::map<uint64_t, uint64_t>& zone_adjustments);
+  void _close_alloc();
+  int _open_collections();
+  void _fsck_collections(int64_t* errors);
+  void _close_collections();
+
+  int _setup_block_symlink_or_file(std::string name, std::string path, uint64_t size,
+				   bool create);
+
+public:
+  /// Copy of deferred_last_submitted, read while holding deferred_lock.
+  utime_t get_deferred_last_submitted() {
+    std::lock_guard guard(deferred_lock);
+    const utime_t snapshot = deferred_last_submitted;
+    return snapshot;
+  }
+
+  static int _write_bdev_label(CephContext* cct,
+			       const std::string &path, bluestore_bdev_label_t label);
+  static int _read_bdev_label(CephContext* cct, const std::string &path,
+			      bluestore_bdev_label_t *label);  // label is an out-parameter
+private:
+  int _check_or_set_bdev_label(std::string path, uint64_t size, std::string desc,
+			       bool create);
+  int _set_bdev_label_size(const std::string& path, uint64_t size);
+
+  int _open_super_meta();
+
+  void _open_statfs();
+  void _get_statfs_overall(struct store_statfs_t *buf);
+
+  void _dump_alloc_on_failure();
+
+  CollectionRef _get_collection(const coll_t& cid);
+  CollectionRef _get_collection_by_oid(const ghobject_t& oid);
+  void _queue_reap_collection(CollectionRef& c);
+  void _reap_collections();
+  void _update_logger();
+
+  void _assign_nid(TransContext *txc, OnodeRef& o);
+  uint64_t _assign_blobid(TransContext *txc);
+
+  template <int LogLevelV>
+  friend void _dump_onode(CephContext *cct, const Onode& o);  // free-function dump helpers are granted access to BlueStore internals
+  template <int LogLevelV>
+  friend void _dump_extent_map(CephContext *cct, const ExtentMap& em);
+  template <int LogLevelV>
+  friend void _dump_transaction(CephContext *cct, Transaction *t);
+
+  TransContext *_txc_create(Collection *c, OpSequencer *osr,
+			    std::list<Context*> *on_commits,
+			    TrackedOpRef osd_op=TrackedOpRef());
+  void _txc_update_store_statfs(TransContext *txc);
+  void _txc_add_transaction(TransContext *txc, Transaction *t);
+  void _txc_calc_cost(TransContext *txc);
+  void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t);
+  void _txc_state_proc(TransContext *txc);  // also reached through the public txc_aio_finish()
+  void _txc_aio_submit(TransContext *txc);
+public:
+  /// Public entry point: `p` is treated as a TransContext* and handed to
+  /// _txc_state_proc().
+  void txc_aio_finish(void *p) {
+    auto *txc = static_cast<TransContext*>(p);
+    _txc_state_proc(txc);
+  }
+private:
+  void _txc_finish_io(TransContext *txc);
+  void _txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t);
+  void _txc_apply_kv(TransContext *txc, bool sync_submit_transaction);
+  void _txc_committed_kv(TransContext *txc);
+  void _txc_finish(TransContext *txc);
+  void _txc_release_alloc(TransContext *txc);
+
+  void _osr_attach(Collection *c);
+  void _osr_register_zombie(OpSequencer *osr);
+  void _osr_drain(OpSequencer *osr);
+  void _osr_drain_preceding(TransContext *txc);
+  void _osr_drain_all();
+
+  void _kv_start();
+  void _kv_stop();
+  void _kv_sync_thread();      // body of KVSyncThread::entry()
+  void _kv_finalize_thread();  // body of KVFinalizeThread::entry()
+
+#ifdef HAVE_LIBZBD
+  void _zoned_cleaner_start();
+  void _zoned_cleaner_stop();
+  void _zoned_cleaner_thread();  // body of ZonedCleanerThread::entry()
+  void _zoned_clean_zone(uint64_t zone_num,
+			 class ZonedAllocator *a,
+			 class ZonedFreelistManager *f);
+  void _clean_some(ghobject_t oid, uint32_t zone_num);
+#endif
+
+  bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len);
+  void _deferred_queue(TransContext *txc);
+public:
+  void deferred_try_submit();
+private:
+  void _deferred_submit_unlock(OpSequencer *osr);
+  void _deferred_aio_finish(OpSequencer *osr);  // invoked from DeferredBatch::aio_finish()
+  int _deferred_replay();
+  bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
+				    interval_set<uint64_t>& bluefs_extents);
+
+public:
+  using mempool_dynamic_bitset =
+    boost::dynamic_bitset<uint64_t,
+			  mempool::bluestore_fsck::pool_allocator<uint64_t>>;  // bitset whose storage comes from the bluestore_fsck mempool
+  using  per_pool_statfs =
+    mempool::bluestore_fsck::map<uint64_t, store_statfs_t>;  // NOTE(review): key is presumably the pool id — confirm
+
+  enum FSCKDepth {
+    FSCK_REGULAR,  ///< used by fsck()/repair() when deep == false
+    FSCK_DEEP,     ///< used by fsck()/repair() when deep == true
+    FSCK_SHALLOW   ///< used by quick_fix()
+  };
+  enum {
+    MAX_FSCK_ERROR_LINES = 100,
+  };
+
+private:
+  int _fsck_check_extents(
+    std::string_view ctx_descr,
+    const PExtentVector& extents,
+    bool compressed,
+    mempool_dynamic_bitset &used_blocks,
+    uint64_t granularity,
+    BlueStoreRepairer* repairer,
+    store_statfs_t& expected_statfs,
+    FSCKDepth depth);
+
+  void _fsck_check_statfs(
+    const store_statfs_t& expected_store_statfs,
+    const per_pool_statfs& expected_pool_statfs,
+    int64_t& errors,
+    int64_t &warnings,
+    BlueStoreRepairer* repairer);
+  void _fsck_repair_shared_blobs(
+    BlueStoreRepairer& repairer,
+    shared_blob_2hash_tracker_t& sb_ref_counts,
+    sb_info_space_efficient_map_t& sb_info);
+
+  int _fsck(FSCKDepth depth, bool repair);  // common backend of fsck(), repair() and quick_fix()
+  int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
+
+  /// Write `bl` at `offset` into the buffer cache of b's shared blob, tagged
+  /// with txc->seq, and record that shared blob in txc->shared_blobs_written.
+  void _buffer_cache_write(
+    TransContext *txc,
+    BlobRef b,
+    uint64_t offset,
+    ceph::buffer::list& bl,
+    unsigned flags) {
+    auto& sb = b->shared_blob;
+    sb->bc.write(sb->get_cache(), txc->seq, offset, bl, flags);
+    txc->shared_blobs_written.insert(sb);
+  }
+
+  int _collection_list(
+    Collection *c, const ghobject_t& start, const ghobject_t& end,
+    int max, bool legacy, std::vector<ghobject_t> *ls, ghobject_t *next);  // ls and next are out-parameters; defined out of line
+
+  /// Return the value produced by `f` when it yields one, otherwise `val1`.
+  /// `f` is called exactly once and its result is read as a std::optional<T>.
+  template <typename T, typename F>
+  T select_option(const std::string& opt_name, T val1, F f) {
+    // NB: opt_name is reserved for future use
+    std::optional<T> chosen = f();
+    if (!chosen) {
+      return val1;
+    }
+    return *chosen;
+  }
+
+  void _apply_padding(uint64_t head_pad,
+		      uint64_t tail_pad,
+		      ceph::buffer::list& padded);
+
+  void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
+
+  // -- on-disk format version --
+public:
+  const int32_t latest_ondisk_format = 4;        ///< our version
+  const int32_t min_readable_ondisk_format = 1;  ///< what we can read
+  const int32_t min_compat_ondisk_format = 3;    ///< who can read us
+
+private:
+  int32_t ondisk_format = 0;  ///< value detected on mount
+  bool    m_fast_shutdown = false;
+  int _upgrade_super();  ///< upgrade (called during open_super)
+  uint64_t _get_ondisk_reserved() const;
+  void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
+
+  // --- public interface ---
+public:
+  BlueStore(CephContext *cct, const std::string& path);
+  BlueStore(CephContext *cct, const std::string& path, uint64_t min_alloc_size); // Ctor for unit tests only
+  ~BlueStore() override;
+
+  /// Backend type name: always "bluestore".
+  std::string get_type() override { return "bluestore"; }
+
+  // All three journal queries answer false for this backend.
+  bool needs_journal() override {
+    return false;
+  }
+  bool wants_journal() override {
+    return false;
+  }
+  bool allows_journal() override {
+    return false;
+  }
+
+  void prepare_for_fast_shutdown() override;  // defined out of line
+  bool has_null_manager() const override;     // defined out of line
+
+  /// Current value of min_alloc_size.
+  uint64_t get_min_alloc_size() const override { return min_alloc_size; }
+
+  int get_devices(std::set<std::string> *ls) override;  // ls is an out-parameter
+
+  bool is_rotational() override;
+  bool is_journal_rotational() override;
+  bool is_db_rotational();          // not an override: BlueStore-specific
+  bool is_statfs_recoverable() const;  // not an override: BlueStore-specific
+
+  /// The "bluestore_bdev_type" entry reported by collect_metadata(), or an
+  /// empty string when that key is absent.
+  std::string get_default_device_class() override {
+    std::map<std::string, std::string> metadata;
+    collect_metadata(&metadata);
+    const auto found = metadata.find("bluestore_bdev_type");
+    if (found == metadata.end()) {
+      return std::string();
+    }
+    return found->second;
+  }
+
+  int get_numa_node(
+    int *numa_node,
+    std::set<int> *nodes,
+    std::set<std::string> *failed) override;  // all three pointers are out-parameters; defined out of line
+
+  static int get_block_device_fsid(CephContext* cct, const std::string& path,
+				   uuid_d *fsid);  // fsid is an out-parameter
+
+  bool test_mount_in_use() override;
+
+private:
+  int _mount();  // implementation behind the public mount()
+public:
+  /// Public mount entry point; simply forwards to _mount().
+  int mount() override { return _mount(); }
+  int umount() override;
+
+  int open_db_environment(KeyValueDB **pdb, bool to_repair);  // pdb is an out-parameter
+  int close_db_environment();
+  BlueFS* get_bluefs();
+
+  int write_meta(const std::string& key, const std::string& value) override;
+  int read_meta(const std::string& key, std::string *value) override;  // value is an out-parameter
+
+  // open in read-only and limited mode
+  int cold_open();
+  int cold_close();
+
+  /// Check only (repair == false): FSCK_DEEP when `deep`, else FSCK_REGULAR.
+  int fsck(bool deep) override {
+    const FSCKDepth depth = deep ? FSCK_DEEP : FSCK_REGULAR;
+    return _fsck(depth, false);
+  }
+  /// Check with repair == true: FSCK_DEEP when `deep`, else FSCK_REGULAR.
+  int repair(bool deep) override {
+    const FSCKDepth depth = deep ? FSCK_DEEP : FSCK_REGULAR;
+    return _fsck(depth, true);
+  }
+  /// Shallow check (FSCK_SHALLOW) with repair == true.
+  int quick_fix() override { return _fsck(FSCK_SHALLOW, true); }
+
+  void set_cache_shards(unsigned num) override;  // defined out of line
+  /// Dump cache shard totals to `f`: "bluestore_onode" is the sum of
+  /// _get_num() over the onode cache shards and "bluestore_buffers" the sum
+  /// of _get_bytes() over the buffer cache shards.
+  void dump_cache_stats(ceph::Formatter *f) override {
+    // 64-bit accumulators: a plain int wraps once the buffer caches together
+    // hold 2 GiB or more (DataCache sums the same values as uint64_t).
+    int64_t onode_count = 0, buffers_bytes = 0;
+    for (auto *shard : onode_cache_shards) {
+      onode_count += shard->_get_num();
+    }
+    for (auto *shard : buffer_cache_shards) {
+      buffers_bytes += shard->_get_bytes();
+    }
+    f->dump_int("bluestore_onode", onode_count);
+    f->dump_int("bluestore_buffers", buffers_bytes);
+  }
+  /// Write cache shard totals to `ss` as
+  /// "bluestore_onode: <n> bluestore_buffers: <bytes>", where <n> is the sum
+  /// of _get_num() over the onode cache shards and <bytes> the sum of
+  /// _get_bytes() over the buffer cache shards.
+  void dump_cache_stats(std::ostream& ss) override {
+    // 64-bit accumulators: a plain int wraps once the buffer caches together
+    // hold 2 GiB or more (DataCache sums the same values as uint64_t).
+    int64_t onode_count = 0, buffers_bytes = 0;
+    for (auto *shard : onode_cache_shards) {
+      onode_count += shard->_get_num();
+    }
+    for (auto *shard : buffer_cache_shards) {
+      buffers_bytes += shard->_get_bytes();
+    }
+    ss << "bluestore_onode: " << onode_count;
+    // leading space keeps the second field from running into the first value
+    ss << " bluestore_buffers: " << buffers_bytes;
+  }
+
+  /// Every hobject key is accepted: always returns 0.
+  int validate_hobject_key(const hobject_t &obj) const override { return 0; }
+  /// Advertised attr-name limit. The value is arbitrary; there is no real
+  /// limit internally.
+  unsigned get_max_attr_name_length() override {
+    const unsigned advertised_limit = 256;
+    return advertised_limit;
+  }
+
+  int mkfs() override;  // defined out of line
+  /// Nothing to create for this backend: always returns 0.
+  int mkjournal() override { return 0; }
+
+  void get_db_statistics(ceph::Formatter *f) override;
+  void generate_db_histogram(ceph::Formatter *f) override;
+  void _shutdown_cache();
+  int flush_cache(std::ostream *os = NULL) override;  // os is optional (defaults to null)
+  /// Dump this store's perf counters (`logger`) into `f`, wrapped in an
+  /// object section named "perf_counters".
+  void dump_perf_counters(ceph::Formatter *f) override
+  {
+    const char* const section_name = "perf_counters";
+    f->open_object_section(section_name);
+    logger->dump_formatted(f, false, false);
+    f->close_section();
+  }
+
+  int add_new_bluefs_device(int id, const std::string& path);
+  int migrate_to_existing_bluefs_device(const std::set<int>& devs_source,
+    int id);
+  int migrate_to_new_bluefs_device(const std::set<int>& devs_source,
+    int id,
+    const std::string& path);
+  int expand_devices(std::ostream& out);
+  std::string get_device_path(unsigned id);
+
+  int dump_bluefs_sizes(std::ostream& out);
+
+public:
+  int statfs(struct store_statfs_t *buf,
+             osd_alert_list_t* alerts = nullptr) override;
+  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+		  bool *per_pool_omap) override;
+
+  void collect_metadata(std::map<std::string,std::string> *pm) override;
+
+  bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+  int set_collection_opts(
+    CollectionHandle& c,
+    const pool_opts_t& opts) override;
+  int stat(
+    CollectionHandle &c,
+    const ghobject_t& oid,
+    struct stat *st,
+    bool allow_eio = false) override;
+  int read(
+    CollectionHandle &c,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    ceph::buffer::list& bl,
+    uint32_t op_flags = 0) override;
+
+private:
+
+  // --------------------------------------------------------
+  // intermediate data structures used while reading
+  /// One region of a read request, expressed both as a logical offset and
+  /// as an offset within a blob.
+  struct region_t {
+    uint64_t logical_offset;
+    uint64_t blob_xoffset;   //region offset within the blob
+    uint64_t length;
+
+    // used later in read process
+    uint64_t front = 0;
+
+    region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
+      : logical_offset(offset),
+      blob_xoffset(b_offs),
+      length(len),
+      front(front){}
+    // Copy construction/assignment are intentionally left to the compiler
+    // (Rule of Zero): all members are plain integers, and a user-provided
+    // copy constructor would make the implicit copy-assignment deprecated.
+
+    /// Print as "0x<logical_offset>:<blob_xoffset>~<length>" in hex.
+    /// The stream is left in decimal mode afterwards.
+    friend std::ostream& operator<<(std::ostream& out, const region_t& r) {
+      return out << "0x" << std::hex << r.logical_offset << ":"
+        << r.blob_xoffset << "~" << r.length << std::dec;
+    }
+  };
+
+  // merged blob read request
+  /// Merged blob read request: one <offset, length> span plus the
+  /// original regions it was built from.
+  struct read_req_t {
+    uint64_t r_off = 0;
+    uint64_t r_len = 0;
+    ceph::buffer::list bl;
+    std::list<region_t> regs; // original read regions
+
+    read_req_t(uint64_t off, uint64_t len)
+      : r_off(off),
+        r_len(len)
+    {}
+
+    /// Print as "{<0xOFF, 0xLEN> : [regions...]}" in hex; the stream is
+    /// left in decimal mode afterwards.
+    friend std::ostream& operator<<(std::ostream& out, const read_req_t& r) {
+      out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+      for (auto it = r.regs.begin(); it != r.regs.end(); ++it) {
+        out << *it;
+      }
+      out << "]}" << std::dec;
+      return out;
+    }
+  };
+
+  typedef std::list<read_req_t> regions2read_t;
+  typedef std::map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
+
+  void _read_cache(
+    OnodeRef& o,
+    uint64_t offset,
+    size_t length,
+    int read_cache_policy,
+    ready_regions_t& ready_regions,
+    blobs2read_t& blobs2read);
+
+
+  int _prepare_read_ioc(
+    blobs2read_t& blobs2read,
+    std::vector<ceph::buffer::list>* compressed_blob_bls,
+    IOContext* ioc);
+
+  int _generate_read_result_bl(
+    OnodeRef& o,
+    uint64_t offset,
+    size_t length,
+    ready_regions_t& ready_regions,
+    std::vector<ceph::buffer::list>& compressed_blob_bls,
+    blobs2read_t& blobs2read,
+    bool buffered,
+    bool* csum_error,
+    ceph::buffer::list& bl);
+
+  int _do_read(
+    Collection *c,
+    OnodeRef& o,
+    uint64_t offset,
+    size_t len,
+    ceph::buffer::list& bl,
+    uint32_t op_flags = 0,
+    uint64_t retry_count = 0);
+
+  int _do_readv(
+    Collection *c,
+    OnodeRef& o,
+    const interval_set<uint64_t>& m,
+    ceph::buffer::list& bl,
+    uint32_t op_flags = 0,
+    uint64_t retry_count = 0);
+
+  int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
+	      uint64_t offset, size_t len, interval_set<uint64_t>& destset);
+public:
+  int fiemap(CollectionHandle &c, const ghobject_t& oid,
+	     uint64_t offset, size_t len, ceph::buffer::list& bl) override;
+  int fiemap(CollectionHandle &c, const ghobject_t& oid,
+	     uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+
+  int readv(
+    CollectionHandle &c_,
+    const ghobject_t& oid,
+    interval_set<uint64_t>& m,
+    ceph::buffer::list& bl,
+    uint32_t op_flags) override;
+
+  int dump_onode(CollectionHandle &c, const ghobject_t& oid,
+    const std::string& section_name, ceph::Formatter *f) override;
+
+  int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+	      ceph::buffer::ptr& value) override;
+
+  int getattrs(CollectionHandle &c, const ghobject_t& oid,
+	       std::map<std::string,ceph::buffer::ptr, std::less<>>& aset) override;
+
+  int list_collections(std::vector<coll_t>& ls) override;
+
+  CollectionHandle open_collection(const coll_t &c) override;
+  CollectionHandle create_new_collection(const coll_t& cid) override;
+  void set_collection_commit_queue(const coll_t& cid,
+				   ContextQueue *commit_queue) override;
+
+  bool collection_exists(const coll_t& c) override;
+  int collection_empty(CollectionHandle& c, bool *empty) override;
+  int collection_bits(CollectionHandle& c) override;
+
+  int collection_list(CollectionHandle &c,
+		      const ghobject_t& start,
+		      const ghobject_t& end,
+		      int max,
+		      std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+  int collection_list_legacy(CollectionHandle &c,
+                             const ghobject_t& start,
+                             const ghobject_t& end,
+                             int max,
+                             std::vector<ghobject_t> *ls,
+                             ghobject_t *next) override;
+
+  int omap_get(
+    CollectionHandle &c,     ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+    ) override;
+  int _omap_get(
+    Collection *c,     ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+    );
+  int _onode_omap_get(
+    const OnodeRef& o,           ///< [in] Object containing omap
+    ceph::buffer::list *header,          ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+  );
+
+
+  /// Get omap header
+  int omap_get_header(
+    CollectionHandle &c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    bool allow_eio = false ///< [in] don't assert on eio
+    ) override;
+
+  /// Get keys defined on oid
+  int omap_get_keys(
+    CollectionHandle &c,              ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    std::set<std::string> *keys      ///< [out] Keys defined on oid
+    ) override;
+
+  /// Get key values
+  int omap_get_values(
+    CollectionHandle &c,         ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::set<std::string> &keys,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) override;
+
+#ifdef WITH_SEASTAR
+  int omap_get_values(
+    CollectionHandle &c,         ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::optional<std::string> &start_after,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) override;
+#endif
+
+  /// Filters keys into out which are defined on oid
+  int omap_check_keys(
+    CollectionHandle &c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out         ///< [out] Subset of keys defined on oid
+    ) override;
+
+  ObjectMap::ObjectMapIterator get_omap_iterator(
+    CollectionHandle &c,   ///< [in] collection
+    const ghobject_t &oid  ///< [in] object
+    ) override;
+
+  /// Replace the stored fsid with `u`.
+  void set_fsid(uuid_d u) override
+  {
+    fsid = u;
+  }
+  /// Return a copy of the stored fsid.
+  uuid_d get_fsid() override
+  {
+    return fsid;
+  }
+
+  /// Estimate the overhead for `num_objects` objects as a flat
+  /// per-object cost.
+  uint64_t estimate_objects_overhead(uint64_t num_objects) override
+  {
+    // assuming per-object overhead is 300 bytes
+    constexpr uint64_t per_object_overhead = 300;
+    return num_objects * per_object_overhead;
+  }
+
+  /// Tracks running averages of commit and apply latency (nanoseconds,
+  /// per the member names) and exposes them as objectstore_perf_stat_t.
+  struct BSPerfTracker {
+    PerfCounters::avg_tracker<uint64_t> os_commit_latency_ns;
+    PerfCounters::avg_tracker<uint64_t> os_apply_latency_ns;
+
+    /// Snapshot the current averages of both trackers.
+    objectstore_perf_stat_t get_cur_stats() const {
+      objectstore_perf_stat_t snapshot;
+      const auto commit_avg = os_commit_latency_ns.current_avg();
+      const auto apply_avg = os_apply_latency_ns.current_avg();
+      snapshot.os_commit_latency_ns = commit_avg;
+      snapshot.os_apply_latency_ns = apply_avg;
+      return snapshot;
+    }
+
+    void update_from_perfcounters(PerfCounters &logger);
+  } perf_tracker;
+
+  /// Refresh perf_tracker from `logger`, then return its current stats.
+  objectstore_perf_stat_t get_cur_stats() override
+  {
+    PerfCounters& counters = *logger;
+    perf_tracker.update_from_perfcounters(counters);
+    return perf_tracker.get_cur_stats();
+  }
+  /// Read-only access to this store's perf counters (`logger`).
+  const PerfCounters* get_perf_counters() const override
+  {
+    return logger;
+  }
+  /// Read-only access to the perf counters reported by `bluefs`.
+  /// `bluefs` is dereferenced unconditionally.
+  const PerfCounters* get_bluefs_perf_counters() const
+  {
+    const PerfCounters* counters = bluefs->get_perf_counters();
+    return counters;
+  }
+  /// Return the key/value database handle (`db`) held by this store.
+  KeyValueDB* get_kv()
+  {
+    return db;
+  }
+
+  int queue_transactions(
+    CollectionHandle& ch,
+    std::vector<Transaction>& tls,
+    TrackedOpRef op = TrackedOpRef(),
+    ThreadPool::TPHandle *handle = NULL) override;
+
+  // error injection
+  /// Add `o` to debug_data_error_objects while holding
+  /// debug_read_error_lock exclusively.
+  void inject_data_error(const ghobject_t& o) override
+  {
+    std::unique_lock guard(debug_read_error_lock);
+    debug_data_error_objects.insert(o);
+  }
+  /// Add `o` to debug_mdata_error_objects while holding
+  /// debug_read_error_lock exclusively.
+  void inject_mdata_error(const ghobject_t& o) override
+  {
+    std::unique_lock guard(debug_read_error_lock);
+    debug_mdata_error_objects.insert(o);
+  }
+
+  /// methods to inject various errors fsck can repair
+  void inject_broken_shared_blob_key(const std::string& key,
+			 const ceph::buffer::list& bl);
+  void inject_no_shared_blob_key();
+  void inject_stray_shared_blob_key(uint64_t sbid);
+
+  void inject_leaked(uint64_t len);
+  void inject_false_free(coll_t cid, ghobject_t oid);
+  void inject_statfs(const std::string& key, const store_statfs_t& new_statfs);
+  void inject_global_statfs(const store_statfs_t& new_statfs);
+  void inject_misreference(coll_t cid1, ghobject_t oid1,
+			   coll_t cid2, ghobject_t oid2,
+			   uint64_t offset);
+  void inject_zombie_spanning_blob(coll_t cid, ghobject_t oid, int16_t blob_id);
+  // resets global per_pool_omap in DB
+  void inject_legacy_omap();
+  // resets per_pool_omap | pgmeta_omap for onode
+  void inject_legacy_omap(coll_t cid, ghobject_t oid);
+  void inject_stray_omap(uint64_t head, const std::string& name);
+
+  void inject_bluefs_file(std::string_view dir,
+			  std::string_view name,
+			  size_t new_size);
+
+  /// Compact the key/value database. Asserts that `db` is set.
+  void compact() override
+  {
+    ceph_assert(db);
+    db->compact();
+  }
+  /// Always true for this store.
+  bool has_builtin_csum() const override
+  {
+    return true;
+  }
+
+  inline void log_latency(const char* name,
+    int idx,
+    const ceph::timespan& lat,
+    double lat_threshold,
+    const char* info = "") const;
+
+  inline void log_latency_fn(const char* name,
+    int idx,
+    const ceph::timespan& lat,
+    double lat_threshold,
+    std::function<std::string (const ceph::timespan& lat)> fn) const;
+
+private:
+  bool _debug_data_eio(const ghobject_t& o) {
+    if (!cct->_conf->bluestore_debug_inject_read_err) {
+      return false;
+    }
+    std::shared_lock l(debug_read_error_lock);
+    return debug_data_error_objects.count(o);
+  }
+  bool _debug_mdata_eio(const ghobject_t& o) {
+    if (!cct->_conf->bluestore_debug_inject_read_err) {
+      return false;
+    }
+    std::shared_lock l(debug_read_error_lock);
+    return debug_mdata_error_objects.count(o);
+  }
+  void _debug_obj_on_delete(const ghobject_t& o) {
+    if (cct->_conf->bluestore_debug_inject_read_err) {
+      std::unique_lock l(debug_read_error_lock);
+      debug_data_error_objects.erase(o);
+      debug_mdata_error_objects.erase(o);
+    }
+  }
+private:
+  ceph::mutex qlock = ceph::make_mutex("BlueStore::Alerts::qlock");
+  std::string failed_cmode;
+  std::set<std::string> failed_compressors;
+  std::string spillover_alert;
+  std::string legacy_statfs_alert;
+  std::string no_per_pool_omap_alert;
+  std::string no_per_pg_omap_alert;
+  std::string disk_size_mismatch_alert;
+  std::string spurious_read_errors_alert;
+
+  void _log_alerts(osd_alert_list_t& alerts);
+  /// Record a compression failure under qlock.
+  ///
+  /// With `cmode` set, `s` overwrites failed_cmode and the result says
+  /// whether failed_cmode was empty beforehand. Otherwise `s` is added to
+  /// failed_compressors and the result says whether it was newly inserted.
+  bool _set_compression_alert(bool cmode, const char* s) {
+    std::lock_guard guard(qlock);
+    if (!cmode) {
+      return failed_compressors.emplace(s).second;
+    }
+    const bool was_empty = failed_cmode.empty();
+    failed_cmode = s;
+    return was_empty;
+  }
+  /// Forget all recorded compression failures (mode and compressors),
+  /// under qlock.
+  void _clear_compression_alert()
+  {
+    std::lock_guard guard(qlock);
+    failed_cmode.clear();
+    failed_compressors.clear();
+  }
+
+  void _check_legacy_statfs_alert();
+  void _check_no_per_pg_or_pool_omap_alert();
+  /// Store `s` as the disk-size-mismatch alert text, under qlock.
+  void _set_disk_size_mismatch_alert(const std::string& s)
+  {
+    std::lock_guard guard(qlock);
+    disk_size_mismatch_alert = s;
+  }
+  /// Store `s` as the spurious-read-errors alert text, under qlock.
+  void _set_spurious_read_errors_alert(const std::string& s)
+  {
+    std::lock_guard guard(qlock);
+    spurious_read_errors_alert = s;
+  }
+
+private:
+
+  // --------------------------------------------------------
+  // read processing internal methods
+  int _verify_csum(
+    OnodeRef& o,
+    const bluestore_blob_t* blob,
+    uint64_t blob_xoffset,
+    const ceph::buffer::list& bl,
+    uint64_t logical_offset) const;
+  int _decompress(ceph::buffer::list& source, ceph::buffer::list* result);
+
+
+  // --------------------------------------------------------
+  // write ops
+
+  /// Working state for a write: the chosen write options, the old extents
+  /// and GC candidates collected along the way, and the list of per-blob
+  /// writes queued through write().
+  struct WriteContext {
+    bool buffered = false;          ///< buffered write
+    bool compress = false;          ///< compressed write
+    uint64_t target_blob_size = 0;  ///< target (max) blob size
+    unsigned csum_order = 0;        ///< target checksum chunk order
+
+    old_extent_map_t old_extents;   ///< must deref these blobs
+    interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
+
+    /// One queued write against a single blob.
+    struct write_item {
+      uint64_t logical_offset;      ///< write logical offset
+      BlobRef b;                    // presumably the blob being written -- confirm against users
+      uint64_t blob_length;
+      uint64_t b_off;
+      ceph::buffer::list bl;
+      uint64_t b_off0; ///< original offset in a blob prior to padding
+      uint64_t length0; ///< original data length prior to padding
+
+      bool mark_unused;
+      bool new_blob; ///< whether new blob was created
+
+      bool compressed = false;
+      ceph::buffer::list compressed_bl;
+      size_t compressed_len = 0;
+
+      /// Memberwise constructor; `bl` is copied into the item. The
+      /// compression-related members keep their in-class defaults.
+      write_item(
+	uint64_t logical_offs,
+        BlobRef b,
+        uint64_t blob_len,
+        uint64_t o,
+        ceph::buffer::list& bl,
+        uint64_t o0,
+        uint64_t l0,
+        bool _mark_unused,
+	bool _new_blob)
+       :
+         logical_offset(logical_offs),
+         b(b),
+         blob_length(blob_len),
+         b_off(o),
+         bl(bl),
+         b_off0(o0),
+         length0(l0),
+         mark_unused(_mark_unused),
+	 new_blob(_new_blob) {}
+    };
+    std::vector<write_item> writes;                 ///< blobs we're writing
+
+    /// partial clone of the context: copies only the four option fields;
+    /// old_extents, extents_to_gc and writes are left untouched
+    void fork(const WriteContext& other) {
+      buffered = other.buffered;
+      compress = other.compress;
+      target_blob_size = other.target_blob_size;
+      csum_order = other.csum_order;
+    }
+    /// Append a write_item built from the arguments to `writes`; the
+    /// arguments map one-to-one onto the write_item constructor.
+    void write(
+      uint64_t loffs,
+      BlobRef b,
+      uint64_t blob_len,
+      uint64_t o,
+      ceph::buffer::list& bl,
+      uint64_t o0,
+      uint64_t len0,
+      bool _mark_unused,
+      bool _new_blob) {
+      writes.emplace_back(loffs,
+                          b,
+                          blob_len,
+                          o,
+                          bl,
+                          o0,
+                          len0,
+                          _mark_unused,
+                          _new_blob);
+    }
+    /// Checks for writes to the same pextent within a blob
+    bool has_conflict(
+      BlobRef b,
+      uint64_t loffs,
+      uint64_t loffs_end,
+      uint64_t min_alloc_size);
+  };
+  void _do_write_small(
+    TransContext *txc,
+    CollectionRef &c,
+    OnodeRef& o,
+    uint64_t offset, uint64_t length,
+    ceph::buffer::list::iterator& blp,
+    WriteContext *wctx);
+  void _do_write_big_apply_deferred(
+    TransContext* txc,
+    CollectionRef& c,
+    OnodeRef& o,
+    BigDeferredWriteContext& dctx,
+    bufferlist::iterator& blp,
+    WriteContext* wctx);
+  void _do_write_big(
+    TransContext *txc,
+    CollectionRef &c,
+    OnodeRef& o,
+    uint64_t offset, uint64_t length,
+    ceph::buffer::list::iterator& blp,
+    WriteContext *wctx);
+  int _do_alloc_write(
+    TransContext *txc,
+    CollectionRef c,
+    OnodeRef& o,
+    WriteContext *wctx);
+  void _wctx_finish(
+    TransContext *txc,
+    CollectionRef& c,
+    OnodeRef& o,
+    WriteContext *wctx,
+    std::set<SharedBlob*> *maybe_unshared_blobs=0);
+
+  int _write(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& o,
+	     uint64_t offset, size_t len,
+	     ceph::buffer::list& bl,
+	     uint32_t fadvise_flags);
+  void _pad_zeros(ceph::buffer::list *bl, uint64_t *offset,
+		  uint64_t chunk_size);
+
+  void _choose_write_options(CollectionRef& c,
+                             OnodeRef& o,
+                             uint32_t fadvise_flags,
+                             WriteContext *wctx);
+
+  int _do_gc(TransContext *txc,
+             CollectionRef& c,
+             OnodeRef& o,
+             const WriteContext& wctx,
+             uint64_t *dirty_start,
+             uint64_t *dirty_end);
+
+  int _do_write(TransContext *txc,
+		CollectionRef &c,
+		OnodeRef& o,
+		uint64_t offset, uint64_t length,
+		ceph::buffer::list& bl,
+		uint32_t fadvise_flags);
+  void _do_write_data(TransContext *txc,
+                      CollectionRef& c,
+                      OnodeRef& o,
+                      uint64_t offset,
+                      uint64_t length,
+                      ceph::buffer::list& bl,
+                      WriteContext *wctx);
+
+  int _touch(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& o);
+  int _do_zero(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o,
+	       uint64_t offset, size_t len);
+  int _zero(TransContext *txc,
+	    CollectionRef& c,
+	    OnodeRef& o,
+	    uint64_t offset, size_t len);
+  void _do_truncate(TransContext *txc,
+		   CollectionRef& c,
+		   OnodeRef& o,
+		   uint64_t offset,
+		   std::set<SharedBlob*> *maybe_unshared_blobs=0);
+  int _truncate(TransContext *txc,
+		CollectionRef& c,
+		OnodeRef& o,
+		uint64_t offset);
+  int _remove(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& o);
+  int _do_remove(TransContext *txc,
+		 CollectionRef& c,
+		 OnodeRef& o);
+  int _setattr(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o,
+	       const std::string& name,
+	       ceph::buffer::ptr& val);
+  int _setattrs(TransContext *txc,
+		CollectionRef& c,
+		OnodeRef& o,
+		const std::map<std::string,ceph::buffer::ptr>& aset);
+  int _rmattr(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& o,
+	      const std::string& name);
+  int _rmattrs(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o);
+  void _do_omap_clear(TransContext *txc, OnodeRef& o);
+  int _omap_clear(TransContext *txc,
+		  CollectionRef& c,
+		  OnodeRef& o);
+  int _omap_setkeys(TransContext *txc,
+		    CollectionRef& c,
+		    OnodeRef& o,
+		    ceph::buffer::list& bl);
+  int _omap_setheader(TransContext *txc,
+		      CollectionRef& c,
+		      OnodeRef& o,
+		      ceph::buffer::list& header);
+  int _omap_rmkeys(TransContext *txc,
+		   CollectionRef& c,
+		   OnodeRef& o,
+		   ceph::buffer::list& bl);
+  int _omap_rmkey_range(TransContext *txc,
+			CollectionRef& c,
+			OnodeRef& o,
+			const std::string& first, const std::string& last);
+  int _set_alloc_hint(
+    TransContext *txc,
+    CollectionRef& c,
+    OnodeRef& o,
+    uint64_t expected_object_size,
+    uint64_t expected_write_size,
+    uint32_t flags);
+  int _do_clone_range(TransContext *txc,
+		      CollectionRef& c,
+		      OnodeRef& oldo,
+		      OnodeRef& newo,
+		      uint64_t srcoff, uint64_t length, uint64_t dstoff);
+  int _clone(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& oldo,
+	     OnodeRef& newo);
+  int _clone_range(TransContext *txc,
+		   CollectionRef& c,
+		   OnodeRef& oldo,
+		   OnodeRef& newo,
+		   uint64_t srcoff, uint64_t length, uint64_t dstoff);
+  int _rename(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& oldo,
+	      OnodeRef& newo,
+	      const ghobject_t& new_oid);
+  int _create_collection(TransContext *txc, const coll_t &cid,
+			 unsigned bits, CollectionRef *c);
+  int _remove_collection(TransContext *txc, const coll_t &cid,
+                         CollectionRef *c);
+  void _do_remove_collection(TransContext *txc, CollectionRef *c);
+  int _split_collection(TransContext *txc,
+			CollectionRef& c,
+			CollectionRef& d,
+			unsigned bits, int rem);
+  int _merge_collection(TransContext *txc,
+			CollectionRef *c,
+			CollectionRef& d,
+			unsigned bits);
+
+  void _collect_allocation_stats(uint64_t need, uint32_t alloc_size,
+                                 const PExtentVector&);
+  void _record_allocation_stats();
+private:
+  uint64_t probe_count = 0;
+  std::atomic<uint64_t> alloc_stats_count = {0};
+  std::atomic<uint64_t> alloc_stats_fragments = { 0 };
+  std::atomic<uint64_t> alloc_stats_size = { 0 };
+  // 
+  std::array<std::tuple<uint64_t, uint64_t, uint64_t>, 5> alloc_stats_history =
+  { std::make_tuple(0ul, 0ul, 0ul) };
+
+  inline bool _use_rotational_settings();
+
+public:
+  typedef btree::btree_set<
+    uint64_t, std::less<uint64_t>,
+    mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
+
+  /// Bundle of references and raw pointers to the counters and tracking
+  /// structures that the fsck object-checking routines take as `ctx`.
+  /// Every member is a reference or pointer bound in the constructor;
+  /// nothing is allocated or released by this struct.
+  struct FSCK_ObjectCtx {
+    // error/warning tallies and object statistics, updated through references
+    int64_t& errors;
+    int64_t& warnings;
+    uint64_t& num_objects;
+    uint64_t& num_extents;
+    uint64_t& num_blobs;
+    uint64_t& num_sharded_objects;
+    uint64_t& num_spanning_blobs;
+
+    // tracking structures held by pointer
+    mempool_dynamic_bitset* used_blocks;
+    uint64_t_btree_t* used_omap_head;
+    std::vector<std::unordered_map<ghobject_t, uint64_t>> *zone_refs;
+
+    ceph::mutex* sb_info_lock;  // presumably guards sb_info -- confirm in users
+    sb_info_space_efficient_map_t& sb_info;
+    // approximate amount of references per <shared blob, chunk>
+    shared_blob_2hash_tracker_t& sb_ref_counts;
+
+    store_statfs_t& expected_store_statfs;
+    per_pool_statfs& expected_pool_statfs;
+    BlueStoreRepairer* repairer;
+
+    /// Bind every member to the corresponding argument, in declaration
+    /// order.
+    FSCK_ObjectCtx(int64_t& e,
+                   int64_t& w,
+                   uint64_t& _num_objects,
+                   uint64_t& _num_extents,
+                   uint64_t& _num_blobs,
+                   uint64_t& _num_sharded_objects,
+                   uint64_t& _num_spanning_blobs,
+                   mempool_dynamic_bitset* _ub,
+                   uint64_t_btree_t* _used_omap_head,
+		   std::vector<std::unordered_map<ghobject_t, uint64_t>> *_zone_refs,
+
+                   ceph::mutex* _sb_info_lock,
+                   sb_info_space_efficient_map_t& _sb_info,
+		   shared_blob_2hash_tracker_t& _sb_ref_counts,
+                   store_statfs_t& _store_statfs,
+                   per_pool_statfs& _pool_statfs,
+                   BlueStoreRepairer* _repairer) :
+      errors(e),
+      warnings(w),
+      num_objects(_num_objects),
+      num_extents(_num_extents),
+      num_blobs(_num_blobs),
+      num_sharded_objects(_num_sharded_objects),
+      num_spanning_blobs(_num_spanning_blobs),
+      used_blocks(_ub),
+      used_omap_head(_used_omap_head),
+      zone_refs(_zone_refs),
+      sb_info_lock(_sb_info_lock),
+      sb_info(_sb_info),
+      sb_ref_counts(_sb_ref_counts),
+      expected_store_statfs(_store_statfs),
+      expected_pool_statfs(_pool_statfs),
+      repairer(_repairer) {
+    }
+  };
+
+  OnodeRef fsck_check_objects_shallow(
+    FSCKDepth depth,
+    int64_t pool_id,
+    CollectionRef c,
+    const ghobject_t& oid,
+    const std::string& key,
+    const ceph::buffer::list& value,
+    mempool::bluestore_fsck::list<std::string>* expecting_shards,
+    std::map<BlobRef, bluestore_blob_t::unused_t>* referenced,
+    const BlueStore::FSCK_ObjectCtx& ctx);
+#ifdef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+  int  push_allocation_to_rocksdb();
+  int  read_allocation_from_drive_for_bluestore_tool();
+#endif
+  void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
+
+private:
+  /// Counters and statfs accumulators for the allocation read/rebuild
+  /// routines declared below. All counters start at zero; the counters
+  /// (not the statfs members) are printed by the operator<< defined
+  /// later in this class.
+  struct  read_alloc_stats_t {
+    uint32_t onode_count             = 0;
+    uint32_t shard_count             = 0;
+
+    uint32_t skipped_illegal_extent  = 0;
+
+    uint64_t shared_blob_count      = 0;
+    uint64_t compressed_blob_count   = 0;
+    uint64_t spanning_blob_count     = 0;
+    uint64_t insert_count            = 0;
+    uint64_t extent_count            = 0;  // bumped by ExtentDecoderPartial::get_next_extent()
+
+    // statfs values accumulated per pool (keyed by a uint64_t, presumably
+    // the pool id -- confirm) and for the whole store
+    std::map<uint64_t, volatile_statfs> actual_pool_vstatfs;
+    volatile_statfs actual_store_vstatfs;
+  };
+  /// ExtentDecoder implementation that reuses one scratch Extent instead
+  /// of retaining decoded extents: get_next_extent() hands out the same
+  /// member every time and add_extent() discards it. Holds references to
+  /// the store, the stats, the bitmap and the shared-blob info map passed
+  /// at construction.
+  class ExtentDecoderPartial : public ExtentMap::ExtentDecoder {
+    BlueStore& store;
+    read_alloc_stats_t& stats;
+    SimpleBitmap& sbmap;
+    sb_info_space_efficient_map_t& sb_info;
+    uint8_t min_alloc_size_order;
+    Extent extent;  // scratch extent recycled by get_next_extent()
+    ghobject_t oid;
+    volatile_statfs* per_pool_statfs = nullptr;
+    blob_map_t blobs;
+    blob_map_t spanning_blobs;
+
+    void _consume_new_blob(bool spanning,
+                           uint64_t extent_no,
+                           uint64_t sbid,
+                           BlobRef b);
+  protected:
+    void consume_blobid(Extent*, bool spanning, uint64_t blobid) override;
+    void consume_blob(Extent* le,
+                      uint64_t extent_no,
+                      uint64_t sbid,
+                      BlobRef b) override;
+    void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+    /// Count the extent in stats.extent_count, reset the scratch extent
+    /// to a default-constructed one and return its address.
+    Extent* get_next_extent() override {
+      ++stats.extent_count;
+      extent = Extent();
+      return &extent;
+    }
+    /// Intentionally empty: decoded extents are not stored.
+    void add_extent(Extent*) override {
+    }
+  public:
+    /// Bind the decoder to its collaborators; `oid`, `per_pool_statfs`
+    /// and the blob maps start out default-initialized.
+    ExtentDecoderPartial(BlueStore& _store,
+                         read_alloc_stats_t& _stats,
+                         SimpleBitmap& _sbmap,
+                         sb_info_space_efficient_map_t& _sb_info,
+                         uint8_t _min_alloc_size_order)
+      : store(_store), stats(_stats), sbmap(_sbmap), sb_info(_sb_info),
+        min_alloc_size_order(_min_alloc_size_order)
+    {}
+    /// Object id currently held in `oid`.
+    const ghobject_t& get_oid() const {
+      return oid;
+    }
+    void reset(const ghobject_t _oid,
+      volatile_statfs* _per_pool_statfs);
+  };
+
+  /// Print the counters of `stats` as a table framed by two ruler lines,
+  /// one "NCB::<name> = <value>" row per counter, with each value
+  /// right-aligned in a 10-character field. Every line ends with
+  /// std::endl, so the stream is flushed after each row.
+  friend std::ostream& operator<<(std::ostream& out, const read_alloc_stats_t& stats) {
+    const char* const ruler =
+      "==========================================================";
+    // Emit one row: label, then the value padded to width 10.
+    auto row = [&out](const char* label, auto value) {
+      out << label;
+      out.width(10);
+      out << value << std::endl;
+    };
+
+    out << ruler << std::endl;
+    row("NCB::onode_count             = ", stats.onode_count);
+    row("NCB::shard_count             = ", stats.shard_count);
+    row("NCB::shared_blob_count      = ", stats.shared_blob_count);
+    row("NCB::compressed_blob_count   = ", stats.compressed_blob_count);
+    row("NCB::spanning_blob_count     = ", stats.spanning_blob_count);
+    row("NCB::skipped_illegal_extent  = ", stats.skipped_illegal_extent);
+    row("NCB::extent_count            = ", stats.extent_count);
+    row("NCB::insert_count            = ", stats.insert_count);
+    out << ruler << std::endl;
+
+    return out;
+  }
+
+  int  compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t req_extent_count, uint64_t memory_target);
+  Allocator* create_bitmap_allocator(uint64_t bdev_size);
+  int  add_existing_bluefs_allocation(Allocator* allocator, read_alloc_stats_t& stats);
+  int  allocator_add_restored_entries(Allocator *allocator, const void *buff, unsigned extent_count, uint64_t *p_read_alloc_size,
+				      uint64_t  *p_extent_count, const void *v_header, BlueFS::FileReader *p_handle, uint64_t offset);
+
+  int  copy_allocator(Allocator* src_alloc, Allocator *dest_alloc, uint64_t* p_num_entries);
+  int  store_allocator(Allocator* allocator);
+  int  invalidate_allocation_file_on_bluefs();
+  int  __restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
+  int  restore_allocator(Allocator* allocator, uint64_t *num, uint64_t *bytes);
+  int  read_allocation_from_drive_on_startup();
+  int  reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats);
+  int  read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats);
+  int  commit_freelist_type();
+  int  commit_to_null_manager();
+  int  commit_to_real_manager();
+  int  db_cleanup(int ret);
+  int  reset_fm_for_restore();
+  int  verify_rocksdb_allocations(Allocator *allocator);
+  Allocator* clone_allocator_without_bluefs(Allocator *src_allocator);
+  Allocator* initialize_allocator_from_freelist(FreelistManager *real_fm);
+  void copy_allocator_content_to_fm(Allocator *allocator, FreelistManager *real_fm);
+
+
+  void _fsck_check_object_omap(FSCKDepth depth,
+    OnodeRef& o,
+    const BlueStore::FSCK_ObjectCtx& ctx);
+
+  void _fsck_check_objects(FSCKDepth depth,
+    FSCK_ObjectCtx& ctx);
+};
+
+/// Print the five statfs values of `s` as " label:value" pairs, in the
+/// order allocated, stored, compressed, compressed_orig, compressed_alloc.
+inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) {
+  using vs = BlueStore::volatile_statfs;
+  out << " allocated:" << s.values[vs::STATFS_ALLOCATED];
+  out << " stored:" << s.values[vs::STATFS_STORED];
+  out << " compressed:" << s.values[vs::STATFS_COMPRESSED];
+  out << " compressed_orig:" << s.values[vs::STATFS_COMPRESSED_ORIGINAL];
+  out << " compressed_alloc:" << s.values[vs::STATFS_COMPRESSED_ALLOCATED];
+  return out;
+}
+
+// Take a reference on an Onode by calling its get().
+// NOTE(review): the name matches the hook boost::intrusive_ptr looks up -- confirm.
+static inline void intrusive_ptr_add_ref(BlueStore::Onode *o) {
+  o->get();
+}
+// Drop a reference on an Onode by calling its put().
+// NOTE(review): the name matches the hook boost::intrusive_ptr looks up -- confirm.
+static inline void intrusive_ptr_release(BlueStore::Onode *o) {
+  o->put();
+}
+
+// Take a reference on an OpSequencer by calling its get().
+// NOTE(review): the name matches the hook boost::intrusive_ptr looks up -- confirm.
+static inline void intrusive_ptr_add_ref(BlueStore::OpSequencer *o) {
+  o->get();
+}
+// Drop a reference on an OpSequencer by calling its put().
+// NOTE(review): the name matches the hook boost::intrusive_ptr looks up -- confirm.
+static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
+  o->put();
+}
+
+class BlueStoreRepairer
+{
+  ceph::mutex lock = ceph::make_mutex("BlueStore::BlueStoreRepairer::lock");
+
+public:
+  // to simplify future potential migration to mempools
+  using fsck_interval = interval_set<uint64_t>;
+
+  // Structure to track which pextents are used by a specific cid/oid.
+  // As with a Bloom filter, only positive and false-positive matches
+  // are possible.
+  // Maintains two lists of bloom filters, one for cids and one for oids,
+  //   where each list entry is a BF for a specific disk pextent.
+  //   The length of the extent per filter is determined on init.
+  // Allows filtering out 'uninteresting' pextents to speed up subsequent
+  //  'is_used' access.
+  struct StoreSpaceTracker {
+    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrary selected
+    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+    bloom_vector collections_bfs;
+    bloom_vector objects_bfs;
+    
+    bool was_filtered_out = false; 
+    uint64_t granularity = 0; // extent length for a single filter
+
+    StoreSpaceTracker() {
+    }
+    StoreSpaceTracker(const StoreSpaceTracker& from) :
+      collections_bfs(from.collections_bfs),
+      objects_bfs(from.objects_bfs),
+      granularity(from.granularity) {
+    }
+
+    void init(uint64_t total,
+	      uint64_t min_alloc_size,
+	      uint64_t mem_cap = DEF_MEM_CAP) {
+      ceph_assert(!granularity); // not initialized yet
+      ceph_assert(std::has_single_bit(min_alloc_size));
+      ceph_assert(mem_cap);
+      
+      total = round_up_to(total, min_alloc_size);
+      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+      if (!granularity) {
+	granularity = min_alloc_size;
+      } else {
+	granularity = round_up_to(granularity, min_alloc_size);
+      }
+
+      uint64_t entries = round_up_to(total, granularity) / granularity;
+      collections_bfs.resize(entries,
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+      objects_bfs.resize(entries, 
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+    }
+    inline uint32_t get_hash(const coll_t& cid) const {
+      return cid.hash_to_shard(1);
+    }
+    inline void set_used(uint64_t offset, uint64_t len,
+			 const coll_t& cid, const ghobject_t& oid) {
+      ceph_assert(granularity); // initialized
+      
+      // can't call this func after filter_out has been applied
+      ceph_assert(!was_filtered_out);
+      if (!len) {
+	return;
+      }
+      auto pos = offset / granularity;
+      auto end_pos = (offset + len - 1) / granularity;
+      while (pos <= end_pos) {
+        collections_bfs[pos].insert(get_hash(cid));
+        objects_bfs[pos].insert(oid.hobj.get_hash());
+        ++pos;
+      }
+    }
+    // filter-out entries unrelated to the specified(broken) extents.
+    // 'is_used' calls are permitted after that only
+    size_t filter_out(const fsck_interval& extents);
+
+    // determines if collection's present after filtering-out 
+    inline bool is_used(const coll_t& cid) const {
+      ceph_assert(was_filtered_out);
+      for(auto& bf : collections_bfs) {
+        if (bf.contains(get_hash(cid))) {
+          return true;
+        }
+      }
+      return false;
+    }
+    // determines if object's present after filtering-out 
+    inline bool is_used(const ghobject_t& oid) const {
+      ceph_assert(was_filtered_out);
+      for(auto& bf : objects_bfs) {
+        if (bf.contains(oid.hobj.get_hash())) {
+          return true;
+        }
+      }
+      return false;
+    }
+    // determines if collection's present before filtering-out 
+    inline bool is_used(const coll_t& cid, uint64_t offs) const {
+      ceph_assert(granularity); // initialized
+      ceph_assert(!was_filtered_out);
+      auto &bf = collections_bfs[offs / granularity];
+      if (bf.contains(get_hash(cid))) {
+        return true;
+      }
+      return false;
+    }
+    // determines if object's present before filtering-out 
+    inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
+      ceph_assert(granularity); // initialized
+      ceph_assert(!was_filtered_out);
+      auto &bf = objects_bfs[offs / granularity];
+      if (bf.contains(oid.hobj.get_hash())) {
+        return true;
+      }
+      return false;
+    }
+  };
+
+public:
+  void fix_per_pool_omap(KeyValueDB *db, int);
+  bool remove_key(KeyValueDB *db, const std::string& prefix, const std::string& key);
+  bool fix_shared_blob(KeyValueDB::Transaction txn,
+			uint64_t sbid,
+			bluestore_extent_ref_map_t* ref_map,
+			size_t repaired = 1);
+  bool fix_statfs(KeyValueDB *db, const std::string& key,
+    const store_statfs_t& new_statfs);
+
+  bool fix_leaked(KeyValueDB *db,
+		  FreelistManager* fm,
+		  uint64_t offset, uint64_t len);
+  bool fix_false_free(KeyValueDB *db,
+		      FreelistManager* fm,
+		      uint64_t offset, uint64_t len);
+  bool fix_spanning_blobs(
+    KeyValueDB* db,
+    std::function<void(KeyValueDB::Transaction)> f);
+
+  bool preprocess_misreference(KeyValueDB *db);
+
+  unsigned apply(KeyValueDB* db);
+
+  void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
+    std::lock_guard l(lock);
+    misreferenced_extents.union_insert(offs, len);
+    if (inc_error) {
+      ++to_repair_cnt;
+    }
+  }
+  //////////////////////
+  //In fact two methods below are the only ones in this class which are thread-safe!!
+  void inc_repaired(size_t n = 1) {
+    to_repair_cnt += n;
+  }
+  void request_compaction() {
+    need_compact = true;
+  }
+  //////////////////////
+
+  void init_space_usage_tracker(
+    uint64_t total_space, uint64_t lres_tracking_unit_size)
+  {
+    //NB: not for use in multithreading mode!!!
+    space_usage_tracker.init(total_space, lres_tracking_unit_size);
+  }
+  void set_space_used(uint64_t offset, uint64_t len,
+    const coll_t& cid, const ghobject_t& oid) {
+    std::lock_guard l(lock);
+    space_usage_tracker.set_used(offset, len, cid, oid);
+  }
+  inline bool is_used(const coll_t& cid) const {
+    //NB: not for use in multithreading mode!!!
+    return space_usage_tracker.is_used(cid);
+  }
+  inline bool is_used(const ghobject_t& oid) const {
+    //NB: not for use in multithreading mode!!!
+    return space_usage_tracker.is_used(oid);
+  }
+
+  const fsck_interval& get_misreferences() const {
+    //NB: not for use in multithreading mode!!!
+    return misreferenced_extents;
+  }
+  KeyValueDB::Transaction get_fix_misreferences_txn() {
+    //NB: not for use in multithreading mode!!!
+    return fix_misreferences_txn;
+  }
+
+private:
+  std::atomic<unsigned> to_repair_cnt = { 0 };
+  std::atomic<bool> need_compact = { false };
+  KeyValueDB::Transaction fix_per_pool_omap_txn;
+  KeyValueDB::Transaction fix_fm_leaked_txn;
+  KeyValueDB::Transaction fix_fm_false_free_txn;
+  KeyValueDB::Transaction remove_key_txn;
+  KeyValueDB::Transaction fix_statfs_txn;
+  KeyValueDB::Transaction fix_shared_blob_txn;
+
+  KeyValueDB::Transaction fix_misreferences_txn;
+  KeyValueDB::Transaction fix_onode_txn;
+
+  StoreSpaceTracker space_usage_tracker;
+
+  // non-shared extents with multiple references
+  fsck_interval misreferenced_extents;
+
+};
+
+// BlueFSVolumeSelector implementation that keeps per-level / per-device usage
+// statistics (current values and observed maximums) in atomic counters.
+// A "hint" is a LEVEL_* value smuggled through a void*; nullptr means "unset".
+class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
+{
+  // Fixed-size MaxX x MaxY table of T with bounds-asserting element access.
+  template <class T, size_t MaxX, size_t MaxY>
+  class matrix_2d {
+    T values[MaxX][MaxY];
+  public:
+    matrix_2d() {
+      clear();
+    }
+    T& at(size_t x, size_t y) {
+      ceph_assert(x < MaxX);
+      ceph_assert(y < MaxY);
+
+      return values[x][y];
+    }
+    size_t get_max_x() const {
+      return MaxX;
+    }
+    size_t get_max_y() const {
+      return MaxY;
+    }
+    // zero-fills the whole table
+    void clear() {
+      memset(values, 0, sizeof(values));
+    }
+  };
+
+  enum {
+    // use 0/nullptr as unset indication
+    LEVEL_FIRST = 1,
+    LEVEL_LOG = LEVEL_FIRST, // BlueFS log
+    LEVEL_WAL,
+    LEVEL_DB,
+    LEVEL_SLOW,
+    LEVEL_MAX
+  };
+  // add +1 row for corresponding per-device totals
+  // add +1 column for per-level actual (taken from file size) total
+  typedef matrix_2d<std::atomic<uint64_t>, BlueFS::MAX_BDEV + 1, LEVEL_MAX - LEVEL_FIRST + 1> per_level_per_dev_usage_t;
+
+  per_level_per_dev_usage_t per_level_per_dev_usage;
+  // file count per level, add +1 to keep total file count
+  std::atomic<uint64_t> per_level_files[LEVEL_MAX - LEVEL_FIRST + 1] = { 0 };
+
+  // Note: maximum per-device totals below might be smaller than corresponding
+  // perf counters by up to a single alloc unit (1M) due to superblock extent.
+  // The latter is not accounted here.
+  per_level_per_dev_usage_t per_level_per_dev_max;
+
+  uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
+  uint64_t db_avail4slow = 0;
+  enum {
+    OLD_POLICY,
+    USE_SOME_EXTRA
+  };
+
+  // Adds 'delta' to the usage counter at (dev, pos) and raises the matching
+  // maximum if the new usage exceeds it.
+  // The maximum is updated with a compare-exchange loop so that a larger
+  // value stored concurrently by another thread is never overwritten by a
+  // smaller one (a plain exchange() after the comparison could do that).
+  void _add_and_track_max(size_t dev, size_t pos, uint64_t delta) {
+    auto& cur = per_level_per_dev_usage.at(dev, pos);
+    auto& max = per_level_per_dev_max.at(dev, pos);
+    const uint64_t v = cur.fetch_add(delta) + delta;
+    uint64_t seen = max.load();
+    while (v > seen && !max.compare_exchange_weak(seen, v)) {
+      // 'seen' has been refreshed with the current maximum; re-check and retry
+    }
+  }
+
+public:
+  // Stores the per-level totals and, when new_pol is set, computes how much
+  // of the DB volume may be used for data that would otherwise go to the
+  // slow device (db_avail4slow).
+  RocksDBBlueFSVolumeSelector(
+    uint64_t _wal_total,
+    uint64_t _db_total,
+    uint64_t _slow_total,
+    uint64_t _level0_size,
+    uint64_t _level_base,
+    uint64_t _level_multiplier,
+    double reserved_factor,
+    uint64_t reserved,
+    bool new_pol)
+  {
+    l_totals[LEVEL_LOG - LEVEL_FIRST] = 0; // not used at the moment
+    l_totals[LEVEL_WAL - LEVEL_FIRST] = _wal_total;
+    l_totals[LEVEL_DB - LEVEL_FIRST] = _db_total;
+    l_totals[LEVEL_SLOW - LEVEL_FIRST] = _slow_total;
+
+    if (!new_pol) {
+      return;
+    }
+
+    // Calculating how much extra space is available at DB volume.
+    // Depending on the presence of explicit reserved size specification it might be either
+    // * DB volume size - reserved
+    // or
+    // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
+    if (!reserved) {
+      // NOTE(review): the loop relies on the threshold growing every round;
+      // a zero _level_base or _level_multiplier looks like it could prevent
+      // that - confirm callers never pass such values.
+      uint64_t prev_levels = _level0_size;
+      uint64_t cur_level = _level_base;
+      uint64_t cur_threshold = 0;
+      do {
+        uint64_t next_level = cur_level * _level_multiplier;
+        uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+        if (_db_total <= next_threshold) {
+          db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+          break;
+        } else {
+          prev_levels += cur_level;
+          cur_level = next_level;
+          cur_threshold = next_threshold;
+        }
+      } while (true);
+    } else {
+      // clamp at zero: an unsigned subtraction would wrap around to a huge
+      // value when the reservation exceeds the DB volume size
+      db_avail4slow = reserved < _db_total ? _db_total - reserved : 0;
+    }
+  }
+
+  void* get_hint_for_log() const override {
+    return  reinterpret_cast<void*>(LEVEL_LOG);
+  }
+  void* get_hint_by_dir(std::string_view dirname) const override;
+
+  // Accounts a whole file: every extent is added to its (device, level) cell
+  // and to the per-device total, the file size goes to the per-level actual
+  // total, and the per-level and overall file counters are incremented.
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    for (auto& p : fnode.extents) {
+      _add_and_track_max(p.bdev, pos, p.length);
+      //update per-device totals
+      _add_and_track_max(p.bdev, LEVEL_MAX - LEVEL_FIRST, p.length);
+    }
+    //update per-level actual totals
+    _add_and_track_max(BlueFS::MAX_BDEV, pos, fnode.size);
+    ++per_level_files[pos];
+    ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
+  }
+  // Reverts add_usage(hint, fnode); asserts that no counter would go negative.
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    for (auto& p : fnode.extents) {
+      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
+      ceph_assert(cur >= p.length);
+      cur -= p.length;
+
+      //update per-device totals
+      auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
+      ceph_assert(cur2 >= p.length);
+      cur2 -= p.length;
+    }
+    //update per-level actual totals
+    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+    ceph_assert(cur >= fnode.size);
+    cur -= fnode.size;
+    ceph_assert(per_level_files[pos] > 0);
+    --per_level_files[pos];
+    ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+    --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+  }
+  // Grows the per-level actual total by size_more bytes.
+  void add_usage(void* hint, uint64_t size_more) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    //update per-level actual totals
+    _add_and_track_max(BlueFS::MAX_BDEV, pos, size_more);
+  }
+  // Shrinks the per-level actual total by size_less bytes.
+  void sub_usage(void* hint, uint64_t size_less) override {
+    if (hint == nullptr)
+      return;
+    size_t pos = (size_t)hint - LEVEL_FIRST;
+    //update per-level actual totals
+    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
+    ceph_assert(cur >= size_less);
+    cur -= size_less;
+  }
+
+  uint8_t select_prefer_bdev(void* h) override;
+  void get_paths(
+    const std::string& base,
+    BlueFSVolumeSelector::paths& res) const override;
+
+  void dump(std::ostream& sout) override;
+  BlueFSVolumeSelector* clone_empty() const override;
+  bool compare(BlueFSVolumeSelector* other) override;
+};
+
+#endif
diff --git a/src/os/bluestore/BtreeAllocator.cc b/src/os/bluestore/BtreeAllocator.cc
new file mode 100644
index 000000000..2455ec111
--- /dev/null
+++ b/src/os/bluestore/BtreeAllocator.cc
@@ -0,0 +1,471 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "BtreeAllocator.h"
+
+#include <bit>
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef  dout_prefix
+#define dout_prefix *_dout << "BtreeAllocator "
+
+/*
+ * First-fit helper: walks the offset-ordered range tree starting at *cursor
+ * and, if nothing fits there, wraps around once and scans from the beginning
+ * up to the starting position. Returns the start of the first free range
+ * that can hold `size` bytes and advances *cursor past the picked block;
+ * returns -1ULL (leaving *cursor untouched) when no range fits.
+ * `align` is accepted but not consulted.
+ */
+uint64_t BtreeAllocator::_pick_block_after(uint64_t *cursor,
+                                           uint64_t size,
+                                           uint64_t align)
+{
+  const auto pivot = range_tree.lower_bound(*cursor);
+
+  // scan [from, to) for the first range large enough
+  auto scan = [&](auto from, auto to) -> uint64_t {
+    for (; from != to; ++from) {
+      const uint64_t begin = from->first;
+      if (begin + size <= from->second) {
+        *cursor = begin + size;
+        return begin;
+      }
+    }
+    return -1ULL;
+  };
+
+  const uint64_t found = scan(pivot, range_tree.end());
+  if (found != -1ULL || *cursor == 0) {
+    // either a hit, or the first pass already covered the whole tree
+    return found;
+  }
+  // reached the end: retry from the beginning up to the original position
+  return scan(range_tree.begin(), pivot);
+}
+
+// Best-fit helper: consults the size-ordered tree and returns the start of
+// the smallest free range able to hold `size` bytes, or -1ULL if there is
+// none. `align` is accepted but not consulted.
+uint64_t BtreeAllocator::_pick_block_fits(uint64_t size,
+                                        uint64_t align)
+{
+  // the first candidate is the smallest range whose size is >= the request
+  auto candidate = range_size_tree.lower_bound(range_value_t{0,size});
+  while (candidate != range_size_tree.end()) {
+    const uint64_t begin = candidate->start;
+    if (begin + size <= candidate->start + candidate->size) {
+      return begin;
+    }
+    ++candidate;
+  }
+  return -1ULL;
+}
+
+void BtreeAllocator::_add_to_tree(uint64_t start, uint64_t size) // marks [start, start+size) free in both trees, merging with directly adjacent free segments, and adds size to num_free
+{
+  ceph_assert(size != 0);
+
+  uint64_t end = start + size;
+
+  auto rs_after = range_tree.upper_bound(start); // first segment starting after 'start', if any
+
+  /* Locate the preceding neighbour (if any); note that overlap is not actually verified here */
+  auto rs_before = range_tree.end();
+  if (rs_after != range_tree.begin()) {
+    rs_before = std::prev(rs_after);
+  }
+
+  bool merge_before = (rs_before != range_tree.end() && rs_before->second == start); // previous segment ends exactly where the new one starts
+  bool merge_after = (rs_after != range_tree.end() && rs_after->first == end); // next segment starts exactly where the new one ends
+
+  if (merge_before && merge_after) {
+    // | before   |//////| after |
+    // | before >>>>>>>>>>>>>>>  |
+    range_seg_t seg_before{rs_before->first, rs_before->second};
+    range_seg_t seg_after{rs_after->first, rs_after->second};
+    // expand the head seg before rs_{before,after} are invalidated
+    rs_before->second = seg_after.end;
+    // remove the tail seg from offset tree
+    range_tree.erase(rs_after);
+    // remove the head and tail seg from size tree
+    range_size_tree.erase(seg_before);
+    range_size_tree.erase(seg_after);
+    // insert the merged seg into size tree
+    range_size_tree.emplace(seg_before.start, seg_after.end);
+  } else if (merge_before) {
+    // | before   |//////|
+    // | before >>>>>>>> |
+    // remove the head seg from the size tree
+    range_seg_t seg_before{rs_before->first, rs_before->second};
+    range_size_tree.erase(seg_before);
+    // expand the head seg in the offset tree
+    rs_before->second = end;
+    // insert the merged seg into size tree
+    range_size_tree.emplace(seg_before.start, end);
+  } else if (merge_after) {
+    // |//////| after |
+    // | merge after  |
+    // remove the tail seg from size tree
+    range_seg_t seg_after{rs_after->first, rs_after->second};
+    range_size_tree.erase(seg_after);
+    // remove the tail seg from offset tree
+    range_tree.erase(rs_after);
+    // insert the merged seg (the key changes from 'end' to 'start', hence erase + insert)
+    range_tree.emplace(start, seg_after.end);
+    range_size_tree.emplace(start, seg_after.end);
+  } else {
+    // no neighbours
+    range_tree.emplace_hint(rs_after, start, end);
+    range_size_tree.emplace(start, end);
+  }
+  num_free += size;
+}
+
+/*
+ * Removes [start, end) from the free segment referenced by rs and keeps both
+ * trees and num_free consistent. The caller guarantees that
+ * rs->first <= start and end <= rs->second.
+ *
+ * rs must be treated as invalid once this function returns: the btree
+ * containers may invalidate iterators on every insertion and erasure.
+ * For the same reason rs is always used *before* the offset tree is modified.
+ */
+void BtreeAllocator::_process_range_removal(uint64_t start, uint64_t end,
+  BtreeAllocator::range_tree_t::iterator& rs)
+{
+  bool left_over = (rs->first != start);
+  bool right_over = (rs->second != end);
+
+  range_seg_t seg_whole{rs->first, rs->second};
+  range_size_tree.erase(seg_whole);
+
+  // | left <|////|  right |
+  if (left_over && right_over) {
+    // shrink the left seg in the offset tree first, while rs is still valid
+    rs->second = start;
+    // insert the shrunk left seg back into size tree
+    range_size_tree.emplace(seg_whole.start, start);
+    // add the spin-off right seg; rs must not be touched after this point
+    range_seg_t seg_after{end, seg_whole.end};
+    range_tree.emplace(seg_after.start, seg_after.end);
+    range_size_tree.emplace(seg_after);
+  } else if (left_over) {
+    // | left <|///////////|
+    // shrink the left seg in the offset tree
+    rs->second = start;
+    // insert the shrunk left seg back into size tree
+    range_size_tree.emplace(seg_whole.start, start);
+  } else if (right_over) {
+    // |//////////| right |
+    // remove the whole seg from offset tree
+    range_tree.erase(rs);
+    // add the spin-off right seg
+    range_seg_t seg_after{end, seg_whole.end};
+    range_tree.emplace(seg_after.start, seg_after.end);
+    range_size_tree.emplace(seg_after);
+  } else {
+    range_tree.erase(rs);
+  }
+  num_free -= (end - start);
+}
+
+/*
+ * Removes [start, start + size) from the free space. The range must be fully
+ * covered by a single free segment (asserted); it may begin anywhere inside
+ * that segment, not only at its first byte.
+ */
+void BtreeAllocator::_remove_from_tree(uint64_t start, uint64_t size)
+{
+  uint64_t end = start + size;
+
+  ceph_assert(size != 0);
+  ceph_assert(size <= num_free);
+
+  // The offset tree is keyed by segment start, so an exact-key lookup would
+  // miss a segment that merely contains 'start'. Take the last segment whose
+  // start offset is <= start instead.
+  auto rs = range_tree.upper_bound(start);
+  /* Make sure we completely overlap with someone */
+  ceph_assert(rs != range_tree.begin());
+  --rs;
+  ceph_assert(rs->first <= start);
+  ceph_assert(rs->second >= end);
+
+  _process_range_removal(start, end, rs);
+}
+
+/*
+ * Removes whatever part of [start, start + size) is currently free and
+ * reports the outcome piecewise, in ascending offset order, through
+ * cb(offset, length, found): found == true for sub-ranges that were free
+ * (and have just been removed), false for sub-ranges that were not free.
+ */
+void BtreeAllocator::_try_remove_from_tree(uint64_t start, uint64_t size,
+  std::function<void(uint64_t, uint64_t, bool)> cb)
+{
+  uint64_t end = start + size;
+
+  ceph_assert(size != 0);
+
+  // Find the first free segment that may intersect [start, end): either the
+  // one containing 'start' or, failing that, the first one starting after it.
+  auto rs = range_tree.upper_bound(start);
+  if (rs != range_tree.begin()) {
+    auto prev_rs = std::prev(rs);
+    if (prev_rs->second > start) {
+      rs = prev_rs;
+    }
+  }
+
+  if (rs == range_tree.end() || rs->first >= end) {
+    cb(start, size, false);
+    return;
+  }
+
+  do {
+    if (start < rs->first) {
+      // gap in front of the segment: not free
+      cb(start, rs->first - start, false);
+      start = rs->first;
+    }
+    auto range_end = std::min(rs->second, end);
+    _process_range_removal(start, range_end, rs);
+    cb(start, range_end - start, true);
+    start = range_end;
+
+    // rs (and any iterator saved earlier) is invalid after the removal,
+    // so look the next candidate up again
+    rs = range_tree.lower_bound(start);
+  } while (rs != range_tree.end() && rs->first < end && start < end);
+  if (start < end) {
+    cb(start, end - start, false);
+  }
+}
+
+// Gathers extents until `want` bytes are collected or an allocation fails.
+// Every single extent is limited to max_alloc_size bytes.
+// Returns the number of bytes allocated (possibly less than `want`), or
+// -ENOSPC if nothing could be allocated at all.
+int64_t BtreeAllocator::_allocate(
+  uint64_t want,
+  uint64_t unit,
+  uint64_t max_alloc_size,
+  int64_t  hint, // unused, for now!
+  PExtentVector* extents)
+{
+  uint64_t allocated = 0;
+  for (;;) {
+    if (allocated >= want) {
+      break;
+    }
+    const uint64_t chunk = std::min(max_alloc_size, want - allocated);
+    uint64_t offset, length;
+    if (_allocate(chunk, unit, &offset, &length) < 0) {
+      // Allocation failed.
+      break;
+    }
+    extents->emplace_back(offset, length);
+    allocated += length;
+  }
+  assert(range_size_tree.size() == range_tree.size());
+  if (allocated == 0) {
+    return -ENOSPC;
+  }
+  return allocated;
+}
+
+// Allocates one extent of up to `size` bytes.
+// The request is first capped to the largest free range (rounded down to
+// `unit`); then either best-fit (size-ordered tree) or first-fit (offset
+// tree with per-size cursors) is used, halving the request after every miss
+// until it drops below `unit`.
+// On success stores the result in *offset / *length and returns 0;
+// returns -ENOSPC otherwise.
+int BtreeAllocator::_allocate(
+  uint64_t size,
+  uint64_t unit,
+  uint64_t *offset,
+  uint64_t *length)
+{
+  // the size tree is ordered by size, so its last entry is the largest range
+  uint64_t max_size = 0;
+  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
+    max_size = p->size;
+  }
+
+  bool force_range_size_alloc = false;
+  if (max_size < size) {
+    if (max_size < unit) {
+      return -ENOSPC;
+    }
+    size = p2align(max_size, unit);
+    ceph_assert(size > 0);
+    force_range_size_alloc = true;
+  }
+
+  const int free_pct = num_free * 100 / device_size;
+  /*
+   * If we're running low on space switch to using the size
+   * sorted B-tree (best-fit).
+   */
+  const bool best_fit =
+    force_range_size_alloc ||
+    max_size < range_size_alloc_threshold ||
+    free_pct < range_size_alloc_free_pct;
+
+  uint64_t start = 0;
+  do {
+    if (best_fit) {
+      start = _pick_block_fits(size, unit);
+      dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
+    } else {
+      /*
+       * Pick the cursor bucket from the request size. This is used to try
+       * to allocate blocks with similar size from the same area (i.e. same
+       * cursor bucket) but it does not guarantee that other allocations
+       * sizes may exist in the same region.
+       */
+      uint64_t* cursor = &lbas[cbits(size) - 1];
+      start = _pick_block_after(cursor, size, unit);
+      dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
+    }
+    if (start != uint64_t(-1ULL)) {
+      break;
+    }
+    // try to collect smaller extents as we could fail to retrieve
+    // that large block due to misaligned extents
+    size = p2align(size >> 1, unit);
+  } while (size >= unit);
+
+  if (start == -1ULL) {
+    return -ENOSPC;
+  }
+
+  _remove_from_tree(start, size);
+
+  *offset = start;
+  *length = size;
+  return 0;
+}
+
+// Returns every interval of release_set to the free trees.
+// Each interval must lie within the device (asserted).
+void BtreeAllocator::_release(const interval_set<uint64_t>& release_set)
+{
+  const auto last = release_set.end();
+  for (auto it = release_set.begin(); it != last; ++it) {
+    const auto off = it.get_start();
+    const auto len = it.get_len();
+    ceph_assert(off + len <= uint64_t(device_size));
+    ldout(cct, 10) << __func__ << std::hex
+      << " offset 0x" << off
+      << " length 0x" << len
+      << std::dec << dendl;
+    _add_to_tree(off, len);
+  }
+}
+
+// Returns every extent of release_set to the free trees.
+void BtreeAllocator::_release(const PExtentVector& release_set) {
+  for (const auto& extent : release_set) {
+    ldout(cct, 10) << __func__ << std::hex
+      << " offset 0x" << extent.offset
+      << " length 0x" << extent.length
+      << std::dec << dendl;
+    _add_to_tree(extent.offset, extent.length);
+  }
+}
+
+// Drops all entries from both range trees (no locking here).
+void BtreeAllocator::_shutdown()
+{
+  range_tree.clear();
+  range_size_tree.clear();
+}
+
+BtreeAllocator::BtreeAllocator(CephContext* cct, // ctor with an explicit memory budget for range entries
+			       int64_t device_size,
+			       int64_t block_size,
+			       uint64_t max_mem,
+			       std::string_view name) :
+  Allocator(name, device_size, block_size),
+  range_size_alloc_threshold( // the best-fit switch-over settings are read from the "avl" allocator options
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
+  range_size_alloc_free_pct(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+  range_count_cap(max_mem / sizeof(range_seg_t)), // max_mem == 0 yields a cap of 0
+  cct(cct)
+{}
+
+BtreeAllocator::BtreeAllocator(CephContext* cct, // public ctor: delegates to the max_mem variant with max_mem = 0
+			       int64_t device_size,
+			       int64_t block_size,
+			       std::string_view name) :
+  BtreeAllocator(cct, device_size, block_size, 0 /* max_mem */, name)
+{}
+
+// Clears the trees (under the lock) by going through shutdown().
+BtreeAllocator::~BtreeAllocator()
+{
+  this->shutdown();
+}
+
+// Public, locking entry point for allocation.
+// `unit` must be a power of two and `want` a multiple of it (asserted).
+// A zero max_alloc_size means "no per-extent limit other than want"; the
+// limit is additionally capped to what bluestore_pextent_t::length can hold.
+// Returns the number of bytes allocated or -ENOSPC (see _allocate()).
+int64_t BtreeAllocator::allocate(
+  uint64_t want,
+  uint64_t unit,
+  uint64_t max_alloc_size,
+  int64_t  hint, // unused, for now!
+  PExtentVector* extents)
+{
+  ldout(cct, 10) << __func__ << std::hex
+                 << " want 0x" << want
+                 << " unit 0x" << unit
+                 << " max_alloc_size 0x" << max_alloc_size
+                 << " hint 0x" << hint
+                 << std::dec << dendl;
+  ceph_assert(std::has_single_bit(unit));
+  ceph_assert(want % unit == 0);
+
+  // largest length representable by a single physical extent
+  constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+  if (max_alloc_size == 0) {
+    max_alloc_size = want;
+  }
+  if (max_alloc_size >= cap) {
+    max_alloc_size = p2align(uint64_t(cap), (uint64_t)block_size);
+  }
+
+  std::lock_guard locker(lock);
+  return _allocate(want, unit, max_alloc_size, hint, extents);
+}
+
+// Locking wrapper around _release().
+void BtreeAllocator::release(const interval_set<uint64_t>& release_set) {
+  std::lock_guard locker(lock);
+  _release(release_set);
+}
+
+// Returns the total number of free bytes, read under the lock.
+uint64_t BtreeAllocator::get_free()
+{
+  std::lock_guard locker(lock);
+  const uint64_t free_bytes = num_free;
+  return free_bytes;
+}
+
+// Locking wrapper around _get_fragmentation().
+double BtreeAllocator::get_fragmentation()
+{
+  std::lock_guard locker(lock);
+  const double score = _get_fragmentation();
+  return score;
+}
+
+// Locking wrapper around _dump().
+void BtreeAllocator::dump()
+{
+  std::lock_guard locker(lock);
+  _dump();
+}
+
+// Logs (at level 0) every entry of the offset tree as "0x<start>~<end>" and
+// every entry of the size tree as "0x<size>@<start>", all values in hex.
+void BtreeAllocator::_dump() const
+{
+  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
+  for (const auto& seg : range_tree) {
+    ldout(cct, 0) << std::hex
+      << "0x" << seg.first << "~" << seg.second
+      << std::dec
+      << dendl;
+  }
+
+  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
+  for (const auto& entry : range_size_tree) {
+    ldout(cct, 0) << std::hex
+      << "0x" << entry.size << "@" << entry.start
+      << std::dec
+      << dendl;
+  }
+}
+
+// Calls notify(offset, length) for every free segment in ascending offset
+// order. The lock is held for the whole walk.
+void BtreeAllocator::foreach(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+  std::lock_guard locker(lock);
+  for (const auto& [seg_start, seg_end] : range_tree) {
+    notify(seg_start, seg_end - seg_start);
+  }
+}
+
+// Marks [offset, offset + length) as free. Empty ranges are ignored;
+// the range must lie within the device (asserted).
+void BtreeAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+  if (length == 0) {
+    return;
+  }
+  std::lock_guard locker(lock);
+  ceph_assert(offset + length <= uint64_t(device_size));
+  ldout(cct, 10) << __func__ << std::hex
+                 << " offset 0x" << offset
+                 << " length 0x" << length
+                 << std::dec << dendl;
+  _add_to_tree(offset, length);
+}
+
+// Marks [offset, offset + length) as no longer free. Empty ranges are
+// ignored; the range must lie within the device (asserted).
+void BtreeAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+  if (length == 0) {
+    return;
+  }
+  std::lock_guard locker(lock);
+  ceph_assert(offset + length <= uint64_t(device_size));
+  ldout(cct, 10) << __func__ << std::hex
+                 << " offset 0x" << offset
+                 << " length 0x" << length
+                 << std::dec << dendl;
+  _remove_from_tree(offset, length);
+}
+
+// Locking wrapper around _shutdown().
+void BtreeAllocator::shutdown()
+{
+  std::lock_guard locker(lock);
+  _shutdown();
+}
diff --git a/src/os/bluestore/BtreeAllocator.h b/src/os/bluestore/BtreeAllocator.h
new file mode 100644
index 000000000..4561d9f4c
--- /dev/null
+++ b/src/os/bluestore/BtreeAllocator.h
@@ -0,0 +1,200 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+#include "include/cpp-btree/btree_map.h"
+#include "include/cpp-btree/btree_set.h"
+#include "Allocator.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+
+class BtreeAllocator : public Allocator { // allocator keeping free space in two btrees: one keyed by offset, one ordered by (size, start)
+  struct range_seg_t {
+    uint64_t start;   ///< starting offset of this segment
+    uint64_t end;     ///< ending offset (non-inclusive)
+
+    range_seg_t(uint64_t start, uint64_t end)
+      : start{start},
+        end{end}
+    {}
+    inline uint64_t length() const {
+      return end - start;
+    }
+  };
+
+  struct range_value_t { // element of the size-ordered tree; note that the ctor takes (start, end) but stores (size, start)
+    uint64_t size;
+    uint64_t start;
+    range_value_t(uint64_t start, uint64_t end)
+      : size{end - start},
+        start{start}
+    {}
+    range_value_t(const range_seg_t& rs)
+      : size{rs.length()},
+        start{rs.start}
+    {}
+  };
+  // three-way comparator: orders by size first, then by start offset
+  struct compare_range_value_t {
+    int operator()(const range_value_t& lhs,
+                   const range_value_t& rhs) const noexcept {
+      if (lhs.size < rhs.size) {
+        return -1;
+      } else if (lhs.size > rhs.size) {
+        return 1;
+      }
+      if (lhs.start < rhs.start) {
+        return -1;
+      } else if (lhs.start > rhs.start) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  };
+protected:
+  /*
+  * ctor intended for the usage from descendant class(es) which
+  * provides handling for spilled over entries
+  * (when entry count >= max_entries)
+  */
+  BtreeAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+    uint64_t max_mem,
+    std::string_view name);
+
+public:
+  BtreeAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
+                 std::string_view name);
+  ~BtreeAllocator();
+  const char* get_type() const override
+  {
+    return "btree";
+  }
+  int64_t allocate(
+    uint64_t want,
+    uint64_t unit,
+    uint64_t max_alloc_size,
+    int64_t  hint,
+    PExtentVector *extents) override;
+  void release(const interval_set<uint64_t>& release_set) override;
+  uint64_t get_free() override;
+  double get_fragmentation() override;
+
+  void dump() override;
+  void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) override;
+  void init_add_free(uint64_t offset, uint64_t length) override;
+  void init_rm_free(uint64_t offset, uint64_t length) override;
+  void shutdown() override;
+
+private:
+  // pick a range by search from cursor forward
+  uint64_t _pick_block_after(
+    uint64_t *cursor,
+    uint64_t size,
+    uint64_t align);
+  // pick a range with exactly the same size or larger
+  uint64_t _pick_block_fits(
+    uint64_t size,
+    uint64_t align);
+  int _allocate( // allocates a single extent; results are returned through offset/length
+    uint64_t size,
+    uint64_t unit,
+    uint64_t *offset,
+    uint64_t *length);
+
+  template<class T>
+  using pool_allocator = mempool::bluestore_alloc::pool_allocator<T>;
+  using range_tree_t =
+    btree::btree_map<
+      uint64_t /* start */,
+      uint64_t /* end */,
+      std::less<uint64_t>,
+      pool_allocator<std::pair<uint64_t, uint64_t>>>;
+  range_tree_t range_tree;    ///< main range tree
+  /*
+   * The range_size_tree should always contain the
+   * same number of segments as the range_tree.
+   * The only difference is that the range_size_tree
+   * is ordered by segment sizes.
+   */
+  using range_size_tree_t =
+    btree::btree_set<
+      range_value_t /* size, start */,
+      compare_range_value_t,
+      pool_allocator<range_value_t>>;
+  range_size_tree_t range_size_tree;
+
+  uint64_t num_free = 0;     ///< total bytes in freelist
+
+  /*
+   * This value defines the number of elements in the lbas array.
+   * The value of 64 was chosen as it covers all power of 2 buckets
+   * up to UINT64_MAX.
+   * This is the equivalent of highest-bit of UINT64_MAX.
+   */
+  static constexpr unsigned MAX_LBAS = 64;
+  uint64_t lbas[MAX_LBAS] = {0}; // first-fit search cursors, one per size bucket
+
+  /*
+   * Minimum size which forces the dynamic allocator to change
+   * its allocation strategy.  Once the allocator cannot satisfy
+   * an allocation of this size then it switches to using more
+   * aggressive strategy (i.e search by size rather than offset).
+   */
+  uint64_t range_size_alloc_threshold = 0;
+  /*
+   * The minimum free space, in percent, which must be available
+   * in allocator to continue allocations in a first-fit fashion.
+   * Once the allocator's free space drops below this level we dynamically
+   * switch to using best-fit allocations.
+   */
+  int range_size_alloc_free_pct = 0;
+
+  /*
+  * Max amount of range entries allowed. 0 - unlimited
+  */
+  int64_t range_count_cap = 0;
+
+private:
+  CephContext* cct;
+  std::mutex lock; // taken by the public entry points; the underscore-prefixed helpers do not lock
+
+  double _get_fragmentation() const { // (number of free segments - 1) / (number of free blocks - 1); 0 when at most one block is free
+    auto free_blocks = p2align(num_free, (uint64_t)block_size) / block_size;
+    if (free_blocks <= 1) {
+      return .0;
+    }
+    return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
+  }
+  void _dump() const;
+
+  uint64_t _lowest_size_available() const { // size of the smallest free segment, 0 if there is none
+    auto rs = range_size_tree.begin();
+    return rs != range_size_tree.end() ? rs->size : 0;
+  }
+
+  int64_t _allocate( // allocates up to 'want' bytes as a list of extents
+    uint64_t want,
+    uint64_t unit,
+    uint64_t max_alloc_size,
+    int64_t  hint,
+    PExtentVector *extents);
+
+  void _release(const interval_set<uint64_t>& release_set);
+  void _release(const PExtentVector&  release_set);
+  void _shutdown();
+
+  // called when extent to be released/marked free
+  void _add_to_tree(uint64_t start, uint64_t size);
+  void _process_range_removal(uint64_t start, uint64_t end, range_tree_t::iterator& rs);
+  void _remove_from_tree(uint64_t start, uint64_t size);
+  void _try_remove_from_tree(uint64_t start, uint64_t size,
+    std::function<void(uint64_t offset, uint64_t length, bool found)> cb);
+
+  uint64_t _get_free() const {
+    return num_free;
+  }
+};
diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc
new file mode 100644
index 000000000..69866fa40
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.cc
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "FreelistManager.h"
+#include "BitmapFreelistManager.h"
+#ifdef HAVE_LIBZBD
+#include "ZonedFreelistManager.h"
+#endif
+
+// Factory for the freelist manager implementation selected by `type`:
+// "bitmap", "null" (bitmap manager flagged via set_null_manager()) or, when
+// built with HAVE_LIBZBD, "zoned".  `prefix` must be "B" (asserted for every
+// type).  Returns a heap-allocated manager, or nullptr for an unknown type.
+FreelistManager *FreelistManager::create(
+  CephContext* cct,
+  std::string type,
+  std::string prefix)
+{
+  // a bit of a hack... we hard-code the prefixes here.  we need to
+  // put the freelistmanagers in different prefixes because the merge
+  // op is per prefix, has to be done pre-db-open, and we don't know the
+  // freelist type until after we open the db.
+  ceph_assert(prefix == "B");
+  if (type == "bitmap" || type == "null") {
+    auto *fm = new BitmapFreelistManager(cct, "B", "b");
+    if (type == "null") {
+      // the null option stops allocations from going to RocksDB
+      fm->set_null_manager();
+    }
+    return fm;
+  }
+#ifdef HAVE_LIBZBD
+  // With zoned drives there is only one FreelistManager implementation that we
+  // can use, and we also know if a drive is zoned right after opening it
+  // (BlueStore::_open_bdev).  Hence, freelist_type is set to "zoned" whenever
+  // the opened device turns out to be zoned.  The hard-coded zoned prefixes
+  // below are used instead of the bitmap ones.
+  if (type == "zoned")
+    return new ZonedFreelistManager(cct, "Z", "z");
+#endif
+  return nullptr;
+}
+
+// Registers the merge operator of the freelist implementation for `type`:
+// the zoned one for "zoned" (HAVE_LIBZBD builds only), else the bitmap one.
+void FreelistManager::setup_merge_operators(KeyValueDB *db, const std::string& type)
+{
+#ifdef HAVE_LIBZBD
+  if (type == "zoned") { ZonedFreelistManager::setup_merge_operator(db, "z"); return; }
+#endif
+  // every other type uses the bitmap merge operator
+  BitmapFreelistManager::setup_merge_operator(db, "b");
+}
diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h
new file mode 100644
index 000000000..7f44fe957
--- /dev/null
+++ b/src/os/bluestore/FreelistManager.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_FREELISTMANAGER_H
+
+#include <string>
+#include <vector>
+#include <mutex>
+#include <ostream>
+#include "kv/KeyValueDB.h"
+#include "bluestore_types.h"
+
+class FreelistManager {  // abstract base: everything except the null-manager flag is pure virtual
+  bool         null_manager = false;  // only ever set to true, via set_null_manager()
+public:
+  CephContext* cct;
+  explicit FreelistManager(CephContext* cct) : cct(cct) {}
+  virtual ~FreelistManager() {}
+  // Factory: returns the implementation selected by `type`.
+  static FreelistManager *create(
+    CephContext* cct,
+    std::string type,
+    std::string prefix);
+  // Registers on `db` the merge operator that belongs to `type`.
+  static void setup_merge_operators(KeyValueDB *db, const std::string &type);
+  // Instance-level create.  NOTE(review): presumably records the initial layout in txn -- confirm.
+  virtual int create(uint64_t size, uint64_t granularity,
+		     uint64_t zone_size, uint64_t first_sequential_zone,
+		     KeyValueDB::Transaction txn) = 0;
+  // cfg_reader receives a key and an output string; the meaning of its int result is not visible here.
+  virtual int init(KeyValueDB *kvdb, bool db_in_read_only,
+    std::function<int(const std::string&, std::string*)> cfg_reader) = 0;
+  virtual void sync(KeyValueDB* kvdb) = 0;
+  virtual void shutdown() = 0;
+
+  virtual void dump(KeyValueDB *kvdb) = 0;
+  // Iteration API: reset, then call enumerate_next until it returns false (presumably -- confirm).
+  virtual void enumerate_reset() = 0;
+  virtual bool enumerate_next(KeyValueDB *kvdb, uint64_t *offset, uint64_t *length) = 0;
+  // allocate/release take an offset~length extent plus the KV transaction to use.
+  virtual void allocate(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) = 0;
+  virtual void release(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) = 0;
+
+  virtual uint64_t get_size() const = 0;
+  virtual uint64_t get_alloc_units() const = 0;
+  virtual uint64_t get_alloc_size() const = 0;
+  // Appends key/value string pairs to the given vector; units of target_size are not visible here.
+  virtual void get_meta(uint64_t target_size,
+  std::vector<std::pair<std::string, std::string>>*) const = 0;
+  // Null-manager flag; create() sets it for type "null".
+  void set_null_manager() {
+    null_manager = true;
+  }
+  bool is_null_manager() const {
+    return null_manager;
+  }
+};
+
+
+#endif
diff --git a/src/os/bluestore/HybridAllocator.cc b/src/os/bluestore/HybridAllocator.cc
new file mode 100644
index 000000000..2201d5958
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "HybridAllocator.h"
+
+#include <bit>
+#include <limits>
+
+#include "common/config_proxy.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef  dout_prefix
+#define dout_prefix *_dout << "HybridAllocator "
+
+
+// Allocate up to `want` bytes and append the resulting extents to `extents`.
+// Two back-ends are combined: the inherited AVL tree (_allocate) and the
+// optional fallback bitmap allocator (bmap_alloc).  The bitmap goes first
+// only when it has free space and `want` is below the smallest AVL range;
+// otherwise the AVL tree goes first.  If the first back-end fails, what it
+// appended is released again; any shortfall is requested from the other one.
+// max_alloc_size == 0 means "up to want"; it is also capped so that an extent
+// length fits bluestore_pextent_t::length.
+// Returns bytes allocated, -ENOSPC if none, or the second back-end's negative
+// result, in which case the caller releases what was appended to `extents`.
+int64_t HybridAllocator::allocate(
+  uint64_t want,
+  uint64_t unit,
+  uint64_t max_alloc_size,
+  int64_t  hint,
+  PExtentVector* extents)
+{
+  ldout(cct, 10) << __func__ << std::hex
+                 << " want 0x" << want
+                 << " unit 0x" << unit
+                 << " max_alloc_size 0x" << max_alloc_size
+                 << " hint 0x" << hint
+                 << std::dec << dendl;
+  ceph_assert(std::has_single_bit(unit));
+  ceph_assert(want % unit == 0);
+
+  if (max_alloc_size == 0) {
+    max_alloc_size = want;
+  }
+  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
+      max_alloc_size >= cap) {
+    max_alloc_size = p2align(uint64_t(cap), (uint64_t)get_block_size());
+  }
+
+  std::lock_guard l(lock);
+
+  int64_t res;
+  PExtentVector local_extents;
+
+  // Remember how many extents the caller already had.  An index is kept
+  // instead of an iterator: the allocate calls below append to `extents`,
+  // and a reallocation of the vector would invalidate a saved iterator.
+  const auto orig_size = extents->size();
+
+  // try bitmap first to avoid unneeded contiguous extents split if
+  // desired amount is less than shortest range in AVL
+  if (bmap_alloc && bmap_alloc->get_free() &&
+    want < _lowest_size_available()) {
+    res = bmap_alloc->allocate(want, unit, max_alloc_size, hint, extents);
+    if (res < 0) {
+      // got a failure, release already allocated and
+      // start over allocation from avl
+      local_extents.insert(
+        local_extents.end(), extents->begin() + orig_size, extents->end());
+      extents->resize(orig_size);
+      bmap_alloc->release(local_extents);
+      res = 0;
+    }
+    if ((uint64_t)res < want) {
+      auto res2 = _allocate(want - res, unit, max_alloc_size, hint, extents);
+      if (res2 < 0) {
+        res = res2; // caller to do the release
+      } else {
+        res += res2;
+      }
+    }
+  } else {
+    res = _allocate(want, unit, max_alloc_size, hint, extents);
+    if (res < 0) {
+      // got a failure, release already allocated and
+      // start over allocation from bitmap
+      local_extents.insert(
+        local_extents.end(), extents->begin() + orig_size, extents->end());
+      extents->resize(orig_size);
+      _release(local_extents);
+      res = 0;
+    }
+    if ((uint64_t)res < want ) {
+      auto res2 = bmap_alloc ?
+        bmap_alloc->allocate(want - res, unit, max_alloc_size, hint, extents) :
+        0;
+      if (res2 < 0 ) {
+        res = res2; // caller to do the release
+      } else {
+        res += res2;
+      }
+    }
+  }
+  return res ? res : -ENOSPC;
+}
+
+// Return the ranges in `release_set` to the free pool while holding `lock`.
+void HybridAllocator::release(const interval_set<uint64_t>& release_set) {
+  std::lock_guard guard(lock);
+  // AVL tree is tried first, bitmap is the fallback (via _try_insert_range)
+  _release(release_set);
+}
+
+// Total free bytes: the AVL tree's count plus the fallback bitmap's, if any.
+uint64_t HybridAllocator::get_free() {
+  std::lock_guard guard(lock);
+  return _get_free() + (bmap_alloc ? bmap_alloc->get_free() : 0);
+}
+
+// Fragmentation of the whole allocator: the AVL score alone when the fallback
+// bitmap holds no free space, otherwise the two scores weighted by each
+// back-end's share of the total free bytes.
+double HybridAllocator::get_fragmentation()
+{
+  std::lock_guard guard(lock);
+  const auto avl_frag = AvlAllocator::_get_fragmentation();
+  const auto bmap_free = bmap_alloc ? bmap_alloc->get_free() : 0;
+  if (!bmap_free) return avl_frag;
+  const auto avl_free = _get_free();
+  const auto total_free = avl_free + bmap_free;
+  return avl_frag * avl_free / total_free + bmap_alloc->get_fragmentation() * bmap_free / total_free;
+}
+
+// Dump both back-ends (AVL tree, then the fallback bitmap if it exists),
+// followed by a one-line summary of their free byte counts.
+void HybridAllocator::dump()
+{
+  std::lock_guard guard(lock);
+  // per-backend details first
+  AvlAllocator::_dump();
+  if (bmap_alloc) bmap_alloc->dump();
+  // then the combined free-space summary
+  ldout(cct, 0) << __func__ << " avl_free: " << _get_free() << " bmap_free: "
+                << (bmap_alloc ? bmap_alloc->get_free() : 0) << dendl;
+}
+
+// Invoke `notify` for every extent the AVL tree reports and then for every
+// extent the fallback bitmap reports, if one exists; `lock` is held.
+void HybridAllocator::foreach(
+  std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+  std::lock_guard guard(lock);
+  AvlAllocator::_foreach(notify);
+  if (bmap_alloc) bmap_alloc->foreach(notify);
+}
+
+// Take offset~length out of the free pool.  _try_remove_from_tree reports
+// pieces through the callback; pieces flagged not-found in the AVL tree are
+// removed from the fallback bitmap instead.  A not-found piece with no bitmap
+// present is logged and trips an assertion.  A zero length is a no-op.
+void HybridAllocator::init_rm_free(uint64_t offset, uint64_t length)
+{
+  if (!length)
+    return;
+  std::lock_guard guard(lock);
+  ldout(cct, 10) << __func__ << std::hex
+                 << " offset 0x" << offset
+                 << " length 0x" << length
+                 << std::dec << dendl;
+  auto on_piece = [&](uint64_t o, uint64_t l, bool found) {
+    if (found) return;
+    if (bmap_alloc) { bmap_alloc->init_rm_free(o, l); return; }
+    lderr(cct) << "init_rm_free lambda " << std::hex
+      << "Uexpected extent: "
+      << " 0x" << o << "~" << l
+      << std::dec << dendl;
+    ceph_assert(false);
+  };
+  _try_remove_from_tree(offset, length, on_piece);
+}
+
+// Shut down the AVL part, then shut down and destroy the fallback bitmap.
+void HybridAllocator::shutdown()
+{
+  std::lock_guard guard(lock);
+  _shutdown();
+  if (!bmap_alloc) return;
+  bmap_alloc->shutdown();
+  delete bmap_alloc;
+  bmap_alloc = nullptr;
+}
+
+// Hand the range [start, end) over to the fallback bitmap allocator.
+// The bitmap allocator is created on first use, with this allocator's
+// capacity and block size and the name "<name>.fallback".
+// The range must be non-empty (asserted).
+void HybridAllocator::_spillover_range(uint64_t start, uint64_t end)
+{
+  const auto size = end - start;
+  dout(20) << __func__ << std::hex << " " << start << "~" << size
+    << std::dec << dendl;
+  ceph_assert(size);
+  if (!bmap_alloc) {
+    // lazily construct the fallback on the first spillover
+    dout(1) << __func__ << std::hex << " constructing fallback allocator"
+      << dendl;
+    bmap_alloc = new BitmapAllocator(cct,
+                                     get_capacity(),
+                                     get_block_size(),
+                                     get_name() + ".fallback");
+  }
+  bmap_alloc->init_add_free(start, size);
+}
+
+// Widen the range by what the bitmap claims at both edges, then insert it.
+void HybridAllocator::_add_to_tree(uint64_t start, uint64_t size)
+{
+  uint64_t claimed_left = 0, claimed_right = 0;
+  if (bmap_alloc) {
+    claimed_left = bmap_alloc->claim_free_to_left(start);
+    claimed_right = bmap_alloc->claim_free_to_right(start + size);
+    ceph_assert(claimed_left <= start);
+  }
+  AvlAllocator::_add_to_tree(start - claimed_left, size + claimed_left + claimed_right);
+}
diff --git a/src/os/bluestore/HybridAllocator.h b/src/os/bluestore/HybridAllocator.h
new file mode 100644
index 000000000..a4cf1e225
--- /dev/null
+++ b/src/os/bluestore/HybridAllocator.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <mutex>
+
+#include "AvlAllocator.h"
+#include "BitmapAllocator.h"
+
+class HybridAllocator : public AvlAllocator {  // AvlAllocator plus an optional BitmapAllocator held in bmap_alloc
+  BitmapAllocator* bmap_alloc = nullptr;  // raw pointer, null until a fallback allocator is assigned
+public:
+  HybridAllocator(CephContext* cct, int64_t device_size, int64_t _block_size,
+                  uint64_t max_mem,
+	          std::string_view name) :
+      AvlAllocator(cct, device_size, _block_size, max_mem, name) {  // all arguments are forwarded unchanged
+  }
+  const char* get_type() const override
+  {
+    return "hybrid";
+  }
+  int64_t allocate(
+    uint64_t want,
+    uint64_t unit,
+    uint64_t max_alloc_size,
+    int64_t  hint,
+    PExtentVector *extents) override;
+  void release(const interval_set<uint64_t>& release_set) override;
+  uint64_t get_free() override;
+  double get_fragmentation() override;
+  // inspection and lifecycle overrides
+  void dump() override;
+  void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) override;
+  void init_rm_free(uint64_t offset, uint64_t length) override;
+  void shutdown() override;
+
+protected:
+  // intended primarily for UT
+  BitmapAllocator* get_bmap() {
+    return bmap_alloc;
+  }
+  const BitmapAllocator* get_bmap() const {
+    return bmap_alloc;
+  }
+private:
+  // takes the range [start, end); NOTE(review): presumably invoked by the base class for ranges it will not keep -- confirm
+  void _spillover_range(uint64_t start, uint64_t end) override;
+
+  // called when extent to be released/marked free
+  void _add_to_tree(uint64_t start, uint64_t size) override;
+};
diff --git a/src/os/bluestore/StupidAllocator.cc b/src/os/bluestore/StupidAllocator.cc
new file mode 100644
index 000000000..8f74a499e
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.cc
@@ -0,0 +1,355 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "StupidAllocator.h"
+#include "bluestore_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "stupidalloc 0x" << this << " "
+
+// Builds an allocator with no free space recorded yet (num_free == 0) and
+// ten size bins in `free`.  Asserts that a context was supplied and that
+// block_size is positive.
+StupidAllocator::StupidAllocator(CephContext* cct, int64_t capacity,
+                                 int64_t _block_size, std::string_view name)
+  : Allocator(name, capacity, _block_size),
+    cct(cct), num_free(0), free(10)
+{
+  ceph_assert(cct != nullptr);
+  ceph_assert(block_size > 0);
+}
+
+// Intentionally empty: no explicit teardown is performed here.
+StupidAllocator::~StupidAllocator()
+{}
+
+// Bin index for a byte length: cbits() of its block count, clamped to the last bin.
+unsigned StupidAllocator::_choose_bin(uint64_t orig_len)
+{
+  const uint64_t blocks = orig_len / block_size;
+  const int bin = std::min((int)cbits(blocks), (int)free.size() - 1);
+  ldout(cct, 30) << __func__ << " len 0x" << std::hex << orig_len << std::dec << " -> " << bin << dendl;
+  return bin;
+}
+
+// Insert off~len into the bin matching its size.  insert() writes the
+// resulting extent back into off/len (presumably merged with neighbours); if
+// that extent belongs in another bin it is moved there, until the bin is stable.
+void StupidAllocator::_insert_free(uint64_t off, uint64_t len)
+{
+  unsigned bin = _choose_bin(len);
+  ldout(cct, 30) << __func__ << " 0x" << std::hex << off << "~" << len << std::dec << " in bin " << bin << dendl;
+  for (;;) {
+    free[bin].insert(off, len, &off, &len);
+    const unsigned target = _choose_bin(len);
+    if (target == bin) return;
+    ldout(cct, 30) << __func__ << " promoting 0x" << std::hex << off << "~" << len << std::dec << " to bin " << target << dendl;
+    free[bin].erase(off, len);
+    bin = target;
+  }
+}
+
+int64_t StupidAllocator::allocate_int(  // carve one extent out of the free bins; returns 0, or -ENOSPC if nothing fits
+  uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+  uint64_t *offset, uint32_t *length)  // out-params: start and length of the extent handed out
+{
+  std::lock_guard l(lock);
+  ldout(cct, 10) << __func__ << " want_size 0x" << std::hex << want_size
+	   	 << " alloc_unit 0x" << alloc_unit
+	   	 << " hint 0x" << hint << std::dec
+	   	 << dendl;
+  uint64_t want = std::max(alloc_unit, want_size);
+  int bin = _choose_bin(want);  // bin matching the requested size; the searches start here
+  int orig_bin = bin;
+  // p is only a placeholder here; every search pass below reassigns it
+  auto p = free[0].begin();
+  // with no caller hint, continue from where the previous allocation ended
+  if (!hint)
+    hint = last_alloc;
+
+  // search up (from hint)
+  if (hint) {
+    for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+      p = free[bin].lower_bound(hint);
+      while (p != free[bin].end()) {
+	if (p.get_len() >= want_size) {
+	  goto found;
+	}
+	++p;
+      }
+    }
+  }
+
+  // search up (from origin, and skip searched extents by hint)
+  for (bin = orig_bin; bin < (int)free.size(); ++bin) {
+    p = free[bin].begin();
+    auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+    while (p != end) {
+      if (p.get_len() >= want_size) {
+	goto found;
+      }
+      ++p;
+    }
+  }
+  // the two "down" passes settle for any extent of at least alloc_unit bytes
+  // search down (hint)
+  if (hint) {
+    for (bin = orig_bin; bin >= 0; --bin) {
+      p = free[bin].lower_bound(hint);
+      while (p != free[bin].end()) {
+	if (p.get_len() >= alloc_unit) {
+	  goto found;
+	}
+	++p;
+      }
+    }
+  }
+
+  // search down (from origin, and skip searched extents by hint)
+  for (bin = orig_bin; bin >= 0; --bin) {
+    p = free[bin].begin();
+    auto end = hint ? free[bin].lower_bound(hint) : free[bin].end();
+    while (p != end) {
+      if (p.get_len() >= alloc_unit) {
+	goto found;
+      }
+      ++p;
+    }
+  }
+  // all four passes came up empty
+  return -ENOSPC;
+
+ found:
+  *offset = p.get_start();
+  *length = std::min(std::max(alloc_unit, want_size), p2align(p.get_len(), alloc_unit));  // request size, capped by the extent rounded down to alloc_unit
+  // debug option: cap the length at a random multiple of alloc_unit (a zero multiple means no cap)
+  if (cct->_conf->bluestore_debug_small_allocations) {
+    uint64_t max =
+      alloc_unit * (rand() % cct->_conf->bluestore_debug_small_allocations);
+    if (max && *length > max) {
+      ldout(cct, 10) << __func__ << " shortening allocation of 0x" << std::hex
+	       	     << *length << " -> 0x"
+	       	     << max << " due to debug_small_allocations" << std::dec
+		     << dendl;
+      *length = max;
+    }
+  }
+  ldout(cct, 30) << __func__ << " got 0x" << std::hex << *offset << "~" << *length
+	   	 << " from bin " << std::dec << bin << dendl;
+
+  free[bin].erase(*offset, *length);
+  uint64_t off, len;
+  if (*offset && free[bin].contains(*offset - 1, &off, &len)) {  // an extent of this bin covering the byte just before the carved piece
+    int newbin = _choose_bin(len);
+    if (newbin != bin) {
+      ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+	       	     << std::dec << " to bin " << newbin << dendl;
+      free[bin].erase(off, len);
+      _insert_free(off, len);
+    }
+  }
+  if (free[bin].contains(*offset + *length, &off, &len)) {  // an extent of this bin covering the byte right after the carved piece
+    int newbin = _choose_bin(len);
+    if (newbin != bin) {
+      ldout(cct, 30) << __func__ << " demoting 0x" << std::hex << off << "~" << len
+	       	     << std::dec << " to bin " << newbin << dendl;
+      free[bin].erase(off, len);
+      _insert_free(off, len);
+    }
+  }
+  // bookkeeping: shrink the free counter and remember where this allocation ended
+  num_free -= *length;
+  ceph_assert(num_free >= 0);
+  last_alloc = *offset + *length;
+  return 0;
+}
+
+// Allocate up to want_size bytes as a sequence of extents appended to
+// `extents`, each obtained from allocate_int() and at most max_alloc_size
+// long (0 means want_size).  An extent that starts exactly where the last
+// entry of `extents` ends is merged into it while the merged length stays
+// below 2^32 and within max_alloc_size.
+// Returns the bytes allocated (possibly fewer than want_size), or -ENOSPC
+// when nothing could be allocated.
+int64_t StupidAllocator::allocate(
+  uint64_t want_size,
+  uint64_t alloc_unit,
+  uint64_t max_alloc_size,
+  int64_t hint,
+  PExtentVector *extents)
+{
+  if (max_alloc_size == 0) {
+    max_alloc_size = want_size;
+  }
+  // NOTE(review): the constant is 0x10000000 (2^28), although the previous
+  // comment here called it a 32-bit cap -- confirm which one is intended.
+  max_alloc_size = std::min(max_alloc_size, 0x10000000 - alloc_unit);
+
+  uint64_t allocated_size = 0;
+  while (allocated_size < want_size) {
+    uint64_t offset = 0;
+    uint32_t length = 0;
+    const uint64_t chunk = std::min(max_alloc_size, want_size - allocated_size);
+    if (allocate_int(chunk, alloc_unit, hint, &offset, &length) != 0) {
+      break;  // allocation failed; keep whatever was obtained so far
+    }
+    // extend the previous extent when the new one is directly adjacent
+    bool merged = false;
+    if (!extents->empty()) {
+      bluestore_pextent_t &tail = extents->back();
+      const uint64_t joined = uint64_t(tail.length) + length;
+      if (tail.end() == offset && joined < 0x100000000 && joined <= max_alloc_size) {
+        tail.length += length;
+        merged = true;
+      }
+    }
+    if (!merged) {
+      extents->emplace_back(bluestore_pextent_t(offset, length));
+    }
+    allocated_size += length;
+    // the next search starts right behind this extent
+    hint = offset + length;
+  }
+
+  if (allocated_size == 0) {
+    return -ENOSPC;
+  }
+  return allocated_size;
+}
+
+// Put every interval of `release_set` back into the free bins and add its
+// length to num_free.  The whole operation runs under `lock`; each interval
+// is logged at debug level 10.
+void StupidAllocator::release(const interval_set<uint64_t>& release_set)
+{
+  std::lock_guard guard(lock);
+  for (auto it = release_set.begin(); it != release_set.end(); ++it) {
+    const auto start = it.get_start();
+    const auto bytes = it.get_len();
+    ldout(cct, 10) << __func__ << " 0x" << std::hex << start << "~" << bytes
+                   << std::dec << dendl;
+    _insert_free(start, bytes);
+    num_free += bytes;
+  }
+}
+
+// Free byte count (num_free), read while holding `lock`.
+uint64_t StupidAllocator::get_free() {
+  std::lock_guard guard(lock);
+  return num_free;
+}
+
+// Fragmentation estimate: (free intervals - 1) divided by (free blocks,
+// rounded up, - 1); 0.0 when there are no intervals or at most one free
+// block.  Asserts that intervals never outnumber free blocks.
+double StupidAllocator::get_fragmentation()
+{
+  ceph_assert(get_block_size());
+  uint64_t max_intervals = 0;
+  uint64_t intervals = 0;
+  {
+    std::lock_guard guard(lock);
+    max_intervals =
+      p2roundup<uint64_t>(num_free, get_block_size()) / get_block_size();
+    for (const auto& bin : free) {
+      intervals += bin.num_intervals();
+    }
+  }
+  ldout(cct, 30) << __func__ << " " << intervals << "/" << max_intervals
+                 << dendl;
+  ceph_assert(intervals <= max_intervals);
+
+  if (intervals == 0 || max_intervals <= 1) {
+    return 0.0;
+  }
+  return (double)(intervals - 1) / (max_intervals - 1);
+}
+
+// Log (level 0) each bin's interval count, then its extents as 0x<start>~<len> in hex.
+void StupidAllocator::dump()
+{
+  std::lock_guard guard(lock);
+  for (unsigned bin = 0; bin < free.size(); ++bin) {
+    auto& extents = free[bin];
+    ldout(cct, 0) << __func__ << " free bin " << bin << ": "
+                  << extents.num_intervals() << " extents" << dendl;
+    for (auto p = extents.begin(); p != extents.end(); ++p) {
+      ldout(cct, 0) << __func__ << "  0x" << std::hex << p.get_start() << "~"
+                    << p.get_len() << std::dec << dendl;
+    }
+  }
+}
+
+// Call `notify(start, length)` for every free extent, bin by bin, under lock.
+void StupidAllocator::foreach(std::function<void(uint64_t offset, uint64_t length)> notify)
+{
+  std::lock_guard guard(lock);
+  for (auto& bin : free) {
+    for (auto p = bin.begin(); p != bin.end(); ++p)
+      notify(p.get_start(), p.get_len());
+  }
+}
+
+// Record offset~length as free space; a zero length is a no-op.
+void StupidAllocator::init_add_free(uint64_t offset, uint64_t length)
+{
+  if (length == 0) return;
+  std::lock_guard guard(lock);
+  ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+                 << std::dec << dendl;
+  _insert_free(offset, length);
+  num_free += length;
+}
+
+void StupidAllocator::init_rm_free(uint64_t offset, uint64_t length)  // remove offset~length from the free bins; a zero length is a no-op
+{
+  if (!length)
+    return;
+  std::lock_guard l(lock);
+  ldout(cct, 10) << __func__ << " 0x" << std::hex << offset << "~" << length
+	   	 << std::dec << dendl;
+  interval_set_t rm;  // what is still left to remove; shrinks as each bin is processed
+  rm.insert(offset, length);
+  for (unsigned i = 0; i < free.size() && !rm.empty(); ++i) {
+    interval_set_t overlap;
+    overlap.intersection_of(rm, free[i]);  // the part of rm that lives in bin i
+    if (!overlap.empty()) {
+      ldout(cct, 20) << __func__ << " bin " << i << " rm 0x" << std::hex << overlap
+		     << std::dec << dendl;
+      auto it = overlap.begin();
+      auto it_end = overlap.end();
+      while (it != it_end) {
+        auto o = it.get_start();
+        auto l = it.get_len();
+        // NOTE(review): the callback presumably sees each leftover piece; true seems to mean "re-binned here, do not keep" -- confirm
+        free[i].erase(o, l,
+          [&](uint64_t off, uint64_t len) {
+            unsigned newbin = _choose_bin(len);
+            if (newbin != i) {
+              ldout(cct, 30) << __func__ << " demoting1 0x" << std::hex << off << "~" << len
+                             << std::dec << " to bin " << newbin << dendl;
+              _insert_free(off, len);
+              return true;
+            }
+            return false;
+          });
+        ++it;
+      }
+      // this bin's share is done; later bins only see what is left
+      rm.subtract(overlap);
+    }
+  }
+  ceph_assert(rm.empty());  // every byte of the request must have been found in some bin
+  num_free -= length;
+  ceph_assert(num_free >= 0);
+}
+
+
+// Only logs the call; no state is torn down here.
+void StupidAllocator::shutdown() {
+  ldout(cct, 1) << __func__ << dendl;
+}
+
diff --git a/src/os/bluestore/StupidAllocator.h b/src/os/bluestore/StupidAllocator.h
new file mode 100644
index 000000000..443b09135
--- /dev/null
+++ b/src/os/bluestore/StupidAllocator.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+#define CEPH_OS_BLUESTORE_STUPIDALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+class StupidAllocator : public Allocator {  // Allocator implementation that keeps free extents in a vector of interval sets
+  CephContext* cct;
+  ceph::mutex lock = ceph::make_mutex("StupidAllocator::lock");
+
+  int64_t num_free;     ///< total bytes in freelist
+  // interval_set backed by a btree_map that allocates from the bluestore_alloc mempool
+  template <typename K, typename V> using allocator_t =
+    mempool::bluestore_alloc::pool_allocator<std::pair<const K, V>>;
+  template <typename K, typename V> using btree_map_t =
+    btree::btree_map<K, V, std::less<K>, allocator_t<K, V>>;
+  using interval_set_t = interval_set<uint64_t, btree_map_t>;
+  std::vector<interval_set_t> free;  ///< free extents, one interval set per bin (bin index from _choose_bin)
+
+  uint64_t last_alloc = 0;  ///< starts at 0; NOTE(review): presumably the end offset of the latest allocation -- confirm
+
+  unsigned _choose_bin(uint64_t len);
+  void _insert_free(uint64_t offset, uint64_t len);
+
+public:
+  StupidAllocator(CephContext* cct,
+                  int64_t size,
+                  int64_t block_size,
+		  std::string_view name);
+  ~StupidAllocator() override;
+  const char* get_type() const override
+  {
+    return "stupid";
+  }
+  // Appends to *extents; NOTE(review): the int64_t result is presumably bytes allocated or a negative errno -- confirm.
+  int64_t allocate(
+    uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+    int64_t hint, PExtentVector *extents) override;
+  // Not an override; reports its result through *offset / *length.
+  int64_t allocate_int(
+    uint64_t want_size, uint64_t alloc_unit, int64_t hint,
+    uint64_t *offset, uint32_t *length);
+
+  void release(
+    const interval_set<uint64_t>& release_set) override;
+
+  uint64_t get_free() override;
+  double get_fragmentation() override;
+
+  void dump() override;
+  void foreach(std::function<void(uint64_t offset, uint64_t length)> notify) override;
+  // NOTE(review): judging by the init_ prefix these are meant for initial population -- confirm with callers.
+  void init_add_free(uint64_t offset, uint64_t length) override;
+  void init_rm_free(uint64_t offset, uint64_t length) override;
+
+  void shutdown() override;
+};
+
+#endif
diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc
new file mode 100644
index 000000000..4139b4755
--- /dev/null
+++ b/src/os/bluestore/ZonedAllocator.cc
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// 
+// A simple allocator that just hands out space from the next empty zone.  This
+// is temporary, just to get the simplest append-only write workload to work.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#include "ZonedAllocator.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "ZonedAllocator(" << this << ") " << __func__ << " "
+
+// Zoned allocator over `size` bytes in zones of _zone_size bytes; zones from
+// _first_sequential_zone on are sequential.  size % zone_size must be 0.
+ZonedAllocator::ZonedAllocator(CephContext* cct,
+			       int64_t size,
+			       int64_t blk_size,
+			       int64_t _zone_size,
+			       int64_t _first_sequential_zone,
+			       std::string_view name)
+    : Allocator(name, size, blk_size),
+      cct(cct),
+      size(size),
+      conventional_size(_first_sequential_zone * _zone_size),
+      sequential_size(size - conventional_size),
+      num_sequential_free(0),
+      block_size(blk_size),
+      zone_size(_zone_size),
+      first_seq_zone_num(_first_sequential_zone),
+      starting_zone_num(first_seq_zone_num),
+      num_zones(size / zone_size)
+{
+  ldout(cct, 10) << " size 0x" << std::hex << size
+		 << ", zone size 0x" << zone_size
+		 << ", number of zones 0x" << num_zones
+		 << ", first sequential zone 0x" << starting_zone_num
+		 << ", sequential size 0x" << sequential_size
+		 << std::dec << dendl;  // hex kept to the end: every value above has a "0x" label
+  ceph_assert(size % zone_size == 0);
+  zone_states.resize(num_zones);
+}
+
+// Intentionally empty: no explicit teardown is performed here.
+ZonedAllocator::~ZonedAllocator()
+{}
+
+// Allocate want_size bytes from one sequential zone and append the extent to
+// `extents`.  Zones are scanned round-robin from starting_zone_num, skipping
+// the zone being cleaned and zones that do not fit the request.  alloc_unit,
+// max_alloc_size and hint are not consulted.  Returns want_size or -ENOSPC.
+int64_t ZonedAllocator::allocate(
+  uint64_t want_size,
+  uint64_t alloc_unit,
+  uint64_t max_alloc_size,
+  int64_t hint,
+  PExtentVector *extents)
+{
+  std::lock_guard guard(lock);
+  ceph_assert(want_size % 4096 == 0);
+
+  ldout(cct, 10) << " trying to allocate 0x"
+		 << std::hex << want_size << std::dec << dendl;
+  // visit each sequential zone at most once, wrapping past the last zone
+  bool found = false;
+  uint64_t zone_num = starting_zone_num;
+  for (uint64_t left = num_zones - first_seq_zone_num; left > 0; ++zone_num, --left) {
+    if (zone_num == num_zones) {
+      zone_num = first_seq_zone_num;
+    }
+    if (zone_num == cleaning_zone) {
+      ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num
+		     << " because we are cleaning it" << std::dec << dendl;
+    } else if (!fits(want_size, zone_num)) {
+      ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num
+		     << " because there is not enough space: "
+		     << " want_size = 0x" << want_size
+		     << " available = 0x" << get_remaining_space(zone_num)
+		     << std::dec
+		     << dendl;
+    } else {
+      found = true;
+      break;
+    }
+  }
+  if (!found) {
+    ldout(cct, 10) << " failed to allocate" << dendl;
+    return -ENOSPC;
+  }
+
+  const uint64_t offset = get_offset(zone_num);
+  ldout(cct, 10) << " moving zone 0x" << std::hex
+		 << zone_num << " write pointer from 0x" << offset
+		 << " -> 0x" << offset + want_size
+		 << std::dec << dendl;
+
+  increment_write_pointer(zone_num, want_size);
+  num_sequential_free -= want_size;
+  if (get_remaining_space(zone_num) == 0) {
+    starting_zone_num = zone_num + 1;  // zone is full: begin the next scan after it
+  }
+
+  ldout(cct, 10) << " allocated 0x" << std::hex << offset << "~" << want_size
+		 << " from zone 0x" << zone_num
+		 << " and zone offset 0x" << (offset % zone_size)
+		 << std::dec << dendl;
+
+  extents->emplace_back(bluestore_pextent_t(offset, want_size));
+  return want_size;
+}
+
+// Charge released bytes to the dead-byte count of each zone the interval covers.
+void ZonedAllocator::release(const interval_set<uint64_t>& release_set)
+{
+  std::lock_guard guard(lock);
+  for (auto p = cbegin(release_set); p != cend(release_set); ++p) {
+    const auto offset = p.get_start();
+    auto remaining = p.get_len();
+    uint64_t zone_num = offset / zone_size;
+    ldout(cct, 10) << " 0x" << std::hex << offset << "~" << remaining << " from zone 0x" << zone_num << std::dec << dendl;
+    uint64_t piece = std::min(zone_size - offset % zone_size, remaining);
+    while (remaining) {
+      increment_num_dead_bytes(zone_num++, piece);
+      remaining -= piece;
+      piece = std::min(zone_size, remaining);
+    }
+  }
+}
+
+// Current value of the atomic free-byte counter; `lock` is not taken.
+uint64_t ZonedAllocator::get_free() {
+  return num_sequential_free.load();
+}
+
+// Currently a no-op apart from briefly taking `lock`; nothing is logged.
+void ZonedAllocator::dump() {
+  std::lock_guard guard(lock);
+}
+
+// Currently a no-op apart from taking `lock`: `notify` is never invoked.
+void ZonedAllocator::foreach(
+  std::function<void(uint64_t offset, uint64_t length)> notify) {
+  std::lock_guard guard(lock);
+}
+
+// Adopt the device's per-zone states (called once) and recompute free space.
+// NOTE(review): assumes write_pointer is zone-relative, so a full zone adds 0.
+void ZonedAllocator::init_from_zone_pointers(
+  std::vector<zone_state_t> &&_zone_states)
+{
+  std::lock_guard l(lock);
+  ldout(cct, 10) << dendl;
+  zone_states = std::move(_zone_states);
+  num_sequential_free = 0;
+  for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
+    num_sequential_free += zone_size - std::min<uint64_t>(zone_states[i].write_pointer, zone_size);
+  }
+  ldout(cct, 10) << "free 0x" << std::hex << num_sequential_free
+		 << " / 0x" << sequential_size << std::dec << dendl;
+}
+
+// Pick the sequential zone with the best cleaning score among zones holding
+// at least min_saved dead bytes.  Returns the zone number, or -1 when no zone
+// qualifies or the best score is below min_score.
+int64_t ZonedAllocator::pick_zone_to_clean(float min_score, uint64_t min_saved)
+{
+  std::lock_guard l(lock);
+  int32_t best = -1;
+  float best_score = 0.0;
+  for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
+    // value (score) = benefit (dead bytes) / cost (live bytes to rewrite);
+    // the + 1 avoids a divide by zero on a zone with no live bytes
+    float score =
+      (float)zone_states[i].num_dead_bytes /
+      (float)(zone_states[i].get_num_live_bytes() + 1);
+    if (score > 0) {
+      ldout(cct, 20) << " zone 0x" << std::hex << i << " dead 0x"
+		     << zone_states[i].num_dead_bytes << " score " << score << dendl;
+    }
+    if (zone_states[i].num_dead_bytes < min_saved) {
+      continue;
+    }
+    if (best < 0 || score > best_score) {
+      best = i;
+      best_score = score;
+    }
+  }
+  if (best < 0) {
+    // checked first so that zone_states is never indexed with -1 below
+    ldout(cct, 10) << " no zones found that are good cleaning candidates" << dendl;
+    return -1;
+  }
+  if (best_score >= min_score) {
+    ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score
+		   << ": 0x" << zone_states[best].num_dead_bytes << " dead and 0x"
+		   << zone_states[best].write_pointer - zone_states[best].num_dead_bytes
+		   << " live bytes" << std::dec << dendl;
+  } else {
+    ldout(cct, 10) << " zone 0x" << std::hex << best << " with score " << best_score
+		   << ": 0x" << zone_states[best].num_dead_bytes << " dead and 0x"
+		   << zone_states[best].write_pointer - zone_states[best].num_dead_bytes
+		   << " live bytes" << std::dec
+		   << " but below min_score " << min_score
+		   << dendl;
+    best = -1;  // applies to zone 0 as well
+  }
+  return best;
+}
+
+void ZonedAllocator::reset_zone(uint32_t zone) {  // note: `lock` is not taken here
+  auto& state = zone_states[zone];
+  num_sequential_free += state.write_pointer;  // the zone's written bytes become free again
+  state.reset();
+}
+
+// True when at most a quarter of the sequential area is still free.
+// Asserts that the free counter never exceeds the sequential area's size.
+bool ZonedAllocator::low_on_space(void)
+{
+  std::lock_guard guard(lock);
+  const double free_ratio = static_cast<double>(num_sequential_free) / sequential_size;
+  ldout(cct, 10) << " free 0x" << std::hex << num_sequential_free
+		 << "/ 0x" << sequential_size << std::dec
+		 << ", free ratio is " << free_ratio << dendl;
+  ceph_assert(num_sequential_free <= (int64_t)sequential_size);
+  // TODO: make 0.25 tunable
+  return free_ratio <= 0.25;
+}
+
+// Only logs the call; no state is torn down here.
+void ZonedAllocator::shutdown() {
+  ldout(cct, 1) << dendl;
+}
diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h
new file mode 100644
index 000000000..0778bd0da
--- /dev/null
+++ b/src/os/bluestore/ZonedAllocator.h
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// 
+// A simple allocator that just hands out space from the next empty zone.  This
+// is temporary, just to get the simplest append-only write workload to work.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#ifndef CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
+#define CEPH_OS_BLUESTORE_ZONEDALLOCATOR_H
+
+#include <mutex>
+
+#include "Allocator.h"
+#include "common/ceph_mutex.h"
+#include "include/btree_map.h"
+#include "include/interval_set.h"
+#include "include/mempool.h"
+#include "bluestore_types.h"
+#include "zoned_types.h"
+
+class ZonedAllocator : public Allocator {  // Allocator implementation whose get_type() reports "zoned"
+  CephContext* cct;
+
+  // Currently only one thread at a time calls into ZonedAllocator due to
+  // atomic_alloc_and_submit_lock in BlueStore.cc, but we do locking anyway
+  // because eventually ZONE_APPEND support will land and
+  // atomic_alloc_and_submit_lock will be removed.
+  ceph::mutex lock = ceph::make_mutex("ZonedAllocator::lock");
+
+  uint64_t size;
+  uint64_t conventional_size, sequential_size;  // sequential_size is the denominator of low_on_space()'s free ratio
+  std::atomic<int64_t> num_sequential_free;  ///< total bytes in freelist
+  uint64_t block_size;
+  uint64_t zone_size;  // multiplied by a zone number in get_offset(); upper bound in get_remaining_space()
+  uint64_t first_seq_zone_num;
+  uint64_t starting_zone_num;
+  uint64_t num_zones;
+  std::atomic<uint32_t> cleaning_zone = -1;  // -1 wraps to the largest uint32_t; the same value clear_cleaning_zone() stores
+  std::vector<zone_state_t> zone_states;  // indexed by zone number
+
+  inline uint64_t get_offset(uint64_t zone_num) const {  // zone_num * zone_size plus that zone's write pointer
+    return zone_num * zone_size + get_write_pointer(zone_num);
+  }
+
+public:
+  inline uint64_t get_write_pointer(uint64_t zone_num) const {  // zone_num is used as an index without a bounds check
+    return zone_states[zone_num].get_write_pointer();
+  }
+private:
+  inline uint64_t get_remaining_space(uint64_t zone_num) const {  // zone_size minus the zone's write pointer
+    return zone_size - get_write_pointer(zone_num);
+  }
+
+  inline void increment_write_pointer(uint64_t zone_num, uint64_t want_size) {  // forwards to the zone's zone_state_t
+    zone_states[zone_num].increment_write_pointer(want_size);
+  }
+
+  inline void increment_num_dead_bytes(uint64_t zone_num, uint64_t length) {  // forwards to the zone's zone_state_t
+    zone_states[zone_num].increment_num_dead_bytes(length);
+  }
+
+  inline bool fits(uint64_t want_size, uint64_t zone_num) const {  // true when want_size does not exceed the remaining space
+    return want_size <= get_remaining_space(zone_num);
+  }
+
+public:
+  ZonedAllocator(CephContext* cct, int64_t size, int64_t block_size,
+		 int64_t _zone_size,
+		 int64_t _first_sequential_zone,
+		 std::string_view name);
+  ~ZonedAllocator() override;
+
+  const char *get_type() const override {
+    return "zoned";
+  }
+
+  uint64_t get_dead_bytes(uint32_t zone) {  // NOTE(review): reads zone_states without taking lock, unlike get_live_bytes() - confirm intended
+    return zone_states[zone].num_dead_bytes;
+  }
+  uint64_t get_live_bytes(uint32_t zone) {  // write pointer minus dead bytes, read under lock
+    std::scoped_lock l(lock);
+    return zone_states[zone].write_pointer - zone_states[zone].num_dead_bytes;
+  }
+
+  int64_t allocate(
+    uint64_t want_size, uint64_t alloc_unit, uint64_t max_alloc_size,
+    int64_t hint, PExtentVector *extents) override;
+
+  void release(const interval_set<uint64_t>& release_set) override;
+
+  uint64_t get_free() override;
+
+  void dump() override;
+  void foreach(
+    std::function<void(uint64_t offset, uint64_t length)> notify) override;
+
+  int64_t pick_zone_to_clean(float min_score, uint64_t min_saved);
+  void set_cleaning_zone(uint32_t zone) {  // plain atomic store; lock is not taken
+    cleaning_zone = zone;
+  }
+  void clear_cleaning_zone(uint32_t zone) {  // |zone| is not consulted; the marker is simply set back to -1
+    cleaning_zone = -1;
+  }
+  void reset_zone(uint32_t zone);
+
+  void init_from_zone_pointers(
+    std::vector<zone_state_t> &&_zone_states);
+  void init_add_free(uint64_t offset, uint64_t length) override {}  // no-op in this allocator
+  void init_rm_free(uint64_t offset, uint64_t length) override {}  // no-op in this allocator
+
+  void shutdown() override;
+
+private:
+  bool low_on_space(void);  // defined in ZonedAllocator.cc: free ratio <= 0.25
+};
+
+#endif
diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc
new file mode 100644
index 000000000..3a5bce66f
--- /dev/null
+++ b/src/os/bluestore/ZonedFreelistManager.cc
@@ -0,0 +1,372 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A freelist manager for zoned devices.  This iteration just keeps the write
+// pointer per zone.  Following iterations will add enough information to enable
+// cleaning of zones.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#include "ZonedFreelistManager.h"
+#include "bluestore_common.h"
+#include "include/stringify.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+#include "zoned_types.h"
+
+#include "common/debug.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "zoned freelist "
+
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+
+void ZonedFreelistManager::write_zone_state_delta_to_db(
+  uint64_t zone_num,
+  const zone_state_t &zone_state,
+  KeyValueDB::Transaction txn)
+{
+  bufferlist encoded_delta;
+  zone_state.encode(encoded_delta);
+  string zone_key;
+  _key_encode_u64(zone_num, &zone_key);
+  txn->merge(info_prefix, zone_key, encoded_delta);  // queued as a merge operand, not an overwrite
+}
+
+void ZonedFreelistManager::write_zone_state_reset_to_db(
+  uint64_t zone_num,
+  const zone_state_t &zone_state,
+  KeyValueDB::Transaction txn)
+{
+  bufferlist encoded_state;
+  zone_state.encode(encoded_state);
+  string zone_key;
+  _key_encode_u64(zone_num, &zone_key);
+  txn->set(info_prefix, zone_key, encoded_state);  // replaces the stored record outright
+}
+
+void ZonedFreelistManager::load_zone_state_from_db(
+  uint64_t zone_num,
+  zone_state_t &zone_state,
+  KeyValueDB::Iterator& it) const
+{
+  // The record's key must name the zone the caller expects to be reading.
+  const string raw_key = it->key();
+  uint64_t stored_zone_num;
+  _key_decode_u64(raw_key.c_str(), &stored_zone_num);
+  ceph_assert(stored_zone_num == zone_num);
+  bufferlist payload = it->value();
+  auto cursor = payload.cbegin();
+  zone_state.decode(cursor);
+}
+
+void ZonedFreelistManager::init_zone_states(KeyValueDB::Transaction txn)
+{
+  dout(10) << __func__ << dendl;
+  // Every zone number below num_zones gets a default-constructed record.
+  zone_state_t blank_state;
+  for (uint64_t z = 0; z != num_zones; ++z)
+    write_zone_state_reset_to_db(z, blank_state, txn);
+}
+
+void ZonedFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
+{
+  // Registers an Int64ArrayMergeOperator as the merge operator for |prefix|.
+  auto int64_array_op = std::make_shared<Int64ArrayMergeOperator>();
+  db->set_merge_operator(prefix, int64_array_op);
+}
+
+ZonedFreelistManager::ZonedFreelistManager(
+  CephContext* cct,
+  string meta_prefix,
+  string info_prefix)
+  : FreelistManager(cct),
+    meta_prefix(meta_prefix),  // member initialized from the same-named parameter
+    info_prefix(info_prefix),  // member initialized from the same-named parameter
+    enumerate_zone_num(~0UL)  // "enumeration not started" marker that enumerate_next() tests for
+{  // size, bytes_per_block, zone_size, num_zones and starting_zone_num are not set here; create() / _read_cfg() assign them
+}
+
+int ZonedFreelistManager::create(
+  uint64_t new_size,
+  uint64_t granularity,
+  uint64_t new_zone_size,
+  uint64_t first_sequential_zone,
+  KeyValueDB::Transaction txn)
+{
+  // Adopts the given geometry and queues it on |txn|, along with a
+  // default-constructed state record for every zone:
+  //   new_size              -> size
+  //   granularity           -> bytes_per_block
+  //   new_zone_size         -> zone_size (non-zero, must divide new_size)
+  //   first_sequential_zone -> starting_zone_num
+  // Always returns 0; an invalid geometry trips an assert instead.
+  //
+  // Validate before dividing: a zero zone size would otherwise fault on the
+  // division below rather than reach the assert.
+  ceph_assert(new_zone_size != 0);
+  ceph_assert(new_size % new_zone_size == 0);
+
+  size = new_size;
+  bytes_per_block = granularity;
+  zone_size = new_zone_size;
+  num_zones = size / zone_size;
+  starting_zone_num = first_sequential_zone;
+  enumerate_zone_num = ~0UL;
+
+  dout(1) << __func__ << std::hex
+	  << " size 0x" << size
+	  << " bytes_per_block 0x" << bytes_per_block
+	  << " zone size 0x" << zone_size
+	  << " num_zones 0x" << num_zones
+	  << " starting_zone 0x" << starting_zone_num
+	  << std::dec << dendl;
+
+  // Each geometry value is stored under its own key in the meta prefix.
+  auto put_meta = [&](const char *key, uint64_t value) {
+    bufferlist bl;
+    encode(value, bl);
+    txn->set(meta_prefix, key, bl);
+  };
+  put_meta("size", size);
+  put_meta("bytes_per_block", bytes_per_block);
+  put_meta("zone_size", zone_size);
+  put_meta("num_zones", num_zones);
+  put_meta("starting_zone_num", starting_zone_num);
+
+  // Finally, queue one reset record per zone.
+  init_zone_states(txn);
+
+  return 0;
+}
+
+int ZonedFreelistManager::init(
+  KeyValueDB *kvdb,
+  bool db_in_read_only,
+  cfg_reader_t cfg_reader)
+{
+  dout(1) << __func__ << dendl;
+  // Geometry comes from |cfg_reader|; |kvdb| and |db_in_read_only| are unused.
+  if (const int err = _read_cfg(cfg_reader); err != 0) {
+    return err;
+  }
+
+  ceph_assert(size / zone_size == num_zones);
+
+  dout(10) << __func__ << std::hex
+	   << " size 0x" << size
+	   << " bytes_per_block 0x" << bytes_per_block
+	   << " zone size 0x" << zone_size
+	   << " num_zones 0x" << num_zones
+	   << " starting_zone 0x" << starting_zone_num
+	   << std::dec << dendl;
+  return 0;
+}
+
+void ZonedFreelistManager::sync(KeyValueDB* kvdb)
+{  // no-op: |kvdb| is ignored and no state is touched
+}
+
+void ZonedFreelistManager::shutdown()
+{
+  dout(1) << __func__ << dendl;  // logs the call at level 1; no other work is done
+}
+
+void ZonedFreelistManager::enumerate_reset()
+{
+  std::lock_guard guard(lock);
+  dout(1) << __func__ << dendl;
+  // Drop the iterator and restore the "not started" marker, which makes the
+  // next enumerate_next() call take its starting branch again.
+  enumerate_zone_num = ~0UL;
+  enumerate_p.reset();
+}
+
+// Currently, this just iterates over the list of zones and sets |offset| and
+// |length| to the write pointer and the number of remaining free bytes in a
+// given zone.  Hence, it can set |length| to 0 if a zone is full, and it can
+// also return two contiguous empty zones in two calls.  This does not violate
+// current semantics of the call and appears to work fine with the clients of
+// this call.
+bool ZonedFreelistManager::enumerate_next(
+  KeyValueDB *kvdb,
+  uint64_t *offset,
+  uint64_t *length)
+{
+  std::lock_guard guard(lock);
+
+  const bool first_call = (enumerate_zone_num == ~0UL);
+  if (!first_call) {
+    // continuing: step to the following zone record, if there is one
+    enumerate_p->next();
+    if (!enumerate_p->valid()) {
+      dout(30) << __func__ << " end" << dendl;
+      return false;
+    }
+    ++enumerate_zone_num;
+  } else {
+    // starting case: position on the first record under info_prefix
+    dout(30) << __func__ << " start" << dendl;
+    enumerate_p = kvdb->get_iterator(info_prefix);
+    enumerate_p->lower_bound(string());
+    ceph_assert(enumerate_p->valid());
+    enumerate_zone_num = 0;
+  }
+
+  zone_state_t zs;
+  load_zone_state_from_db(enumerate_zone_num, zs, enumerate_p);
+  const uint64_t wp = zs.get_write_pointer();
+  *offset = enumerate_zone_num * zone_size + wp;
+  *length = zone_size - wp;
+  dout(30) << __func__ << std::hex << " 0x" << *offset << "~" << *length
+	   << std::dec << dendl;
+  return true;
+}
+
+void ZonedFreelistManager::dump(KeyValueDB *kvdb)
+{
+  // Restart the enumeration, then log every extent it yields at level 20.
+  uint64_t off = 0, len = 0;
+  for (enumerate_reset(); enumerate_next(kvdb, &off, &len); ) {
+    dout(20) << __func__ << " 0x" << std::hex << off << "~" << len
+	     << std::dec << dendl;
+  }
+}
+
+// Queues, on |txn|, a write-pointer increment for each zone that the range
+// starting at |offset| and spanning |length| bytes touches.
+void ZonedFreelistManager::allocate(
+  uint64_t offset,
+  uint64_t length,
+  KeyValueDB::Transaction txn)
+{
+  for (uint64_t chunk = 0; length > 0; offset += chunk, length -= chunk) {
+    const uint64_t zone = offset / zone_size;
+    const uint64_t room_in_zone = zone_size - offset % zone_size;
+    chunk = std::min(length, room_in_zone);  // never cross a zone boundary
+    dout(10) << __func__ << " 0x" << std::hex << offset << "~" << chunk
+	     << " zone 0x" << zone << std::dec << dendl;
+    zone_state_t delta;  // fresh state; only this chunk's increment is applied to it
+    delta.increment_write_pointer(chunk);
+    write_zone_state_delta_to_db(zone, delta, txn);
+  }
+}
+
+// Increments the number of dead bytes in a zone and writes the updated value to
+// database.  The dead bytes in the zone are not usable.  The cleaner will later
+// copy live objects from the zone to another zone and make the zone writable
+// again.  The number of dead bytes in a zone is used by the cleaner to select
+// which zones to clean -- the ones with most dead bytes are good candidates
+// since they require less I/O.
+void ZonedFreelistManager::release(
+  uint64_t offset,
+  uint64_t length,
+  KeyValueDB::Transaction txn)
+{
+  // Split the range at zone boundaries; each piece becomes one merge delta.
+  for (uint64_t chunk = 0; length > 0; length -= chunk, offset += chunk) {
+    const uint64_t zone = offset / zone_size;
+    const uint64_t room_in_zone = zone_size - offset % zone_size;
+    chunk = std::min(length, room_in_zone);
+    dout(10) << __func__ << " 0x" << std::hex << offset << "~" << chunk
+	     << " zone 0x" << zone << std::dec << dendl;
+    zone_state_t delta;  // fresh state; only this chunk's dead bytes are applied to it
+    delta.increment_num_dead_bytes(chunk);
+    write_zone_state_delta_to_db(zone, delta, txn);
+  }
+}
+
+void ZonedFreelistManager::get_meta(
+  uint64_t target_size,
+  std::vector<std::pair<string, string>>* res) const
+{
+  ceph_assert(target_size == 0);  // We do not support expanding devices for now.
+  const std::pair<const char*, uint64_t> fields[] = {
+    {"zfm_size", size}, {"zfm_bytes_per_block", bytes_per_block},
+    {"zfm_zone_size", zone_size}, {"zfm_num_zones", num_zones},
+    {"zfm_starting_zone_num", starting_zone_num}};
+  for (const auto& [key, value] : fields)
+    res->emplace_back(key, stringify(value));
+}
+
+std::vector<zone_state_t> ZonedFreelistManager::get_zone_states(
+  KeyValueDB *kvdb) const
+{
+  // One entry per record under info_prefix; an entry's index is its zone number.
+  std::vector<zone_state_t> states;
+  auto it = kvdb->get_iterator(info_prefix);
+  for (it->lower_bound(string()); it->valid(); it->next()) {
+    zone_state_t loaded;
+    load_zone_state_from_db(states.size(), loaded, it);
+    states.push_back(loaded);
+  }
+  return states;
+}
+
+// TODO: The following function is copied almost verbatim from
+// BitmapFreelistManager.  Eliminate duplication.
+int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader)
+{
+  // Loads the five geometry members through |cfg_reader|.  Returns 0 when all
+  // were read and parsed, the reader's own code for the first key it cannot
+  // supply, or -EINVAL for the first value that does not parse.
+  dout(1) << __func__ << dendl;
+
+  // Each bdev meta key paired with the member that receives its value.
+  // Keys are visited in this order; the first failure ends the load.
+  const std::pair<string, uint64_t*> fields[] = {
+    {"zfm_size", &size},
+    {"zfm_bytes_per_block", &bytes_per_block},
+    {"zfm_zone_size", &zone_size},
+    {"zfm_num_zones", &num_zones},
+    {"zfm_starting_zone_num", &starting_zone_num},
+  };
+
+  string err;
+  for (const auto& field : fields) {
+    const string& key = field.first;
+    string val;
+    const int r = cfg_reader(key, &val);
+    if (r != 0) {
+      // this is expected for legacy deployed OSDs
+      dout(0) << __func__ << " " << key << " not found in bdev meta" << dendl;
+      return r;
+    }
+
+    // The member is overwritten before the parse result is inspected.
+    *field.second = strict_iecstrtoll(val.c_str(), &err);
+    if (!err.empty()) {
+      derr << __func__ << " Failed to parse - "
+        << key << ":" << val
+        << ", error: " << err << dendl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+void ZonedFreelistManager::mark_zone_to_clean_free(
+  uint64_t zone,
+  KeyValueDB *kvdb)
+{
+  dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl;
+  KeyValueDB::Transaction txn = kvdb->get_transaction();
+  zone_state_t empty_zone_state;
+  write_zone_state_reset_to_db(zone, empty_zone_state, txn);
+
+  // block here until this commits so that we don't end up starting to allocate and
+  // write to the new zone before this fully commits.  A failed commit must not
+  // go unnoticed, or the zone would be reused while its old record persists.
+  int r = kvdb->submit_transaction_sync(txn);
+  ceph_assert(r == 0);
+}
diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h
new file mode 100644
index 000000000..378a20f0a
--- /dev/null
+++ b/src/os/bluestore/ZonedFreelistManager.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+//
+// A freelist manager for zoned devices.
+//
+// Copyright (C) 2020 Abutalib Aghayev
+//
+
+#ifndef CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_ZONEDFREELISTMANAGER_H
+
+#include "FreelistManager.h"
+
+#include <string>
+#include <mutex>
+
+#include "common/ceph_mutex.h"
+#include "include/buffer.h"
+#include "kv/KeyValueDB.h"
+#include "zoned_types.h"
+
+using cfg_reader_t = std::function<int(const std::string&, std::string*)>;
+
+class ZonedFreelistManager : public FreelistManager {  // FreelistManager that keeps one zone_state_t record per zone in the key/value store
+  std::string meta_prefix;    ///< device size, zone size, etc.
+  std::string info_prefix;    ///< per zone write pointer, dead bytes
+  mutable ceph::mutex lock = ceph::make_mutex("ZonedFreelistManager::lock");  // taken by enumerate_reset() and enumerate_next()
+
+  uint64_t size;	      ///< size of sequential region (bytes)
+  uint64_t bytes_per_block;   ///< bytes per allocation unit (bytes)
+  uint64_t zone_size;	      ///< size of a single zone (bytes)
+  uint64_t num_zones;	      ///< number of sequential zones
+  uint64_t starting_zone_num; ///< the first sequential zone number
+
+  KeyValueDB::Iterator enumerate_p;  // cursor over info_prefix used by enumerate_next()
+  uint64_t enumerate_zone_num;  // zone the cursor is on; ~0UL means enumeration has not started
+
+  void write_zone_state_delta_to_db(uint64_t zone_num,  // queues |zone_state| with txn->merge()
+				    const zone_state_t &zone_state,
+				    KeyValueDB::Transaction txn);
+  void write_zone_state_reset_to_db(uint64_t zone_num,  // queues |zone_state| with txn->set()
+				    const zone_state_t &zone_state,
+				    KeyValueDB::Transaction txn);
+  void load_zone_state_from_db(uint64_t zone_num,  // asserts the iterator's key decodes to |zone_num|
+			       zone_state_t &zone_state,
+			       KeyValueDB::Iterator &it) const;
+
+  void init_zone_states(KeyValueDB::Transaction txn);  // one default-constructed record per zone
+
+  void increment_write_pointer(  // NOTE(review): declared here but not defined in ZonedFreelistManager.cc - confirm it is still needed
+      uint64_t zone, uint64_t length, KeyValueDB::Transaction txn);
+  void increment_num_dead_bytes(  // NOTE(review): declared here but not defined in ZonedFreelistManager.cc - confirm it is still needed
+      uint64_t zone, uint64_t num_bytes, KeyValueDB::Transaction txn);
+
+  int _read_cfg(cfg_reader_t cfg_reader);  // loads the five zfm_* geometry values
+
+public:
+  ZonedFreelistManager(CephContext* cct,
+		       std::string meta_prefix,
+		       std::string info_prefix);
+
+  static void setup_merge_operator(KeyValueDB *db, std::string prefix);  // installs Int64ArrayMergeOperator for |prefix|
+
+  int create(uint64_t size,  // records the geometry and per-zone records on |txn|; returns 0
+	     uint64_t granularity,
+	     uint64_t zone_size,
+	     uint64_t first_sequential_zone,
+	     KeyValueDB::Transaction txn) override;
+
+  int init(KeyValueDB *kvdb,  // reads the geometry through |cfg_reader|; the other two arguments are unused
+	   bool db_in_read_only,
+	   cfg_reader_t cfg_reader) override;
+
+  void shutdown() override;
+  void sync(KeyValueDB* kvdb) override;
+  void dump(KeyValueDB *kvdb) override;
+
+  void enumerate_reset() override;
+  bool enumerate_next(KeyValueDB *kvdb,  // yields one (offset, length) pair per zone record; false when exhausted
+		      uint64_t *offset,
+		      uint64_t *length) override;
+
+  void allocate(uint64_t offset,  // queues write-pointer increments, split at zone boundaries
+		uint64_t length,
+		KeyValueDB::Transaction txn) override;
+
+  void release(uint64_t offset,  // queues dead-byte increments, split at zone boundaries
+	       uint64_t length,
+	       KeyValueDB::Transaction txn) override;
+
+  inline uint64_t get_size() const override {
+    return size;
+  }
+
+  inline uint64_t get_alloc_units() const override {  // size expressed in bytes_per_block units
+    return size / bytes_per_block;
+  }
+
+  inline uint64_t get_alloc_size() const override {
+    return bytes_per_block;
+  }
+
+  void get_meta(uint64_t target_size,  // asserts target_size == 0, then appends the five zfm_* pairs
+		std::vector<std::pair<std::string, std::string>>*) const override;
+
+  std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const;  // every record under info_prefix, in key order
+
+  void mark_zone_to_clean_free(uint64_t zone,  // overwrites the zone's record with a default state and commits synchronously
+			       KeyValueDB *kvdb);
+};
+
+#endif
diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
new file mode 100644
index 000000000..c8d2ede7b
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.cc
@@ -0,0 +1,284 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include "bluefs_types.h"
+#include "common/Formatter.h"
+#include "include/uuid.h"
+#include "include/stringify.h"
+
+using std::list;
+using std::ostream;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// bluefs_extent_t
+void bluefs_extent_t::dump(Formatter *f) const
+{  // emits the three fields as unsigned values, in this order
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+  f->dump_unsigned("bdev", bdev);
+}
+
+void bluefs_extent_t::generate_test_instances(list<bluefs_extent_t*>& ls)
+{
+  // First a default-constructed extent, then one with every field set
+  // (bdev 1, offset 1, length 2) through the (bdev, offset, length) ctor.
+  ls.push_back(new bluefs_extent_t);
+
+  ls.push_back(new bluefs_extent_t(/*bdev*/ 1, /*offset*/ 1, /*length*/ 2));
+}
+
+ostream& operator<<(ostream& out, const bluefs_extent_t& e)
+{
+  out << static_cast<int>(e.bdev) << ":0x";  // bdev goes out as a number, not a character
+  return out << std::hex << e.offset << "~" << e.length << std::dec;
+}
+
+// bluefs_layout_t
+
+void bluefs_layout_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);  // presumably (struct version, compat version) - confirm against the encoding macros
+  encode(shared_bdev, bl);  // field order here is mirrored by decode()
+  encode(dedicated_db, bl);
+  encode(dedicated_wal, bl);
+  ENCODE_FINISH(bl);
+}
+
+void bluefs_layout_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(shared_bdev, p);  // fields are read in the order encode() wrote them
+  decode(dedicated_db, p);
+  decode(dedicated_wal, p);
+  DECODE_FINISH(p);
+}
+
+void bluefs_layout_t::dump(Formatter *f) const
+{  // each field is written through a formatter stream rather than a typed dump call
+  f->dump_stream("shared_bdev") << shared_bdev;
+  f->dump_stream("dedicated_db") << dedicated_db;
+  f->dump_stream("dedicated_wal") << dedicated_wal;
+}
+
+// bluefs_super_t
+
+void bluefs_super_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 1, bl);  // first argument 2 matches the struct_v >= 2 test in decode()
+  encode(uuid, bl);
+  encode(osd_uuid, bl);
+  encode(version, bl);
+  encode(block_size, bl);
+  encode(log_fnode, bl);
+  encode(memorized_layout, bl);  // the field decode() only reads when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void bluefs_super_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  decode(uuid, p);  // fields are read in the order encode() wrote them
+  decode(osd_uuid, p);
+  decode(version, p);
+  decode(block_size, p);
+  decode(log_fnode, p);
+  if (struct_v >= 2) {  // memorized_layout is only read when the encoded version is at least 2
+    decode(memorized_layout, p);
+  }
+  DECODE_FINISH(p);
+}
+
+void bluefs_super_t::dump(Formatter *f) const
+{  // memorized_layout is not part of this dump
+  f->dump_stream("uuid") << uuid;
+  f->dump_stream("osd_uuid") << osd_uuid;
+  f->dump_unsigned("version", version);
+  f->dump_unsigned("block_size", block_size);
+  f->dump_object("log_fnode", log_fnode);
+}
+
+void bluefs_super_t::generate_test_instances(list<bluefs_super_t*>& ls)
+{
+  ls.push_back(new bluefs_super_t);  // default-constructed instance
+  bluefs_super_t* populated = ls.emplace_back(new bluefs_super_t);
+  populated->version = 1;
+  populated->block_size = 4096;
+}
+
+ostream& operator<<(ostream& out, const bluefs_super_t& s)
+{
+  out << "super(uuid " << s.uuid << " osd " << s.osd_uuid;
+  out << " v " << s.version;
+  // the stream is switched to hex here and is still in hex when log_fnode is inserted
+  out << " block_size 0x" << std::hex << s.block_size;
+  out << " log_fnode 0x" << s.log_fnode;
+  return out << std::dec << ")";
+}
+
+// bluefs_fnode_t
+
+mempool::bluefs::vector<bluefs_extent_t>::iterator bluefs_fnode_t::seek(
+  uint64_t offset, uint64_t *x_off)
+{
+  // Returns the extent that logical |offset| falls into and stores the
+  // position inside that extent in |*x_off|; returns extents.end() when the
+  // offset lies past the last extent.
+  auto cur = extents.begin();
+
+  // With more than four index entries, binary-search the precomputed start
+  // offsets to jump to the last extent starting at or before |offset|.
+  if (extents_index.size() > 4) {
+    auto idx = std::upper_bound(extents_index.begin(), extents_index.end(),
+      offset);
+    assert(idx != extents_index.begin());
+    --idx;
+    assert(offset >= *idx);
+    cur += idx - extents_index.begin();
+    offset -= *idx;
+  }
+  // Walk forward, consuming whole extents until |offset| lands inside one.
+  for (; cur != extents.end() && (int64_t) offset >= cur->length; ++cur) {
+    offset -= cur->length;
+  }
+  *x_off = offset;
+  return cur;
+}
+
+bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) {
+  ceph_assert(delta);
+  delta->ino = ino;
+  delta->size = size;
+  delta->mtime = mtime;
+  delta->offset = allocated_commited;
+  delta->extents.clear();
+  if (allocated_commited >= allocated) {
+    return delta;  // nothing allocated beyond the committed point
+  }
+  // Copy everything from the committed point onward; the extent straddling
+  // that point contributes only its tail.
+  uint64_t skip = 0;
+  auto ext = seek(allocated_commited, &skip);
+  ceph_assert(ext != extents.end());
+  if (skip > 0) {
+    ceph_assert(skip < ext->length);
+    delta->extents.emplace_back(ext->bdev, ext->offset + skip, ext->length - skip);
+    ++ext;
+  }
+  delta->extents.insert(delta->extents.end(), ext, extents.end());
+  return delta;
+}
+
+void bluefs_fnode_t::dump(Formatter *f) const
+{  // dumps ino, size, mtime and the extent list; the allocated counters are not included
+  f->dump_unsigned("ino", ino);
+  f->dump_unsigned("size", size);
+  f->dump_stream("mtime") << mtime;
+  f->open_array_section("extents");
+  for (auto& p : extents)  // one "extent" object per entry, in vector order
+    f->dump_object("extent", p);
+  f->close_section();
+}
+
+void bluefs_fnode_t::generate_test_instances(list<bluefs_fnode_t*>& ls)
+{
+  ls.push_back(new bluefs_fnode_t);  // default-constructed instance
+  bluefs_fnode_t* sample = ls.emplace_back(new bluefs_fnode_t);
+  sample->ino = 123;
+  sample->size = 1048576;
+  sample->mtime = utime_t(123,45);
+  sample->extents.emplace_back(0, 1048576, 4096);
+  sample->__unused__ = 1;
+}
+
+ostream& operator<<(ostream& out, const bluefs_fnode_t& file)
+{
+  // byte counts go out in hex, with the stream put back to decimal after each
+  out << "file(ino " << file.ino << " size 0x" << std::hex << file.size << std::dec;
+  out << " mtime " << file.mtime;
+  out << " allocated " << std::hex << file.allocated << std::dec;
+  out << " alloc_commit " << std::hex << file.allocated_commited << std::dec;
+  out << " extents " << file.extents;
+  return out << ")";
+}
+
+// bluefs_fnode_delta_t
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta)
+{
+  // size and offset go out in hex, with the stream restored to decimal after each
+  out << "delta(ino " << delta.ino;
+  out << " size 0x" << std::hex << delta.size << std::dec;
+  out << " mtime " << delta.mtime;
+  out << " offset " << std::hex << delta.offset << std::dec;
+  return out << " extents " << delta.extents << ")";
+}
+
+// bluefs_transaction_t
+
+void bluefs_transaction_t::encode(bufferlist& bl) const
+{
+  uint32_t crc = op_bl.crc32c(-1);  // checksum of the op payload, seeded with -1; decode() recomputes and compares it
+  ENCODE_START(1, 1, bl);
+  encode(uuid, bl);
+  encode(seq, bl);
+  // not using bufferlist encode method, as it merely copies the bufferptr and not
+  // contents, meaning we're left with fragmented target bl
+  __u32 len = op_bl.length();  // length prefix written ahead of the raw payload bytes
+  encode(len, bl);
+  for (auto& it : op_bl.buffers()) {  // copy the payload segment by segment
+    bl.append(it.c_str(),  it.length());
+  }
+  encode(crc, bl);  // checksum follows the payload
+  ENCODE_FINISH(bl);
+}
+
+void bluefs_transaction_t::decode(bufferlist::const_iterator& p)
+{
+  uint32_t crc;  // checksum as stored in the encoded form
+  DECODE_START(1, p);
+  decode(uuid, p);
+  decode(seq, p);
+  decode(op_bl, p);  // reads the length prefix and payload that encode() wrote by hand
+  decode(crc, p);
+  DECODE_FINISH(p);
+  uint32_t actual = op_bl.crc32c(-1);  // recomputed with the same -1 seed used by encode()
+  if (actual != crc)  // stored checksum disagrees with the decoded payload
+    throw ceph::buffer::malformed_input("bad crc " + stringify(actual)
+				  + " expected " + stringify(crc));
+}
+
+void bluefs_transaction_t::dump(Formatter *f) const
+{  // the op payload itself is not dumped, only its length and checksum
+  f->dump_stream("uuid") << uuid;
+  f->dump_unsigned("seq", seq);
+  f->dump_unsigned("op_bl_length", op_bl.length());
+  f->dump_unsigned("crc", op_bl.crc32c(-1));  // computed on the fly with seed -1
+}
+
+void bluefs_transaction_t::generate_test_instances(
+  list<bluefs_transaction_t*>& ls)
+{
+  ls.push_back(new bluefs_transaction_t);  // empty transaction
+  bluefs_transaction_t& txn = *ls.emplace_back(new bluefs_transaction_t);
+  txn.op_init();
+  txn.op_dir_create("dir");
+  txn.op_dir_create("dir2");
+  bluefs_fnode_t fnode;
+  fnode.ino = 2;
+  txn.op_file_update(fnode);
+  txn.op_dir_link("dir", "file1", 2);
+  txn.op_dir_unlink("dir", "file1");
+  txn.op_file_remove(2);
+  txn.op_dir_remove("dir2");
+}
+
+ostream& operator<<(ostream& out, const bluefs_transaction_t& t)
+{
+  out << "txn(seq " << t.seq;
+  out << " len 0x" << std::hex << t.op_bl.length();
+  out << " crc 0x" << t.op_bl.crc32c(-1);  // still in hex from the line above
+  return out << std::dec << ")";
+}
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
new file mode 100644
index 000000000..d5d8ee5a6
--- /dev/null
+++ b/src/os/bluestore/bluefs_types.h
@@ -0,0 +1,339 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+#define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
+
+#include <optional>
+
+#include "bluestore_types.h"
+#include "include/utime.h"
+#include "include/encoding.h"
+#include "include/denc.h"
+
+class bluefs_extent_t {  // one contiguous run of bytes: (bdev, offset, length)
+public:
+  uint64_t offset = 0;
+  uint32_t length = 0;  // 32-bit; append_extent() in bluefs_fnode_t keeps merged lengths below 0xffffffff
+  uint8_t bdev;  // no in-class default; always set by the constructor below
+
+  bluefs_extent_t(uint8_t b = 0, uint64_t o = 0, uint32_t l = 0)  // note the argument order: bdev, offset, length
+    : offset(o), length(l), bdev(b) {}
+
+  uint64_t end() const { return  offset + length; }  // offset one past the last byte of the extent
+  DENC(bluefs_extent_t, v, p) {
+    DENC_START(1, 1, p);
+    denc_lba(v.offset, p);
+    denc_varint_lowz(v.length, p);
+    denc(v.bdev, p);
+    DENC_FINISH(p);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluefs_extent_t*>&);
+};
+WRITE_CLASS_DENC(bluefs_extent_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e);
+
+struct bluefs_fnode_delta_t {  // incremental file update; filled in by bluefs_fnode_t::make_delta()
+  uint64_t ino;  // no default initializer; make_delta() assigns every scalar field
+  uint64_t size;
+  utime_t mtime;
+  uint64_t offset; // Contains offset in file of extents.
+                   // Equal to 'allocated' when created.
+                   // Used for consistency checking.
+  mempool::bluefs::vector<bluefs_extent_t> extents;  // extents from 'offset' onward
+
+  DENC(bluefs_fnode_delta_t, v, p) {
+    DENC_START(1, 1, p);
+    denc_varint(v.ino, p);
+    denc_varint(v.size, p);
+    denc(v.mtime, p);
+    denc(v.offset, p);
+    denc(v.extents, p);
+    DENC_FINISH(p);
+  }
+};
+WRITE_CLASS_DENC(bluefs_fnode_delta_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_delta_t& delta);
+
+struct bluefs_fnode_t {
+  uint64_t ino;
+  uint64_t size;
+  utime_t mtime;
+  uint8_t __unused__ = 0; // was prefer_bdev
+  mempool::bluefs::vector<bluefs_extent_t> extents;
+
+  // precalculated logical offsets for extents vector entries
+  // allows fast lookup for extent index by the offset value via upper_bound()
+  mempool::bluefs::vector<uint64_t> extents_index;
+
+  uint64_t allocated;           // running sum of the extent lengths
+  uint64_t allocated_commited;  // 'allocated' as of the last reset_delta() / recalc_allocated()
+
+  bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(uint64_t _ino, uint64_t _size, utime_t _mtime) :
+    ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(const bluefs_fnode_t& other) :
+    ino(other.ino), size(other.size), mtime(other.mtime),
+    __unused__(other.__unused__), allocated(0),  // start at 0: clone_extents() re-counts every extent
+    allocated_commited(other.allocated_commited) {
+    clone_extents(other);  // rebuilds extents, extents_index and allocated together
+  }
+
+  uint64_t get_allocated() const {
+    return allocated;
+  }
+  // Rebuild 'allocated' and extents_index from scratch out of 'extents'.
+  void recalc_allocated() {
+    allocated = 0;
+    extents_index.clear();  // so a repeated call does not append a second index
+    extents_index.reserve(extents.size());
+    for (auto& p : extents) {
+      extents_index.emplace_back(allocated);
+      allocated += p.length;
+    }
+    allocated_commited = allocated;
+  }
+
+  DENC_HELPERS
+  void bound_encode(size_t& p) const {
+    _denc_friend(*this, p);
+  }
+  void encode(ceph::buffer::list::contiguous_appender& p) const {
+    DENC_DUMP_PRE(bluefs_fnode_t);
+    _denc_friend(*this, p);
+  }
+  void decode(ceph::buffer::ptr::const_iterator& p) {
+    _denc_friend(*this, p);
+    recalc_allocated();  // the counters and index are derived, not encoded
+  }
+  template<typename T, typename P>
+  friend std::enable_if_t<std::is_same_v<bluefs_fnode_t, std::remove_const_t<T>>>
+  _denc_friend(T& v, P& p) {
+    DENC_START(1, 1, p);
+    denc_varint(v.ino, p);
+    denc_varint(v.size, p);
+    denc(v.mtime, p);
+    denc(v.__unused__, p);
+    denc(v.extents, p);
+    DENC_FINISH(p);
+  }
+  void reset_delta() {  // mark everything allocated so far as committed
+    allocated_commited = allocated;
+  }
+  void clone_extents(const bluefs_fnode_t& fnode) {  // appends; existing extents are kept
+    for (const auto& p : fnode.extents) {
+      append_extent(p);
+    }
+  }
+  void claim_extents(mempool::bluefs::vector<bluefs_extent_t>& extents) {  // appends, then empties the argument
+    for (const auto& p : extents) {
+      append_extent(p);
+    }
+    extents.clear();
+  }
+  void append_extent(const bluefs_extent_t& ext) {
+    if (!extents.empty() &&
+	extents.back().end() == ext.offset &&
+	extents.back().bdev == ext.bdev &&
+	(uint64_t)extents.back().length + (uint64_t)ext.length < 0xffffffff) {
+      extents.back().length += ext.length;  // contiguous on the same bdev: grow the last extent
+    } else {
+      extents_index.emplace_back(allocated);  // logical start offset of the new extent
+      extents.push_back(ext);
+    }
+    allocated += ext.length;
+  }
+  // Drops the first extent and shifts the remaining index entries down by its length.
+  void pop_front_extent() {
+    auto it = extents.begin();
+    allocated -= it->length;
+    extents_index.erase(extents_index.begin());
+    for (auto& i: extents_index) {
+      i -= it->length;
+    }
+    extents.erase(it);
+  }
+
+  void swap(bluefs_fnode_t& other) {
+    std::swap(ino, other.ino);
+    std::swap(size, other.size);
+    std::swap(mtime, other.mtime);
+    swap_extents(other);
+  }
+  void swap_extents(bluefs_fnode_t& other) {  // extents, index and both counters move together
+    other.extents.swap(extents);
+    other.extents_index.swap(extents_index);
+    std::swap(allocated, other.allocated);
+    std::swap(allocated_commited, other.allocated_commited);
+  }
+  void clear_extents() {
+    extents_index.clear();
+    extents.clear();
+    allocated = 0;
+    allocated_commited = 0;
+  }
+
+  mempool::bluefs::vector<bluefs_extent_t>::iterator seek(
+    uint64_t off, uint64_t *x_off);
+  bluefs_fnode_delta_t* make_delta(bluefs_fnode_delta_t* delta);
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluefs_fnode_t*>& ls);
+};
+WRITE_CLASS_DENC(bluefs_fnode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_fnode_t& file);
+
+struct bluefs_layout_t {  // which devices a bluefs instance uses; stored in bluefs_super_t as memorized_layout
+  unsigned shared_bdev = 0;         ///< which bluefs bdev we are sharing
+  bool dedicated_db = false;        ///< whether block.db is present
+  bool dedicated_wal = false;       ///< whether block.wal is present
+
+  bool single_shared_device() const {  // true when neither a dedicated db nor a dedicated wal is present
+    return !dedicated_db && !dedicated_wal;
+  }
+
+  bool operator==(const bluefs_layout_t& other) const {  // member-wise comparison of all three fields
+    return shared_bdev == other.shared_bdev &&
+           dedicated_db == other.dedicated_db &&
+           dedicated_wal == other.dedicated_wal;
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(bluefs_layout_t)
+
+/// Superblock record: two uuids, a version, a block size, the log fnode
+/// and an optional remembered device layout.
+struct bluefs_super_t {
+  uuid_d uuid;      ///< unique to this bluefs instance
+  uuid_d osd_uuid;  ///< matches the osd that owns us
+  uint64_t version = 0;
+  uint32_t block_size = 4096;
+
+  bluefs_fnode_t log_fnode;
+
+  std::optional<bluefs_layout_t> memorized_layout;
+
+  // version and block_size get their defaults from the member initializers
+  bluefs_super_t() {}
+
+  /// Returns ~(block_size - 1), with the arithmetic done in 64 bits.
+  uint64_t block_mask() const {
+    const uint64_t bs = block_size;
+    return ~(bs - 1);
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluefs_super_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_super_t)
+
+std::ostream& operator<<(std::ostream&, const bluefs_super_t& s);
+
+
+/// One BlueFS transaction: a filesystem uuid, a sequence number and a
+/// buffer of encoded operations (op_bl).
+///
+/// Every op_*() helper below appends exactly one operation to op_bl: first
+/// the opcode as a single byte (an op_t value cast to __u8), then the
+/// operation's arguments in the order they are passed to encode().
+struct bluefs_transaction_t {
+  /// Operation codes. Only OP_NONE has an explicit value (0); the rest are
+  /// numbered consecutively after it.
+  /// NOTE(review): since these numeric values are what the op_*() helpers
+  /// write into op_bl, entries presumably must only ever be appended, never
+  /// reordered or removed -- confirm against the decoder.
+  typedef enum {
+    OP_NONE = 0,
+    OP_INIT,        ///< initial (empty) file system marker
+    OP_ALLOC_ADD,   ///< OBSOLETE: add extent to available block storage (extent)
+    OP_ALLOC_RM,    ///< OBSOLETE: remove extent from available block storage (extent)
+    OP_DIR_LINK,    ///< (re)set a dir entry (dirname, filename, ino)
+    OP_DIR_UNLINK,  ///< remove a dir entry (dirname, filename)
+    OP_DIR_CREATE,  ///< create a dir (dirname)
+    OP_DIR_REMOVE,  ///< remove a dir (dirname)
+    OP_FILE_UPDATE, ///< set/update file metadata (file)
+    OP_FILE_REMOVE, ///< remove file (ino)
+    OP_JUMP,        ///< jump the seq # and offset
+    OP_JUMP_SEQ,    ///< jump the seq #
+    OP_FILE_UPDATE_INC, ///< incremental update file metadata (file)
+  } op_t;
+
+  uuid_d uuid;          ///< fs uuid
+  uint64_t seq;         ///< sequence number
+  ceph::buffer::list op_bl;     ///< encoded transaction ops
+
+  /// Starts with seq == 0; uuid and op_bl are default-constructed.
+  bluefs_transaction_t() : seq(0) {}
+
+  /// Reset this object to the state produced by the default constructor.
+  void clear() {
+    *this = bluefs_transaction_t();
+  }
+  /// True when op_bl has zero length, i.e. no operation has been appended.
+  bool empty() const {
+    return op_bl.length() == 0;
+  }
+
+  /// Append OP_INIT (opcode only, no arguments).
+  void op_init() {
+    using ceph::encode;
+    encode((__u8)OP_INIT, op_bl);
+  }
+  /// Append OP_DIR_CREATE followed by the directory name.
+  void op_dir_create(std::string_view dir) {
+    using ceph::encode;
+    encode((__u8)OP_DIR_CREATE, op_bl);
+    encode(dir, op_bl);
+  }
+  /// Append OP_DIR_REMOVE followed by the directory name.
+  void op_dir_remove(std::string_view dir) {
+    using ceph::encode;
+    encode((__u8)OP_DIR_REMOVE, op_bl);
+    encode(dir, op_bl);
+  }
+  /// Append OP_DIR_LINK followed by directory name, file name and ino.
+  void op_dir_link(std::string_view dir, std::string_view file, uint64_t ino) {
+    using ceph::encode;
+    encode((__u8)OP_DIR_LINK, op_bl);
+    encode(dir, op_bl);
+    encode(file, op_bl);
+    encode(ino, op_bl);
+  }
+  /// Append OP_DIR_UNLINK followed by directory name and file name.
+  void op_dir_unlink(std::string_view dir, std::string_view file) {
+    using ceph::encode;
+    encode((__u8)OP_DIR_UNLINK, op_bl);
+    encode(dir, op_bl);
+    encode(file, op_bl);
+  }
+  /// Append OP_FILE_UPDATE followed by the fully encoded fnode, then call
+  /// file.reset_delta() on the caller's fnode (hence the non-const
+  /// reference).
+  void op_file_update(bluefs_fnode_t& file) {
+    using ceph::encode;
+    encode((__u8)OP_FILE_UPDATE, op_bl);
+    encode(file, op_bl);
+    file.reset_delta();
+  }
+  /* streams update to bufferlist and clears update state */
+  /// Builds a bluefs_fnode_delta_t through file.make_delta(), appends
+  /// OP_FILE_UPDATE_INC followed by that delta, then calls
+  /// file.reset_delta().
+  void op_file_update_inc(bluefs_fnode_t& file) {
+    using ceph::encode;
+    bluefs_fnode_delta_t delta;
+    file.make_delta(&delta);
+    encode((__u8)OP_FILE_UPDATE_INC, op_bl);
+    encode(delta, op_bl);
+    file.reset_delta();
+  }
+  /// Append OP_FILE_REMOVE followed by the ino.
+  void op_file_remove(uint64_t ino) {
+    using ceph::encode;
+    encode((__u8)OP_FILE_REMOVE, op_bl);
+    encode(ino, op_bl);
+  }
+  /// Append OP_JUMP followed by next_seq and offset.
+  void op_jump(uint64_t next_seq, uint64_t offset) {
+    using ceph::encode;
+    encode((__u8)OP_JUMP, op_bl);
+    encode(next_seq, op_bl);
+    encode(offset, op_bl);
+  }
+  /// Append OP_JUMP_SEQ followed by next_seq.
+  void op_jump_seq(uint64_t next_seq) {
+    using ceph::encode;
+    encode((__u8)OP_JUMP_SEQ, op_bl);
+    encode(next_seq, op_bl);
+  }
+  /// Append the encoded ops of `from` to this transaction's op_bl using
+  /// claim_append(). Only op_bl is touched; uuid and seq are not.
+  /// NOTE(review): claim_append presumably moves the buffers out of
+  /// from.op_bl, leaving it empty -- confirm.
+  void claim_ops(bluefs_transaction_t& from) {
+    op_bl.claim_append(from.op_bl);
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluefs_transaction_t*>& ls);
+};
+WRITE_CLASS_ENCODER(bluefs_transaction_t)
+
+std::ostream& operator<<(std::ostream& out, const bluefs_transaction_t& t);
+#endif
diff --git a/src/os/bluestore/bluestore_common.h b/src/os/bluestore/bluestore_common.h
new file mode 100644
index 000000000..f61a5dcfd
--- /dev/null
+++ b/src/os/bluestore/bluestore_common.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_COMMON_H
+#define CEPH_OSD_BLUESTORE_COMMON_H
+
+#include "include/intarith.h"
+#include "include/ceph_assert.h"
+#include "kv/KeyValueDB.h"
+
+/// Call f(pos, bitset) once for each granule index touched by the byte
+/// range [off, off + len).
+///
+/// The first index is off / granularity. The exclusive end index is
+/// off + len rounded up to a multiple of granularity, divided by
+/// granularity. Asserts that the end index does not exceed bitset.size().
+template <class Bitset, class Func>
+void apply_for_bitset_range(uint64_t off,
+  uint64_t len,
+  uint64_t granularity,
+  Bitset &bitset,
+  Func f) {
+  auto end = round_up_to(off + len, granularity) / granularity;
+  ceph_assert(end <= bitset.size());
+  for (uint64_t pos = off / granularity; pos < end; ++pos) {
+    f(pos, bitset);
+  }
+}
+
+// merge operators
+
+/// Merge operator that treats both operands as equally sized arrays of
+/// ceph_le64 values and produces their element-wise sum.
+struct Int64ArrayMergeOperator : public KeyValueDB::MergeOperator {
+  /// No existing value: the new value is a copy of the right operand.
+  void merge_nonexistent(
+    const char *rdata, size_t rlen, std::string *new_value) override {
+    *new_value = std::string(rdata, rlen);
+  }
+  /// Element-wise add the left and right operands into new_value.
+  /// Both operands must have the same length, and that length must be a
+  /// multiple of 8 bytes (asserted). Zero-length operands yield an empty
+  /// result.
+  void merge(
+    const char *ldata, size_t llen,
+    const char *rdata, size_t rlen,
+    std::string *new_value) override {
+    ceph_assert(llen == rlen);
+    ceph_assert((rlen % 8) == 0);
+    new_value->resize(rlen);
+    const ceph_le64* lv = (const ceph_le64*)ldata;
+    const ceph_le64* rv = (const ceph_le64*)rdata;
+    // data() is valid on an empty string; at(0) would throw
+    // std::out_of_range when rlen == 0.
+    ceph_le64* nv = (ceph_le64*)new_value->data();
+    const size_t count = rlen >> 3;
+    for (size_t i = 0; i < count; ++i) {
+      nv[i] = lv[i] + rv[i];
+    }
+  }
+  // We use each operator name and each prefix to construct the
+  // overall RocksDB operator name for consistency check at open time.
+  const char *name() const override {
+    return "int64_array";
+  }
+};
+
+#endif
diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc
new file mode 100644
index 000000000..e63616bdd
--- /dev/null
+++ b/src/os/bluestore/bluestore_tool.cc
@@ -0,0 +1,1158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <boost/program_options/variables_map.hpp>
+#include <boost/program_options/parsers.hpp>
+
+#include <stdio.h>
+#include <string.h>
+#include <filesystem>
+#include <iostream>
+#include <fstream>
+#include <time.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "global/global_init.h"
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#include "os/bluestore/BlueFS.h"
+#include "os/bluestore/BlueStore.h"
+#include "common/admin_socket.h"
+#include "kv/RocksDBStore.h"
+
+using namespace std;
+namespace fs = std::filesystem;
+namespace po = boost::program_options;
+
+/// Write the given option description to stdout, followed by a newline
+/// and a flush.
+void usage(po::options_description &desc)
+{
+  cout << desc;
+  cout << std::endl;
+}
+
+/// Check the metadata stored under `path` and exit(EXIT_FAILURE) with a
+/// message on stderr if it does not look as expected.
+///
+/// Always requires meta "type" to be "bluestore". When `bluefs` is true it
+/// additionally requires "kv_backend" to be "rocksdb" and "bluefs" to be
+/// "1".
+void validate_path(CephContext *cct, const string& path, bool bluefs)
+{
+  BlueStore bluestore(cct, path);
+
+  // Fetch one meta value; on failure print `err_prefix` plus the error
+  // text and terminate.
+  auto must_read_meta = [&bluestore](const char *key, const char *err_prefix) {
+    string val;
+    const int rc = bluestore.read_meta(key, &val);
+    if (rc < 0) {
+      cerr << err_prefix << cpp_strerror(rc) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    return val;
+  };
+
+  const string type = must_read_meta("type", "failed to load os-type: ");
+  if (type != "bluestore") {
+    cerr << "expected bluestore, but type is " << type << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  if (!bluefs) {
+    return;
+  }
+
+  const string kv_backend =
+    must_read_meta("kv_backend", "failed to load kv_backend: ");
+  if (kv_backend != "rocksdb") {
+    cerr << "expect kv_backend to be rocksdb, but is " << kv_backend
+         << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  const string bluefs_enabled =
+    must_read_meta("bluefs", "failed to load do_bluefs: ");
+  if (bluefs_enabled != "1") {
+    cerr << "bluefs not enabled for rocksdb" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+/// Return the path of the first device in `devs` whose label description
+/// corresponds to slot `id` ("main" for BDEV_SLOW, "bluefs db" for BDEV_DB,
+/// "bluefs wal" for BDEV_WAL), or nullptr when none matches.
+///
+/// The returned pointer refers to storage owned by the matching element of
+/// `devs`. Exits the process if any inspected label cannot be read.
+const char* find_device_path(
+  int id,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  auto slot_matches = [id](const auto& desc) {
+    if (id == BlueFS::BDEV_SLOW && desc == "main") {
+      return true;
+    }
+    if (id == BlueFS::BDEV_DB && desc == "bluefs db") {
+      return true;
+    }
+    return id == BlueFS::BDEV_WAL && desc == "bluefs wal";
+  };
+
+  for (const auto& dev : devs) {
+    bluestore_bdev_label_t label;
+    const int rc = BlueStore::_read_bdev_label(cct, dev, &label);
+    if (rc < 0) {
+      cerr << "unable to read label for " << dev << ": "
+           << cpp_strerror(rc) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (slot_matches(label.description)) {
+      return dev.c_str();
+    }
+  }
+  return nullptr;
+}
+
+/// Read the label of every device in `devs` and record a device -> slot
+/// mapping in `got`.
+///
+/// "bluefs db" devices map to BDEV_DB and "bluefs wal" devices to BDEV_WAL.
+/// The last device labelled "main" is added after the loop: as BDEV_SLOW
+/// if a DB device was seen, otherwise as BDEV_DB. `has_db` / `has_wal` are
+/// optional outputs (may be null) telling whether such a device was seen.
+/// Exits the process if a label cannot be read.
+void parse_devices(
+  CephContext *cct,
+  const vector<string>& devs,
+  map<string, int>* got,
+  bool* has_db,
+  bool* has_wal)
+{
+  if (has_wal) {
+    *has_wal = false;
+  }
+  if (has_db) {
+    *has_db = false;
+  }
+
+  string main_dev;
+  bool saw_db = false;
+  for (const auto& dev : devs) {
+    bluestore_bdev_label_t label;
+    const int rc = BlueStore::_read_bdev_label(cct, dev, &label);
+    if (rc < 0) {
+      cerr << "unable to read label for " << dev << ": "
+           << cpp_strerror(rc) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    if (label.description == "main") {
+      // slot for the main device is decided once all labels are known
+      main_dev = dev;
+      continue;
+    }
+
+    int slot = -1;
+    if (label.description == "bluefs db") {
+      slot = BlueFS::BDEV_DB;
+      saw_db = true;
+      if (has_db) {
+        *has_db = true;
+      }
+    } else if (label.description == "bluefs wal") {
+      slot = BlueFS::BDEV_WAL;
+      if (has_wal) {
+        *has_wal = true;
+      }
+    }
+    if (slot >= 0) {
+      got->emplace(dev, slot);
+    }
+  }
+
+  if (!main_dev.empty()) {
+    int slot;
+    if (saw_db) {
+      slot = BlueFS::BDEV_SLOW;
+    } else {
+      slot = BlueFS::BDEV_DB;
+    }
+    got->emplace(main_dev, slot);
+  }
+}
+
+/// Classify `devs` with parse_devices() and register each resulting
+/// (path, slot) pair with `fs` via add_block_device().
+///
+/// For every device a line " slot <id> <path>[ -> <resolved path>]" is
+/// printed to stdout. A realpath() failure is reported on stderr but is
+/// not fatal; a failing add_block_device() terminates the process.
+void add_devices(
+  BlueFS *fs,
+  CephContext *cct,
+  const vector<string>& devs)
+{
+  map<string, int> got;
+  parse_devices(cct, devs, &got, nullptr, nullptr);
+  for (const auto& [dev_path, slot] : got) {
+    char target_path[PATH_MAX] = "";
+    if (!dev_path.empty()) {
+      if (realpath(dev_path.c_str(), target_path) == nullptr) {
+        cerr << "failed to retrieve absolute path for " << dev_path
+             << ": " << cpp_strerror(errno)
+             << std::endl;
+        // The buffer contents are undefined after a failed realpath();
+        // clear it so no bogus "-> target" is printed below.
+        target_path[0] = '\0';
+      }
+    }
+
+    cout << " slot " << slot << " " << dev_path;
+    if (target_path[0]) {
+      cout << " -> " << target_path;
+    }
+    cout << std::endl;
+
+    // We provide no shared allocator which prevents bluefs to operate in R/W mode.
+    // Read-only mode isn't strictly enforced though
+    int r = fs->add_block_device(slot, dev_path, false, 0); // 'reserved' is fake
+    if (r < 0) {
+      cerr << "unable to open " << dev_path << ": " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+}
+
+/// Validate `path`, create a BlueFS instance, attach `devs` to it and mount
+/// it. Returns the heap-allocated instance, which the caller must delete.
+/// Exits the process if mounting fails.
+BlueFS *open_bluefs_readonly(
+  CephContext *cct,
+  const string& path,
+  const vector<string>& devs)
+{
+  validate_path(cct, path, true);
+
+  BlueFS *bluefs = new BlueFS(cct);
+  add_devices(bluefs, cct, devs);
+
+  const int rc = bluefs->mount();
+  if (rc >= 0) {
+    return bluefs;
+  }
+  cerr << "unable to mount bluefs: " << cpp_strerror(rc)
+       << std::endl;
+  exit(EXIT_FAILURE);
+}
+
+/// Validate `path`, attach `devs` to a temporary BlueFS instance and run
+/// its log_dump(). Exits the process if the dump reports an error.
+void log_dump(
+  CephContext *cct,
+  const string& path,
+  const vector<string>& devs)
+{
+  validate_path(cct, path, true);
+
+  BlueFS *bluefs = new BlueFS(cct);
+  add_devices(bluefs, cct, devs);
+
+  const int rc = bluefs->log_dump();
+  if (rc < 0) {
+    cerr << "log_dump failed" << ": "
+         << cpp_strerror(rc) << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  delete bluefs;
+}
+
+/// Append to `devs` each of <path>/block, <path>/block.wal and
+/// <path>/block.db (in that order) for which stat() succeeds.
+void inferring_bluefs_devices(vector<string>& devs, std::string& path)
+{
+  cout << "inferring bluefs devices from bluestore path" << std::endl;
+  for (const char* name : {"block", "block.wal", "block.db"}) {
+    const string candidate = path + "/" + name;
+    struct stat sb;
+    if (::stat(candidate.c_str(), &sb) != 0) {
+      continue;
+    }
+    devs.push_back(candidate);
+  }
+}
+
+/// Copy the host file `input_file` into BlueFS as `dest_file`.
+///
+/// `dest_file` is split into its parent directory and file name, which are
+/// passed to BlueFS::open_for_write(). The input is streamed in 4 KiB
+/// chunks until end of file. The writer is fsynced and closed and the DB
+/// environment is shut down before returning.
+///
+/// Exits with EXIT_FAILURE if the input cannot be opened, the DB
+/// environment cannot be prepared, the destination cannot be opened for
+/// writing, or reading the input hits an I/O error.
+/// `devs` is accepted for signature compatibility and is not used.
+static void bluefs_import(
+  const string& input_file,
+  const string& dest_file,
+  CephContext *cct,
+  const string& path,
+  const vector<string>& devs)
+{
+  int r;
+  std::ifstream f(input_file.c_str(), std::ifstream::binary);
+  if (!f) {
+    r = -errno;
+    cerr << "open " << input_file.c_str() << " failed: " << cpp_strerror(r) << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  BlueStore bluestore(cct, path);
+  KeyValueDB *db_ptr = nullptr;
+  r = bluestore.open_db_environment(&db_ptr, false);
+  if (r < 0) {
+    cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  BlueFS* bs = bluestore.get_bluefs();
+
+  BlueFS::FileWriter *h = nullptr;
+  fs::path file_path(dest_file);
+  const string dir = file_path.parent_path().native();
+  const string file_name = file_path.filename().native();
+  // Without this check a failed open would leave `h` unusable and the
+  // append below would dereference it.
+  r = bs->open_for_write(dir, file_name, &h, false);
+  if (r < 0) {
+    cerr << "open_for_write " << dest_file << " failed: "
+         << cpp_strerror(r) << std::endl;
+    bluestore.close_db_environment();
+    exit(EXIT_FAILURE);
+  }
+
+  // Fixed-size buffer (no variable-length array). Stream until EOF and
+  // append only the bytes each read actually delivered.
+  constexpr size_t max_block = 4096;
+  char buf[max_block];
+  while (f) {
+    f.read(buf, sizeof(buf));
+    const std::streamsize got = f.gcount();
+    if (got > 0) {
+      h->append(buf, static_cast<size_t>(got));
+    }
+  }
+  // badbit marks a real I/O error, as opposed to reaching end of file.
+  const bool read_failed = f.bad();
+  f.close();
+  bs->fsync(h);
+  bs->close_writer(h);
+  bluestore.close_db_environment();
+  if (read_failed) {
+    cerr << "read " << input_file << " failed" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  return;
+}
+
+int main(int argc, char **argv)
+{
+  string out_dir;
+  string osd_instance;
+  vector<string> devs;
+  vector<string> devs_source;
+  string dev_target;
+  string path;
+  string action;
+  string log_file;
+  string input_file;
+  string dest_file;
+  string key, value;
+  vector<string> allocs_name;
+  string empty_sharding(1, '\0');
+  string new_sharding = empty_sharding;
+  string resharding_ctrl;
+  int log_level = 30;
+  bool fsck_deep = false;
+  po::options_description po_options("Options");
+  po_options.add_options()
+    ("help,h", "produce help message")
+    (",i", po::value<string>(&osd_instance), "OSD instance. Requires access to monitor/ceph.conf")
+    ("path", po::value<string>(&path), "bluestore path")
+    ("out-dir", po::value<string>(&out_dir), "output directory")
+    ("input-file", po::value<string>(&input_file), "import file")
+    ("dest-file", po::value<string>(&dest_file), "destination file")
+    ("log-file,l", po::value<string>(&log_file), "log file")
+    ("log-level", po::value<int>(&log_level), "log level (30=most, 20=lots, 10=some, 1=little)")
+    ("dev", po::value<vector<string>>(&devs), "device(s)")
+    ("devs-source", po::value<vector<string>>(&devs_source), "bluefs-dev-migrate source device(s)")
+    ("dev-target", po::value<string>(&dev_target), "target/resulting device")
+    ("deep", po::value<bool>(&fsck_deep), "deep fsck (read all data)")
+    ("key,k", po::value<string>(&key), "label metadata key name")
+    ("value,v", po::value<string>(&value), "label metadata value")
+    ("allocator", po::value<vector<string>>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'")
+    ("sharding", po::value<string>(&new_sharding), "new sharding to apply")
+    ("resharding-ctrl", po::value<string>(&resharding_ctrl), "gives control over resharding procedure details")
+    ;
+  po::options_description po_positional("Positional options");
+  po_positional.add_options()
+    ("command", po::value<string>(&action),
+        "fsck, "
+        "qfsck, "
+        "allocmap, "
+        "restore_cfb, "
+        "repair, "
+        "quick-fix, "
+        "bluefs-export, "
+        "bluefs-import, "
+        "bluefs-bdev-sizes, "
+        "bluefs-bdev-expand, "
+        "bluefs-bdev-new-db, "
+        "bluefs-bdev-new-wal, "
+        "bluefs-bdev-migrate, "
+        "show-label, "
+        "set-label-key, "
+        "rm-label-key, "
+        "prime-osd-dir, "
+        "bluefs-log-dump, "
+        "free-dump, "
+        "free-score, "
+        "free-fragmentation, "
+        "bluefs-stats, "
+        "reshard, "
+        "show-sharding")
+    ;
+  po::options_description po_all("All options");
+  po_all.add(po_options).add(po_positional);
+
+  vector<string> ceph_option_strings;
+  po::variables_map vm;
+  try {
+    po::parsed_options parsed =
+      po::command_line_parser(argc, argv).options(po_all).allow_unregistered().run();
+    po::store( parsed, vm);
+    po::notify(vm);
+    ceph_option_strings = po::collect_unrecognized(parsed.options,
+						   po::include_positional);
+  } catch(po::error &e) {
+    std::cerr << e.what() << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // normalize path (remove ending '/' if any)
+  if (path.size() > 1 && *(path.end() - 1) == '/') {
+    path.resize(path.size() - 1);
+  }
+  if (vm.count("help")) {
+    usage(po_all);
+    exit(EXIT_SUCCESS);
+  }
+
+  vector<const char*> args;
+  if (log_file.size()) {
+    args.push_back("--log-file");
+    args.push_back(log_file.c_str());
+    static char ll[10];
+    snprintf(ll, sizeof(ll), "%d", log_level);
+    args.push_back("--debug-bluestore");
+    args.push_back(ll);
+    args.push_back("--debug-bluefs");
+    args.push_back(ll);
+    args.push_back("--debug-rocksdb");
+    args.push_back(ll);
+  } else {
+    // do not write to default-named log "osd.x.log" if --log-file is not provided
+    if (!osd_instance.empty()) {
+      args.push_back("--no-log-to-file");
+    }
+  }
+
+  if (!osd_instance.empty()) {
+    args.push_back("-i");
+    args.push_back(osd_instance.c_str());
+  }
+  args.push_back("--no-log-to-stderr");
+  args.push_back("--err-to-stderr");
+
+  for (auto& i : ceph_option_strings) {
+    args.push_back(i.c_str());
+  }
+  auto cct = global_init(NULL, args, osd_instance.empty() ? CEPH_ENTITY_TYPE_CLIENT : CEPH_ENTITY_TYPE_OSD,
+			 CODE_ENVIRONMENT_UTILITY,
+			 osd_instance.empty() ? CINIT_FLAG_NO_DEFAULT_CONFIG_FILE : 0);
+
+  common_init_finish(cct.get());
+  if (action.empty()) {
+    // if action ("command") is not yet defined try to use first param as action
+    if (args.size() > 0) {
+      if (args.size() == 1) {
+	// treat first unparsed value as action
+	action = args[0];
+      } else {
+	std::cerr << "Unknown options: " << args << std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    if (args.size() != 0) {
+      std::cerr << "Unknown options: " << args << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  if (action.empty()) {
+    cerr << "must specify an action; --help for help" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  if (!osd_instance.empty()) {
+    // when "-i" is provided "osd data" can be used as path
+    if (path.size() == 0) {
+      path = cct->_conf.get_val<std::string>("osd_data");
+    }
+  }
+
+  if (action == "fsck" || action == "repair" || action == "quick-fix" || action == "allocmap" || action == "qfsck" || action == "restore_cfb") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (action == "prime-osd-dir") {
+    if (devs.size() != 1) {
+      cerr << "must specify the main bluestore device" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (path.empty()) {
+      cerr << "must specify osd dir to prime" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (action == "set-label-key" ||
+      action == "rm-label-key") {
+    if (devs.size() != 1) {
+      cerr << "must specify the main bluestore device" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (key.size() == 0) {
+      cerr << "must specify a key name with -k" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (action == "set-label-key" && value.size() == 0) {
+      cerr << "must specify a value with -v" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (action == "show-label") {
+    if (devs.empty() && path.empty()) {
+      cerr << "must specify bluestore path *or* raw device(s)" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (devs.empty())
+      inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-export" || 
+      action == "bluefs-import" || 
+      action == "bluefs-log-dump") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if ((action == "bluefs-export") && out_dir.empty()) {
+      cerr << "must specify out-dir to export bluefs" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (action == "bluefs-import" && input_file.empty()) {
+      cerr << "must specify input_file to import bluefs" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (action == "bluefs-import" && dest_file.empty()) {
+      cerr << "must specify dest_file to import bluefs" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-bdev-sizes" || action == "bluefs-bdev-expand") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cout << "NOTICE: --dev-target option omitted, will allocate as a file" << std::endl;
+    }
+    inferring_bluefs_devices(devs, path);
+  }
+  if (action == "bluefs-bdev-migrate") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    inferring_bluefs_devices(devs, path);
+    if (devs_source.size() == 0) {
+      cerr << "must specify source devices with --devs-source" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (dev_target.empty()) {
+      cerr << "must specify target device with --dev-target" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  if (action == "free-score" || action == "free-dump" || action == "free-fragmentation") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    for (auto name : allocs_name) {
+      if (!name.empty() &&
+          name != "block" &&
+          name != "bluefs-db" &&
+          name != "bluefs-wal") {
+        cerr << "unknown allocator '" << name << "'" << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+    if (allocs_name.empty())
+      allocs_name = vector<string>{"block", "bluefs-db", "bluefs-wal"};
+  }
+  if (action == "reshard") {
+    if (path.empty()) {
+      cerr << "must specify bluestore path" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (new_sharding == empty_sharding) {
+      cerr << "must provide reshard specification" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  if (action == "restore_cfb") {
+#ifndef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+    cerr << action << " bluestore.restore_cfb is not supported!!! " << std::endl;
+    exit(EXIT_FAILURE);
+#else
+    cout << action << " bluestore.restore_cfb" << std::endl;
+    validate_path(cct.get(), path, false);
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.push_allocation_to_rocksdb();
+    if (r < 0) {
+      cerr << action << " failed: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      cout << action << " success" << std::endl;
+    }
+#endif
+  }
+  else if (action == "allocmap") {
+#ifdef CEPH_BLUESTORE_TOOL_DISABLE_ALLOCMAP
+    cerr << action << " bluestore.allocmap is not supported!!! " << std::endl;
+    exit(EXIT_FAILURE);
+#else
+    cout << action << " bluestore.allocmap" << std::endl;
+    validate_path(cct.get(), path, false);
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.read_allocation_from_drive_for_bluestore_tool();
+    if (r < 0) {
+      cerr << action << " failed: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      cout << action << " success" << std::endl;
+    }
+#endif
+  }
+  else if( action == "qfsck" ) {
+#ifndef CEPH_BLUESTORE_TOOL_RESTORE_ALLOCATION
+    cerr << action << " bluestore.qfsck is not supported!!! " << std::endl;
+    exit(EXIT_FAILURE);
+#else
+    cout << action << " bluestore.quick-fsck" << std::endl;
+    validate_path(cct.get(), path, false);
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.read_allocation_from_drive_for_bluestore_tool();
+    if (r < 0) {
+      cerr << action << " failed: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      cout << action << " success" << std::endl;
+    }
+#endif
+  }
+  else if (action == "fsck" ||
+      action == "repair" ||
+      action == "quick-fix") {
+    validate_path(cct.get(), path, false);
+    BlueStore bluestore(cct.get(), path);
+    int r;
+    if (action == "fsck") {
+      r = bluestore.fsck(fsck_deep);
+    } else if (action == "repair") {
+      r = bluestore.repair(fsck_deep);
+    } else {
+      r = bluestore.quick_fix();
+    }
+    if (r < 0) {
+      cerr << action << " failed: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (r > 0) {
+      cerr << action << " status: remaining " << r << " error(s) and warning(s)" << std::endl;
+      exit(EXIT_FAILURE);
+    } else {
+      cout << action << " success" << std::endl;
+    }
+  }
+  else if (action == "prime-osd-dir") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+    if (r < 0) {
+      cerr << "failed to read label for " << devs.front() << ": "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    // kludge some things into the map that we want to populate into
+    // target dir
+    label.meta["path_block"] = devs.front();
+    label.meta["type"] = "bluestore";
+    label.meta["fsid"] = stringify(label.osd_uuid);
+    
+    for (auto kk : {
+	"whoami",
+	  "osd_key",
+	  "ceph_fsid",
+	  "fsid",
+	  "type",
+	  "ready" }) {
+      string k = kk;
+      auto i = label.meta.find(k);
+      if (i == label.meta.end()) {
+	continue;
+      }
+      string p = path + "/" + k;
+      string v = i->second;
+      if (k == "osd_key") {
+	p = path + "/keyring";
+	v = "[osd.";
+	v += label.meta["whoami"];
+	v += "]\nkey = " + i->second;
+      }
+      v += "\n";
+      int fd = ::open(p.c_str(), O_CREAT|O_TRUNC|O_WRONLY|O_CLOEXEC, 0600);
+      if (fd < 0) {
+	cerr << "error writing " << p << ": " << cpp_strerror(errno)
+	     << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      int r = safe_write(fd, v.c_str(), v.size());
+      if (r < 0) {
+	cerr << "error writing to " << p << ": " << cpp_strerror(errno)
+	     << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      ::close(fd);
+    }
+  }
+  else if (action == "show-label") {
+    JSONFormatter jf(true);
+    jf.open_object_section("devices");
+    for (auto& i : devs) {
+      bluestore_bdev_label_t label;
+      int r = BlueStore::_read_bdev_label(cct.get(), i, &label);
+      if (r < 0) {
+	cerr << "unable to read label for " << i << ": "
+	     << cpp_strerror(r) << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      jf.open_object_section(i.c_str());
+      label.dump(&jf);
+      jf.close_section();
+    }
+    jf.close_section();
+    jf.flush(cout);
+  }
+  else if (action == "set-label-key") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+    if (r < 0) {
+      cerr << "unable to read label for " << devs.front() << ": "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (key == "size") {
+      label.size = strtoull(value.c_str(), nullptr, 10);
+    } else if (key =="osd_uuid") {
+      label.osd_uuid.parse(value.c_str());
+    } else if (key =="btime") {
+      uint64_t epoch;
+      uint64_t nsec;
+      int r = utime_t::parse_date(value.c_str(), &epoch, &nsec);
+      if (r == 0) {
+	label.btime = utime_t(epoch, nsec);
+      }
+    } else if (key =="description") {
+      label.description = value;
+    } else {
+      label.meta[key] = value;
+    }
+    r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+    if (r < 0) {
+      cerr << "unable to write label for " << devs.front() << ": "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  else if (action == "rm-label-key") {
+    bluestore_bdev_label_t label;
+    int r = BlueStore::_read_bdev_label(cct.get(), devs.front(), &label);
+    if (r < 0) {
+      cerr << "unable to read label for " << devs.front() << ": "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    if (!label.meta.count(key)) {
+      cerr << "key '" << key << "' not present" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    label.meta.erase(key);
+    r = BlueStore::_write_bdev_label(cct.get(), devs.front(), label);
+    if (r < 0) {
+      cerr << "unable to write label for " << devs.front() << ": "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  else if (action == "bluefs-bdev-sizes") {
+    BlueStore bluestore(cct.get(), path);
+    bluestore.dump_bluefs_sizes(cout);
+  }
+  else if (action == "bluefs-bdev-expand") {
+    BlueStore bluestore(cct.get(), path);
+    auto r = bluestore.expand_devices(cout);
+    if (r <0) {
+      cerr << "failed to expand bluestore devices: "
+	   << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  }
+  else if (action == "bluefs-import") {
+    bluefs_import(input_file, dest_file, cct.get(), path, devs);
+  }
+  else if (action == "bluefs-export") {
+    BlueFS *fs = open_bluefs_readonly(cct.get(), path, devs);
+
+    vector<string> dirs;
+    int r = fs->readdir("", &dirs);
+    if (r < 0) {
+      cerr << "readdir in root failed: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    if (::access(out_dir.c_str(), F_OK)) {
+      r = ::mkdir(out_dir.c_str(), 0755);
+      if (r < 0) {
+        r = -errno;
+        cerr << "mkdir " << out_dir << " failed: " << cpp_strerror(r) << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+
+    for (auto& dir : dirs) {
+      if (dir[0] == '.')
+	continue;
+      cout << dir << "/" << std::endl;
+      vector<string> ls;
+      r = fs->readdir(dir, &ls);
+      if (r < 0) {
+	cerr << "readdir " << dir << " failed: " << cpp_strerror(r) << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      string full = out_dir + "/" + dir;
+      if (::access(full.c_str(), F_OK)) {
+        r = ::mkdir(full.c_str(), 0755);
+        if (r < 0) {
+          r = -errno;
+          cerr << "mkdir " << full << " failed: " << cpp_strerror(r) << std::endl;
+          exit(EXIT_FAILURE);
+        }
+      }
+      for (auto& file : ls) {
+	if (file[0] == '.')
+	  continue;
+	cout << dir << "/" << file << std::endl;
+	uint64_t size;
+	utime_t mtime;
+	r = fs->stat(dir, file, &size, &mtime);
+	if (r < 0) {
+	  cerr << "stat " << file << " failed: " << cpp_strerror(r) << std::endl;
+	  exit(EXIT_FAILURE);
+	}
+	string path = out_dir + "/" + dir + "/" + file;
+	int fd = ::open(path.c_str(), O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0644);
+	if (fd < 0) {
+	  r = -errno;
+	  cerr << "open " << path << " failed: " << cpp_strerror(r) << std::endl;
+	  exit(EXIT_FAILURE);
+	}
+	if (size > 0) {
+	  BlueFS::FileReader *h;
+	  r = fs->open_for_read(dir, file, &h, false);
+	  if (r < 0) {
+	    cerr << "open_for_read " << dir << "/" << file << " failed: "
+		 << cpp_strerror(r) << std::endl;
+	    exit(EXIT_FAILURE);
+	  }
+	  int pos = 0;
+	  int left = size;
+	  while (left) {
+	    bufferlist bl;
+	    r = fs->read(h, pos, left, &bl, NULL);
+	    if (r <= 0) {
+	      cerr << "read " << dir << "/" << file << " from " << pos
+		   << " failed: " << cpp_strerror(r) << std::endl;
+	      exit(EXIT_FAILURE);
+	    }
+	    int rc = bl.write_fd(fd);
+	    if (rc < 0) {
+	      cerr << "write to " << path << " failed: "
+		   << cpp_strerror(r) << std::endl;
+	      exit(EXIT_FAILURE);
+	    }
+	    pos += r;
+	    left -= r;
+	  }
+	  delete h;
+	}
+	::close(fd);
+      }
+    }
+    fs->umount();
+    delete fs;
+  } else if (action == "bluefs-log-dump") {
+    log_dump(cct.get(), path, devs);
+  } else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
+    map<string, int> cur_devs_map;
+    bool need_db = action == "bluefs-bdev-new-db";
+
+    bool has_wal = false;
+    bool has_db = false;
+
+    parse_devices(cct.get(), devs, &cur_devs_map, &has_db, &has_wal);
+
+    if (has_db && has_wal) {
+      cerr << "can't allocate new device, both WAL and DB exist"
+	    << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (need_db && has_db) {
+      cerr << "can't allocate new DB device, already exists"
+	    << std::endl;
+      exit(EXIT_FAILURE);
+    } else if (!need_db && has_wal) {
+      cerr << "can't allocate new WAL device, already exists"
+	    << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    auto [target_path, has_size_spec] =
+      [&dev_target]() -> std::pair<string, bool> {
+      if (dev_target.empty()) {
+	return {"", false};
+      }
+      std::error_code ec;
+      fs::path target = fs::weakly_canonical(fs::path{dev_target}, ec);
+      if (ec) {
+	cerr << "failed to retrieve absolute path for " << dev_target
+	     << ": " << ec.message()
+	     << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      return {target.native(),
+              fs::exists(target) &&
+               (fs::is_block_file(target) ||
+	         (fs::is_regular_file(target) && fs::file_size(target) > 0))};
+    }();
+    // Attach either DB or WAL volume, create if needed
+    // check if we need additional size specification
+    if (!has_size_spec) {
+      if (need_db && cct->_conf->bluestore_block_db_size == 0) {
+	cerr << "Might need DB size specification, "
+		"please set Ceph bluestore-block-db-size config parameter "
+	     << std::endl;
+	return EXIT_FAILURE;
+      } else if (!need_db && cct->_conf->bluestore_block_wal_size == 0) {
+	cerr << "Might need WAL size specification, "
+		"please set Ceph bluestore-block-wal-size config parameter "
+	     << std::endl;
+	return EXIT_FAILURE;
+      }
+    }
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.add_new_bluefs_device(
+      need_db ? BlueFS::BDEV_NEWDB : BlueFS::BDEV_NEWWAL,
+      target_path);
+    if (r == 0) {
+      cout << (need_db ? "DB" : "WAL") << " device added " << target_path
+	   << std::endl;
+    } else {
+      cerr << "failed to add " << (need_db ? "DB" : "WAL") << " device:"
+	   << cpp_strerror(r)
+	   << std::endl;
+    }
+    return r;
+  } else if (action == "bluefs-bdev-migrate") {
+    map<string, int> cur_devs_map;
+    set<int> src_dev_ids;
+    map<string, int> src_devs;
+
+    parse_devices(cct.get(), devs, &cur_devs_map, nullptr, nullptr);
+    for (auto& s :  devs_source) {
+      auto i = cur_devs_map.find(s);
+      if (i != cur_devs_map.end()) {
+        if (s == dev_target) {
+	  cerr << "Device " << dev_target
+	       << " is present in both source and target lists, omitted."
+	       << std::endl;
+        } else {
+	  src_devs.emplace(*i);
+	  src_dev_ids.emplace(i->second);
+	}
+      } else {
+	cerr << "can't migrate " << s << ", not a valid bluefs volume "
+	      << std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+
+    auto i = cur_devs_map.find(dev_target);
+
+    if (i != cur_devs_map.end()) {
+      // Migrate to an existing BlueFS volume
+
+      auto dev_target_id = i->second;
+      if (dev_target_id == BlueFS::BDEV_WAL) {
+	// currently we're unable to migrate to WAL device since there is no space
+	// reserved for superblock
+	cerr << "Migrate to WAL device isn't supported." << std::endl;
+	exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+      int r = bluestore.migrate_to_existing_bluefs_device(
+	src_dev_ids,
+	dev_target_id);
+      if (r == 0) {
+	for(auto src : src_devs) {
+	  if (src.second != BlueFS::BDEV_SLOW) {
+	    cout << " device removed:" << src.second << " " << src.first
+		 << std::endl;
+	  }
+	}
+      } else {
+        bool need_db = dev_target_id == BlueFS::BDEV_DB;
+	cerr << "failed to migrate to existing BlueFS device: "
+	     << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_WAL)
+	     << " " << dev_target
+	     << cpp_strerror(r)
+	     << std::endl;
+      }
+      return r;
+    } else {
+      // Migrate to a new BlueFS volume
+      // via creating either DB or WAL volume
+      char target_path[PATH_MAX] = "";
+      int dev_target_id;
+      if (src_dev_ids.count(BlueFS::BDEV_DB)) {
+	// if we have DB device in the source list - we create DB device
+	// (and may be remove WAL).
+	dev_target_id = BlueFS::BDEV_NEWDB;
+      } else if (src_dev_ids.count(BlueFS::BDEV_WAL)) {
+	dev_target_id = BlueFS::BDEV_NEWWAL;
+      } else {
+        cerr << "Unable to migrate Slow volume to new location, "
+	        "please allocate new DB or WAL with "
+		"--bluefs-bdev-new-db(wal) command"
+	     << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      if(!dev_target.empty() &&
+	        realpath(dev_target.c_str(), target_path) == nullptr) {
+	cerr << "failed to retrieve absolute path for " << dev_target
+	     << ": " << cpp_strerror(errno)
+	     << std::endl;
+	exit(EXIT_FAILURE);
+      }
+
+      BlueStore bluestore(cct.get(), path);
+
+      bool need_db = dev_target_id == BlueFS::BDEV_NEWDB;
+      int r = bluestore.migrate_to_new_bluefs_device(
+	src_dev_ids,
+	dev_target_id,
+	target_path);
+      if (r == 0) {
+	for(auto src : src_devs) {
+	  if (src.second != BlueFS::BDEV_SLOW) {
+	    cout << " device removed:" << src.second << " " << src.first
+		 << std::endl;
+	  }
+	}
+	cout << " device added: "
+	     << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+	     << " " << target_path
+	     << std::endl;
+      } else {
+	cerr << "failed to migrate to new BlueFS device: "
+	     << (need_db ? BlueFS::BDEV_DB : BlueFS::BDEV_DB)
+	     << " " << target_path
+	     << cpp_strerror(r)
+	     << std::endl;
+      }
+      return r;
+    }
+  } else  if (action == "free-dump" || action == "free-score" || action == "fragmentation") {
+    AdminSocket *admin_socket = g_ceph_context->get_admin_socket();
+    ceph_assert(admin_socket);
+    std::string action_name = action == "free-dump" ? "dump" :
+                              action == "free-score" ? "score" : "fragmentation";
+    validate_path(cct.get(), path, false);
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.cold_open();
+    if (r < 0) {
+      cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    for (auto alloc_name : allocs_name) {
+      ceph::bufferlist in, out;
+      ostringstream err;
+      int r = admin_socket->execute_command(
+	{"{\"prefix\": \"bluestore allocator " + action_name + " " + alloc_name + "\"}"},
+	in, err, &out);
+      if (r != 0) {
+        cerr << "failure querying '" << alloc_name << "'" << std::endl;
+      } else {
+        cout << alloc_name << ":" << std::endl;
+        cout << std::string(out.c_str(),out.length()) << std::endl;
+      }
+    }
+
+    bluestore.cold_close();
+  } else  if (action == "bluefs-stats") {
+    AdminSocket* admin_socket = g_ceph_context->get_admin_socket();
+    ceph_assert(admin_socket);
+    validate_path(cct.get(), path, false);
+
+    // make sure we can adjust any config settings
+    g_conf()._clear_safe_to_start_threads();
+    g_conf().set_val_or_die("bluestore_volume_selection_policy",
+                            "use_some_extra_enforced");
+    BlueStore bluestore(cct.get(), path);
+    int r = bluestore.cold_open();
+    if (r < 0) {
+      cerr << "error from cold_open: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+
+    ceph::bufferlist in, out;
+    ostringstream err;
+    r = admin_socket->execute_command(
+      { "{\"prefix\": \"bluefs stats\"}" },
+      in, err, &out);
+    if (r != 0) {
+      cerr << "failure querying bluefs stats: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    cout << std::string(out.c_str(), out.length()) << std::endl;
+     bluestore.cold_close();
+  } else if (action == "reshard") {
+    auto get_ctrl = [&](size_t& val) {
+      if (!resharding_ctrl.empty()) {
+	size_t pos;
+	std::string token;
+	pos = resharding_ctrl.find('/');
+	token = resharding_ctrl.substr(0, pos);
+	if (pos != std::string::npos)
+	  resharding_ctrl.erase(0, pos + 1);
+	else
+	  resharding_ctrl.erase();
+	char* endptr;
+	val = strtoll(token.c_str(), &endptr, 0);
+	if (*endptr != '\0') {
+	  cerr << "invalid --resharding-ctrl. '" << token << "' is not a number" << std::endl;
+	  exit(EXIT_FAILURE);
+	}
+      }
+    };
+    BlueStore bluestore(cct.get(), path);
+    KeyValueDB *db_ptr;
+    RocksDBStore::resharding_ctrl ctrl;
+    if (!resharding_ctrl.empty()) {
+      get_ctrl(ctrl.bytes_per_iterator);
+      get_ctrl(ctrl.keys_per_iterator);
+      get_ctrl(ctrl.bytes_per_batch);
+      get_ctrl(ctrl.keys_per_batch);
+      if (!resharding_ctrl.empty()) {
+	cerr << "extra chars in --resharding-ctrl" << std::endl;
+	exit(EXIT_FAILURE);
+      }
+    }
+    int r = bluestore.open_db_environment(&db_ptr, true);
+    if (r < 0) {
+      cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    ceph_assert(db_ptr);
+    RocksDBStore* rocks_db = dynamic_cast<RocksDBStore*>(db_ptr);
+    ceph_assert(rocks_db);
+    r = rocks_db->reshard(new_sharding, &ctrl);
+    if (r < 0) {
+      cerr << "error resharding: " << cpp_strerror(r) << std::endl;
+    } else {
+      cout << "reshard success" << std::endl;
+    }
+    bluestore.close_db_environment();
+  } else if (action == "show-sharding") {
+    BlueStore bluestore(cct.get(), path);
+    KeyValueDB *db_ptr;
+    int r = bluestore.open_db_environment(&db_ptr, false);
+    if (r < 0) {
+      cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    ceph_assert(db_ptr);
+    RocksDBStore* rocks_db = dynamic_cast<RocksDBStore*>(db_ptr);
+    ceph_assert(rocks_db);
+    std::string sharding;
+    bool res = rocks_db->get_sharding(sharding);
+    bluestore.close_db_environment();
+    if (!res) {
+      cerr << "failed to retrieve sharding def" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    cout << sharding << std::endl;
+  } else {
+    cerr << "unrecognized action " << action << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc
new file mode 100644
index 000000000..904b6fbd3
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.cc
@@ -0,0 +1,1279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "bluestore_types.h"
+#include "common/Formatter.h"
+#include "common/Checksummer.h"
+#include "include/stringify.h"
+
+using std::list;
+using std::map;
+using std::make_pair;
+using std::ostream;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+
+// bluestore_bdev_label_t
+
+void bluestore_bdev_label_t::encode(bufferlist& bl) const
+{
+  // be slightly friendly to someone who looks at the device: plain-text preamble
+  bl.append("bluestore block device\n");
+  bl.append(stringify(osd_uuid));
+  bl.append("\n");
+  ENCODE_START(2, 1, bl);  // field order below must match decode()
+  encode(osd_uuid, bl);
+  encode(size, bl);
+  encode(btime, bl);
+  encode(description, bl);
+  encode(meta, bl);  // decode() reads this only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void bluestore_bdev_label_t::decode(bufferlist::const_iterator& p)
+{
+  p += 60u; // skip encode()'s text preamble; presumably 23-byte banner + 36-byte uuid + '\n' - confirm
+  DECODE_START(2, p);
+  decode(osd_uuid, p);  // same field order as encode()
+  decode(size, p);
+  decode(btime, p);
+  decode(description, p);
+  if (struct_v >= 2) {  // meta is only decoded for struct_v 2 and later
+    decode(meta, p);
+  }
+  DECODE_FINISH(p);
+}
+
+void bluestore_bdev_label_t::dump(Formatter *f) const
+{
+  f->dump_stream("osd_uuid") << osd_uuid;
+  f->dump_unsigned("size", size);
+  f->dump_stream("btime") << btime;
+  f->dump_string("description", description);
+  for (const auto& [key, value] : meta) {
+    f->dump_string(key.c_str(), value);  // each meta entry becomes its own field
+  }
+}
+
+void bluestore_bdev_label_t::generate_test_instances(
+  list<bluestore_bdev_label_t*>& o)
+{
+  o.push_back(new bluestore_bdev_label_t);  // all-defaults instance
+  auto* filled = o.emplace_back(new bluestore_bdev_label_t);  // populated one
+  filled->size = 123;
+  filled->btime = utime_t(4, 5);
+  filled->description = "fakey";
+  filled->meta["foo"] = "bar";
+}
+
+ostream& operator<<(ostream& out, const bluestore_bdev_label_t& l)
+{
+  out << "bdev(osd_uuid " << l.osd_uuid;
+  out << ", size 0x" << std::hex << l.size << std::dec;  // size is shown in hex
+  out << ", btime " << l.btime;
+  out << ", desc " << l.description;
+  out << ", " << l.meta.size() << " meta" << ")";  // only the entry count
+  return out;
+}
+
+// cnode_t
+
+void bluestore_cnode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("bits", bits);  // the only field this dump emits
+}
+
+void bluestore_cnode_t::generate_test_instances(list<bluestore_cnode_t*>& o)
+{
+  o.emplace_back(new bluestore_cnode_t());     // default-constructed
+  o.emplace_back(new bluestore_cnode_t(0));    // constructed from 0
+  o.emplace_back(new bluestore_cnode_t(123));  // constructed from 123
+}
+
+ostream& operator<<(ostream& out, const bluestore_cnode_t& l)
+{
+  return out << "cnode(bits " << l.bits << ")";  // renders as "cnode(bits <bits>)"
+}
+
+// bluestore_extent_ref_map_t
+
+void bluestore_extent_ref_map_t::_check() const
+{
+  uint64_t prev_end = 0;   // end offset of the previously visited record
+  unsigned prev_refs = 0;  // ref count of the previously visited record
+  for (const auto& [start, rec] : ref_map) {
+    if (start < prev_end)
+      ceph_abort_msg("overlap");
+    if (start == prev_end && rec.refs == prev_refs)
+      ceph_abort_msg("unmerged");  // adjacent records with equal refs
+    prev_end = start + rec.length;
+    prev_refs = rec.refs;
+  }
+}
+
+void bluestore_extent_ref_map_t::_maybe_merge_left(
+  map<uint64_t,record_t>::iterator& p)
+{
+  if (p == ref_map.begin())
+    return;  // no left neighbour to merge with
+  auto left = p;
+  --left;
+  if (left->second.refs != p->second.refs ||
+      left->first + left->second.length != p->first)
+    return;  // different ref count, or not directly adjacent
+  left->second.length += p->second.length;  // absorb *p into its neighbour
+  ref_map.erase(p);
+  p = left;  // caller's iterator now refers to the merged record
+}
+
+void bluestore_extent_ref_map_t::get(uint64_t offset, uint32_t length)
+{
+  auto p = ref_map.lower_bound(offset);  // first record starting at or after offset
+  if (p != ref_map.begin()) {
+    --p;  // the preceding record may still overlap offset
+    if (p->first + p->second.length <= offset) {
+      ++p;  // it ends at or before offset, so it is not involved
+    }
+  }
+  while (length > 0) {  // each pass consumes a prefix of the remaining range
+    if (p == ref_map.end()) {
+      // nothing after offset; add the whole thing.
+      p = ref_map.insert(
+	map<uint64_t,record_t>::value_type(offset, record_t(length, 1))).first;
+      break;
+    }
+    if (p->first > offset) {
+      // gap before the next record: cover it with a new record holding one ref
+      uint64_t newlen = std::min<uint64_t>(p->first - offset, length);
+      p = ref_map.insert(
+	map<uint64_t,record_t>::value_type(offset,
+					   record_t(newlen, 1))).first;
+      offset += newlen;
+      length -= newlen;
+      _maybe_merge_left(p);
+      ++p;
+      continue;
+    }
+    if (p->first < offset) {
+      // split off the portion before offset
+      ceph_assert(p->first + p->second.length > offset);
+      uint64_t left = p->first + p->second.length - offset;
+      p->second.length = offset - p->first;
+      p = ref_map.insert(map<uint64_t,record_t>::value_type(
+			   offset, record_t(left, p->second.refs))).first;
+      // continue below
+    }
+    ceph_assert(p->first == offset);
+    if (length < p->second.length) {  // range ends inside this record
+      ref_map.insert(make_pair(offset + length,
+			       record_t(p->second.length - length,
+					p->second.refs)));  // tail keeps the old ref count
+      p->second.length = length;
+      ++p->second.refs;
+      break;
+    }
+    ++p->second.refs;  // the whole record lies inside the range
+    offset += p->second.length;
+    length -= p->second.length;
+    _maybe_merge_left(p);
+    ++p;
+  }
+  if (p != ref_map.end())
+    _maybe_merge_left(p);
+  //_check();
+}
+
+void bluestore_extent_ref_map_t::put(
+  uint64_t offset, uint32_t length,
+  PExtentVector *release,
+  bool *maybe_unshared)
+{
+  //NB: existing entries in 'release' container must be preserved!
+  bool unshared = true;  // cleared as soon as a ref count other than 1 is seen
+  auto p = ref_map.lower_bound(offset);
+  if (p == ref_map.end() || p->first > offset) {
+    if (p == ref_map.begin()) {
+      ceph_abort_msg("put on missing extent (nothing before)");
+    }
+    --p;  // the record covering offset, if any, starts before it
+    if (p->first + p->second.length <= offset) {
+      ceph_abort_msg("put on missing extent (gap)");
+    }
+  }
+  if (p->first < offset) {  // split so that a record starts exactly at offset
+    uint64_t left = p->first + p->second.length - offset;
+    p->second.length = offset - p->first;
+    if (p->second.refs != 1) {
+      unshared = false;
+    }
+    p = ref_map.insert(map<uint64_t,record_t>::value_type(
+			 offset, record_t(left, p->second.refs))).first;
+  }
+  while (length > 0) {
+    ceph_assert(p->first == offset);
+    if (length < p->second.length) {  // range ends inside this record
+      if (p->second.refs != 1) {
+	unshared = false;
+      }
+      ref_map.insert(make_pair(offset + length,
+			       record_t(p->second.length - length,
+					p->second.refs)));  // tail keeps its ref count
+      if (p->second.refs > 1) {
+	p->second.length = length;
+	--p->second.refs;
+	if (p->second.refs != 1) {
+	  unshared = false;
+	}
+	_maybe_merge_left(p);
+      } else {
+	if (release)  // last reference dropped: report the freed head part
+	  release->push_back(bluestore_pextent_t(p->first, length));
+	ref_map.erase(p);
+      }
+      goto out;  // skips the trailing merge below
+    }
+    offset += p->second.length;  // the whole record lies inside the range
+    length -= p->second.length;
+    if (p->second.refs > 1) {
+      --p->second.refs;
+      if (p->second.refs != 1) {
+	unshared = false;
+      }
+      _maybe_merge_left(p);
+      ++p;
+    } else {
+      if (release)  // last reference dropped: report the whole record
+	release->push_back(bluestore_pextent_t(p->first, p->second.length));
+      ref_map.erase(p++);  // advance before the erased iterator is invalidated
+    }
+  }
+  if (p != ref_map.end())
+    _maybe_merge_left(p);
+  //_check();
+out:
+  if (maybe_unshared) {  // optional out-parameter
+    if (unshared) {
+      // we haven't seen a ref != 1 yet; check the whole map.
+      for (auto& p : ref_map) {
+	if (p.second.refs != 1) {
+	  unshared = false;
+	  break;
+	}
+      }
+    }
+    *maybe_unshared = unshared;
+  }
+}
+
+bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const
+{
+  auto it = ref_map.lower_bound(offset);
+  if (it == ref_map.end() || it->first > offset) {
+    // no record starts exactly at offset, so the previous one must cover it
+    if (it == ref_map.begin())
+      return false;
+    --it;
+    if (it->first + it->second.length <= offset)
+      return false;
+  }
+  // walk consecutive records until the range is covered or a hole shows up
+  for (; length > 0; ++it) {
+    if (it == ref_map.end())
+      return false;
+    if (it->first > offset)
+      return false;
+    const uint64_t rec_end = it->first + it->second.length;
+    if (rec_end >= offset + length)
+      return true;
+    const uint64_t covered = rec_end - offset;
+    offset += covered;
+    length -= covered;
+  }
+  return true;
+}
+
+bool bluestore_extent_ref_map_t::intersects(
+  uint64_t offset,
+  uint32_t length) const
+{
+  // candidate: the first record starting at or after offset ...
+  auto cand = ref_map.lower_bound(offset);
+  if (cand != ref_map.begin()) {
+    // ... unless the record just before it still reaches past offset
+    auto before = cand;
+    --before;
+    if (before->first + before->second.length > offset)
+      cand = before;
+  }
+  // intersects iff the candidate exists and starts before the range ends
+  return cand != ref_map.end() &&
+         cand->first < offset + length;
+}
+
+void bluestore_extent_ref_map_t::dump(Formatter *f) const
+{
+  f->open_array_section("ref_map");
+  for (const auto& [offset, rec] : ref_map) {
+    f->open_object_section("ref");
+    f->dump_unsigned("offset", offset);
+    f->dump_unsigned("length", rec.length);
+    f->dump_unsigned("refs", rec.refs);
+    f->close_section();  // "ref"
+  }
+  f->close_section();  // "ref_map"
+}
+
+void bluestore_extent_ref_map_t::generate_test_instances(
+  list<bluestore_extent_ref_map_t*>& o)
+{
+  o.push_back(new bluestore_extent_ref_map_t);  // empty map
+  auto* m = o.emplace_back(new bluestore_extent_ref_map_t);  // overlapping gets
+  m->get(10, 10);
+  m->get(18, 22);
+  m->get(20, 20);
+  m->get(10, 25);
+  m->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_extent_ref_map_t& m)
+{
+  out << "ref_map(";
+  bool first = true;  // no comma before the first record
+  for (const auto& [start, rec] : m.ref_map) {
+    if (!first) out << ",";
+    first = false;
+    out << std::hex << "0x" << start << "~" << rec.length << std::dec
+        << "=" << rec.refs;  // offset and length in hex, refs in decimal
+  }
+  out << ")";
+  return out;
+}
+
+// bluestore_blob_use_tracker_t
+bluestore_blob_use_tracker_t::bluestore_blob_use_tracker_t(
+  const bluestore_blob_use_tracker_t& tracker)
+ : au_size{tracker.au_size},
+   num_au(0),
+   alloc_au(0),
+   bytes_per_au{nullptr}
+{
+  if (tracker.num_au > 0) {  // source keeps a per-AU array: clone it
+    allocate(tracker.num_au);
+    std::copy_n(tracker.bytes_per_au, num_au, bytes_per_au);
+    return;
+  }
+  total_bytes = tracker.total_bytes;  // otherwise copy the single counter
+}
+
+bluestore_blob_use_tracker_t&
+bluestore_blob_use_tracker_t::operator=(const bluestore_blob_use_tracker_t& rhs)
+{
+  if (this != &rhs) {
+    // reset via clear(), then rebuild in the same representation as rhs
+    clear();
+    au_size = rhs.au_size;
+    if (rhs.num_au > 0) {
+      allocate(rhs.num_au);
+      std::copy_n(rhs.bytes_per_au, num_au, bytes_per_au);
+    } else {
+      total_bytes = rhs.total_bytes;
+    }
+  }
+  return *this;
+}
+
+void bluestore_blob_use_tracker_t::allocate(uint32_t au_count)
+{
+  // only valid on a tracker that does not hold a counter array yet
+  ceph_assert(au_count != 0);
+  ceph_assert(num_au == 0);
+  ceph_assert(alloc_au == 0);
+  alloc_au = au_count;
+  num_au = alloc_au;
+  bytes_per_au = new uint32_t[alloc_au];
+  // account for the new array in the bluestore_cache_other mempool
+  auto&& pool = mempool::get_pool(
+    mempool::pool_index_t(mempool::mempool_bluestore_cache_other));
+  pool.adjust_count(alloc_au, sizeof(uint32_t) * alloc_au);
+  std::fill_n(bytes_per_au, num_au, 0u);  // every counter starts at zero
+}
+
+void bluestore_blob_use_tracker_t::release(uint32_t au_count, uint32_t* ptr) {
+  if (!au_count)
+    return;  // zero count: neither free ptr nor touch the mempool stats
+  delete[] ptr;
+  auto&& pool = mempool::get_pool(
+    mempool::pool_index_t(mempool::mempool_bluestore_cache_other));
+  pool.adjust_count(-(int32_t)au_count, -(int32_t)(sizeof(uint32_t) * au_count));
+}
+
+void bluestore_blob_use_tracker_t::init(
+  uint32_t full_length, uint32_t _au_size) {
+  ceph_assert(!au_size || is_empty());
+  ceph_assert(_au_size > 0);
+  ceph_assert(full_length > 0);
+  clear();
+  au_size = _au_size;
+  // allocation units needed to cover full_length, rounded up via round_up_to()
+  const uint32_t au_needed = round_up_to(full_length, _au_size) / _au_size;
+  if (au_needed > 1)
+    allocate(au_needed);  // with a single AU no counter array is allocated
+}
+
+void bluestore_blob_use_tracker_t::get(
+  uint32_t offset, uint32_t length)
+{
+  ceph_assert(au_size);
+  if (!num_au) {
+    // no per-AU array: a single running total is kept
+    total_bytes += length;
+    return;
+  }
+  // per-AU array: walk the range one allocation unit at a time
+  const auto end = offset + length;
+  while (offset < end) {
+    const auto to_au_end = au_size - offset % au_size;  // bytes left in this AU
+    bytes_per_au[offset / au_size] += std::min(to_au_end, end - offset);
+    offset += to_au_end;  // jump to the start of the next AU
+  }
+}
+
+bool bluestore_blob_use_tracker_t::put(
+  uint32_t offset, uint32_t length,
+  PExtentVector *release_units)
+{
+  ceph_assert(au_size);
+  if (release_units) {
+    release_units->clear();  // previous contents of the output vector are dropped
+  }
+  bool maybe_empty = true;  // cleared once a touched AU still holds bytes
+  if (!num_au) {
+    ceph_assert(total_bytes >= length);
+    total_bytes -= length;  // no per-AU array: only the running total changes
+  } else {
+    auto end = offset + length;
+    uint64_t next_offs = 0;  // offset right after the last released AU
+    while (offset < end) {
+      auto phase = offset % au_size;  // position of offset inside its AU
+      size_t pos = offset / au_size;  // index of the AU being updated
+      auto diff = std::min(au_size - phase, end - offset);
+      ceph_assert(diff <= bytes_per_au[pos]);
+      bytes_per_au[pos] -= diff;
+      offset += (phase ? au_size - phase : au_size);  // advance to the next AU boundary
+      if (bytes_per_au[pos] == 0) {
+	if (release_units) {
+          if (release_units->empty() || next_offs != pos * au_size) {
+  	    release_units->emplace_back(pos * au_size, au_size);
+            next_offs = pos * au_size;
+          } else {
+            release_units->back().length += au_size;  // contiguous: extend the last extent
+          }
+          next_offs += au_size;
+	}
+      } else {
+	maybe_empty = false; // micro optimization detecting we aren't empty 
+	                     // even in the affected extent
+      }
+    }
+  }
+  bool empty = maybe_empty ? !is_not_empty() : false;
+  if (empty && release_units) {
+    release_units->clear();  // when empty, no individual units are reported
+  }
+  return empty;
+}
+
+bool bluestore_blob_use_tracker_t::can_split() const
+{
+  return num_au > 0;  // true only while num_au is non-zero
+}
+
+bool bluestore_blob_use_tracker_t::can_split_at(uint32_t blob_offset) const
+{
+  ceph_assert(au_size);
+  const bool au_aligned = (blob_offset % au_size) == 0;
+  return au_aligned && blob_offset < num_au * au_size;  // and inside the tracked span
+}
+
+void bluestore_blob_use_tracker_t::split(
+  uint32_t blob_offset,
+  bluestore_blob_use_tracker_t* r)
+{
+  ceph_assert(au_size);
+  ceph_assert(can_split());
+  ceph_assert(can_split_at(blob_offset));
+  ceph_assert(r->is_empty());
+
+  const uint32_t keep_au = blob_offset / au_size;  // AUs [0, keep_au) stay here
+  r->init((num_au - keep_au) * au_size, au_size);
+  for (uint32_t src = keep_au, dst = 0; src < num_au; ++src, ++dst) {
+    r->get(dst * au_size, bytes_per_au[src]);  // move the counter over to r
+    bytes_per_au[src] = 0;
+  }
+  if (keep_au > 1) {
+    num_au = keep_au;
+  } else if (keep_au == 1) {
+    // one AU left: clear(), then restore au_size and keep its byte count
+    const uint32_t kept_bytes = bytes_per_au[0];
+    const uint32_t kept_au_size = au_size;
+    clear();
+    au_size = kept_au_size;
+    total_bytes = kept_bytes;
+  } else {
+    clear();
+  }
+}
+
+bool bluestore_blob_use_tracker_t::equal(
+  const bluestore_blob_use_tracker_t& other) const
+{
+  const bool mine_has_array = num_au != 0;
+  const bool theirs_has_array = other.num_au != 0;
+  if (!mine_has_array && !theirs_has_array) {
+    // neither side keeps a per-AU array: compare the totals
+    return total_bytes == other.total_bytes && au_size == other.au_size;
+  }
+  if (mine_has_array && theirs_has_array) {
+    // both keep arrays: geometry and every counter must match
+    if (num_au != other.num_au || au_size != other.au_size) {
+      return false;
+    }
+    return std::equal(bytes_per_au, bytes_per_au + num_au, other.bytes_per_au);
+  }
+
+  // mixed: the array's sum must equal the other side's referenced bytes
+  const auto& with_array = mine_has_array ? *this : other;
+  const auto& without_array = mine_has_array ? other : *this;
+  const uint32_t referenced = without_array.get_referenced_bytes();
+  uint32_t summed = 0;
+  for (size_t i = 0; i < with_array.num_au; i++) {
+    summed += with_array.bytes_per_au[i];
+    if (summed > referenced) {
+      return false;  // already too large, stop early
+    }
+  }
+  return summed == referenced;
+}
+
+void bluestore_blob_use_tracker_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("num_au", num_au);
+  f->dump_unsigned("au_size", au_size);
+  if (num_au) {
+    // per-AU array present: one array entry per allocation unit
+    f->open_array_section("bytes_per_au");
+    for (size_t idx = 0; idx < num_au; ++idx)
+      f->dump_unsigned("", bytes_per_au[idx]);
+    f->close_section();
+    return;
+  }
+  f->dump_unsigned("total_bytes", total_bytes);  // otherwise the single total
+}
+
+void bluestore_blob_use_tracker_t::generate_test_instances(
+  list<bluestore_blob_use_tracker_t*>& o)
+{
+  auto* one_au = o.emplace_back(new bluestore_blob_use_tracker_t());
+  one_au->init(16, 16);  // full_length equal to the AU size
+  one_au->get(10, 10);
+  one_au->get(10, 5);
+  auto* multi_au = o.emplace_back(new bluestore_blob_use_tracker_t());
+  multi_au->init(60, 16);  // full_length spanning several AUs
+  multi_au->get(18, 22);
+  multi_au->get(20, 20);
+  multi_au->get(15, 20);
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_use_tracker_t& m)
+{
+  // everything between the parentheses is printed in hex
+  out << "use_tracker(" << std::hex;
+  if (m.num_au) {
+    // per-AU array: "0x<num_au>*0x<au_size> 0x[<b0>,<b1>,...]"
+    out << "0x" << m.num_au << "*0x" << m.au_size << " 0x[";
+    for (size_t i = 0; i < m.num_au; ++i) {
+      if (i > 0) {
+        out << ",";
+      }
+      out << m.bytes_per_au[i];
+    }
+    out << "]";
+  } else {
+    // single total: "0x<au_size> 0x<total_bytes>"
+    out << "0x" << m.au_size << " " << "0x" << m.total_bytes;
+  }
+  out << std::dec << ")";
+  return out;
+}
+
+// bluestore_pextent_t
+
+void bluestore_pextent_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("offset", offset);  // both fields are emitted unconditionally
+  f->dump_unsigned("length", length);
+}
+
+ostream& operator<<(ostream& out, const bluestore_pextent_t& o) {
+  if (!o.is_valid()) {
+    return out << "!~" << std::hex << o.length << std::dec;  // no offset shown
+  }
+  return out << "0x" << std::hex << o.offset << "~" << o.length << std::dec;
+}
+
+void bluestore_pextent_t::generate_test_instances(list<bluestore_pextent_t*>& ls)
+{
+  ls.emplace_back(new bluestore_pextent_t);        // default-constructed
+  ls.emplace_back(new bluestore_pextent_t(1, 2));  // constructed from (1, 2)
+}
+
+// bluestore_blob_t
+
+string bluestore_blob_t::get_flags_string(unsigned flags)
+{
+  // flag bits and their printable names, in output order
+  static const struct {
+    unsigned bit;
+    const char* name;
+  } known_flags[] = {
+    { FLAG_COMPRESSED, "compressed" },
+    { FLAG_CSUM, "csum" },
+    { FLAG_HAS_UNUSED, "has_unused" },
+    { FLAG_SHARED, "shared" },
+  };
+
+  string s;
+  for (const auto& kf : known_flags) {
+    if (!(flags & kf.bit)) {
+      continue;
+    }
+    if (s.length()) {
+      s += '+';  // separator between names
+    }
+    s += kf.name;
+  }
+
+  return s;
+}
+
+size_t bluestore_blob_t::get_csum_value_size() const 
+{
+  return Checksummer::get_csum_value_size(csum_type);  // delegates, keyed on this blob's csum_type
+}
+
+void bluestore_blob_t::dump(Formatter *f) const
+{
+  f->open_array_section("extents");
+  for (const auto& ext : extents)
+    f->dump_object("extent", ext);
+  f->close_section();
+  f->dump_unsigned("logical_length", logical_length);
+  f->dump_unsigned("compressed_length", compressed_length);
+  f->dump_unsigned("flags", flags);
+  f->dump_unsigned("csum_type", csum_type);
+  f->dump_unsigned("csum_chunk_order", csum_chunk_order);
+  // one "csum" entry per item reported by get_csum_count()
+  f->open_array_section("csum_data");
+  const size_t csum_count = get_csum_count();
+  for (unsigned idx = 0; idx < csum_count; ++idx)
+    f->dump_unsigned("csum", get_csum_item(idx));
+  f->close_section();
+  f->dump_unsigned("unused", unused);
+}
+
+void bluestore_blob_t::generate_test_instances(list<bluestore_blob_t*>& ls)
+{
+  ls.push_back(new bluestore_blob_t);     // default-constructed
+  ls.push_back(new bluestore_blob_t(0));  // constructed from 0
+  auto* one_extent = ls.emplace_back(new bluestore_blob_t);
+  one_extent->allocated_test(bluestore_pextent_t(111, 222));
+  auto* rich = ls.emplace_back(new bluestore_blob_t);  // csum, unused, 3 extents
+  rich->init_csum(Checksummer::CSUM_XXHASH32, 16, 65536);
+  rich->csum_data = ceph::buffer::claim_malloc(4, strdup("abcd"));
+  rich->add_unused(0, 3);
+  rich->add_unused(8, 8);
+  rich->allocated_test(bluestore_pextent_t(0x40100000, 0x10000));
+  rich->allocated_test(
+    bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, 0x1000));
+  rich->allocated_test(bluestore_pextent_t(0x40120000, 0x10000));
+}
+
+ostream& operator<<(ostream& out, const bluestore_blob_t& o)
+{
+  out << "blob(" << o.get_extents();
+  if (o.is_compressed()) {
+    // logical length -> compressed payload length, both in hex
+    out << " clen 0x" << std::hex << o.get_logical_length();
+    out << " -> 0x" << o.get_compressed_payload_length() << std::dec;
+  }
+  if (o.flags) {
+    out << " " << o.get_flags_string();
+  }
+  if (o.has_csum()) {
+    const auto chunk_size = 1ull << o.csum_chunk_order;  // printed in hex
+    out << " " << Checksummer::get_csum_type_string(o.csum_type);
+    out << "/0x" << std::hex << chunk_size << std::dec;
+  }
+  if (o.has_unused()) {
+    out << " unused=0x" << std::hex << o.unused << std::dec;
+  }
+  out << ")";
+  return out;
+}
+
+void bluestore_blob_t::calc_csum(uint64_t b_off, const bufferlist& bl)
+{
+  switch (csum_type) {  // types not listed below leave csum_data untouched
+  case Checksummer::CSUM_XXHASH32:
+    Checksummer::calculate<Checksummer::xxhash32>(get_csum_chunk_size(), b_off,
+                                                  bl.length(), bl, &csum_data);
+    break;
+  case Checksummer::CSUM_XXHASH64:
+    Checksummer::calculate<Checksummer::xxhash64>(get_csum_chunk_size(), b_off,
+                                                  bl.length(), bl, &csum_data);
+    break;
+  case Checksummer::CSUM_CRC32C:
+    Checksummer::calculate<Checksummer::crc32c>(get_csum_chunk_size(), b_off,
+                                                bl.length(), bl, &csum_data);
+    break;
+  case Checksummer::CSUM_CRC32C_16:
+    Checksummer::calculate<Checksummer::crc32c_16>(get_csum_chunk_size(), b_off,
+                                                   bl.length(), bl, &csum_data);
+    break;
+  case Checksummer::CSUM_CRC32C_8:
+    Checksummer::calculate<Checksummer::crc32c_8>(get_csum_chunk_size(), b_off,
+                                                  bl.length(), bl, &csum_data);
+    break;
+  }
+}
+
+int bluestore_blob_t::verify_csum(uint64_t b_off, const bufferlist& bl,
+                                  int* b_bad_off, uint64_t *bad_csum) const
+{
+  // -1 is the "no bad offset" marker; a case below replaces it with
+  // whatever Checksummer::verify returns
+  *b_bad_off = -1;
+
+  // dispatch on the blob's checksum algorithm
+  switch (csum_type) {
+  case Checksummer::CSUM_NONE:
+    return 0;  // nothing to verify
+  case Checksummer::CSUM_XXHASH32:
+    *b_bad_off = Checksummer::verify<Checksummer::xxhash32>(
+      get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+    break;
+  case Checksummer::CSUM_XXHASH64:
+    *b_bad_off = Checksummer::verify<Checksummer::xxhash64>(
+      get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+    break;
+  case Checksummer::CSUM_CRC32C:
+    *b_bad_off = Checksummer::verify<Checksummer::crc32c>(
+      get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+    break;
+  case Checksummer::CSUM_CRC32C_16:
+    *b_bad_off = Checksummer::verify<Checksummer::crc32c_16>(
+      get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+    break;
+  case Checksummer::CSUM_CRC32C_8:
+    *b_bad_off = Checksummer::verify<Checksummer::crc32c_8>(
+      get_csum_chunk_size(), b_off, bl.length(), bl, csum_data, bad_csum);
+    break;
+  default:
+    return -EOPNOTSUPP;  // unrecognised checksum type
+  }
+
+  // a non-negative offset from the verifier is reported as a mismatch
+  if (*b_bad_off >= 0) {
+    return -1; // bad checksum
+  }
+  return 0;
+}
+
+/// Record the physical extents `allocs` as backing the blob range
+/// [b_off, b_off + length).
+///
+/// Two cases are handled:
+///  - the blob has no extents yet: the extent list is built from scratch as an
+///    optional INVALID_OFFSET hole of b_off bytes followed by `allocs`; for an
+///    uncompressed blob logical_length becomes the resulting total length
+///    (`length` is not consulted on this path);
+///  - the blob already has extents (uncompressed blobs only): the extents
+///    covering the range are replaced by an optional unallocated head,
+///    `allocs`, and an optional unallocated tail.
+///
+/// Asserts that every pre-existing extent walked while locating the end of the
+/// range is unallocated.
+/// NOTE(review): presumably the lengths in `allocs` sum up to `length`; this
+/// is not checked here.
+void bluestore_blob_t::allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs)
+{
+  if (extents.size() == 0) {
+    // if blob is compressed then logical length to be already configured
+    // otherwise - to be unset.
+    ceph_assert((is_compressed() && logical_length != 0) ||
+      (!is_compressed() && logical_length == 0));
+
+    extents.reserve(allocs.size() + (b_off ? 1 : 0));
+    if (b_off) {
+      // leading unallocated hole in front of the first allocation
+      extents.emplace_back(
+        bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, b_off));
+
+    }
+    uint32_t new_len = b_off;
+    for (auto& a : allocs) {
+      extents.emplace_back(a.offset, a.length);
+      new_len += a.length;
+    }
+    if (!is_compressed()) {
+      logical_length = new_len;
+    }
+  } else {
+    ceph_assert(!is_compressed()); // partial allocations are forbidden when 
+                              // compressed
+    ceph_assert(b_off < logical_length);
+    // locate the extent that contains b_off; cur_offs is its starting
+    // blob offset and pos its index in the vector
+    uint32_t cur_offs = 0;
+    auto start_it = extents.begin();
+    size_t pos = 0;
+    while (true) {
+      ceph_assert(start_it != extents.end());
+      if (cur_offs + start_it->length > b_off) {
+	break;
+      }
+      cur_offs += start_it->length;
+      ++start_it;
+      ++pos;
+    }
+    // bytes of the first affected extent that stay unallocated
+    uint32_t head = b_off - cur_offs;
+    uint32_t end_off = b_off + length;
+    auto end_it = start_it;
+
+    // locate the extent that contains the end of the range
+    while (true) {
+      ceph_assert(end_it != extents.end());
+      ceph_assert(!end_it->is_valid());
+      if (cur_offs + end_it->length >= end_off) {
+	break;
+      }
+      cur_offs += end_it->length;
+      ++end_it;
+    }
+    ceph_assert(cur_offs + end_it->length >= end_off);
+    // bytes of the last affected extent that stay unallocated
+    uint32_t tail = cur_offs + end_it->length - end_off;
+
+    // drop the affected extents and make room for head + allocs + tail,
+    // initially filled with zero-length unallocated placeholders
+    start_it = extents.erase(start_it, end_it + 1);
+    size_t count = allocs.size();
+    count += head ? 1 : 0;
+    count += tail ? 1 : 0;
+    extents.insert(start_it,
+                   count,
+                   bluestore_pextent_t(
+                     bluestore_pextent_t::INVALID_OFFSET, 0));
+   
+    // Workaround to resolve lack of proper iterator return in vector::insert
+    // Looks like some gcc/stl implementations still lack it despite c++11
+    // support claim
+    start_it = extents.begin() + pos;
+
+    // fill the placeholders in order: head hole, allocations, tail hole
+    if (head) {
+      start_it->length = head;
+      ++start_it;
+    }
+    for(auto& e : allocs) {
+      *start_it = e;
+      ++start_it;
+    }
+    if (tail) {
+      start_it->length = tail;
+    } 
+  }
+}
+
+// Helper for cutting ranges out of an extent list: accumulates a new extent
+// vector while merging consecutive unallocated (INVALID_OFFSET) pieces into a
+// single pending run that is emitted lazily.
+struct vecbuilder {
+  PExtentVector v;       ///< extents emitted so far
+  uint64_t invalid = 0;  ///< length of the pending, not yet emitted, invalid run
+
+  /// extend the pending invalid run by `length` bytes
+  void add_invalid(uint64_t length) {
+    invalid += length;
+  }
+  /// emit the pending invalid run (if any) as one INVALID_OFFSET extent
+  void flush() {
+    if (!invalid) {
+      return;
+    }
+    v.emplace_back(
+      bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, invalid));
+    invalid = 0;
+  }
+  /// append an extent; invalid ones only grow the pending run, valid ones
+  /// flush that run first
+  void add(uint64_t offset, uint64_t length) {
+    if (offset != bluestore_pextent_t::INVALID_OFFSET) {
+      flush();
+      v.emplace_back(offset, length);
+      return;
+    }
+    add_invalid(length);
+  }
+};
+
+/// Append `alloc` to the extent list as is; for an uncompressed blob the
+/// logical length grows by the extent length.
+void bluestore_blob_t::allocated_test(const bluestore_pextent_t& alloc)
+{
+  extents.emplace_back(alloc);
+  if (is_compressed()) {
+    // logical length of a compressed blob is not derived from its extents
+    return;
+  }
+  logical_length += alloc.length;
+}
+
+/// Release physical extents from the blob.
+///
+/// @param all      release every allocated extent; the extent list collapses
+///                 to a single unallocated extent spanning the whole length
+/// @param logical  blob-relative ranges (offset/length) to release; consulted
+///                 only when `all` is false.
+///                 NOTE(review): the single forward pass below looks like it
+///                 expects these sorted by offset - confirm against callers.
+/// @param r        receives the released physical extents (appended; adjacent
+///                 pieces are merged with the last entry)
+/// @return true if `all` was set, false otherwise
+bool bluestore_blob_t::release_extents(bool all,
+				       const PExtentVector& logical,
+				       PExtentVector* r)
+{
+  // common case: all of it?
+  if (all) {
+    uint64_t pos = 0;
+    for (auto& e : extents) {
+      if (e.is_valid()) {
+	r->push_back(e);
+      }
+      pos += e.length;
+    }
+    ceph_assert(is_compressed() || get_logical_length() == pos);
+    // keep the blob's span but mark all of it as unallocated
+    extents.resize(1);
+    extents[0].offset = bluestore_pextent_t::INVALID_OFFSET;
+    extents[0].length = pos;
+    return true;
+  }
+  // remove from pextents according to logical release list
+  vecbuilder vb;
+  auto loffs_it = logical.begin();
+  auto lend = logical.end();
+  uint32_t pext_loffs_start = 0; //starting loffset of the current pextent
+  uint32_t pext_loffs = 0; //current loffset
+  auto pext_it = extents.begin();
+  auto pext_end = extents.end();
+  while (pext_it != pext_end) {
+    if (loffs_it == lend ||
+        pext_loffs_start + pext_it->length <= loffs_it->offset) {
+      // nothing (more) to release within the current pextent:
+      // keep whatever part of it has not been consumed yet
+      int delta0 = pext_loffs - pext_loffs_start;
+      ceph_assert(delta0 >= 0);
+      if ((uint32_t)delta0 < pext_it->length) {
+	vb.add(pext_it->offset + delta0, pext_it->length - delta0);
+      }
+      pext_loffs_start += pext_it->length;
+      pext_loffs = pext_loffs_start;
+      ++pext_it;
+    }
+    else {
+      //assert(pext_loffs == pext_loffs_start);
+      // delta0: part of the current pextent already consumed
+      int delta0 = pext_loffs - pext_loffs_start;
+      ceph_assert(delta0 >= 0);
+
+      // delta: part to keep in front of the range being released
+      int delta = loffs_it->offset - pext_loffs;
+      ceph_assert(delta >= 0);
+      if (delta > 0) {
+	vb.add(pext_it->offset + delta0, delta);
+	pext_loffs += delta;
+      }
+
+      // last entry of *r (if any), used to merge adjacent releases
+      PExtentVector::iterator last_r = r->end();
+      if (r->begin() != last_r) {
+	--last_r;
+      }
+      uint32_t to_release = loffs_it->length;
+      // the released range may span several pextents
+      do {
+	uint32_t to_release_part =
+	  std::min(pext_it->length - delta0 - delta, to_release);
+	auto o = pext_it->offset + delta0 + delta;
+	if (last_r != r->end() && last_r->offset + last_r->length == o) {
+	  last_r->length += to_release_part;
+	}
+	else {
+	  last_r = r->emplace(r->end(), o, to_release_part);
+	}
+	to_release -= to_release_part;
+	pext_loffs += to_release_part;
+	if (pext_loffs == pext_loffs_start + pext_it->length) {
+	  // current pextent fully consumed, move on to the next one
+	  pext_loffs_start += pext_it->length;
+	  pext_loffs = pext_loffs_start;
+	  pext_it++;
+	  delta0 = delta = 0;
+	}
+      } while (to_release > 0 && pext_it != pext_end);
+      // the released bytes stay in the blob as unallocated space
+      vb.add_invalid(loffs_it->length - to_release);
+      ++loffs_it;
+    }
+  }
+  vb.flush();
+  extents.swap(vb.v);
+  return false;
+}
+
+/// Split the blob at `blob_offset`: extents (and checksum data, if any)
+/// located at or past that offset go to `rb`, the rest stays in this blob.
+///
+/// An extent straddling the offset is cut in two.  `rb` always inherits
+/// `flags`; the logical lengths of both blobs are updated only when some
+/// extent reaches past `blob_offset`.  With checksums enabled `blob_offset`
+/// must be a multiple of the csum chunk size (asserted).
+void bluestore_blob_t::split(uint32_t blob_offset, bluestore_blob_t& rb)
+{
+  size_t left = blob_offset;  // bytes still to be skipped before the split point
+  uint32_t llen_lb = 0;       // resulting length of this (left) blob
+  uint32_t llen_rb = 0;       // resulting length of rb (right blob)
+  unsigned i = 0;             // number of extents that stay in this blob
+  for (auto p = extents.begin(); p != extents.end(); ++p, ++i) {
+    if (p->length <= left) {
+      // extent lies entirely before the split point
+      left -= p->length;
+      llen_lb += p->length;
+      continue;
+    }
+    if (left) {
+      // extent straddles the split point: its tail goes to rb
+      if (p->is_valid()) {
+	rb.extents.emplace_back(bluestore_pextent_t(p->offset + left,
+	  p->length - left));
+      }
+      else {
+	rb.extents.emplace_back(bluestore_pextent_t(
+	  bluestore_pextent_t::INVALID_OFFSET,
+	  p->length - left));
+      }
+      llen_rb += p->length - left;
+      llen_lb += left;
+      p->length = left;
+      ++i;
+      ++p;
+    }
+    // everything that remains belongs to rb
+    while (p != extents.end()) {
+      llen_rb += p->length;
+      rb.extents.push_back(*p++);
+    }
+    extents.resize(i);
+    logical_length = llen_lb;
+    rb.logical_length = llen_rb;
+    break;
+  }
+  rb.flags = flags;
+
+  if (has_csum()) {
+    rb.csum_type = csum_type;
+    rb.csum_chunk_order = csum_chunk_order;
+    // note: despite its name this holds the csum chunk size, not the order
+    size_t csum_order = get_csum_chunk_size();
+    ceph_assert(blob_offset % csum_order == 0);
+    // byte position within csum_data of the first checksum that goes to rb
+    size_t pos = (blob_offset / csum_order) * get_csum_value_size();
+    // deep copy csum data
+    bufferptr old;
+    old.swap(csum_data);
+    rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos);
+    csum_data = bufferptr(old.c_str(), pos);
+  }
+}
+
+// bluestore_shared_blob_t
+
+// Mempool object factory definition for bluestore_shared_blob_t.
+// NOTE(review): the third argument presumably names the mempool the objects
+// are accounted to - see include/mempool.h.
+MEMPOOL_DEFINE_OBJECT_FACTORY(bluestore_shared_blob_t, bluestore_shared_blob_t,
+	          bluestore_shared_blob);
+
+/// Dump the shared blob id ("sbid") and its extent reference map ("ref_map")
+/// to the formatter.
+void bluestore_shared_blob_t::dump(Formatter *f) const
+{
+  f->dump_int("sbid", sbid);
+  f->dump_object("ref_map", ref_map);
+}
+
+/// Append a heap-allocated sample instance (constructed with 1) to `ls`;
+/// the instance is not freed here.
+void bluestore_shared_blob_t::generate_test_instances(
+  list<bluestore_shared_blob_t*>& ls)
+{
+  ls.push_back(new bluestore_shared_blob_t(1));
+}
+
+/// Print a shared blob as "(sbid 0x<hex id> <ref_map>)".
+ostream& operator<<(ostream& out, const bluestore_shared_blob_t& sb)
+{
+  return out << "(sbid 0x" << std::hex << sb.sbid << std::dec
+	     << " " << sb.ref_map << ")";
+}
+
+// bluestore_onode_t
+
+/// Dump the shard's "offset" and "bytes" fields to the formatter.
+void bluestore_onode_t::shard_info::dump(Formatter *f) const
+{
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("bytes", bytes);
+}
+
+/// Print shard info as "0x<offset>(0x<bytes> bytes)", both values in hex;
+/// the stream is switched back to decimal before the closing parenthesis.
+ostream& operator<<(ostream& out, const bluestore_onode_t::shard_info& si)
+{
+  out << std::hex << "0x" << si.offset;
+  out << "(0x" << si.bytes << " bytes";
+  out << std::dec << ")";
+  return out;
+}
+
+/// Dump the onode's metadata to the formatter: nid, size, attributes (name
+/// and value length only), flags, extent map shards and allocation hints.
+void bluestore_onode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("nid", nid);
+  f->dump_unsigned("size", size);
+  f->open_object_section("attrs");
+  for (const auto& attr : attrs) {
+    f->open_object_section("attr");
+    f->dump_string("name", attr.first.c_str());  // it's not quite std::string
+    f->dump_unsigned("len", attr.second.length());
+    f->close_section();
+  }
+  f->close_section();
+  f->dump_string("flags", get_flags_string());
+  f->open_array_section("extent_map_shards");
+  // the shards are only read here, so iterate by reference instead of
+  // copying each element
+  for (const auto& si : extent_map_shards) {
+    f->dump_object("shard", si);
+  }
+  f->close_section();
+  f->dump_unsigned("expected_object_size", expected_object_size);
+  f->dump_unsigned("expected_write_size", expected_write_size);
+  f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+/// Append a single default-constructed, heap-allocated instance to `o`;
+/// the instance is not freed here.
+void bluestore_onode_t::generate_test_instances(list<bluestore_onode_t*>& o)
+{
+  o.push_back(new bluestore_onode_t());
+  // FIXME: no populated (non-default) instance is generated yet
+}
+
+// bluestore_deferred_op_t
+
+/// Dump the deferred op to the formatter: op code, payload length (the data
+/// itself is not dumped) and the list of extents.
+void bluestore_deferred_op_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("op", (int)op);
+  f->dump_unsigned("data_len", data.length());
+  f->open_array_section("extents");
+  for (auto& e : extents) {
+    f->dump_object("extent", e);
+  }
+  f->close_section();
+}
+
+/// Append two heap-allocated sample instances to `o`: a default-constructed
+/// one and an OP_WRITE with two extents and a small payload.
+void bluestore_deferred_op_t::generate_test_instances(list<bluestore_deferred_op_t*>& o)
+{
+  // default instance
+  o.push_back(new bluestore_deferred_op_t);
+
+  // populated write op
+  o.push_back(new bluestore_deferred_op_t);
+  auto* write_op = o.back();
+  write_op->op = OP_WRITE;
+  write_op->extents.push_back(bluestore_pextent_t(1, 2));
+  write_op->extents.push_back(bluestore_pextent_t(100, 5));
+  write_op->data.append("my data");
+}
+
+/// Dump the deferred transaction to the formatter: sequence number, the
+/// contained ops and the released extents (offset/length pairs).
+void bluestore_deferred_transaction_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("seq", seq);
+
+  f->open_array_section("ops");
+  for (const auto& deferred_op : ops) {
+    f->dump_object("op", deferred_op);
+  }
+  f->close_section();
+
+  f->open_array_section("released extents");
+  // explicit iterator: the interval accessors get_start()/get_len() live on it
+  for (auto it = released.begin(); it != released.end(); ++it) {
+    f->open_object_section("extent");
+    f->dump_unsigned("offset", it.get_start());
+    f->dump_unsigned("length", it.get_len());
+    f->close_section();
+  }
+  f->close_section();
+}
+
+/// Append two heap-allocated sample instances to `o`: a default-constructed
+/// one and one with seq 123 holding a default op plus an OP_WRITE op.
+void bluestore_deferred_transaction_t::generate_test_instances(list<bluestore_deferred_transaction_t*>& o)
+{
+  // default instance
+  o.push_back(new bluestore_deferred_transaction_t());
+
+  // populated instance
+  o.push_back(new bluestore_deferred_transaction_t());
+  auto* txn = o.back();
+  txn->seq = 123;
+  txn->ops.push_back(bluestore_deferred_op_t());
+  txn->ops.push_back(bluestore_deferred_op_t());
+  auto& write_op = txn->ops.back();
+  write_op.op = bluestore_deferred_op_t::OP_WRITE;
+  write_op.extents.push_back(bluestore_pextent_t(1,7));
+  write_op.data.append("foodata");
+}
+
+/// Dump the compression header to the formatter: type, length and, only when
+/// it is set, the compressor message.
+void bluestore_compression_header_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("type", type);
+  f->dump_unsigned("length", length);
+  if (compressor_message) {
+    f->dump_int("compressor_message", *compressor_message);
+  }
+}
+
+/// Append two heap-allocated sample instances to `o`: a default-constructed
+/// one and one constructed with 1 whose length is set to 1234.
+void bluestore_compression_header_t::generate_test_instances(
+  list<bluestore_compression_header_t*>& o)
+{
+  o.push_back(new bluestore_compression_header_t);
+  o.push_back(new bluestore_compression_header_t(1));
+  o.back()->length = 1234;
+}
+
+// Builds the hash function input for (sbid, offset).  Besides the shared blob
+// id and the offset shifted right by au_void_bits, a third "salt" word is
+// added: the low 32 bits of sbid in the upper half plus the bitwise inverse
+// of the low 32 bits of the shifted offset.
+shared_blob_2hash_tracker_t::hash_input_t
+  shared_blob_2hash_tracker_t::build_hash_input(
+    uint64_t sbid,
+    uint64_t offset) const
+{
+  const uint64_t au_index = offset >> au_void_bits;
+  const uint64_t sbid_high = (sbid & 0xffffffff) << 32;
+  hash_input_t res = {
+    sbid,
+    au_index,
+    sbid_high + ~(uint32_t(au_index & 0xffffffff))
+  };
+  return res;
+}
+
+/// Apply `n` to the reference counters selected by (sbid, offset).
+/// The pair is turned into a hash input by build_hash_input() and forwarded
+/// to ref_counter_2hash_tracker_t::inc().
+void shared_blob_2hash_tracker_t::inc(
+  uint64_t sbid,
+  uint64_t offset,
+  int n)
+{
+  auto hash_input = build_hash_input(sbid, offset);
+  ref_counter_2hash_tracker_t::inc(
+    (char*)hash_input.data(),
+    get_hash_input_size(),
+    n);
+}
+
+/// Call inc(sbid, ..., n) once per allocation unit (1 << au_void_bits bytes)
+/// while walking `len` bytes starting at `offset`.
+void shared_blob_2hash_tracker_t::inc_range(
+  uint64_t sbid,
+  uint64_t offset,
+  uint32_t len,
+  int n)
+{
+  const uint32_t alloc_unit = 1 << au_void_bits;
+  // signed counter so that a final partial unit terminates the loop
+  for (int64_t remaining = len; remaining > 0; remaining -= alloc_unit) {
+    // don't care about offset alignment as inc() trims it anyway
+    inc(sbid, offset, n);
+    offset += alloc_unit;
+  }
+}
+
+/// Build the hash inputs for (sbid1, offset1) and (sbid2, offset2) and return
+/// the result of ref_counter_2hash_tracker_t::test_hash_conflict() on them.
+bool shared_blob_2hash_tracker_t::test_hash_conflict(
+  uint64_t sbid1,
+  uint64_t offset1,
+  uint64_t sbid2,
+  uint64_t offset2) const
+{
+  auto hash_input1 = build_hash_input(sbid1, offset1);
+  auto hash_input2 = build_hash_input(sbid2, offset2);
+  return ref_counter_2hash_tracker_t::test_hash_conflict(
+    (char*)hash_input1.data(),
+    (char*)hash_input2.data(),
+    get_hash_input_size());
+}
+
+/// Build the hash input for (sbid, offset) and return the result of
+/// ref_counter_2hash_tracker_t::test_all_zero() on it.
+bool shared_blob_2hash_tracker_t::test_all_zero(
+  uint64_t sbid,
+  uint64_t offset) const
+{
+  auto hash_input = build_hash_input(sbid, offset);
+  return
+    ref_counter_2hash_tracker_t::test_all_zero(
+      (char*)hash_input.data(),
+      get_hash_input_size());
+}
+
+/// Return true if test_all_zero() holds for every allocation unit
+/// (1 << au_void_bits bytes) visited while walking `len` bytes starting at
+/// `offset`; stops at the first unit for which it does not.
+bool shared_blob_2hash_tracker_t::test_all_zero_range(
+  uint64_t sbid,
+  uint64_t offset,
+  uint32_t len) const
+{
+  const uint32_t alloc_unit = 1 << au_void_bits;
+  // signed counter so that a final partial unit terminates the loop
+  for (int64_t remaining = len; remaining > 0; remaining -= alloc_unit) {
+    // don't care about offset alignment: the hash input drops the low
+    // au_void_bits bits of the offset anyway
+    if (!test_all_zero(sbid, offset)) {
+      return false;
+    }
+    offset += alloc_unit;
+  }
+  return true;
+}
diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h
new file mode 100644
index 000000000..4c96e8903
--- /dev/null
+++ b/src/os/bluestore/bluestore_types.h
@@ -0,0 +1,1376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+#define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
+
+#include <bit>
+#include <ostream>
+#include <type_traits>
+#include <vector>
+#include <array>
+#include "include/mempool.h"
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+#include "compressor/Compressor.h"
+#include "common/Checksummer.h"
+#include "include/ceph_hash.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+/// label for block device
+///
+/// Encoded/decoded through the classic encode()/decode() pair declared below
+/// (hooked up by WRITE_CLASS_ENCODER); the implementations live outside this
+/// header.
+struct bluestore_bdev_label_t {
+  uuid_d osd_uuid;     ///< osd uuid
+  uint64_t size = 0;   ///< device size
+  utime_t btime;       ///< birth time
+  std::string description;  ///< device description
+
+  std::map<std::string,std::string> meta; ///< {read,write}_meta() content from ObjectStore
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  /// fill `o` with sample instances
+  static void generate_test_instances(std::list<bluestore_bdev_label_t*>& o);
+};
+WRITE_CLASS_ENCODER(bluestore_bdev_label_t)
+
+/// stream a human-readable form of the label
+std::ostream& operator<<(std::ostream& out, const bluestore_bdev_label_t& l);
+
+/// collection metadata
+///
+/// Serialized with DENC as a versioned structure (DENC_START(1, 1, ...))
+/// holding only `bits`.
+struct bluestore_cnode_t {
+  uint32_t bits;   ///< how many bits of coll pgid are significant
+
+  explicit bluestore_cnode_t(int b=0) : bits(b) {}
+
+  DENC(bluestore_cnode_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.bits, p);
+    DENC_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  /// fill `o` with sample instances
+  static void generate_test_instances(std::list<bluestore_cnode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_cnode_t)
+
+/// stream a human-readable form of the cnode
+std::ostream& operator<<(std::ostream& out, const bluestore_cnode_t& l);
+
+/// Generic (offset, length) pair.  An offset equal to INVALID_OFFSET marks
+/// the interval as invalid.
+template <typename OFFS_TYPE, typename LEN_TYPE>
+struct bluestore_interval_t
+{
+  static const uint64_t INVALID_OFFSET = ~0ull;
+
+  OFFS_TYPE offset = 0;
+  LEN_TYPE length = 0;
+
+  bluestore_interval_t(){}
+  bluestore_interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+
+  /// true unless the offset is the INVALID_OFFSET marker
+  bool is_valid() const {
+    return !(offset == INVALID_OFFSET);
+  }
+  /// offset + length, or INVALID_OFFSET for an invalid interval
+  uint64_t end() const {
+    if (!is_valid()) {
+      return INVALID_OFFSET;
+    }
+    return offset + length;
+  }
+
+  /// intervals are equal when both offset and length match
+  bool operator==(const bluestore_interval_t& other) const {
+    if (offset != other.offset) {
+      return false;
+    }
+    return length == other.length;
+  }
+
+};
+
+/// pextent: physical extent
+///
+/// A bluestore_interval_t with a 64-bit offset and a 32-bit length.  The DENC
+/// form is the offset (denc_lba) followed by the length (denc_varint_lowz),
+/// without a version header.
+struct bluestore_pextent_t : public bluestore_interval_t<uint64_t, uint32_t> 
+{
+  bluestore_pextent_t() {}
+  bluestore_pextent_t(uint64_t o, uint64_t l) : bluestore_interval_t(o, l) {}
+  /// implicit conversion from the base interval type
+  bluestore_pextent_t(const bluestore_interval_t &ext) :
+    bluestore_interval_t(ext.offset, ext.length) {}
+
+  DENC(bluestore_pextent_t, v, p) {
+    denc_lba(v.offset, p);
+    denc_varint_lowz(v.length, p);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  /// fill `ls` with sample instances
+  static void generate_test_instances(std::list<bluestore_pextent_t*>& ls);
+};
+WRITE_CLASS_DENC(bluestore_pextent_t)
+
+/// stream a human-readable form of the extent
+std::ostream& operator<<(std::ostream& out, const bluestore_pextent_t& o);
+
+/// vector of physical extents, allocated from the bluestore_cache_other mempool
+typedef mempool::bluestore_cache_other::vector<bluestore_pextent_t> PExtentVector;
+
+/// DENC support for PExtentVector: a varint element count followed by the
+/// elements themselves.
+template<>
+struct denc_traits<PExtentVector> {
+  static constexpr bool supported = true;
+  static constexpr bool bounded = false;
+  static constexpr bool featured = false;
+  static constexpr bool need_contiguous = true;
+  /// upper bound estimate: room for the count plus the bound of the first
+  /// element multiplied by the number of elements
+  static void bound_encode(const PExtentVector& v, size_t& p) {
+    p += sizeof(uint32_t);
+    const auto size = v.size();
+    if (size) {
+      size_t per = 0;
+      denc(v.front(), per);
+      p +=  per * size;
+    }
+  }
+  static void encode(const PExtentVector& v,
+		     ceph::buffer::list::contiguous_appender& p) {
+    denc_varint(v.size(), p);
+    for (auto& i : v) {
+      denc(i, p);
+    }
+  }
+  /// replaces the content of v with the decoded elements
+  static void decode(PExtentVector& v, ceph::buffer::ptr::const_iterator& p) {
+    unsigned num;
+    denc_varint(num, p);
+    v.clear();
+    v.resize(num);
+    for (unsigned i=0; i<num; ++i) {
+      denc(v[i], p);
+    }
+  }
+};
+
+/// extent_map: a std::map of reference counted extents
+///
+/// ref_map is keyed by extent offset; each record holds the extent length and
+/// its reference count.  On the wire the map is a varint entry count, then
+/// the first key, then for every further entry the key delta relative to the
+/// previous key; each key/delta is followed by its record.
+struct bluestore_extent_ref_map_t {
+  struct record_t {
+    uint32_t length;
+    uint32_t refs;
+    record_t(uint32_t l=0, uint32_t r=0) : length(l), refs(r) {}
+    DENC(bluestore_extent_ref_map_t::record_t, v, p) {
+      denc_varint_lowz(v.length, p);
+      denc_varint(v.refs, p);
+    }
+  };
+
+  typedef mempool::bluestore_cache_other::map<uint64_t,record_t> map_t;
+  map_t ref_map;
+
+  void _check() const;
+  void _maybe_merge_left(map_t::iterator& p);
+
+  void clear() {
+    ref_map.clear();
+  }
+  bool empty() const {
+    return ref_map.empty();
+  }
+
+  void get(uint64_t offset, uint32_t len);
+  void put(uint64_t offset, uint32_t len, PExtentVector *release,
+	   bool *maybe_unshared);
+
+  bool contains(uint64_t offset, uint32_t len) const;
+  bool intersects(uint64_t offset, uint32_t len) const;
+
+  /// size estimate: bound of the count plus, for a non-empty map, the bound
+  /// of one key and the first record multiplied by the number of entries
+  void bound_encode(size_t& p) const {
+    denc_varint((uint32_t)0, p);
+    if (!ref_map.empty()) {
+      size_t elem_size = 0;
+      denc_varint_lowz((uint64_t)0, elem_size);
+      ref_map.begin()->second.bound_encode(elem_size);
+      p += elem_size * ref_map.size();
+    }
+  }
+  void encode(ceph::buffer::list::contiguous_appender& p) const {
+    const uint32_t n = ref_map.size();
+    denc_varint(n, p);
+    if (n) {
+      auto i = ref_map.begin();
+      // the first key is stored as is ...
+      denc_varint_lowz(i->first, p);
+      i->second.encode(p);
+      int64_t pos = i->first;
+      // ... the following ones as deltas to the previous key
+      while (++i != ref_map.end()) {
+	denc_varint_lowz((int64_t)i->first - pos, p);
+	i->second.encode(p);
+	pos = i->first;
+      }
+    }
+  }
+  /// NOTE(review): ref_map is not cleared first, entries are decoded on top
+  /// of the current content - callers presumably decode into an empty map.
+  void decode(ceph::buffer::ptr::const_iterator& p) {
+    uint32_t n;
+    denc_varint(n, p);
+    if (n) {
+      int64_t pos;
+      denc_varint_lowz(pos, p);
+      ref_map[pos].decode(p);
+      // remaining n-1 entries: accumulate the key deltas
+      while (--n) {
+	int64_t delta;
+	denc_varint_lowz(delta, p);
+	pos += delta;
+	ref_map[pos].decode(p);
+      }
+    }
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_extent_ref_map_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_extent_ref_map_t)
+
+
+std::ostream& operator<<(std::ostream& out, const bluestore_extent_ref_map_t& rm);
+/// records are equal when both length and reference count match
+static inline bool operator==(const bluestore_extent_ref_map_t::record_t& l,
+			      const bluestore_extent_ref_map_t::record_t& r) {
+  return l.length == r.length && l.refs == r.refs;
+}
+/// maps are equal when their ref_map containers compare equal
+static inline bool operator==(const bluestore_extent_ref_map_t& l,
+			      const bluestore_extent_ref_map_t& r) {
+  return l.ref_map == r.ref_map;
+}
+/// negation of operator== for bluestore_extent_ref_map_t
+static inline bool operator!=(const bluestore_extent_ref_map_t& l,
+			      const bluestore_extent_ref_map_t& r) {
+  return !(l == r);
+}
+
+/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
+///
+/// Two representations share the union below: with num_au == 0 a single
+/// counter (total_bytes) is kept, otherwise bytes_per_au points to an array
+/// of num_au per-unit counters.
+struct bluestore_blob_use_tracker_t {
+  // N.B.: There is no need to minimize au_size/num_au
+  //   as much as possible (e.g. have just a single byte for au_size) since:
+  //   1) Struct isn't packed hence it's padded. And even if it's packed see 2)
+  //   2) Mem manager has its own granularity, most probably >= 8 bytes
+  //
+  uint32_t au_size;  // Allocation (=tracking) unit size,
+                     // == 0 if uninitialized
+  uint32_t num_au;   // Amount of allocation units tracked
+                     // == 0 if single unit or the whole blob is tracked
+  uint32_t alloc_au; // Amount of allocation units allocated
+                       
+  union {
+    uint32_t* bytes_per_au;
+    uint32_t total_bytes;
+  };
+  
+  bluestore_blob_use_tracker_t()
+    : au_size(0), num_au(0), alloc_au(0), bytes_per_au(nullptr) {
+  }
+  bluestore_blob_use_tracker_t(const bluestore_blob_use_tracker_t& tracker);
+  bluestore_blob_use_tracker_t& operator=(const bluestore_blob_use_tracker_t& rhs);
+  ~bluestore_blob_use_tracker_t() {
+    clear();
+  }
+
+  /// hand the counters to release() and return to the uninitialized state
+  void clear() {
+    release(alloc_au, bytes_per_au);
+    num_au = 0;
+    alloc_au = 0;
+    bytes_per_au = nullptr;
+    au_size = 0;
+  }
+
+  /// sum of all tracked bytes
+  uint32_t get_referenced_bytes() const {
+    uint32_t total = 0;
+    if (!num_au) {
+      total = total_bytes;
+    } else {
+      for (size_t i = 0; i < num_au; ++i) {
+	total += bytes_per_au[i];
+      }
+    }
+    return total;
+  }
+  /// true if at least one counter is non-zero
+  bool is_not_empty() const {
+    if (!num_au) {
+      return total_bytes != 0;
+    } else {
+      for (size_t i = 0; i < num_au; ++i) {
+	if (bytes_per_au[i]) {
+	  return true;
+	}
+      }
+    }
+    return false;
+  }
+  bool is_empty() const {
+    return !is_not_empty();
+  }
+  /// shrink the tracked range to new_len (rounded up to au_size); only
+  /// affects the per-unit representation
+  void prune_tail(uint32_t new_len) {
+    if (num_au) {
+      new_len = round_up_to(new_len, au_size);
+      uint32_t _num_au = new_len / au_size;
+      ceph_assert(_num_au <= num_au);
+      if (_num_au) {
+        num_au = _num_au; // bytes_per_au array is left unmodified
+      } else {
+        clear();
+      }
+    }
+  }
+  /// grow the tracked range to new_len; the existing counters are preserved
+  /// and the added units start at zero
+  void add_tail(uint32_t new_len, uint32_t _au_size) {
+    auto full_size = au_size * (num_au ? num_au : 1);
+    ceph_assert(new_len >= full_size);
+    if (new_len == full_size) {
+      return;
+    }
+    if (!num_au) {
+      // single counter -> per-unit array; the old total lands in unit 0
+      uint32_t old_total = total_bytes;
+      total_bytes = 0;
+      init(new_len, _au_size);
+      ceph_assert(num_au);
+      bytes_per_au[0] = old_total;
+    } else {
+      ceph_assert(_au_size == au_size);
+      new_len = round_up_to(new_len, au_size);
+      uint32_t _num_au = new_len / au_size;
+      ceph_assert(_num_au >= num_au);
+      if (_num_au > num_au) {
+	auto old_bytes = bytes_per_au;
+	auto old_num_au = num_au;
+	auto old_alloc_au = alloc_au;
+	alloc_au = num_au = 0; // to bypass an assertion in allocate()
+	bytes_per_au = nullptr;
+	allocate(_num_au);
+	for (size_t i = 0; i < old_num_au; i++) {
+	  bytes_per_au[i] = old_bytes[i];
+	}
+	for (size_t i = old_num_au; i < num_au; i++) {
+	  bytes_per_au[i] = 0;
+	}
+	release(old_alloc_au, old_bytes);
+      }
+    }
+  }
+
+  void init(
+    uint32_t full_length,
+    uint32_t _au_size);
+
+  void get(
+    uint32_t offset,
+    uint32_t len);
+
+  /// put: return true if the blob has no references any more after the call,
+  /// no release_units is filled for the sake of performance.
+  /// return false if there are some references to the blob,
+  /// in this case release_units contains pextents
+  /// (identified by their offsets relative to the blob start)
+  ///  that are not used any more and can be safely deallocated.
+  bool put(
+    uint32_t offset,
+    uint32_t len,
+    PExtentVector *release);
+
+  bool can_split() const;
+  bool can_split_at(uint32_t blob_offset) const;
+  void split(
+    uint32_t blob_offset,
+    bluestore_blob_use_tracker_t* r);
+
+  bool equal(
+    const bluestore_blob_use_tracker_t& other) const;
+    
+  /// size estimate for encode(): au_size, then (if initialized) num_au and
+  /// either the single counter or num_au times the bound of one counter
+  void bound_encode(size_t& p) const {
+    denc_varint(au_size, p);
+    if (au_size) {
+      denc_varint(num_au, p);
+      if (!num_au) {
+        denc_varint(total_bytes, p);
+      } else {
+        size_t elem_size = 0;
+        denc_varint((uint32_t)0, elem_size);
+        p += elem_size * num_au;
+      }
+    }
+  }
+  /// layout: au_size; if non-zero: num_au followed by either total_bytes
+  /// (num_au == 0) or num_au per-unit counters, all as varints
+  void encode(ceph::buffer::list::contiguous_appender& p) const {
+    denc_varint(au_size, p);
+    if (au_size) {
+      denc_varint(num_au, p);
+      if (!num_au) {
+        denc_varint(total_bytes, p);
+      } else {
+        for (size_t i = 0; i < num_au; ++i) {
+          denc_varint(bytes_per_au[i], p);
+        }
+      }
+    }
+  }
+  /// resets the tracker, then reads the layout produced by encode()
+  void decode(ceph::buffer::ptr::const_iterator& p) {
+    clear();
+    denc_varint(au_size, p);
+    if (au_size) {
+      uint32_t _num_au;
+      denc_varint(_num_au, p);
+      if (!_num_au) {
+        num_au = 0;
+        denc_varint(total_bytes, p);
+      } else {
+        allocate(_num_au);
+        for (size_t i = 0; i < _num_au; ++i) {
+	  denc_varint(bytes_per_au[i], p);
+        }
+      }
+    }
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_blob_use_tracker_t*>& o);
+private:
+  void allocate(uint32_t _num_au);
+  void release(uint32_t _num_au, uint32_t* ptr);
+};
+WRITE_CLASS_DENC(bluestore_blob_use_tracker_t)
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_use_tracker_t& rm);
+
+/// blob: a piece of data on disk
+struct bluestore_blob_t {
+private:
+  PExtentVector extents;              ///< raw data position on device
+  uint32_t logical_length = 0;        ///< original length of data stored in the blob
+  uint32_t compressed_length = 0;     ///< compressed length if any
+
+public:
+  enum {
+    LEGACY_FLAG_MUTABLE = 1,  ///< [legacy] blob can be overwritten or split
+    FLAG_COMPRESSED = 2,      ///< blob is compressed
+    FLAG_CSUM = 4,            ///< blob has checksums
+    FLAG_HAS_UNUSED = 8,      ///< blob has unused std::map
+    FLAG_SHARED = 16,         ///< blob is shared; see external SharedBlob
+  };
+  static std::string get_flags_string(unsigned flags);
+
+  uint32_t flags = 0;                 ///< FLAG_*
+
+  typedef uint16_t unused_t;
+  unused_t unused = 0;     ///< portion that has never been written to (bitmap)
+
+  uint8_t csum_type = Checksummer::CSUM_NONE;      ///< CSUM_*
+  uint8_t csum_chunk_order = 0;       ///< csum block size is 1<<block_order bytes
+
+  ceph::buffer::ptr csum_data;                ///< opaque std::vector of csum data
+
+  bluestore_blob_t(uint32_t f = 0) : flags(f) {}
+
+  /// read-only access to the physical extents
+  const PExtentVector& get_extents() const {
+    return extents;
+  }
+  /// mutable access to the physical extents
+  PExtentVector& dirty_extents() {
+    return extents;
+  }
+
+  DENC_HELPERS;
+  /// Upper bound of the encoded size.  All optional parts (lengths, csum
+  /// fields, unused bitmap) are accounted for regardless of the flags.
+  void bound_encode(size_t& p, uint64_t struct_v) const {
+    ceph_assert(struct_v == 1 || struct_v == 2);
+    denc(extents, p);
+    denc_varint(flags, p);
+    denc_varint_lowz(logical_length, p);
+    denc_varint_lowz(compressed_length, p);
+    denc(csum_type, p);
+    denc(csum_chunk_order, p);
+    denc_varint(csum_data.length(), p);
+    p += csum_data.length();
+    p += sizeof(unused_t);
+  }
+
+  /// Encode the blob: extents and flags always; logical/compressed lengths
+  /// only for compressed blobs; csum type, chunk order and raw csum bytes
+  /// only with FLAG_CSUM; the unused bitmap only with FLAG_HAS_UNUSED.
+  void encode(ceph::buffer::list::contiguous_appender& p, uint64_t struct_v) const {
+    ceph_assert(struct_v == 1 || struct_v == 2);
+    denc(extents, p);
+    denc_varint(flags, p);
+    if (is_compressed()) {
+      denc_varint_lowz(logical_length, p);
+      denc_varint_lowz(compressed_length, p);
+    }
+    if (has_csum()) {
+      denc(csum_type, p);
+      denc(csum_chunk_order, p);
+      denc_varint(csum_data.length(), p);
+      // raw copy of the checksum bytes right after their length
+      memcpy(p.get_pos_add(csum_data.length()), csum_data.c_str(),
+	     csum_data.length());
+    }
+    if (has_unused()) {
+      denc(unused, p);
+    }
+  }
+
+  /// Decode the layout written by encode().  For an uncompressed blob the
+  /// logical length is not stored; it is taken from get_ondisk_length().
+  void decode(ceph::buffer::ptr::const_iterator& p, uint64_t struct_v) {
+    ceph_assert(struct_v == 1 || struct_v == 2);
+    denc(extents, p);
+    denc_varint(flags, p);
+    if (is_compressed()) {
+      denc_varint_lowz(logical_length, p);
+      denc_varint_lowz(compressed_length, p);
+    } else {
+      logical_length = get_ondisk_length();
+    }
+    if (has_csum()) {
+      denc(csum_type, p);
+      denc(csum_chunk_order, p);
+      // NOTE(review): the length is decoded into a signed int while encode()
+      // writes csum_data.length(); an unsigned type may be a better fit -
+      // confirm before changing.
+      int len;
+      denc_varint(len, p);
+      csum_data = p.get_ptr(len);
+      csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+    }
+    if (has_unused()) {
+      denc(unused, p);
+    }
+  }
+
+  /// A blob can be split unless it is shared, compressed or carries an
+  /// unused bitmap (splitting unused set is complex).
+  bool can_split() const {
+    const bool blocked =
+      has_flag(FLAG_SHARED) ||
+      has_flag(FLAG_COMPRESSED) ||
+      has_flag(FLAG_HAS_UNUSED);
+    return !blocked;
+  }
+  /// With checksums enabled a split point must be aligned to the csum chunk
+  /// size; without them any offset is acceptable.
+  bool can_split_at(uint32_t blob_offset) const {
+    if (!has_csum()) {
+      return true;
+    }
+    return blob_offset % get_csum_chunk_size() == 0;
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_blob_t*>& ls);
+
+  /// true if any of the bits in f is set in flags
+  bool has_flag(unsigned f) const {
+    return (flags & f) != 0;
+  }
+  /// set the bits in f
+  void set_flag(unsigned f) {
+    flags = flags | f;
+  }
+  /// clear the bits in f
+  void clear_flag(unsigned f) {
+    flags = flags & ~f;
+  }
+  /// textual form of this blob's flags
+  std::string get_flags_string() const {
+    return get_flags_string(flags);
+  }
+
+  /// Mark the blob as compressed.
+  /// @param clen_orig  stored as the logical length
+  /// @param clen       stored as the compressed length
+  void set_compressed(uint64_t clen_orig, uint64_t clen) {
+    set_flag(FLAG_COMPRESSED);
+    logical_length = clen_orig;
+    compressed_length = clen;
+  }
+  /// a blob is mutable when it is neither compressed nor shared
+  bool is_mutable() const {
+    return !is_compressed() && !is_shared();
+  }
+  /// FLAG_COMPRESSED is set
+  bool is_compressed() const {
+    return has_flag(FLAG_COMPRESSED);
+  }
+  /// FLAG_CSUM is set
+  bool has_csum() const {
+    return has_flag(FLAG_CSUM);
+  }
+  /// FLAG_HAS_UNUSED is set
+  bool has_unused() const {
+    return has_flag(FLAG_HAS_UNUSED);
+  }
+  /// FLAG_SHARED is set
+  bool is_shared() const {
+    return has_flag(FLAG_SHARED);
+  }
+
+  /// return chunk (i.e. min readable block) size for the blob: the device
+  /// block size, raised to the csum chunk size when checksums are enabled
+  uint64_t get_chunk_size(uint64_t dev_block_size) const {
+    if (!has_csum()) {
+      return dev_block_size;
+    }
+    return std::max<uint64_t>(dev_block_size, get_csum_chunk_size());
+  }
+  /// checksum chunk size in bytes: 2 to the power of csum_chunk_order
+  uint32_t get_csum_chunk_size() const {
+    return 1 << csum_chunk_order;
+  }
+  /// compressed length for a compressed blob, 0 otherwise
+  uint32_t get_compressed_payload_length() const {
+    if (!is_compressed()) {
+      return 0;
+    }
+    return compressed_length;
+  }
+  /// Translate the blob offset x_off into the matching extent's offset plus
+  /// the position within that extent.
+  /// @param plen  if non-null, receives the number of bytes left in that
+  ///              extent starting from x_off
+  /// Asserts that x_off falls within the extent list.
+  uint64_t calc_offset(uint64_t x_off, uint64_t *plen) const {
+    auto p = extents.begin();
+    ceph_assert(p != extents.end());
+    // skip the extents that lie entirely before x_off
+    while (x_off >= p->length) {
+      x_off -= p->length;
+      ++p;
+      ceph_assert(p != extents.end());
+    }
+    if (plen)
+      *plen = p->length - x_off;
+    return p->offset + x_off;
+  }
+
+  // validate whether or not the status of pextents within the given range
+  // meets the requirement(allocated or unallocated).
+  // Returns true only when every pextent overlapping
+  // [b_off, b_off + b_len) has is_valid() == require_allocated; returns
+  // false when a pextent mismatches or the range runs past the last pextent.
+  // NOTE: a zero-length range whose b_off lands exactly on the start of a
+  // pextent skips the second loop and reaches the abort at the bottom.
+  bool _validate_range(uint64_t b_off, uint64_t b_len,
+                       bool require_allocated) const {
+    auto p = extents.begin();
+    ceph_assert(p != extents.end());
+    // skip the pextents that lie entirely before b_off
+    while (b_off >= p->length) {
+      b_off -= p->length;
+      if (++p == extents.end())
+        return false;
+    }
+    // from here on b_len is measured from the start of the current pextent
+    b_len += b_off;
+    while (b_len) {
+      if (require_allocated != p->is_valid()) {
+        return false;
+      }
+      // the current pextent covers the rest of the range
+      if (p->length >= b_len) {
+        return true;
+      }
+      b_len -= p->length;
+      if (++p == extents.end())
+        return false;
+    }
+    ceph_abort_msg("we should not get here");
+    return false;
+  }
+
+  /// return true if the entire range is allocated
+  /// (mapped to extents on disk)
+  bool is_allocated(uint64_t b_off, uint64_t b_len) const {
+    constexpr bool require_allocated = true;
+    return _validate_range(b_off, b_len, require_allocated);
+  }
+
+  /// return true if the entire range is unallocated
+  /// (not mapped to extents on disk)
+  bool is_unallocated(uint64_t b_off, uint64_t b_len) const {
+    constexpr bool require_allocated = false;
+    return _validate_range(b_off, b_len, require_allocated);
+  }
+
+  /// return true if the logical range has never been used
+  ///
+  /// The blob is divided into sizeof(unused)*8 equal chunks, one bit of
+  /// `unused` per chunk; the range counts as unused only when the bit of
+  /// every chunk it touches is set. Always false without FLAG_HAS_UNUSED.
+  bool is_unused(uint64_t offset, uint64_t length) const {
+    if (!has_unused()) {
+      return false;
+    }
+    ceph_assert(!is_compressed());
+    const uint64_t blob_len = get_logical_length();
+    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+    ceph_assert(offset + length <= blob_len);
+    const uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+    // every chunk overlapping the range: first index rounded down,
+    // one-past-last index rounded up
+    const uint64_t first = offset / chunk_size;
+    const uint64_t last = round_up_to(offset + length, chunk_size) / chunk_size;
+    for (uint64_t bit = first; bit < last; ++bit) {
+      if (!(unused & (1u << bit))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// mark a range that has never been used
+  ///
+  /// Only chunks lying entirely inside [offset, offset + length) get their
+  /// bit set (start index rounded up, end index rounded down).
+  /// FLAG_HAS_UNUSED is raised whenever the two indices differ.
+  void add_unused(uint64_t offset, uint64_t length) {
+    ceph_assert(!is_compressed());
+    const uint64_t blob_len = get_logical_length();
+    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+    ceph_assert(offset + length <= blob_len);
+    const uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+    const uint64_t first = round_up_to(offset, chunk_size) / chunk_size;
+    const uint64_t last = (offset + length) / chunk_size;
+    uint64_t bit = first;
+    while (bit < last) {
+      unused |= (1u << bit);
+      ++bit;
+    }
+    if (first != last) {
+      set_flag(FLAG_HAS_UNUSED);
+    }
+  }
+
+  /// indicate that a range has (now) been used.
+  ///
+  /// Does nothing without FLAG_HAS_UNUSED. Otherwise clears the bit of every
+  /// chunk the range touches and drops FLAG_HAS_UNUSED once no bit is left.
+  void mark_used(uint64_t offset, uint64_t length) {
+    if (!has_unused()) {
+      return;
+    }
+    ceph_assert(!is_compressed());
+    const uint64_t blob_len = get_logical_length();
+    ceph_assert((blob_len % (sizeof(unused)*8)) == 0);
+    ceph_assert(offset + length <= blob_len);
+    const uint64_t chunk_size = blob_len / (sizeof(unused)*8);
+    const uint64_t first = offset / chunk_size;
+    const uint64_t last = round_up_to(offset + length, chunk_size) / chunk_size;
+    for (uint64_t bit = first; bit < last; ++bit) {
+      unused &= ~(1u << bit);
+    }
+    if (unused == 0) {
+      clear_flag(FLAG_HAS_UNUSED);
+    }
+  }
+
+  // map_f_invoke templates intended to mask parameters which are not expected
+  // by the provided callback
+  //
+  // All three overloads take the same arguments (lo, p, o, l, f) and differ
+  // only in which of them are forwarded to f; std::enable_if selects the
+  // overload matching the signature f can be invoked with (returning int).
+
+  // callback form: int f(uint64_t o, uint64_t l) - lo and p are dropped
+  template<class F, typename std::enable_if<std::is_invocable_r_v<
+    int,
+    F,
+    uint64_t,
+    uint64_t>>::type* = nullptr>
+  int map_f_invoke(uint64_t lo,
+    const bluestore_pextent_t& p,
+    uint64_t o,
+    uint64_t l, F&& f) const{
+    return f(o, l);
+  }
+
+  // callback form: int f(uint64_t lo, uint64_t o, uint64_t l) - p is dropped
+  template<class F, typename std::enable_if<std::is_invocable_r_v<
+    int,
+    F,
+    uint64_t,
+    uint64_t,
+    uint64_t>>::type * = nullptr>
+  int map_f_invoke(uint64_t lo,
+    const bluestore_pextent_t& p,
+    uint64_t o,
+    uint64_t l, F&& f) const {
+    return f(lo, o, l);
+  }
+
+  // callback form: int f(const bluestore_pextent_t& p, uint64_t o, uint64_t l)
+  // - lo is dropped
+  template<class F, typename std::enable_if<std::is_invocable_r_v<
+    int,
+    F,
+    const bluestore_pextent_t&,
+    uint64_t,
+    uint64_t>>::type * = nullptr>
+    int map_f_invoke(uint64_t lo,
+      const bluestore_pextent_t& p,
+      uint64_t o,
+      uint64_t l, F&& f) const {
+    return f(p, o, l);
+  }
+
+  /// Invoke f once per extent piece covering the blob-relative range
+  /// [x_off, x_off + x_len).
+  ///
+  /// f is called through map_f_invoke, so it may take any of the argument
+  /// lists accepted there. Iteration stops at the first negative result,
+  /// which is returned; otherwise 0 is returned. Pieces beyond the last
+  /// extent are silently not visited.
+  template<class F>
+  int map(uint64_t x_off, uint64_t x_len, F&& f) const {
+    // blob-relative position of the piece currently being reported
+    uint64_t logical_pos = x_off;
+    auto ext = extents.begin();
+    ceph_assert(ext != extents.end());
+    // locate the extent that contains the starting offset
+    while (x_off >= ext->length) {
+      x_off -= ext->length;
+      ++ext;
+      ceph_assert(ext != extents.end());
+    }
+    for (; x_len > 0 && ext != extents.end(); ++ext) {
+      const uint64_t piece = std::min(ext->length - x_off, x_len);
+      const int rc = map_f_invoke(logical_pos, *ext, ext->offset + x_off, piece, f);
+      if (rc < 0) {
+        return rc;
+      }
+      // only the first piece starts inside its extent
+      x_off = 0;
+      x_len -= piece;
+      logical_pos += piece;
+    }
+    return 0;
+  }
+
+  /// Cut bl into per-extent pieces, starting at blob-relative offset x_off,
+  /// and call f(extent offset + offset inside the extent, piece) for each.
+  /// Asserts that the extents cover the whole buffer.
+  template<class F>
+  void map_bl(uint64_t x_off,
+              ceph::buffer::list& bl,
+              F&& f) const {
+    static_assert(std::is_invocable_v<F, uint64_t, ceph::buffer::list&>);
+
+    auto ext = extents.begin();
+    ceph_assert(ext != extents.end());
+    // locate the extent that contains the starting offset
+    while (x_off >= ext->length) {
+      x_off -= ext->length;
+      ++ext;
+      ceph_assert(ext != extents.end());
+    }
+    ceph::buffer::list::iterator src = bl.begin();
+    for (uint64_t left = bl.length(); left > 0; ++ext) {
+      ceph_assert(ext != extents.end());
+      const uint64_t piece = std::min(ext->length - x_off, left);
+      ceph::buffer::list chunk;
+      src.copy(piece, chunk);
+      f(ext->offset + x_off, chunk);
+      // only the first piece starts inside its extent
+      x_off = 0;
+      left -= piece;
+    }
+  }
+
+  /// Sum of the lengths of all extents (valid or not) in the blob.
+  uint32_t get_ondisk_length() const {
+    uint32_t total = 0;
+    for (auto it = extents.begin(); it != extents.end(); ++it) {
+      total += it->length;
+    }
+    return total;
+  }
+
+  /// Current value of logical_length.
+  uint32_t get_logical_length() const {
+    return logical_length;
+  }
+  size_t get_csum_value_size() const;
+
+  /// Number of checksum values held in csum_data; 0 when the value size
+  /// reported by get_csum_value_size() is 0.
+  size_t get_csum_count() const {
+    const size_t value_size = get_csum_value_size();
+    return value_size ? csum_data.length() / value_size : 0;
+  }
+  /// Return the i-th checksum value stored in csum_data, widened to 64 bits.
+  /// csum_data is read as an array whose element width is
+  /// get_csum_value_size(); aborts when that width is 0 or is not one of
+  /// 1, 2, 4 or 8. The index i is not range-checked here.
+  uint64_t get_csum_item(unsigned i) const {
+    size_t cs = get_csum_value_size();
+    const char *p = csum_data.c_str();
+    switch (cs) {
+    case 0:
+      ceph_abort_msg("no csum data, bad index");
+    case 1:
+      return reinterpret_cast<const uint8_t*>(p)[i];
+    case 2:
+      return reinterpret_cast<const ceph_le16*>(p)[i];
+    case 4:
+      return reinterpret_cast<const ceph_le32*>(p)[i];
+    case 8:
+      return reinterpret_cast<const ceph_le64*>(p)[i];
+    default:
+      ceph_abort_msg("unrecognized csum word size");
+    }
+  }
+  /// Read-only pointer to the i-th checksum value inside csum_data.
+  const char *get_csum_item_ptr(unsigned i) const {
+    const size_t byte_off = get_csum_value_size() * i;
+    return csum_data.c_str() + byte_off;
+  }
+  /// Writable pointer to the i-th checksum value inside csum_data.
+  char *get_csum_item_ptr(unsigned i) {
+    const size_t byte_off = get_csum_value_size() * i;
+    return csum_data.c_str() + byte_off;
+  }
+
+  /// Enable checksums for the blob: records the csum type and chunk order,
+  /// then allocates a zero-filled csum buffer sized for len bytes of data
+  /// (value size * len / chunk size) and assigns it to the
+  /// bluestore_cache_other mempool.
+  void init_csum(unsigned type, unsigned order, unsigned len) {
+    set_flag(FLAG_CSUM);
+    csum_type = type;
+    csum_chunk_order = order;
+    // type and order must be stored before the size helpers are consulted
+    const auto csum_bytes = get_csum_value_size() * len / get_csum_chunk_size();
+    csum_data = ceph::buffer::create(csum_bytes);
+    csum_data.zero();
+    csum_data.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
+  }
+
+  /// calculate csum for the buffer at the given b_off
+  void calc_csum(uint64_t b_off, const ceph::buffer::list& bl);
+
+  /// verify csum: return -EOPNOTSUPP for unsupported checksum type;
+  /// return -1 and valid(nonnegative) b_bad_off for checksum error;
+  /// return 0 if all is well.
+  int verify_csum(uint64_t b_off, const ceph::buffer::list& bl, int* b_bad_off,
+		  uint64_t *bad_csum) const;
+
+  /// The tail can be pruned when there is more than one extent, the last
+  /// extent is invalid and the blob has no unused-range bitmap.
+  bool can_prune_tail() const {
+    // if it's all invalid it's not pruning.
+    if (extents.size() <= 1) {
+      return false;
+    }
+    if (extents.back().is_valid()) {
+      return false;
+    }
+    return !has_unused();
+  }
+  /// Drop the last extent, shrink logical_length by its length and, when
+  /// checksums are enabled, rebuild csum_data from the leading part of the
+  /// old buffer sized for the new logical length.
+  void prune_tail() {
+    logical_length -= extents.back().length;
+    extents.pop_back();
+    if (!has_csum()) {
+      return;
+    }
+    ceph::buffer::ptr old_csum;
+    old_csum.swap(csum_data);
+    const auto kept_bytes =
+      get_logical_length() / get_csum_chunk_size() * get_csum_value_size();
+    csum_data = ceph::buffer::ptr(old_csum.c_str(), kept_bytes);
+  }
+  /// Grow the blob to new_len by appending one invalid (unallocated) extent.
+  /// Requires a mutable blob without an unused bitmap and new_len greater
+  /// than the current logical length. With checksums enabled the csum buffer
+  /// is reallocated for the new length, the old values are copied over and
+  /// the added part is zeroed.
+  void add_tail(uint32_t new_len) {
+    ceph_assert(is_mutable());
+    ceph_assert(!has_unused());
+    ceph_assert(new_len > logical_length);
+    const auto grow_by = new_len - logical_length;
+    extents.emplace_back(
+      bluestore_pextent_t(bluestore_pextent_t::INVALID_OFFSET, grow_by));
+    logical_length = new_len;
+    if (!has_csum()) {
+      return;
+    }
+    ceph::buffer::ptr old_csum;
+    old_csum.swap(csum_data);
+    csum_data = ceph::buffer::create(
+      get_csum_value_size() * logical_length / get_csum_chunk_size());
+    const auto old_len = old_csum.length();
+    csum_data.copy_in(0, old_len, old_csum.c_str());
+    csum_data.zero(old_len, csum_data.length() - old_len);
+  }
+  /// Release granularity for the blob: the whole logical length for a
+  /// compressed blob; otherwise min_alloc_size, or the csum chunk size when
+  /// checksums are enabled and that chunk is at least min_alloc_size.
+  uint32_t get_release_size(uint32_t min_alloc_size) const {
+    if (is_compressed()) {
+      return get_logical_length();
+    }
+    const uint32_t csum_chunk = get_csum_chunk_size();
+    const bool csum_chunk_wins = has_csum() && csum_chunk >= min_alloc_size;
+    return csum_chunk_wins ? csum_chunk : min_alloc_size;
+  }
+
+  void split(uint32_t blob_offset, bluestore_blob_t& rb);
+  void allocated(uint32_t b_off, uint32_t length, const PExtentVector& allocs);
+  void allocated_test(const bluestore_pextent_t& alloc); // intended for UT only
+
+  /// Updates the blob's pextents container and returns the unused pextents
+  /// that are eligible for release.
+  /// all - indicates that the whole blob is to be released.
+  /// logical - specifies the set of logical extents within the blob
+  /// that are to be released.
+  /// Returns true if the blob has no more valid pextents.
+  bool release_extents(
+    bool all,
+    const PExtentVector& logical,
+    PExtentVector* r);
+};
+WRITE_CLASS_DENC_FEATURED(bluestore_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_blob_t& o);
+
+
+/// shared blob state
+struct bluestore_shared_blob_t {
+  MEMPOOL_CLASS_HELPERS();
+  uint64_t sbid;                       ///< shared blob id
+  bluestore_extent_ref_map_t ref_map;  ///< shared blob extents
+
+  /// Construct with the given id and an empty ref_map.
+  bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
+  /// Construct with the given id, taking over _ref_map by move.
+  bluestore_shared_blob_t(uint64_t _sbid,
+			  bluestore_extent_ref_map_t&& _ref_map ) 
+    : sbid(_sbid), ref_map(std::move(_ref_map)) {}
+
+  // Only ref_map is encoded/decoded; sbid is not part of the encoding.
+  DENC(bluestore_shared_blob_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.ref_map, p);
+    DENC_FINISH(p);
+  }
+
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_shared_blob_t*>& ls);
+
+  /// True when ref_map holds no entries.
+  bool empty() const {
+    return ref_map.empty();
+  }
+};
+WRITE_CLASS_DENC(bluestore_shared_blob_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_shared_blob_t& o);
+
+/// onode: per-object metadata
+struct bluestore_onode_t {
+  uint64_t nid = 0;                    ///< numeric id (locally unique)
+  uint64_t size = 0;                   ///< object size
+  // mempool to be assigned to buffer::ptr manually
+  std::map<mempool::bluestore_cache_meta::string, ceph::buffer::ptr> attrs;
+
+  struct shard_info {
+    uint32_t offset = 0;  ///< logical offset for start of shard
+    uint32_t bytes = 0;   ///< encoded bytes
+    // both fields are encoded as varints, offset first
+    DENC(shard_info, v, p) {
+      denc_varint(v.offset, p);
+      denc_varint(v.bytes, p);
+    }
+    void dump(ceph::Formatter *f) const;
+  };
+  std::vector<shard_info> extent_map_shards; ///< extent map shards (if any)
+
+  uint32_t expected_object_size = 0;
+  uint32_t expected_write_size = 0;
+  uint32_t alloc_hint_flags = 0;
+
+  uint8_t flags = 0;                   ///< bitmask of the FLAG_* values below
+
+  std::map<uint32_t, uint64_t> zone_offset_refs;  ///< (zone, offset) refs to this onode
+
+  enum {
+    FLAG_OMAP = 1,         ///< object may have omap data
+    FLAG_PGMETA_OMAP = 2,  ///< omap data is in meta omap prefix
+    FLAG_PERPOOL_OMAP = 4, ///< omap data is in per-pool prefix; per-pool keys
+    FLAG_PERPG_OMAP = 8,   ///< omap data is in per-pg prefix; per-pg keys
+  };
+
+  /// Human-readable list of the set flags, joined with '+'.
+  /// Note: when FLAG_OMAP is clear but another flag is set, the result
+  /// starts with a '+'.
+  std::string get_flags_string() const {
+    std::string s;
+    if (flags & FLAG_OMAP) {
+      s = "omap";
+    }
+    if (flags & FLAG_PGMETA_OMAP) {
+      s += "+pgmeta_omap";
+    }
+    if (flags & FLAG_PERPOOL_OMAP) {
+      s += "+per_pool_omap";
+    }
+    if (flags & FLAG_PERPG_OMAP) {
+      s += "+per_pg_omap";
+    }
+    return s;
+  }
+
+  /// True when any bit of f is set in flags.
+  bool has_flag(unsigned f) const {
+    return flags & f;
+  }
+
+  /// Set every bit of f in flags.
+  void set_flag(unsigned f) {
+    flags |= f;
+  }
+
+  /// Clear every bit of f in flags.
+  void clear_flag(unsigned f) {
+    flags &= ~f;
+  }
+
+  bool has_omap() const {
+    return has_flag(FLAG_OMAP);
+  }
+
+  // static variants: test a raw flags byte rather than this onode's flags
+  static bool is_pgmeta_omap(uint8_t flags) {
+    return flags & FLAG_PGMETA_OMAP;
+  }
+  static bool is_perpool_omap(uint8_t flags) {
+    return flags & FLAG_PERPOOL_OMAP;
+  }
+  static bool is_perpg_omap(uint8_t flags) {
+    return flags & FLAG_PERPG_OMAP;
+  }
+  bool is_pgmeta_omap() const {
+    return has_flag(FLAG_PGMETA_OMAP);
+  }
+  bool is_perpool_omap() const {
+    return has_flag(FLAG_PERPOOL_OMAP);
+  }
+  bool is_perpg_omap() const {
+    return has_flag(FLAG_PERPG_OMAP);
+  }
+
+  /// Set FLAG_OMAP; unless legacy is true, also set the per-pool and
+  /// per-pg omap flags.
+  void set_omap_flags(bool legacy) {
+    set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP)));
+  }
+  /// Set FLAG_OMAP together with FLAG_PGMETA_OMAP.
+  void set_omap_flags_pgmeta() {
+    set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP);
+  }
+
+  /// Clear all four omap-related flags.
+  void clear_omap_flag() {
+    clear_flag(FLAG_OMAP |
+	       FLAG_PGMETA_OMAP |
+	       FLAG_PERPOOL_OMAP |
+	       FLAG_PERPG_OMAP);
+  }
+
+  // Encoding version 2, compatible down to 1. The order of the fields below
+  // defines the encoding; zone_offset_refs is only handled for struct_v >= 2.
+  DENC(bluestore_onode_t, v, p) {
+    DENC_START(2, 1, p);
+    denc_varint(v.nid, p);
+    denc_varint(v.size, p);
+    denc(v.attrs, p);
+    denc(v.flags, p);
+    denc(v.extent_map_shards, p);
+    denc_varint(v.expected_object_size, p);
+    denc_varint(v.expected_write_size, p);
+    denc_varint(v.alloc_hint_flags, p);
+    if (struct_v >= 2) {
+      denc(v.zone_offset_refs, p);
+    }
+    DENC_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_onode_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
+WRITE_CLASS_DENC(bluestore_onode_t)
+
+std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);
+
+/// writeahead-logged op
+struct bluestore_deferred_op_t {
+  typedef enum {
+    OP_WRITE = 1,
+  } type_t;
+  __u8 op = 0;            ///< operation code; OP_WRITE is the only one defined here
+
+  PExtentVector extents;  ///< physical extents associated with the op
+  ceph::buffer::list data; ///< payload carried by the op
+
+  // Encoding version 1: op, extents, data - in that order.
+  DENC(bluestore_deferred_op_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.op, p);
+    denc(v.extents, p);
+    denc(v.data, p);
+    DENC_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_deferred_op_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_op_t)
+
+
+/// writeahead-logged transaction
+struct bluestore_deferred_transaction_t {
+  uint64_t seq = 0;                         ///< sequence number of the transaction
+  std::list<bluestore_deferred_op_t> ops;   ///< deferred ops belonging to it
+  interval_set<uint64_t> released;  ///< allocations to release after tx
+
+  bluestore_deferred_transaction_t() : seq(0) {}
+
+  // Encoding version 1: seq, ops, released - in that order.
+  DENC(bluestore_deferred_transaction_t, v, p) {
+    DENC_START(1, 1, p);
+    denc(v.seq, p);
+    denc(v.ops, p);
+    denc(v.released, p);
+    DENC_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_deferred_transaction_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
+
+/// header describing a compressed payload
+struct bluestore_compression_header_t {
+  uint8_t type = Compressor::COMP_ALG_NONE;  ///< compression algorithm id
+  uint32_t length = 0;
+  std::optional<int32_t> compressor_message; ///< optional; encoded since v2
+
+  bluestore_compression_header_t() {}
+  bluestore_compression_header_t(uint8_t _type)
+    : type(_type) {}
+
+  // Encoding version 2, compatible down to 1: type, length and, for
+  // struct_v >= 2 only, compressor_message.
+  DENC(bluestore_compression_header_t, v, p) {
+    DENC_START(2, 1, p);
+    denc(v.type, p);
+    denc(v.length, p);
+    if (struct_v >= 2) {
+      denc(v.compressor_message, p);
+    }
+    DENC_FINISH(p);
+  }
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bluestore_compression_header_t*>& o);
+};
+WRITE_CLASS_DENC(bluestore_compression_header_t)
+
+/// Approximate reference-count tracker built on two hashed counter tables.
+///
+/// Each key is hashed with two different functions (rjenkins and linux);
+/// every inc() adds its delta to one bucket of each table. num_non_zero is
+/// the number of buckets (over both tables) that currently hold a non-zero
+/// value.
+///
+/// V is the vector template used for the tables, COUNTER_TYPE the type of a
+/// single counter.
+template <template <typename> typename V, class COUNTER_TYPE = int32_t>
+class ref_counter_2hash_tracker_t {
+  size_t num_non_zero = 0;
+  size_t num_buckets = 0;
+  V<COUNTER_TYPE> buckets1;
+  V<COUNTER_TYPE> buckets2;
+
+  /// Add the non-zero delta n to buckets[idx], keeping num_non_zero in sync:
+  /// a bucket leaving zero is counted, a bucket returning to zero is
+  /// uncounted.
+  void _apply(V<COUNTER_TYPE>& buckets, size_t idx, int n) {
+    if (buckets[idx] == 0) {
+      ++num_non_zero;
+    } else if (buckets[idx] == -n) {
+      --num_non_zero;
+    }
+    buckets[idx] += n;
+  }
+
+public:
+  /// Size both tables so that together they take about mem_cap bytes.
+  /// Asserts that this yields at least one bucket per table.
+  ref_counter_2hash_tracker_t(uint64_t mem_cap) {
+    num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
+    ceph_assert(num_buckets);
+    buckets1.resize(num_buckets);
+    buckets2.resize(num_buckets);
+    reset();
+  }
+
+  size_t get_num_buckets() const {
+    return num_buckets;
+  }
+
+  /// Add n to the two buckets selected by the key
+  /// [hash_val, hash_val + hash_val_len).
+  void inc(const char* hash_val, size_t hash_val_len, int n) {
+    // A zero delta changes nothing. Without this guard a zero bucket would
+    // compare equal to -n and num_non_zero would be decremented by mistake
+    // (underflowing when it is already 0).
+    if (n == 0) {
+      return;
+    }
+    _apply(buckets1,
+           ceph_str_hash_rjenkins(hash_val, hash_val_len) % num_buckets,
+           n);
+    _apply(buckets2,
+           ceph_str_hash_linux(hash_val, hash_val_len) % num_buckets,
+           n);
+  }
+
+  /// True when the two keys (both hash_val_len bytes long) land in the same
+  /// bucket of both tables, i.e. the tracker cannot tell them apart.
+  bool test_hash_conflict(
+    const char* hash_val1,
+    const char* hash_val2,
+    size_t hash_val_len) const {
+
+    auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
+    auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
+    auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
+    auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
+    return ((h1 % num_buckets) == (h2 % num_buckets)) &&
+      ((h3 % num_buckets) == (h4 % num_buckets));
+  }
+
+  /// True when both buckets selected by the key hold zero.
+  bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
+    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
+    if (buckets1[h % num_buckets] != 0) {
+      return false;
+    }
+    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
+    return buckets2[h % num_buckets] == 0;
+  }
+
+  // returns number of mismatching buckets
+  size_t count_non_zero() const {
+    return num_non_zero;
+  }
+  /// Zero every bucket in both tables and reset the non-zero count.
+  void reset() {
+    for (size_t i = 0; i < num_buckets; i++) {
+      buckets1[i] = 0;
+      buckets2[i] = 0;
+    }
+    num_non_zero = 0;
+  }
+};
+
+/// ref_counter_2hash_tracker_t specialisation keyed by (sbid, offset) pairs,
+/// with its counter tables kept in the bluestore_fsck mempool.
+class shared_blob_2hash_tracker_t
+  : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
+
+  // number of 64-bit words that make up one hash input
+  static const size_t hash_input_len = 3;
+
+  typedef std::array<uint64_t, hash_input_len> hash_input_t;
+
+  /// Size of one hash input in bytes.
+  static size_t get_hash_input_size() {
+    return hash_input_len * sizeof(hash_input_t::value_type);
+  }
+
+  inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
+  // number of trailing zero bits in the allocation unit (set by the ctor)
+  size_t au_void_bits = 0;
+
+
+public:
+  /// mem_cap is forwarded to the base tracker; alloc_unit must be a non-zero
+  /// power of two (asserted).
+  shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
+    : ref_counter_2hash_tracker_t(mem_cap) {
+    ceph_assert(alloc_unit);
+    ceph_assert(std::has_single_bit(alloc_unit));
+    au_void_bits = std::countr_zero(alloc_unit);
+  }
+  void inc(uint64_t sbid, uint64_t offset, int n);
+  void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);
+
+  bool test_hash_conflict(
+    uint64_t sbid,
+    uint64_t offset,
+    uint64_t sbid2,
+    uint64_t offset2) const;
+  bool test_all_zero(
+    uint64_t sbid,
+    uint64_t offset) const;
+  bool test_all_zero_range(
+    uint64_t sbid,
+    uint64_t offset,
+    uint32_t len) const;
+};
+
+/// Per-shared-blob record. The class is declared packed; the shared blob id
+/// is stored signed so that its sign can double as the "stray" marker.
+class sb_info_t {
+  // subzero value indicates (potentially) stray blob,
+  // i.e. blob that has got no real references from onodes
+  int64_t sbid = 0;
+
+public:
+  enum {
+    INVALID_POOL_ID = INT64_MIN
+  };
+
+  int64_t pool_id = INVALID_POOL_ID;
+  // subzero value indicates compressed_allocated as well
+  int32_t allocated_chunks = 0;
+
+  sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
+  {
+  }
+  // ordering ignores the stray marker: entries compare by |sbid|
+  bool operator< (const sb_info_t& other) const {
+    return std::abs(sbid) < std::abs(other.sbid);
+  }
+  bool operator< (const uint64_t& other_sbid) const {
+    return uint64_t(std::abs(sbid)) < other_sbid;
+  }
+  /// True while the stored id is negative.
+  bool is_stray() const {
+    return sbid < 0;
+  }
+  /// The shared blob id without the stray marker.
+  uint64_t get_sbid() const {
+    return uint64_t(std::abs(sbid));
+  }
+  /// Clear the stray marker by making the stored id non-negative.
+  void adopt() {
+    sbid = std::abs(sbid);
+  }
+} __attribute__((packed));
+
+// Space-efficient container that keeps a set of sb_info structures,
+// given that the majority of entries are appended in proper id-sorted
+// order. Hence they can be kept in a regular vector, with binary search
+// applied whenever a specific entry has to be found.
+// For the rare occasions when an out-of-order append takes place, an
+// auxiliary sorted vector is used.
+/// Set of sb_info_t records held in two id-sorted vectors: `items` for
+/// entries appended in increasing id order and `aux_items` for the
+/// out-of-order ones.
+struct sb_info_space_efficient_map_t {
+  // large array sorted by the user
+  mempool::bluestore_fsck::vector<sb_info_t> items;
+  // small additional set of items we maintain sorting ourselves
+  // this would never keep an entry with id > items.back().id
+  mempool::bluestore_fsck::vector<sb_info_t> aux_items;
+
+  /// Return the entry for sbid; a newly created entry is marked as stray
+  /// (stored with a negative id), an existing one is returned as it is.
+  sb_info_t& add_maybe_stray(uint64_t sbid) {
+    return _add(-int64_t(sbid));
+  }
+  /// Return the entry for sbid, creating it if needed, with the stray
+  /// marker cleared.
+  sb_info_t& add_or_adopt(uint64_t sbid) {
+    auto& r = _add(sbid);
+    r.adopt();
+    return r;
+  }
+  /// Look up the entry with the given id.
+  ///
+  /// Returns an iterator to the entry, which may point into either `items`
+  /// or `aux_items`, or end() (== items.end()) when there is no such entry.
+  auto find(uint64_t id) {
+    if (items.size() != 0) {
+      auto sbid_less = [](const sb_info_t& a, const uint64_t& b) {
+	return a < b;
+      };
+      // The search range excludes the last element, so the result is always
+      // dereferenceable: at worst it is the last element itself.
+      auto it = std::lower_bound(
+	items.begin(),
+	items.end() - 1,
+	id,
+	sbid_less);
+      if (it->get_sbid() == id) {
+	return it;
+      }
+      if (aux_items.size() != 0) {
+	auto aux_it = std::lower_bound(
+	  aux_items.begin(),
+	  aux_items.end(),
+	  id,
+	  sbid_less);
+	// lower_bound yields end() when every aux entry is below id;
+	// that iterator must not be dereferenced.
+	if (aux_it != aux_items.end() && aux_it->get_sbid() == id) {
+	  return aux_it;
+	}
+      }
+    }
+    return items.end();
+  }
+  // enumerates strays, order isn't guaranteed.
+  void foreach_stray(std::function<void(const sb_info_t&)> cb) {
+    for (auto& sbi : items) {
+      if (sbi.is_stray()) {
+	cb(sbi);
+      }
+    }
+    for (auto& sbi : aux_items) {
+      if (sbi.is_stray()) {
+	cb(sbi);
+      }
+    }
+  }
+  /// The "not found" value returned by find().
+  auto end() {
+    return items.end();
+  }
+
+  /// Release the spare capacity of both vectors.
+  void shrink() {
+    items.shrink_to_fit();
+    aux_items.shrink_to_fit();
+  }
+  /// Remove all entries and release the memory.
+  void clear() {
+    items.clear();
+    aux_items.clear();
+    shrink();
+  }
+private:
+  /// Find or create the entry for |id|. A newly created entry stores `id`
+  /// as given, sign included; an existing entry is returned untouched.
+  sb_info_t& _add(int64_t id) {
+    uint64_t n_id = uint64_t(std::abs(id));
+    // common case: ids arrive in increasing order, append to the main array
+    if (items.size() == 0 || n_id > items.back().get_sbid()) {
+      return items.emplace_back(id);
+    }
+    auto it = find(n_id);
+    if (it != items.end()) {
+      return *it;
+    }
+    if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
+      return aux_items.emplace_back(id);
+    }
+    // do sorted insertion, may be expensive!
+    it = std::upper_bound(
+      aux_items.begin(),
+      aux_items.end(),
+      n_id,
+      [](const uint64_t& a, const sb_info_t& b) {
+	return a < b.get_sbid();
+      });
+    return *aux_items.emplace(it, id);
+  }
+};
+
+#endif
diff --git a/src/os/bluestore/fastbmap_allocator_impl.cc b/src/os/bluestore/fastbmap_allocator_impl.cc
new file mode 100644
index 000000000..4f735ba2e
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.cc
@@ -0,0 +1,707 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#include "fastbmap_allocator_impl.h"
+
+// Definitions of the AllocatorLevel static counters; all start at zero.
+uint64_t AllocatorLevel::l0_dives = 0;
+uint64_t AllocatorLevel::l0_iterations = 0;
+uint64_t AllocatorLevel::l0_inner_iterations = 0;
+uint64_t AllocatorLevel::alloc_fragments = 0;
+uint64_t AllocatorLevel::alloc_fragments_fast = 0;
+uint64_t AllocatorLevel::l2_allocs = 0;
+
+// Trim an interval's length down to a whole multiple of min_length.
+// An interval shorter than min_length collapses to an empty interval_t;
+// the offset is passed through unchanged otherwise.
+inline interval_t _align2units(uint64_t offset, uint64_t len, uint64_t min_length)
+{
+  if (len < min_length) {
+    return interval_t();
+  }
+  const uint64_t aligned_len = p2align<uint64_t>(len, min_length);
+  return interval_t(offset, aligned_len);
+}
+
+// Scan the l0 bitmap positions [pos0, pos1) for the longest run of free
+// (set) bits whose length, trimmed down to a multiple of min_length, is
+// the largest.
+//
+// tail is an in/out parameter. On entry a non-empty *tail is taken as a
+// free run to be continued by this scan (its offset and length must be
+// multiples of l0_granularity). On return *tail describes the free run that
+// reaches pos1, or is empty when the scan ends on an allocated position.
+// When pos0 >= pos1 an empty interval is returned and *tail is untouched.
+//
+// The returned interval and *tail are scaled by l0_granularity.
+interval_t AllocatorLevel01Loose::_get_longest_from_l0(uint64_t pos0,
+  uint64_t pos1, uint64_t min_length, interval_t* tail) const
+{
+  interval_t res;
+  if (pos0 >= pos1) {
+    return res;
+  }
+  auto pos = pos0;
+
+  // free run currently being accumulated, in l0 positions
+  interval_t res_candidate;
+  if (tail->length != 0) {
+    ceph_assert((tail->offset % l0_granularity) == 0);
+    ceph_assert((tail->length % l0_granularity) == 0);
+    res_candidate.offset = tail->offset / l0_granularity;
+    res_candidate.length = tail->length / l0_granularity;
+  }
+  *tail = interval_t();
+
+  auto d = bits_per_slot;
+  // load the slot containing pos and drop the bits located before pos
+  slot_t bits = l0[pos / d];
+  bits >>= pos % d;
+  bool end_loop = false;
+  auto min_granules = min_length / l0_granularity;
+
+  do {
+    if ((pos % d) == 0) {
+      // at a slot boundary: reload and try to handle the whole slot at once
+      bits = l0[pos / d];
+      if (pos1 - pos >= d) {
+        switch(bits) {
+	  case all_slot_set:
+	    // slot is totally free
+	    if (!res_candidate.length) {
+	      res_candidate.offset = pos;
+	    }
+	    res_candidate.length += d;
+	    pos += d;
+	    end_loop = pos >= pos1;
+	    if (end_loop) {
+	      *tail = res_candidate;
+	      res_candidate = _align2units(res_candidate.offset,
+		res_candidate.length, min_granules);
+	      if(res.length < res_candidate.length) {
+		res = res_candidate;
+	      }
+	    }
+	    continue;
+	  case all_slot_clear:
+	    // slot is totally allocated
+	    res_candidate = _align2units(res_candidate.offset,
+	      res_candidate.length, min_granules);
+	    if (res.length < res_candidate.length) {
+	      res = res_candidate;
+	    }
+	    res_candidate = interval_t();
+	    pos += d;
+	    end_loop = pos >= pos1;
+	    continue;
+	}
+      }
+    } //if ((pos % d) == 0)
+
+    // bit-by-bit path; note that pos is advanced before the bit is examined
+    end_loop = ++pos >= pos1;
+    if (bits & 1) {
+      // item is free
+      if (!res_candidate.length) {
+	res_candidate.offset = pos - 1;
+      }
+      ++res_candidate.length;
+      if (end_loop) {
+	*tail = res_candidate;
+	res_candidate = _align2units(res_candidate.offset,
+	  res_candidate.length, min_granules);
+	if (res.length < res_candidate.length) {
+	  res = res_candidate;
+	}
+      }
+    } else {
+      // item is allocated: close the current run and keep it if it is the
+      // longest seen so far
+      res_candidate = _align2units(res_candidate.offset,
+	res_candidate.length, min_granules);
+      if (res.length < res_candidate.length) {
+	res = res_candidate;
+      }
+      res_candidate = interval_t();
+    }
+    bits >>= 1;
+  } while (!end_loop);
+  // convert from l0 positions back to l0_granularity units
+  res.offset *= l0_granularity;
+  res.length *= l0_granularity;
+  tail->offset *= l0_granularity;
+  tail->length *= l0_granularity;
+  return res;
+}
+
+// Walk the l1 entries in [pos_start, pos_end) - both must be multiples of
+// L1_ENTRIES_PER_SLOT - and collect allocation candidates into *ctx:
+//  - free_l1_pos / free_count: a run of consecutive free l1 entries;
+//  - partial_count: number of partial entries visited;
+//  - affordable_len / affordable_offs: the shortest "longest free extent"
+//    among partial entries that is still >= length;
+//  - min_affordable_len / min_affordable_offs: the same with min_length as
+//    the threshold.
+// mode selects an early exit: STOP_ON_EMPTY returns right after the first
+// free entry has been recorded, STOP_ON_PARTIAL right after the first
+// partial entry has been analysed. ctx->fully_processed is set only when
+// the whole range has been walked.
+void AllocatorLevel01Loose::_analyze_partials(uint64_t pos_start,
+  uint64_t pos_end, uint64_t length, uint64_t min_length, int mode,
+  search_ctx_t* ctx)
+{
+  auto d = L1_ENTRIES_PER_SLOT;
+  ceph_assert((pos_start % d) == 0);
+  ceph_assert((pos_end % d) == 0);
+
+  // number of l0 positions covered by a single l1 entry
+  uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+
+  uint64_t l1_pos = pos_start;
+  const interval_t empty_tail;
+  // free run ending at the boundary of the previous partial entry; it is
+  // reset whenever a free or full entry is met
+  interval_t prev_tail;
+
+  uint64_t next_free_l1_pos = 0;
+  for (auto pos = pos_start / d; pos < pos_end / d; ++pos) {
+    slot_t slot_val = l1[pos];
+    // FIXME minor: code below can be optimized to check slot_val against
+    // all_slot_set(_clear) value
+
+    for (auto c = 0; c < d; c++) {
+      switch (slot_val & L1_ENTRY_MASK) {
+      case L1_ENTRY_FREE:
+        prev_tail  = empty_tail;
+        if (!ctx->free_count) {
+          ctx->free_l1_pos = l1_pos;
+        } else if (l1_pos != next_free_l1_pos){
+	  auto o = ctx->free_l1_pos * l1_granularity;
+	  auto l = ctx->free_count * l1_granularity;
+          // check if already found extent fits min_length after alignment
+	  if (_align2units(o, l, min_length).length >= min_length) {
+	    // leaves the switch only: the recorded run is kept and this
+	    // entry is not added to it
+	    break;
+	  }
+	  // if not - proceed with the next one
+          ctx->free_l1_pos = l1_pos;
+          ctx->free_count = 0;
+	}
+	next_free_l1_pos = l1_pos + 1;
+        ++ctx->free_count;
+        if (mode == STOP_ON_EMPTY) {
+          return;
+        }
+        break;
+      case L1_ENTRY_FULL:
+        prev_tail = empty_tail;
+        break;
+      case L1_ENTRY_PARTIAL:
+	interval_t longest;
+        ++ctx->partial_count;
+
+        // longest free extent inside this entry, extended by prev_tail
+        longest = _get_longest_from_l0(l1_pos * l0_w, (l1_pos + 1) * l0_w, min_length, &prev_tail);
+
+        // keep the smallest extent that still satisfies the full length
+        if (longest.length >= length) {
+          if ((ctx->affordable_len == 0) ||
+              ((ctx->affordable_len != 0) &&
+                (longest.length < ctx->affordable_len))) {
+            ctx->affordable_len = longest.length;
+	    ctx->affordable_offs = longest.offset;
+          }
+        }
+        // likewise for min_length; the stored length is aligned down to it
+        if (longest.length >= min_length &&
+	    (ctx->min_affordable_len == 0 ||
+	      (longest.length < ctx->min_affordable_len))) {
+
+          ctx->min_affordable_len = p2align<uint64_t>(longest.length, min_length);
+	  ctx->min_affordable_offs = longest.offset;
+        }
+        if (mode == STOP_ON_PARTIAL) {
+          return;
+        }
+        break;
+      }
+      // move on to the next l1 entry packed in this slot
+      slot_val >>= L1_ENTRY_WIDTH;
+      ++l1_pos;
+    }
+  }
+  ctx->fully_processed = true;
+}
+
+// Recompute the l1 entries that summarise the l0 positions
+// [l0_pos, l0_pos_end). Both bounds must be aligned to bits_per_slotset.
+// Every slotset is classified as FULL (all its slots are all_slot_clear),
+// FREE (all its slots are all_slot_set) or PARTIAL (anything else); the
+// matching l1 entry is rewritten and unalloc_l1_count / partial_l1_count
+// are adjusted for the old and the new state.
+void AllocatorLevel01Loose::_mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end)
+{
+  if (l0_pos == l0_pos_end) {
+    return;
+  }
+  auto d0 = bits_per_slotset;
+  uint64_t l1_w = L1_ENTRIES_PER_SLOT;
+  // this should be aligned with slotset boundaries
+  ceph_assert(0 == (l0_pos % d0));
+  ceph_assert(0 == (l0_pos_end % d0));
+
+  // indices of l0 slots (not bits)
+  int64_t idx = l0_pos / bits_per_slot;
+  int64_t idx_end = l0_pos_end / bits_per_slot;
+  // state accumulated for the slotset being scanned; NOT_USED means that
+  // none of its slots has been examined yet
+  slot_t mask_to_apply = L1_ENTRY_NOT_USED;
+
+  auto l1_pos = l0_pos / d0;
+
+  while (idx < idx_end) {
+    if (l0[idx] == all_slot_clear) {
+      // if not all prev slots are allocated then no need to check the
+      // current slot set, it's partial
+      ++idx;
+      if (mask_to_apply == L1_ENTRY_NOT_USED) {
+	mask_to_apply = L1_ENTRY_FULL;
+      } else if (mask_to_apply != L1_ENTRY_FULL) {
+	// jump to the first slot of the next slotset
+	idx = p2roundup(idx, int64_t(slots_per_slotset));
+        mask_to_apply = L1_ENTRY_PARTIAL;
+      }
+    } else if (l0[idx] == all_slot_set) {
+      // if not all prev slots are free then no need to check the
+      // current slot set, it's partial
+      ++idx;
+      if (mask_to_apply == L1_ENTRY_NOT_USED) {
+	mask_to_apply = L1_ENTRY_FREE;
+      } else if (mask_to_apply != L1_ENTRY_FREE) {
+	// jump to the first slot of the next slotset
+	idx = p2roundup(idx, int64_t(slots_per_slotset));
+        mask_to_apply = L1_ENTRY_PARTIAL;
+      }
+    } else {
+      // no need to check the current slot set, it's partial
+      mask_to_apply = L1_ENTRY_PARTIAL;
+      ++idx;
+      idx = p2roundup(idx, int64_t(slots_per_slotset));
+    }
+    // a slotset boundary has been reached: commit the state to l1
+    if ((idx % slots_per_slotset) == 0) {
+      ceph_assert(mask_to_apply != L1_ENTRY_NOT_USED);
+      uint64_t shift = (l1_pos % l1_w) * L1_ENTRY_WIDTH;
+      slot_t& slot_val = l1[l1_pos / l1_w];
+      auto mask = slot_t(L1_ENTRY_MASK) << shift;
+
+      // un-count the entry's previous state ...
+      slot_t old_mask = (slot_val & mask) >> shift;
+      switch(old_mask) {
+      case L1_ENTRY_FREE:
+	unalloc_l1_count--;
+	break;
+      case L1_ENTRY_PARTIAL:
+	partial_l1_count--;
+	break;
+      }
+      slot_val &= ~mask;
+      slot_val |= slot_t(mask_to_apply) << shift;
+      // ... and count the new one
+      switch(mask_to_apply) {
+      case L1_ENTRY_FREE:
+	unalloc_l1_count++;
+	break;
+      case L1_ENTRY_PARTIAL:
+	partial_l1_count++;
+	break;
+      }
+      mask_to_apply = L1_ENTRY_NOT_USED;
+      ++l1_pos;
+    }
+  }
+}
+
+// Clear the l0 bits for positions [l0_pos_start, l0_pos_end), i.e. mark
+// them allocated. The work is done in three phases: the leading slot bit by
+// bit, then whole slots at once, then the remaining bits of the last slot.
+void AllocatorLevel01Loose::_mark_alloc_l0(int64_t l0_pos_start,
+  int64_t l0_pos_end)
+{
+  auto d0 = L0_ENTRIES_PER_SLOT;
+
+  int64_t pos = l0_pos_start;
+  slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+  slot_t* val_s = l0.data() + (pos / d0);
+  // phase 1: from the start position up to the end of its slot (or up to
+  // l0_pos_end if that comes first)
+  int64_t pos_e = std::min(l0_pos_end, p2roundup<int64_t>(l0_pos_start + 1, d0));
+  while (pos < pos_e) {
+    (*val_s) &= ~bits;
+    bits <<= 1;
+    pos++;
+  }
+  // phase 2: slots covered in full; val_s still points at the slot handled
+  // above, hence the pre-increment
+  pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+  while (pos < pos_e) {
+    *(++val_s) = all_slot_clear;
+    pos += d0;
+  }
+  // phase 3: leading bits of the slot that follows, up to l0_pos_end
+  bits = 1;
+  ++val_s;
+  while (pos < l0_pos_end) {
+    (*val_s) &= ~bits;
+    bits <<= 1;
+    pos++;
+  }
+}
+
+interval_t AllocatorLevel01Loose::_allocate_l1_contiguous(uint64_t length,
+  uint64_t min_length, uint64_t max_length, // note: max_length is not used by this function
+  uint64_t pos_start, uint64_t pos_end)
+{ // Picks and allocates one contiguous extent of at most `length`; returns {0, 0} when nothing was allocated.
+  interval_t res = { 0, 0 };
+  uint64_t l0_w = slots_per_slotset * L0_ENTRIES_PER_SLOT; // L0 entries per L1 entry
+
+  if (unlikely(length <= l0_granularity)) { // request fits into a single L0 entry
+    search_ctx_t ctx;
+    _analyze_partials(pos_start, pos_end, l0_granularity, l0_granularity,
+      STOP_ON_PARTIAL, &ctx);
+
+    // check partially free slot sets first (including neighboring),
+    // full length match required.
+    if (ctx.affordable_len) {
+      // allocate as specified
+      ceph_assert(ctx.affordable_len >= length);
+      auto pos = ctx.affordable_offs / l0_granularity;
+      _mark_alloc_l1_l0(pos, pos + 1);
+      res = interval_t(ctx.affordable_offs, length);
+      return res;
+    }
+
+    // allocate from free slot sets
+    if (ctx.free_count) {
+      auto l = std::min(length, ctx.free_count * l1_granularity);
+      ceph_assert((l % l0_granularity) == 0);
+      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; // NOTE: shadows the pos_end parameter
+
+      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+      return res;
+    }
+  } else if (unlikely(length == l1_granularity)) { // request is exactly one L1 entry
+    search_ctx_t ctx;
+    _analyze_partials(pos_start, pos_end, length, min_length, STOP_ON_EMPTY, &ctx);
+
+    // allocate using contiguous extent found at l1 if any
+    if (ctx.free_count) {
+
+      auto l = std::min(length, ctx.free_count * l1_granularity);
+      ceph_assert((l % l0_granularity) == 0);
+      auto pos_end = ctx.free_l1_pos * l0_w + l / l0_granularity; // NOTE: shadows the pos_end parameter
+
+      _mark_alloc_l1_l0(ctx.free_l1_pos * l0_w, pos_end);
+      res = interval_t(ctx.free_l1_pos * l1_granularity, l);
+
+      return res;
+    }
+
+    // we can terminate earlier on free entry only
+    ceph_assert(ctx.fully_processed);
+
+    // check partially free slot sets first (including neighboring),
+    // full length match required.
+    if (ctx.affordable_len) {
+      ceph_assert(ctx.affordable_len >= length);
+      ceph_assert((length % l0_granularity) == 0);
+      auto pos_start = ctx.affordable_offs / l0_granularity; // NOTE: shadows the pos_start parameter
+      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+      _mark_alloc_l1_l0(pos_start, pos_end);
+      res = interval_t(ctx.affordable_offs, length);
+      return res;
+    }
+    if (ctx.min_affordable_len) { // otherwise take the shorter extent the analysis recorded (presumably >= min_length; confirm in _analyze_partials)
+      auto pos_start = ctx.min_affordable_offs / l0_granularity;
+      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+      _mark_alloc_l1_l0(pos_start, pos_end);
+      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+    }
+  } else { // any other length: the whole range is analyzed (NO_STOP) before choosing
+    search_ctx_t ctx;
+    _analyze_partials(pos_start, pos_end, length, min_length, NO_STOP, &ctx);
+    ceph_assert(ctx.fully_processed);
+    // check partially free slot sets first (including neighboring),
+    // full length match required.
+    if (ctx.affordable_len) {
+      ceph_assert(ctx.affordable_len >= length);
+      ceph_assert((length % l0_granularity) == 0);
+      auto pos_start = ctx.affordable_offs / l0_granularity;
+      auto pos_end = (ctx.affordable_offs + length) / l0_granularity;
+      _mark_alloc_l1_l0(pos_start, pos_end);
+      res = interval_t(ctx.affordable_offs, length);
+      return res;
+    }
+    // allocate using contiguous extent found at l1 if affordable
+    // align allocated extent with min_length
+    if (ctx.free_count) {
+      auto o = ctx.free_l1_pos * l1_granularity;
+      auto l = ctx.free_count * l1_granularity;
+      interval_t aligned_extent = _align2units(o, l, min_length);
+      if (aligned_extent.length > 0) {
+	aligned_extent.length = std::min(length,
+	  uint64_t(aligned_extent.length));
+	ceph_assert((aligned_extent.offset % l0_granularity) == 0);
+	ceph_assert((aligned_extent.length % l0_granularity) == 0);
+
+	auto pos_start = aligned_extent.offset / l0_granularity;
+	auto pos_end = (aligned_extent.offset + aligned_extent.length) / l0_granularity;
+
+	_mark_alloc_l1_l0(pos_start, pos_end);
+	return aligned_extent;
+      }
+    }
+    if (ctx.min_affordable_len) { // last resort: the shorter extent recorded by the analysis
+      auto pos_start = ctx.min_affordable_offs / l0_granularity;
+      auto pos_end = (ctx.min_affordable_offs + ctx.min_affordable_len) / l0_granularity;
+      _mark_alloc_l1_l0(pos_start, pos_end);
+      return interval_t(ctx.min_affordable_offs, ctx.min_affordable_len);
+    }
+  }
+  return res; // still {0, 0}: nothing suitable was found
+}
+
+bool AllocatorLevel01Loose::_allocate_l1(uint64_t length,
+  uint64_t min_length, uint64_t max_length,
+  uint64_t l1_pos_start, uint64_t l1_pos_end,
+  uint64_t* allocated, // in/out: running total; allocation stops once it reaches length
+  interval_vector_t* res) // out: allocated extents are appended here
+{ // Allocates from L1 entries [l1_pos_start, l1_pos_end); returns _is_empty_l1() of that range afterwards.
+  uint64_t d0 = L0_ENTRIES_PER_SLOT;
+  uint64_t d1 = L1_ENTRIES_PER_SLOT;
+
+  ceph_assert(0 == (l1_pos_start % (slots_per_slotset * d1)));
+  ceph_assert(0 == (l1_pos_end % (slots_per_slotset * d1)));
+  if (min_length != l0_granularity) {
+    // probably not the most efficient way but
+    // don't care much about that at the moment
+    bool has_space = true;
+    while (length > *allocated && has_space) { // one contiguous extent per iteration
+      interval_t i =
+        _allocate_l1_contiguous(length - *allocated, min_length, max_length,
+	  l1_pos_start, l1_pos_end);
+      if (i.length == 0) {
+        has_space = false;
+      } else {
+	_fragment_and_emplace(max_length, i.offset, i.length, res);
+        *allocated += i.length;
+      }
+    }
+  } else { // min_length equals the L0 granularity: walk the L1 slots and take whatever is free
+    uint64_t l0_w = slots_per_slotset * d0;
+
+    for (auto idx = l1_pos_start / d1;
+      idx < l1_pos_end / d1 && length > *allocated;
+      ++idx) {
+      slot_t& slot_val = l1[idx];
+      if (slot_val == all_slot_clear) { // nothing free below this slot
+        continue;
+      } else if (slot_val == all_slot_set) { // everything below this slot is free: take it in bulk
+        uint64_t to_alloc = std::min(length - *allocated,
+          l1_granularity * d1);
+        *allocated += to_alloc;
+        ++alloc_fragments_fast;
+	_fragment_and_emplace(max_length, idx * d1 * l1_granularity, to_alloc,
+	  res);
+        _mark_alloc_l1_l0(idx * d1 * bits_per_slotset,
+	  idx * d1 * bits_per_slotset + to_alloc / l0_granularity);
+        continue;
+      }
+      auto free_pos = find_next_set_bit(slot_val, 0); // bit offset inside the slot of the first entry that is not FULL
+      ceph_assert(free_pos < bits_per_slot);
+      do {
+        ceph_assert(length > *allocated);
+
+        bool empty; // true when the slotset under this L1 entry has no free L0 entry left
+        empty = _allocate_l0(length, max_length,
+	  (idx * d1 + free_pos / L1_ENTRY_WIDTH) * l0_w,
+          (idx * d1 + free_pos / L1_ENTRY_WIDTH + 1) * l0_w,
+          allocated,
+          res);
+
+	auto mask = slot_t(L1_ENTRY_MASK) << free_pos;
+
+	slot_t old_mask = (slot_val & mask) >> free_pos; // state of the entry before this allocation
+	switch(old_mask) {
+	case L1_ENTRY_FREE:
+	  unalloc_l1_count--;
+	  break;
+	case L1_ENTRY_PARTIAL:
+	  partial_l1_count--;
+	  break;
+	}
+        slot_val &= ~mask;
+        if (empty) {
+          // the next line is no op with the current L1_ENTRY_FULL but left
+          // as-is for the sake of uniformity and to avoid potential errors
+          // in future
+          slot_val |= slot_t(L1_ENTRY_FULL) << free_pos;
+        } else {
+          slot_val |= slot_t(L1_ENTRY_PARTIAL) << free_pos;
+	  partial_l1_count++;
+        }
+        if (length <= *allocated || slot_val == all_slot_clear) {
+          break;
+        }
+	free_pos = find_next_set_bit(slot_val, free_pos + L1_ENTRY_WIDTH); // may be called with start position == bits_per_slot
+      } while (free_pos < bits_per_slot);
+    }
+  }
+  return _is_empty_l1(l1_pos_start, l1_pos_end);
+}
+
+void AllocatorLevel01Loose::collect_stats(
+  std::map<size_t, size_t>& bins_overall)
+{ // Histogram of free-run lengths in l0: a run of n free entries increments bins_overall[cbits(n) - 1].
+  size_t free_seq_cnt = 0; // length of the free run in progress; carried across slot boundaries
+  for (auto slot : l0) {
+    if (slot == all_slot_set) {
+      free_seq_cnt += L0_ENTRIES_PER_SLOT;
+    } else if(slot != all_slot_clear) {
+      size_t pos = 0;
+      do {
+	auto pos1 = find_next_set_bit(slot, pos);
+	if (pos1 == pos) { // the bit at pos is free: the current run grows
+	  free_seq_cnt++;
+	  pos = pos1 + 1;
+	} else { // allocated bit(s) at [pos, pos1): the current run, if any, ends here
+	  if (free_seq_cnt) {
+	    bins_overall[cbits(free_seq_cnt) - 1]++;
+	    free_seq_cnt = 0;
+	  }
+	  if (pos1 < bits_per_slot) { // a new run starts at pos1
+	    free_seq_cnt = 1;
+	  }
+          pos = pos1 + 1;
+	}
+      } while (pos < bits_per_slot);
+    } else if (free_seq_cnt) { // fully allocated slot terminates the run
+      bins_overall[cbits(free_seq_cnt) - 1]++;
+      free_seq_cnt = 0;
+    }
+  }
+  if (free_seq_cnt) { // run that reaches the end of l0
+    bins_overall[cbits(free_seq_cnt) - 1]++;
+  }
+}
+
+inline ssize_t AllocatorLevel01Loose::count_0s(slot_t slot_val, size_t start_pos)
+{
+  // Length of the run of clear bits in slot_val that begins at bit start_pos.
+#ifdef __GNUC__
+  // ffsll() gives the 1-based index of the lowest set bit, or 0 if none is set
+  const size_t first_set = __builtin_ffsll(slot_val >> start_pos);
+  return first_set ? first_set - 1 : bits_per_slot - start_pos;
+#else
+  size_t zeros = 0;
+  for (slot_t probe = slot_t(1) << start_pos;
+       start_pos + zeros < bits_per_slot && (slot_val & probe) == 0;
+       probe <<= 1) {
+    ++zeros;
+  }
+  return zeros;
+#endif
+}
+
+ inline ssize_t AllocatorLevel01Loose::count_1s(slot_t slot_val, size_t start_pos)
+ { // Length of the run of set bits starting at start_pos: the zero-run of the complement.
+   return count_0s(~slot_val, start_pos);
+ }
+void AllocatorLevel01Loose::foreach_internal(
+    std::function<void(uint64_t offset, uint64_t length)> notify)
+{ // Calls notify(off, len) for each run of contiguous free L0 entries; off and len are counted in L0 entries.
+  size_t len = 0; // length of the free run in progress, 0 if none
+  size_t off = 0; // first L0 entry of that run
+  for (size_t i = 0; i < l1.size(); i++)
+  {
+    for (size_t j = 0; j < L1_ENTRIES_PER_SLOT * L1_ENTRY_WIDTH; j += L1_ENTRY_WIDTH)
+    {
+      size_t w = (l1[i] >> j) & L1_ENTRY_MASK; // state of one L1 entry
+      switch (w) {
+        case L1_ENTRY_FULL: // ends the run in progress
+          if (len > 0) {
+            notify(off, len);
+            len = 0;
+          }
+          break;
+        case L1_ENTRY_FREE: // starts or extends a run by a whole slotset
+          if (len == 0)
+            off = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset * bits_per_slot;
+          len += bits_per_slotset;
+          break;
+        case L1_ENTRY_PARTIAL: // the L0 slots below have to be scanned bit by bit
+          size_t pos = ( ( bits_per_slot * i + j ) / L1_ENTRY_WIDTH ) * slots_per_slotset; // first L0 slot of this L1 entry
+          for (size_t t = 0; t < slots_per_slotset; t++) {
+            size_t p = 0;
+            slot_t allocation_pattern = l0[pos + t];
+            while (p < bits_per_slot) {
+              if (len == 0) {
+                //continue to skip allocated space, meaning bits set to 0
+                ssize_t alloc_count = count_0s(allocation_pattern, p);
+                p += alloc_count;
+                //now we are switched to expecting free space
+                if (p < bits_per_slot) {
+                  //now @p are 1s
+                  ssize_t free_count = count_1s(allocation_pattern, p);
+                  assert(free_count > 0);
+                  len = free_count;
+                  off = (pos + t) * bits_per_slot + p;
+                  p += free_count;
+                }
+              } else {
+                //continue free region
+                ssize_t free_count = count_1s(allocation_pattern, p);
+                if (free_count == 0) { // allocated bit at p: report the run; p is advanced by the len == 0 branch next
+                  notify(off, len);
+                  len = 0;
+                } else {
+                  p += free_count;
+                  len += free_count;
+                }
+              }
+            }
+          }
+          break;
+      }
+    }
+  }
+  if (len > 0) // run that reaches the end of l1
+    notify(off, len);
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_left_l0(int64_t l0_pos_start)
+{ // Claims (clears) the run of free L0 entries ending right before l0_pos_start; returns the first claimed position.
+  int64_t d0 = L0_ENTRIES_PER_SLOT;
+  if (l0_pos_start <= 0) // nothing precedes entry 0; pos would be -1 and give a negative shift count below
+    return 0;
+  int64_t pos = l0_pos_start - 1;
+  slot_t bits = (slot_t)1 << (pos % d0);
+  int64_t idx = pos / d0;
+  slot_t* val_s = l0.data() + idx;
+  int64_t pos_e = p2align<int64_t>(pos, d0);
+
+  while (pos >= pos_e) { // step 1: walk down the remainder of the slot holding pos
+    if (0 == ((*val_s) & bits))
+      return pos + 1; // hit an allocated entry: the run starts right after it
+    (*val_s) &= ~bits;
+    bits >>= 1;
+    --pos;
+  }
+  --idx;
+  val_s = l0.data() + idx; // only dereferenced while idx >= 0
+  while (idx >= 0 && (*val_s) == all_slot_set) { // step 2: swallow completely free slots
+    *val_s = all_slot_clear;
+    --idx;
+    pos -= d0;
+    val_s = l0.data() + idx;
+  }
+
+  if (idx >= 0 &&
+      (*val_s) != all_slot_set && (*val_s) != all_slot_clear) { // step 3: top bits of one partially free slot
+    int64_t pos_e = p2align<int64_t>(pos, d0);
+    slot_t bits = (slot_t)1 << (pos % d0);
+    while (pos >= pos_e) {
+      if (0 == ((*val_s) & bits))
+        return pos + 1;
+      (*val_s) &= ~bits;
+      bits >>= 1;
+      --pos;
+    }
+  }
+  return pos + 1;
+}
+
+uint64_t AllocatorLevel01Loose::_claim_free_to_right_l0(int64_t l0_pos_start)
+{ // Claims (clears) the run of free L0 entries starting at l0_pos_start; returns the position just past the run.
+  auto d0 = L0_ENTRIES_PER_SLOT;
+
+  int64_t pos = l0_pos_start;
+  slot_t bits = (slot_t)1 << (pos % d0);
+  size_t idx = pos / d0;
+  if (idx >= l0.size()) { // start lies beyond the bitmap: nothing to claim
+    return pos;
+  }
+  slot_t* val_s = l0.data() + idx;
+
+  int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+
+  while (pos < pos_e) { // step 1: walk up the remainder of the slot holding pos
+    if (0 == ((*val_s) & bits))
+      return pos; // hit an allocated entry: the run ends here
+    (*val_s) &= ~bits;
+    bits <<= 1;
+    ++pos;
+  }
+  ++idx;
+  val_s = l0.data() + idx; // only dereferenced while idx < l0.size()
+  while (idx < l0.size() && (*val_s) == all_slot_set) { // step 2: swallow completely free slots
+    *val_s = all_slot_clear;
+    ++idx;
+    pos += d0;
+    val_s = l0.data() + idx;
+  }
+
+  if (idx < l0.size() &&
+      (*val_s) != all_slot_set && (*val_s) != all_slot_clear) { // step 3: low bits of one partially free slot
+    int64_t pos_e = p2roundup<int64_t>(pos + 1, d0);
+    slot_t bits = (slot_t)1 << (pos % d0);
+    while (pos < pos_e) {
+      if (0 == ((*val_s) & bits))
+        return pos;
+      (*val_s) &= ~bits;
+      bits <<= 1;
+      ++pos;
+    }
+  }
+  return pos;
+}
diff --git a/src/os/bluestore/fastbmap_allocator_impl.h b/src/os/bluestore/fastbmap_allocator_impl.h
new file mode 100644
index 000000000..550214b62
--- /dev/null
+++ b/src/os/bluestore/fastbmap_allocator_impl.h
@@ -0,0 +1,846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator implementation.
+ * Author: Igor Fedotov, ifedotov@suse.com
+ *
+ */
+
+#ifndef __FAST_BITMAP_ALLOCATOR_IMPL_H
+#define __FAST_BITMAP_ALLOCATOR_IMPL_H
+#include "include/intarith.h"
+
+#include <bit>
+#include <vector>
+#include <algorithm>
+#include <mutex>
+
+typedef uint64_t slot_t; // one 64-bit word of bitmap; all levels are vectors of these
+
+#ifdef NON_CEPH_BUILD
+#include <assert.h>
+struct interval_t
+{ // Plain (offset, length) extent; takes the place of bluestore_interval_t when NON_CEPH_BUILD is defined.
+  uint64_t offset = 0;
+  uint64_t length = 0;
+
+  interval_t() = default;
+  interval_t(uint64_t o, uint64_t l) : offset(o), length(l) {}
+  // Copy operations are left implicit: memberwise copy is exactly what the former
+  // hand-written copy constructor did, and the type stays trivially copyable.
+};
+typedef std::vector<interval_t> interval_vector_t; // list of allocated extents
+typedef std::vector<slot_t> slot_vector_t; // storage of one bitmap level
+#else
+#include "include/ceph_assert.h"
+#include "common/likely.h"
+#include "os/bluestore/bluestore_types.h"
+#include "include/mempool.h"
+#include "common/ceph_mutex.h"
+
+typedef bluestore_interval_t<uint64_t, uint64_t> interval_t; // (offset, length) extent
+typedef PExtentVector interval_vector_t; // list of allocated extents
+
+typedef mempool::bluestore_alloc::vector<slot_t> slot_vector_t; // storage of one bitmap level, from the bluestore_alloc mempool
+
+#endif
+
+// fitting into cache line on x86_64
+static const size_t slots_per_slotset = 8; // 8 slots per set
+static const size_t slotset_bytes = sizeof(slot_t) * slots_per_slotset; // 64 bytes
+static const size_t bits_per_slot = sizeof(slot_t) * 8; // 64
+static const size_t bits_per_slotset = slotset_bytes * 8; // 512
+static const slot_t all_slot_set = 0xffffffffffffffff; // every bit set: all entries of the slot free
+static const slot_t all_slot_clear = 0; // no bit set: no entry of the slot free
+
+inline size_t find_next_set_bit(slot_t slot_val, size_t start_pos)
+{
+  // Returns the index of the lowest set bit at or above start_pos, or
+  // bits_per_slot when there is none. Callers probe "one past" the previous
+  // hit, so start_pos == bits_per_slot is legal input: it is returned
+  // unchanged instead of being used as an (undefined) full-width shift count.
+  if (start_pos >= bits_per_slot) {
+    return start_pos;
+  }
+
+  // drop the bits below start_pos, then count the zeros preceding the first
+  // set bit; <bit> is already in use in this header, so no compiler #ifdef
+  const slot_t rest = slot_val >> start_pos;
+  return rest ? start_pos + std::countr_zero(rest) : bits_per_slot;
+}
+
+
+class AllocatorLevel
+{ // Common base of the bitmap allocator levels: geometry hooks plus statistics counters shared by all levels.
+protected:
+
+  virtual uint64_t _children_per_slot() const = 0; // entries packed into one slot_t at this level
+  virtual uint64_t _level_granularity() const = 0; // space covered by one entry of this level
+
+public:
+  static uint64_t l0_dives; // bumped once per _allocate_l0() call
+  static uint64_t l0_iterations; // bumped per L0 slot visited in _allocate_l0()
+  static uint64_t l0_inner_iterations; // bumped per bit probed inside a partially free L0 slot
+  static uint64_t alloc_fragments; // bumped per extent produced by _allocate_l0()
+  static uint64_t alloc_fragments_fast; // bumped per completely free L1 slot taken by _allocate_l1()
+  static uint64_t l2_allocs;
+
+  virtual ~AllocatorLevel()
+  {}
+
+  virtual void collect_stats(
+    std::map<size_t, size_t>& bins_overall) = 0; // fills a histogram of free-run sizes, keyed by bin index
+
+};
+
+class AllocatorLevel01 : public AllocatorLevel
+{
+protected:
+  slot_vector_t l0; // one bit per L0 entry; a set bit marks the entry as free
+  slot_vector_t l1; // summary of l0; an all-clear slot means nothing is free below it
+  uint64_t l0_granularity = 0; // space covered by one L0 entry
+  uint64_t l1_granularity = 0; // space covered by one L1 entry
+
+  size_t partial_l1_count = 0;
+  size_t unalloc_l1_count = 0;
+
+  double get_fragmentation() const {
+    // share of partial entries among the L1 entries counted as partial or unallocated
+    const size_t with_free_space = unalloc_l1_count + partial_l1_count;
+    if (with_free_space == 0) {
+      return 0.0;
+    }
+    return double(partial_l1_count) / double(with_free_space);
+  }
+
+  uint64_t _level_granularity() const override
+  {
+    return l1_granularity;
+  }
+
+  inline bool _is_slot_fully_allocated(uint64_t idx) const {
+    return all_slot_clear == l1[idx];
+  }
+public:
+  inline uint64_t get_min_alloc_size() const
+  {
+    return l0_granularity;
+  }
+
+};
+
+template <class T>
+class AllocatorLevel02;
+
+class AllocatorLevel01Loose : public AllocatorLevel01
+{
+  enum {
+    L1_ENTRY_WIDTH = 2, // bits per L1 entry
+    L1_ENTRY_MASK = (1 << L1_ENTRY_WIDTH) - 1,
+    L1_ENTRY_FULL = 0x00, // no free L0 entry below
+    L1_ENTRY_PARTIAL = 0x01, // some free L0 entries below
+    L1_ENTRY_NOT_USED = 0x02, // used as a "not decided yet" sentinel while L1 entries are recomputed
+    L1_ENTRY_FREE = 0x03, // every L0 entry below is free
+    L1_ENTRIES_PER_SLOT = bits_per_slot / L1_ENTRY_WIDTH, //32
+    L0_ENTRIES_PER_SLOT = bits_per_slot, // 64
+  };
+  uint64_t _children_per_slot() const override
+  { // number of 2-bit L1 entries held by one slot_t
+    return L1_ENTRIES_PER_SLOT;
+  }
+
+  interval_t _get_longest_from_l0(uint64_t pos0, uint64_t pos1,
+    uint64_t min_length, interval_t* tail) const;
+
+  inline void _fragment_and_emplace(uint64_t max_length, uint64_t offset,
+    uint64_t len,
+    interval_vector_t* res)
+  { // Appends [offset, offset + len) to *res: merges into the last extent if adjacent, splits by max_length if non-zero.
+    auto it = res->rbegin(); // last extent emitted so far, if any
+    if (max_length) {
+      if (it != res->rend() && it->offset + it->length == offset) {
+	auto l = max_length - it->length; // room left in the last extent
+	if (l >= len) {
+	  it->length += len;
+	  return;
+	} else { // fill the last extent up, the remainder goes into new extents
+	  offset += l;
+	  len -= l;
+	  it->length += l;
+	}
+      }
+
+      while (len > max_length) { // full-size pieces
+	res->emplace_back(offset, max_length);
+	offset += max_length;
+	len -= max_length;
+      }
+      res->emplace_back(offset, len);
+      return;
+    }
+
+    if (it != res->rend() && it->offset + it->length == offset) { // no length cap: simply extend or append
+      it->length += len;
+    } else {
+      res->emplace_back(offset, len);
+    }
+  }
+
+  bool _allocate_l0(uint64_t length,
+    uint64_t max_length,
+    uint64_t l0_pos0, uint64_t l0_pos1,
+    uint64_t* allocated, // in/out: running total; allocation stops once it reaches length
+    interval_vector_t* res) // out: allocated extents are appended here
+  { // Takes free L0 entries from [l0_pos0, l0_pos1) in ascending order; returns _is_empty_l0() of that range afterwards.
+    uint64_t d0 = L0_ENTRIES_PER_SLOT;
+
+    ++l0_dives;
+
+    ceph_assert(l0_pos0 < l0_pos1);
+    ceph_assert(length > *allocated);
+    ceph_assert(0 == (l0_pos0 % (slots_per_slotset * d0)));
+    ceph_assert(0 == (l0_pos1 % (slots_per_slotset * d0)));
+    ceph_assert(((length - *allocated) % l0_granularity) == 0);
+
+    uint64_t need_entries = (length - *allocated) / l0_granularity; // L0 entries still to be taken
+
+    for (auto idx = l0_pos0 / d0; (idx < l0_pos1 / d0) && (length > *allocated);
+      ++idx) {
+      ++l0_iterations;
+      slot_t& slot_val = l0[idx];
+      auto base = idx * d0; // L0 position of bit 0 of this slot
+      if (slot_val == all_slot_clear) { // nothing free in this slot
+        continue;
+      } else if (slot_val == all_slot_set) { // whole slot free: take a prefix of it, or all of it
+        uint64_t to_alloc = std::min(need_entries, d0);
+        *allocated += to_alloc * l0_granularity;
+	++alloc_fragments;
+        need_entries -= to_alloc;
+
+	_fragment_and_emplace(max_length, base * l0_granularity,
+          to_alloc * l0_granularity, res);
+
+        if (to_alloc == d0) {
+          slot_val = all_slot_clear;
+        } else {
+          _mark_alloc_l0(base, base + to_alloc);
+        }
+        continue;
+      }
+
+      auto free_pos = find_next_set_bit(slot_val, 0); // start of the free run being examined
+      ceph_assert(free_pos < bits_per_slot);
+      auto next_pos = free_pos + 1;
+      while (next_pos < bits_per_slot &&
+        (next_pos - free_pos) < need_entries) {
+	++l0_inner_iterations;
+
+        if (0 == (slot_val & (slot_t(1) << next_pos))) { // run [free_pos, next_pos) ended: take all of it
+          auto to_alloc = (next_pos - free_pos);
+          *allocated += to_alloc * l0_granularity;
+	  ++alloc_fragments;
+          need_entries -= to_alloc;
+	  _fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+	    to_alloc * l0_granularity, res);
+          _mark_alloc_l0(base + free_pos, base + next_pos);
+          free_pos = find_next_set_bit(slot_val, next_pos + 1); // may be called with start position == bits_per_slot
+          next_pos = free_pos + 1;
+        } else {
+          ++next_pos;
+        }
+      }
+      if (need_entries && free_pos < bits_per_slot) { // loop ended inside a run: take what is still needed from it
+        auto to_alloc = std::min(need_entries, d0 - free_pos);
+        *allocated += to_alloc * l0_granularity;
+	++alloc_fragments;
+	need_entries -= to_alloc;
+	_fragment_and_emplace(max_length, (base + free_pos) * l0_granularity,
+	  to_alloc * l0_granularity, res);
+        _mark_alloc_l0(base + free_pos, base + free_pos + to_alloc);
+      }
+    }
+    return _is_empty_l0(l0_pos0, l0_pos1);
+  }
+
+protected:
+
+  friend class AllocatorLevel02<AllocatorLevel01Loose>;
+
+  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+  { // Sizes l0/l1 for `capacity` with _alloc_unit per L0 entry; everything starts free or allocated per mark_as_free.
+    l0_granularity = _alloc_unit;
+    // 512 bits at L0 mapped to L1 entry
+    l1_granularity = l0_granularity * bits_per_slotset;
+
+    // capacity to have slot alignment at l1
+    auto aligned_capacity =
+      p2roundup((int64_t)capacity,
+        int64_t(l1_granularity * slots_per_slotset * _children_per_slot()));
+    size_t slot_count =
+      aligned_capacity / l1_granularity / _children_per_slot();
+    // we use set bit(s) as a marker for (partially) free entry
+    l1.resize(slot_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+    // l0 slot count
+    size_t slot_count_l0 = aligned_capacity / _alloc_unit / bits_per_slot;
+    // we use set bit(s) as a marker for (partially) free entry
+    l0.resize(slot_count_l0, mark_as_free ? all_slot_set : all_slot_clear);
+
+    partial_l1_count = unalloc_l1_count = 0;
+    if (mark_as_free) {
+      unalloc_l1_count = slot_count * _children_per_slot();
+      auto l0_pos_no_use = p2roundup((int64_t)capacity, (int64_t)l0_granularity) / l0_granularity; // first L0 entry past the real capacity
+      _mark_alloc_l1_l0(l0_pos_no_use, aligned_capacity / l0_granularity); // the alignment padding must never be handed out
+    }
+  }
+
+  struct search_ctx_t
+  { // Result of _analyze_partials(), consumed by _allocate_l1_contiguous().
+    size_t partial_count = 0;
+    size_t free_count = 0; // number of free L1 entries found starting at free_l1_pos
+    uint64_t free_l1_pos = 0; // L1 position of those free entries
+
+    uint64_t min_affordable_len = 0; // fallback extent, used when no full-length match exists
+    uint64_t min_affordable_offs = 0;
+    uint64_t affordable_len = 0; // extent able to hold the requested length, 0 if none
+    uint64_t affordable_offs = 0;
+
+    bool fully_processed = false; // asserted by callers that need the whole range analyzed
+
+    void reset()
+    {
+      *this = search_ctx_t();
+    }
+  };
+  enum { // values for the `mode` argument of _analyze_partials()
+    NO_STOP, // callers assert fully_processed after using this mode
+    STOP_ON_EMPTY, // per the caller's comment: may terminate early on a free entry
+    STOP_ON_PARTIAL, // presumably may terminate early on a partial entry; confirm in _analyze_partials
+  };
+  void _analyze_partials(uint64_t pos_start, uint64_t pos_end,
+    uint64_t length, uint64_t min_length, int mode,
+    search_ctx_t* ctx);
+
+  void _mark_l1_on_l0(int64_t l0_pos, int64_t l0_pos_end);
+  void _mark_alloc_l0(int64_t l0_pos_start, int64_t l0_pos_end);
+  uint64_t _claim_free_to_left_l0(int64_t l0_pos_start);
+  uint64_t _claim_free_to_right_l0(int64_t l0_pos_start);
+
+
+  void _mark_alloc_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+  { // Allocates [l0_pos_start, l0_pos_end) at L0, then refreshes every L1 entry the range touches.
+    const int64_t slotset_bits = int64_t(bits_per_slotset);
+    _mark_alloc_l0(l0_pos_start, l0_pos_end);
+    _mark_l1_on_l0(p2align(l0_pos_start, slotset_bits),
+                   p2roundup(l0_pos_end, slotset_bits));
+  }
+
+  void _mark_free_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+  { // Marks L0 entries [l0_pos_start, l0_pos_end) as free, i.e. sets their bits in l0.
+    auto d0 = L0_ENTRIES_PER_SLOT;
+
+    auto pos = l0_pos_start;
+    slot_t bits = (slot_t)1 << (l0_pos_start % d0);
+    slot_t* val_s = &l0[pos / d0];
+    int64_t pos_e = std::min(l0_pos_end,
+                             p2roundup<int64_t>(l0_pos_start + 1, d0));
+    while (pos < pos_e) { // head: set bit by bit inside the first slot
+      *val_s |=  bits;
+      bits <<= 1;
+      pos++;
+    }
+    pos_e = std::min(l0_pos_end, p2align<int64_t>(l0_pos_end, d0));
+    while (pos < pos_e) { // middle: set whole slots at once
+      *(++val_s) = all_slot_set;
+      pos += d0;
+    }
+    bits = 1;
+    ++val_s; // only dereferenced below if a tail remains
+    while (pos < l0_pos_end) { // tail: set the leading bits of the last slot
+      *val_s |= bits;
+      bits <<= 1;
+      pos++;
+    }
+  }
+
+  void _mark_free_l1_l0(int64_t l0_pos_start, int64_t l0_pos_end)
+  { // Frees [l0_pos_start, l0_pos_end) at L0, then refreshes every L1 entry the range touches.
+    const int64_t slotset_bits = int64_t(bits_per_slotset);
+    _mark_free_l0(l0_pos_start, l0_pos_end);
+    _mark_l1_on_l0(p2align(l0_pos_start, slotset_bits),
+                   p2roundup(l0_pos_end, slotset_bits));
+  }
+
+  bool _is_empty_l0(uint64_t l0_pos, uint64_t l0_pos_end)
+  {
+    // True when no L0 entry in [l0_pos, l0_pos_end) is free; both bounds must be slotset aligned.
+    const uint64_t slotset_entries = slots_per_slotset * L0_ENTRIES_PER_SLOT;
+    ceph_assert(0 == (l0_pos % slotset_entries));
+    ceph_assert(0 == (l0_pos_end % slotset_entries));
+
+    const uint64_t last = l0_pos_end / L0_ENTRIES_PER_SLOT;
+    for (uint64_t s = l0_pos / L0_ENTRIES_PER_SLOT; s < last; ++s) {
+      if (l0[s] != all_slot_clear) {
+        return false;
+      }
+    }
+    return true;
+  }
+  bool _is_empty_l1(uint64_t l1_pos, uint64_t l1_pos_end)
+  {
+    // True when every L1 slot covering [l1_pos, l1_pos_end) is fully allocated; bounds must be slotset aligned.
+    const uint64_t slotset_entries = slots_per_slotset * _children_per_slot();
+    ceph_assert(0 == (l1_pos % slotset_entries));
+    ceph_assert(0 == (l1_pos_end % slotset_entries));
+
+    const uint64_t last = l1_pos_end / L1_ENTRIES_PER_SLOT;
+    for (uint64_t s = l1_pos / L1_ENTRIES_PER_SLOT; s < last; ++s) {
+      if (!_is_slot_fully_allocated(s)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  interval_t _allocate_l1_contiguous(uint64_t length,
+    uint64_t min_length, uint64_t max_length,
+    uint64_t pos_start, uint64_t pos_end);
+
+  bool _allocate_l1(uint64_t length,
+    uint64_t min_length, uint64_t max_length,
+    uint64_t l1_pos_start, uint64_t l1_pos_end,
+    uint64_t* allocated,
+    interval_vector_t* res);
+
+  uint64_t _mark_alloc_l1(uint64_t offset, uint64_t length)
+  { // Allocates every L0 entry overlapping [offset, offset + length); returns the amount of space those entries cover.
+    const uint64_t first = offset / l0_granularity;
+    const uint64_t past_last = p2roundup(offset + length, l0_granularity) / l0_granularity;
+    _mark_alloc_l1_l0(first, past_last);
+    return l0_granularity * (past_last - first);
+  }
+
+  uint64_t _free_l1(uint64_t offs, uint64_t len)
+  { // Frees every L0 entry overlapping [offs, offs + len); returns the amount of space those entries cover.
+    const uint64_t first = offs / l0_granularity;
+    const uint64_t past_last = p2roundup(offs + len, l0_granularity) / l0_granularity;
+    _mark_free_l1_l0(first, past_last);
+    return l0_granularity * (past_last - first);
+  }
+
+  uint64_t claim_free_to_left_l1(uint64_t offs)
+  { // Claims the free run that ends right before offs; returns the amount of space claimed (0 if none).
+    uint64_t l0_pos_end = offs / l0_granularity;
+    uint64_t l0_pos_start = _claim_free_to_left_l0(l0_pos_end);
+    if (l0_pos_start < l0_pos_end) { // something was claimed: refresh the L1 entries over it
+      _mark_l1_on_l0(
+        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+      return l0_granularity * (l0_pos_end - l0_pos_start);
+    }
+    return 0;
+  }
+
+  uint64_t claim_free_to_right_l1(uint64_t offs)
+  { // Claims the free run that starts at offs; returns the amount of space claimed (0 if none).
+    uint64_t l0_pos_start = offs / l0_granularity;
+    uint64_t l0_pos_end = _claim_free_to_right_l0(l0_pos_start);
+
+    if (l0_pos_start < l0_pos_end) { // something was claimed: refresh the L1 entries over it
+      _mark_l1_on_l0(
+        p2align(l0_pos_start, uint64_t(bits_per_slotset)),
+        p2roundup(l0_pos_end, uint64_t(bits_per_slotset)));
+      return l0_granularity * (l0_pos_end - l0_pos_start);
+    }
+    return 0;
+  }
+
+
+public:
+  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+  {
+    // Allocated space within L1 entries [pos0, pos1); pos1 == 0 selects the end of l1.
+    const uint64_t l1_end = pos1 ? pos1 : l1.size() * L1_ENTRIES_PER_SLOT;
+    // whatever is not reported free in the range counts as allocated
+    const uint64_t free_space = debug_get_free(pos0, l1_end);
+    return (l1_end - pos0) * l1_granularity - free_space;
+  }
+
+  uint64_t debug_get_free(uint64_t l1_pos0 = 0, uint64_t l1_pos1 = 0)
+  { // Free space found by counting set bits of l0 below L1 entries [l1_pos0, l1_pos1); l1_pos1 == 0 means "to the end".
+    ceph_assert(0 == (l1_pos0 % L1_ENTRIES_PER_SLOT));
+    ceph_assert(0 == (l1_pos1 % L1_ENTRIES_PER_SLOT));
+
+    auto idx0 = l1_pos0 * slots_per_slotset; // each L1 entry covers slots_per_slotset L0 slots
+    auto idx1 = l1_pos1 * slots_per_slotset;
+
+    if (idx1 == 0) {
+      idx1 = l0.size();
+    }
+
+    uint64_t res = 0; // number of free L0 entries
+    for (uint64_t i = idx0; i < idx1; ++i) {
+      auto v = l0[i];
+      if (v == all_slot_set) {
+        res += L0_ENTRIES_PER_SLOT;
+      } else if (v != all_slot_clear) {
+        size_t cnt = 0;
+#ifdef __GNUC__
+        cnt = __builtin_popcountll(v);
+#else
+        // Kernighan's Alg to count set bits
+        while (v) {
+          v &= (v - 1);
+          cnt++;
+        }
+#endif
+        res += cnt;
+      }
+    }
+    return res * l0_granularity;
+  }
+  void collect_stats(
+    std::map<size_t, size_t>& bins_overall) override;
+
+  static inline ssize_t count_0s(slot_t slot_val, size_t start_pos);
+  static inline ssize_t count_1s(slot_t slot_val, size_t start_pos);
+  void foreach_internal(std::function<void(uint64_t offset, uint64_t length)> notify);
+};
+
+
+class AllocatorLevel01Compact : public AllocatorLevel01
+{ // Variant of the L0/L1 level with 8 children per slot; it defines no allocation logic here.
+  uint64_t _children_per_slot() const override
+  {
+    return 8;
+  }
+public:
+  void collect_stats(
+    std::map<size_t, size_t>& bins_overall) override
+  { // leaves bins_overall untouched
+    // not implemented
+  }
+};
+
+template <class L1>
+class AllocatorLevel02 : public AllocatorLevel
+{
+public:
+  uint64_t debug_get_free(uint64_t pos0 = 0, uint64_t pos1 = 0)
+  { // Locked wrapper: scales both positions to L1 positions and asks l1 for the free space between them.
+    std::lock_guard l(lock);
+    const uint64_t scale = l1._children_per_slot() * bits_per_slot;
+    return l1.debug_get_free(pos0 * scale, pos1 * scale);
+  }
+  uint64_t debug_get_allocated(uint64_t pos0 = 0, uint64_t pos1 = 0)
+  { // Locked wrapper: scales both positions to L1 positions and asks l1 for the allocated space between them.
+    std::lock_guard l(lock);
+    const uint64_t scale = l1._children_per_slot() * bits_per_slot;
+    return l1.debug_get_allocated(pos0 * scale, pos1 * scale);
+  }
+
+  uint64_t get_available()
+  { // current value of `available`, read under the lock
+    std::lock_guard l(lock);
+    return available;
+  }
+  inline uint64_t get_min_alloc_size() const
+  { // forwarded to l1 without taking the lock
+    return l1.get_min_alloc_size();
+  }
+  void collect_stats(
+    std::map<size_t, size_t>& bins_overall) override {
+
+      std::lock_guard l(lock); // l1 is walked under the lock
+      l1.collect_stats(bins_overall);
+  }
+  uint64_t claim_free_to_left(uint64_t offset) { // claims the free run ending right before offset; returns its size
+    std::lock_guard l(lock);
+    auto allocated = l1.claim_free_to_left_l1(offset);
+    ceph_assert(available >= allocated);
+    available -= allocated;
+
+    uint64_t l2_pos = (offset - allocated) / l2_granularity; // L2 entry holding the start of the claimed run
+    uint64_t l2_pos_end =
+      p2roundup(int64_t(offset), int64_t(l2_granularity)) / l2_granularity;
+    _mark_l2_on_l1(l2_pos, l2_pos_end); // re-derive the affected L2 bits from L1
+    return allocated;
+  }
+
+  uint64_t claim_free_to_right(uint64_t offset) { // claims the free run starting at offset; returns its size
+    std::lock_guard l(lock);
+    auto allocated = l1.claim_free_to_right_l1(offset);
+    ceph_assert(available >= allocated);
+    available -= allocated;
+
+    uint64_t l2_pos = (offset) / l2_granularity;
+    int64_t end = offset + allocated; // end of the claimed run
+    uint64_t l2_pos_end = p2roundup(end, int64_t(l2_granularity)) / l2_granularity;
+    _mark_l2_on_l1(l2_pos, l2_pos_end); // re-derive the affected L2 bits from L1
+    return allocated;
+  }
+
+  void foreach_internal(
+    std::function<void(uint64_t offset, uint64_t length)> notify)
+  { // Reports every free extent to notify(), converting l1's entry units by multiplying with the min alloc size.
+    size_t alloc_size = get_min_alloc_size();
+    auto multiply_by_alloc_size = [alloc_size, notify](size_t off, size_t len) {
+      notify(off * alloc_size, len * alloc_size);
+    };
+    std::lock_guard l(lock); // notify() is invoked with the lock held
+    l1.foreach_internal(multiply_by_alloc_size);
+  }
+  double get_fragmentation_internal() { // l1.get_fragmentation() read under the lock
+    std::lock_guard l(lock);
+    return l1.get_fragmentation();
+  }
+
+protected:
+  ceph::mutex lock = ceph::make_mutex("AllocatorLevel02::lock"); // taken by the public methods above
+  L1 l1; // the lower level(s) this L2 bitmap summarizes
+  slot_vector_t l2; // one bit per L2 entry; a set bit marks a (partially) free entry
+  uint64_t l2_granularity = 0; // space per entry
+  uint64_t available = 0; // free space counter; reduced by the claim_free_to_* methods
+  uint64_t last_pos = 0;
+
+  enum {
+    L1_ENTRIES_PER_SLOT = bits_per_slot, // 64
+  };
+
+  uint64_t _children_per_slot() const override
+  { // one bit per entry, so 64 entries per slot_t
+    return L1_ENTRIES_PER_SLOT;
+  }
+  uint64_t _level_granularity() const override
+  { // space covered by one L2 entry, as computed in _init()
+    return l2_granularity;
+  }
+
+  void _init(uint64_t capacity, uint64_t _alloc_unit, bool mark_as_free = true)
+  { // Initializes l1 and sizes l2 for `capacity`; _alloc_unit must be a power of two.
+    ceph_assert(std::has_single_bit(_alloc_unit));
+    l1._init(capacity, _alloc_unit, mark_as_free);
+
+    l2_granularity =
+      l1._level_granularity() * l1._children_per_slot() * slots_per_slotset; // one L2 entry covers one slotset of L1
+
+    // capacity to have slot alignment at l2
+    auto aligned_capacity =
+      p2roundup((int64_t)capacity, (int64_t)l2_granularity * L1_ENTRIES_PER_SLOT);
+    size_t elem_count = aligned_capacity / l2_granularity / L1_ENTRIES_PER_SLOT;
+    // we use set bit(s) as a marker for (partially) free entry
+    l2.resize(elem_count, mark_as_free ? all_slot_set : all_slot_clear);
+
+    if (mark_as_free) {
+      // capacity to have slotset alignment at l1
+      auto l2_pos_no_use =
+	p2roundup((int64_t)capacity, (int64_t)l2_granularity) / l2_granularity;
+      _mark_l2_allocated(l2_pos_no_use, aligned_capacity / l2_granularity); // L2 entries lying entirely past the capacity
+      available = p2align(capacity, _alloc_unit); // only whole allocation units count as available
+    } else {
+      available = 0;
+    }
+  }
+
+  void _mark_l2_allocated(int64_t l2_pos, int64_t l2_pos_end)
+  { // Clears the l2 bits of entries [l2_pos, l2_pos_end), i.e. marks them as having nothing free.
+    const int64_t per_slot = L1_ENTRIES_PER_SLOT;
+    ceph_assert(0 <= l2_pos_end);
+    ceph_assert((int64_t)l2.size() >= (l2_pos_end / per_slot));
+
+    for (int64_t p = l2_pos; p < l2_pos_end; ++p) {
+      const slot_t entry_bit = slot_t(1) << (p % per_slot);
+      l2[p / per_slot] &= ~entry_bit;
+    }
+  }
+
+  // Sets the L2 bit of every entry in [l2_pos, l2_pos_end); a set bit marks
+  // the entry as (partially) free.
+  void _mark_l2_free(int64_t l2_pos, int64_t l2_pos_end)
+  {
+    auto d = L1_ENTRIES_PER_SLOT;
+    ceph_assert(0 <= l2_pos_end);
+    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+    for (; l2_pos < l2_pos_end; ++l2_pos) {
+      const slot_t entry_bit = slot_t(1) << (l2_pos % d);
+      l2[l2_pos / d] |= entry_bit;
+    }
+  }
+
+  // Recomputes the L2 bits for entries [l2_pos, l2_pos_end) from the current
+  // L1 state: an entry's bit is cleared only when every L1 slot of its
+  // slotset is fully allocated, otherwise the bit is set.
+  void _mark_l2_on_l1(int64_t l2_pos, int64_t l2_pos_end)
+  {
+    auto d = L1_ENTRIES_PER_SLOT;
+    ceph_assert(0 <= l2_pos_end);
+    ceph_assert((int64_t)l2.size() >= (l2_pos_end / d));
+
+    // idx walks L1 slots; each L2 entry owns slots_per_slotset of them
+    auto idx = l2_pos * slots_per_slotset;
+    auto idx_end = l2_pos_end * slots_per_slotset;
+    bool all_allocated = true;
+    while (idx < idx_end) {
+      if (!l1._is_slot_fully_allocated(idx)) {
+        // one non-full slot decides the entry: jump to the next slotset
+        all_allocated = false;
+        idx = p2roundup(int64_t(++idx), int64_t(slots_per_slotset));
+      }
+      else {
+        ++idx;
+      }
+      // a slotset boundary was reached: publish the verdict for this entry
+      if ((idx % slots_per_slotset) == 0) {
+        if (all_allocated) {
+          l2[l2_pos / d] &= ~(slot_t(1) << (l2_pos % d));
+        }
+        else {
+          l2[l2_pos / d] |= (slot_t(1) << (l2_pos % d));
+        }
+        // start over for the next entry
+        all_allocated = true;
+        ++l2_pos;
+      }
+    }
+  }
+
+  // Tries to allocate up to `length` by walking the L2 bitmap and delegating
+  // to l1._allocate_l1() for every entry flagged as (partially) free.
+  //
+  // length     - requested amount; asserted to be a multiple of min_length
+  // min_length - asserted to be <= l2_granularity
+  // max_length - 0 or anything >= 2^31 is replaced by 2^31; otherwise
+  //              asserted to be a multiple of min_length
+  // hint       - when non-zero, moves the scan start (last_pos) to the slot
+  //              containing the hint, or to 0 if that is past the bitmap
+  // allocated  - in/out running total; the delta produced here is subtracted
+  //              from `available`
+  // res        - passed through to l1._allocate_l1()
+  //
+  // Returns without doing anything when `available` is below min_length.
+  void _allocate_l2(uint64_t length,
+    uint64_t min_length,
+    uint64_t max_length,
+    uint64_t hint,
+    
+    uint64_t* allocated,
+    interval_vector_t* res)
+  {
+    uint64_t prev_allocated = *allocated;
+    uint64_t d = L1_ENTRIES_PER_SLOT;
+    ceph_assert(min_length <= l2_granularity);
+    ceph_assert(max_length == 0 || max_length >= min_length);
+    ceph_assert(max_length == 0 || (max_length % min_length) == 0);
+    ceph_assert(length >= min_length);
+    ceph_assert((length % min_length) == 0);
+
+    // upper bound applied to max_length (also used when it is 0)
+    uint64_t cap = 1ull << 31;
+    if (max_length == 0 || max_length >= cap) {
+      max_length = cap;
+    }
+
+    // multiplier converting an L2 entry index into the L1 position range
+    // handed to l1._allocate_l1()
+    uint64_t l1_w = slots_per_slotset * l1._children_per_slot();
+
+    std::lock_guard l(lock);
+
+    if (available < min_length) {
+      return;
+    }
+    if (hint != 0) {
+      last_pos = (hint / (d * l2_granularity)) < l2.size() ? p2align(hint / l2_granularity, d) : 0;
+    }
+    auto l2_pos = last_pos;
+    auto last_pos0 = last_pos;
+    auto pos = last_pos / d;
+    auto pos_end = l2.size();
+    // outer loop below is intended to optimize the performance by
+    // avoiding 'modulo' operations inside the internal loop.
+    // Looks like they have negative impact on the performance
+    // Pass 0 scans slots [last_pos / d, l2.size()), pass 1 wraps around and
+    // scans [0, last_pos0 / d).
+    for (auto i = 0; i < 2; ++i) {
+      for(; length > *allocated && pos < pos_end; ++pos) {
+	slot_t& slot_val = l2[pos];
+	size_t free_pos = 0;
+	bool all_set = false;
+	if (slot_val == all_slot_clear) {
+	  // nothing free under this slot, skip all of its entries
+	  l2_pos += d;
+	  last_pos = l2_pos;
+	  continue;
+	} else if (slot_val == all_slot_set) {
+	  free_pos = 0;
+	  all_set = true;
+	} else {
+	  free_pos = find_next_set_bit(slot_val, 0);
+	  ceph_assert(free_pos < bits_per_slot);
+	}
+	// visit the free entries of this slot one by one
+	do {
+	  ceph_assert(length > *allocated);
+	  bool empty = l1._allocate_l1(length,
+	    min_length,
+	    max_length,
+	    (l2_pos + free_pos) * l1_w,
+	    (l2_pos + free_pos + 1) * l1_w,
+	    allocated,
+	    res);
+	  if (empty) {
+	    // L1 reported the range as exhausted: drop the entry's free bit
+	    slot_val &= ~(slot_t(1) << free_pos);
+	  }
+	  if (length <= *allocated || slot_val == all_slot_clear) {
+	    break;
+	  }
+	  ++free_pos;
+	  // when the slot started fully set every following bit is a
+	  // candidate, so the bit search is skipped
+	  if (!all_set) {
+	    free_pos = find_next_set_bit(slot_val, free_pos);
+	  }
+	} while (free_pos < bits_per_slot);
+	last_pos = l2_pos;
+	l2_pos += d;
+      }
+      // set up the wrap-around pass
+      l2_pos = 0;
+      pos = 0;
+      pos_end = last_pos0 / d;
+    }
+
+    ++l2_allocs;
+    auto allocated_here = *allocated - prev_allocated;
+    ceph_assert(available >= allocated_here);
+    available -= allocated_here;
+  }
+
+#ifndef NON_CEPH_BUILD
+  // to provide compatibility with BlueStore's allocator interface
+  // Releases every (first, second) = (offset, length) pair of the interval
+  // set at L1, flags the covering L2 entries as free and adds the amount
+  // L1 reports as released to `available`.
+  void _free_l2(const interval_set<uint64_t> & rr)
+  {
+    std::lock_guard guard(lock);
+    uint64_t total_released = 0;
+    for (auto extent : rr) {
+      total_released += l1._free_l1(extent.first, extent.second);
+
+      // L2 entry range covering the extent (end rounded up to a whole entry)
+      const uint64_t first_entry = extent.first / l2_granularity;
+      const uint64_t end_entry =
+        p2roundup(int64_t(extent.first + extent.second), int64_t(l2_granularity)) / l2_granularity;
+      _mark_l2_free(first_entry, end_entry);
+    }
+    available += total_released;
+  }
+#endif
+
+  // Generic variant: T is any range whose elements expose `offset` and
+  // `length`. Each element is released at L1, the covering L2 entries are
+  // flagged as free, and the released total is added to `available`.
+  template <typename T>
+  void _free_l2(const T& rr)
+  {
+    std::lock_guard guard(lock);
+    uint64_t total_released = 0;
+    for (auto extent : rr) {
+      total_released += l1._free_l1(extent.offset, extent.length);
+
+      // L2 entry range covering the extent (end rounded up to a whole entry)
+      const uint64_t first_entry = extent.offset / l2_granularity;
+      const uint64_t end_entry =
+        p2roundup(int64_t(extent.offset + extent.length), int64_t(l2_granularity)) / l2_granularity;
+      _mark_l2_free(first_entry, end_entry);
+    }
+    available += total_released;
+  }
+
+  // Marks [o, o + len) as allocated at L1, subtracts the amount returned by
+  // L1 from `available` and then refreshes the affected L2 bits from L1.
+  void _mark_allocated(uint64_t o, uint64_t len)
+  {
+    // L2 entry range covering the extent (end rounded up to a whole entry)
+    const uint64_t first_entry = o / l2_granularity;
+    const uint64_t end_entry =
+      p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+    std::lock_guard guard(lock);
+    auto allocated = l1._mark_alloc_l1(o, len);
+    ceph_assert(available >= allocated);
+    available -= allocated;
+    _mark_l2_on_l1(first_entry, end_entry);
+  }
+
+  // Releases [o, o + len) at L1, adds the amount returned by L1 to
+  // `available` and flags the covering L2 entries as free.
+  void _mark_free(uint64_t o, uint64_t len)
+  {
+    // L2 entry range covering the extent (end rounded up to a whole entry)
+    const uint64_t first_entry = o / l2_granularity;
+    const uint64_t end_entry =
+      p2roundup(int64_t(o + len), int64_t(l2_granularity)) / l2_granularity;
+
+    std::lock_guard guard(lock);
+    const auto released = l1._free_l1(o, len);
+    available += released;
+    _mark_l2_free(first_entry, end_entry);
+  }
+  // Rewinds the allocation scan position to the start of the L2 bitmap.
+  void _shutdown() {
+    last_pos = 0;
+  }
+};
+
+#endif
diff --git a/src/os/bluestore/simple_bitmap.cc b/src/os/bluestore/simple_bitmap.cc
new file mode 100644
index 000000000..fb12162b9
--- /dev/null
+++ b/src/os/bluestore/simple_bitmap.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "simple_bitmap.h"
+
+#include "include/ceph_assert.h"
+#include "bluestore_types.h"
+#include "common/debug.h"
+
+// Logging setup for this file: messages are emitted through the bitmap's
+// CephContext under the bluestore subsystem, prefixed with the calling
+// function name, the "::SBMAP::" tag and the object address.
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << __func__ << "::SBMAP::" << this << " "
+
+// {0, 0} extent returned by the get_next_*_extent() lookups when nothing
+// is found.
+static struct extent_t null_extent = {0, 0};
+
+//----------------------------------------------------------------------------
+//throw bad_alloc
+// Builds a bitmap able to hold num_bits bits; all bits start cleared.
+// The word array is allocated with new[] (may throw std::bad_alloc).
+SimpleBitmap::SimpleBitmap(CephContext *_cct, uint64_t num_bits)
+  : cct(_cct)
+{
+  // a trailing partial word is needed when num_bits is not word aligned
+  const bool has_partial_word = (num_bits & BITS_IN_WORD_MASK) != 0;
+
+  m_num_bits   = num_bits;
+  m_word_count = bits_to_words(num_bits) + (has_partial_word ? 1 : 0);
+  m_arr        = new uint64_t [m_word_count];
+  clear_all();
+}
+
+//----------------------------------------------------------------------------
+// Frees the word array allocated by the constructor.
+SimpleBitmap::~SimpleBitmap()
+{
+  delete[] m_arr;
+}
+
+//----------------------------------------------------------------------------
+// Sets the bits in [offset, offset + length).
+//
+// Returns true on success. A range that does not fit inside the bitmap is
+// logged and trips ceph_assert; false is returned only if the assert is
+// compiled out.
+bool SimpleBitmap::set(uint64_t offset, uint64_t length)
+{
+  dout(20) <<" [" << std::hex << offset << ", " << length << "]" << dendl;
+
+  // Range check written so that it cannot wrap: "offset + length" may
+  // overflow uint64_t for huge inputs and would then slip past a plain
+  // "offset + length > m_num_bits" comparison.
+  if (offset > m_num_bits || length > m_num_bits - offset) {
+    derr << __func__ << "::offset + length = " << offset + length << " exceeds map size = " << m_num_bits << dendl;
+    ceph_assert(offset <= m_num_bits && length <= m_num_bits - offset);
+    return false;
+  }
+
+  auto [word_index, first_bit_set] = split(offset);
+  // special case optimization
+  if (length == 1) {
+    uint64_t set_mask  = 1ULL << first_bit_set;
+    m_arr[word_index] |= set_mask;
+    return true;
+  }
+
+  // handle the first word which might be incomplete
+  if (first_bit_set != 0) {
+    // every bit from first_bit_set up to the MSB
+    uint64_t   set_mask      = FULL_MASK << first_bit_set;
+    // index of the first bit past the requested range, relative to this word
+    uint64_t   first_bit_clr = first_bit_set + length;
+    if (first_bit_clr <= BITS_IN_WORD) {
+      // the whole range lives in this single word
+      if (first_bit_clr < BITS_IN_WORD) {
+        // trim the mask so bits at and above first_bit_clr stay untouched
+        uint64_t clr_bits = BITS_IN_WORD - first_bit_clr;
+        uint64_t clr_mask = FULL_MASK >> clr_bits;
+        set_mask     &= clr_mask;
+      }
+      m_arr[word_index] |= set_mask;
+      return true;
+    } else {
+      // set all bits in this word starting from first_bit_set
+      m_arr[word_index] |= set_mask;
+      word_index ++;
+      length -= (BITS_IN_WORD - first_bit_set);
+    }
+  }
+
+  // set a range of full words
+  uint64_t full_words_count = bits_to_words(length);
+  uint64_t end              = word_index + full_words_count;
+  for (; word_index < end; word_index++) {
+    m_arr[word_index] = FULL_MASK;
+  }
+  length -= words_to_bits(full_words_count);
+
+  // set bits in the last word
+  if (length) {
+    // low `length` bits only
+    uint64_t set_mask = ~(FULL_MASK << length);
+    m_arr[word_index] |= set_mask;
+  }
+
+  return true;
+}
+
+//----------------------------------------------------------------------------
+// Clears the bits in [offset, offset + length).
+//
+// Returns true on success. A range that does not fit inside the bitmap is
+// logged and trips ceph_assert; false is returned only if the assert is
+// compiled out.
+bool SimpleBitmap::clr(uint64_t offset, uint64_t length)
+{
+  // Range check written so that it cannot wrap: "offset + length" may
+  // overflow uint64_t for huge inputs and would then slip past a plain
+  // "offset + length > m_num_bits" comparison.
+  if (offset > m_num_bits || length > m_num_bits - offset) {
+    derr << __func__ << "::offset + length = " << offset + length << " exceeds map size = " << m_num_bits << dendl;
+    ceph_assert(offset <= m_num_bits && length <= m_num_bits - offset);
+    return false;
+  }
+
+  auto [word_index, first_bit_clr] = split(offset);
+  // special case optimization
+  if (length == 1) {
+    uint64_t set_mask   = 1ULL << first_bit_clr;
+    uint64_t clr_mask   = ~set_mask;
+    m_arr[word_index] &= clr_mask;
+
+    return true;
+  }
+
+  // handle the first word when we are unaligned on word boundaries
+  if (first_bit_clr != 0) {
+    // keeps only the bits below first_bit_clr
+    uint64_t clr_mask      = ~(FULL_MASK << first_bit_clr);
+    // index of the first bit past the requested range, relative to this word
+    uint64_t first_bit_set = first_bit_clr + length;
+    // special case - we only work on a single word
+    if (first_bit_set <= BITS_IN_WORD) {
+      if (first_bit_set < BITS_IN_WORD) {
+        // also keep the bits at and above first_bit_set
+        uint64_t set_mask = FULL_MASK << first_bit_set;
+        clr_mask         |= set_mask;
+      }
+      m_arr[word_index]     &= clr_mask;
+      return true;
+    }
+    else {
+      // clear all bits in this word starting from first_bit_clr
+      // and continue to the next word
+      m_arr[word_index] &= clr_mask;
+      word_index ++;
+      length -= (BITS_IN_WORD - first_bit_clr);
+    }
+  }
+
+
+  // clear a range of full words
+  uint64_t full_words_count = bits_to_words(length);
+  uint64_t end              = word_index + full_words_count;
+  for (; word_index < end; word_index++) {
+    m_arr[word_index] = 0;
+  }
+  length -= words_to_bits(full_words_count);
+
+  // clear bits in the last word
+  if (length) {
+    // keeps everything except the low `length` bits
+    uint64_t clr_mask = (FULL_MASK << length);
+    m_arr[word_index] &= clr_mask;
+  }
+
+  return true;
+}
+
+//----------------------------------------------------------------------------
+// Finds the first run of consecutive set bits located at or after @offset.
+//
+// Returns the run as {offset of its first bit, number of bits}. Returns
+// null_extent ({0, 0}) when @offset is outside the bitmap or when no set bit
+// is found.
+extent_t SimpleBitmap::get_next_set_extent(uint64_t offset)
+{
+  if (offset >= m_num_bits ) {
+    return null_extent;
+  }
+
+  auto [word_idx, bits_to_clear] = split(offset);
+  uint64_t word     = m_arr[word_idx];
+  // ignore the bits located before @offset in the first word
+  word &= (FULL_MASK << bits_to_clear);
+
+  // if there are no set bits in this word
+  if (word == 0) {
+      // skip past all clear words
+    while (++word_idx < m_word_count && !m_arr[word_idx]);
+
+    if (word_idx < m_word_count ) {
+      word = m_arr[word_idx];
+    } else {
+      // ran off the end without meeting a set bit
+      return null_extent;
+    }
+  }
+
+  // ffs is 1 based, must dec by one as we are zero based
+  int           ffs = __builtin_ffsll(word) - 1;
+  extent_t      ext;
+  ext.offset = words_to_bits(word_idx) + ffs;
+  if (ext.offset >= m_num_bits ) {
+    return null_extent;
+  }
+
+  // set all bits from current to LSB
+  // (so that the search for the first zero below starts after the run's
+  // first bit)
+  uint64_t      clr_mask = FULL_MASK << ffs;
+  uint64_t      set_mask = ~clr_mask;
+  word |= set_mask;
+
+  // the run reaches the end of this word: skip past fully set words
+  if (word == FULL_MASK) {
+    while ( (++word_idx < m_word_count) && (m_arr[word_idx] == FULL_MASK) );
+
+    if (word_idx < m_word_count) {
+      word = m_arr[word_idx];
+    } else {
+      // bitmap is set from ext.offset until the last bit
+      ext.length = (m_num_bits - ext.offset);
+      return ext;
+    }
+  }
+
+  ceph_assert(word != FULL_MASK);
+  // first zero bit terminates the run
+  int      ffz     = __builtin_ffsll(~word) - 1;
+  uint64_t zoffset = words_to_bits(word_idx) + ffz;
+  ext.length       = (zoffset - ext.offset);
+
+  return ext;
+}
+
+//----------------------------------------------------------------------------
+// Finds the first run of consecutive clear bits located at or after @offset.
+//
+// Returns the run as {offset of its first bit, number of bits}. Returns
+// null_extent ({0, 0}) when @offset is outside the bitmap or when no clear
+// bit is found below m_num_bits.
+extent_t SimpleBitmap::get_next_clr_extent(uint64_t offset)
+{
+  if (offset >= m_num_bits ) {
+    return null_extent;
+  }
+
+  uint64_t word_idx = offset_to_index(offset);
+  uint64_t word     = m_arr[word_idx];
+
+  // treat all bits located before offset as set so they are skipped
+  offset &= BITS_IN_WORD_MASK;
+  if (offset != 0) {
+    uint64_t bits_to_set = BITS_IN_WORD - offset;
+    uint64_t set_mask    = FULL_MASK >> bits_to_set;
+    word |= set_mask;
+  }
+  if (word == FULL_MASK) {
+    // skipped past fully set words
+    while ( (++word_idx < m_word_count) && (m_arr[word_idx] == FULL_MASK) );
+
+    if (word_idx < m_word_count) {
+      word = m_arr[word_idx];
+    } else {
+      dout(10) << "2)Reached the end of the bitmap" << dendl;
+      return null_extent;
+    }
+  }
+
+  // first zero bit starts the run
+  int      ffz = __builtin_ffsll(~word) - 1;
+  extent_t ext;
+  ext.offset = words_to_bits(word_idx) + ffz;
+  if (ext.offset >= m_num_bits ) {
+    return null_extent;
+  }
+
+  // clear all bits from current position to LSB
+  // (so that the search for the first set bit below starts after the run's
+  // first bit)
+  word &= (FULL_MASK << ffz);
+
+  // skip past all clear words
+  if (word == 0) {
+    while ( (++word_idx < m_word_count) && (m_arr[word_idx] == 0) );
+
+    if (word_idx < m_word_count) {
+      word = m_arr[word_idx];
+    } else {
+      // bitmap is clear from ext.offset until the last bit
+      ext.length = (m_num_bits - ext.offset);
+      return ext;
+    }
+  }
+
+  // ffs is 1 based, must dec by one as we are zero based
+  // the first set bit terminates the run
+  int           ffs     = __builtin_ffsll(word) - 1;
+  uint64_t      soffset = words_to_bits(word_idx) + ffs;
+  ext.length = (soffset - ext.offset);
+  return ext;
+}
diff --git a/src/os/bluestore/simple_bitmap.h b/src/os/bluestore/simple_bitmap.h
new file mode 100644
index 000000000..5d9d56021
--- /dev/null
+++ b/src/os/bluestore/simple_bitmap.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Author: Gabriel BenHanokh <gbenhano@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#pragma once
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <cstring>
+#include <cmath>
+#include <iomanip>
+
+#include "include/ceph_assert.h"
+
+// An (offset, length) pair describing a contiguous range.
+struct extent_t {
+  uint64_t offset;
+  uint64_t length;
+
+  // Extents compare equal when both fields match.
+  bool operator==(const extent_t& other) const {
+    if (offset != other.offset) {
+      return false;
+    }
+    return length == other.length;
+  }
+};
+
+// Fixed-size bitmap stored as an array of 64-bit words.
+// The object owns the array and is therefore non-copyable.
+class SimpleBitmap {
+public:
+  SimpleBitmap(CephContext *_cct, uint64_t num_bits);
+  ~SimpleBitmap();
+
+  SimpleBitmap(const SimpleBitmap&) = delete;
+  SimpleBitmap& operator=(const SimpleBitmap&) = delete;
+
+  // set a bit range of @length starting at @offset
+  bool     set(uint64_t offset, uint64_t length);
+  // clear a bit range of @length starting at @offset
+  bool     clr(uint64_t offset, uint64_t length);
+
+  // returns a copy of the next set extent starting at @offset
+  extent_t get_next_set_extent(uint64_t offset);
+
+  // returns a copy of the next clear extent starting at @offset
+  extent_t get_next_clr_extent(uint64_t offset);
+
+  //----------------------------------------------------------------------------
+  // number of bits the bitmap was created with
+  uint64_t get_size() {
+    return m_num_bits;
+  }
+
+  //----------------------------------------------------------------------------
+  // clears all bits in the bitmap
+  void clear_all() {
+    const uint64_t byte_count = words_to_bytes(m_word_count);
+    std::memset(m_arr, 0, byte_count);
+  }
+
+  //----------------------------------------------------------------------------
+  // sets all bits in the bitmap
+  void set_all() {
+    const uint64_t byte_count = words_to_bytes(m_word_count);
+    std::memset(m_arr, 0xFF, byte_count);
+
+    // clear bits in the last word past the last legal bit
+    const uint64_t tail_bits = (m_num_bits & BITS_IN_WORD_MASK);
+    if (tail_bits != 0) {
+      m_arr[m_word_count - 1] &= ~(FULL_MASK << tail_bits);
+    }
+  }
+
+  //----------------------------------------------------------------------------
+  // true when the bit at @offset is set; an out-of-range offset asserts
+  bool bit_is_set(uint64_t offset) {
+    if (offset >= m_num_bits) {
+      ceph_assert(offset < m_num_bits);
+      return false;
+    }
+    auto [word_index, bit_offset] = split(offset);
+    const uint64_t mask = 1ULL << bit_offset;
+    return (m_arr[word_index] & mask) != 0;
+  }
+
+  //----------------------------------------------------------------------------
+  // true when the bit at @offset is clear; an out-of-range offset asserts
+  bool bit_is_clr(uint64_t offset) {
+    if (offset >= m_num_bits) {
+      ceph_assert(offset < m_num_bits);
+      return false;
+    }
+    auto [word_index, bit_offset] = split(offset);
+    const uint64_t mask = 1ULL << bit_offset;
+    return (m_arr[word_index] & mask) == 0;
+  }
+
+private:
+  //----------------------------------------------------------------------------
+  // splits a bit offset into {word index, bit index inside that word}
+  static std::pair<uint64_t, uint64_t> split(uint64_t offset) {
+    const uint64_t word_index = offset_to_index(offset);
+    const uint64_t bit_index  = offset & BITS_IN_WORD_MASK;
+    return { word_index, bit_index };
+  }
+
+  //---------------------------------------------------------------------------
+  static uint64_t offset_to_index(uint64_t offset) {
+    return offset >> BITS_IN_WORD_SHIFT;
+  }
+
+  //---------------------------------------------------------------------------
+  static uint64_t index_to_offset(uint64_t index) {
+    return index << BITS_IN_WORD_SHIFT;
+  }
+
+  //---------------------------------------------------------------------------
+  // number of whole words in bit_count bits (rounds down)
+  static uint64_t bits_to_words(uint64_t bit_count) {
+    return bit_count >> BITS_IN_WORD_SHIFT;
+  }
+
+  //---------------------------------------------------------------------------
+  static uint64_t words_to_bits(uint64_t words_count) {
+    return words_count << BITS_IN_WORD_SHIFT;
+  }
+
+  //---------------------------------------------------------------------------
+  // number of whole words in byte_count bytes (rounds down)
+  static uint64_t bytes_to_words(uint64_t byte_count) {
+    return byte_count >> BYTES_IN_WORD_SHIFT;
+  }
+
+  //---------------------------------------------------------------------------
+  static uint64_t words_to_bytes(uint64_t words_count) {
+    return words_count << BYTES_IN_WORD_SHIFT;
+  }
+
+  constexpr static uint64_t      BYTES_IN_WORD       = sizeof(uint64_t);
+  constexpr static uint64_t      BYTES_IN_WORD_SHIFT = 3;
+  constexpr static uint64_t      BITS_IN_WORD        = (BYTES_IN_WORD * 8);
+  constexpr static uint64_t      BITS_IN_WORD_MASK   = (BITS_IN_WORD - 1);
+  constexpr static uint64_t      BITS_IN_WORD_SHIFT  = 6;
+  constexpr static uint64_t      FULL_MASK           = (~((uint64_t)0));
+
+  CephContext *cct;           // used by the logging macros in the .cc file
+  uint64_t    *m_arr;         // word array, m_word_count entries
+  uint64_t     m_num_bits;    // number of valid bits
+  uint64_t     m_word_count;  // number of words backing m_num_bits
+};
diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h
new file mode 100644
index 000000000..d8ca3a0c7
--- /dev/null
+++ b/src/os/bluestore/zoned_types.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_OS_BLUESTORE_ZONED_TYPES_H
+#define CEPH_OS_BLUESTORE_ZONED_TYPES_H
+
+#include "include/types.h"
+#include "kv/KeyValueDB.h"
+#include "os/kv.h"
+
+// Tracks two bits of information about the state of a zone: (1) number of dead
+// bytes in a zone and (2) the write pointer.  We use the existing
+// Int64ArrayMergeOperator for merge and avoid the cost of point queries.
+//
+// We use the same struct for an on-disk and in-memory representation of the
+// state.
+struct zone_state_t {
+  uint64_t num_dead_bytes = 0;  ///< dead bytes deallocated (behind the write pointer)
+  uint64_t write_pointer = 0;   ///< relative offset within the zone
+
+  /// Serializes the state: write_pointer first, then num_dead_bytes.
+  void encode(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    encode(write_pointer, bl);
+    encode(num_dead_bytes, bl);
+  }
+
+  /// Reads the fields back in the order encode() wrote them.
+  void decode(ceph::buffer::list::const_iterator &p) {
+    using ceph::decode;
+    decode(write_pointer, p);
+    decode(num_dead_bytes, p);
+  }
+
+  /// Returns both counters to zero.
+  void reset() {
+    num_dead_bytes = 0;
+    write_pointer = 0;
+  }
+
+  uint64_t get_num_dead_bytes() const { return num_dead_bytes; }
+
+  /// Bytes behind the write pointer that are not dead.
+  uint64_t get_num_live_bytes() const {
+    const uint64_t live = write_pointer - num_dead_bytes;
+    return live;
+  }
+
+  uint64_t get_write_pointer() const { return write_pointer; }
+
+  void increment_num_dead_bytes(uint64_t num_bytes) {
+    num_dead_bytes = num_dead_bytes + num_bytes;
+  }
+
+  void increment_write_pointer(uint64_t num_bytes) {
+    write_pointer = write_pointer + num_bytes;
+  }
+
+  /// Prints both counters in hex and leaves the stream in decimal mode.
+  friend std::ostream& operator<<(
+    std::ostream& out,
+    const zone_state_t& zone_state) {
+    out << std::hex;
+    out << " dead bytes: 0x" << zone_state.get_num_dead_bytes();
+    out << " write pointer: 0x" << zone_state.get_write_pointer();
+    out << " " << std::dec;
+    return out;
+  }
+};
+
+#endif
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
new file mode 100644
index 000000000..a7d085402
--- /dev/null
+++ b/src/os/fs/FS.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#ifdef __linux__
+#include <linux/falloc.h>
+#endif
+
+#include "FS.h"
+
+#include "acconfig.h"
+
+#ifdef HAVE_LIBXFS
+#include "XFS.h"
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/mount.h>
+#else
+#include <sys/vfs.h>
+#endif
+#include "include/compat.h"
+
+// ---------------
+
+// Factory: returns a heap-allocated FS implementation for the given statfs
+// f_type. XFS gets its own subclass when built with HAVE_LIBXFS; everything
+// else gets the generic FS.
+FS *FS::create(uint64_t f_type)
+{
+#ifdef HAVE_LIBXFS
+  if (f_type == XFS_SUPER_MAGIC) {
+    return new XFS;
+  }
+#endif
+  return new FS;
+}
+
+// Factory: picks the FS implementation matching the filesystem that backs
+// the open descriptor fd.
+//
+// If fstatfs() fails the filesystem type is unknown, so the generic FS is
+// returned instead of dispatching on an uninitialized f_type.
+FS *FS::create_by_fd(int fd)
+{
+  struct statfs st;
+  if (::fstatfs(fd, &st) < 0) {
+    return new FS;
+  }
+  return create(st.f_type);
+}
+
+// ---------------
+
+// The generic implementation applies no allocation hint and always
+// reports success.
+int FS::set_alloc_hint(int fd, uint64_t hint)
+{
+  // no-op
+  return 0;
+}
+
+#ifdef HAVE_NAME_TO_HANDLE_AT
+// Obtains the kernel file handle for fd via name_to_handle_at() and stores
+// it in *h as raw bytes: the struct file_handle header followed by
+// handle_bytes bytes of payload.
+// Returns 0 on success or -errno on failure.
+int FS::get_handle(int fd, std::string *h)
+{
+  char buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+  struct file_handle *fh = reinterpret_cast<struct file_handle *>(buf);
+  // tell the kernel how much payload space follows the header
+  fh->handle_bytes = MAX_HANDLE_SZ;
+
+  int mount_id;
+  if (name_to_handle_at(fd, "", fh, &mount_id, AT_EMPTY_PATH) < 0) {
+    return -errno;
+  }
+
+  h->assign(buf, fh->handle_bytes + sizeof(struct file_handle));
+  return 0;
+}
+
+// Opens the file described by the serialized handle h (as produced by
+// get_handle()) relative to mount_fd.
+//
+// Returns the new descriptor, -EINVAL if h is too short to contain a
+// struct file_handle header, -ERANGE if the header claims more payload than
+// h actually carries, or -errno from open_by_handle_at().
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+  if (h.length() < sizeof(struct file_handle)) {
+    return -EINVAL;
+  }
+  struct file_handle *fh = (struct file_handle *)h.data();
+  // handle_bytes counts only the payload that follows the header, so it has
+  // to fit in what is left of the string after the header (the subtraction
+  // cannot underflow thanks to the check above).
+  if (fh->handle_bytes > h.length() - sizeof(struct file_handle)) {
+    return -ERANGE;
+  }
+  int fd = open_by_handle_at(mount_fd, fh, flags);
+  if (fd < 0)
+    return -errno;
+  return fd;
+}
+
+#else // HAVE_NAME_TO_HANDLE_AT
+
+// Fallback used when HAVE_NAME_TO_HANDLE_AT is not defined: file handles
+// are unsupported.
+int FS::get_handle(int fd, std::string *h)
+{
+  const int unsupported = -EOPNOTSUPP;
+  return unsupported;
+}
+
+// Fallback used when HAVE_NAME_TO_HANDLE_AT is not defined: file handles
+// are unsupported.
+int FS::open_handle(int mount_fd, const std::string& h, int flags)
+{
+  const int unsupported = -EOPNOTSUPP;
+  return unsupported;
+}
+
+#endif // HAVE_NAME_TO_HANDLE_AT
+
+// Not implemented for the generic filesystem: calling it aborts.
+int FS::copy_file_range(int to_fd, uint64_t to_offset,
+			int from_fd,
+			uint64_t from_offset, uint64_t from_len)
+{
+  // placeholder until a real implementation exists
+  ceph_abort_msg("write me");
+}
+
+// Zeroes the byte range [offset, offset + length) of the file behind fd.
+//
+// Where available, the range is punched out with fallocate(); if that
+// reports -EOPNOTSUPP (or is not compiled in) the range is overwritten with
+// an explicit buffer of zeros instead.
+//
+// Returns the result of the last operation performed: 0 or a negative errno
+// from fallocate()/lseek64(), or whatever bufferlist::write_fd() returns.
+int FS::zero(int fd, uint64_t offset, uint64_t length)
+{
+  int r;
+
+  /*
+
+    From the fallocate(2) man page:
+
+       Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
+       in mode deallocates space (i.e., creates a  hole)  in  the  byte  range
+       starting  at offset and continuing for len bytes.  Within the specified
+       range, partial filesystem  blocks  are  zeroed,  and  whole  filesystem
+       blocks  are removed from the file.  After a successful call, subsequent
+       reads from this range will return zeroes.
+
+       The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE  in
+       mode;  in  other words, even when punching off the end of the file, the
+       file size (as reported by stat(2)) does not change.
+
+       Not all  filesystems  support  FALLOC_FL_PUNCH_HOLE;  if  a  filesystem
+       doesn't  support the operation, an error is returned.  The operation is
+       supported on at least the following filesystems:
+
+       *  XFS (since Linux 2.6.38)
+
+       *  ext4 (since Linux 3.0)
+
+       *  Btrfs (since Linux 3.7)
+
+       *  tmpfs (since Linux 3.5)
+
+   So: we only do this if PUNCH_HOLE *and* KEEP_SIZE are defined.
+
+  */
+#if !defined(__APPLE__) && !defined(__FreeBSD__)
+# ifdef CEPH_HAVE_FALLOCATE
+#  ifdef FALLOC_FL_KEEP_SIZE
+  // first try fallocate
+  r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
+  if (r < 0) {
+    r = -errno;
+  }
+  if (r != -EOPNOTSUPP) {
+    goto out;  // success, or a real error
+  }
+  // if that failed (-EOPNOTSUPP), fall back to writing zeros.
+#  endif
+# endif
+#endif
+
+  {
+    // fall back to writing zeros
+    ceph::bufferlist bl;
+    bl.append_zero(length);
+    // Keep the seek result at its full width: storing it in the int `r`
+    // would truncate offsets at or above 2 GiB and could make a successful
+    // seek look like a failure.
+    const auto pos = ::lseek64(fd, offset, SEEK_SET);
+    if (pos < 0) {
+      r = -errno;
+      goto out;
+    }
+    r = bl.write_fd(fd);
+  }
+
+ out:
+  return r;
+}
+
+// ---------------
+
diff --git a/src/os/fs/FS.h b/src/os/fs/FS.h
new file mode 100644
index 000000000..a1852f49f
--- /dev/null
+++ b/src/os/fs/FS.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_FS_H
+#define CEPH_OS_FS_H
+
+#include <errno.h>
+#include <time.h>
+
+#include <string>
+
+#include "include/types.h"
+#include "common/Cond.h"
+
+// Generic filesystem helper. Subclasses override the virtuals to provide
+// filesystem-specific behaviour; create()/create_by_fd() pick the
+// implementation.
+class FS {
+public:
+  virtual ~FS() = default;
+
+  // factories: the caller receives a heap-allocated instance
+  static FS *create(uint64_t f_type);
+  static FS *create_by_fd(int fd);
+
+  // short name of the implementation
+  virtual const char *get_name()
+  {
+    return "generic";
+  }
+
+  virtual int set_alloc_hint(int fd, uint64_t hint);
+
+  // file handle support (serialized into / out of a std::string)
+  virtual int get_handle(int fd, std::string *h);
+  virtual int open_handle(int mount_fd, const std::string& h, int flags);
+
+  virtual int copy_file_range(int to_fd, uint64_t to_offset,
+			      int from_fd,
+			      uint64_t from_offset, uint64_t from_len);
+  virtual int zero(int fd, uint64_t offset, uint64_t length);
+
+  // -- aio --
+};
+
+#endif
diff --git a/src/os/fs/XFS.cc b/src/os/fs/XFS.cc
new file mode 100644
index 000000000..c72ee1a08
--- /dev/null
+++ b/src/os/fs/XFS.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "XFS.h"
+
+#include <xfs/xfs.h>
+
+// Sets the XFS extent size hint of the regular file behind fd to val.
+//
+// Returns 0 when the hint was applied, was already in place, or cannot be
+// changed any more because the file already has extents; -EINVAL when fd is
+// not a regular file; -errno when fstat() or one of the ioctls fails.
+int XFS::set_alloc_hint(int fd, uint64_t val)
+{
+  struct stat sb;
+  if (fstat(fd, &sb) < 0) {
+    return -errno;
+  }
+  if (!S_ISREG(sb.st_mode)) {
+    return -EINVAL;
+  }
+
+  struct fsxattr fsx;
+  if (ioctl(fd, XFS_IOC_FSGETXATTR, &fsx) < 0) {
+    return -errno;
+  }
+
+  // already set?
+  const bool already_set =
+    (fsx.fsx_xflags & XFS_XFLAG_EXTSIZE) && fsx.fsx_extsize == val;
+  // xfs won't change extent size if any extents are allocated
+  if (already_set || fsx.fsx_nextents != 0) {
+    return 0;
+  }
+
+  fsx.fsx_xflags |= XFS_XFLAG_EXTSIZE;
+  fsx.fsx_extsize = val;
+  if (ioctl(fd, XFS_IOC_FSSETXATTR, &fsx) < 0) {
+    return -errno;
+  }
+  return 0;
+}
diff --git a/src/os/fs/XFS.h b/src/os/fs/XFS.h
new file mode 100644
index 000000000..f0ea717e3
--- /dev/null
+++ b/src/os/fs/XFS.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_XFS_H
+#define CEPH_OS_XFS_H
+
+#include "FS.h"
+
+# ifndef XFS_SUPER_MAGIC
+#define XFS_SUPER_MAGIC 0x58465342
+# endif
+
+// XFS flavour of FS: reports its own name and implements the allocation
+// hint. Both overrides are private and reached through the FS interface.
+class XFS : public FS {
+private:
+  const char *get_name() override
+  {
+    return "xfs";
+  }
+
+  int set_alloc_hint(int fd, uint64_t hint) override;
+};
+
+#endif
diff --git a/src/os/fs/ZFS.cc b/src/os/fs/ZFS.cc
new file mode 100644
index 000000000..02520796c
--- /dev/null
+++ b/src/os/fs/ZFS.cc
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#define HAVE_IOCTL_IN_SYS_IOCTL_H
+#include <libzfs.h>
+#include "ZFS.h"
+
+// Values of the ZFS::TYPE_* constants declared in ZFS.h, taken from the
+// corresponding libzfs definitions. TYPE_POOL is declared in the header as
+// well and needs a definition here, otherwise any use of it fails to link.
+const int ZFS::TYPE_FILESYSTEM 	= ZFS_TYPE_FILESYSTEM;
+const int ZFS::TYPE_SNAPSHOT	= ZFS_TYPE_SNAPSHOT;
+const int ZFS::TYPE_VOLUME	= ZFS_TYPE_VOLUME;
+const int ZFS::TYPE_POOL	= ZFS_TYPE_POOL;
+const int ZFS::TYPE_DATASET	= ZFS_TYPE_DATASET;
+
+// Closes the libzfs handle if init() created one.
+ZFS::~ZFS()
+{
+  if (!g_zfs) {
+    return;
+  }
+  ::libzfs_fini(static_cast<libzfs_handle_t*>(g_zfs));
+}
+
+// Creates the libzfs handle. Returns 0 on success, -EINVAL if
+// libzfs_init() returned no handle.
+int ZFS::init()
+{
+  g_zfs = ::libzfs_init();
+  if (!g_zfs) {
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Forwards to zfs_open() for name n and type t; the result is returned as
+// an opaque Handle.
+ZFS::Handle *ZFS::open(const char *n, int t)
+{
+  libzfs_handle_t *lib = static_cast<libzfs_handle_t*>(g_zfs);
+  return static_cast<ZFS::Handle*>(::zfs_open(lib, n, static_cast<zfs_type_t>(t)));
+}
+
+// Forwards to zfs_close() for the given handle.
+void ZFS::close(ZFS::Handle *h)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  ::zfs_close(zh);
+}
+
+// Forwards to zfs_get_name() for the given handle.
+const char *ZFS::get_name(ZFS::Handle *h)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  return ::zfs_get_name(zh);
+}
+
+// Forwards to zfs_path_to_zhandle() for path p and type t.
+ZFS::Handle *ZFS::path_to_zhandle(const char *p, int t)
+{
+  libzfs_handle_t *lib = static_cast<libzfs_handle_t*>(g_zfs);
+  // the libzfs prototype takes a non-const path
+  return ::zfs_path_to_zhandle(lib, const_cast<char *>(p), static_cast<zfs_type_t>(t));
+}
+
+// Forwards to zfs_create() for name n and type t, passing no properties.
+int ZFS::create(const char *n, int t)
+{
+  libzfs_handle_t *lib = static_cast<libzfs_handle_t*>(g_zfs);
+  return ::zfs_create(lib, n, static_cast<zfs_type_t>(t), NULL);
+}
+
+// Forwards to zfs_snapshot() for name n; r is passed on as the boolean_t
+// argument and no properties are supplied.
+int ZFS::snapshot(const char *n, bool r)
+{
+  libzfs_handle_t *lib = static_cast<libzfs_handle_t*>(g_zfs);
+  return ::zfs_snapshot(lib, n, static_cast<boolean_t>(r), NULL);
+}
+
+// Forwards to zfs_rollback() with handle h and snapshot handle snap; f is
+// passed on as the boolean_t argument.
+int ZFS::rollback(ZFS::Handle *h, ZFS::Handle *snap, bool f)
+{
+  zfs_handle_t *target = static_cast<zfs_handle_t*>(h);
+  zfs_handle_t *snapshot = static_cast<zfs_handle_t*>(snap);
+  return ::zfs_rollback(target, snapshot, static_cast<boolean_t>(f));
+}
+
+// Forwards to zfs_destroy_snaps() with handle h and name n; d is passed on
+// as the boolean_t argument.
+int ZFS::destroy_snaps(ZFS::Handle *h, const char *n, bool d)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  // the libzfs prototype takes a non-const name
+  return ::zfs_destroy_snaps(zh, const_cast<char *>(n), static_cast<boolean_t>(d));
+}
+
+// Forwards to zfs_is_mounted(); p is handed through unchanged and the
+// boolean_t result is converted to bool.
+bool ZFS::is_mounted(ZFS::Handle *h, char **p)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  return static_cast<bool>(::zfs_is_mounted(zh, p));
+}
+
+// Forwards to zfs_mount() with options o and flags f.
+int ZFS::mount(ZFS::Handle *h, const char *o, int f)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  return ::zfs_mount(zh, o, f);
+}
+
+// Forwards to zfs_unmount() with o and flags f.
+int ZFS::umount(ZFS::Handle *h, const char *o, int f)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  return ::zfs_unmount(zh, o, f);
+}
+
+// Forwards to zfs_iter_snapshots_sorted(); the callback f is reinterpreted
+// as a libzfs zfs_iter_f and d is handed through as its user data.
+int ZFS::iter_snapshots_sorted(ZFS::Handle *h, ZFS::iter_func f, void *d)
+{
+  zfs_handle_t *zh = static_cast<zfs_handle_t*>(h);
+  zfs_iter_f callback = reinterpret_cast<zfs_iter_f>(f);
+  return ::zfs_iter_snapshots_sorted(zh, callback, d);
+}
diff --git a/src/os/fs/ZFS.h b/src/os/fs/ZFS.h
new file mode 100644
index 000000000..3ebe11107
--- /dev/null
+++ b/src/os/fs/ZFS.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ZFS_H
+#define CEPH_ZFS_H
+
+// Simple wrapper to hide libzfs.h. (it conflicts with standard linux headers)
+//
+// Library and dataset handles are exposed as opaque void pointers; ZFS.cc
+// casts them back to the libzfs types and forwards each call to the
+// corresponding zfs_* / libzfs_* function.
+class ZFS {
+  void *g_zfs;	// library handle stored by init(); NULL until then
+public:
+
+  // Dataset type selectors; their values are defined in ZFS.cc.
+  static const int TYPE_FILESYSTEM;
+  static const int TYPE_SNAPSHOT;
+  static const int TYPE_VOLUME;
+  static const int TYPE_POOL;
+  static const int TYPE_DATASET;
+
+  // Opaque dataset handle and the callback type taken by
+  // iter_snapshots_sorted().
+  typedef void Handle;
+  typedef int (*iter_func)(Handle *, void *);
+
+  static const char *get_name(Handle *);
+
+  ZFS() : g_zfs(NULL) {}
+  ~ZFS();
+  // The destructor finalizes g_zfs, so a copied object would finalize the
+  // same library handle a second time.  Forbid copying.
+  ZFS(const ZFS&) = delete;
+  ZFS& operator=(const ZFS&) = delete;
+
+  // Returns 0 on success, a negative errno value otherwise.
+  int init();
+  Handle *open(const char *, int);
+  void close(Handle *);
+  Handle *path_to_zhandle(const char *, int);
+  int create(const char *, int);
+  int snapshot(const char *, bool);
+  int rollback(Handle *, Handle *, bool);
+  int destroy_snaps(Handle *, const char *, bool);
+  int iter_snapshots_sorted(Handle *, iter_func, void *);
+  int mount(Handle *, const char *, int);
+  int umount(Handle *, const char *, int);
+  bool is_mounted(Handle *, char **);
+};
+
+#endif
diff --git a/src/os/fs/btrfs_ioctl.h b/src/os/fs/btrfs_ioctl.h
new file mode 100644
index 000000000..277498ca8
--- /dev/null
+++ b/src/os/fs/btrfs_ioctl.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __IOCTL_
+#define __IOCTL_
+
+#if defined(__linux__)
+#include <linux/ioctl.h>
+#elif defined(__FreeBSD__)
+#include <sys/ioctl.h>
+#endif
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+
+/*
+ * Argument block of the ioctls below that are declared with
+ * struct btrfs_ioctl_vol_args (snap create/destroy, defrag, resize,
+ * scan/add/rm dev, balance, subvol create).
+ *
+ * this should be 4k: with an 8-byte fd, the BTRFS_PATH_NAME_MAX + 1 name
+ * bytes bring the struct to 4096 bytes.
+ */
+#define BTRFS_PATH_NAME_MAX 4087
+struct btrfs_ioctl_vol_args {
+	__s64 fd;
+	char name[BTRFS_PATH_NAME_MAX + 1];
+};
+
+#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
+
+/*
+ * Argument block of BTRFS_IOC_SNAP_CREATE_V2.  Compared with
+ * struct btrfs_ioctl_vol_args it adds transid, flags and four spare
+ * words, and shortens the name buffer accordingly.
+ * NOTE(review): flags presumably takes BTRFS_SUBVOL_CREATE_ASYNC; confirm
+ * against the kernel header.
+ */
+#define BTRFS_SUBVOL_NAME_MAX 4039
+struct btrfs_ioctl_vol_args_v2 {
+	__s64 fd;
+	__u64 transid;
+	__u64 flags;
+	__u64 unused[4];
+	char name[BTRFS_SUBVOL_NAME_MAX + 1];
+};
+
+/*
+ * Argument block of BTRFS_IOC_INO_LOOKUP (a read/write ioctl, see the
+ * _IOWR definition below): two 64-bit ids plus a
+ * BTRFS_INO_LOOKUP_PATH_MAX byte name buffer.
+ */
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+	__u64 treeid;
+	__u64 objectid;
+	char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+/*
+ * Search parameters; this is the first member of
+ * struct btrfs_ioctl_search_args, and its size determines
+ * BTRFS_SEARCH_ARGS_BUFSIZE.
+ */
+struct btrfs_ioctl_search_key {
+	/* which root are we searching.  0 is the tree of tree roots */
+	__u64 tree_id;
+
+	/* objectid range: keys returned will be >= min and <= max */
+	__u64 min_objectid;
+	__u64 max_objectid;
+
+	/* offset range: keys returned will be >= min and <= max */
+	__u64 min_offset;
+	__u64 max_offset;
+
+	/* max and min transids to search for */
+	__u64 min_transid;
+	__u64 max_transid;
+
+	/* type range: keys returned will be >= min and <= max */
+	__u32 min_type;
+	__u32 max_type;
+
+	/*
+	 * how many items did userland ask for, and how many are we
+	 * returning
+	 */
+	__u32 nr_items;
+
+	/* align to 64 bits */
+	__u32 unused;
+
+	/* some extra for later */
+	__u64 unused1;
+	__u64 unused2;
+	__u64 unused3;
+	__u64 unused4;
+};
+
+/*
+ * Header placed in front of each item in btrfs_ioctl_search_args.buf
+ * (see the comment on that struct).
+ * NOTE(review): len presumably is the size of the item that follows the
+ * header; confirm against the kernel header.
+ */
+struct btrfs_ioctl_search_header {
+	__u64 transid;
+	__u64 objectid;
+	__u64 offset;
+	__u32 type;
+	__u32 len;
+};
+
+/* buf is sized so that key + buf together occupy exactly 4096 bytes */
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * Argument block of BTRFS_IOC_TREE_SEARCH.
+ *
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+	struct btrfs_ioctl_search_key key;
+	char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
+/*
+ * Argument block of BTRFS_IOC_CLONE_RANGE: a source fd, a source
+ * offset/length pair and a destination offset.
+ * NOTE(review): the destination is presumably the fd the ioctl is issued
+ * on; confirm against the kernel documentation.
+ */
+struct btrfs_ioctl_clone_range_args {
+  __s64 src_fd;
+  __u64 src_offset, src_length;
+  __u64 dest_offset;
+};
+
+/* flags for the defrag range ioctl (btrfs_ioctl_defrag_range_args.flags) */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+/* Argument block of BTRFS_IOC_DEFRAG_RANGE. */
+struct btrfs_ioctl_defrag_range_args {
+	/* start of the defrag operation */
+	__u64 start;
+
+	/* number of bytes to defrag, use (u64)-1 to say all */
+	__u64 len;
+
+	/*
+	 * flags for the operation, which can include turning
+	 * on compression for this one defrag
+	 */
+	__u64 flags;
+
+	/*
+	 * any extent bigger than this will be considered
+	 * already defragged.  Use 0 to take the kernel default
+	 * Use 1 to say every single extent must be rewritten
+	 */
+	__u32 extent_thresh;
+
+	/* spare for later */
+	__u32 unused[5];
+};
+
+/* One element of the trailing array in struct btrfs_ioctl_space_args. */
+struct btrfs_ioctl_space_info {
+	__u64 flags;
+	__u64 total_bytes;
+	__u64 used_bytes;
+};
+
+/*
+ * Argument block of BTRFS_IOC_SPACE_INFO: two counters followed by a
+ * variable-length array of btrfs_ioctl_space_info (declared with zero
+ * length, so the caller allocates room for the elements itself).
+ * NOTE(review): space_slots presumably is the caller's array capacity and
+ * total_spaces the kernel's count; confirm against the kernel header.
+ */
+struct btrfs_ioctl_space_args {
+	__u64 space_slots;
+	__u64 total_spaces;
+	struct btrfs_ioctl_space_info spaces[0];
+};
+
+/*
+ * ioctl request numbers.  Each one combines BTRFS_IOCTL_MAGIC, a command
+ * number and (except for the plain _IO ones) the argument type, whose
+ * size is encoded into the request value.
+ */
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+				   struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+				   struct btrfs_ioctl_vol_args)
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+				  struct btrfs_ioctl_clone_range_args)
+
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+				struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+				struct btrfs_ioctl_defrag_range_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+				   struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+				   struct btrfs_ioctl_ino_lookup_args)
+/* __u64, not the kernel-internal u64, which does not exist in userspace */
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, __u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+				    struct btrfs_ioctl_space_args)
+#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
+#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
+#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
+				   struct btrfs_ioctl_vol_args_v2)
+#endif
diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc
new file mode 100644
index 000000000..9526a7564
--- /dev/null
+++ b/src/os/kstore/KStore.cc
@@ -0,0 +1,3409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#if defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif
+
+#include "KStore.h"
+#include "osd/osd_types.h"
+#include "os/kv.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/Formatter.h"
+#include "common/pretty_binary.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_kstore
+
+/*
+
+  TODO:
+
+  * superblock, features
+  * refcounted extents (for efficient clone)
+
+ */
+
+using std::less;
+using std::list;
+using std::make_pair;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::decode;
+using ceph::encode;
+using ceph::JSONFormatter;
+
+// Key prefixes used to partition the key/value store by record type.
+const string PREFIX_SUPER("S"); // field -> value
+const string PREFIX_COLL("C");  // collection name -> (nothing)
+const string PREFIX_OBJ("O");   // object name -> onode
+const string PREFIX_DATA("D");  // nid + offset -> data
+const string PREFIX_OMAP("M");  // u64 + keyname -> value
+
+/*
+ * object name key structure
+ *
+ * 2 chars: shard (-- for none, or hex digit, so that we sort properly)
+ * encoded u64: poolid + 2^63 (so that it sorts properly)
+ * encoded u32: hash (bit reversed)
+ *
+ * 1 char: '.'
+ *
+ * escaped string: namespace
+ *
+ * 1 char: '<', '=', or '>'.  if =, then object key == object name, and
+ *         we are followed just by the key.  otherwise, we are followed by
+ *         the key and then the object name.
+ * escaped string: key
+ * escaped string: object name (unless '=' above)
+ *
+ * encoded u64: snap
+ * encoded u64: generation
+ */
+
+/*
+ * string encoding in the key
+ *
+ * The key string needs to lexicographically sort the same way that
+ * ghobject_t does.  We do this by escaping any character <= '#' as '#'
+ * plus a two-digit hex string, and any character >= '~' as '~' plus the
+ * two hex digits.
+ *
+ * We use ! as a terminator for strings; this works because it is < #
+ * and will get escaped if it is present in the string.
+ *
+ */
+
+// Append the escaped form of `in` to *out, followed by the '!' terminator.
+// Bytes <= '#' become '#' + two hex digits, bytes >= '~' become '~' + two
+// hex digits, everything in between is copied verbatim.
+static void append_escaped(const string &in, string *out)
+{
+  char esc[8];
+  for (const char ch : in) {
+    const unsigned char uc = (unsigned char)ch;
+    if (uc > '#' && uc < '~') {
+      out->push_back(ch);
+      continue;
+    }
+    // the marker records which end of the plain range the byte fell off
+    const char marker = (uc <= '#') ? '#' : '~';
+    snprintf(esc, sizeof(esc), "%c%02x", marker, (uint8_t)ch);
+    out->append(esc);
+  }
+  out->push_back('!');
+}
+
+// Decode an escaped string (as written by append_escaped) starting at p,
+// appending the raw bytes to *out.  Decoding stops at the '!' terminator
+// or at a NUL, neither of which is consumed.
+//
+// Returns the number of input characters consumed, or -EINVAL if an
+// escape marker ('#' or '~') is not followed by exactly two hex digits.
+static int decode_escaped(const char *p, string *out)
+{
+  // value of one hex digit, or -1 if c is not a hex digit
+  auto hex_val = [](char c) -> int {
+    if (c >= '0' && c <= '9')
+      return c - '0';
+    if (c >= 'a' && c <= 'f')
+      return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F')
+      return c - 'A' + 10;
+    return -1;
+  };
+
+  const char *orig_p = p;
+  while (*p && *p != '!') {
+    if (*p == '#' || *p == '~') {
+      // Check both digits explicitly: sscanf("%2x") would accept a single
+      // digit, after which skipping two characters steps over the
+      // terminator (or the end of the buffer).  p[2] is only read once
+      // p[1] is known to be a digit, i.e. not the NUL.
+      const int hi = hex_val(p[1]);
+      const int lo = (hi < 0) ? -1 : hex_val(p[2]);
+      if (hi < 0 || lo < 0)
+	return -EINVAL;
+      out->push_back((char)((hi << 4) | lo));
+      p += 3;
+    } else {
+      out->push_back(*p++);
+    }
+  }
+  return p - orig_p;
+}
+
+// Append the two-character shard field to *key: "--" for NO_SHARD,
+// otherwise the shard printed as "%02x".
+static void _key_encode_shard(shard_id_t shard, string *key)
+{
+  // make field ordering match with ghobject_t compare operations
+  if (!(shard == shard_id_t::NO_SHARD)) {
+    char hex[32];
+    snprintf(hex, sizeof(hex), "%02x", (int)shard);
+    key->append(hex);
+    return;
+  }
+  // otherwise ff will sort *after* 0, not before.
+  key->append("--");
+}
+// Parse the two-character shard field written by _key_encode_shard.
+// A leading '-' means NO_SHARD; otherwise the field is read as hex.
+// Returns a pointer just past the field, or NULL if it is not valid hex.
+static const char *_key_decode_shard(const char *key, shard_id_t *pshard)
+{
+  if (key[0] == '-') {
+    *pshard = shard_id_t::NO_SHARD;
+  } else {
+    unsigned shard;
+    // The field is exactly two characters wide and is followed directly
+    // by more key bytes, so cap the conversion at two digits rather than
+    // letting it run into whatever comes next.
+    int r = sscanf(key, "%2x", &shard);
+    if (r < 1)
+      return NULL;
+    *pshard = shard_id_t(shard);
+  }
+  return key + 2;
+}
+
+// Compute the bounding object keys for collection cid.
+//
+// All four outputs are cleared first.  start/end bound the regular
+// objects, temp_start/temp_end bound the keys built from the
+// (-2 - pool) pool id.  `bits` is the number of significant hash bits;
+// the end hash is the bit-reversed pg seed plus 2^(32 - bits).
+static void get_coll_key_range(const coll_t& cid, int bits,
+			       string *temp_start, string *temp_end,
+			       string *start, string *end)
+{
+  temp_start->clear();
+  temp_end->clear();
+  start->clear();
+  end->clear();
+
+  spg_t pgid;
+  if (cid.is_pg(&pgid)) {
+    // every bound begins with the same encoded shard
+    _key_encode_shard(pgid.shard, start);
+    *end = *start;
+    *temp_start = *start;
+    *temp_end = *start;
+
+    // lower bounds: biased pool id, bit-reversed pg seed, then '.'
+    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start);
+    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start);
+    _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), start);
+    _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), temp_start);
+    start->append(".");
+    temp_start->append(".");
+
+    _key_encode_u64(pgid.pool() + 0x8000000000000000ull, end);
+    _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_end);
+
+    // computed in 64 bits so that running past the 32-bit hash space is
+    // detectable
+    uint64_t end_hash =
+      hobject_t::_reverse_bits(pgid.ps()) + (1ull << (32-bits));
+    if (end_hash <= 0xffffffffull) {
+      _key_encode_u32(end_hash, end);
+      _key_encode_u32(end_hash, temp_end);
+      end->append(".");
+      temp_end->append(".");
+    } else {
+      // past the last hash value: clamp to 0xffffffff and use ':', which
+      // sorts after the '.' that follows the hash in object keys
+      _key_encode_u32(0xffffffff, end);
+      _key_encode_u32(0xffffffff, temp_end);
+      end->append(":");
+      temp_end->append(":");
+    }
+  } else {
+    // not a pg collection: NO_SHARD, pool id -1, and the whole hash range
+    _key_encode_shard(shard_id_t::NO_SHARD, start);
+    _key_encode_u64(-1ull + 0x8000000000000000ull, start);
+    *end = *start;
+    _key_encode_u32(0, start);
+    start->append(".");
+    _key_encode_u32(0xffffffff, end);
+    end->append(":");
+
+    // no separate temp section
+    *temp_start = *end;
+    *temp_end = *end;
+  }
+}
+
+static int get_key_object(const string& key, ghobject_t *oid);
+
+// Build the key for oid into *key (any previous content is discarded),
+// following the "object name key structure" layout described above, then
+// verify that the key decodes back to oid.
+static void get_object_key(CephContext* cct, const ghobject_t& oid,
+			   string *key)
+{
+  key->clear();
+
+  // fixed-width prefix: shard, biased pool id, bitwise hash, then '.'
+  _key_encode_shard(oid.shard_id, key);
+  _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+  _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+  key->append(".");
+
+  append_escaped(oid.hobj.nspace, key);
+
+  const string& okey = oid.hobj.get_key();
+  const string& name = oid.hobj.oid.name;
+  if (okey.length() && okey != name) {
+    // distinct key: record how it compares to the name, then both strings
+    // (ASCII chars < = and > sort in that order, yay)
+    key->append(okey < name ? "<" : ">");
+    append_escaped(okey, key);
+    append_escaped(name, key);
+  } else {
+    // no key, or a key identical to the name: the name alone is enough
+    key->append("=");
+    append_escaped(name, key);
+  }
+
+  _key_encode_u64(oid.hobj.snap, key);
+  _key_encode_u64(oid.generation, key);
+
+  // sanity check: the key must round-trip
+  {
+    ghobject_t decoded;
+    const int r = get_key_object(*key, &decoded);
+    if (r || decoded != oid) {
+      derr << "  r " << r << dendl;
+      derr << "key " << pretty_binary_string(*key) << dendl;
+      derr << "oid " << oid << dendl;
+      derr << "  t " << decoded << dendl;
+      ceph_assert(decoded == oid);
+    }
+  }
+}
+
+// Parse a key built by get_object_key() back into *oid.
+//
+// Returns 0 on success, otherwise a small negative code that identifies
+// the field that failed to parse (these are step markers, not errno
+// values).  Malformed keys are rejected instead of being read past their
+// end.
+static int get_key_object(const string& key, ghobject_t *oid)
+{
+  const char *p = key.c_str();
+  const char *const key_end = p + key.length();
+
+  // The fixed-width prefix is the 2-char shard, the encoded u64 pool, the
+  // encoded u32 hash and a '.'.  This assumes _key_encode_u64/_u32 emit
+  // sizeof(uint64_t)/sizeof(uint32_t) bytes, as decode_omap_key() already
+  // does for u64.
+  if (key.length() < 2 + sizeof(uint64_t) + sizeof(uint32_t) + 1)
+    return -1;
+
+  p = _key_decode_shard(p, &oid->shard_id);
+  if (!p)
+    return -2;  // shard field was neither "--" nor hex
+
+  uint64_t pool;
+  p = _key_decode_u64(p, &pool);
+  oid->hobj.pool = pool - 0x8000000000000000ull;
+
+  unsigned hash;
+  p = _key_decode_u32(p, &hash);
+  oid->hobj.set_bitwise_key_u32(hash);
+  if (*p != '.')
+    return -5;
+  ++p;
+
+  // Decode one escaped string at p and step over its '!' terminator.
+  // decode_escaped() also stops at a NUL, so make sure the terminator is
+  // really there before advancing past it.
+  auto decode_field = [&p](string *out) {
+    const int n = decode_escaped(p, out);
+    if (n < 0 || p[n] != '!')
+      return false;
+    p += n + 1;
+    return true;
+  };
+
+  if (!decode_field(&oid->hobj.nspace))
+    return -6;
+
+  if (*p == '=') {
+    // no key
+    ++p;
+    if (!decode_field(&oid->hobj.oid.name))
+      return -7;
+  } else if (*p == '<' || *p == '>') {
+    // key + name
+    ++p;
+    string okey;
+    if (!decode_field(&okey))
+      return -8;
+    if (!decode_field(&oid->hobj.oid.name))
+      return -9;
+    oid->hobj.set_key(okey);
+  } else {
+    // malformed
+    return -10;
+  }
+
+  // snap and generation are two more encoded u64s; make sure they exist
+  if ((size_t)(key_end - p) < 2 * sizeof(uint64_t))
+    return -11;
+  p = _key_decode_u64(p, &oid->hobj.snap.val);
+  p = _key_decode_u64(p, &oid->generation);
+  if (*p) {
+    // anything other than the terminating NUL here means trailing garbage
+    return -12;
+  }
+
+  return 0;
+}
+
+
+// Append the data key for (nid, offset) to *out: the encoded nid followed
+// by the encoded offset.  *out is appended to, not cleared.
+static void get_data_key(uint64_t nid, uint64_t offset, string *out)
+{
+  _key_encode_u64(nid, out);
+  _key_encode_u64(offset, out);
+}
+
+// '-' < '.' < '~'
+// Append the omap header key for `id` to *out: the encoded id followed by
+// '-'.
+static void get_omap_header(uint64_t id, string *out)
+{
+  _key_encode_u64(id, out);
+  out->append(1, '-');
+}
+
+// hmm, I don't think there's any need to escape the user key since we
+// have a clean prefix.
+// Append the omap key for (id, key) to *out: the encoded id, a '.', then
+// the user key unescaped.
+static void get_omap_key(uint64_t id, const string& key, string *out)
+{
+  _key_encode_u64(id, out);
+  *out += '.';
+  *out += key;
+}
+
+// Re-home an existing omap key under a different id: the encoded `id` is
+// appended to *out, followed by whatever in `old` lies beyond the current
+// length of *out.
+// NOTE(review): this only yields "new id + old suffix" when *out is empty
+// on entry; confirm that all callers pass an empty string.
+static void rewrite_omap_key(uint64_t id, const string& old, string *out)
+{
+  _key_encode_u64(id, out);
+  // append the tail straight from `old`; no copy of the argument and no
+  // temporary substring
+  out->append(old, out->length(), string::npos);
+}
+
+// Extract the user-visible part of an omap key: everything after the
+// sizeof(uint64_t)-byte id prefix and the one separator character.
+static void decode_omap_key(const string& key, string *user_key)
+{
+  const size_t prefix_len = sizeof(uint64_t) + 1;
+  user_key->assign(key, prefix_len, string::npos);
+}
+
+// Append the omap tail key for `id` to *out: the encoded id followed by
+// '~'.
+static void get_omap_tail(uint64_t id, string *out)
+{
+  _key_encode_u64(id, out);
+  out->append(1, '~');
+}
+
+
+
+// Onode
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.onode(" << this << ") "
+
+// Block until flush_txns is empty, waiting on flush_cond under flush_lock.
+void KStore::Onode::flush()
+{
+  std::unique_lock<std::mutex> locker(flush_lock);
+  dout(20) << __func__ << " " << flush_txns << dendl;
+  flush_cond.wait(locker, [this] { return flush_txns.empty(); });
+  dout(20) << __func__ << " done" << dendl;
+}
+
+// OnodeHashLRU
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore.lru(" << this << ") "
+
+// Move o to the front of the LRU list.  Does not take `lock` itself.
+void KStore::OnodeHashLRU::_touch(OnodeRef o)
+{
+  Onode& node = *o;
+  lru.erase(lru.iterator_to(node));
+  lru.push_front(node);
+}
+
+// Register o under oid and put it at the front of the LRU list.
+// Asserts that oid is not in the map yet.
+void KStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o)
+{
+  std::lock_guard<std::mutex> guard(lock);
+  dout(30) << __func__ << " " << oid << " " << o << dendl;
+  ceph_assert(onode_map.find(oid) == onode_map.end());
+  onode_map[oid] = o;
+  lru.push_front(*o);
+}
+
+// Look up oid in the cache.  A hit is moved to the front of the LRU list
+// and returned; a miss returns an empty OnodeRef.
+KStore::OnodeRef KStore::OnodeHashLRU::lookup(const ghobject_t& oid)
+{
+  std::lock_guard<std::mutex> guard(lock);
+  dout(30) << __func__ << dendl;
+  auto found = onode_map.find(oid);
+  if (found != onode_map.end()) {
+    dout(30) << __func__ << " " << oid << " hit " << found->second << dendl;
+    _touch(found->second);
+    return found->second;
+  }
+  dout(30) << __func__ << " " << oid << " miss" << dendl;
+  return OnodeRef();
+}
+
+// Drop every cached onode: empty the LRU list first, then the map.
+void KStore::OnodeHashLRU::clear()
+{
+  std::lock_guard<std::mutex> l(lock);
+  dout(10) << __func__ << dendl;
+  // unlink from the list before the map releases its references
+  lru.clear();
+  onode_map.clear();
+}
+
+// Re-key the cached onode for old_oid as new_oid.
+//
+// Any entry already cached under new_oid is dropped.  The old_oid slot is
+// kept but pointed at a freshly constructed Onode (same oid and key as
+// before), which is queued at the back of the LRU list.  Asserts that
+// old_oid is cached.
+void KStore::OnodeHashLRU::rename(const ghobject_t& old_oid,
+				  const ghobject_t& new_oid)
+{
+  std::lock_guard<std::mutex> l(lock);
+  dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl;
+  ceph::unordered_map<ghobject_t,OnodeRef>::iterator po, pn;
+  po = onode_map.find(old_oid);
+  pn = onode_map.find(new_oid);
+
+  ceph_assert(po != onode_map.end());
+  if (pn != onode_map.end()) {
+    // evict whatever currently occupies the destination name
+    lru_list_t::iterator p = lru.iterator_to(*pn->second);
+    lru.erase(p);
+    onode_map.erase(pn);
+  }
+  OnodeRef o = po->second;
+
+  // install a non-existent onode in its place
+  po->second.reset(new Onode(cct, old_oid, o->key));
+  lru.push_back(*po->second);
+
+  // fix oid, key
+  onode_map.insert(make_pair(new_oid, o));
+  _touch(o);
+  o->oid = new_oid;
+  get_object_key(cct, new_oid, &o->key);
+}
+
+// Step through the cached onodes in LRU-list order.
+//
+// Passing a default-constructed ghobject_t as `after` yields the first
+// entry; passing a previously returned oid yields the entry that follows
+// it in the list.  Returns false once there is nothing (more) to return,
+// otherwise fills *next and returns true.  `after` must be cached unless
+// it is the default value (asserted, "for now").
+bool KStore::OnodeHashLRU::get_next(
+  const ghobject_t& after,
+  pair<ghobject_t,OnodeRef> *next)
+{
+  std::lock_guard<std::mutex> l(lock);
+  dout(20) << __func__ << " after " << after << dendl;
+
+  lru_list_t::iterator pi;
+  if (after == ghobject_t()) {
+    // Start at the head of the LRU list.  Continuation below walks that
+    // same list, so starting from an arbitrary hash-map element instead
+    // would skip everything that precedes it in the list.
+    pi = lru.begin();
+  } else {
+    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p =
+      onode_map.find(after);
+    ceph_assert(p != onode_map.end()); // for now
+    pi = lru.iterator_to(*p->second);
+    ++pi;
+  }
+  if (pi == lru.end()) {
+    return false;
+  }
+  next->first = pi->oid;
+  next->second = onode_map[pi->oid];
+  return true;
+}
+
+// Shrink the cache towards `max` entries by evicting from the back of the
+// LRU list (entries are pushed to the front when touched).  Eviction stops
+// early at the first onode that has more than one reference.
+// Returns the number of onodes evicted.
+int KStore::OnodeHashLRU::trim(int max)
+{
+  std::lock_guard<std::mutex> l(lock);
+  dout(20) << __func__ << " max " << max
+	   << " size " << onode_map.size() << dendl;
+  int trimmed = 0;
+  // number of entries above the target
+  int num = onode_map.size() - max;
+  if (onode_map.size() == 0 || num <= 0)
+    return 0; // don't even try
+
+  // start at the last element of the list
+  lru_list_t::iterator p = lru.end();
+  if (num)
+    --p;
+  while (num > 0) {
+    Onode *o = &*p;
+    int refs = o->nref.load();
+    if (refs > 1) {
+      dout(20) << __func__ << "  " << o->oid << " has " << refs
+	       << " refs; stopping with " << num << " left to trim" << dendl;
+      break;
+    }
+    dout(30) << __func__ << "  trim " << o->oid << dendl;
+    if (p != lru.begin()) {
+      // post-decrement keeps p valid: it moves to the previous element
+      // before the current one is unlinked
+      lru.erase(p--);
+    } else {
+      // erasing the head: this has to be the last one we wanted
+      lru.erase(p);
+      ceph_assert(num == 1);
+    }
+    // hold a temporary reference across the map erase, which takes o->oid
+    // by reference
+    o->get();  // paranoia
+    onode_map.erase(o->oid);
+    o->put();
+    --num;
+    ++trimmed;
+  }
+  return trimmed;
+}
+
+// =======================================================
+
+// Collection
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << store->path << ").collection(" << cid << ") "
+
+// Create the in-memory state for collection cid of store ns: a new
+// OpSequencer and an empty onode cache.
+KStore::Collection::Collection(KStore *ns, coll_t cid)
+  : CollectionImpl(ns->cct, cid),
+    store(ns),
+    osr(new OpSequencer()),
+    onode_map(store->cct)
+{
+}
+
+// Delegate to the collection's op sequencer.
+void KStore::Collection::flush()
+{
+  osr->flush();
+}
+
+// Delegate to the collection's op sequencer and return its result.
+bool KStore::Collection::flush_commit(Context *c)
+{
+  return osr->flush_commit(c);
+}
+
+
+// Return the onode for oid, consulting the cache first and the key/value
+// store second.
+//
+// If the object is not stored, an empty ref is returned unless `create` is
+// set, in which case a new dirty onode is made.  The result is added to
+// the cache.  The collection lock must be held (write-locked when
+// `create` is set); an oid that does not belong to this pg aborts.
+KStore::OnodeRef KStore::Collection::get_onode(
+  const ghobject_t& oid,
+  bool create)
+{
+  ceph_assert(create ? ceph_mutex_is_wlocked(lock) : ceph_mutex_is_locked(lock));
+
+  spg_t pgid;
+  if (cid.is_pg(&pgid)) {
+    if (!oid.match(cnode.bits, pgid.ps())) {
+      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
+			<< pgid << " bits " << cnode.bits << dendl;
+      ceph_abort();
+    }
+  }
+
+  OnodeRef o = onode_map.lookup(oid);
+  if (o)
+    return o;
+
+  string key;
+  get_object_key(store->cct, oid, &key);
+
+  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
+			<< pretty_binary_string(key) << dendl;
+
+  bufferlist v;
+  int r = store->db->get(PREFIX_OBJ, key, &v);
+  ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
+  if (v.length() == 0) {
+    ceph_assert(r == -ENOENT);
+    if (!create)
+      return OnodeRef();
+
+    // new
+    o.reset(new Onode(store->cct, oid, key));
+    o->dirty = true;
+  } else {
+    // loaded
+    ceph_assert(r >=0);
+    // hand the object to the ref before decoding, so that it is released
+    // if decode() throws
+    o.reset(new Onode(store->cct, oid, key));
+    o->exists = true;
+    auto p = v.cbegin();
+    decode(o->onode, p);
+  }
+  onode_map.add(oid, o);
+  return o;
+}
+
+
+
+// =======================================================
+
+#undef dout_prefix
+#define dout_prefix *_dout << "kstore(" << path << ") "
+
+// Construct an unmounted store rooted at `path`.  No files are opened
+// here; the throttles take their limits from the kstore_max_ops and
+// kstore_max_bytes config values, and the perf counters are registered.
+KStore::KStore(CephContext *cct, const string& path)
+  : ObjectStore(cct, path),
+    db(nullptr),
+    basedir(path),
+    path_fd(-1),
+    fsid_fd(-1),
+    mounted(false),
+    nid_last(0),
+    nid_max(0),
+    throttle_ops(cct, "kstore_max_ops", cct->_conf->kstore_max_ops),
+    throttle_bytes(cct, "kstore_max_bytes", cct->_conf->kstore_max_bytes),
+    finisher(cct),
+    kv_sync_thread(this),
+    kv_stop(false),
+    logger(nullptr)
+{
+  _init_logger();
+}
+
+// Unregister the perf counters and assert that the store was unmounted:
+// not mounted, no db open, fsid file closed.
+KStore::~KStore()
+{
+  _shutdown_logger();
+  ceph_assert(!mounted);
+  ceph_assert(db == NULL);
+  ceph_assert(fsid_fd < 0);
+}
+
+// Build the "KStore" perf counters (one average-latency counter per
+// transaction state) and register them with the context's collection.
+void KStore::_init_logger()
+{
+  // XXX
+  PerfCountersBuilder b(cct, "KStore",
+                        l_kstore_first, l_kstore_last);
+  static const struct {
+    int idx;
+    const char *name;
+    const char *desc;
+  } lat_counters[] = {
+    { l_kstore_state_prepare_lat, "state_prepare_lat", "Average prepare state latency" },
+    { l_kstore_state_kv_queued_lat, "state_kv_queued_lat", "Average kv_queued state latency" },
+    { l_kstore_state_kv_done_lat, "state_kv_done_lat", "Average kv_done state latency" },
+    { l_kstore_state_finishing_lat, "state_finishing_lat", "Average finishing state latency" },
+    { l_kstore_state_done_lat, "state_done_lat", "Average done state latency" },
+  };
+  for (const auto& c : lat_counters)
+    b.add_time_avg(c.idx, c.name, c.desc);
+  logger = b.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+}
+
+// Undo _init_logger(): unregister the perf counters and free them.
+void KStore::_shutdown_logger()
+{
+  // XXX
+  cct->get_perfcounters_collection()->remove(logger);
+  delete logger;
+}
+
+// Open the store directory and keep the descriptor in path_fd.
+// Returns 0 on success or -errno from open(2).  Asserts that the
+// directory is not already open.
+int KStore::_open_path()
+{
+  ceph_assert(path_fd < 0);
+  path_fd = ::open(path.c_str(), O_DIRECTORY|O_CLOEXEC);
+  if (path_fd >= 0)
+    return 0;
+
+  const int r = -errno;
+  derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r)
+       << dendl;
+  return r;
+}
+
+// Close path_fd and mark it as closed (-1).
+void KStore::_close_path()
+{
+  VOID_TEMP_FAILURE_RETRY(::close(path_fd));
+  path_fd = -1;
+}
+
+// Open the "fsid" file inside the store directory (path_fd must already
+// be open) and keep the descriptor in fsid_fd.  With `create` set the
+// file is created if missing.
+// Returns 0 on success or -errno from openat(2).
+int KStore::_open_fsid(bool create)
+{
+  ceph_assert(fsid_fd < 0);
+  // close-on-exec, like path_fd: the descriptor must not leak into child
+  // processes
+  int flags = O_RDWR|O_CLOEXEC;
+  if (create)
+    flags |= O_CREAT;
+  fsid_fd = ::openat(path_fd, "fsid", flags, 0644);
+  if (fsid_fd < 0) {
+    int err = -errno;
+    derr << __func__ << " " << cpp_strerror(err) << dendl;
+    return err;
+  }
+  return 0;
+}
+
+// Read the fsid file from its current offset and parse it into *uuid.
+// At most the first 36 characters read are handed to the parser.
+// Returns 0 on success, the negative safe_read() result on read failure,
+// or -EINVAL if the text is not a valid uuid.
+int KStore::_read_fsid(uuid_d *uuid)
+{
+  char fsid_str[40];
+  memset(fsid_str, 0, sizeof(fsid_str));
+  const int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str));
+  if (ret < 0) {
+    derr << __func__ << " failed: " << cpp_strerror(ret) << dendl;
+    return ret;
+  }
+  // terminate after the uuid text, or after what was read if it is shorter
+  fsid_str[ret > 36 ? 36 : ret] = 0;
+  if (uuid->parse(fsid_str))
+    return 0;
+  derr << __func__ << " unparsable uuid " << fsid_str << dendl;
+  return -EINVAL;
+}
+
+// Replace the contents of the fsid file with the current fsid followed by
+// a newline, and fsync it.
+// Returns 0 on success or a negative errno value.
+int KStore::_write_fsid()
+{
+  int r = ::ftruncate(fsid_fd, 0);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  // ftruncate() leaves the file offset alone.  If the descriptor was read
+  // from before (mkfs reads the old fsid first), writing now would leave
+  // a hole of zero bytes in front of the uuid, so rewind explicitly.
+  if (::lseek(fsid_fd, 0, SEEK_SET) < 0) {
+    r = -errno;
+    derr << __func__ << " fsid seek failed: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  string str = stringify(fsid) + "\n";
+  r = safe_write(fsid_fd, str.c_str(), str.length());
+  if (r < 0) {
+    derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = ::fsync(fsid_fd);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Close fsid_fd and mark it as closed (-1).  Closing the descriptor also
+// gives up the fcntl lock taken by _lock_fsid(), if any.
+void KStore::_close_fsid()
+{
+  VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+  fsid_fd = -1;
+}
+
+// Take a non-blocking exclusive fcntl lock on the whole fsid file.
+// Returns 0 on success or -errno if the lock could not be taken.
+int KStore::_lock_fsid()
+{
+  struct flock l;
+  memset(&l, 0, sizeof(l));
+  l.l_type = F_WRLCK;
+  l.l_whence = SEEK_SET;
+  l.l_start = 0;
+  l.l_len = 0;	// 0 == to end of file
+  int r = ::fcntl(fsid_fd, F_SETLK, &l);
+  if (r < 0) {
+    int err = errno;
+    // keep the hint and the error text apart in the log line
+    derr << __func__ << " failed to lock " << path << "/fsid"
+	 << " (is another ceph-osd still running?) "
+	 << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  return 0;
+}
+
+// Report whether the store at `path` appears to be in use.
+//
+// most error conditions mean the mount is not in use (e.g., because
+// it doesn't exist).  only if we fail to lock do we conclude it is
+// in use.
+bool KStore::test_mount_in_use()
+{
+  if (_open_path() < 0)
+    return false;
+
+  bool in_use = false;
+  if (_open_fsid(false) >= 0) {
+    in_use = _lock_fsid() < 0; // if we can't lock, it is in use
+    _close_fsid();
+  }
+  _close_path();
+  return in_use;
+}
+
+// Open the key/value database under "<path>/db".
+//
+// With `create` set, the backend named by the kstore_backend option is
+// used and the db and db.wal directories are created if missing;
+// otherwise the backend name is read from the 'kv_backend' meta entry.
+// On success `db` is set and 0 is returned; on failure `db` stays NULL
+// and a negative errno value is returned.
+int KStore::_open_db(bool create)
+{
+  int r;
+  ceph_assert(!db);
+  char fn[PATH_MAX];
+  snprintf(fn, sizeof(fn), "%s/db", path.c_str());
+
+  string kv_backend;
+  if (create) {
+    kv_backend = cct->_conf->kstore_backend;
+  } else {
+    r = read_meta("kv_backend", &kv_backend);
+    if (r < 0) {
+      derr << __func__ << " unable to read 'kv_backend' meta" << dendl;
+      return -EIO;
+    }
+  }
+  dout(10) << __func__ << " kv_backend = " << kv_backend << dendl;
+
+  if (create) {
+    // an already existing directory is fine
+    r = ::mkdir(fn, 0755);
+    if (r < 0)
+      r = -errno;
+    if (r < 0 && r != -EEXIST) {
+      derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r)
+	   << dendl;
+      return r;
+    }
+
+    // wal_dir, too!
+    char walfn[PATH_MAX];
+    snprintf(walfn, sizeof(walfn), "%s/db.wal", path.c_str());
+    r = ::mkdir(walfn, 0755);
+    if (r < 0)
+      r = -errno;
+    if (r < 0 && r != -EEXIST) {
+      derr << __func__ << " failed to create " << walfn
+	   << ": " << cpp_strerror(r)
+	   << dendl;
+      return r;
+    }
+  }
+
+  db = KeyValueDB::create(cct, kv_backend, fn);
+  if (!db) {
+    derr << __func__ << " error creating db" << dendl;
+    return -EIO;
+  }
+  string options;
+  if (kv_backend == "rocksdb")
+    options = cct->_conf->kstore_rocksdb_options;
+  db->init(options);
+  stringstream err;
+  if (create)
+    r = db->create_and_open(err);
+  else
+    r = db->open(err);
+  if (r) {
+    derr << __func__ << " error opening db: " << err.str() << dendl;
+    delete db;
+    db = NULL;
+    return -EIO;
+  }
+  dout(1) << __func__ << " opened " << kv_backend
+	  << " path " << fn << " options " << options << dendl;
+  return 0;
+}
+
+// Destroy the database object opened by _open_db() and reset `db`.
+// Asserts that a database is open.
+void KStore::_close_db()
+{
+  ceph_assert(db);
+  delete db;
+  db = NULL;
+}
+
+// Populate coll_map from the PREFIX_COLL records in the database.
+//
+// Each key that parses as a coll_t gets a Collection whose cnode is
+// decoded from the record value.  Keys that do not parse are logged and,
+// if `errors` is non-null, counted in *errors.
+// Returns 0, or -EIO if a cnode fails to decode.
+int KStore::_open_collections(int *errors)
+{
+  ceph_assert(coll_map.empty());
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL);
+  for (it->upper_bound(string());
+       it->valid();
+       it->next()) {
+    coll_t cid;
+    if (cid.parse(it->key())) {
+      auto c = ceph::make_ref<Collection>(this, cid);
+      bufferlist bl = it->value();
+      auto p = bl.cbegin();
+      try {
+        decode(c->cnode, p);
+      } catch (ceph::buffer::error& e) {
+        derr << __func__ << " failed to decode cnode, key:"
+             << pretty_binary_string(it->key()) << dendl;
+        return -EIO;
+      } 
+      dout(20) << __func__ << " opened " << cid << dendl;
+      coll_map[cid] = c;
+    } else {
+      derr << __func__ << " unrecognized collection " << it->key() << dendl;
+      if (errors)
+	(*errors)++;
+    }
+  }
+  return 0;
+}
+
+// Create a new store under `path`, or confirm an existing one.
+//
+// If the fsid file already holds a valid non-zero uuid, the store is
+// treated as already created: that uuid is adopted (a different non-zero
+// `fsid` is rejected with -EINVAL) and nothing else is touched.
+// Otherwise the database is created, the 'kv_backend' and 'type' meta
+// entries are written, and the fsid file is written last.
+// Everything opened here is closed again before returning.
+// Returns 0 on success or a negative errno value.
+int KStore::mkfs()
+{
+  dout(1) << __func__ << " path " << path << dendl;
+  int r;
+  uuid_d old_fsid;
+
+  r = _open_path();
+  if (r < 0)
+    return r;
+
+  r = _open_fsid(true);
+  if (r < 0)
+    goto out_path_fd;
+
+  r = _lock_fsid();
+  if (r < 0)
+    goto out_close_fsid;
+
+  r = _read_fsid(&old_fsid);
+  if (r < 0 || old_fsid.is_zero()) {
+    // no usable fsid on disk yet
+    if (fsid.is_zero()) {
+      fsid.generate_random();
+      dout(1) << __func__ << " generated fsid " << fsid << dendl;
+    } else {
+      dout(1) << __func__ << " using provided fsid " << fsid << dendl;
+    }
+    // we'll write it last.
+  } else {
+    if (!fsid.is_zero() && fsid != old_fsid) {
+      derr << __func__ << " on-disk fsid " << old_fsid
+	   << " != provided " << fsid << dendl;
+      r = -EINVAL;
+      goto out_close_fsid;
+    }
+    fsid = old_fsid;
+    dout(1) << __func__ << " already created, fsid is " << fsid << dendl;
+    // r is 0 here (from _read_fsid), so this reports success
+    goto out_close_fsid;
+  }
+
+  r = _open_db(true);
+  if (r < 0)
+    goto out_close_fsid;
+
+  r = write_meta("kv_backend", cct->_conf->kstore_backend);
+  if (r < 0)
+    goto out_close_db;
+
+  r = write_meta("type", "kstore");
+  if (r < 0)
+    goto out_close_db;
+
+  // indicate mkfs completion/success by writing the fsid file
+  r = _write_fsid();
+  if (r == 0)
+    dout(10) << __func__ << " success" << dendl;
+  else
+    derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl;
+
+  // cleanup, in reverse order of acquisition
+ out_close_db:
+  _close_db();
+ out_close_fsid:
+  _close_fsid();
+ out_path_fd:
+  _close_path();
+  return r;
+}
+
+// Mount the store: optionally fsck, open and lock the fsid file, open the
+// database, load the super metadata and collections, then start the
+// finisher and the kv sync thread.
+// Returns 0 on success.  On failure everything opened so far is closed
+// again and a negative value is returned.
+int KStore::mount()
+{
+  dout(1) << __func__ << " path " << path << dendl;
+
+  if (cct->_conf->kstore_fsck_on_mount) {
+    // only a negative fsck result aborts the mount
+    int rc = fsck(cct->_conf->kstore_fsck_on_mount_deep);
+    if (rc < 0)
+      return rc;
+  }
+
+  int r = _open_path();
+  if (r < 0)
+    return r;
+  r = _open_fsid(false);
+  if (r < 0)
+    goto out_path;
+
+  r = _read_fsid(&fsid);
+  if (r < 0)
+    goto out_fsid;
+
+  r = _lock_fsid();
+  if (r < 0)
+    goto out_fsid;
+
+  r = _open_db(false);
+  if (r < 0)
+    goto out_fsid;
+
+  r = _open_super_meta();
+  if (r < 0)
+    goto out_db;
+
+  r = _open_collections();
+  if (r < 0)
+    goto out_db;
+
+  finisher.start();
+  kv_sync_thread.create("kstore_kv_sync");
+
+  mounted = true;
+  return 0;
+
+  // error cleanup, in reverse order of acquisition
+ out_db:
+  _close_db();
+ out_fsid:
+  _close_fsid();
+ out_path:
+  _close_path();
+  return r;
+}
+
+// Unmount the store: wait for queued kv work, drop the collections, stop
+// the kv thread and the finisher, then close the database, the fsid file
+// and the directory.  Asserts that the store is mounted.  Always
+// returns 0.
+int KStore::umount()
+{
+  ceph_assert(mounted);
+  dout(1) << __func__ << dendl;
+
+  // let everything already queued commit before tearing down
+  _sync();
+  _reap_collections();
+  coll_map.clear();
+
+  dout(20) << __func__ << " stopping kv thread" << dendl;
+  _kv_stop();
+  dout(20) << __func__ << " draining finisher" << dendl;
+  finisher.wait_for_empty();
+  dout(20) << __func__ << " stopping finisher" << dendl;
+  finisher.stop();
+  dout(20) << __func__ << " closing" << dendl;
+
+  mounted = false;
+  _close_db();
+  _close_fsid();
+  _close_path();
+  return 0;
+}
+
+/// Consistency check entry point. No checks are performed here: the
+/// function only logs and reports zero errors. `deep` is not used.
+int KStore::fsck(bool deep)
+{
+  dout(1) << __func__ << dendl;
+  const int error_count = 0;
+  dout(1) << __func__ << " finish with " << error_count << " errors" << dendl;
+  return error_count;
+}
+
+/// Block until both the kv queue and the in-flight (committing) list are
+/// empty. Waits on kv_sync_cond while holding kv_lock.
+void KStore::_sync()
+{
+  dout(10) << __func__ << dendl;
+
+  std::unique_lock<std::mutex> l(kv_lock);
+  for (;;) {
+    const bool drained = kv_committing.empty() && kv_queue.empty();
+    if (drained)
+      break;
+    dout(20) << " waiting for kv to commit" << dendl;
+    kv_sync_cond.wait(l);
+  }
+
+  dout(10) << __func__ << " done" << dendl;
+}
+
+/// Fill `buf0` with total and available byte counts obtained from
+/// statfs(2) on `basedir`.
+///
+/// @param buf0   output; reset before being filled
+/// @param alerts optional; cleared if supplied (nothing is reported)
+/// @return 0 on success, -errno if statfs(2) fails (ENOENT is asserted
+///         never to happen)
+int KStore::statfs(struct store_statfs_t* buf0, osd_alert_list_t* alerts)
+{
+  buf0->reset();
+  if (alerts) {
+    alerts->clear(); // returns nothing for now
+  }
+
+  struct statfs fs_info;
+  const int rc = ::statfs(basedir.c_str(), &fs_info);
+  if (rc < 0) {
+    const int err = -errno;
+    ceph_assert(err != -ENOENT);
+    return err;
+  }
+
+  // Block counts are expressed in units of f_bsize.
+  buf0->total = fs_info.f_blocks * fs_info.f_bsize;
+  buf0->available = fs_info.f_bavail * fs_info.f_bsize;
+  return 0;
+}
+
+/// Look up an existing collection by id. The result is whatever
+/// _get_collection() yields, i.e. an empty handle when `cid` is unknown.
+ObjectStore::CollectionHandle KStore::open_collection(const coll_t& cid)
+{
+  CollectionHandle handle = _get_collection(cid);
+  return handle;
+}
+
+/// Allocate a Collection object for `cid` and remember it in
+/// new_coll_map (under coll_lock). Any previous entry for the same id in
+/// new_coll_map is replaced.
+ObjectStore::CollectionHandle KStore::create_new_collection(const coll_t& cid)
+{
+  auto fresh = ceph::make_ref<Collection>(this, cid);
+  {
+    std::unique_lock l{coll_lock};
+    new_coll_map[cid] = fresh;
+  }
+  return fresh;
+}
+
+/// Per-pool statistics are not provided by this store; every call
+/// returns -ENOTSUP and leaves the output arguments untouched.
+int KStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+                        bool *per_pool_omap)
+{
+  const int unsupported = -ENOTSUP;
+  return unsupported;
+}
+
+// ---------------
+// cache
+
+/// Find `cid` in coll_map under a shared coll_lock.
+/// @return the stored reference, or an empty CollectionRef if absent
+KStore::CollectionRef KStore::_get_collection(coll_t cid)
+{
+  std::shared_lock l{coll_lock};
+  auto found = coll_map.find(cid);
+  return found != coll_map.end() ? found->second : CollectionRef();
+}
+
+/// Append `c` to the removed_collections list (guarded by reap_lock) so
+/// that a later _reap_collections() call can process it.
+void KStore::_queue_reap_collection(CollectionRef& c)
+{
+  dout(10) << __func__ << " " << c->cid << dendl;
+
+  std::lock_guard<std::mutex> guard(reap_lock);
+  removed_collections.push_back(c);
+}
+
+/// Process the collections queued by _queue_reap_collection().
+///
+/// For each queued collection, every cached onode is asserted to be
+/// non-existent. If any onode still has transactions in flush_txns the
+/// collection cannot be reaped yet; otherwise its onode_map is cleared.
+///
+/// Previously the first busy collection caused an early return, which
+/// silently discarded it and every collection after it from the queue.
+/// Busy collections are now put back on removed_collections so that a
+/// later call retries them, and the remaining collections are still
+/// examined in this pass.
+///
+/// reap_lock is held for the whole call.
+void KStore::_reap_collections()
+{
+  std::lock_guard<std::mutex> l(reap_lock);
+
+  // Take the current queue; anything not reapable is pushed back below.
+  list<CollectionRef> pending;
+  pending.swap(removed_collections);
+
+  bool all_reaped = true;
+  for (CollectionRef& c : pending) {
+    dout(10) << __func__ << " " << c->cid << dendl;
+
+    bool busy = false;
+    {
+      pair<ghobject_t,OnodeRef> next;
+      while (c->onode_map.get_next(next.first, &next)) {
+        ceph_assert(!next.second->exists);
+        if (!next.second->flush_txns.empty()) {
+          dout(10) << __func__ << " " << c->cid << " " << next.second->oid
+                   << " flush_txns " << next.second->flush_txns << dendl;
+          busy = true;
+          break;
+        }
+      }
+    }
+
+    if (busy) {
+      // Still referenced by an in-flight transaction: retry on a later call.
+      removed_collections.push_back(c);
+      all_reaped = false;
+      continue;
+    }
+
+    c->onode_map.clear();
+    dout(10) << __func__ << " " << c->cid << " done" << dendl;
+  }
+
+  if (all_reaped) {
+    dout(10) << __func__ << " all reaped" << dendl;
+  }
+}
+
+// ---------------
+// read operations
+
+/// Report whether `oid` is present in collection `ch`: the onode must be
+/// found (without creating it) and be flagged as existing. Runs under a
+/// shared lock on the collection.
+bool KStore::exists(CollectionHandle& ch, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+  OnodeRef onode = coll->get_onode(oid, false);
+  return onode && onode->exists;
+}
+
+/// Fill the size-related fields of `st` for object `oid`.
+///
+/// Only st_size, st_blksize (fixed at 4096), st_blocks (size rounded up
+/// to whole blocks) and st_nlink (1) are written; other fields of `st`
+/// are left as the caller passed them. `allow_eio` is not used.
+///
+/// @return 0 on success, -ENOENT if the object is missing
+int KStore::stat(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  struct stat *st,
+  bool allow_eio)
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  OnodeRef onode = coll->get_onode(oid, false);
+  const bool present = onode && onode->exists;
+  if (!present)
+    return -ENOENT;
+
+  st->st_nlink = 1;
+  st->st_size = onode->onode.size;
+  st->st_blksize = 4096;
+  st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+  return 0;
+}
+
+/// Collection options are not supported by this store; always returns
+/// -EOPNOTSUPP without looking at either argument.
+int KStore::set_collection_opts(
+  CollectionHandle& ch,
+  const pool_opts_t& opts)
+{
+  const int unsupported = -EOPNOTSUPP;
+  return unsupported;
+}
+
+/// Read up to `length` bytes of `oid` starting at `offset` into `bl`.
+///
+/// `bl` is cleared first. A request with both offset and length equal to
+/// zero is treated as "read the whole object". The actual work is done
+/// by _do_read() under a shared lock on the collection.
+///
+/// @return the value returned by _do_read(), or -ENOENT if the object
+///         does not exist
+int KStore::read(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t length,
+  bufferlist& bl,
+  uint32_t op_flags)
+{
+  dout(15) << __func__ << " " << ch->cid << " " << oid
+           << " " << offset << "~" << length
+           << dendl;
+  bl.clear();
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = -ENOENT;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (onode && onode->exists) {
+    if (offset == 0 && length == 0)
+      length = onode->onode.size;
+    r = _do_read(onode, offset, length, bl, false, op_flags);
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " " << oid
+           << " " << offset << "~" << length
+           << " = " << r << dendl;
+  return r;
+}
+
+/// Assemble `length` bytes of object data starting at `offset` into `bl`
+/// by reading the object stripe by stripe.
+///
+/// The request is clamped to the object size. Stripes that are missing,
+/// or shorter than the part of them that is requested, are filled with
+/// zeros. If the onode has a stripe_size of 0 the whole (clamped) range
+/// is returned as zeros.
+///
+/// Fixes relative to the previous version:
+///  - The "take the full stripe" shortcut is only valid when the read
+///    starts at the beginning of the stripe. Before, a read starting
+///    inside a short stripe could return bytes from the start of that
+///    stripe whenever the wanted length happened to equal the stored
+///    stripe length.
+///  - The clamp no longer relies on `offset + length`, which can wrap.
+///  - The per-stripe want is computed in 64 bits; truncating `length` to
+///    `unsigned` could yield 0 and make the loop spin forever.
+///
+/// @param o        onode to read from
+/// @param offset   byte offset within the object
+/// @param length   number of bytes requested
+/// @param bl       output buffer; cleared first
+/// @param do_cache forwarded to _do_read_stripe()
+/// @param op_flags not used here
+/// @return number of bytes placed in `bl` (0 when offset is past the end)
+int KStore::_do_read(
+    OnodeRef o,
+    uint64_t offset,
+    size_t length,
+    bufferlist& bl,
+    bool do_cache,
+    uint32_t op_flags)
+{
+  const uint64_t stripe_size = o->onode.stripe_size;
+
+  dout(20) << __func__ << " " << offset << "~" << length << " size "
+           << o->onode.size << " nid " << o->onode.nid << dendl;
+  bl.clear();
+
+  if (offset > o->onode.size) {
+    return 0;
+  }
+  // Clamp without computing offset + length, which may overflow.
+  const uint64_t avail = o->onode.size - offset;
+  if (length > avail) {
+    length = avail;
+  }
+  if (stripe_size == 0) {
+    bl.append_zero(length);
+    return length;
+  }
+
+  o->flush();
+
+  // Only the first stripe can be entered at a non-zero offset.
+  uint64_t stripe_off = offset % stripe_size;
+  while (length > 0) {
+    bufferlist stripe;
+    _do_read_stripe(o, offset - stripe_off, &stripe, do_cache);
+    dout(30) << __func__ << " stripe " << offset - stripe_off << " got "
+             << stripe.length() << dendl;
+    // Bytes wanted from this stripe; at least 1 because length > 0 and
+    // stripe_off < stripe_size.
+    const uint64_t swant =
+      std::min<uint64_t>(stripe_size - stripe_off, length);
+    if (stripe.length()) {
+      if (stripe_off == 0 && swant == stripe.length()) {
+        bl.claim_append(stripe);
+        dout(30) << __func__ << " taking full stripe" << dendl;
+      } else {
+        uint64_t l = 0;
+        if (stripe_off < stripe.length()) {
+          l = std::min<uint64_t>(stripe.length() - stripe_off, swant);
+          bufferlist t;
+          t.substr_of(stripe, stripe_off, l);
+          bl.claim_append(t);
+          dout(30) << __func__ << " taking " << stripe_off << "~" << l << dendl;
+        }
+        if (l < swant) {
+          // Stored stripe ends before the wanted range does: pad with zeros.
+          bl.append_zero(swant - l);
+          dout(30) << __func__ << " adding " << swant - l << " zeros" << dendl;
+        }
+      }
+    } else {
+      dout(30) << __func__ << " generating " << swant << " zeros" << dendl;
+      bl.append_zero(swant);
+    }
+    offset += swant;
+    length -= swant;
+    stripe_off = 0;
+  }
+  const int r = bl.length();
+  dout(30) << " result:\n";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  return r;
+}
+
+/// Bufferlist flavour of fiemap(): computes the extent map with the
+/// map-based overload and, unless that reported an error, encodes the
+/// map into `bl`.
+///
+/// @return the result of the map-based overload
+int KStore::fiemap(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  bufferlist& bl)
+{
+  map<uint64_t, uint64_t> extents;
+  const int r = fiemap(ch, oid, offset, len, extents);
+  if (r < 0)
+    return r;
+  encode(extents, bl);
+  return r;
+}
+
+/// Report the extents of `oid` into `destmap`.
+///
+/// When `offset` lies within the object (offset <= size) a single extent
+/// starting at 0 and spanning the whole object size is recorded,
+/// independent of the requested range; `len` is clamped only for the log
+/// output. When `offset` is past the end, `destmap` is left untouched.
+///
+/// @return 0, or -ENOENT if the collection handle is null or the object
+///         does not exist
+int KStore::fiemap(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  map<uint64_t, uint64_t>& destmap)
+{
+  CollectionRef c = static_cast<Collection*>(ch.get());
+  if (!c)
+    return -ENOENT;
+  std::shared_lock l{c->lock};
+
+  OnodeRef onode = c->get_onode(oid, false);
+  const bool present = onode && onode->exists;
+  if (!present) {
+    return -ENOENT;
+  }
+
+  if (offset <= onode->onode.size) {
+    if (offset + len > onode->onode.size) {
+      len = onode->onode.size - offset;
+    }
+
+    dout(20) << __func__ << " " << offset << "~" << len << " size "
+             << onode->onode.size << dendl;
+
+    // FIXME: do something smarter here
+    destmap[0] = onode->onode.size;
+  }
+
+  dout(20) << __func__ << " " << offset << "~" << len
+           << " size = 0 (" << destmap << ")" << dendl;
+  return 0;
+}
+
+/// Fetch a single extended attribute of `oid`.
+///
+/// @param name  attribute name
+/// @param value receives the attribute value on success
+/// @return 0 on success, -ENOENT if the object is missing, -ENODATA if
+///         the object has no attribute called `name`
+int KStore::getattr(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  const char *name,
+  bufferptr& value)
+{
+  dout(15) << __func__ << " " << ch->cid << " " << oid << " " << name << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r;
+  const string attr_name(name);
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else {
+    auto hit = onode->onode.attrs.find(attr_name);
+    if (hit == onode->onode.attrs.end()) {
+      r = -ENODATA;
+    } else {
+      value = hit->second;
+      r = 0;
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " " << oid << " " << name
+           << " = " << r << dendl;
+  return r;
+}
+
+/// Copy every extended attribute of `oid` into `aset` (replacing its
+/// previous contents).
+///
+/// @return 0 on success, -ENOENT if the object is missing (in which case
+///         `aset` is not modified)
+int KStore::getattrs(
+  CollectionHandle& ch,
+  const ghobject_t& oid,
+  map<string,bufferptr,less<>>& aset)
+{
+  dout(15) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = -ENOENT;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (onode && onode->exists) {
+    aset = onode->onode.attrs;
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " " << oid
+           << " = " << r << dendl;
+  return r;
+}
+
+/// Append the id of every collection in coll_map to `ls` (existing
+/// entries of `ls` are kept). Always returns 0.
+int KStore::list_collections(vector<coll_t>& ls)
+{
+  std::shared_lock l{coll_lock};
+  for (const auto& entry : coll_map) {
+    ls.push_back(entry.first);
+  }
+  return 0;
+}
+
+/// True when `c` is a key of coll_map (checked under a shared coll_lock).
+bool KStore::collection_exists(const coll_t& c)
+{
+  std::shared_lock l{coll_lock};
+  const bool known = coll_map.find(c) != coll_map.end();
+  return known;
+}
+
+/// Determine whether collection `ch` contains any object by asking
+/// collection_list() for at most one entry over the full range.
+///
+/// @param empty set to true when no object was listed
+/// @return 0 on success, or the negative error from collection_list()
+///         (`*empty` is not written in that case)
+int KStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+  dout(15) << __func__ << " " << ch->cid << dendl;
+
+  vector<ghobject_t> first_entry;
+  ghobject_t next;
+  const int r = collection_list(ch, ghobject_t(), ghobject_t::get_max(), 1,
+                                &first_entry, &next);
+  if (r < 0) {
+    derr << __func__ << " collection_list returned: " << cpp_strerror(r)
+         << dendl;
+    return r;
+  }
+
+  *empty = first_entry.empty();
+  dout(10) << __func__ << " " << ch->cid << " = " << (int)(*empty) << dendl;
+  return 0;
+}
+
+/// Return the `bits` value stored in the collection's cnode, read under
+/// a shared lock on the collection.
+int KStore::collection_bits(CollectionHandle& ch)
+{
+  dout(15) << __func__ << " " << ch->cid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+  dout(10) << __func__ << " " << ch->cid << " = " << coll->cnode.bits << dendl;
+  return coll->cnode.bits;
+}
+
+/// Public listing entry point: flushes the collection, then runs
+/// _collection_list() while holding a shared lock on it.
+///
+/// @param start,end range of objects to list
+/// @param max       maximum number of entries to return
+/// @param ls        receives the listed objects
+/// @param pnext     optional; receives the continuation position
+/// @return the result of _collection_list()
+int KStore::collection_list(
+  CollectionHandle &c_, const ghobject_t& start, const ghobject_t& end, int max,
+  vector<ghobject_t> *ls, ghobject_t *pnext)
+
+{
+  Collection *coll = static_cast<Collection*>(c_.get());
+  coll->flush();
+  dout(15) << __func__ << " " << coll->cid
+           << " start " << start << " end " << end << " max " << max << dendl;
+
+  // The lock is released before the result is logged.
+  const int r = [&] {
+    std::shared_lock l{coll->lock};
+    return _collection_list(coll, start, end, max, ls, pnext);
+  }();
+
+  dout(10) << __func__ << " " << coll->cid
+    << " start " << start << " end " << end << " max " << max
+    << " = " << r << ", ls.size() = " << ls->size()
+    << ", next = " << (pnext ? *pnext : ghobject_t())  << dendl;
+  return r;
+}
+
+/// List objects of collection `c` by iterating the PREFIX_OBJ keys.
+///
+/// get_coll_key_range() yields two key ranges for the collection: one
+/// for temp objects and one for regular objects. The scan begins in the
+/// range that `start` belongs to (the temp range when `start` is the
+/// default/minimum object) and, once the temp range is exhausted, moves
+/// on to the regular range unless `end` itself is a temp object.
+///
+/// @param start first object to consider
+/// @param end   upper bound for the scan (keys >= its key stop the scan)
+/// @param max   stop once `ls` holds this many entries
+/// @param ls    listed objects are appended here
+/// @param pnext optional; set to the object at which a later call should
+///              resume, or to ghobject_t::get_max() when nothing was
+///              recorded as next
+/// @return the function-scope `r`, which is never changed from 0 (the
+///         `int r` declarations inside the loop shadow it)
+int KStore::_collection_list(
+  Collection* c, const ghobject_t& start, const ghobject_t& end, int max,
+  vector<ghobject_t> *ls, ghobject_t *pnext)
+{
+  int r = 0;
+  KeyValueDB::Iterator it;
+  string temp_start_key, temp_end_key;
+  string start_key, end_key;
+  bool set_next = false;
+  string pend;   // key at which the current scan stops
+  bool temp;     // true while scanning the temp key range
+
+  // Let the rest of the function write through pnext unconditionally.
+  ghobject_t static_next;
+  if (!pnext)
+    pnext = &static_next;
+
+  // Starting at the maximum object: nothing to list.
+  if (start == ghobject_t::get_max() ||
+    start.hobj.is_max()) {
+    goto out;
+  }
+  get_coll_key_range(c->cid, c->cnode.bits, &temp_start_key, &temp_end_key,
+		     &start_key, &end_key);
+  dout(20) << __func__
+	   << " range " << pretty_binary_string(temp_start_key)
+	   << " to " << pretty_binary_string(temp_end_key)
+	   << " and " << pretty_binary_string(start_key)
+	   << " to " << pretty_binary_string(end_key)
+	   << " start " << start << dendl;
+  it = db->get_iterator(PREFIX_OBJ);
+  // Position the iterator: at the head of the temp range for a default
+  // start, otherwise at the key of `start` in whichever range it is in.
+  if (start == ghobject_t() || start == c->cid.get_min_hobj()) {
+    it->upper_bound(temp_start_key);
+    temp = true;
+  } else {
+    string k;
+    get_object_key(cct, start, &k);
+    if (start.hobj.is_temp()) {
+      temp = true;
+      ceph_assert(k >= temp_start_key && k < temp_end_key);
+    } else {
+      temp = false;
+      ceph_assert(k >= start_key && k < end_key);
+    }
+    dout(20) << " start from " << pretty_binary_string(k)
+	     << " temp=" << (int)temp << dendl;
+    it->lower_bound(k);
+  }
+  // Choose the stop key for the range currently being scanned.
+  if (end.hobj.is_max()) {
+    pend = temp ? temp_end_key : end_key;
+  } else {
+    if (end.hobj.is_temp()) {
+      if (temp)
+        get_object_key(cct, end, &pend);
+      else
+	goto out;
+    } else {
+      if (temp)
+        pend = temp_end_key;
+      else
+        get_object_key(cct, end, &pend);
+    }
+  }
+  dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+  while (true) {
+    // Reached the stop key (or ran off the end of the db).
+    if (!it->valid() || it->key() >= pend) {
+      if (!it->valid())
+	dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
+      else
+	dout(20) << __func__ << " key " << pretty_binary_string(it->key())
+		 << " > " << end << dendl;
+      if (temp) {
+	if (end.hobj.is_temp()) {
+          // `end` is a temp object, so the scan finishes inside the temp
+          // range; record the current key as next if still in that range.
+          if (it->valid() && it->key() < temp_end_key) {
+            int r = get_key_object(it->key(), pnext);
+            ceph_assert(r == 0);
+            set_next = true;
+          }
+	  break;
+	}
+	// Temp range done: restart the scan in the regular key range.
+	dout(30) << __func__ << " switch to non-temp namespace" << dendl;
+	temp = false;
+	it->upper_bound(start_key);
+        if (end.hobj.is_max())
+          pend = end_key;
+        else
+          get_object_key(cct, end, &pend);
+	dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+	continue;
+      }
+      // Regular range done; record next if the key is still inside it.
+      if (it->valid() && it->key() < end_key) {
+        int r = get_key_object(it->key(), pnext);
+        ceph_assert(r == 0);
+        set_next = true;
+      }
+      break;
+    }
+    dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl;
+    ghobject_t oid;
+    int r = get_key_object(it->key(), &oid);
+    ceph_assert(r == 0);
+    // Enough entries collected: the current object becomes `next`.
+    if (ls->size() >= (unsigned)max) {
+      dout(20) << __func__ << " reached max " << max << dendl;
+      *pnext = oid;
+      set_next = true;
+      break;
+    }
+    ls->push_back(oid);
+    it->next();
+  }
+out:
+  // No continuation point was recorded: report the maximum object.
+  if (!set_next) {
+    *pnext = ghobject_t::get_max();
+  }
+  return r;
+}
+
+// omap reads
+
+/// Build an omap iterator for onode `o` in collection `c` on top of the
+/// key/value iterator `it`.
+///
+/// If the onode has an omap (omap_head is non-zero) the head and tail
+/// keys are computed and the underlying iterator is positioned at the
+/// head; otherwise the underlying iterator is left where it is.
+KStore::OmapIteratorImpl::OmapIteratorImpl(
+  CollectionRef c, OnodeRef o, KeyValueDB::Iterator it)
+  : c(c), o(o), it(it)
+{
+  std::shared_lock l{c->lock};
+  if (!o->onode.omap_head)
+    return;
+
+  get_omap_key(o->onode.omap_head, string(), &head);
+  get_omap_tail(o->onode.omap_head, &tail);
+  it->lower_bound(head);
+}
+
+/// Reposition at the first omap key. When the onode has no omap the
+/// underlying iterator is replaced by a default-constructed one.
+/// Always returns 0.
+int KStore::OmapIteratorImpl::seek_to_first()
+{
+  std::shared_lock l{c->lock};
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  it->lower_bound(head);
+  return 0;
+}
+
+/// Position the underlying iterator via its upper_bound() on the db key
+/// derived from user key `after`. When the onode has no omap the
+/// underlying iterator is replaced by a default-constructed one.
+/// Always returns 0.
+int KStore::OmapIteratorImpl::upper_bound(const string& after)
+{
+  std::shared_lock l{c->lock};
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  string db_key;
+  get_omap_key(o->onode.omap_head, after, &db_key);
+  it->upper_bound(db_key);
+  return 0;
+}
+
+/// Position the underlying iterator via its lower_bound() on the db key
+/// derived from user key `to`. When the onode has no omap the underlying
+/// iterator is replaced by a default-constructed one.
+/// Always returns 0.
+int KStore::OmapIteratorImpl::lower_bound(const string& to)
+{
+  std::shared_lock l{c->lock};
+  if (!o->onode.omap_head) {
+    it = KeyValueDB::Iterator();
+    return 0;
+  }
+  string db_key;
+  get_omap_key(o->onode.omap_head, to, &db_key);
+  it->lower_bound(db_key);
+  return 0;
+}
+
+/// True when the onode has an omap, the underlying iterator is valid and
+/// its raw key does not sort after the tail key.
+bool KStore::OmapIteratorImpl::valid()
+{
+  std::shared_lock l{c->lock};
+  return o->onode.omap_head &&
+         it->valid() &&
+         it->raw_key().second <= tail;
+}
+
+/// Advance the underlying iterator by one entry.
+/// @return 0 after advancing, or -1 if the onode has no omap
+int KStore::OmapIteratorImpl::next()
+{
+  std::shared_lock l{c->lock};
+  if (!o->onode.omap_head)
+    return -1;
+  it->next();
+  return 0;
+}
+
+/// Return the user-visible key at the current position, obtained by
+/// decoding the raw db key. Asserts that the underlying iterator is valid.
+string KStore::OmapIteratorImpl::key()
+{
+  std::shared_lock l{c->lock};
+  ceph_assert(it->valid());
+
+  const string raw = it->raw_key().second;
+  string decoded;
+  decode_omap_key(raw, &decoded);
+  return decoded;
+}
+
+/// Return the value at the current position. Asserts that the
+/// underlying iterator is valid.
+bufferlist KStore::OmapIteratorImpl::value()
+{
+  std::shared_lock l{c->lock};
+  const bool positioned = it->valid();
+  ceph_assert(positioned);
+  return it->value();
+}
+
+/// Read the omap header and all omap key/value pairs of `oid`.
+///
+/// The PREFIX_OMAP keys between the object's omap header key and its
+/// tail key are scanned: the entry equal to the header key is stored in
+/// `*header`, every other entry is decoded and stored in `*out`.
+/// Nothing is written when the object has no omap.
+///
+/// @return 0 on success (including "no omap"), -ENOENT if the object is
+///         missing
+int KStore::omap_get(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  map<string, bufferlist> *out /// < [out] Key to value map
+  )
+{
+  dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = 0;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else if (onode->onode.omap_head) {
+    onode->flush();
+
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+    string head, tail;
+    get_omap_header(onode->onode.omap_head, &head);
+    get_omap_tail(onode->onode.omap_head, &tail);
+
+    for (it->lower_bound(head); it->valid(); it->next()) {
+      if (it->key() == head) {
+        dout(30) << __func__ << "  got header" << dendl;
+        *header = it->value();
+        continue;
+      }
+      if (it->key() >= tail) {
+        dout(30) << __func__ << "  reached tail" << dendl;
+        break;
+      }
+      string user_key;
+      decode_omap_key(it->key(), &user_key);
+      dout(30) << __func__ << "  got " << pretty_binary_string(it->key())
+               << " -> " << user_key << dendl;
+      ceph_assert(it->key() < tail);
+      (*out)[user_key] = it->value();
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+/// Read only the omap header of `oid` into `*header`.
+///
+/// A failed db lookup of the header key is logged as "no header" and is
+/// not treated as an error. `allow_eio` is not used.
+///
+/// @return 0 on success (including "no omap" / "no header"), -ENOENT if
+///         the object is missing
+int KStore::omap_get_header(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  bufferlist *header,      ///< [out] omap header
+  bool allow_eio ///< [in] don't assert on eio
+  )
+{
+  dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = 0;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else if (onode->onode.omap_head) {
+    onode->flush();
+
+    string head;
+    get_omap_header(onode->onode.omap_head, &head);
+    const bool found = db->get(PREFIX_OMAP, head, header) >= 0;
+    if (found) {
+      dout(30) << __func__ << "  got header" << dendl;
+    } else {
+      dout(30) << __func__ << "  no header" << dendl;
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+/// Collect every omap key of `oid` into `*keys`.
+///
+/// Scans the PREFIX_OMAP keys from the object's first omap key up to
+/// (not including) its tail key and inserts each decoded user key.
+///
+/// @return 0 on success (including "no omap"), -ENOENT if the object is
+///         missing
+int KStore::omap_get_keys(
+  CollectionHandle& ch,              ///< [in] Collection containing oid
+  const ghobject_t &oid, ///< [in] Object containing omap
+  set<string> *keys      ///< [out] Keys defined on oid
+  )
+{
+  dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = 0;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else if (onode->onode.omap_head) {
+    onode->flush();
+
+    KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+    string head, tail;
+    get_omap_key(onode->onode.omap_head, string(), &head);
+    get_omap_tail(onode->onode.omap_head, &tail);
+
+    for (it->lower_bound(head); it->valid(); it->next()) {
+      if (it->key() >= tail) {
+        dout(30) << __func__ << "  reached tail" << dendl;
+        break;
+      }
+      string user_key;
+      decode_omap_key(it->key(), &user_key);
+      dout(30) << __func__ << "  got " << pretty_binary_string(it->key())
+               << " -> " << user_key << dendl;
+      ceph_assert(it->key() < tail);
+      keys->insert(user_key);
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+/// Look up the given omap keys of `oid` one by one and insert every key
+/// that is found, together with its value, into `*out`. Keys for which
+/// the db lookup fails are skipped silently.
+///
+/// @return 0 on success (including "no omap"), -ENOENT if the object is
+///         missing
+int KStore::omap_get_values(
+  CollectionHandle& ch,                    ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const set<string> &keys,     ///< [in] Keys to get
+  map<string, bufferlist> *out ///< [out] Returned keys and values
+  )
+{
+  dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = 0;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else if (onode->onode.omap_head) {
+    onode->flush();
+    for (const string& user_key : keys) {
+      string db_key;
+      get_omap_key(onode->onode.omap_head, user_key, &db_key);
+      bufferlist val;
+      if (db->get(PREFIX_OMAP, db_key, &val) < 0)
+        continue;
+      dout(30) << __func__ << "  got " << pretty_binary_string(db_key)
+               << " -> " << user_key << dendl;
+      out->insert(make_pair(user_key, val));
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+/// Determine which of the given omap keys exist on `oid`: each key is
+/// looked up in the db and, when found, inserted into `*out`.
+///
+/// @return 0 on success (including "no omap"), -ENOENT if the object is
+///         missing
+int KStore::omap_check_keys(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  const set<string> &keys, ///< [in] Keys to check
+  set<string> *out         ///< [out] Subset of keys defined on oid
+  )
+{
+  dout(15) << __func__ << " " << ch->cid << " oid " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  int r = 0;
+  OnodeRef onode = coll->get_onode(oid, false);
+  if (!onode || !onode->exists) {
+    r = -ENOENT;
+  } else if (onode->onode.omap_head) {
+    onode->flush();
+    for (const string& user_key : keys) {
+      string db_key;
+      get_omap_key(onode->onode.omap_head, user_key, &db_key);
+      bufferlist val;
+      const bool present = db->get(PREFIX_OMAP, db_key, &val) >= 0;
+      if (present) {
+        dout(30) << __func__ << "  have " << pretty_binary_string(db_key)
+                 << " -> " << user_key << dendl;
+        out->insert(user_key);
+      } else {
+        dout(30) << __func__ << "  miss " << pretty_binary_string(db_key)
+                 << " -> " << user_key << dendl;
+      }
+    }
+  }
+
+  dout(10) << __func__ << " " << ch->cid << " oid " << oid << " = " << r << dendl;
+  return r;
+}
+
+/// Create an iterator over the omap of `oid`.
+///
+/// The onode is flushed first, then a PREFIX_OMAP db iterator is wrapped
+/// in an OmapIteratorImpl. The collection's shared lock is held only for
+/// the duration of this call.
+///
+/// Fix: the "doesn't exist" log line was missing the space between the
+/// object id and the message text.
+///
+/// @return the new iterator, or an empty ObjectMapIterator if the object
+///         does not exist
+ObjectMap::ObjectMapIterator KStore::get_omap_iterator(
+  CollectionHandle& ch,              ///< [in] collection
+  const ghobject_t &oid  ///< [in] object
+  )
+{
+
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *c = static_cast<Collection*>(ch.get());
+  std::shared_lock l{c->lock};
+  OnodeRef o = c->get_onode(oid, false);
+  if (!o || !o->exists) {
+    dout(10) << __func__ << " " << oid << " doesn't exist" << dendl;
+    return ObjectMap::ObjectMapIterator();
+  }
+  o->flush();
+  dout(10) << __func__ << " header = " << o->onode.omap_head << dendl;
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+  return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it));
+}
+
+
+// -----------------
+// write helpers
+
+/// Load the persisted nid allocation high-water mark.
+///
+/// nid_max is reset to 0 and then decoded from the "nid_max" key under
+/// PREFIX_SUPER. The result of the db lookup is not checked and a decode
+/// error is swallowed on purpose. nid_last is then set to nid_max.
+/// Always returns 0.
+int KStore::_open_super_meta()
+{
+  // nid
+  bufferlist raw;
+  nid_max = 0;
+  db->get(PREFIX_SUPER, "nid_max", &raw);
+  try {
+    auto cursor = raw.cbegin();
+    decode(nid_max, cursor);
+  } catch (ceph::buffer::error& e) {
+    // deliberately ignored; nid_max keeps whatever value it has here
+  }
+  dout(10) << __func__ << " old nid_max " << nid_max << dendl;
+  nid_last = nid_max;
+
+  return 0;
+}
+
+/// Give onode `o` a nid if it does not have one yet.
+///
+/// The next id is taken from nid_last under nid_lock. When the new id
+/// exceeds nid_max, nid_max is raised by kstore_nid_prealloc and the new
+/// value is written to the "nid_max" super key through `txc`'s
+/// transaction.
+void KStore::_assign_nid(TransContext *txc, OnodeRef o)
+{
+  if (o->onode.nid)
+    return;
+
+  std::lock_guard<std::mutex> guard(nid_lock);
+  o->onode.nid = ++nid_last;
+  dout(20) << __func__ << " " << o->oid << " nid " << o->onode.nid << dendl;
+
+  // Still inside the range already recorded in the db: nothing to persist.
+  if (nid_last <= nid_max)
+    return;
+
+  nid_max += cct->_conf->kstore_nid_prealloc;
+  bufferlist encoded;
+  encode(nid_max, encoded);
+  txc->t->set(PREFIX_SUPER, "nid_max", encoded);
+  dout(10) << __func__ << " nid_max now " << nid_max << dendl;
+}
+
+/// Allocate a TransContext bound to `osr`, give it a fresh db
+/// transaction and enqueue it on the sequencer.
+/// @return the new context (heap allocated with `new`)
+KStore::TransContext *KStore::_txc_create(OpSequencer *osr)
+{
+  TransContext *ctx = new TransContext(osr);
+  ctx->t = db->get_transaction();
+  osr->queue_new(ctx);
+
+  dout(20) << __func__ << " osr " << osr << " = " << ctx << dendl;
+  return ctx;
+}
+
+/// Advance `txc` through its state machine.
+///
+/// Each pass of the loop handles the current state, records the latency
+/// of the state being left, and either moves on to the next state or
+/// returns:
+///  - PREPARE: moves to KV_QUEUED. In the default (asynchronous) mode
+///    the txc is pushed onto kv_queue for the kv sync thread and the
+///    function returns; with kstore_sync_transaction set, the
+///    transaction is committed synchronously here and processing
+///    continues with the next loop pass.
+///  - KV_QUEUED: runs _txc_finish_kv() and falls through.
+///  - KV_DONE: falls through.
+///  - FINISHING: runs _txc_finish() and returns.
+/// Any other state aborts.
+void KStore::_txc_state_proc(TransContext *txc)
+{
+  while (true) {
+    dout(10) << __func__ << " txc " << txc
+	     << " " << txc->get_state_name() << dendl;
+    switch (txc->state) {
+    case TransContext::STATE_PREPARE:
+      txc->log_state_latency(logger, l_kstore_state_prepare_lat);
+      txc->state = TransContext::STATE_KV_QUEUED;
+      if (!cct->_conf->kstore_sync_transaction) {
+	// Asynchronous path: hand the txc to the kv sync thread.
+	std::lock_guard<std::mutex> l(kv_lock);
+	if (cct->_conf->kstore_sync_submit_transaction) {
+	  // Submit (without sync) right away, from the calling thread.
+          int r = db->submit_transaction(txc->t);
+	  ceph_assert(r == 0);
+	}
+	kv_queue.push_back(txc);
+	kv_cond.notify_one();
+	return;
+      }
+      {
+	// Synchronous path: commit here, then loop into KV_QUEUED.
+	int r = db->submit_transaction_sync(txc->t);
+	ceph_assert(r == 0);
+      }
+      break;
+
+    case TransContext::STATE_KV_QUEUED:
+      txc->log_state_latency(logger, l_kstore_state_kv_queued_lat);
+      txc->state = TransContext::STATE_KV_DONE;
+      _txc_finish_kv(txc);
+      // ** fall-thru **
+
+    case TransContext::STATE_KV_DONE:
+      txc->log_state_latency(logger, l_kstore_state_kv_done_lat);
+      txc->state = TransContext::STATE_FINISHING;
+      // ** fall-thru **
+
+    case TransContext::TransContext::STATE_FINISHING:
+      txc->log_state_latency(logger, l_kstore_state_finishing_lat);
+      _txc_finish(txc);
+      return;
+
+    default:
+      derr << __func__ << " unexpected txc " << txc
+	   << " state " << txc->get_state_name() << dendl;
+      ceph_abort_msg("unexpected txc state");
+      return;
+    }
+  }
+}
+
+/// Finish preparing `txc` before submission: for every onode touched by
+/// the transaction, encode the onode and add it to the db transaction
+/// under PREFIX_OBJ, then register `txc` in the onode's flush_txns set
+/// (under the onode's flush_lock).
+void KStore::_txc_finalize(OpSequencer *osr, TransContext *txc)
+{
+  dout(20) << __func__ << " osr " << osr << " txc " << txc
+           << " onodes " << txc->onodes << dendl;
+
+  // finalize onodes
+  for (const OnodeRef& o : txc->onodes) {
+    bufferlist encoded;
+    encode(o->onode, encoded);
+    dout(20) << " onode size is " << encoded.length() << dendl;
+    txc->t->set(PREFIX_OBJ, o->key, encoded);
+
+    std::lock_guard<std::mutex> guard(o->flush_lock);
+    o->flush_txns.insert(txc);
+  }
+}
+
+/// Fire the completion callbacks of `txc` and release its throttle
+/// budget.
+///
+/// onreadable_sync is completed inline; onreadable, oncommit and the
+/// oncommits list are handed to the finisher. Each single callback
+/// pointer is cleared after it has been dispatched.
+void KStore::_txc_finish_kv(TransContext *txc)
+{
+  dout(20) << __func__ << " txc " << txc << dendl;
+
+  // warning: we're calling onreadable_sync inside the sequencer lock
+  if (Context *sync_cb = txc->onreadable_sync) {
+    sync_cb->complete(0);
+    txc->onreadable_sync = nullptr;
+  }
+  if (Context *readable_cb = txc->onreadable) {
+    finisher.queue(readable_cb);
+    txc->onreadable = nullptr;
+  }
+  if (Context *commit_cb = txc->oncommit) {
+    finisher.queue(commit_cb);
+    txc->oncommit = nullptr;
+  }
+  if (!txc->oncommits.empty()) {
+    finisher.queue(txc->oncommits);
+  }
+
+  throttle_ops.put(txc->ops);
+  throttle_bytes.put(txc->bytes);
+}
+
+/// Final stage for a transaction in STATE_FINISHING.
+///
+/// Removes `txc` from the flush_txns set of every onode it touched
+/// (waking flush waiters and clearing pending stripes when a set becomes
+/// empty), drops the onode references, queues removed collections for
+/// reaping, marks the txc STATE_DONE under the sequencer's qlock and
+/// finally lets _osr_reap_done() retire finished contexts.
+void KStore::_txc_finish(TransContext *txc)
+{
+  dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl;
+  ceph_assert(txc->state == TransContext::STATE_FINISHING);
+
+  for (const OnodeRef& o : txc->onodes) {
+    std::lock_guard<std::mutex> guard(o->flush_lock);
+    dout(20) << __func__ << " onode " << o << " had " << o->flush_txns
+             << dendl;
+    ceph_assert(o->flush_txns.count(txc));
+    o->flush_txns.erase(txc);
+    if (!o->flush_txns.empty())
+      continue;
+    o->flush_cond.notify_all();
+    o->clear_pending_stripes();
+  }
+
+  // clear out refs
+  txc->onodes.clear();
+
+  while (!txc->removed_collections.empty()) {
+    _queue_reap_collection(txc->removed_collections.front());
+    txc->removed_collections.pop_front();
+  }
+
+  // Keep the sequencer referenced while the txc may be deleted below.
+  OpSequencerRef osr = txc->osr;
+  {
+    std::lock_guard<std::mutex> guard(osr->qlock);
+    txc->state = TransContext::STATE_DONE;
+  }
+
+  _osr_reap_done(osr.get());
+}
+
+/// Retire finished transaction contexts from the front of `osr`'s queue.
+///
+/// Holding osr->qlock, contexts are popped and deleted in order for as
+/// long as the front one is in STATE_DONE. Before a context is removed,
+/// the onode map of its first collection (if any) is trimmed to
+/// kstore_onode_map_size. Waiters on osr->qcond are notified after each
+/// removal.
+void KStore::_osr_reap_done(OpSequencer *osr)
+{
+  std::lock_guard<std::mutex> guard(osr->qlock);
+  dout(20) << __func__ << " osr " << osr << dendl;
+
+  for (;;) {
+    if (osr->q.empty())
+      break;
+
+    TransContext *head = &osr->q.front();
+    dout(20) << __func__ << "  txc " << head << " " << head->get_state_name()
+             << dendl;
+    // Only a contiguous run of finished contexts at the front is reaped.
+    if (head->state != TransContext::STATE_DONE)
+      break;
+
+    if (head->first_collection) {
+      head->first_collection->onode_map.trim(cct->_conf->kstore_onode_map_size);
+    }
+
+    osr->q.pop_front();
+    head->log_state_latency(logger, l_kstore_state_done_lat);
+    delete head;
+    osr->qcond.notify_all();
+    if (osr->q.empty()) {
+      dout(20) << __func__ << " osr " << osr << " q now empty" << dendl;
+    }
+  }
+}
+
+/// Body of the kv sync thread.
+///
+/// Loops with kv_lock held. When kv_queue is empty the thread either
+/// exits (if kv_stop is set) or wakes kv_sync_cond waiters and sleeps on
+/// kv_cond. Otherwise the queued contexts are moved to kv_committing and
+/// the lock is dropped while they are committed: each context's
+/// transaction is submitted (unless kstore_sync_submit_transaction is
+/// set, in which case _txc_state_proc() already submitted it), an empty
+/// transaction is then submitted synchronously, and every committed
+/// context is advanced with _txc_state_proc(). Removed collections are
+/// reaped before the lock is taken again.
+void KStore::_kv_sync_thread()
+{
+  dout(10) << __func__ << " start" << dendl;
+  std::unique_lock<std::mutex> l(kv_lock);
+  while (true) {
+    ceph_assert(kv_committing.empty());
+    if (kv_queue.empty()) {
+      if (kv_stop)
+	break;
+      dout(20) << __func__ << " sleep" << dendl;
+      // Nothing queued or committing: let _sync() waiters re-check.
+      kv_sync_cond.notify_all();
+      kv_cond.wait(l);
+      dout(20) << __func__ << " wake" << dendl;
+    } else {
+      dout(20) << __func__ << " committing " << kv_queue.size() << dendl;
+      // Take the whole batch; new arrivals go to the now-empty kv_queue.
+      kv_committing.swap(kv_queue);
+      utime_t start = ceph_clock_now();
+      l.unlock();
+
+      dout(30) << __func__ << " committing txc " << kv_committing << dendl;
+
+      // one transaction to force a sync
+      KeyValueDB::Transaction t = db->get_transaction();
+      if (!cct->_conf->kstore_sync_submit_transaction) {
+	for (std::deque<TransContext *>::iterator it = kv_committing.begin();
+	     it != kv_committing.end();
+	     ++it) {
+	  int r = db->submit_transaction((*it)->t);
+	  ceph_assert(r == 0);
+	}
+      }
+      int r = db->submit_transaction_sync(t);
+      ceph_assert(r == 0);
+      utime_t finish = ceph_clock_now();
+      utime_t dur = finish - start;
+      dout(20) << __func__ << " committed " << kv_committing.size()
+	       << " in " << dur << dendl;
+      // Advance each committed context; it is removed from kv_committing
+      // only after its state processing has run.
+      while (!kv_committing.empty()) {
+	TransContext *txc = kv_committing.front();
+	_txc_state_proc(txc);
+	kv_committing.pop_front();
+      }
+
+      // this is as good a place as any ...
+      _reap_collections();
+
+      l.lock();
+    }
+  }
+  dout(10) << __func__ << " finish" << dendl;
+}
+
+
+// ---------------------------
+// transactions
+
+/// Submit a batch of transactions against collection `ch`.
+///
+/// The callbacks of all transactions are collected, a TransContext is
+/// created on the collection's sequencer, every transaction is added to
+/// it, the context is finalized, throttle budget is acquired and the
+/// state machine is started. `op` and `handle` are not used here.
+///
+/// @return always 0
+int KStore::queue_transactions(
+  CollectionHandle& ch,
+  vector<Transaction>& tls,
+  TrackedOpRef op,
+  ThreadPool::TPHandle *handle)
+{
+  Context *onreadable;
+  Context *ondisk;
+  Context *onreadable_sync;
+  ObjectStore::Transaction::collect_contexts(
+    tls, &onreadable, &ondisk, &onreadable_sync);
+
+  // set up the sequencer
+  Collection *coll = static_cast<Collection*>(ch.get());
+  OpSequencer *osr = coll->osr.get();
+  dout(10) << __func__ << " ch " << ch.get() << " " << coll->cid << dendl;
+
+  // prepare
+  TransContext *txc = _txc_create(osr);
+  txc->onreadable = onreadable;
+  txc->onreadable_sync = onreadable_sync;
+  txc->oncommit = ondisk;
+
+  for (Transaction& tx : tls) {
+    txc->ops += tx.get_num_ops();
+    txc->bytes += tx.get_num_bytes();
+    _txc_add_transaction(txc, &tx);
+  }
+
+  _txc_finalize(osr, txc);
+
+  // Budget is taken here and given back in _txc_finish_kv().
+  throttle_ops.get(txc->ops);
+  throttle_bytes.get(txc->bytes);
+
+  // execute (start)
+  _txc_state_proc(txc);
+  return 0;
+}
+
+/// Apply every op of transaction `t` to the transaction context `txc`.
+///
+/// The collections referenced by the transaction are resolved up front; onodes
+/// are looked up (and, for creating ops, created) on first use.  Each op is
+/// first matched against the collection-level ops; anything else is handled as
+/// an object-level op while holding the collection's lock.  A failing
+/// collection op, or an object op failing with anything other than a tolerated
+/// -ENOENT / -ENODATA, dumps the transaction to the log and aborts.
+void KStore::_txc_add_transaction(TransContext *txc, Transaction *t)
+{
+  Transaction::iterator i = t->begin();
+
+  // Dump the whole transaction at debug level 30.
+  // NOTE(review): the statements between dout() and dendl appear to rely on
+  // the dout macro opening a scope that dendl closes -- confirm before
+  // restructuring.
+  dout(30) << __func__ << " transaction dump:\n";
+  JSONFormatter f(true);
+  f.open_object_section("transaction");
+  t->dump(&f);
+  f.close_section();
+  f.flush(*_dout);
+  *_dout << dendl;
+
+  // resolve every collection the transaction references; cvec is indexed the
+  // same way as i.colls (i.e. by op->cid / op->dest_cid)
+  vector<CollectionRef> cvec(i.colls.size());
+  unsigned j = 0;
+  for (vector<coll_t>::iterator p = i.colls.begin(); p != i.colls.end();
+       ++p, ++j) {
+    cvec[j] = _get_collection(*p);
+
+    // note first collection we reference
+    if (!j && !txc->first_collection)
+      txc->first_collection = cvec[j];
+  }
+  // onode cache for this transaction, indexed by op->oid; filled lazily below
+  vector<OnodeRef> ovec(i.objects.size());
+
+  for (int pos = 0; i.have_op(); ++pos) {
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+
+    // no coll or obj
+    if (op->op == Transaction::OP_NOP)
+      continue;
+
+    // collection operations
+    // Each handled collection op `continue`s to the next op on success; if it
+    // leaves the switch with r < 0 the error block below aborts.  Ops that
+    // match no case here fall through to the object section with r == 0.
+    CollectionRef &c = cvec[op->cid];
+    switch (op->op) {
+    case Transaction::OP_RMCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	r = _remove_collection(txc, cid, &c);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+	// the collection must not be resolvable yet
+	ceph_assert(!c);
+        coll_t cid = i.get_cid(op->cid);
+	r = _create_collection(txc, cid, op->split_bits, &c);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+	r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        uint32_t bits = op->split_bits;
+	r = _merge_collection(txc, &c, cvec[op->dest_cid], bits);
+	if (!r)
+	  continue;
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        // hints are decoded (to consume the payload) and logged, nothing more
+        uint32_t type = op->hint;
+        bufferlist hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          dout(10) << __func__ << " collection hint objects is a no-op, "
+		   << " pg_num " << pg_num << " num_objects " << num_objs
+		   << dendl;
+        } else {
+          // Ignore the hint
+          dout(10) << __func__ << " unknown collection hint " << type << dendl;
+        }
+	continue;
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      r = -EOPNOTSUPP;
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      ceph_abort_msg("not implemented");
+      break;
+    }
+    // any collection-op failure is fatal: log, dump the transaction, abort
+    if (r < 0) {
+      derr << " error " << cpp_strerror(r)
+	   << " not handled on operation " << op->op
+	   << " (op " << pos << ", counting from 0)" << dendl;
+      dout(0) << " transaction dump:\n";
+      JSONFormatter f(true);
+      f.open_object_section("transaction");
+      t->dump(&f);
+      f.close_section();
+      f.flush(*_dout);
+      *_dout << dendl;
+      ceph_abort_msg("unexpected error");
+    }
+
+    // object operations
+    // the collection lock is held exclusively for the rest of this iteration
+    std::unique_lock l{c->lock};
+    OnodeRef &o = ovec[op->oid];
+    if (!o) {
+      // these operations implicitly create the object
+      bool create = false;
+      if (op->op == Transaction::OP_TOUCH ||
+	  op->op == Transaction::OP_CREATE ||
+	  op->op == Transaction::OP_WRITE ||
+	  op->op == Transaction::OP_ZERO) {
+	create = true;
+      }
+      ghobject_t oid = i.get_oid(op->oid);
+      o = c->get_onode(oid, create);
+      if (!create) {
+	// non-creating ops on a missing object fail with -ENOENT, which the
+	// endop handling below tolerates for most op types
+	if (!o || !o->exists) {
+	  dout(10) << __func__ << " op " << op->op << " got ENOENT on "
+		   << oid << dendl;
+	  r = -ENOENT;
+	  goto endop;
+	}
+      }
+    }
+
+    switch (op->op) {
+    case Transaction::OP_TOUCH:
+    case Transaction::OP_CREATE:
+	r = _touch(txc, c, o);
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	uint32_t fadvise_flags = i.get_fadvise_flags();
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _write(txc, c, o, off, len, bl, fadvise_flags);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	r = _zero(txc, c, o, off, len);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        uint64_t off = op->off;
+	r = _truncate(txc, c, o, off);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+	r = _remove(txc, c, o);
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        // a single attr is applied through the multi-attr path
+        string name = i.decode_string();
+        bufferlist bl;
+        i.decode_bl(bl);
+	map<string, bufferptr> to_set;
+	to_set[name] = bufferptr(bl.c_str(), bl.length());
+	r = _setattrs(txc, c, o, to_set);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        map<string, bufferptr> aset;
+        i.decode_attrset(aset);
+	r = _setattrs(txc, c, o, aset);
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+	string name = i.decode_string();
+	r = _rmattr(txc, c, o, name);
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+	r = _rmattrs(txc, c, o);
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+        // destination onode is fetched with create=true
+        const ghobject_t& noid = i.get_oid(op->dest_oid);
+	OnodeRef no = c->get_onode(noid, true);
+	r = _clone(txc, c, o, no);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+	const ghobject_t& noid = i.get_oid(op->dest_oid);
+	OnodeRef no = c->get_onode(noid, true);
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+	r = _clone_range(txc, c, o, no, srcoff, len, dstoff);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      ceph_abort_msg("not implemented");
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+      ceph_abort_msg("not implemented");
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+	// only renames within one collection are supported
+	ceph_assert(op->cid == op->dest_cid);
+	const ghobject_t& noid = i.get_oid(op->dest_oid);
+	OnodeRef no = c->get_onode(noid, true);
+	r = _rename(txc, c, o, no, noid);
+	// o aliases ovec[op->oid]; clearing it forces a fresh lookup if a
+	// later op in this transaction uses the same object index
+	o.reset();
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+	const ghobject_t& noid = i.get_oid(op->dest_oid);
+	OnodeRef no = c->get_onode(noid, true);
+	r = _rename(txc, c, o, no, noid);
+	// unlike OP_COLL_MOVE_RENAME, a missing source is not an error
+	if (r == -ENOENT)
+	  r = 0;
+	o.reset();
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+	r = _omap_clear(txc, c, o);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+	bufferlist aset_bl;
+        i.decode_attrset_bl(&aset_bl);
+	r = _omap_setkeys(txc, c, o, aset_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+	bufferlist keys_bl;
+        i.decode_keyset_bl(&keys_bl);
+	r = _omap_rmkeys(txc, c, o, keys_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+	r = _omap_rmkey_range(txc, c, o, first, last);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        bufferlist bl;
+        i.decode_bl(bl);
+	r = _omap_setheader(txc, c, o, bl);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+        uint64_t expected_object_size = op->expected_object_size;
+        uint64_t expected_write_size = op->expected_write_size;
+	uint32_t flags = op->hint;
+	r = _setallochint(txc, c, o,
+			  expected_object_size,
+			  expected_write_size,
+			  flags);
+      }
+      break;
+
+    default:
+      derr << "bad op " << op->op << dendl;
+      ceph_abort();
+    }
+
+    // Error policy for object ops: -ENODATA is always tolerated, -ENOENT is
+    // tolerated except for the clone ops and OP_COLL_ADD; everything else
+    // logs a diagnostic, dumps the transaction and aborts.
+  endop:
+    if (r < 0) {
+      bool ok = false;
+
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+			    op->op == Transaction::OP_CLONE ||
+			    op->op == Transaction::OP_CLONERANGE2 ||
+			    op->op == Transaction::OP_COLL_ADD))
+	// -ENOENT is usually okay
+	ok = true;
+      if (r == -ENODATA)
+	ok = true;
+
+      if (!ok) {
+	const char *msg = "unexpected error code";
+
+	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+			     op->op == Transaction::OP_CLONE ||
+			     op->op == Transaction::OP_CLONERANGE2))
+	  msg = "ENOENT on clone suggests osd bug";
+
+	if (r == -ENOSPC)
+	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+	  // by partially applying transactions.
+	  msg = "ENOSPC from key value store, misconfigured cluster";
+
+	if (r == -ENOTEMPTY) {
+	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
+	}
+
+	dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+		<< " (op " << pos << ", counting from 0)" << dendl;
+	dout(0) << msg << dendl;
+	dout(0) << " transaction dump:\n";
+	JSONFormatter f(true);
+	f.open_object_section("transaction");
+	t->dump(&f);
+	f.close_section();
+	f.flush(*_dout);
+	*_dout << dendl;
+	ceph_abort_msg("unexpected error");
+      }
+    }
+  }
+}
+
+
+
+// -----------------
+// write operations
+
+/// Mark object `o` as existing, run _assign_nid() on it and queue its onode
+/// for writing in `txc`.
+///
+/// @return always 0
+int KStore::_touch(TransContext *txc,
+                   CollectionRef& c,
+                   OnodeRef &o)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  o->exists = true;
+  _assign_nid(txc, o);
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << result
+           << dendl;
+  return result;
+}
+
+/// Log (at debug level 30) the onode's nid, size and expected object/write
+/// sizes, followed by one line per attribute with its name and length.
+void KStore::_dump_onode(OnodeRef o)
+{
+  dout(30) << __func__ << " " << o
+           << " nid " << o->onode.nid
+           << " size " << o->onode.size
+           << " expected_object_size " << o->onode.expected_object_size
+           << " expected_write_size " << o->onode.expected_write_size
+           << dendl;
+
+  for (auto& attr : o->onode.attrs) {
+    dout(30) << __func__ << "  attr " << attr.first
+             << " len " << attr.second.length() << dendl;
+  }
+}
+
+/// Fetch the stripe of `o` that starts at `offset` into `*pbl`.
+///
+/// With `do_cache` set, the onode's pending_stripes map is consulted first
+/// and, on a miss, populated with whatever the DB returned.  Without it the
+/// DB is read directly and pending_stripes is left untouched.
+void KStore::_do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl, bool do_cache)
+{
+  if (do_cache) {
+    auto cached = o->pending_stripes.find(offset);
+    if (cached != o->pending_stripes.end()) {
+      *pbl = cached->second;
+      return;
+    }
+  }
+
+  // not cached (or caching not requested): go to the key/value store
+  string key;
+  get_data_key(o->onode.nid, offset, &key);
+  db->get(PREFIX_DATA, key, pbl);
+
+  if (do_cache) {
+    o->pending_stripes[offset] = *pbl;
+  }
+}
+
+/// Record `bl` as the content of the stripe of `o` at `offset`: remember it in
+/// the onode's pending_stripes and add a PREFIX_DATA set to the txc's KV
+/// transaction.
+void KStore::_do_write_stripe(TransContext *txc, OnodeRef o,
+                              uint64_t offset, bufferlist& bl)
+{
+  string data_key;
+  get_data_key(o->onode.nid, offset, &data_key);
+
+  o->pending_stripes[offset] = bl;
+  txc->t->set(PREFIX_DATA, data_key, bl);
+}
+
+/// Drop the stripe of `o` at `offset`: forget any pending copy and add a
+/// PREFIX_DATA key removal to the txc's KV transaction.
+void KStore::_do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+  string data_key;
+  get_data_key(o->onode.nid, offset, &data_key);
+
+  o->pending_stripes.erase(offset);
+  txc->t->rmkey(PREFIX_DATA, data_key);
+}
+
+/// Write `length` bytes taken from `orig_bl` into object `o` at `offset`.
+///
+/// Data is handled one stripe at a time.  When the remaining range is aligned
+/// to the stripe size at both ends, stripes are replaced outright; otherwise
+/// the existing stripe is read, merged with the new bytes (zero-filling any
+/// gap before the write) and written back.  The onode size is grown when the
+/// write ends past it.  This function does not call txc->write_onode(); see
+/// _write() for that.  `fadvise_flags` is not used.
+///
+/// @return always 0
+int KStore::_do_write(TransContext *txc,
+		      OnodeRef o,
+		      uint64_t offset, uint64_t length,
+		      bufferlist& orig_bl,
+		      uint32_t fadvise_flags)
+{
+  int r = 0;
+
+  dout(20) << __func__
+	   << " " << o->oid << " " << offset << "~" << length
+	   << " - have " << o->onode.size
+	   << " bytes, nid " << o->onode.nid << dendl;
+  _dump_onode(o);
+  o->exists = true;
+
+  if (length == 0) {
+    return 0;
+  }
+
+  // an object without a stripe size adopts the configured default
+  uint64_t stripe_size = o->onode.stripe_size;
+  if (!stripe_size) {
+    o->onode.stripe_size = cct->_conf->kstore_default_stripe_size;
+    stripe_size = o->onode.stripe_size;
+  }
+
+  unsigned bl_off = 0;  // bytes of orig_bl consumed so far
+  while (length > 0) {
+    // position of the current offset / of the final end within their stripes
+    uint64_t offset_rem = offset % stripe_size;
+    uint64_t end_rem = (offset + length) % stripe_size;
+    if (offset_rem == 0 && end_rem == 0) {
+      // fully aligned remainder: overwrite a whole stripe, no read needed
+      bufferlist bl;
+      bl.substr_of(orig_bl, bl_off, stripe_size);
+      dout(30) << __func__ << " full stripe " << offset << dendl;
+      _do_write_stripe(txc, o, offset, bl);
+      offset += stripe_size;
+      length -= stripe_size;
+      bl_off += stripe_size;
+      continue;
+    }
+    uint64_t stripe_off = offset - offset_rem;
+    bufferlist prev;
+    _do_read_stripe(o, stripe_off, &prev, true);
+    dout(20) << __func__ << " read previous stripe " << stripe_off
+	     << ", got " << prev.length() << dendl;
+    bufferlist bl;
+    if (offset_rem) {
+      // keep what existed before the write position, zero-fill the rest
+      unsigned p = std::min<uint64_t>(prev.length(), offset_rem);
+      if (p) {
+	dout(20) << __func__ << " reuse leading " << p << " bytes" << dendl;
+	bl.substr_of(prev, 0, p);
+      }
+      if (p < offset_rem) {
+	dout(20) << __func__ << " add leading " << offset_rem - p << " zeros" << dendl;
+	bl.append_zero(offset_rem - p);
+      }
+    }
+    // new bytes going into this stripe: up to the stripe end, or up to the
+    // end of the write if it finishes inside this stripe
+    unsigned use = stripe_size - offset_rem;
+    if (use > length)
+      use -= stripe_size - end_rem;
+    dout(20) << __func__ << " using " << use << " for this stripe" << dendl;
+    bufferlist t;
+    t.substr_of(orig_bl, bl_off, use);
+    bl.claim_append(t);
+    bl_off += use;
+    // Old bytes after the new data may only be kept in the stripe in which
+    // the write ends.  end_rem describes the end of the whole write, so for
+    // an earlier stripe (use < length) the new data already reaches the
+    // stripe end and appending prev's tail would grow the stored stripe
+    // beyond stripe_size.
+    const bool write_ends_here = (use == length);
+    if (end_rem && write_ends_here) {
+      if (end_rem < prev.length()) {
+	unsigned l = prev.length() - end_rem;
+	dout(20) << __func__ << " reuse trailing " << l << " bytes" << dendl;
+	bufferlist t;
+	t.substr_of(prev, end_rem, l);
+	bl.claim_append(t);
+      }
+    }
+    dout(30) << " writing:\n";
+    bl.hexdump(*_dout);
+    *_dout << dendl;
+    _do_write_stripe(txc, o, stripe_off, bl);
+    offset += use;
+    length -= use;
+  }
+
+  // offset now points at the end of the written range (length is 0 here)
+  if (offset > o->onode.size) {
+    dout(20) << __func__ << " extending size to " << offset + length
+	     << dendl;
+    o->onode.size = offset;
+  }
+
+  return r;
+}
+
+/// Transaction-level write: runs _assign_nid() on the object, performs the
+/// striped write through _do_write() and queues the onode for writing.
+///
+/// @return the result of _do_write()
+int KStore::_write(TransContext *txc,
+                   CollectionRef& c,
+                   OnodeRef& o,
+                   uint64_t offset, size_t length,
+                   bufferlist& bl,
+                   uint32_t fadvise_flags)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << offset << "~" << length << dendl;
+
+  _assign_nid(txc, o);
+  const int ret = _do_write(txc, o, offset, length, bl, fadvise_flags);
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << offset << "~" << length << " = " << ret << dendl;
+  return ret;
+}
+
+/// Zero the byte range [offset, offset + length) of object `o`.
+///
+/// Only objects that already have a stripe size have stripes to touch.
+/// Stripes that are only partially covered are rewritten: they are either cut
+/// short (when the zeroed range reaches the end of the stripe or of the
+/// object) or patched with explicit zeros while keeping the old tail.
+/// Stripes starting on a boundary with at least a full stripe left in the
+/// range are removed.  The object size is extended if the range ends past it
+/// and the onode is queued for writing.
+///
+/// @return always 0
+int KStore::_zero(TransContext *txc,
+		  CollectionRef& c,
+		  OnodeRef& o,
+		  uint64_t offset, size_t length)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+	   << " " << offset << "~" << length
+	   << dendl;
+  int r = 0;
+  o->exists = true;
+
+  _dump_onode(o);
+  _assign_nid(txc, o);
+
+  uint64_t stripe_size = o->onode.stripe_size;
+  if (stripe_size) {
+    uint64_t end = offset + length;
+    uint64_t pos = offset;
+    // offset of pos within its stripe; non-zero only for the first stripe
+    uint64_t stripe_off = pos % stripe_size;
+    while (pos < offset + length) {
+      if (stripe_off || end - pos < stripe_size) {
+	// partially covered stripe: read, patch, write back
+	bufferlist stripe;
+	_do_read_stripe(o, pos - stripe_off, &stripe, true);
+	dout(30) << __func__ << " stripe " << pos - stripe_off << " got "
+		 << stripe.length() << dendl;
+	// keep the bytes that precede the zeroed range
+	bufferlist bl;
+	bl.substr_of(stripe, 0, std::min<uint64_t>(stripe.length(), stripe_off));
+	if (end >= pos - stripe_off + stripe_size ||
+	    end >= o->onode.size) {
+	  // range runs to the end of this stripe or of the object: just
+	  // store the shortened stripe, no explicit zeros appended
+	  dout(20) << __func__ << " truncated stripe " << pos - stripe_off
+		   << " to " << bl.length() << dendl;
+	} else {
+	  // range ends inside the stripe: write zeros up to `end`, then
+	  // re-attach whatever the old stripe held beyond that point
+          auto len = end - (pos - stripe_off + bl.length());
+	  bl.append_zero(len);
+	  dout(20) << __func__ << " adding " << len << " of zeros" << dendl;
+	  if (stripe.length() > bl.length()) {
+	    unsigned l = stripe.length() - bl.length();
+	    bufferlist t;
+	    t.substr_of(stripe, stripe.length() - l, l);
+	    dout(20) << __func__ << " keeping tail " << l << " of stripe" << dendl;
+	    bl.claim_append(t);
+	  }
+	}
+	_do_write_stripe(txc, o, pos - stripe_off, bl);
+	// continue at the next stripe boundary
+	pos += stripe_size - stripe_off;
+	stripe_off = 0;
+      } else {
+	// aligned and at least one full stripe remaining: drop the stripe
+	dout(20) << __func__ << " rm stripe " << pos << dendl;
+	_do_remove_stripe(txc, o, pos - stripe_off);
+	pos += stripe_size;
+      }
+    }
+  }
+  if (offset + length > o->onode.size) {
+    o->onode.size = offset + length;
+    dout(20) << __func__ << " extending size to " << offset + length
+	     << dendl;
+  }
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+	   << " " << offset << "~" << length
+	   << " = " << r << dendl;
+  return r;
+}
+
+/// Set the size of object `o` to `offset`.
+///
+/// After flushing the onode, stripes at or beyond `offset` (up to the current
+/// size) are trimmed: a stripe containing `offset` in its interior is cut to
+/// its head, later stripes are removed.  The cached tail is cleared when the
+/// new and old sizes fall into different stripes.  The new size is stored
+/// even when it is larger than the old one, and the onode is queued for
+/// writing.
+///
+/// @return always 0
+int KStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset)
+{
+  const uint64_t stripe_size = o->onode.stripe_size;
+
+  o->flush();
+
+  if (stripe_size != 0) {
+    uint64_t pos = offset;
+
+    // truncation point inside a stripe: keep only that stripe's head
+    const uint64_t within = pos % stripe_size;
+    if (within != 0 && pos < o->onode.size) {
+      const uint64_t stripe_start = pos - within;
+      bufferlist stripe;
+      _do_read_stripe(o, stripe_start, &stripe, true);
+      dout(30) << __func__ << " stripe " << stripe_start << " got "
+               << stripe.length() << dendl;
+      bufferlist head;
+      head.substr_of(stripe, 0, std::min<uint64_t>(within, stripe.length()));
+      _do_write_stripe(txc, o, stripe_start, head);
+      dout(20) << __func__ << " truncated stripe " << stripe_start
+               << " to " << head.length() << dendl;
+      pos += stripe_size - within;
+    }
+
+    // everything from here to the old size lies past the new end
+    for (; pos < o->onode.size; pos += stripe_size) {
+      dout(20) << __func__ << " rm stripe " << pos << dendl;
+      _do_remove_stripe(txc, o, pos);
+    }
+
+    // the cached tail is only kept if old and new end share a stripe
+    if (o->tail_bl.length() &&
+        offset / stripe_size != o->onode.size / stripe_size) {
+      dout(20) << __func__ << " clear cached tail" << dendl;
+      o->clear_tail();
+    }
+  }
+
+  o->onode.size = offset;
+  dout(10) << __func__ << " truncate size to " << offset << dendl;
+
+  txc->write_onode(o);
+  return 0;
+}
+
+/// Transaction-level truncate: logging wrapper around _do_truncate().
+///
+/// @return the result of _do_truncate()
+int KStore::_truncate(TransContext *txc,
+                      CollectionRef& c,
+                      OnodeRef& o,
+                      uint64_t offset)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << offset << dendl;
+
+  const int ret = _do_truncate(txc, o, offset);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << offset << " = " << ret << dendl;
+  return ret;
+}
+
+/// Remove object `o`: truncate its data to zero, clear its omap rows (if it
+/// has an omap head), reset the in-memory onode, take it out of the txc's
+/// set of onodes and add a removal of its PREFIX_OBJ key to the KV
+/// transaction.
+///
+/// @return always 0
+int KStore::_do_remove(TransContext *txc,
+                       OnodeRef o)
+{
+  // data stripes
+  _do_truncate(txc, o, 0);
+  o->onode.size = 0;
+
+  // omap rows
+  if (o->onode.omap_head != 0) {
+    _do_omap_clear(txc, o->onode.omap_head);
+  }
+
+  // in-memory state
+  o->exists = false;
+  o->onode = kstore_onode_t();
+  txc->onodes.erase(o);
+
+  // the object's own key
+  string obj_key;
+  get_object_key(cct, o->oid, &obj_key);
+  txc->t->rmkey(PREFIX_OBJ, obj_key);
+  return 0;
+}
+
+/// Transaction-level remove: logging wrapper around _do_remove().
+///
+/// @return the result of _do_remove()
+int KStore::_remove(TransContext *txc,
+                    CollectionRef& c,
+                    OnodeRef &o)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  const int ret = _do_remove(txc, o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << ret
+           << dendl;
+  return ret;
+}
+
+/// Set attribute `name` of object `o` to `val` and queue the onode for
+/// writing.
+///
+/// @return always 0
+int KStore::_setattr(TransContext *txc,
+                     CollectionRef& c,
+                     OnodeRef& o,
+                     const string& name,
+                     bufferptr& val)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " (" << val.length() << " bytes)" << dendl;
+
+  o->onode.attrs[name] = val;
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " (" << val.length() << " bytes)"
+           << " = " << result << dendl;
+  return result;
+}
+
+/// Set every attribute in `aset` on object `o` and queue the onode for
+/// writing.  A value for which is_partial() is true is copied into a fresh
+/// bufferptr of exactly its length; other values are assigned as they are.
+///
+/// @return always 0
+int KStore::_setattrs(TransContext *txc,
+                      CollectionRef& c,
+                      OnodeRef& o,
+                      const map<string,bufferptr>& aset)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << aset.size() << " keys" << dendl;
+
+  for (const auto& attr : aset) {
+    if (attr.second.is_partial()) {
+      o->onode.attrs[attr.first] =
+        bufferptr(attr.second.c_str(), attr.second.length());
+    } else {
+      o->onode.attrs[attr.first] = attr.second;
+    }
+  }
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << aset.size() << " keys"
+           << " = " << result << dendl;
+  return result;
+}
+
+
+/// Erase attribute `name` from object `o` (a no-op for the map if it is not
+/// present) and queue the onode for writing.
+///
+/// @return always 0
+int KStore::_rmattr(TransContext *txc,
+                    CollectionRef& c,
+                    OnodeRef& o,
+                    const string& name)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << dendl;
+
+  o->onode.attrs.erase(name);
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " " << name << " = " << result << dendl;
+  return result;
+}
+
+/// Remove all attributes of object `o` and queue the onode for writing.
+///
+/// @return always 0
+int KStore::_rmattrs(TransContext *txc,
+                     CollectionRef& c,
+                     OnodeRef& o)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  o->onode.attrs.clear();
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << result
+           << dendl;
+  return result;
+}
+
+/// Queue removal of every PREFIX_OMAP key belonging to omap id `id`: all keys
+/// from the id's header key up to, but not including, its tail key.
+void KStore::_do_omap_clear(TransContext *txc, uint64_t id)
+{
+  string first_key, tail;
+  get_omap_header(id, &first_key);
+  get_omap_tail(id, &tail);
+
+  KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+  for (it->lower_bound(first_key); it->valid(); it->next()) {
+    if (it->key() >= tail) {
+      dout(30) << __func__ << "  stop at " << tail << dendl;
+      break;
+    }
+    txc->t->rmkey(PREFIX_OMAP, it->key());
+    dout(30) << __func__ << "  rm " << pretty_binary_string(it->key()) << dendl;
+  }
+}
+
+/// Clear the omap of object `o`, if it has an omap head.  The onode's
+/// omap_head itself is left unchanged.
+///
+/// @return always 0
+int KStore::_omap_clear(TransContext *txc,
+                        CollectionRef& c,
+                        OnodeRef& o)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  if (o->onode.omap_head) {
+    _do_omap_clear(txc, o->onode.omap_head);
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << result
+           << dendl;
+  return result;
+}
+
+/// Set omap key/value pairs on object `o`.
+///
+/// `bl` holds an encoded count followed by that many (key, value) pairs.  If
+/// the object has no omap head yet, its nid is adopted as the head and the
+/// onode is queued for writing.  Each pair is stored under PREFIX_OMAP using
+/// the key produced by get_omap_key().
+///
+/// @return always 0
+int KStore::_omap_setkeys(TransContext *txc,
+                          CollectionRef& c,
+                          OnodeRef& o,
+                          bufferlist &bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  if (!o->onode.omap_head) {
+    o->onode.omap_head = o->onode.nid;
+    txc->write_onode(o);
+  }
+
+  auto p = bl.cbegin();
+  __u32 num;
+  decode(num, p);
+  for (__u32 n = 0; n < num; ++n) {
+    string key;
+    bufferlist value;
+    decode(key, p);
+    decode(value, p);
+
+    string final_key;
+    get_omap_key(o->onode.omap_head, key, &final_key);
+    dout(30) << __func__ << "  " << pretty_binary_string(final_key)
+             << " <- " << key << dendl;
+    txc->t->set(PREFIX_OMAP, final_key, value);
+  }
+
+  const int r = 0;
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+/// Store `bl` as the omap header of object `o`.  If the object has no omap
+/// head yet, its nid is adopted as the head and the onode is queued for
+/// writing first.
+///
+/// @return always 0
+int KStore::_omap_setheader(TransContext *txc,
+                            CollectionRef& c,
+                            OnodeRef &o,
+                            bufferlist& bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+
+  if (!o->onode.omap_head) {
+    o->onode.omap_head = o->onode.nid;
+    txc->write_onode(o);
+  }
+
+  string header_key;
+  get_omap_header(o->onode.omap_head, &header_key);
+  txc->t->set(PREFIX_OMAP, header_key, bl);
+
+  const int r = 0;
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+/// Remove the omap keys listed in `bl` (an encoded count followed by that many
+/// keys) from object `o`.  Nothing is decoded or removed when the object has
+/// no omap head.
+///
+/// @return always 0
+int KStore::_omap_rmkeys(TransContext *txc,
+                         CollectionRef& c,
+                         OnodeRef& o,
+                         const bufferlist& bl)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  int r = 0;
+
+  if (o->onode.omap_head) {
+    auto p = bl.cbegin();
+    __u32 num;
+    decode(num, p);
+    for (; num > 0; --num) {
+      string key;
+      decode(key, p);
+
+      string final_key;
+      get_omap_key(o->onode.omap_head, key, &final_key);
+      dout(30) << __func__ << "  rm " << pretty_binary_string(final_key)
+               << " <- " << key << dendl;
+      txc->t->rmkey(PREFIX_OMAP, final_key);
+    }
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+/// Remove the omap keys of object `o` in the half-open range [first, last).
+/// The scan starts at the omap key for `first` and stops at the first key
+/// that is not less than the omap key for `last`.  Nothing happens when the
+/// object has no omap head.
+///
+/// @return always 0
+int KStore::_omap_rmkey_range(TransContext *txc,
+                              CollectionRef& c,
+                              OnodeRef& o,
+                              const string& first, const string& last)
+{
+  dout(15) << __func__ << " " << c->cid << " " << o->oid << dendl;
+  KeyValueDB::Iterator it;
+  string key_first, key_last;
+  int r = 0;
+
+  if (o->onode.omap_head) {
+    it = db->get_iterator(PREFIX_OMAP);
+    get_omap_key(o->onode.omap_head, first, &key_first);
+    get_omap_key(o->onode.omap_head, last, &key_last);
+
+    for (it->lower_bound(key_first); it->valid(); it->next()) {
+      if (it->key() >= key_last) {
+        dout(30) << __func__ << "  stop at " << pretty_binary_string(key_last)
+                 << dendl;
+        break;
+      }
+      txc->t->rmkey(PREFIX_OMAP, it->key());
+      dout(30) << __func__ << "  rm " << pretty_binary_string(it->key())
+               << dendl;
+    }
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl;
+  return r;
+}
+
+/// Record the allocation hints (expected object size, expected write size and
+/// hint flags) in the onode of `o` and queue the onode for writing.
+///
+/// @return always 0
+int KStore::_setallochint(TransContext *txc,
+                          CollectionRef& c,
+                          OnodeRef& o,
+                          uint64_t expected_object_size,
+                          uint64_t expected_write_size,
+                          uint32_t flags)
+{
+  const int result = 0;
+  dout(15) << __func__ << " " << c->cid << " " << o->oid
+           << " object_size " << expected_object_size
+           << " write_size " << expected_write_size
+           << " flags " << flags << dendl;
+
+  auto& on = o->onode;
+  on.expected_object_size = expected_object_size;
+  on.expected_write_size = expected_write_size;
+  on.alloc_hint_flags = flags;
+  txc->write_onode(o);
+
+  dout(10) << __func__ << " " << c->cid << " " << o->oid
+           << " object_size " << expected_object_size
+           << " write_size " << expected_write_size
+           << " = " << result << dendl;
+  return result;
+}
+
+/// Make `newo` a copy of `oldo`: data, attributes and omap.
+///
+/// Both objects must hash identically, otherwise -EINVAL is returned before
+/// anything is modified.  The source data is read in full, the destination is
+/// truncated to zero and rewritten, attributes are copied, any existing
+/// destination omap is cleared, and every omap row of the source (header
+/// included) is re-keyed under the destination's omap head.
+///
+/// @return 0 on success, -EINVAL on hash mismatch, or the negative result of
+///         the failing read/truncate/write step
+int KStore::_clone(TransContext *txc,
+                   CollectionRef& c,
+                   OnodeRef& oldo,
+                   OnodeRef& newo)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << dendl;
+  if (oldo->oid.hobj.get_hash() != newo->oid.hobj.get_hash()) {
+    derr << __func__ << " mismatched hash on " << oldo->oid
+         << " and " << newo->oid << dendl;
+    return -EINVAL;
+  }
+
+  bufferlist data;
+  newo->exists = true;
+  _assign_nid(txc, newo);
+
+  // data: read everything from the (flushed) source ...
+  oldo->flush();
+  int r = _do_read(oldo, 0, oldo->onode.size, data, true, 0);
+
+  // ... throw away whatever the destination held ...
+  if (r >= 0) {
+    r = _do_truncate(txc, newo, 0);
+  }
+
+  // ... and write the copy
+  if (r >= 0) {
+    r = _do_write(txc, newo, 0, oldo->onode.size, data, 0);
+  }
+
+  if (r >= 0) {
+    newo->onode.attrs = oldo->onode.attrs;
+
+    // omap: start from a clean slate on the destination
+    if (newo->onode.omap_head) {
+      dout(20) << __func__ << " clearing old omap data" << dendl;
+      _do_omap_clear(txc, newo->onode.omap_head);
+    }
+    if (oldo->onode.omap_head) {
+      dout(20) << __func__ << " copying omap data" << dendl;
+      if (!newo->onode.omap_head) {
+        newo->onode.omap_head = newo->onode.nid;
+      }
+
+      string head, tail;
+      get_omap_header(oldo->onode.omap_head, &head);
+      get_omap_tail(oldo->onode.omap_head, &tail);
+
+      KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
+      for (it->lower_bound(head); it->valid(); it->next()) {
+        if (it->key() >= tail) {
+          dout(30) << __func__ << "  reached tail" << dendl;
+          break;
+        }
+        dout(30) << __func__ << "  got header/data "
+                 << pretty_binary_string(it->key()) << dendl;
+        ceph_assert(it->key() < tail);
+
+        // same row, re-keyed under the destination's omap head
+        string key;
+        rewrite_omap_key(newo->onode.omap_head, it->key(), &key);
+        txc->t->set(PREFIX_OMAP, key, it->value());
+      }
+    }
+
+    txc->write_onode(newo);
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << " = " << r << dendl;
+  return r;
+}
+
+/// Copy `length` bytes starting at `srcoff` in `oldo` to `dstoff` in `newo`.
+/// The destination is marked existing and given a nid; the number of bytes
+/// written is whatever the read returned (bl.length()).
+///
+/// @return 0 on success, or the negative result of the failing read/write
+int KStore::_clone_range(TransContext *txc,
+                         CollectionRef& c,
+                         OnodeRef& oldo,
+                         OnodeRef& newo,
+                         uint64_t srcoff, uint64_t length, uint64_t dstoff)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << " from " << srcoff << "~" << length
+           << " to offset " << dstoff << dendl;
+
+  bufferlist data;
+  newo->exists = true;
+  _assign_nid(txc, newo);
+
+  int r = _do_read(oldo, srcoff, length, data, true, 0);
+  if (r >= 0) {
+    r = _do_write(txc, newo, dstoff, data.length(), data, 0);
+  }
+  if (r >= 0) {
+    txc->write_onode(newo);
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+           << newo->oid << " from " << srcoff << "~" << length
+           << " to offset " << dstoff
+           << " = " << r << dendl;
+  return r;
+}
+
+/// Rename object `oldo` to `new_oid` within collection `c`.
+///
+/// If the destination onode `newo` exists it is removed first.  The old
+/// PREFIX_OBJ key is deleted, the onode is queued for writing and the
+/// collection's onode map is told about the rename (which, per the original
+/// note, adjusts oldo->{oid,key}).
+///
+/// @return 0 on success, or the negative result of removing the destination
+int KStore::_rename(TransContext *txc,
+		    CollectionRef& c,
+		    OnodeRef& oldo,
+		    OnodeRef& newo,
+		    const ghobject_t& new_oid)
+{
+  dout(15) << __func__ << " " << c->cid << " " << oldo->oid << " -> "
+	   << new_oid << dendl;
+  // keep a copy: oldo->oid changes once the onode map performs the rename,
+  // and the final log line should still show the original name
+  const ghobject_t old_oid = oldo->oid;
+  int r = 0;
+
+  if (newo && newo->exists) {
+    // destination object already exists, remove it first
+    r = _do_remove(txc, newo);
+  }
+
+  if (r >= 0) {
+    txc->t->rmkey(PREFIX_OBJ, oldo->key);
+    txc->write_onode(oldo);
+    c->onode_map.rename(old_oid, new_oid);  // this adjusts oldo->{oid,key}
+    r = 0;
+  }
+
+  dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> "
+	   << new_oid << " = " << r << dendl;
+  return r;
+}
+
+// collections
+
+/// Create collection `cid` with `bits` split bits.
+///
+/// `*c` must be empty on entry, otherwise -EEXIST is returned.  The
+/// collection object is taken from new_coll_map (it is asserted to be there),
+/// moved into coll_map under coll_lock and handed back through `*c`; its
+/// cnode is then encoded and stored under PREFIX_COLL.
+///
+/// @return 0 on success, -EEXIST if `*c` is already set
+int KStore::_create_collection(
+  TransContext *txc,
+  coll_t cid,
+  unsigned bits,
+  CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << " bits " << bits << dendl;
+  int r = 0;
+  bufferlist cnode_bl;
+
+  {
+    std::unique_lock l{coll_lock};
+    if (*c) {
+      r = -EEXIST;
+    } else {
+      auto pending = new_coll_map.find(cid);
+      ceph_assert(pending != new_coll_map.end());
+      *c = pending->second;
+      ceph_assert((*c)->cid == cid);
+      (*c)->cnode.bits = bits;
+      coll_map[cid] = *c;
+      new_coll_map.erase(pending);
+    }
+  }
+
+  // persist the collection node outside of coll_lock
+  if (r == 0) {
+    encode((*c)->cnode, cnode_bl);
+    txc->t->set(PREFIX_COLL, stringify(cid), cnode_bl);
+  }
+
+  dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+/// Remove collection `cid` if it holds no existing objects.
+///
+/// `*c` is the caller's reference to the collection; it is reset on success
+/// and the collection is parked in txc->removed_collections.  Emptiness is
+/// checked twice under coll_lock: first against the in-memory onode map, then
+/// against a bounded listing obtained from _collection_list().
+///
+/// @return 0 on success, -ENOENT if `*c` is null, -ENOTEMPTY if an existing
+///         object is found, or the negative result of _collection_list()
+int KStore::_remove_collection(TransContext *txc, coll_t cid,
+				 CollectionRef *c)
+{
+  dout(15) << __func__ << " " << cid << dendl;
+  int r;
+
+  {
+    std::unique_lock l{coll_lock};
+    if (!*c) {
+      r = -ENOENT;
+      goto out;
+    }
+    // pass 1: walk the cached onodes; any that still exists blocks removal,
+    // the rest are counted as known-nonexistent
+    size_t nonexistent_count = 0;
+    pair<ghobject_t,OnodeRef> next_onode;
+    while ((*c)->onode_map.get_next(next_onode.first, &next_onode)) {
+      if (next_onode.second->exists) {
+	r = -ENOTEMPTY;
+	goto out;
+      }
+      ++nonexistent_count;
+    }
+    vector<ghobject_t> ls;
+    ghobject_t next;
+    // Enumerate onodes in db, up to nonexistent_count + 1
+    // then check if all of them are marked as non-existent.
+    // Bypass the check if returned number is greater than nonexistent_count
+    r = _collection_list(c->get(), ghobject_t(), ghobject_t::get_max(),
+                         nonexistent_count + 1, &ls, &next);
+    // a listing failure (r < 0) skips the block below and is returned as is
+    if (r >= 0) {
+      // pass 2: a listed object counts as existing unless the onode map has
+      // it cached and marked non-existent
+      bool exists = false; //ls.size() > nonexistent_count;
+      for (auto it = ls.begin(); !exists && it < ls.end(); ++it) {
+        dout(10) << __func__ << " oid " << *it << dendl;
+        auto onode = (*c)->onode_map.lookup(*it);
+        exists = !onode || onode->exists;
+        if (exists) {
+          dout(10) << __func__ << " " << *it
+                   << " exists in db" << dendl;
+        }
+      }
+      if (!exists) {
+        // unregister, keep the object alive via removed_collections, drop
+        // the caller's reference and delete the PREFIX_COLL key
+        coll_map.erase(cid);
+        txc->removed_collections.push_back(*c);
+        c->reset();
+        txc->t->rmkey(PREFIX_COLL, stringify(cid));
+        r = 0;
+      } else {
+        dout(10) << __func__ << " " << cid
+                 << " is non-empty" << dendl;
+        r = -ENOTEMPTY;
+      }
+    }
+  }
+
+  // reached with coll_lock already released
+ out:
+  dout(10) << __func__ << " " << cid << " = " << r << dendl;
+  return r;
+}
+
+/// Split collection `c`, with `d` as the destination collection.
+///
+/// Both collections' onode maps are cleared, `c` gets the new bit count
+/// (`d` is asserted to already have it) and `c`'s cnode is re-encoded and
+/// stored under PREFIX_COLL.  `rem` is not used by this implementation.
+///
+/// Both collection locks are acquired together with std::scoped_lock, the
+/// same way _merge_collection() does, so the pair is taken without imposing a
+/// lock order between the two collections.
+///
+/// @return always 0
+int KStore::_split_collection(TransContext *txc,
+				CollectionRef& c,
+				CollectionRef& d,
+				unsigned bits, int rem)
+{
+  dout(15) << __func__ << " " << c->cid << " to " << d->cid << " "
+	   << " bits " << bits << dendl;
+  int r;
+  std::scoped_lock l{c->lock, d->lock};
+  c->onode_map.clear();
+  d->onode_map.clear();
+  c->cnode.bits = bits;
+  ceph_assert(d->cnode.bits == bits);
+  r = 0;
+
+  bufferlist bl;
+  encode(c->cnode, bl);
+  txc->t->set(PREFIX_COLL, stringify(c->cid), bl);
+
+  dout(10) << __func__ << " " << c->cid << " to " << d->cid << " "
+	   << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+/// Merge collection `*c` into collection `d`.
+///
+/// With both collection locks held, both onode maps are cleared and `d` gets
+/// the new bit count; `d`'s cnode is re-encoded and stored.  The source
+/// collection is then erased from coll_map, parked in
+/// txc->removed_collections, the caller's reference is reset and its
+/// PREFIX_COLL key is removed.
+///
+/// NOTE(review): coll_map is modified here without taking coll_lock, unlike
+/// _create_collection() and _remove_collection() -- confirm this is intended.
+///
+/// @return always 0
+int KStore::_merge_collection(TransContext *txc,
+                              CollectionRef *c,
+                              CollectionRef& d,
+                              unsigned bits)
+{
+  dout(15) << __func__ << " " << (*c)->cid << " to " << d->cid << " "
+           << " bits " << bits << dendl;
+  std::scoped_lock l{(*c)->lock, d->lock};
+
+  // remember the source id; *c is reset further down
+  const coll_t src_cid = (*c)->cid;
+  const int r = 0;
+
+  (*c)->onode_map.clear();
+  d->onode_map.clear();
+  d->cnode.bits = bits;
+
+  // persist the destination's updated cnode
+  bufferlist cnode_bl;
+  encode(d->cnode, cnode_bl);
+  txc->t->set(PREFIX_COLL, stringify(d->cid), cnode_bl);
+
+  // retire the source collection
+  coll_map.erase(src_cid);
+  txc->removed_collections.push_back(*c);
+  c->reset();
+  txc->t->rmkey(PREFIX_COLL, stringify(src_cid));
+
+  dout(10) << __func__ << " " << src_cid << " to " << d->cid << " "
+           << " bits " << bits << " = " << r << dendl;
+  return r;
+}
+
+// ===========================================
diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h
new file mode 100644
index 000000000..9e3c7acd7
--- /dev/null
+++ b/src/os/kstore/KStore.h
@@ -0,0 +1,699 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_H
+#define CEPH_OSD_KSTORE_H
+
+#include "acconfig.h"
+
+#include <unistd.h>
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "include/ceph_assert.h"
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "os/ObjectStore.h"
+#include "common/perf_counters.h"
+#include "os/fs/FS.h"
+#include "kv/KeyValueDB.h"
+
+#include "kstore_types.h"
+
+#include "boost/intrusive/list.hpp"
+
+enum {  // NOTE(review): presumably PerfCounters indices for KStore's per-state latencies -- confirm
+  l_kstore_first = 832430,  // base value; the entries below number upward from here
+  l_kstore_state_prepare_lat,
+  l_kstore_state_kv_queued_lat,
+  l_kstore_state_kv_done_lat,
+  l_kstore_state_finishing_lat,
+  l_kstore_state_done_lat,
+  l_kstore_last  // one past the last real entry
+};
+
+class KStore : public ObjectStore {
+  // -----------------------------------------------------
+  // types
+public:
+
+  struct TransContext;
+
+  /// an in-memory object
+  struct Onode {
+    CephContext* cct;
+    std::atomic_int nref;  ///< reference count
+
+    ghobject_t oid;
+    std::string key;     ///< key under PREFIX_OBJ where we are stored
+    boost::intrusive::list_member_hook<> lru_item;  ///< hook used by OnodeHashLRU::lru_list_t
+
+    kstore_onode_t onode;  ///< metadata stored as value in kv store
+    bool dirty;     // NOTE(review): meaning not evident from this header -- confirm where it is set/read
+    bool exists;    // NOTE(review): presumably whether the object exists in the store -- confirm
+
+    std::mutex flush_lock;  ///< protect flush_txns
+    std::condition_variable flush_cond;   ///< wait here for unapplied txns
+    std::set<TransContext*> flush_txns;   ///< committing txns
+
+    uint64_t tail_offset;        ///< reset to 0 by clear_tail()
+    ceph::buffer::list tail_bl;  ///< emptied by clear_tail()
+
+    std::map<uint64_t,ceph::buffer::list> pending_stripes;  ///< unwritten stripes
+
+    Onode(CephContext* cct, const ghobject_t& o, const std::string& k)
+      : cct(cct),
+	nref(0),
+	oid(o),
+	key(k),
+	dirty(false),
+	exists(false),
+        tail_offset(0) {
+    }
+
+    void flush();
+    void get() {  // take one reference
+      ++nref;
+    }
+    void put() {  // drop one reference; the put() that reaches zero destroys the onode
+      if (--nref == 0)
+	delete this;
+    }
+
+    void clear_tail() {  // forget the tail offset and its buffered data
+      tail_offset = 0;
+      tail_bl.clear();
+    }
+    void clear_pending_stripes() {
+      pending_stripes.clear();
+    }
+  };
+  typedef boost::intrusive_ptr<Onode> OnodeRef;
+
+  struct OnodeHashLRU {
+    CephContext* cct;
+    typedef boost::intrusive::list<
+      Onode,
+      boost::intrusive::member_hook<
+        Onode,
+	boost::intrusive::list_member_hook<>,
+	&Onode::lru_item> > lru_list_t;  // intrusive list threaded through Onode::lru_item
+
+    std::mutex lock;  // NOTE(review): presumably guards onode_map and lru -- confirm in KStore.cc
+    ceph::unordered_map<ghobject_t,OnodeRef> onode_map;  ///< forward lookups
+    lru_list_t lru;                                      ///< lru
+
+    OnodeHashLRU(CephContext* cct) : cct(cct) {}
+
+    void add(const ghobject_t& oid, OnodeRef o);
+    void _touch(OnodeRef o);
+    OnodeRef lookup(const ghobject_t& o);
+    void rename(const ghobject_t& old_oid, const ghobject_t& new_oid);
+    void clear();
+    bool get_next(const ghobject_t& after, std::pair<ghobject_t,OnodeRef> *next);
+    int trim(int max=-1);  // NOTE(review): meaning of the default max=-1 is defined in KStore.cc -- confirm
+  };
+
+  class OpSequencer;
+  typedef boost::intrusive_ptr<OpSequencer> OpSequencerRef;
+
+  struct Collection : public CollectionImpl {
+    KStore *store;
+    kstore_cnode_t cnode;  ///< collection metadata; cnode.bits is consulted by contains()
+    ceph::shared_mutex lock =
+      ceph::make_shared_mutex("KStore::Collection::lock", true, false);
+
+    OpSequencerRef osr;
+
+    // cache onodes on a per-collection basis to avoid lock
+    // contention.
+    OnodeHashLRU onode_map;
+
+    OnodeRef get_onode(const ghobject_t& oid, bool create);
+
+    bool contains(const ghobject_t& oid) {  // does oid belong to this collection?
+      if (cid.is_meta())
+	return oid.hobj.pool == -1;  // meta collection: only objects with pool -1
+      spg_t spgid;
+      if (cid.is_pg(&spgid))
+	return  // PG collection: pgid match under cnode.bits, and same shard
+	  spgid.pgid.contains(cnode.bits, oid) &&
+	  oid.shard_id == spgid.shard;
+      return false;  // neither meta nor PG: contains nothing
+    }
+
+    void flush() override;
+    bool flush_commit(Context *c) override;
+
+  private:
+    FRIEND_MAKE_REF(Collection);  // construction is private; the friend macro grants access to the ref factory
+    Collection(KStore *ns, coll_t c);
+  };
+  using CollectionRef = ceph::ref_t<Collection>;
+
+  class OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+    CollectionRef c;
+    OnodeRef o;
+    KeyValueDB::Iterator it;  // underlying key/value iterator supplied to the constructor
+    std::string head, tail;   // NOTE(review): presumably the key bounds of this object's omap range -- confirm
+  public:
+    OmapIteratorImpl(CollectionRef c, OnodeRef o, KeyValueDB::Iterator it);
+    int seek_to_first() override;
+    int upper_bound(const std::string &after) override;
+    int lower_bound(const std::string &to) override;
+    bool valid() override;
+    int next() override;
+    std::string key() override;
+    ceph::buffer::list value() override;
+    int status() override {  // always reports 0
+      return 0;
+    }
+  };
+
+  struct TransContext {
+    typedef enum {  // declaration order matters: OpSequencer::flush_commit compares states with >= and <
+      STATE_PREPARE,
+      STATE_AIO_WAIT,
+      STATE_IO_DONE,
+      STATE_KV_QUEUED,
+      STATE_KV_COMMITTING,
+      STATE_KV_DONE,
+      STATE_FINISHING,
+      STATE_DONE,
+    } state_t;
+
+    state_t state;
+
+    const char *get_state_name() {  // printable name of the current state
+      switch (state) {
+      case STATE_PREPARE: return "prepare";
+      case STATE_AIO_WAIT: return "aio_wait";
+      case STATE_IO_DONE: return "io_done";
+      case STATE_KV_QUEUED: return "kv_queued";
+      case STATE_KV_COMMITTING: return "kv_committing";
+      case STATE_KV_DONE: return "kv_done";
+      case STATE_FINISHING: return "finishing";
+      case STATE_DONE: return "done";
+      }
+      return "???";  // reached only if state holds a value outside the enum
+    }
+
+    void log_state_latency(PerfCounters *logger, int state) {  // NB: parameter `state` shadows the member; it is the counter index
+        utime_t lat, now = ceph_clock_now();
+        lat = now - start;  // time elapsed since the previous call (or construction)
+        logger->tinc(state, lat);
+        start = now;  // restart the clock for the next state
+    }
+
+    CollectionRef ch;
+    OpSequencerRef osr;
+    boost::intrusive::list_member_hook<> sequencer_item;  ///< hook used by OpSequencer::q_list_t
+
+    uint64_t ops, bytes;
+
+    std::set<OnodeRef> onodes;     ///< these onodes need to be updated/written
+    KeyValueDB::Transaction t; ///< then we will commit this
+    Context *oncommit;         ///< signal on commit
+    Context *onreadable;         ///< signal on readable
+    Context *onreadable_sync;         ///< signal on readable (NOTE(review): presumably the synchronous variant -- confirm)
+    std::list<Context*> oncommits;  ///< more commit completions
+    std::list<CollectionRef> removed_collections; ///< colls we removed
+
+    CollectionRef first_collection;  ///< first referenced collection
+    utime_t start;  ///< timestamp of the last state change; see log_state_latency()
+    explicit TransContext(OpSequencer *o)
+      : state(STATE_PREPARE),
+	osr(o),
+	ops(0),
+	bytes(0),
+	oncommit(NULL),
+	onreadable(NULL),
+	onreadable_sync(NULL),
+        start(ceph_clock_now()){
+      //cout << "txc new " << this << std::endl;
+    }
+    ~TransContext() {
+      //cout << "txc del " << this << std::endl;
+    }
+
+    void write_onode(OnodeRef &o) {  // remember o as needing to be written with this transaction
+      onodes.insert(o);
+    }
+  };
+
+  class OpSequencer : public RefCountedObject {
+  public:
+    std::mutex qlock;  ///< held by queue_new(), flush() and flush_commit() while touching q
+    std::condition_variable qcond;  ///< flush() waits here until q is empty
+    typedef boost::intrusive::list<
+      TransContext,
+      boost::intrusive::member_hook<
+        TransContext,
+	boost::intrusive::list_member_hook<>,
+	&TransContext::sequencer_item> > q_list_t;
+    q_list_t q;  ///< transactions
+
+    ~OpSequencer() {
+      ceph_assert(q.empty());  // must not be destroyed with transactions still queued
+    }
+
+    void queue_new(TransContext *txc) {  // append txc at the back of the queue
+      std::lock_guard<std::mutex> l(qlock);
+      q.push_back(*txc);
+    }
+
+    void flush() {  // block until the queue has drained
+      std::unique_lock<std::mutex> l(qlock);
+      while (!q.empty())
+	qcond.wait(l);  // NOTE(review): the code that removes entries and notifies qcond is not in this header
+    }
+
+    bool flush_commit(Context *c) {  // true: nothing pending, c is not kept; false: c was attached to the newest txc
+      std::lock_guard<std::mutex> l(qlock);
+      if (q.empty()) {
+	return true;
+      }
+      TransContext *txc = &q.back();  // newest queued transaction
+      if (txc->state >= TransContext::STATE_KV_DONE) {
+	return true;
+      }
+      ceph_assert(txc->state < TransContext::STATE_KV_DONE);  // always holds after the check above
+      txc->oncommits.push_back(c);
+      return false;
+    }
+  };
+
+  struct KVSyncThread : public Thread {
+    KStore *store;  ///< store whose _kv_sync_thread() this thread runs
+    explicit KVSyncThread(KStore *s) : store(s) {}
+    void *entry() override {
+      store->_kv_sync_thread();
+      return nullptr;  // the exit value carries no information
+    }
+  };
+
+  // --------------------------------------------------------
+  // members
+private:
+  KeyValueDB *db;
+  uuid_d fsid;
+  std::string basedir;
+  int path_fd;  ///< open handle to $path
+  int fsid_fd;  ///< open handle (locked) to $path/fsid
+  bool mounted;
+
+  /// rwlock to protect coll_map
+  ceph::shared_mutex coll_lock = ceph::make_shared_mutex("KStore::coll_lock");
+  ceph::unordered_map<coll_t, CollectionRef> coll_map;
+  std::map<coll_t,CollectionRef> new_coll_map;
+
+  std::mutex nid_lock;
+  uint64_t nid_last;
+  uint64_t nid_max;
+
+  Throttle throttle_ops, throttle_bytes;          ///< submit to commit
+
+  Finisher finisher;
+
+  KVSyncThread kv_sync_thread;
+  std::mutex kv_lock;
+  std::condition_variable kv_cond, kv_sync_cond;
+  bool kv_stop;
+  std::deque<TransContext*> kv_queue, kv_committing;
+
+  //Logger *logger;
+  PerfCounters *logger;
+  std::mutex reap_lock;
+  std::list<CollectionRef> removed_collections;
+
+
+  // --------------------------------------------------------
+  // private methods
+
+  void _init_logger();
+  void _shutdown_logger();
+
+  int _open_path();
+  void _close_path();
+  int _open_fsid(bool create);
+  int _lock_fsid();
+  int _read_fsid(uuid_d *f);
+  int _write_fsid();
+  void _close_fsid();
+  int _open_db(bool create);
+  void _close_db();
+  int _open_collections(int *errors=0);
+  void _close_collections();
+
+  int _open_super_meta();
+
+  CollectionRef _get_collection(coll_t cid);
+  void _queue_reap_collection(CollectionRef& c);
+  void _reap_collections();
+
+  void _assign_nid(TransContext *txc, OnodeRef o);
+
+  void _dump_onode(OnodeRef o);
+
+  TransContext *_txc_create(OpSequencer *osr);
+  void _txc_release(TransContext *txc, uint64_t offset, uint64_t length);
+  void _txc_add_transaction(TransContext *txc, Transaction *t);
+  void _txc_finalize(OpSequencer *osr, TransContext *txc);
+  void _txc_state_proc(TransContext *txc);
+  void _txc_finish_kv(TransContext *txc);
+  void _txc_finish(TransContext *txc);
+
+  void _osr_reap_done(OpSequencer *osr);
+
+  void _kv_sync_thread();
+  void _kv_stop() {  // ask the kv sync thread to stop, wait for it, then re-arm the flag
+    {
+      std::lock_guard<std::mutex> l(kv_lock);
+      kv_stop = true;  // set and signalled under kv_lock
+      kv_cond.notify_all();
+    }
+    kv_sync_thread.join();  // joined with kv_lock released
+    kv_stop = false;  // written without the lock; the thread has been joined by now
+  }
+
+  void _do_read_stripe(OnodeRef o, uint64_t offset, ceph::buffer::list *pbl, bool do_cache);
+  void _do_write_stripe(TransContext *txc, OnodeRef o,
+			uint64_t offset, ceph::buffer::list& bl);
+  void _do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset);
+
+  int _collection_list(
+    Collection *c, const ghobject_t& start, const ghobject_t& end,
+    int max, std::vector<ghobject_t> *ls, ghobject_t *next);
+
+public:
+  KStore(CephContext *cct, const std::string& path);
+  ~KStore() override;
+
+  std::string get_type() override {  // type name reported for this backend
+    return "kstore";
+  }
+
+  bool needs_journal() override { return false; }  // journal queries: all three answer "no"
+  bool wants_journal() override { return false; }
+  bool allows_journal() override { return false; }
+
+  static int get_block_device_fsid(const std::string& path, uuid_d *fsid);
+
+  bool test_mount_in_use() override;
+
+  int mount() override;
+  int umount() override;
+  void _sync();
+
+  int fsck(bool deep) override;
+
+
+  int validate_hobject_key(const hobject_t &obj) const override {  // every key is accepted; obj is not inspected
+    return 0;
+  }
+  unsigned get_max_attr_name_length() override {  // fixed advertised limit
+    return 256;  // arbitrary; there is no real limit internally
+  }
+
+  int mkfs() override;
+  int mkjournal() override {  // nothing to create; always reports success
+    return 0;
+  }
+  void dump_perf_counters(ceph::Formatter *f) override {  // emit the counters inside a "perf_counters" section
+    f->open_object_section("perf_counters");
+    logger->dump_formatted(f, false, false);  // NOTE(review): the two flags are defined by PerfCounters::dump_formatted -- confirm their meaning there
+    f->close_section();
+  }
+  void get_db_statistics(ceph::Formatter *f) override {  // forwarded to the underlying KeyValueDB
+    db->get_statistics(f);
+  }
+  int statfs(struct store_statfs_t *buf,
+             osd_alert_list_t* alerts = nullptr) override;
+  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+		  bool *per_pool_omap) override;
+
+  CollectionHandle open_collection(const coll_t& c) override;
+  CollectionHandle create_new_collection(const coll_t& c) override;
+  void set_collection_commit_queue(const coll_t& cid,
+				   ContextQueue *commit_queue) override {
+  }  // empty body: both arguments are ignored
+
+  using ObjectStore::exists;
+  bool exists(CollectionHandle& c, const ghobject_t& oid) override;
+  using ObjectStore::stat;
+  int stat(
+    CollectionHandle& c,
+    const ghobject_t& oid,
+    struct stat *st,
+    bool allow_eio = false) override; // struct stat?
+  int set_collection_opts(
+    CollectionHandle& c,
+    const pool_opts_t& opts) override;
+  using ObjectStore::read;
+  int read(
+    CollectionHandle& c,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    ceph::buffer::list& bl,
+    uint32_t op_flags = 0) override;
+  int _do_read(
+    OnodeRef o,
+    uint64_t offset,
+    size_t len,
+    ceph::buffer::list& bl,
+    bool do_cache,
+    uint32_t op_flags = 0);
+
+  using ObjectStore::fiemap;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, ceph::buffer::list& outbl) override;
+  using ObjectStore::getattr;
+  int getattr(CollectionHandle& c, const ghobject_t& oid, const char *name, ceph::buffer::ptr& value) override;
+  using ObjectStore::getattrs;
+  int getattrs(CollectionHandle& c,
+	       const ghobject_t& oid,
+	       std::map<std::string,ceph::buffer::ptr,std::less<>>& aset) override;
+
+  int list_collections(std::vector<coll_t>& ls) override;
+  bool collection_exists(const coll_t& c) override;
+  int collection_empty(CollectionHandle& c, bool *empty) override;
+  int collection_bits(CollectionHandle& c) override;
+  int collection_list(
+    CollectionHandle &c, const ghobject_t& start, const ghobject_t& end,
+    int max,
+    std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+  using ObjectStore::omap_get;
+  int omap_get(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value std::map
+    ) override;
+
+  using ObjectStore::omap_get_header;
+  /// Get omap header
+  int omap_get_header(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    bool allow_eio = false ///< [in] don't assert on eio
+    ) override;
+
+  using ObjectStore::omap_get_keys;
+  /// Get keys defined on oid
+  int omap_get_keys(
+    CollectionHandle& c,              ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    std::set<std::string> *keys      ///< [out] Keys defined on oid
+    ) override;
+
+  using ObjectStore::omap_get_values;
+  /// Get key values
+  int omap_get_values(
+    CollectionHandle& c,                    ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::set<std::string> &keys,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) override;
+
+  using ObjectStore::omap_check_keys;
+  /// Filters keys into out which are defined on oid
+  int omap_check_keys(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out         ///< [out] Subset of keys defined on oid
+    ) override;
+
+  using ObjectStore::get_omap_iterator;
+  ObjectMap::ObjectMapIterator get_omap_iterator(
+    CollectionHandle& c,              ///< [in] collection
+    const ghobject_t &oid  ///< [in] object
+    ) override;
+
+  void set_fsid(uuid_d u) override {  // only updates the in-memory fsid member
+    fsid = u;
+  }
+  uuid_d get_fsid() override {  // returns a copy of the in-memory fsid member
+    return fsid;
+  }
+
+  uint64_t estimate_objects_overhead(uint64_t num_objects) override {  // linear estimate, in bytes
+    return num_objects * 300; //assuming per-object overhead is 300 bytes
+  }
+
+  objectstore_perf_stat_t get_cur_stats() override {  // always a default-constructed stat object
+    return objectstore_perf_stat_t();
+  }
+  const PerfCounters* get_perf_counters() const override {  // read-only access to the `logger` member
+    return logger;
+  }
+
+
+  int queue_transactions(
+    CollectionHandle& ch,
+    std::vector<Transaction>& tls,
+    TrackedOpRef op = TrackedOpRef(),
+    ThreadPool::TPHandle *handle = NULL) override;
+
+  void compact () override {  // compact the underlying KeyValueDB
+    ceph_assert(db);  // the db handle must be set before this is called
+    db->compact();
+  }
+  
+private:
+  // --------------------------------------------------------
+  // write ops
+
+  int _write(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& o,
+	     uint64_t offset, size_t len,
+	     ceph::buffer::list& bl,
+	     uint32_t fadvise_flags);
+  int _do_write(TransContext *txc,
+		OnodeRef o,
+		uint64_t offset, uint64_t length,
+		ceph::buffer::list& bl,
+		uint32_t fadvise_flags);
+  int _touch(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& o);
+  int _zero(TransContext *txc,
+	    CollectionRef& c,
+	    OnodeRef& o,
+	    uint64_t offset, size_t len);
+  int _do_truncate(TransContext *txc,
+		   OnodeRef o,
+		   uint64_t offset);
+  int _truncate(TransContext *txc,
+		CollectionRef& c,
+		OnodeRef& o,
+		uint64_t offset);
+  int _remove(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& o);
+  int _do_remove(TransContext *txc,
+		 OnodeRef o);
+  int _setattr(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o,
+	       const std::string& name,
+	       ceph::buffer::ptr& val);
+  int _setattrs(TransContext *txc,
+		CollectionRef& c,
+		OnodeRef& o,
+		const std::map<std::string,ceph::buffer::ptr>& aset);
+  int _rmattr(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& o,
+	      const std::string& name);
+  int _rmattrs(TransContext *txc,
+	       CollectionRef& c,
+	       OnodeRef& o);
+  void _do_omap_clear(TransContext *txc, uint64_t id);
+  int _omap_clear(TransContext *txc,
+		  CollectionRef& c,
+		  OnodeRef& o);
+  int _omap_setkeys(TransContext *txc,
+		    CollectionRef& c,
+		    OnodeRef& o,
+		    ceph::buffer::list& bl);
+  int _omap_setheader(TransContext *txc,
+		      CollectionRef& c,
+		      OnodeRef& o,
+		      ceph::buffer::list& header);
+  int _omap_rmkeys(TransContext *txc,
+		   CollectionRef& c,
+		   OnodeRef& o,
+		   const ceph::buffer::list& bl);
+  int _omap_rmkey_range(TransContext *txc,
+			CollectionRef& c,
+			OnodeRef& o,
+			const std::string& first, const std::string& last);
+  int _setallochint(TransContext *txc,
+		    CollectionRef& c,
+		    OnodeRef& o,
+		    uint64_t expected_object_size,
+		    uint64_t expected_write_size,
+		    uint32_t flags);
+  int _clone(TransContext *txc,
+	     CollectionRef& c,
+	     OnodeRef& oldo,
+	     OnodeRef& newo);
+  int _clone_range(TransContext *txc,
+		   CollectionRef& c,
+		   OnodeRef& oldo,
+		   OnodeRef& newo,
+		   uint64_t srcoff, uint64_t length, uint64_t dstoff);
+  int _rename(TransContext *txc,
+	      CollectionRef& c,
+	      OnodeRef& oldo,
+	      OnodeRef& newo,
+	      const ghobject_t& new_oid);
+  int _create_collection(TransContext *txc, coll_t cid, unsigned bits,
+			 CollectionRef *c);
+  int _remove_collection(TransContext *txc, coll_t cid, CollectionRef *c);
+  int _split_collection(TransContext *txc,
+			CollectionRef& c,
+			CollectionRef& d,
+			unsigned bits, int rem);
+  int _merge_collection(TransContext *txc,
+			CollectionRef *c,
+			CollectionRef& d,
+			unsigned bits);
+
+};
+
+static inline void intrusive_ptr_add_ref(KStore::Onode *o) {  // hook that lets boost::intrusive_ptr<Onode> (OnodeRef) take a reference
+  o->get();
+}
+static inline void intrusive_ptr_release(KStore::Onode *o) {  // matching release hook; Onode::put() deletes on the last reference
+  o->put();
+}
+
+static inline void intrusive_ptr_add_ref(KStore::OpSequencer *o) {  // hook for OpSequencerRef; get() is presumably inherited from RefCountedObject -- confirm
+  o->get();
+}
+static inline void intrusive_ptr_release(KStore::OpSequencer *o) {  // matching release hook for OpSequencerRef
+  o->put();
+}
+
+#endif
diff --git a/src/os/kstore/kstore_types.cc b/src/os/kstore/kstore_types.cc
new file mode 100644
index 000000000..885c52b60
--- /dev/null
+++ b/src/os/kstore/kstore_types.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "kstore_types.h"
+#include "common/Formatter.h"
+#include "include/stringify.h"
+
+using std::list;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// cnode_t
+
+void kstore_cnode_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);  // NOTE(review): (1, 1) presumably struct version / compat version -- confirm against the macro
+  encode(bits, bl);  // the only encoded field
+  ENCODE_FINISH(bl);
+}
+
+void kstore_cnode_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);  // counterpart of ENCODE_START in encode()
+  decode(bits, p);  // mirrors encode(): bits is the only field
+  DECODE_FINISH(p);
+}
+
+void kstore_cnode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("bits", bits);  // single field, emitted under the key "bits"
+}
+
+void kstore_cnode_t::generate_test_instances(list<kstore_cnode_t*>& o)
+{
+  o.push_back(new kstore_cnode_t());     // default-constructed (bits == 0)
+  o.push_back(new kstore_cnode_t(0));    // explicit zero; same value as the default
+  o.push_back(new kstore_cnode_t(123));  // a non-zero value; raw pointers are left in `o` for the caller
+}
+
+
+// kstore_onode_t
+
+void kstore_onode_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);  // NOTE(review): (1, 1) presumably struct version / compat version -- confirm against the macro
+  encode(nid, bl);  // field order below is matched one-for-one by decode()
+  encode(size, bl);
+  encode(attrs, bl);
+  encode(omap_head, bl);
+  encode(stripe_size, bl);
+  encode(expected_object_size, bl);
+  encode(expected_write_size, bl);
+  encode(alloc_hint_flags, bl);
+  ENCODE_FINISH(bl);
+}
+
+void kstore_onode_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);  // counterpart of ENCODE_START in encode()
+  decode(nid, p);  // fields are read in exactly the order encode() wrote them
+  decode(size, p);
+  decode(attrs, p);
+  decode(omap_head, p);
+  decode(stripe_size, p);
+  decode(expected_object_size, p);
+  decode(expected_write_size, p);
+  decode(alloc_hint_flags, p);
+  DECODE_FINISH(p);
+}
+
+void kstore_onode_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("nid", nid);
+  f->dump_unsigned("size", size);
+  f->open_object_section("attrs");  // one nested "attr" section per attribute
+  for (const auto& [attr_name, attr_value] : attrs) {
+    f->open_object_section("attr");
+    f->dump_string("name", attr_name);
+    f->dump_unsigned("len", attr_value.length());  // only the length is dumped, not the bytes
+    f->close_section();
+  }
+  f->close_section();
+  f->dump_unsigned("omap_head", omap_head);
+  f->dump_unsigned("stripe_size", stripe_size);
+  f->dump_unsigned("expected_object_size", expected_object_size);
+  f->dump_unsigned("expected_write_size", expected_write_size);
+  f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
+}
+
+void kstore_onode_t::generate_test_instances(list<kstore_onode_t*>& o)
+{
+  o.push_back(new kstore_onode_t());  // only a default-constructed instance so far
+  // FIXME: add instances with non-default field values
+}
diff --git a/src/os/kstore/kstore_types.h b/src/os/kstore/kstore_types.h
new file mode 100644
index 000000000..f264642e2
--- /dev/null
+++ b/src/os/kstore/kstore_types.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OSD_KSTORE_TYPES_H
+#define CEPH_OSD_KSTORE_TYPES_H
+
+#include <ostream>
+#include "include/types.h"
+#include "include/interval_set.h"
+#include "include/utime.h"
+#include "common/hobject.h"
+
+namespace ceph {
+  class Formatter;
+}
+/// collection metadata
+struct kstore_cnode_t {
+  uint32_t bits;   ///< how many bits of coll pgid are significant
+
+  explicit kstore_cnode_t(int b=0) : bits(b) {}  // b is an int and is converted to uint32_t when stored
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<kstore_cnode_t*>& o);  // appends heap-allocated samples to o
+};
+WRITE_CLASS_ENCODER(kstore_cnode_t)
+
+/// onode: per-object metadata
+struct kstore_onode_t {
+  uint64_t nid;                        ///< numeric id (locally unique)
+  uint64_t size;                       ///< object size
+  std::map<std::string, ceph::buffer::ptr, std::less<>> attrs;        ///< attrs
+  uint64_t omap_head;                  ///< id for omap root node
+  uint32_t stripe_size;                ///< stripe size
+
+  uint32_t expected_object_size;       // NOTE(review): these three presumably hold allocation-hint values -- confirm where they are written
+  uint32_t expected_write_size;
+  uint32_t alloc_hint_flags;
+
+  kstore_onode_t()  // every scalar starts at zero; attrs starts empty
+    : nid(0),
+      size(0),
+      omap_head(0),
+      stripe_size(0),
+      expected_object_size(0),
+      expected_write_size(0),
+      alloc_hint_flags(0) {}
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<kstore_onode_t*>& o);  // appends heap-allocated samples to o
+};
+WRITE_CLASS_ENCODER(kstore_onode_t)
+
+#endif
diff --git a/src/os/kv.h b/src/os/kv.h
new file mode 100644
index 000000000..64048b088
--- /dev/null
+++ b/src/os/kv.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_KV_H
+#define CEPH_OS_KV_H
+
+#include <string>
+#include "include/byteorder.h"
+
+// some key encoding helpers
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, T *key) {  // append the 4 key bytes of u to *key
+  uint32_t be;  // u in key byte order: as-is on big-endian hosts, swab()'d on little-endian
+#ifdef CEPH_BIG_ENDIAN
+  be = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+  be = swab(u);
+#else
+# error wtf
+#endif
+  key->append((char*)&be, 4);
+}
+
+template<typename T>
+inline static void _key_encode_u32(uint32_t u, size_t pos, T *key) {  // overwrite 4 bytes of *key at pos
+  uint32_t be;  // u in key byte order: as-is on big-endian hosts, swab()'d on little-endian
+#ifdef CEPH_BIG_ENDIAN
+  be = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+  be = swab(u);
+#else
+# error wtf
+#endif
+  key->replace(pos, sizeof(be), (char*)&be, sizeof(be));
+}
+
+inline static const char *_key_decode_u32(const char *key, uint32_t *pu) {  // read 4 key bytes into *pu
+  uint32_t raw;  // bytes exactly as stored in the key
+  memcpy(&raw, key, sizeof(raw));
+#ifdef CEPH_BIG_ENDIAN
+  *pu = raw;
+#elif defined(CEPH_LITTLE_ENDIAN)
+  *pu = swab(raw);
+#else
+# error wtf
+#endif
+  return key + sizeof(raw);  // first byte after the decoded value
+}
+
+template<typename T>
+inline static void _key_encode_u64(uint64_t u, T *key) {  // append the 8 key bytes of u to *key
+  uint64_t be;  // u in key byte order: as-is on big-endian hosts, swab()'d on little-endian
+#ifdef CEPH_BIG_ENDIAN
+  be = u;
+#elif defined(CEPH_LITTLE_ENDIAN)
+  be = swab(u);
+#else
+# error wtf
+#endif
+  key->append((char*)&be, 8);
+}
+
+inline static const char *_key_decode_u64(const char *key, uint64_t *pu) {  // read 8 key bytes into *pu
+  uint64_t raw;  // bytes exactly as stored in the key
+  memcpy(&raw, key, sizeof(raw));
+#ifdef CEPH_BIG_ENDIAN
+  *pu = raw;
+#elif defined(CEPH_LITTLE_ENDIAN)
+  *pu = swab(raw);
+#else
+# error wtf
+#endif
+  return key + sizeof(raw);  // first byte after the decoded value
+}
+
+#endif
diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc
new file mode 100644
index 000000000..99e99dcba
--- /dev/null
+++ b/src/os/memstore/MemStore.cc
@@ -0,0 +1,1824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#include <cstdint>
+
+#include "include/types.h"
+#include "include/stringify.h"
+#include "include/unordered_map.h"
+#include "common/errno.h"
+#include "MemStore.h"
+#include "include/compat.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_memstore
+#undef dout_prefix
+#define dout_prefix *_dout << "memstore(" << path << ") "
+
+using ceph::decode;
+using ceph::encode;
+
+// for comparing collections for lock ordering
+// Orders two collection references by the address of the Collection each one
+// points to.  Returns true when l's address is numerically greater than r's.
+bool operator>(const MemStore::CollectionRef& l,
+               const MemStore::CollectionRef& r)
+{
+  // uintptr_t can hold any object pointer; unsigned long is only 32 bits on
+  // LLP64 platforms and would truncate the addresses being compared.
+  return reinterpret_cast<uintptr_t>(l.get()) >
+         reinterpret_cast<uintptr_t>(r.get());
+}
+
+
+// Load the persisted collections and, if that succeeds, start the finisher.
+// Returns 0 on success or the negative error from _load().
+int MemStore::mount()
+{
+  const int rc = _load();
+  if (rc >= 0) {
+    finisher.start();
+    return 0;
+  }
+  return rc;
+}
+
+// Drain and stop the finisher, then write the store contents out via _save().
+// Returns the result of _save().
+int MemStore::umount()
+{
+  finisher.wait_for_empty();
+  finisher.stop();
+  const int rc = _save();
+  return rc;
+}
+
+// Write each collection to its own file under `path` (named after the
+// collection id), then write the set of collection ids to `path`/collections.
+// Returns 0 on success or the first negative error from write_file().
+int MemStore::_save()
+{
+  dout(10) << __func__ << dendl;
+  dump_all();
+
+  std::set<coll_t> cids;
+  for (const auto& entry : coll_map) {
+    const auto& cid = entry.first;
+    const auto& coll = entry.second;
+    dout(20) << __func__ << " coll " << cid << " " << coll << dendl;
+    cids.insert(cid);
+    ceph_assert(coll);
+
+    ceph::buffer::list coll_bl;
+    coll->encode(coll_bl);
+    const std::string coll_fn = path + "/" + stringify(cid);
+    const int wr = coll_bl.write_file(coll_fn.c_str());
+    if (wr < 0)
+      return wr;
+  }
+
+  // the index of collection ids is written last
+  ceph::buffer::list index_bl;
+  encode(cids, index_bl);
+  const std::string index_fn = path + "/collections";
+  const int wr = index_bl.write_file(index_fn.c_str());
+  return wr < 0 ? wr : 0;
+}
+
+// Log the whole store (as produced by dump()) in "json-pretty" format at
+// debug level 0.
+void MemStore::dump_all()
+{
+  auto fmt = ceph::Formatter::create("json-pretty");
+  fmt->open_object_section("store");
+  dump(fmt);
+  fmt->close_section();
+  // the formatter output is flushed into the log entry opened by dout(0)
+  dout(0) << "dump:";
+  fmt->flush(*_dout);
+  *_dout << dendl;
+  delete fmt;
+}
+
+// Emit a "collections" array into `f`.  Each collection entry carries its
+// name, its xattrs (name and length only) and its objects (name plus whatever
+// the object's own dump() writes).
+void MemStore::dump(ceph::Formatter *f)
+{
+  f->open_array_section("collections");
+  for (const auto& coll_entry : coll_map) {
+    const auto& coll = coll_entry.second;
+    f->open_object_section("collection");
+    f->dump_string("name", stringify(coll_entry.first));
+
+    f->open_array_section("xattrs");
+    for (const auto& xa : coll->xattr) {
+      f->open_object_section("xattr");
+      f->dump_string("name", xa.first);
+      f->dump_int("length", xa.second.length());
+      f->close_section();
+    }
+    f->close_section();  // xattrs
+
+    f->open_array_section("objects");
+    for (const auto& obj_entry : coll->object_map) {
+      f->open_object_section("object");
+      f->dump_string("name", stringify(obj_entry.first));
+      // a null object reference contributes only its name
+      if (obj_entry.second)
+        obj_entry.second->dump(f);
+      f->close_section();
+    }
+    f->close_section();  // objects
+
+    f->close_section();  // collection
+  }
+  f->close_section();  // collections
+}
+
+// Read `path`/collections to learn the set of collection ids, then read and
+// decode each collection from its own file, registering it in coll_map and
+// adding its size to used_bytes.  Returns 0 on success or the first negative
+// error from read_file().
+int MemStore::_load()
+{
+  dout(10) << __func__ << dendl;
+
+  std::string err;
+  ceph::buffer::list index_bl;
+  const std::string index_fn = path + "/collections";
+  int r = index_bl.read_file(index_fn.c_str(), &err);
+  if (r < 0)
+    return r;
+
+  std::set<coll_t> cids;
+  auto index_it = index_bl.cbegin();
+  decode(cids, index_it);
+
+  for (const coll_t& cid : cids) {
+    const std::string coll_fn = path + "/" + stringify(cid);
+    ceph::buffer::list coll_bl;
+    r = coll_bl.read_file(coll_fn.c_str(), &err);
+    if (r < 0)
+      return r;
+
+    auto coll = ceph::make_ref<Collection>(cct, cid);
+    auto coll_it = coll_bl.cbegin();
+    coll->decode(coll_it);
+    coll_map[cid] = coll;
+    used_bytes += coll->used_bytes();
+  }
+
+  dump_all();
+
+  return 0;
+}
+
+// Store `u` in the "fsid" meta entry; asserts that the write succeeded.
+void MemStore::set_fsid(uuid_d u)
+{
+  const std::string fsid_str = stringify(u);
+  int r = write_meta("fsid", fsid_str);
+  ceph_assert(r >= 0);
+}
+
+// Read the "fsid" meta entry and parse it into a uuid.  Asserts if the entry
+// cannot be read or does not parse.
+uuid_d MemStore::get_fsid()
+{
+  std::string text;
+  int r = read_meta("fsid", &text);
+  ceph_assert(r >= 0);
+
+  uuid_d parsed;
+  bool parsed_ok = parsed.parse(text.c_str());
+  ceph_assert(parsed_ok);
+  return parsed;
+}
+
+// Initialise an empty store: make sure an "fsid" meta entry exists (a random
+// one is generated if missing), write an empty collection index to
+// `path`/collections and record the store type as "memstore".
+// Returns 0 on success or a negative error code.
+int MemStore::mkfs()
+{
+  std::string fsid_str;
+  int r = read_meta("fsid", &fsid_str);
+  if (r < 0 && r != -ENOENT)
+    return r;
+
+  if (r == -ENOENT) {
+    // no fsid yet: create and persist one
+    uuid_d fsid;
+    fsid.generate_random();
+    fsid_str = stringify(fsid);
+    r = write_meta("fsid", fsid_str);
+    if (r < 0)
+      return r;
+    dout(1) << __func__ << " new fsid " << fsid_str << dendl;
+  } else {
+    dout(1) << __func__ << " had fsid " << fsid_str << dendl;
+  }
+
+  const std::string index_fn = path + "/collections";
+  derr << path << dendl;
+  std::set<coll_t> no_collections;
+  ceph::buffer::list index_bl;
+  encode(no_collections, index_bl);
+  r = index_bl.write_file(index_fn.c_str());
+  if (r < 0)
+    return r;
+
+  r = write_meta("type", "memstore");
+  return r < 0 ? r : 0;
+}
+
+// Fill `st` with the configured capacity (memstore_device_bytes) and the
+// space still available (capacity minus used_bytes, never below 0).
+// `alerts`, when given, is cleared.  Always returns 0.
+int MemStore::statfs(struct store_statfs_t *st, osd_alert_list_t* alerts)
+{
+  dout(10) << __func__ << dendl;
+  if (alerts != nullptr) {
+    // no alerts are reported for now
+    alerts->clear();
+  }
+
+  st->reset();
+  st->total = cct->_conf->memstore_device_bytes;
+  st->available = std::max<int64_t>(st->total - used_bytes, 0);
+
+  dout(10) << __func__ << ": used_bytes: " << used_bytes
+           << "/" << cct->_conf->memstore_device_bytes << dendl;
+  return 0;
+}
+
+// Per-pool statistics are not provided by this store; always returns -ENOTSUP
+// and leaves the output parameters untouched.
+int MemStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+                          bool *per_pool_omap)
+{
+  const int unsupported = -ENOTSUP;
+  return unsupported;
+}
+
+// Returns a default-constructed perf stat object; no real statistics are
+// gathered yet.
+objectstore_perf_stat_t MemStore::get_cur_stats()
+{
+  // fixme
+  objectstore_perf_stat_t stats = objectstore_perf_stat_t();
+  return stats;
+}
+
+// Look up `cid` in coll_map under a shared coll_lock.  Returns the collection
+// reference, or an empty reference when the collection is not present.
+MemStore::CollectionRef MemStore::get_collection(const coll_t& cid)
+{
+  std::shared_lock l{coll_lock};
+  auto found = coll_map.find(cid);
+  if (found != coll_map.end())
+    return found->second;
+  return CollectionRef();
+}
+
+// Create a Collection object for `cid`, record it in new_coll_map (under an
+// exclusive coll_lock) and return a handle to it.  coll_map is not touched
+// here.
+ObjectStore::CollectionHandle MemStore::create_new_collection(const coll_t& cid)
+{
+  std::lock_guard l{coll_lock};
+  auto fresh = ceph::make_ref<Collection>(cct, cid);
+  new_coll_map[cid] = fresh;
+  return fresh;
+}
+
+
+// ---------------
+// read operations
+
+// Returns true when the collection is marked as existing and contains `oid`.
+bool MemStore::exists(CollectionHandle &c_, const ghobject_t& oid)
+{
+  auto *coll = static_cast<Collection*>(c_.get());
+  dout(10) << __func__ << " " << coll->get_cid() << " " << oid << dendl;
+  if (!coll->exists)
+    return false;
+
+  // get_object() yields an empty reference for a missing object; the
+  // explicit conversion turns "non-empty" into true.
+  ObjectRef obj = coll->get_object(oid);
+  return static_cast<bool>(obj);
+}
+
+// Fill in the size-related fields of `st` for object `oid`: st_size,
+// st_blksize (fixed at 4096), st_blocks (size rounded up to whole blocks) and
+// st_nlink (1).  Other fields of `st` are left untouched.
+// Returns 0, or -ENOENT when the collection or the object does not exist.
+int MemStore::stat(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  struct stat *st,
+  bool allow_eio)
+{
+  auto *coll = static_cast<Collection*>(c_.get());
+  dout(10) << __func__ << " " << coll->cid << " " << oid << dendl;
+  if (!coll->exists)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  st->st_size = obj->get_size();
+  st->st_blksize = 4096;
+  // round up to a whole number of blocks
+  st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize;
+  st->st_nlink = 1;
+  return 0;
+}
+
+// Collection options are not supported by this store; always returns
+// -EOPNOTSUPP.
+int MemStore::set_collection_opts(
+  CollectionHandle& ch,
+  const pool_opts_t& opts)
+{
+  const int unsupported = -EOPNOTSUPP;
+  return unsupported;
+}
+
+// Read up to `len` bytes of object `oid` starting at `offset` into `bl`
+// (which is cleared first).  A request with offset == 0 and len == 0 reads
+// the entire object; a request reaching past the end is shortened.
+// Returns the result of the object's read(), 0 when `offset` is at or past
+// the end of the object, or -ENOENT when the collection or object is missing.
+int MemStore::read(
+  CollectionHandle &c_,
+  const ghobject_t& oid,
+  uint64_t offset,
+  size_t len,
+  ceph::buffer::list& bl,
+  uint32_t op_flags)
+{
+  auto *coll = static_cast<Collection*>(c_.get());
+  dout(10) << __func__ << " " << coll->cid << " " << oid << " "
+           << offset << "~" << len << dendl;
+  if (!coll->exists)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  const auto size = obj->get_size();
+  if (offset >= size)
+    return 0;
+
+  size_t want = len;
+  if (want == 0 && offset == 0) {
+    // note: len == 0 means read the entire object
+    want = size;
+  } else if (offset + want > size) {
+    want = size - offset;
+  }
+
+  bl.clear();
+  return obj->read(offset, want, bl);
+}
+
+// Compute the extent map for the requested range via the std::map overload
+// and, when that succeeds, append its encoded form to `bl`.
+// Returns the result of the map overload.
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                     uint64_t offset, size_t len, ceph::buffer::list& bl)
+{
+  std::map<uint64_t, uint64_t> extents;
+  const int r = fiemap(ch, oid, offset, len, extents);
+  if (r < 0)
+    return r;
+  encode(extents, bl);
+  return r;
+}
+
+// Report the part of [offset, offset + len) that lies inside object `oid` as
+// a single extent: destmap[offset] = length.  Nothing is added when `offset`
+// is at or beyond the end of the object.
+// Returns 0 on success, or -ENOENT when the handle is empty or the object
+// does not exist.
+int MemStore::fiemap(CollectionHandle& ch, const ghobject_t& oid,
+                     uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap)
+{
+  // validate the handle before anything dereferences it (including logging)
+  Collection *c = static_cast<Collection*>(ch.get());
+  if (!c)
+    return -ENOENT;
+  dout(10) << __func__ << " " << c->cid << " " << oid << " " << offset << "~"
+           << len << dendl;
+
+  ObjectRef o = c->get_object(oid);
+  if (!o)
+    return -ENOENT;
+
+  const uint64_t size = o->get_size();
+  if (offset >= size)
+    return 0;
+
+  // clamp against the remaining bytes instead of computing offset + len,
+  // which could wrap around for very large len
+  uint64_t extent_len = size - offset;
+  if (len < extent_len)
+    extent_len = len;
+  destmap[offset] = extent_len;
+  return 0;
+}
+
+// Copy the xattr called `name` of object `oid` into `value`.
+// Returns 0 on success, -ENOENT when the collection or object is missing and
+// -ENODATA when the object has no such xattr.
+int MemStore::getattr(CollectionHandle &c_, const ghobject_t& oid,
+                      const char *name, ceph::buffer::ptr& value)
+{
+  auto *coll = static_cast<Collection*>(c_.get());
+  dout(10) << __func__ << " " << coll->cid << " " << oid << " " << name << dendl;
+  if (!coll->exists)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  const std::string attr_name(name);
+  std::lock_guard lock{obj->xattr_mutex};
+  auto found = obj->xattr.find(attr_name);
+  if (found == obj->xattr.end()) {
+    return -ENODATA;
+  }
+  value = found->second;
+  return 0;
+}
+
+// Copy all xattrs of object `oid` into `aset`, replacing its previous
+// contents.  Returns 0, or -ENOENT when the collection or object is missing.
+int MemStore::getattrs(CollectionHandle &c_, const ghobject_t& oid,
+                       std::map<std::string,ceph::buffer::ptr,std::less<>>& aset)
+{
+  auto *coll = static_cast<Collection*>(c_.get());
+  dout(10) << __func__ << " " << coll->cid << " " << oid << dendl;
+  if (!coll->exists)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  // the copy is taken while holding the object's xattr mutex
+  std::lock_guard lock{obj->xattr_mutex};
+  aset = obj->xattr;
+  return 0;
+}
+
+// Append the id of every collection in coll_map to `ls` (existing entries of
+// `ls` are kept).  Always returns 0.
+int MemStore::list_collections(std::vector<coll_t>& ls)
+{
+  dout(10) << __func__ << dendl;
+  std::shared_lock l{coll_lock};
+  for (const auto& entry : coll_map) {
+    ls.push_back(entry.first);
+  }
+  return 0;
+}
+
+// Returns true when `cid` is present in coll_map.
+bool MemStore::collection_exists(const coll_t& cid)
+{
+  dout(10) << __func__ << " " << cid << dendl;
+  std::shared_lock l{coll_lock};
+  const bool present = coll_map.find(cid) != coll_map.end();
+  return present;
+}
+
+// Set `*empty` to whether the collection currently holds no objects.
+// Always returns 0.
+int MemStore::collection_empty(CollectionHandle& ch, bool *empty)
+{
+  dout(10) << __func__ << " " << ch->cid << dendl;
+  CollectionRef coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+  const bool no_objects = coll->object_map.empty();
+  *empty = no_objects;
+  return 0;
+}
+
+// Returns the `bits` value stored in the collection, read under a shared
+// collection lock.
+int MemStore::collection_bits(CollectionHandle& ch)
+{
+  dout(10) << __func__ << " " << ch->cid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+  const int bits = coll->bits;
+  return bits;
+}
+
+// Append to `*ls` the object ids in [start, end), in object_map order, adding
+// entries only while `*ls` holds fewer than `max` elements.  When `next` is
+// given it receives the first id that was not returned, or
+// ghobject_t::get_max() when the end of the collection was reached.
+// Always returns 0.
+int MemStore::collection_list(CollectionHandle& ch,
+                              const ghobject_t& start,
+                              const ghobject_t& end,
+                              int max,
+                              std::vector<ghobject_t> *ls, ghobject_t *next)
+{
+  auto *coll = static_cast<Collection*>(ch.get());
+  std::shared_lock l{coll->lock};
+
+  dout(10) << __func__ << " cid " << ch->cid << " start " << start
+           << " end " << end << dendl;
+
+  auto cursor = coll->object_map.lower_bound(start);
+  for (;
+       cursor != coll->object_map.end() &&
+         ls->size() < (unsigned)max &&
+         cursor->first < end;
+       ++cursor) {
+    ls->push_back(cursor->first);
+  }
+
+  if (next != NULL) {
+    *next = (cursor == coll->object_map.end())
+      ? ghobject_t::get_max()
+      : cursor->first;
+  }
+  dout(10) << __func__ << " cid " << ch->cid << " got " << ls->size() << dendl;
+  return 0;
+}
+
+// Copy the omap header and the complete omap of object `oid` into the output
+// parameters.  Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_get(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  ceph::buffer::list *header,      ///< [out] omap header
+  std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  // both copies are taken under the object's omap mutex
+  std::lock_guard lock{obj->omap_mutex};
+  *header = obj->omap_header;
+  *out = obj->omap;
+  return 0;
+}
+
+// Copy the omap header of object `oid` into `*header`.
+// Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_get_header(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  ceph::buffer::list *header,      ///< [out] omap header
+  bool allow_eio ///< [in] don't assert on eio
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->omap_mutex};
+  *header = obj->omap_header;
+  return 0;
+}
+
+// Insert every omap key of object `oid` into `*keys` (existing entries are
+// kept).  Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_get_keys(
+  CollectionHandle& ch,              ///< [in] Collection containing oid
+  const ghobject_t &oid, ///< [in] Object containing omap
+  std::set<std::string> *keys      ///< [out] Keys defined on oid
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->omap_mutex};
+  for (const auto& kv : obj->omap) {
+    keys->insert(kv.first);
+  }
+  return 0;
+}
+
+// For each requested key that exists in the omap of object `oid`, insert the
+// key/value pair into `*out`.  Missing keys are silently skipped.
+// Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_get_values(
+  CollectionHandle& ch,                    ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const std::set<std::string> &keys,     ///< [in] Keys to get
+  std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->omap_mutex};
+  for (const std::string& wanted : keys) {
+    auto hit = obj->omap.find(wanted);
+    if (hit == obj->omap.end())
+      continue;
+    out->insert(*hit);
+  }
+  return 0;
+}
+
+#ifdef WITH_SEASTAR
+// Insert into `*out` every omap key/value pair of object `oid` whose key is
+// strictly greater than `*start_after`.  When `start_after` holds no value
+// the listing starts at the first key.
+// Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_get_values(
+  CollectionHandle& ch,                    ///< [in] Collection containing oid
+  const ghobject_t &oid,       ///< [in] Object containing omap
+  const std::optional<std::string> &start_after,     ///< [in] List keys after this one; empty means from the start
+  std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *c = static_cast<Collection*>(ch.get());
+  ObjectRef o = c->get_object(oid);
+  if (!o)
+    return -ENOENT;
+  std::lock_guard lock{o->omap_mutex};
+  // Do not dereference an empty optional: a plain assert() vanishes in
+  // NDEBUG builds and would leave undefined behaviour behind.
+  auto it = start_after ? o->omap.upper_bound(*start_after)
+                        : o->omap.begin();
+  for (; it != std::end(o->omap); ++it) {
+    out->insert(*it);
+  }
+  return 0;
+}
+#endif
+
+// Insert into `*out` each of the requested keys that exists in the omap of
+// object `oid`.  Returns 0, or -ENOENT when the object does not exist.
+int MemStore::omap_check_keys(
+  CollectionHandle& ch,                ///< [in] Collection containing oid
+  const ghobject_t &oid,   ///< [in] Object containing omap
+  const std::set<std::string> &keys, ///< [in] Keys to check
+  std::set<std::string> *out         ///< [out] Subset of keys defined on oid
+  )
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  auto *coll = static_cast<Collection*>(ch.get());
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->omap_mutex};
+  for (const std::string& candidate : keys) {
+    if (obj->omap.find(candidate) == obj->omap.end())
+      continue;
+    out->insert(candidate);
+  }
+  return 0;
+}
+
+// Iterator over the omap of a single object.  It holds references to both the
+// collection and the object, and every positioning or access method takes the
+// object's omap_mutex for the duration of the call.
+class MemStore::OmapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
+  using omap_iter_t = std::map<std::string,ceph::buffer::list>::iterator;
+
+  CollectionRef coll;   // not read by any method; held as a reference only
+  ObjectRef obj;        // object whose omap is being walked
+  omap_iter_t cur;      // current position inside obj->omap
+public:
+  // Starts positioned at the first omap entry.
+  OmapIteratorImpl(CollectionRef c, ObjectRef o)
+    : coll(c), obj(o), cur(obj->omap.begin()) {}
+
+  // Move to the first entry.
+  int seek_to_first() override {
+    std::lock_guard lock{obj->omap_mutex};
+    cur = obj->omap.begin();
+    return 0;
+  }
+  // Move to the first entry whose key is greater than `after`.
+  int upper_bound(const std::string &after) override {
+    std::lock_guard lock{obj->omap_mutex};
+    cur = obj->omap.upper_bound(after);
+    return 0;
+  }
+  // Move to the first entry whose key is not less than `to`.
+  int lower_bound(const std::string &to) override {
+    std::lock_guard lock{obj->omap_mutex};
+    cur = obj->omap.lower_bound(to);
+    return 0;
+  }
+  // True while the iterator points at an entry.
+  bool valid() override {
+    std::lock_guard lock{obj->omap_mutex};
+    const bool at_end = (cur == obj->omap.end());
+    return !at_end;
+  }
+  // Advance by one entry; the caller is expected to have checked valid().
+  int next() override {
+    std::lock_guard lock{obj->omap_mutex};
+    ++cur;
+    return 0;
+  }
+  // Key of the current entry (copied).
+  std::string key() override {
+    std::lock_guard lock{obj->omap_mutex};
+    return cur->first;
+  }
+  // Value of the current entry (copied).
+  ceph::buffer::list value() override {
+    std::lock_guard lock{obj->omap_mutex};
+    return cur->second;
+  }
+  // This iterator never reports an error.
+  int status() override {
+    return 0;
+  }
+};
+
+// Create an omap iterator for object `oid`.  Returns an empty iterator handle
+// when the object does not exist.
+ObjectMap::ObjectMapIterator MemStore::get_omap_iterator(
+  CollectionHandle& ch,
+  const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << ch->cid << " " << oid << dendl;
+  Collection *coll = static_cast<Collection*>(ch.get());
+  ObjectRef obj = coll->get_object(oid);
+  if (obj)
+    return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(coll, obj));
+  return ObjectMap::ObjectMapIterator();
+}
+
+
+// ---------------
+// write operations
+
+// Apply every transaction in `tls` synchronously, in order, while holding the
+// collection's sequencer mutex, then dispatch the completion contexts:
+// on_apply_sync is completed inline, on_apply and on_commit are queued on the
+// finisher.  Always returns 0.
+int MemStore::queue_transactions(
+  CollectionHandle& ch,
+  std::vector<Transaction>& tls,
+  TrackedOpRef op,
+  ThreadPool::TPHandle *handle)
+{
+  // because memstore operations are synchronous, we can implement the
+  // Sequencer with a mutex. this guarantees ordering on a given sequencer,
+  // while allowing operations on different sequencers to happen in parallel
+  auto *coll = static_cast<Collection*>(ch.get());
+  std::unique_lock lock{coll->sequencer_mutex};
+
+  for (Transaction& txn : tls) {
+    // poke the TPHandle heartbeat just to exercise that code path
+    if (handle)
+      handle->reset_tp_timeout();
+
+    _do_transaction(txn);
+  }
+
+  Context *on_apply = nullptr;
+  Context *on_apply_sync = nullptr;
+  Context *on_commit = nullptr;
+  ObjectStore::Transaction::collect_contexts(tls, &on_apply, &on_commit,
+                                             &on_apply_sync);
+  if (on_apply_sync)
+    on_apply_sync->complete(0);
+  if (on_apply)
+    finisher.queue(on_apply);
+  if (on_commit)
+    finisher.queue(on_commit);
+  return 0;
+}
+
+// Decode and execute every op contained in transaction `t`, in order.
+// Each op is dispatched to the matching MemStore::_xxx helper.  A negative
+// result is tolerated for -ENODATA and for -ENOENT on ops other than the
+// clone ops and OP_COLL_ADD; any other failure is logged together with a
+// dump of the transaction and the process aborts.
+void MemStore::_do_transaction(Transaction& t)
+{
+  Transaction::iterator i = t.begin();
+  // index of the op being processed, reported in error messages
+  int pos = 0;
+
+  while (i.have_op()) {
+    Transaction::Op *op = i.decode_op();
+    int r = 0;
+
+    switch (op->op) {
+    case Transaction::OP_NOP:
+      break;
+    case Transaction::OP_TOUCH:
+    case Transaction::OP_CREATE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _touch(cid, oid);
+      }
+      break;
+
+    case Transaction::OP_WRITE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	uint32_t fadvise_flags = i.get_fadvise_flags();
+        ceph::buffer::list bl;
+        i.decode_bl(bl);
+	r = _write(cid, oid, off, len, bl, fadvise_flags);
+      }
+      break;
+
+    case Transaction::OP_ZERO:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	r = _zero(cid, oid, off, len);
+      }
+      break;
+
+    case Transaction::OP_TRIMCACHE:
+      {
+        // deprecated, no-op
+      }
+      break;
+
+    case Transaction::OP_TRUNCATE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        uint64_t off = op->off;
+	r = _truncate(cid, oid, off);
+      }
+      break;
+
+    case Transaction::OP_REMOVE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _remove(cid, oid);
+      }
+      break;
+
+    case Transaction::OP_SETATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        std::string name = i.decode_string();
+        ceph::buffer::list bl;
+        i.decode_bl(bl);
+	// a single attribute is applied through the multi-attribute helper
+	std::map<std::string, ceph::buffer::ptr> to_set;
+	to_set[name] = ceph::buffer::ptr(bl.c_str(), bl.length());
+	r = _setattrs(cid, oid, to_set);
+      }
+      break;
+
+    case Transaction::OP_SETATTRS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	std::map<std::string, ceph::buffer::ptr> aset;
+        i.decode_attrset(aset);
+	r = _setattrs(cid, oid, aset);
+      }
+      break;
+
+    case Transaction::OP_RMATTR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        std::string name = i.decode_string();
+	r = _rmattr(cid, oid, name.c_str());
+      }
+      break;
+
+    case Transaction::OP_RMATTRS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _rmattrs(cid, oid);
+      }
+      break;
+
+    case Transaction::OP_CLONE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+	r = _clone(cid, oid, noid);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+        uint64_t off = op->off;
+        uint64_t len = op->len;
+	// the same offset is used in the source and in the destination
+	r = _clone_range(cid, oid, noid, off, len, off);
+      }
+      break;
+
+    case Transaction::OP_CLONERANGE2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ghobject_t noid = i.get_oid(op->dest_oid);
+        uint64_t srcoff = op->off;
+        uint64_t len = op->len;
+        uint64_t dstoff = op->dest_off;
+	r = _clone_range(cid, oid, noid, srcoff, len, dstoff);
+      }
+      break;
+
+    case Transaction::OP_MKCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	r = _create_collection(cid, op->split_bits);
+      }
+      break;
+
+    case Transaction::OP_COLL_HINT:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t type = op->hint;
+        ceph::buffer::list hint;
+        i.decode_bl(hint);
+        auto hiter = hint.cbegin();
+        if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) {
+          uint32_t pg_num;
+          uint64_t num_objs;
+          decode(pg_num, hiter);
+          decode(num_objs, hiter);
+          r = _collection_hint_expected_num_objs(cid, pg_num, num_objs);
+        } else {
+          // Ignore the hint
+          dout(10) << "Unrecognized collection hint type: " << type << dendl;
+        }
+      }
+      break;
+
+    case Transaction::OP_RMCOLL:
+      {
+        coll_t cid = i.get_cid(op->cid);
+	r = _destroy_collection(cid);
+      }
+      break;
+
+    case Transaction::OP_COLL_ADD:
+      {
+        coll_t ocid = i.get_cid(op->cid);
+        coll_t ncid = i.get_cid(op->dest_cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _collection_add(ncid, ocid, oid);
+      }
+      break;
+
+    case Transaction::OP_COLL_REMOVE:
+       {
+        // handled exactly like OP_REMOVE
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _remove(cid, oid);
+       }
+      break;
+
+    case Transaction::OP_COLL_MOVE:
+      ceph_abort_msg("deprecated");
+      break;
+
+    case Transaction::OP_COLL_MOVE_RENAME:
+      {
+        coll_t oldcid = i.get_cid(op->cid);
+        ghobject_t oldoid = i.get_oid(op->oid);
+        coll_t newcid = i.get_cid(op->dest_cid);
+        ghobject_t newoid = i.get_oid(op->dest_oid);
+	r = _collection_move_rename(oldcid, oldoid, newcid, newoid);
+	// -ENOENT from the helper is not treated as a failure here
+	if (r == -ENOENT)
+	  r = 0;
+      }
+      break;
+
+    case Transaction::OP_TRY_RENAME:
+      {
+        // rename within a single collection
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oldoid = i.get_oid(op->oid);
+        ghobject_t newoid = i.get_oid(op->dest_oid);
+	r = _collection_move_rename(cid, oldoid, cid, newoid);
+	if (r == -ENOENT)
+	  r = 0;
+      }
+      break;
+
+    case Transaction::OP_COLL_SETATTR:
+      {
+	ceph_abort_msg("not implemented");
+      }
+      break;
+
+    case Transaction::OP_COLL_RMATTR:
+      {
+	ceph_abort_msg("not implemented");
+      }
+      break;
+
+    case Transaction::OP_COLL_RENAME:
+      {
+	ceph_abort_msg("not implemented");
+      }
+      break;
+
+    case Transaction::OP_OMAP_CLEAR:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+	r = _omap_clear(cid, oid);
+      }
+      break;
+    case Transaction::OP_OMAP_SETKEYS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ceph::buffer::list aset_bl;
+        i.decode_attrset_bl(&aset_bl);
+	r = _omap_setkeys(cid, oid, aset_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYS:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ceph::buffer::list keys_bl;
+        i.decode_keyset_bl(&keys_bl);
+	r = _omap_rmkeys(cid, oid, keys_bl);
+      }
+      break;
+    case Transaction::OP_OMAP_RMKEYRANGE:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        std::string first, last;
+        first = i.decode_string();
+        last = i.decode_string();
+	r = _omap_rmkeyrange(cid, oid, first, last);
+      }
+      break;
+    case Transaction::OP_OMAP_SETHEADER:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        ghobject_t oid = i.get_oid(op->oid);
+        ceph::buffer::list bl;
+        i.decode_bl(bl);
+	r = _omap_setheader(cid, oid, bl);
+      }
+      break;
+    case Transaction::OP_SPLIT_COLLECTION:
+      ceph_abort_msg("deprecated");
+      break;
+    case Transaction::OP_SPLIT_COLLECTION2:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        uint32_t rem = op->split_rem;
+        coll_t dest = i.get_cid(op->dest_cid);
+	r = _split_collection(cid, bits, rem, dest);
+      }
+      break;
+    case Transaction::OP_MERGE_COLLECTION:
+      {
+        coll_t cid = i.get_cid(op->cid);
+        uint32_t bits = op->split_bits;
+        coll_t dest = i.get_cid(op->dest_cid);
+	r = _merge_collection(cid, bits, dest);
+      }
+      break;
+
+    case Transaction::OP_SETALLOCHINT:
+      {
+        // accepted but nothing is done with the hint
+        r = 0;
+      }
+      break;
+
+    case Transaction::OP_COLL_SET_BITS:
+      {
+        // accepted but nothing is done
+        r = 0;
+      }
+      break;
+
+    default:
+      derr << "bad op " << op->op << dendl;
+      ceph_abort();
+    }
+
+    // Decide whether a failed op can be tolerated or must abort the process.
+    if (r < 0) {
+      bool ok = false;
+
+      if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE ||
+			    op->op == Transaction::OP_CLONE ||
+			    op->op == Transaction::OP_CLONERANGE2 ||
+			    op->op == Transaction::OP_COLL_ADD))
+	// -ENOENT is usually okay
+	ok = true;
+      if (r == -ENODATA)
+	ok = true;
+
+      if (!ok) {
+	const char *msg = "unexpected error code";
+
+	if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE ||
+			     op->op == Transaction::OP_CLONE ||
+			     op->op == Transaction::OP_CLONERANGE2))
+	  msg = "ENOENT on clone suggests osd bug";
+
+	if (r == -ENOSPC)
+	  // For now, if we hit _any_ ENOSPC, crash, before we do any damage
+	  // by partially applying transactions.
+	  msg = "ENOSPC from MemStore, misconfigured cluster or insufficient memory";
+
+	if (r == -ENOTEMPTY) {
+	  msg = "ENOTEMPTY suggests garbage data in osd data dir";
+	  dump_all();
+	}
+
+	derr    << " error " << cpp_strerror(r) << " not handled on operation " << op->op
+		<< " (op " << pos << ", counting from 0)" << dendl;
+	dout(0) << msg << dendl;
+	// log the full transaction as JSON before aborting
+	dout(0) << " transaction dump:\n";
+	ceph::JSONFormatter f(true);
+	f.open_object_section("transaction");
+	t.dump(&f);
+	f.close_section();
+	f.flush(*_dout);
+	*_dout << dendl;
+	ceph_abort_msg("unexpected error");
+      }
+    }
+
+    ++pos;
+  }
+}
+
+// Make sure object `oid` exists in collection `cid`, creating it if needed.
+// Returns 0, or -ENOENT when the collection does not exist.
+int MemStore::_touch(const coll_t& cid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (coll) {
+    coll->get_or_create_object(oid);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// Write `bl` (whose length must equal `len`) into object `oid` at `offset`,
+// creating the object if necessary, and adjust used_bytes by the change in
+// object size.  The data write is skipped when `len` is 0 or when the
+// memstore_debug_omit_block_device_write option is set.
+// Returns 0, or -ENOENT when the collection does not exist.
+int MemStore::_write(const coll_t& cid, const ghobject_t& oid,
+                     uint64_t offset, size_t len, const ceph::buffer::list& bl,
+                     uint32_t fadvise_flags)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << " "
+           << offset << "~" << len << dendl;
+  ceph_assert(len == bl.length());
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_or_create_object(oid);
+  const bool skip_data =
+    len == 0 || cct->_conf->memstore_debug_omit_block_device_write;
+  if (!skip_data) {
+    const ssize_t size_before = obj->get_size();
+    obj->write(offset, bl);
+    used_bytes += (obj->get_size() - size_before);
+  }
+
+  return 0;
+}
+
+// Zero `len` bytes of object `oid` starting at `offset` by writing a buffer
+// of zeros through _write().  Returns the result of _write().
+int MemStore::_zero(const coll_t& cid, const ghobject_t& oid,
+                    uint64_t offset, size_t len)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
+           << len << dendl;
+  ceph::buffer::list zeros;
+  zeros.append_zero(len);
+  const int r = _write(cid, oid, offset, len, zeros);
+  return r;
+}
+
+// Truncate object `oid` to `size` bytes and adjust used_bytes by the change
+// in object size.  Does nothing (and returns 0) when the
+// memstore_debug_omit_block_device_write option is set.
+// Returns the result of the object's truncate(), or -ENOENT when the
+// collection or object does not exist.
+int MemStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << " " << size << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  if (cct->_conf->memstore_debug_omit_block_device_write)
+    return 0;
+
+  const ssize_t size_before = obj->get_size();
+  const int r = obj->truncate(size);
+  used_bytes += (obj->get_size() - size_before);
+  return r;
+}
+
+// Remove object `oid` from collection `cid` (from both object_hash and
+// object_map) and subtract its size from used_bytes.
+// Returns 0, or -ENOENT when the collection or object does not exist.
+int MemStore::_remove(const coll_t& cid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+  std::lock_guard l{coll->lock};
+
+  auto victim = coll->object_hash.find(oid);
+  if (victim == coll->object_hash.end())
+    return -ENOENT;
+
+  // account for the freed bytes before the entry goes away
+  used_bytes -= victim->second->get_size();
+  coll->object_hash.erase(victim);
+  coll->object_map.erase(oid);
+
+  return 0;
+}
+
+// Set (or overwrite) each xattr in `aset` on object `oid`; xattrs not named
+// in `aset` are left as they are.
+// Returns 0, or -ENOENT when the collection or object does not exist.
+int MemStore::_setattrs(const coll_t& cid, const ghobject_t& oid,
+                        std::map<std::string,ceph::buffer::ptr>& aset)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->xattr_mutex};
+  for (const auto& attr : aset) {
+    obj->xattr[attr.first] = attr.second;
+  }
+  return 0;
+}
+
+// Remove the xattr called `name` from object `oid`.
+// Returns 0 on success, -ENOENT when the collection or object does not exist
+// and -ENODATA when the object has no such xattr.
+int MemStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << " " << name << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->xattr_mutex};
+  auto found = obj->xattr.find(name);
+  if (found != obj->xattr.end()) {
+    obj->xattr.erase(found);
+    return 0;
+  }
+  return -ENODATA;
+}
+
+// Remove every xattr from object `oid`.
+// Returns 0, or -ENOENT when the collection or object does not exist.
+int MemStore::_rmattrs(const coll_t& cid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+  CollectionRef coll = get_collection(cid);
+  if (!coll)
+    return -ENOENT;
+
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj)
+    return -ENOENT;
+
+  std::lock_guard lock{obj->xattr_mutex};
+  obj->xattr.clear();
+  return 0;
+}
+
+// Clone object `oldoid` onto `newoid` within collection `cid`: the data range
+// [0, source size) is cloned into the destination (created if missing), and
+// the omap header, omap and xattrs are replaced by copies of the source's.
+// used_bytes is adjusted by the change in the destination's size.
+// Returns 0, or -ENOENT when the collection or source object does not exist.
+int MemStore::_clone(const coll_t& cid, const ghobject_t& oldoid,
+                     const ghobject_t& newoid)
+{
+  dout(10) << __func__ << " " << cid << " " << oldoid
+           << " -> " << newoid << dendl;
+  CollectionRef c = get_collection(cid);
+  if (!c)
+    return -ENOENT;
+
+  ObjectRef oo = c->get_object(oldoid);
+  if (!oo)
+    return -ENOENT;
+  ObjectRef no = c->get_or_create_object(newoid);
+
+  // Account for the size change actually produced by clone(), the same way
+  // _clone_range() does, rather than assuming the destination ends up exactly
+  // as large as the source (which need not hold when a pre-existing
+  // destination is larger than the source).
+  const ssize_t old_size = no->get_size();
+  no->clone(oo.get(), 0, oo->get_size(), 0);
+  used_bytes += (no->get_size() - old_size);
+
+  // take xattr and omap locks with std::lock()
+  std::scoped_lock l{oo->xattr_mutex,
+                     no->xattr_mutex,
+                     oo->omap_mutex,
+                     no->omap_mutex};
+
+  no->omap_header = oo->omap_header;
+  no->omap = oo->omap;
+  no->xattr = oo->xattr;
+  return 0;
+}
+
+/// Copy @p len bytes starting at @p srcoff of @p oldoid into @p newoid at
+/// @p dstoff.
+///
+/// The destination object is created if needed (this happens before the
+/// range is validated).  A range that runs past the end of the source is
+/// clamped to the source size; a range that starts at or past the end
+/// copies nothing.
+/// @return the number of bytes cloned (0 if @p srcoff is at or past the end
+///         of the source), or -ENOENT if the collection or source is missing.
+int MemStore::_clone_range(const coll_t& cid, const ghobject_t& oldoid,
+                           const ghobject_t& newoid,
+                           uint64_t srcoff, uint64_t len, uint64_t dstoff)
+{
+  dout(10) << __func__ << " " << cid << " "
+           << oldoid << " " << srcoff << "~" << len << " -> "
+           << newoid << " " << dstoff << "~" << len
+           << dendl;
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll) {
+    return -ENOENT;
+  }
+  ObjectRef from = coll->get_object(oldoid);
+  if (!from) {
+    return -ENOENT;
+  }
+  ObjectRef to = coll->get_or_create_object(newoid);
+
+  // nothing to copy when the range starts at or beyond the source's end
+  if (from->get_size() <= srcoff) {
+    return 0;
+  }
+  // clamp the range to what the source actually holds
+  if (from->get_size() <= srcoff + len) {
+    len = from->get_size() - srcoff;
+  }
+
+  // used_bytes follows the destination's real size change
+  const ssize_t size_before = to->get_size();
+  to->clone(from.get(), srcoff, len, dstoff);
+  used_bytes += (to->get_size() - size_before);
+
+  return len;
+}
+
+/// Remove every omap key/value pair and the omap header of an object.
+/// @return 0 on success, -ENOENT if the collection or the object is missing.
+int MemStore::_omap_clear(const coll_t& cid, const ghobject_t &oid)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll) {
+    return -ENOENT;
+  }
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj) {
+    return -ENOENT;
+  }
+
+  std::lock_guard guard{obj->omap_mutex};
+  obj->omap.clear();
+  obj->omap_header.clear();
+  return 0;
+}
+
+/// Insert or overwrite omap entries decoded from @p aset_bl.
+///
+/// The buffer is read as a 32-bit entry count followed by that many
+/// (key, value) pairs; each value is decoded straight into the omap slot.
+/// @return 0 on success, -ENOENT if the collection or the object is missing.
+int MemStore::_omap_setkeys(const coll_t& cid, const ghobject_t &oid,
+                            ceph::buffer::list& aset_bl)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll) {
+    return -ENOENT;
+  }
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj) {
+    return -ENOENT;
+  }
+
+  std::lock_guard guard{obj->omap_mutex};
+  auto cursor = aset_bl.cbegin();
+  __u32 count;
+  decode(count, cursor);
+  for (__u32 i = 0; i < count; ++i) {
+    std::string key;
+    decode(key, cursor);
+    decode(obj->omap[key], cursor);
+  }
+  return 0;
+}
+
+/// Erase the omap keys listed in @p keys_bl from an object.
+///
+/// The buffer is read as a 32-bit key count followed by that many keys.
+/// Keys that are not present are ignored.
+/// @return 0 on success, -ENOENT if the collection or the object is missing.
+int MemStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &oid,
+                           ceph::buffer::list& keys_bl)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll) {
+    return -ENOENT;
+  }
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj) {
+    return -ENOENT;
+  }
+
+  std::lock_guard guard{obj->omap_mutex};
+  auto cursor = keys_bl.cbegin();
+  __u32 count;
+  decode(count, cursor);
+  for (__u32 i = 0; i < count; ++i) {
+    std::string key;
+    decode(key, cursor);
+    obj->omap.erase(key);
+  }
+  return 0;
+}
+
+/// Erase every omap key in the half-open range [@p first, @p last).
+///
+/// An empty or inverted range (@p first >= @p last) removes nothing.
+/// @return 0 on success, -ENOENT if the collection or the object is missing.
+int MemStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+			       const std::string& first, const std::string& last)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << " " << first
+	   << " " << last << dendl;
+  CollectionRef c = get_collection(cid);
+  if (!c)
+    return -ENOENT;
+
+  ObjectRef o = c->get_object(oid);
+  if (!o)
+    return -ENOENT;
+  // std::map::erase(p, e) requires [p, e) to be a valid range.  With
+  // first > last, lower_bound(first) can sit after lower_bound(last), which
+  // would make the erase undefined behaviour, so treat it as "nothing to do".
+  if (!(first < last))
+    return 0;
+  std::lock_guard lock{o->omap_mutex};
+  auto p = o->omap.lower_bound(first);
+  auto e = o->omap.lower_bound(last);
+  o->omap.erase(p, e);
+  return 0;
+}
+
+/// Replace the omap header of an object with a copy of @p bl.
+/// @return 0 on success, -ENOENT if the collection or the object is missing.
+int MemStore::_omap_setheader(const coll_t& cid, const ghobject_t &oid,
+                              const ceph::buffer::list &bl)
+{
+  dout(10) << __func__ << " " << cid << " " << oid << dendl;
+
+  CollectionRef coll = get_collection(cid);
+  if (!coll) {
+    return -ENOENT;
+  }
+  ObjectRef obj = coll->get_object(oid);
+  if (!obj) {
+    return -ENOENT;
+  }
+
+  std::lock_guard guard{obj->omap_mutex};
+  obj->omap_header = bl;
+  return 0;
+}
+
+/// Publish the pending collection @p cid into coll_map and set its @p bits.
+///
+/// The collection object itself is taken from new_coll_map, where an entry
+/// for @p cid must already exist (asserted), and is removed from there.
+/// @return 0 on success, -EEXIST if coll_map already holds @p cid.
+int MemStore::_create_collection(const coll_t& cid, int bits)
+{
+  dout(10) << __func__ << " " << cid << dendl;
+
+  std::lock_guard guard{coll_lock};
+
+  // reserve the slot first; a failed insert means the collection exists
+  auto [slot, inserted] = coll_map.emplace(cid, CollectionRef());
+  if (!inserted) {
+    return -EEXIST;
+  }
+
+  auto pending = new_coll_map.find(cid);
+  ceph_assert(pending != new_coll_map.end());
+
+  slot->second = pending->second;
+  slot->second->bits = bits;
+  new_coll_map.erase(pending);
+  return 0;
+}
+
+/// Remove the (empty) collection @p cid from coll_map.
+///
+/// On success the collection is flagged as no longer existing and its
+/// byte count is subtracted from used_bytes.
+/// @return 0 on success, -ENOENT if @p cid is unknown, -ENOTEMPTY if the
+///         collection still holds objects.
+int MemStore::_destroy_collection(const coll_t& cid)
+{
+  dout(10) << __func__ << " " << cid << dendl;
+
+  std::lock_guard guard{coll_lock};
+  auto entry = coll_map.find(cid);
+  if (entry == coll_map.end()) {
+    return -ENOENT;
+  }
+
+  CollectionRef& coll = entry->second;
+  {
+    // NOTE(review): `exists` is written while only the shared side of the
+    // collection lock is held -- confirm this is intentional.
+    std::shared_lock coll_guard{coll->lock};
+    if (!coll->object_map.empty()) {
+      return -ENOTEMPTY;
+    }
+    coll->exists = false;
+  }
+
+  used_bytes -= coll->used_bytes();
+  coll_map.erase(entry);
+  return 0;
+}
+
+/// Link object @p oid, which lives in collection @p ocid, into collection
+/// @p cid as well.  Both collections then reference the same Object.
+/// @return 0 on success, -ENOENT if either collection is missing or @p ocid
+///         does not contain @p oid, -EEXIST if @p cid already contains it.
+int MemStore::_collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << cid << " " << ocid << " " << oid << dendl;
+
+  CollectionRef dst = get_collection(cid);
+  if (!dst) {
+    return -ENOENT;
+  }
+  CollectionRef src = get_collection(ocid);
+  if (!src) {
+    return -ENOENT;
+  }
+
+  // both collection locks are taken, lower address first
+  std::scoped_lock guard{std::min(&(*dst), &(*src))->lock,
+                         std::max(&(*dst), &(*src))->lock};
+
+  if (dst->object_hash.find(oid) != dst->object_hash.end()) {
+    return -EEXIST;
+  }
+  auto found = src->object_hash.find(oid);
+  if (found == src->object_hash.end()) {
+    return -ENOENT;
+  }
+
+  ObjectRef obj = found->second;
+  dst->object_map[oid] = obj;
+  dst->object_hash[oid] = obj;
+  return 0;
+}
+
+/// Rename object @p oldoid to @p oid.
+///
+/// Although a source and a destination collection id are passed, both must
+/// resolve to the very same collection; this is asserted.
+/// @return 0 on success, -ENOENT if a collection or the source object is
+///         missing, -EEXIST if @p oid already exists.
+int MemStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+                                      coll_t cid, const ghobject_t& oid)
+{
+  dout(10) << __func__ << " " << oldcid << " " << oldoid << " -> "
+           << cid << " " << oid << dendl;
+
+  CollectionRef dst = get_collection(cid);
+  if (!dst) {
+    return -ENOENT;
+  }
+  CollectionRef src = get_collection(oldcid);
+  if (!src) {
+    return -ENOENT;
+  }
+
+  // only renames within a single collection are handled here
+  ceph_assert(&(*dst) == &(*src));
+
+  std::lock_guard guard{dst->lock};
+  if (dst->object_hash.find(oid) != dst->object_hash.end()) {
+    return -EEXIST;
+  }
+  auto found = src->object_hash.find(oldoid);
+  if (found == src->object_hash.end()) {
+    return -ENOENT;
+  }
+
+  // keep our own reference; the insertions below may invalidate `found`
+  ObjectRef obj = found->second;
+  dst->object_map[oid] = obj;
+  dst->object_hash[oid] = obj;
+  src->object_map.erase(oldoid);
+  src->object_hash.erase(oldoid);
+  return 0;
+}
+
+/// Move every object of @p cid whose id satisfies match(@p bits, @p match)
+/// into collection @p dest.
+///
+/// Afterwards the source's bits are set to @p bits; the destination is
+/// asserted to already have the same value.
+/// @return 0 on success, -ENOENT if either collection is missing.
+int MemStore::_split_collection(const coll_t& cid, uint32_t bits, uint32_t match,
+                                coll_t dest)
+{
+  dout(10) << __func__ << " " << cid << " " << bits << " " << match << " "
+           << dest << dendl;
+
+  CollectionRef from = get_collection(cid);
+  if (!from) {
+    return -ENOENT;
+  }
+  CollectionRef to = get_collection(dest);
+  if (!to) {
+    return -ENOENT;
+  }
+
+  // both collection locks are taken, lower address first
+  std::scoped_lock guard{std::min(&(*from), &(*to))->lock,
+                         std::max(&(*from), &(*to))->lock};
+
+  for (auto it = from->object_map.begin(); it != from->object_map.end(); ) {
+    if (!it->first.match(bits, match)) {
+      ++it;
+      continue;
+    }
+    dout(20) << " moving " << it->first << dendl;
+    to->object_map.emplace(it->first, it->second);
+    to->object_hash.emplace(it->first, it->second);
+    from->object_hash.erase(it->first);
+    it = from->object_map.erase(it);
+  }
+
+  from->bits = bits;
+  ceph_assert(to->bits == (int)bits);
+
+  return 0;
+}
+
+/// Move all objects of collection @p cid into @p dest, set the destination's
+/// bits to @p bits, and then drop @p cid from coll_map.
+/// @return 0 on success, -ENOENT if either collection is missing.
+int MemStore::_merge_collection(const coll_t& cid, uint32_t bits, coll_t dest)
+{
+  dout(10) << __func__ << " " << cid << " " << bits << " "
+           << dest << dendl;
+
+  CollectionRef from = get_collection(cid);
+  if (!from) {
+    return -ENOENT;
+  }
+  CollectionRef to = get_collection(dest);
+  if (!to) {
+    return -ENOENT;
+  }
+
+  {
+    // both collection locks are taken, lower address first
+    std::scoped_lock guard{std::min(&(*from), &(*to))->lock,
+                           std::max(&(*from), &(*to))->lock};
+
+    for (auto it = from->object_map.begin(); it != from->object_map.end(); ) {
+      dout(20) << " moving " << it->first << dendl;
+      to->object_map.emplace(it->first, it->second);
+      to->object_hash.emplace(it->first, it->second);
+      from->object_hash.erase(it->first);
+      it = from->object_map.erase(it);
+    }
+
+    to->bits = bits;
+  }
+
+  {
+    // the collection locks are released before coll_lock is taken
+    std::lock_guard guard{coll_lock};
+    auto entry = coll_map.find(cid);
+    ceph_assert(entry != coll_map.end());
+    used_bytes -= entry->second->used_bytes();
+    coll_map.erase(entry);
+  }
+
+  return 0;
+}
+
+namespace {
+/// Object whose data payload is kept in a single ceph::buffer::list.
+/// The object's size is the length of that buffer list.
+struct BufferlistObject : public MemStore::Object {
+  /// guards `data` in read(), write() and truncate()
+  ceph::spinlock mutex;
+  /// the object's data bytes
+  ceph::buffer::list data;
+
+  size_t get_size() const override { return data.length(); }
+
+  int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) override;
+  int write(uint64_t offset, const ceph::buffer::list &bl) override;
+  int clone(Object *src, uint64_t srcoff, uint64_t len,
+            uint64_t dstoff) override;
+  int truncate(uint64_t offset) override;
+
+  /// Encoding layout (version 1): the data buffer, then the base-class
+  /// fields written by encode_base().
+  void encode(ceph::buffer::list& bl) const override {
+    ENCODE_START(1, 1, bl);
+    encode(data, bl);
+    encode_base(bl);
+    ENCODE_FINISH(bl);
+  }
+  /// Reads the fields back in the same order encode() wrote them.
+  void decode(ceph::buffer::list::const_iterator& p) override {
+    DECODE_START(1, p);
+    decode(data, p);
+    decode_base(p);
+    DECODE_FINISH(p);
+  }
+};
+}
+// BufferlistObject
+/// Make @p bl refer to the byte range [@p offset, @p offset + @p len) of
+/// this object's data.
+/// @return the resulting length of @p bl.
+int BufferlistObject::read(uint64_t offset, uint64_t len,
+                           ceph::buffer::list &bl)
+{
+  std::lock_guard guard{mutex};
+  bl.substr_of(data, offset, len);
+  return bl.length();
+}
+
+/// Overwrite the object's data with @p src at @p offset.
+///
+/// A new buffer list is assembled from three pieces -- the old bytes before
+/// @p offset (zero-padded if the object was shorter), @p src itself, and
+/// any old bytes beyond the written range -- and then swapped in.
+/// @return always 0.
+int BufferlistObject::write(uint64_t offset, const ceph::buffer::list &src)
+{
+  const unsigned len = src.length();
+
+  std::lock_guard guard{mutex};
+
+  ceph::buffer::list merged;
+
+  // head: everything in front of the write position
+  if (offset > get_size()) {
+    if (get_size()) {
+      merged.substr_of(data, 0, get_size());
+    }
+    merged.append_zero(offset - get_size());
+  } else {
+    merged.substr_of(data, 0, offset);
+  }
+
+  // body: the new bytes
+  merged.append(src);
+
+  // tail: old bytes that lie behind the written range
+  const uint64_t write_end = offset + len;
+  if (write_end < get_size()) {
+    ceph::buffer::list rest;
+    rest.substr_of(data, write_end, get_size() - write_end);
+    merged.append(rest);
+  }
+
+  data = std::move(merged);
+  return 0;
+}
+
+/// Copy @p len bytes at @p srcoff of @p src into this object at @p dstoff.
+///
+/// When the offsets are equal and @p len equals the source's size, the
+/// source buffer list is simply assigned to this object's data; otherwise
+/// the range is extracted under the source's lock and handed to write().
+/// @return 0 on success (or whatever write() returns), -ENOTSUP if @p src
+///         is not a BufferlistObject.
+int BufferlistObject::clone(Object *src, uint64_t srcoff,
+                            uint64_t len, uint64_t dstoff)
+{
+  auto *from = dynamic_cast<BufferlistObject*>(src);
+  if (!from) {
+    return -ENOTSUP;
+  }
+
+  ceph::buffer::list piece;
+  {
+    std::lock_guard guard{from->mutex};
+    const bool same_offsets = (srcoff == dstoff);
+    if (same_offsets && len == src->get_size()) {
+      data = from->data;
+      return 0;
+    }
+    piece.substr_of(from->data, srcoff, len);
+  }
+  // the source lock is released before write() takes our own lock
+  return write(dstoff, piece);
+}
+
+/// Resize the object's data to exactly @p size bytes: surplus bytes are
+/// cut off, missing bytes are appended as zeroes.
+/// @return always 0.
+int BufferlistObject::truncate(uint64_t size)
+{
+  std::lock_guard guard{mutex};
+
+  const auto current = get_size();
+  if (size < current) {
+    // shrink: keep only the leading `size` bytes
+    ceph::buffer::list kept;
+    kept.substr_of(data, 0, size);
+    data = std::move(kept);
+  } else if (size > current) {
+    // grow: pad with zeroes
+    data.append_zero(size - current);
+  }
+  // size == current: nothing to do
+  return 0;
+}
+
+// PageSetObject
+
+/// Object whose data payload is kept in a PageSet (fixed-size pages) with
+/// the logical length tracked separately in data_len.
+struct MemStore::PageSetObject : public Object {
+  /// the pages holding the object's data
+  PageSet data;
+  /// logical object size in bytes, as reported by get_size()
+  uint64_t data_len;
+#if defined(__GLIBCXX__)
+  // use a thread-local vector for the pages returned by PageSet, so we
+  // can avoid allocations in read/write()
+  static thread_local PageSet::page_vector tls_pages;
+#endif
+
+  size_t get_size() const override { return data_len; }
+
+  int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) override;
+  int write(uint64_t offset, const ceph::buffer::list &bl) override;
+  int clone(Object *src, uint64_t srcoff, uint64_t len,
+            uint64_t dstoff) override;
+  int truncate(uint64_t offset) override;
+
+  /// Encoding layout (version 1): data_len, the page set, then the
+  /// base-class fields written by encode_base().
+  void encode(ceph::buffer::list& bl) const override {
+    ENCODE_START(1, 1, bl);
+    encode(data_len, bl);
+    data.encode(bl);
+    encode_base(bl);
+    ENCODE_FINISH(bl);
+  }
+  /// Reads the fields back in the same order encode() wrote them.
+  void decode(ceph::buffer::list::const_iterator& p) override {
+    DECODE_START(1, p);
+    decode(data_len, p);
+    data.decode(p);
+    decode_base(p);
+    DECODE_FINISH(p);
+  }
+
+private:
+  FRIEND_MAKE_REF(PageSetObject);
+  /// Creates an empty object (data_len == 0) whose PageSet is constructed
+  /// with @p page_size.
+  explicit PageSetObject(size_t page_size) : data(page_size), data_len(0) {}
+};
+
+#if defined(__GLIBCXX__)
+// use a thread-local vector for the pages returned by PageSet, so we
+// can avoid allocations in read/write()
+thread_local PageSet::page_vector MemStore::PageSetObject::tls_pages;
+// With the static thread-local member available, DEFINE_PAGE_VECTOR(name)
+// expands to nothing and `tls_pages` in the methods below names that member.
+#define DEFINE_PAGE_VECTOR(name)
+#else
+// Without it, DEFINE_PAGE_VECTOR(name) declares a function-local page
+// vector called `name` instead.
+#define DEFINE_PAGE_VECTOR(name) PageSet::page_vector name;
+#endif
+
+/// Read @p len bytes starting at @p offset and append them to @p bl.
+///
+/// A buffer of @p len bytes is filled page by page; stretches that are not
+/// covered by a page returned from the page set are filled with zeroes.
+/// @return @p len.
+int MemStore::PageSetObject::read(uint64_t offset, uint64_t len, ceph::buffer::list& bl)
+{
+  const auto range_begin = offset;
+  const auto range_end = offset + len;
+  auto left = len;
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  data.get_range(offset, len, tls_pages);
+
+  // destination buffer for the whole request
+  ceph::buffer::ptr out(len);
+
+  for (auto it = tls_pages.begin(); left; ++it) {
+    // ran out of pages inside the range: the rest reads as zeroes
+    if (it == tls_pages.end() || (*it)->offset >= range_end) {
+      out.zero(offset - range_begin, left);
+      break;
+    }
+    auto page = *it;
+
+    // zero the gap between the current position and this page
+    if (page->offset > offset) {
+      const auto gap = std::min(left, page->offset - offset);
+      out.zero(offset - range_begin, gap);
+      left -= gap;
+      offset = page->offset;
+      if (!left) {
+        break;
+      }
+    }
+
+    // copy the part of this page that falls inside the range
+    const auto in_page = offset - page->offset;
+    const auto chunk = std::min(left, data.get_page_size() - in_page);
+
+    out.copy_in(offset - range_begin, chunk, page->data + in_page);
+
+    left -= chunk;
+    offset += chunk;
+  }
+
+  tls_pages.clear(); // drop page refs
+
+  bl.append(std::move(out));
+  return len;
+}
+
+/// Write the contents of @p src into the object at @p offset.
+///
+/// The affected page range is allocated first, then the source is copied
+/// into it one page at a time.  data_len grows if the write ends beyond
+/// the current length.
+/// @return always 0.
+int MemStore::PageSetObject::write(uint64_t offset, const ceph::buffer::list &src)
+{
+  unsigned left = src.length();
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  // make sure the page range is allocated
+  data.alloc_range(offset, src.length(), tls_pages);
+
+  auto page_it = tls_pages.begin();
+  auto in = src.begin();
+
+  while (left > 0) {
+    const unsigned in_page = offset - (*page_it)->offset;
+    const unsigned room = data.get_page_size() - in_page;
+    const unsigned chunk = std::min(left, room);
+
+    in.copy(chunk, (*page_it)->data + in_page);
+    offset += chunk;
+    left -= chunk;
+
+    // this page is filled to its end, continue with the next one
+    if (chunk == room) {
+      ++page_it;
+    }
+  }
+
+  // `offset` now points just past the last byte written
+  data_len = std::max(data_len, offset);
+
+  tls_pages.clear(); // drop page refs
+  return 0;
+}
+
+/// Copy @p len bytes starting at @p srcoff of @p src into this object
+/// starting at @p dstoff.
+///
+/// The range is processed in batches of at most 16 source pages.  For each
+/// batch the destination pages are allocated, source pages are copied over,
+/// and parts of the range for which the source returned no page are
+/// zero-filled in the destination.  data_len is only ever increased.
+///
+/// @param src  source object; it is static_cast to PageSetObject without a
+///             runtime check, so it must really be one
+/// @return always 0.
+int MemStore::PageSetObject::clone(Object *src, uint64_t srcoff,
+                                   uint64_t len, uint64_t dstoff)
+{
+  // displacement that maps a source offset to its destination offset
+  const int64_t delta = dstoff - srcoff;
+
+  auto &src_data = static_cast<PageSetObject*>(src)->data;
+  const uint64_t src_page_size = src_data.get_page_size();
+
+  auto &dst_data = data;
+  const auto dst_page_size = dst_data.get_page_size();
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  PageSet::page_vector dst_pages;
+
+  while (len) {
+    // limit to 16 pages at a time so tls_pages doesn't balloon in size
+    auto count = std::min(len, (uint64_t)src_page_size * 16);
+    src_data.get_range(srcoff, count, tls_pages);
+
+    // allocate the destination range
+    // TODO: avoid allocating pages for holes in the source range
+    dst_data.alloc_range(srcoff + delta, count, dst_pages);
+    auto dst_iter = dst_pages.begin();
+
+    for (auto &src_page : tls_pages) {
+      // [sbegin, send) is the part of this source page inside the batch
+      auto sbegin = std::max(srcoff, src_page->offset);
+      auto send = std::min(srcoff + count, src_page->offset + src_page_size);
+
+      // zero-fill holes before src_page
+      if (srcoff < sbegin) {
+        while (dst_iter != dst_pages.end()) {
+          auto &dst_page = *dst_iter;
+          auto dbegin = std::max(srcoff + delta, dst_page->offset);
+          auto dend = std::min(sbegin + delta, dst_page->offset + dst_page_size);
+          std::fill(dst_page->data + dbegin - dst_page->offset,
+                    dst_page->data + dend - dst_page->offset, 0);
+          // stop on the destination page that is not filled to its end;
+          // the copy below continues in that same page
+          if (dend < dst_page->offset + dst_page_size)
+            break;
+          ++dst_iter;
+        }
+        const auto c = sbegin - srcoff;
+        count -= c;
+        len -= c;
+      }
+
+      // copy data from src page to dst pages
+      while (dst_iter != dst_pages.end()) {
+        auto &dst_page = *dst_iter;
+        auto dbegin = std::max(sbegin + delta, dst_page->offset);
+        auto dend = std::min(send + delta, dst_page->offset + dst_page_size);
+
+        std::copy(src_page->data + (dbegin - delta) - src_page->offset,
+                  src_page->data + (dend - delta) - src_page->offset,
+                  dst_page->data + dbegin - dst_page->offset);
+        // destination page still has room: the next source page goes there
+        if (dend < dst_page->offset + dst_page_size)
+          break;
+        ++dst_iter;
+      }
+
+      // advance past the part of the batch handled so far
+      const auto c = send - sbegin;
+      count -= c;
+      len -= c;
+      srcoff = send;
+      dstoff = send + delta;
+    }
+    tls_pages.clear(); // drop page refs
+
+    // zero-fill holes after the last src_page
+    if (count > 0) {
+      while (dst_iter != dst_pages.end()) {
+        auto &dst_page = *dst_iter;
+        auto dbegin = std::max(dstoff, dst_page->offset);
+        auto dend = std::min(dstoff + count, dst_page->offset + dst_page_size);
+        std::fill(dst_page->data + dbegin - dst_page->offset,
+                  dst_page->data + dend - dst_page->offset, 0);
+        ++dst_iter;
+      }
+      srcoff += count;
+      dstoff += count;
+      len -= count;
+    }
+    dst_pages.clear(); // drop page refs
+  }
+
+  // update object size
+  // (dstoff now points just past the last destination byte handled; the
+  // length is never reduced here)
+  if (data_len < dstoff)
+    data_len = dstoff;
+  return 0;
+}
+
+/// Set the object's length to @p size.
+///
+/// Pages after @p size are released.  If @p size falls inside a page and
+/// that page exists, the bytes from @p size to the end of the page are
+/// zeroed.
+/// @return always 0.
+int MemStore::PageSetObject::truncate(uint64_t size)
+{
+  data.free_pages_after(size);
+  data_len = size;
+
+  const auto page_size = data.get_page_size();
+  // start offset of the page containing `size`
+  const auto last_page_start = size & ~(page_size-1);
+  if (last_page_start == size) {
+    // the new end sits exactly on a page boundary
+    return 0;
+  }
+
+  DEFINE_PAGE_VECTOR(tls_pages);
+  // write zeroes to the rest of the last page
+  data.get_range(last_page_start, page_size, tls_pages);
+  if (tls_pages.empty()) {
+    return 0;
+  }
+
+  auto bytes = (*tls_pages.begin())->data;
+  std::fill(bytes + (size - last_page_start), bytes + page_size, 0);
+  tls_pages.clear(); // drop page ref
+  return 0;
+}
+
+
+/// Create an empty object of the kind selected by use_page_set: a
+/// PageSetObject using the configured memstore_page_size, or otherwise a
+/// BufferlistObject.
+MemStore::ObjectRef MemStore::Collection::create_object() const {
+  if (!use_page_set) {
+    return make_ref<BufferlistObject>();
+  }
+  return ceph::make_ref<PageSetObject>(cct->_conf->memstore_page_size);
+}
diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h
new file mode 100644
index 000000000..858379ed9
--- /dev/null
+++ b/src/os/memstore/MemStore.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_MEMSTORE_H
+#define CEPH_MEMSTORE_H
+
+#include <atomic>
+#include <mutex>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/unordered_map.h"
+#include "common/Finisher.h"
+#include "common/RefCountedObj.h"
+#include "os/ObjectStore.h"
+#include "PageSet.h"
+#include "include/ceph_assert.h"
+
+class MemStore : public ObjectStore {
+public:
+  /// Base class of a stored object.  It holds the xattrs, the omap header
+  /// and the omap entries (each group with its own mutex) and declares the
+  /// interface that concrete data representations implement.
+  struct Object : public RefCountedObject {
+    ceph::mutex xattr_mutex{ceph::make_mutex("MemStore::Object::xattr_mutex")};
+    ceph::mutex omap_mutex{ceph::make_mutex("MemStore::Object::omap_mutex")};
+    std::map<std::string,ceph::buffer::ptr,std::less<>> xattr;
+    ceph::buffer::list omap_header;
+    std::map<std::string,ceph::buffer::list> omap;
+
+    using Ref = ceph::ref_t<Object>;
+
+    // --- data payload interface, implemented by subclasses ---
+    virtual size_t get_size() const = 0;
+    virtual int read(uint64_t offset, uint64_t len, ceph::buffer::list &bl) = 0;
+    virtual int write(uint64_t offset, const ceph::buffer::list &bl) = 0;
+    virtual int clone(Object *src, uint64_t srcoff, uint64_t len,
+                      uint64_t dstoff) = 0;
+    virtual int truncate(uint64_t offset) = 0;
+    virtual void encode(ceph::buffer::list& bl) const = 0;
+    virtual void decode(ceph::buffer::list::const_iterator& p) = 0;
+
+    /// Encode the fields owned by this base class: xattr, omap_header, omap.
+    void encode_base(ceph::buffer::list& bl) const {
+      using ceph::encode;
+      encode(xattr, bl);
+      encode(omap_header, bl);
+      encode(omap, bl);
+    }
+    /// Decode the base-class fields in the order encode_base() wrote them.
+    void decode_base(ceph::buffer::list::const_iterator& p) {
+      using ceph::decode;
+      decode(xattr, p);
+      decode(omap_header, p);
+      decode(omap, p);
+    }
+
+    /// Dump sizes only: data length, omap header length, and the name and
+    /// value length of every xattr and omap entry.
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("data_len", get_size());
+      f->dump_int("omap_header_len", omap_header.length());
+
+      f->open_array_section("xattrs");
+      for (const auto& [name, value] : xattr) {
+        f->open_object_section("xattr");
+        f->dump_string("name", name);
+        f->dump_int("length", value.length());
+        f->close_section();
+      }
+      f->close_section();
+
+      f->open_array_section("omap");
+      for (const auto& [key, value] : omap) {
+        f->open_object_section("pair");
+        f->dump_string("key", key);
+        f->dump_int("length", value.length());
+        f->close_section();
+      }
+      f->close_section();
+    }
+  protected:
+    Object() = default;
+  };
+  using ObjectRef = Object::Ref;
+
+  struct PageSetObject;
+  /// An in-memory collection.  Objects are indexed twice: by hash for point
+  /// lookups and by ordered map for iteration; both indexes are protected
+  /// by `lock`.
+  struct Collection : public CollectionImpl {
+    int bits = 0;
+    CephContext *cct;
+    bool use_page_set;
+    ceph::unordered_map<ghobject_t, ObjectRef> object_hash;  ///< for lookup
+    std::map<ghobject_t, ObjectRef> object_map;        ///< for iteration
+    std::map<std::string,ceph::buffer::ptr> xattr;
+    /// for object_{map,hash}
+    ceph::shared_mutex lock{
+      ceph::make_shared_mutex("MemStore::Collection::lock", true, false)};
+
+    bool exists = true;
+    ceph::mutex sequencer_mutex{
+      ceph::make_mutex("MemStore::Collection::sequencer_mutex")};
+
+    typedef boost::intrusive_ptr<Collection> Ref;
+
+    ObjectRef create_object() const;
+
+    // NOTE: The lock only needs to protect the object_map/hash, not the
+    // contents of individual objects.  The osd is already sequencing
+    // reads and writes, so we will never see them concurrently at this
+    // level.
+
+    /// Look up @p oid under a shared lock.
+    /// @return the object, or a null ObjectRef if it is not in the collection.
+    // The id is taken by const reference so that a lookup does not copy it.
+    ObjectRef get_object(const ghobject_t& oid) {
+      std::shared_lock l{lock};
+      auto o = object_hash.find(oid);
+      if (o == object_hash.end())
+	return ObjectRef();
+      return o->second;
+    }
+
+    /// Return the object stored under @p oid, creating an empty one (and
+    /// registering it in both indexes) if it does not exist yet.  Takes the
+    /// lock exclusively.
+    ObjectRef get_or_create_object(const ghobject_t& oid) {
+      std::lock_guard l{lock};
+      auto result = object_hash.emplace(oid, ObjectRef());
+      if (result.second)
+        object_map[oid] = result.first->second = create_object();
+      return result.first->second;
+    }
+
+    /// Encoding layout (version 1): xattr, use_page_set, a 32-bit object
+    /// count, then each (id, object) pair in object_map order.
+    void encode(ceph::buffer::list& bl) const {
+      ENCODE_START(1, 1, bl);
+      encode(xattr, bl);
+      encode(use_page_set, bl);
+      uint32_t s = object_map.size();
+      encode(s, bl);
+      for (const auto& [oid, obj] : object_map) {
+	encode(oid, bl);
+	obj->encode(bl);
+      }
+      ENCODE_FINISH(bl);
+    }
+    /// Inverse of encode().  use_page_set is decoded before any object so
+    /// that create_object() builds objects of the encoded kind.
+    void decode(ceph::buffer::list::const_iterator& p) {
+      DECODE_START(1, p);
+      decode(xattr, p);
+      decode(use_page_set, p);
+      uint32_t s;
+      decode(s, p);
+      while (s--) {
+	ghobject_t k;
+	decode(k, p);
+	auto o = create_object();
+	o->decode(p);
+	object_map.insert(std::make_pair(k, o));
+	object_hash.insert(std::make_pair(k, o));
+      }
+      DECODE_FINISH(p);
+    }
+
+    /// Sum of get_size() over all objects in object_map.  No lock is taken
+    /// here.
+    uint64_t used_bytes() const {
+      uint64_t result = 0;
+      for (const auto& [oid, obj] : object_map) {
+        result += obj->get_size();
+      }
+
+      return result;
+    }
+
+    /// Nothing to flush for this store.
+    void flush() override {
+    }
+    /// Always reports true; @p c is not used.
+    bool flush_commit(Context *c) override {
+      return true;
+    }
+
+  private:
+    FRIEND_MAKE_REF(Collection);
+    /// use_page_set is initialised from the memstore_page_set config option.
+    explicit Collection(CephContext *cct, coll_t c)
+      : CollectionImpl(cct, c),
+	cct(cct),
+	use_page_set(cct->_conf->memstore_page_set) {}
+  };
+  typedef Collection::Ref CollectionRef;
+
+private:
+  class OmapIteratorImpl;
+
+
+  /// Collections known to the store, keyed by collection id.  Entries are
+  /// added by _create_collection() and removed by _destroy_collection() and
+  /// _merge_collection().
+  ceph::unordered_map<coll_t, CollectionRef> coll_map;
+  /// rwlock to protect coll_map
+  ceph::shared_mutex coll_lock{
+    ceph::make_shared_mutex("MemStore::coll_lock")};
+  /// Collections that are not yet in coll_map; _create_collection() moves
+  /// the entry for its cid from here into coll_map (under coll_lock).
+  /// NOTE(review): presumably filled by create_new_collection() -- confirm.
+  std::map<coll_t,CollectionRef> new_coll_map;
+
+  /// Look up a collection by id.
+  /// Callers treat a null result as -ENOENT.
+  CollectionRef get_collection(const coll_t& cid);
+
+  Finisher finisher;
+
+  /// Running byte total, adjusted by the mutation helpers (_remove, _clone,
+  /// _clone_range, _destroy_collection, _merge_collection, ...).
+  std::atomic<uint64_t> used_bytes;
+
+  void _do_transaction(Transaction& t);
+
+  int _touch(const coll_t& cid, const ghobject_t& oid);
+  int _write(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len,
+	      const ceph::buffer::list& bl, uint32_t fadvise_flags = 0);
+  int _zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len);
+  int _truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size);
+  int _remove(const coll_t& cid, const ghobject_t& oid);
+  int _setattrs(const coll_t& cid, const ghobject_t& oid, std::map<std::string,ceph::buffer::ptr>& aset);
+  int _rmattr(const coll_t& cid, const ghobject_t& oid, const char *name);
+  int _rmattrs(const coll_t& cid, const ghobject_t& oid);
+  int _clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid);
+  int _clone_range(const coll_t& cid, const ghobject_t& oldoid,
+		   const ghobject_t& newoid,
+		   uint64_t srcoff, uint64_t len, uint64_t dstoff);
+  int _omap_clear(const coll_t& cid, const ghobject_t &oid);
+  int _omap_setkeys(const coll_t& cid, const ghobject_t &oid, ceph::buffer::list& aset_bl);
+  int _omap_rmkeys(const coll_t& cid, const ghobject_t &oid, ceph::buffer::list& keys_bl);
+  int _omap_rmkeyrange(const coll_t& cid, const ghobject_t &oid,
+		       const std::string& first, const std::string& last);
+  int _omap_setheader(const coll_t& cid, const ghobject_t &oid, const ceph::buffer::list &bl);
+
+  /// No-op: all arguments are ignored and 0 is returned.
+  int _collection_hint_expected_num_objs(const coll_t& cid, uint32_t pg_num,
+      uint64_t num_objs) const { return 0; }
+  int _create_collection(const coll_t& c, int bits);
+  int _destroy_collection(const coll_t& c);
+  int _collection_add(const coll_t& cid, const coll_t& ocid, const ghobject_t& oid);
+  int _collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid,
+			      coll_t cid, const ghobject_t& o);
+  int _split_collection(const coll_t& cid, uint32_t bits, uint32_t rem, coll_t dest);
+  int _merge_collection(const coll_t& cid, uint32_t bits, coll_t dest);
+
+  int _save();
+  int _load();
+
+  void dump(ceph::Formatter *f);
+  void dump_all();
+
+public:
+  /// Construct a store; used_bytes starts at 0.
+  MemStore(CephContext *cct, const std::string& path)
+    : ObjectStore(cct, path),
+      finisher(cct),
+      used_bytes(0) {}
+  ~MemStore() override { }
+
+  /// Type name of this backend.
+  std::string get_type() override {
+    return "memstore";
+  }
+
+  /// Always false.
+  bool test_mount_in_use() override {
+    return false;
+  }
+
+  int mount() override;
+  int umount() override;
+
+  /// No checks are performed; always returns 0.
+  int fsck(bool deep) override {
+    return 0;
+  }
+
+  /// Every key is accepted; always returns 0.
+  int validate_hobject_key(const hobject_t &obj) const override {
+    return 0;
+  }
+  unsigned get_max_attr_name_length() override {
+    return 256;  // arbitrary; there is no real limit internally
+  }
+
+  int mkfs() override;
+  /// No-op; always returns 0.
+  int mkjournal() override {
+    return 0;
+  }
+  // All three journal queries answer false.
+  bool wants_journal() override {
+    return false;
+  }
+  bool allows_journal() override {
+    return false;
+  }
+  bool needs_journal() override {
+    return false;
+  }
+
+  /// Leaves @p ls untouched and returns 0.
+  int get_devices(std::set<std::string> *ls) override {
+    // no devices for us!
+    return 0;
+  }
+
+  int statfs(struct store_statfs_t *buf,
+             osd_alert_list_t* alerts = nullptr) override;
+  int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
+		  bool *per_pool_omap) override;
+
+  bool exists(CollectionHandle &c, const ghobject_t& oid) override;
+  int stat(CollectionHandle &c, const ghobject_t& oid,
+	   struct stat *st, bool allow_eio = false) override;
+  int set_collection_opts(
+    CollectionHandle& c,
+    const pool_opts_t& opts) override;
+  int read(
+    CollectionHandle &c,
+    const ghobject_t& oid,
+    uint64_t offset,
+    size_t len,
+    ceph::buffer::list& bl,
+    uint32_t op_flags = 0) override;
+  using ObjectStore::fiemap;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid,
+	     uint64_t offset, size_t len, ceph::buffer::list& bl) override;
+  int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset,
+	     size_t len, std::map<uint64_t, uint64_t>& destmap) override;
+  int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name,
+	      ceph::buffer::ptr& value) override;
+  int getattrs(CollectionHandle &c, const ghobject_t& oid,
+	       std::map<std::string,ceph::buffer::ptr,std::less<>>& aset) override;
+
+  int list_collections(std::vector<coll_t>& ls) override;
+
+  /// Return a handle for collection @p c; simply forwards to
+  /// get_collection().
+  CollectionHandle open_collection(const coll_t& c) override {
+    return get_collection(c);
+  }
+  CollectionHandle create_new_collection(const coll_t& c) override;
+
+  /// No-op: both arguments are ignored.
+  void set_collection_commit_queue(const coll_t& cid,
+				   ContextQueue *commit_queue) override {
+  }
+
+  bool collection_exists(const coll_t& c) override;
+  int collection_empty(CollectionHandle& c, bool *empty) override;
+  int collection_bits(CollectionHandle& c) override;
+  int collection_list(CollectionHandle& cid,
+		      const ghobject_t& start, const ghobject_t& end, int max,
+		      std::vector<ghobject_t> *ls, ghobject_t *next) override;
+
+  using ObjectStore::omap_get;
+  int omap_get(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value map
+    ) override;
+
+  using ObjectStore::omap_get_header;
+  /// Get omap header
+  int omap_get_header(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    ceph::buffer::list *header,      ///< [out] omap header
+    bool allow_eio = false ///< [in] don't assert on eio
+    ) override;
+
+  using ObjectStore::omap_get_keys;
+  /// Get keys defined on oid
+  int omap_get_keys(
+    CollectionHandle& c,              ///< [in] Collection containing oid
+    const ghobject_t &oid, ///< [in] Object containing omap
+    std::set<std::string> *keys      ///< [out] Keys defined on oid
+    ) override;
+
+  using ObjectStore::omap_get_values;
+  /// Get key values
+  int omap_get_values(
+    CollectionHandle& c,                    ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::set<std::string> &keys,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) override;
+#ifdef WITH_SEASTAR
+  int omap_get_values(
+    CollectionHandle &c,         ///< [in] Collection containing oid
+    const ghobject_t &oid,       ///< [in] Object containing omap
+    const std::optional<std::string> &start_after,     ///< [in] Keys to get
+    std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
+    ) override;
+#endif
+
+  using ObjectStore::omap_check_keys;
+  /// Filters keys into out which are defined on oid
+  int omap_check_keys(
+    CollectionHandle& c,                ///< [in] Collection containing oid
+    const ghobject_t &oid,   ///< [in] Object containing omap
+    const std::set<std::string> &keys, ///< [in] Keys to check
+    std::set<std::string> *out         ///< [out] Subset of keys defined on oid
+    ) override;
+
+  using ObjectStore::get_omap_iterator;
+  ObjectMap::ObjectMapIterator get_omap_iterator(
+    CollectionHandle& c,              ///< [in] collection
+    const ghobject_t &oid  ///< [in] object
+    ) override;
+
+  void set_fsid(uuid_d u) override;
+  uuid_d get_fsid() override;
+
+  /// No per-object overhead is reported: returns 0 regardless of
+  /// @p num_objects.
+  uint64_t estimate_objects_overhead(uint64_t num_objects) override {
+    return 0; //do not care
+  }
+
+  objectstore_perf_stat_t get_cur_stats() override;
+
+  /// This store exposes no perf counters; always returns nullptr.
+  const PerfCounters* get_perf_counters() const override {
+    return nullptr;
+  }
+
+
+  int queue_transactions(
+    CollectionHandle& ch,
+    std::vector<Transaction>& tls,
+    TrackedOpRef op = TrackedOpRef(),
+    ThreadPool::TPHandle *handle = NULL) override;
+};
+
+
+
+
+#endif
diff --git a/src/os/memstore/PageSet.h b/src/os/memstore/PageSet.h
new file mode 100644
index 000000000..71954e574
--- /dev/null
+++ b/src/os/memstore/PageSet.h
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013- Sage Weil <sage@inktank.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PAGESET_H
+#define CEPH_PAGESET_H
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <mutex>
+#include <vector>
+#include <boost/intrusive/avl_set.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/encoding.h"
+
+// A refcounted page of data at `offset` that can be linked into an avl_set.
+// The Page object itself sits at the tail of the buffer that `data` points to.
+struct Page {
+  char *const data;  // start of the buffer: the page bytes, then this Page
+  boost::intrusive::avl_set_member_hook<> hook;
+  uint64_t offset;  // key compared by Less
+
+  // avoid RefCountedObject because it has a virtual destructor
+  std::atomic<uint16_t> nrefs;
+  void get() { nrefs.fetch_add(1); }
+  void put() { if (nrefs.fetch_sub(1) == 1) delete this; }
+
+  using Ref = boost::intrusive_ptr<Page>;
+  friend void intrusive_ptr_add_ref(Page *p) { p->get(); }
+  friend void intrusive_ptr_release(Page *p) { p->put(); }
+
+  // orders pages by offset, and a bare offset against a page, for avl lookups
+  struct Less {
+    bool operator()(uint64_t key, const Page &p) const { return key < p.offset; }
+    bool operator()(const Page &p, uint64_t key) const { return p.offset < key; }
+    bool operator()(const Page &a, const Page &b) const {
+      return a.offset < b.offset;
+    }
+  };
+  // appends page_size bytes of data, then the offset
+  void encode(ceph::buffer::list &bl, size_t page_size) const {
+    bl.append(ceph::buffer::copy(data, page_size));
+    using ceph::encode;
+    encode(offset, bl);
+  }
+  // reads back what encode() wrote: page_size bytes into data, then the offset
+  void decode(ceph::buffer::list::const_iterator &p, size_t page_size) {
+    p.copy(page_size, data);
+    using ceph::decode;
+    decode(offset, p);
+  }
+
+  // Builds a Page whose data area holds at least page_size bytes. nrefs starts
+  // at 1 and the returned Ref adds one more, so the page outlives that Ref.
+  static Ref create(size_t page_size, uint64_t offset = 0) {
+    // round the data size up so the Page placed after it is properly aligned
+    const size_t mask = alignof(Page) - 1;
+    page_size = (page_size + mask) & ~mask;
+    // one allocation holds the data followed by the Page structure itself
+    char *const storage = new char[page_size + sizeof(Page)];
+    return Ref(new (storage + page_size) Page(storage, offset));
+  }
+
+  // copy disabled
+  Page(const Page&) = delete;
+  const Page& operator=(const Page&) = delete;
+
+ private: // private constructor, use create() instead
+  Page(char *data, uint64_t offset) : data(data), offset(offset), nrefs(1) {}
+
+  // reached through `delete this` in put(); frees the whole buffer via data
+  static void operator delete(void *p) { delete[] static_cast<Page*>(p)->data; }
+};
+
+// Set of Pages ordered by offset; the masks below assume a power-of-2 page_size.
+class PageSet {
+ public:
+  // alloc_range() and get_range() return page refs in a vector
+  typedef std::vector<Page::Ref> page_vector;
+
+ private:
+  // store pages in a boost intrusive avl_set
+  typedef Page::Less page_cmp;
+  typedef boost::intrusive::member_hook<Page,
+          boost::intrusive::avl_set_member_hook<>, &Page::hook> member_option;
+  typedef boost::intrusive::avl_set<Page,
+          boost::intrusive::compare<page_cmp>, member_option> page_set;
+  typedef typename page_set::iterator iterator;
+
+  page_set pages;
+  uint64_t page_size;
+  typedef std::mutex lock_type;
+  lock_type mutex;  // taken by alloc_range/get_range/free_pages_after
+
+  // unlink every page in [cur,end) from the set and drop one reference to it
+  void free_pages(iterator cur, iterator end) {
+    while (cur != end) {
+      Page *page = &*cur;
+      cur = pages.erase(cur);
+      page->put();
+    }
+  }
+
+  // count the pages overlapping [offset,offset+len)
+  int count_pages(uint64_t offset, uint64_t len) const {
+    // an empty range touches no page (alloc_range() loops zero times for it)
+    if (len == 0)
+      return 0;
+    int count = 0;
+    if (offset % page_size) {
+      count++;  // partial leading page
+      size_t rem = page_size - offset % page_size;
+      len = len <= rem ? 0 : len - rem;
+    }
+    count += len / page_size;
+    if (len % page_size)
+      count++;  // partial trailing page
+    return count;
+  }
+
+ public:
+  explicit PageSet(size_t page_size) : page_size(page_size) {}
+  PageSet(PageSet &&rhs)
+    : pages(std::move(rhs.pages)), page_size(rhs.page_size) {}
+  ~PageSet() { free_pages(pages.begin(), pages.end()); }
+
+  // disable copy
+  PageSet(const PageSet&) = delete;
+  const PageSet& operator=(const PageSet&) = delete;
+
+  bool empty() const { return pages.empty(); }
+  size_t size() const { return pages.size(); }
+  size_t get_page_size() const { return page_size; }
+
+  // allocate all pages that intersect the range [offset,offset+length)
+  void alloc_range(uint64_t offset, uint64_t length, page_vector &range) {
+    // loop in reverse so we can provide hints to avl_set::insert_check()
+    //	and get O(1) insertions after the first
+    uint64_t position = offset + length - 1;
+    range.resize(count_pages(offset, length));
+    auto out = range.rbegin();  // filled back to front: range ends up ascending
+
+    std::lock_guard<lock_type> lock(mutex);
+    iterator cur = pages.end();
+    while (length) {
+      const uint64_t page_offset = position & ~(page_size-1);
+      typename page_set::insert_commit_data commit;
+      auto insert = pages.insert_check(cur, page_offset, page_cmp(), commit);
+      if (insert.second) {
+        auto page = Page::create(page_size, page_offset);
+        cur = pages.insert_commit(*page, commit);
+        // the caller is assumed to write [offset,offset+length), so only the
+        //  rest is zeroed. length shrinks on the way back, so offset + length
+        //  is the range end only on the highest page. Byte indexes are summed
+        //  before being added to data so no out-of-buffer pointer is formed.
+        if (offset + length < page->offset + page_size)  // zero the tail
+          std::fill(page->data + (offset + length - page->offset),
+                    page->data + page_size, 0);
+        // zero front of page between page_offset and offset
+        if (offset > page->offset)
+          std::fill(page->data, page->data + (offset - page->offset), 0);
+      } else { // exists
+        cur = insert.first;
+      }
+      // add a reference to output vector
+      out->reset(&*cur);
+      ++out;
+      auto c = std::min(length, (position & (page_size-1)) + 1);
+      position -= c;
+      length -= c;
+    }
+    // make sure we sized the vector correctly
+    ceph_assert(out == range.rend());
+  }
+
+  // append all allocated pages that intersect [offset,offset+length) to range
+  void get_range(uint64_t offset, uint64_t length, page_vector &range) {
+    std::lock_guard<lock_type> lock(mutex);  // writers relink pages under it
+    auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+    while (cur != pages.end() && cur->offset < offset + length)
+      range.push_back(&*cur++);
+  }
+
+  // free every page that starts at or after offset; one starting before is kept
+  void free_pages_after(uint64_t offset) {
+    std::lock_guard<lock_type> lock(mutex);
+    auto cur = pages.lower_bound(offset & ~(page_size-1), page_cmp());
+    if (cur == pages.end())
+      return;
+    if (cur->offset < offset)
+      cur++;
+    free_pages(cur, pages.end());
+  }
+
+  // write page_size, the page count, then the pages in descending offset order
+  void encode(ceph::buffer::list &bl) const {
+    using ceph::encode;
+    encode(page_size, bl);
+    unsigned count = pages.size();
+    encode(count, bl);
+    for (auto p = pages.rbegin(); p != pages.rend(); ++p)
+      p->encode(bl, page_size);
+  }
+  // inverse of encode(); each page is linked in front of the previous one
+  void decode(ceph::buffer::list::const_iterator &p) {
+    using ceph::decode;
+    ceph_assert(empty());
+    decode(page_size, p);
+    unsigned count;
+    decode(count, p);
+    auto cur = pages.end();
+    for (unsigned i = 0; i < count; i++) {
+      auto page = Page::create(page_size);
+      page->decode(p, page_size);
+      cur = pages.insert_before(cur, *page);
+    }
+  }
+};
+
+#endif // CEPH_PAGESET_H