diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/osdc | |
parent | Initial commit. (diff) | |
download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/osdc/CMakeLists.txt | 9 | ||||
-rw-r--r-- | src/osdc/Filer.cc | 487 | ||||
-rw-r--r-- | src/osdc/Filer.h | 302 | ||||
-rw-r--r-- | src/osdc/Journaler.cc | 1607 | ||||
-rw-r--r-- | src/osdc/Journaler.h | 540 | ||||
-rw-r--r-- | src/osdc/ObjectCacher.cc | 2800 | ||||
-rw-r--r-- | src/osdc/ObjectCacher.h | 774 | ||||
-rw-r--r-- | src/osdc/Objecter.cc | 5285 | ||||
-rw-r--r-- | src/osdc/Objecter.h | 3067 | ||||
-rw-r--r-- | src/osdc/Striper.cc | 411 | ||||
-rw-r--r-- | src/osdc/Striper.h | 113 | ||||
-rw-r--r-- | src/osdc/WritebackHandler.h | 57 |
12 files changed, 15452 insertions, 0 deletions
diff --git a/src/osdc/CMakeLists.txt b/src/osdc/CMakeLists.txt new file mode 100644 index 00000000..ef34e629 --- /dev/null +++ b/src/osdc/CMakeLists.txt @@ -0,0 +1,9 @@ +set(osdc_files + Filer.cc + ObjectCacher.cc + Objecter.cc + Striper.cc) +add_library(osdc STATIC ${osdc_files}) +if(WITH_LTTNG AND WITH_EVENTTRACE) + add_dependencies(osdc eventtrace_tp) +endif() diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc new file mode 100644 index 00000000..086daf71 --- /dev/null +++ b/src/osdc/Filer.cc @@ -0,0 +1,487 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include <mutex> +#include <algorithm> +#include "Filer.h" +#include "osd/OSDMap.h" +#include "Striper.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDMap.h" + +#include "msg/Messenger.h" + +#include "include/Context.h" + +#include "common/Finisher.h" +#include "common/config.h" + +#define dout_subsys ceph_subsys_filer +#undef dout_prefix +#define dout_prefix *_dout << objecter->messenger->get_myname() << ".filer " + +class Filer::C_Probe : public Context { +public: + Filer *filer; + Probe *probe; + object_t oid; + uint64_t size; + ceph::real_time mtime; + C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), + size(0) {} + void finish(int r) override { + if (r == -ENOENT) { + r = 0; + ceph_assert(size == 0); + } + + bool probe_complete; + { + Probe::unique_lock pl(probe->lock); + if (r != 0) { + probe->err = r; + } + + probe_complete = filer->_probed(probe, oid, size, mtime, pl); + ceph_assert(!pl.owns_lock()); + } + if (probe_complete) { + 
probe->onfinish->complete(probe->err); + delete probe; + } + } +}; + +int Filer::probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, // LB, when !fwd + ceph::real_time *pmtime, + bool fwd, + int flags, + Context *onfinish) +{ + ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ") + << hex << ino << dec + << " starting from " << start_from + << dendl; + + ceph_assert(snapid); // (until there is a non-NOSNAP write) + + Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime, + flags, fwd, onfinish); + + return probe_impl(probe, layout, start_from, end); +} + +int Filer::probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, // LB, when !fwd + utime_t *pmtime, + bool fwd, + int flags, + Context *onfinish) +{ + ldout(cct, 10) << "probe " << (fwd ? "fwd ":"bwd ") + << hex << ino << dec + << " starting from " << start_from + << dendl; + + ceph_assert(snapid); // (until there is a non-NOSNAP write) + + Probe *probe = new Probe(ino, *layout, snapid, start_from, end, pmtime, + flags, fwd, onfinish); + return probe_impl(probe, layout, start_from, end); +} + +int Filer::probe_impl(Probe* probe, file_layout_t *layout, + uint64_t start_from, uint64_t *end) // LB, when !fwd +{ + // period (bytes before we jump unto a new set of object(s)) + uint64_t period = layout->get_period(); + + // start with 1+ periods. 
+ probe->probing_len = period; + if (probe->fwd) { + if (start_from % period) + probe->probing_len += period - (start_from % period); + } else { + ceph_assert(start_from > *end); + if (start_from % period) + probe->probing_len -= period - (start_from % period); + probe->probing_off -= probe->probing_len; + } + + Probe::unique_lock pl(probe->lock); + _probe(probe, pl); + ceph_assert(!pl.owns_lock()); + + return 0; +} + + + +/** + * probe->lock must be initially locked, this function will release it + */ +void Filer::_probe(Probe *probe, Probe::unique_lock& pl) +{ + ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock); + + ldout(cct, 10) << "_probe " << hex << probe->ino << dec + << " " << probe->probing_off << "~" << probe->probing_len + << dendl; + + // map range onto objects + probe->known_size.clear(); + probe->probing.clear(); + Striper::file_to_extents(cct, probe->ino, &probe->layout, probe->probing_off, + probe->probing_len, 0, probe->probing); + + std::vector<ObjectExtent> stat_extents; + for (vector<ObjectExtent>::iterator p = probe->probing.begin(); + p != probe->probing.end(); + ++p) { + ldout(cct, 10) << "_probe probing " << p->oid << dendl; + probe->ops.insert(p->oid); + stat_extents.push_back(*p); + } + + pl.unlock(); + for (std::vector<ObjectExtent>::iterator i = stat_extents.begin(); + i != stat_extents.end(); ++i) { + C_Probe *c = new C_Probe(this, probe, i->oid); + objecter->stat(i->oid, i->oloc, probe->snapid, &c->size, &c->mtime, + probe->flags | CEPH_OSD_FLAG_RWORDERED, + new C_OnFinisher(c, finisher)); + } +} + +/** + * probe->lock must be initially held, and will be released by this function. + * + * @return true if probe is complete and Probe object may be freed. 
+ */ +bool Filer::_probed(Probe *probe, const object_t& oid, uint64_t size, + ceph::real_time mtime, Probe::unique_lock& pl) +{ + ceph_assert(pl.owns_lock() && pl.mutex() == &probe->lock); + + ldout(cct, 10) << "_probed " << probe->ino << " object " << oid + << " has size " << size << " mtime " << mtime << dendl; + + probe->known_size[oid] = size; + if (mtime > probe->max_mtime) + probe->max_mtime = mtime; + + ceph_assert(probe->ops.count(oid)); + probe->ops.erase(oid); + + if (!probe->ops.empty()) { + pl.unlock(); + return false; // waiting for more! + } + + if (probe->err) { // we hit an error, propagate back up + pl.unlock(); + return true; + } + + // analyze! + uint64_t end = 0; + + if (!probe->fwd) { + std::reverse(probe->probing.begin(), probe->probing.end()); + } + + for (vector<ObjectExtent>::iterator p = probe->probing.begin(); + p != probe->probing.end(); + ++p) { + uint64_t shouldbe = p->length + p->offset; + ldout(cct, 10) << "_probed " << probe->ino << " object " << hex + << p->oid << dec << " should be " << shouldbe + << ", actual is " << probe->known_size[p->oid] + << dendl; + + if (!probe->found_size) { + ceph_assert(probe->known_size[p->oid] <= shouldbe); + + if ((probe->fwd && probe->known_size[p->oid] == shouldbe) || + (!probe->fwd && probe->known_size[p->oid] == 0 && + probe->probing_off > 0)) + continue; // keep going + + // aha, we found the end! + // calc offset into buffer_extent to get distance from probe->from. 
+ uint64_t oleft = probe->known_size[p->oid] - p->offset; + for (vector<pair<uint64_t, uint64_t> >::iterator i + = p->buffer_extents.begin(); + i != p->buffer_extents.end(); + ++i) { + if (oleft <= (uint64_t)i->second) { + end = probe->probing_off + i->first + oleft; + ldout(cct, 10) << "_probed end is in buffer_extent " << i->first + << "~" << i->second << " off " << oleft + << ", from was " << probe->probing_off << ", end is " + << end << dendl; + + probe->found_size = true; + ldout(cct, 10) << "_probed found size at " << end << dendl; + *probe->psize = end; + + if (!probe->pmtime && + !probe->pumtime) // stop if we don't need mtime too + break; + } + oleft -= i->second; + } + } + break; + } + + if (!probe->found_size || (probe->probing_off && (probe->pmtime || + probe->pumtime))) { + // keep probing! + ldout(cct, 10) << "_probed probing further" << dendl; + + uint64_t period = probe->layout.get_period(); + if (probe->fwd) { + probe->probing_off += probe->probing_len; + ceph_assert(probe->probing_off % period == 0); + probe->probing_len = period; + } else { + // previous period. + ceph_assert(probe->probing_off % period == 0); + probe->probing_len = period; + probe->probing_off -= period; + } + _probe(probe, pl); + ceph_assert(!pl.owns_lock()); + return false; + } else if (probe->pmtime) { + ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl; + *probe->pmtime = probe->max_mtime; + } else if (probe->pumtime) { + ldout(cct, 10) << "_probed found mtime " << probe->max_mtime << dendl; + *probe->pumtime = ceph::real_clock::to_ceph_timespec(probe->max_mtime); + } + // done! 
+ pl.unlock(); + return true; +} + + +// ----------------------- + +struct PurgeRange { + std::mutex lock; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + inodeno_t ino; + file_layout_t layout; + SnapContext snapc; + uint64_t first, num; + ceph::real_time mtime; + int flags; + Context *oncommit; + int uncommitted; + int err = 0; + PurgeRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc, + uint64_t fo, uint64_t no, ceph::real_time t, int fl, + Context *fin) + : ino(i), layout(l), snapc(sc), first(fo), num(no), mtime(t), flags(fl), + oncommit(fin), uncommitted(0) {} +}; + +int Filer::purge_range(inodeno_t ino, + const file_layout_t *layout, + const SnapContext& snapc, + uint64_t first_obj, uint64_t num_obj, + ceph::real_time mtime, + int flags, + Context *oncommit) +{ + ceph_assert(num_obj > 0); + + // single object? easy! + if (num_obj == 1) { + object_t oid = file_object_t(ino, first_obj); + object_locator_t oloc = OSDMap::file_to_object_locator(*layout); + objecter->remove(oid, oloc, snapc, mtime, flags, oncommit); + return 0; + } + + PurgeRange *pr = new PurgeRange(ino, *layout, snapc, first_obj, + num_obj, mtime, flags, oncommit); + + _do_purge_range(pr, 0, 0); + return 0; +} + +struct C_PurgeRange : public Context { + Filer *filer; + PurgeRange *pr; + C_PurgeRange(Filer *f, PurgeRange *p) : filer(f), pr(p) {} + void finish(int r) override { + filer->_do_purge_range(pr, 1, r); + } +}; + +void Filer::_do_purge_range(PurgeRange *pr, int fin, int err) +{ + PurgeRange::unique_lock prl(pr->lock); + if (err && err != -ENOENT) + pr->err = err; + pr->uncommitted -= fin; + ldout(cct, 10) << "_do_purge_range " << pr->ino << " objects " << pr->first + << "~" << pr->num << " uncommitted " << pr->uncommitted + << dendl; + + if (pr->num == 0 && pr->uncommitted == 0) { + pr->oncommit->complete(pr->err); + prl.unlock(); + delete pr; + return; + } + + std::vector<object_t> remove_oids; + + int max = 
cct->_conf->filer_max_purge_ops - pr->uncommitted; + while (pr->num > 0 && max > 0) { + remove_oids.push_back(file_object_t(pr->ino, pr->first)); + pr->uncommitted++; + pr->first++; + pr->num--; + max--; + } + prl.unlock(); + + // Issue objecter ops outside pr->lock to avoid lock dependency loop + for (const auto& oid : remove_oids) { + object_locator_t oloc = OSDMap::file_to_object_locator(pr->layout); + objecter->remove(oid, oloc, pr->snapc, pr->mtime, pr->flags, + new C_OnFinisher(new C_PurgeRange(this, pr), finisher)); + } +} + +// ----------------------- +struct TruncRange { + std::mutex lock; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + inodeno_t ino; + file_layout_t layout; + SnapContext snapc; + ceph::real_time mtime; + int flags; + Context *oncommit; + int uncommitted; + uint64_t offset; + uint64_t length; + uint32_t truncate_seq; + TruncRange(inodeno_t i, const file_layout_t& l, const SnapContext& sc, + ceph::real_time t, int fl, Context *fin, + uint64_t off, uint64_t len, uint32_t ts) + : ino(i), layout(l), snapc(sc), mtime(t), flags(fl), oncommit(fin), + uncommitted(0), offset(off), length(len), truncate_seq(ts) {} +}; + +void Filer::truncate(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + __u32 truncate_seq, + ceph::real_time mtime, + int flags, + Context *oncommit) +{ + uint64_t period = layout->get_period(); + uint64_t num_objs = Striper::get_num_objects(*layout, len + (offset % period)); + if (num_objs == 1) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + vector<OSDOp> ops(1); + ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC; + ops[0].op.extent.truncate_seq = truncate_seq; + ops[0].op.extent.truncate_size = extents[0].offset; + objecter->_modify(extents[0].oid, extents[0].oloc, ops, mtime, snapc, + flags, oncommit); + return; + } + + if (len > 0 && (offset + len) % period) + len += 
period - ((offset + len) % period); + + TruncRange *tr = new TruncRange(ino, *layout, snapc, mtime, flags, oncommit, + offset, len, truncate_seq); + _do_truncate_range(tr, 0); +} + +struct C_TruncRange : public Context { + Filer *filer; + TruncRange *tr; + C_TruncRange(Filer *f, TruncRange *t) : filer(f), tr(t) {} + void finish(int r) override { + filer->_do_truncate_range(tr, 1); + } +}; + +void Filer::_do_truncate_range(TruncRange *tr, int fin) +{ + TruncRange::unique_lock trl(tr->lock); + tr->uncommitted -= fin; + ldout(cct, 10) << "_do_truncate_range " << tr->ino << " objects " << tr->offset + << "~" << tr->length << " uncommitted " << tr->uncommitted + << dendl; + + if (tr->length == 0 && tr->uncommitted == 0) { + tr->oncommit->complete(0); + trl.unlock(); + delete tr; + return; + } + + vector<ObjectExtent> extents; + + int max = cct->_conf->filer_max_truncate_ops - tr->uncommitted; + if (max > 0 && tr->length > 0) { + uint64_t len = tr->layout.get_period() * max; + if (len > tr->length) + len = tr->length; + + uint64_t offset = tr->offset + tr->length - len; + Striper::file_to_extents(cct, tr->ino, &tr->layout, offset, len, 0, extents); + tr->uncommitted += extents.size(); + tr->length -= len; + } + + trl.unlock(); + + // Issue objecter ops outside tr->lock to avoid lock dependency loop + for (const auto& p : extents) { + vector<OSDOp> ops(1); + ops[0].op.op = CEPH_OSD_OP_TRIMTRUNC; + ops[0].op.extent.truncate_size = p.offset; + ops[0].op.extent.truncate_seq = tr->truncate_seq; + objecter->_modify(p.oid, p.oloc, ops, tr->mtime, tr->snapc, tr->flags, + new C_OnFinisher(new C_TruncRange(this, tr), finisher)); + } +} diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h new file mode 100644 index 00000000..ea9ac170 --- /dev/null +++ b/src/osdc/Filer.h @@ -0,0 +1,302 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILER_H +#define CEPH_FILER_H + +/*** Filer + * + * stripe file ranges onto objects. + * build list<ObjectExtent> for the objecter or objectcacher. + * + * also, provide convenience methods that call objecter for you. + * + * "files" are identified by ino. + */ + + +#include <mutex> + +#include "include/types.h" + +#include "common/ceph_time.h" + +#include "osd/OSDMap.h" +#include "Objecter.h" +#include "Striper.h" + +class Context; +class Messenger; +class OSDMap; +class Finisher; + + +/**** Filer interface ***/ + +class Filer { + CephContext *cct; + Objecter *objecter; + Finisher *finisher; + + // probes + struct Probe { + std::mutex lock; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + inodeno_t ino; + file_layout_t layout; + snapid_t snapid; + + uint64_t *psize; + ceph::real_time *pmtime; + utime_t *pumtime; + + int flags; + + bool fwd; + + Context *onfinish; + + vector<ObjectExtent> probing; + uint64_t probing_off, probing_len; + + map<object_t, uint64_t> known_size; + ceph::real_time max_mtime; + + set<object_t> ops; + + int err; + bool found_size; + + Probe(inodeno_t i, file_layout_t &l, snapid_t sn, + uint64_t f, uint64_t *e, ceph::real_time *m, int fl, bool fw, + Context *c) : + ino(i), layout(l), snapid(sn), + psize(e), pmtime(m), pumtime(nullptr), flags(fl), fwd(fw), onfinish(c), + probing_off(f), probing_len(0), + err(0), found_size(false) {} + + Probe(inodeno_t i, file_layout_t &l, snapid_t sn, + uint64_t f, uint64_t *e, utime_t *m, int fl, bool fw, + Context *c) : + ino(i), layout(l), snapid(sn), + psize(e), pmtime(nullptr), pumtime(m), flags(fl), fwd(fw), + onfinish(c), probing_off(f), probing_len(0), + err(0), found_size(false) {} + 
}; + + class C_Probe; + + void _probe(Probe *p, Probe::unique_lock& pl); + bool _probed(Probe *p, const object_t& oid, uint64_t size, + ceph::real_time mtime, Probe::unique_lock& pl); + + public: + Filer(const Filer& other); + const Filer operator=(const Filer& other); + + Filer(Objecter *o, Finisher *f) : cct(o->cct), objecter(o), finisher(f) {} + ~Filer() {} + + bool is_active() { + return objecter->is_active(); // || (oc && oc->is_active()); + } + + + /*** async file interface. scatter/gather as needed. ***/ + + void read(inodeno_t ino, + file_layout_t *layout, + snapid_t snap, + uint64_t offset, + uint64_t len, + bufferlist *bl, // ptr to data + int flags, + Context *onfinish, + int op_flags = 0) { + ceph_assert(snap); // (until there is a non-NOSNAP write) + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + objecter->sg_read(extents, snap, bl, flags, onfinish, op_flags); + } + + void read_trunc(inodeno_t ino, + file_layout_t *layout, + snapid_t snap, + uint64_t offset, + uint64_t len, + bufferlist *bl, // ptr to data + int flags, + uint64_t truncate_size, + __u32 truncate_seq, + Context *onfinish, + int op_flags = 0) { + ceph_assert(snap); // (until there is a non-NOSNAP write) + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, + extents); + objecter->sg_read_trunc(extents, snap, bl, flags, + truncate_size, truncate_seq, onfinish, op_flags); + } + + void write(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + bufferlist& bl, + ceph::real_time mtime, + int flags, + Context *oncommit, + int op_flags = 0) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + objecter->sg_write(extents, snapc, bl, mtime, flags, oncommit, op_flags); + } + + void write_trunc(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t 
len, + bufferlist& bl, + ceph::real_time mtime, + int flags, + uint64_t truncate_size, + __u32 truncate_seq, + Context *oncommit, + int op_flags = 0) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, truncate_size, + extents); + objecter->sg_write_trunc(extents, snapc, bl, mtime, flags, + truncate_size, truncate_seq, oncommit, op_flags); + } + + void truncate(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + __u32 truncate_seq, + ceph::real_time mtime, + int flags, + Context *oncommit); + void _do_truncate_range(struct TruncRange *pr, int fin); + + void zero(inodeno_t ino, + const file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + ceph::real_time mtime, + int flags, + bool keep_first, + Context *oncommit) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, ino, layout, offset, len, 0, extents); + if (extents.size() == 1) { + if (extents[0].offset == 0 && extents[0].length == layout->object_size + && (!keep_first || extents[0].objectno != 0)) + objecter->remove(extents[0].oid, extents[0].oloc, + snapc, mtime, flags, oncommit); + else + objecter->zero(extents[0].oid, extents[0].oloc, extents[0].offset, + extents[0].length, snapc, mtime, flags, oncommit); + } else { + C_GatherBuilder gcom(cct, oncommit); + for (vector<ObjectExtent>::iterator p = extents.begin(); + p != extents.end(); + ++p) { + if (p->offset == 0 && p->length == layout->object_size && + (!keep_first || p->objectno != 0)) + objecter->remove(p->oid, p->oloc, + snapc, mtime, flags, + oncommit ? gcom.new_sub():0); + else + objecter->zero(p->oid, p->oloc, p->offset, p->length, + snapc, mtime, flags, + oncommit ? 
gcom.new_sub():0); + } + gcom.activate(); + } + } + + void zero(inodeno_t ino, + file_layout_t *layout, + const SnapContext& snapc, + uint64_t offset, + uint64_t len, + ceph::real_time mtime, + int flags, + Context *oncommit) { + zero(ino, layout, + snapc, offset, + len, mtime, + flags, false, + oncommit); + } + // purge range of ino.### objects + int purge_range(inodeno_t ino, + const file_layout_t *layout, + const SnapContext& snapc, + uint64_t first_obj, uint64_t num_obj, + ceph::real_time mtime, + int flags, Context *oncommit); + void _do_purge_range(struct PurgeRange *pr, int fin, int err); + + /* + * probe + * specify direction, + * and whether we stop when we find data, or hole. + */ + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + ceph::real_time *mtime, + bool fwd, + int flags, + Context *onfinish); + + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + bool fwd, + int flags, + Context *onfinish) { + return probe(ino, layout, snapid, start_from, end, + (ceph::real_time* )0, fwd, flags, onfinish); + } + + int probe(inodeno_t ino, + file_layout_t *layout, + snapid_t snapid, + uint64_t start_from, + uint64_t *end, + utime_t *mtime, + bool fwd, + int flags, + Context *onfinish); + +private: + int probe_impl(Probe* probe, file_layout_t *layout, + uint64_t start_from, uint64_t *end); +}; + +#endif // !CEPH_FILER_H diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc new file mode 100644 index 00000000..5c00293e --- /dev/null +++ b/src/osdc/Journaler.cc @@ -0,0 +1,1607 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as 
published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/perf_counters.h" +#include "common/dout.h" +#include "include/Context.h" +#include "msg/Messenger.h" +#include "osdc/Journaler.h" +#include "common/errno.h" +#include "include/ceph_assert.h" +#include "common/Finisher.h" + +#define dout_subsys ceph_subsys_journaler +#undef dout_prefix +#define dout_prefix *_dout << objecter->messenger->get_myname() \ + << ".journaler." << name << (readonly ? "(ro) ":"(rw) ") + +using std::chrono::seconds; + + +class Journaler::C_DelayFlush : public Context { + Journaler *journaler; + public: + explicit C_DelayFlush(Journaler *j) : journaler(j) {} + void finish(int r) override { + journaler->_do_delayed_flush(); + } +}; + +void Journaler::set_readonly() +{ + lock_guard l(lock); + + ldout(cct, 1) << "set_readonly" << dendl; + readonly = true; +} + +void Journaler::set_writeable() +{ + lock_guard l(lock); + + ldout(cct, 1) << "set_writeable" << dendl; + readonly = false; +} + +void Journaler::create(file_layout_t *l, stream_format_t const sf) +{ + lock_guard lk(lock); + + ceph_assert(!readonly); + state = STATE_ACTIVE; + + stream_format = sf; + journal_stream.set_format(sf); + _set_layout(l); + + prezeroing_pos = prezero_pos = write_pos = flush_pos = + safe_pos = read_pos = requested_pos = received_pos = + expire_pos = trimming_pos = trimmed_pos = + next_safe_pos = layout.get_period(); + + ldout(cct, 1) << "created blank journal at inode 0x" << std::hex << ino + << std::dec << ", format=" << stream_format << dendl; +} + +void Journaler::set_layout(file_layout_t const *l) +{ + lock_guard lk(lock); + _set_layout(l); +} + +void Journaler::_set_layout(file_layout_t const *l) +{ + layout = *l; + + if (layout.pool_id != pg_pool) { + // user can reset pool id through cephfs-journal-tool + lderr(cct) << "may got older pool id from header layout" << dendl; + ceph_abort(); + } + last_written.layout = layout; + last_committed.layout = layout; + + // 
prefetch intelligently. + // (watch out, this is big if you use big objects or weird striping) + uint64_t periods = cct->_conf.get_val<uint64_t>("journaler_prefetch_periods"); + fetch_len = layout.get_period() * periods; +} + + +/***************** HEADER *******************/ + +ostream& operator<<(ostream &out, const Journaler::Header &h) +{ + return out << "loghead(trim " << h.trimmed_pos + << ", expire " << h.expire_pos + << ", write " << h.write_pos + << ", stream_format " << (int)(h.stream_format) + << ")"; +} + +class Journaler::C_ReadHead : public Context { + Journaler *ls; +public: + bufferlist bl; + explicit C_ReadHead(Journaler *l) : ls(l) {} + void finish(int r) override { + ls->_finish_read_head(r, bl); + } +}; + +class Journaler::C_RereadHead : public Context { + Journaler *ls; + Context *onfinish; +public: + bufferlist bl; + C_RereadHead(Journaler *l, Context *onfinish_) : ls (l), + onfinish(onfinish_) {} + void finish(int r) override { + ls->_finish_reread_head(r, bl, onfinish); + } +}; + +class Journaler::C_ProbeEnd : public Context { + Journaler *ls; +public: + uint64_t end; + explicit C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} + void finish(int r) override { + ls->_finish_probe_end(r, end); + } +}; + +class Journaler::C_ReProbe : public Context { + Journaler *ls; + C_OnFinisher *onfinish; +public: + uint64_t end; + C_ReProbe(Journaler *l, C_OnFinisher *onfinish_) : + ls(l), onfinish(onfinish_), end(0) {} + void finish(int r) override { + ls->_finish_reprobe(r, end, onfinish); + } +}; + +void Journaler::recover(Context *onread) +{ + lock_guard l(lock); + if (is_stopping()) { + onread->complete(-EAGAIN); + return; + } + + ldout(cct, 1) << "recover start" << dendl; + ceph_assert(state != STATE_ACTIVE); + ceph_assert(readonly); + + if (onread) + waitfor_recover.push_back(wrap_finisher(onread)); + + if (state != STATE_UNDEF) { + ldout(cct, 1) << "recover - already recovering" << dendl; + return; + } + + ldout(cct, 1) << "read_head" << dendl; + state = 
STATE_READHEAD; + C_ReadHead *fin = new C_ReadHead(this); + _read_head(fin, &fin->bl); +} + +void Journaler::_read_head(Context *on_finish, bufferlist *bl) +{ + // lock is locked + ceph_assert(state == STATE_READHEAD || state == STATE_REREADHEAD); + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(pg_pool); + objecter->read_full(oid, oloc, CEPH_NOSNAP, bl, 0, wrap_finisher(on_finish)); +} + +void Journaler::reread_head(Context *onfinish) +{ + lock_guard l(lock); + _reread_head(wrap_finisher(onfinish)); +} + +/** + * Re-read the head from disk, and set the write_pos, expire_pos, trimmed_pos + * from the on-disk header. This switches the state to STATE_REREADHEAD for + * the duration, and you shouldn't start a re-read while other operations are + * in-flight, nor start other operations while a re-read is in progress. + * Also, don't call this until the Journaler has finished its recovery and has + * gone STATE_ACTIVE! + */ +void Journaler::_reread_head(Context *onfinish) +{ + ldout(cct, 10) << "reread_head" << dendl; + ceph_assert(state == STATE_ACTIVE); + + state = STATE_REREADHEAD; + C_RereadHead *fin = new C_RereadHead(this, onfinish); + _read_head(fin, &fin->bl); +} + +void Journaler::_finish_reread_head(int r, bufferlist& bl, Context *finish) +{ + lock_guard l(lock); + if (is_stopping()) { + finish->complete(-EAGAIN); + return; + } + + //read on-disk header into + ceph_assert(bl.length() || r < 0 ); + + // unpack header + if (r == 0) { + Header h; + auto p = bl.cbegin(); + try { + decode(h, p); + } catch (const buffer::error &e) { + finish->complete(-EINVAL); + return; + } + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos + = h.write_pos; + expire_pos = h.expire_pos; + trimmed_pos = trimming_pos = h.trimmed_pos; + init_headers(h); + state = STATE_ACTIVE; + } + + finish->complete(r); +} + +void Journaler::_finish_read_head(int r, bufferlist& bl) +{ + lock_guard l(lock); + if (is_stopping()) + return; + + 
ceph_assert(state == STATE_READHEAD); + + if (r!=0) { + ldout(cct, 0) << "error getting journal off disk" << dendl; + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, r); + return; + } + + if (bl.length() == 0) { + ldout(cct, 1) << "_finish_read_head r=" << r + << " read 0 bytes, assuming empty log" << dendl; + state = STATE_ACTIVE; + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, 0); + return; + } + + // unpack header + bool corrupt = false; + Header h; + auto p = bl.cbegin(); + try { + decode(h, p); + + if (h.magic != magic) { + ldout(cct, 0) << "on disk magic '" << h.magic << "' != my magic '" + << magic << "'" << dendl; + corrupt = true; + } else if (h.write_pos < h.expire_pos || h.expire_pos < h.trimmed_pos) { + ldout(cct, 0) << "Corrupt header (bad offsets): " << h << dendl; + corrupt = true; + } + } catch (const buffer::error &e) { + corrupt = true; + } + + if (corrupt) { + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, -EINVAL); + return; + } + + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos + = h.write_pos; + read_pos = requested_pos = received_pos = expire_pos = h.expire_pos; + trimmed_pos = trimming_pos = h.trimmed_pos; + + init_headers(h); + _set_layout(&h.layout); + stream_format = h.stream_format; + journal_stream.set_format(h.stream_format); + + ldout(cct, 1) << "_finish_read_head " << h + << ". probing for end of log (from " << write_pos << ")..." 
+ << dendl; + C_ProbeEnd *fin = new C_ProbeEnd(this); + state = STATE_PROBING; + _probe(fin, &fin->end); +} + +void Journaler::_probe(Context *finish, uint64_t *end) +{ + // lock is locked + ldout(cct, 1) << "probing for end of the log" << dendl; + ceph_assert(state == STATE_PROBING || state == STATE_REPROBING); + // probe the log + filer.probe(ino, &layout, CEPH_NOSNAP, + write_pos, end, true, 0, wrap_finisher(finish)); +} + +void Journaler::_reprobe(C_OnFinisher *finish) +{ + ldout(cct, 10) << "reprobe" << dendl; + ceph_assert(state == STATE_ACTIVE); + + state = STATE_REPROBING; + C_ReProbe *fin = new C_ReProbe(this, finish); + _probe(fin, &fin->end); +} + + +void Journaler::_finish_reprobe(int r, uint64_t new_end, + C_OnFinisher *onfinish) +{ + lock_guard l(lock); + if (is_stopping()) { + onfinish->complete(-EAGAIN); + return; + } + + ceph_assert(new_end >= write_pos || r < 0); + ldout(cct, 1) << "_finish_reprobe new_end = " << new_end + << " (header had " << write_pos << ")." + << dendl; + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = new_end; + state = STATE_ACTIVE; + onfinish->complete(r); +} + +void Journaler::_finish_probe_end(int r, uint64_t end) +{ + lock_guard l(lock); + if (is_stopping()) + return; + + ceph_assert(state == STATE_PROBING); + if (r < 0) { // error in probing + goto out; + } + if (((int64_t)end) == -1) { + end = write_pos; + ldout(cct, 1) << "_finish_probe_end write_pos = " << end << " (header had " + << write_pos << "). log was empty. recovered." << dendl; + ceph_abort(); // hrm. + } else { + ceph_assert(end >= write_pos); + ldout(cct, 1) << "_finish_probe_end write_pos = " << end + << " (header had " << write_pos << "). recovered." + << dendl; + } + + state = STATE_ACTIVE; + + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = end; + +out: + // done. 
+ list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, r); +} + +class Journaler::C_RereadHeadProbe : public Context +{ + Journaler *ls; + C_OnFinisher *final_finish; +public: + C_RereadHeadProbe(Journaler *l, C_OnFinisher *finish) : + ls(l), final_finish(finish) {} + void finish(int r) override { + ls->_finish_reread_head_and_probe(r, final_finish); + } +}; + +void Journaler::reread_head_and_probe(Context *onfinish) +{ + lock_guard l(lock); + + ceph_assert(state == STATE_ACTIVE); + _reread_head(new C_RereadHeadProbe(this, wrap_finisher(onfinish))); +} + +void Journaler::_finish_reread_head_and_probe(int r, C_OnFinisher *onfinish) +{ + // Expect to be called back from finish_reread_head, which already takes lock + // lock is locked + if (is_stopping()) { + onfinish->complete(-EAGAIN); + return; + } + + // Let the caller know that the operation has failed or was intentionally + // failed since the caller has been blacklisted. + if (r == -EBLACKLISTED) { + onfinish->complete(r); + return; + } + + ceph_assert(!r); //if we get an error, we're boned + _reprobe(onfinish); +} + + +// WRITING + +class Journaler::C_WriteHead : public Context { +public: + Journaler *ls; + Header h; + C_OnFinisher *oncommit; + C_WriteHead(Journaler *l, Header& h_, C_OnFinisher *c) : ls(l), h(h_), + oncommit(c) {} + void finish(int r) override { + ls->_finish_write_head(r, h, oncommit); + } +}; + +void Journaler::write_head(Context *oncommit) +{ + lock_guard l(lock); + _write_head(oncommit); +} + + +void Journaler::_write_head(Context *oncommit) +{ + ceph_assert(!readonly); + ceph_assert(state == STATE_ACTIVE); + last_written.trimmed_pos = trimmed_pos; + last_written.expire_pos = expire_pos; + last_written.unused_field = expire_pos; + last_written.write_pos = safe_pos; + last_written.stream_format = stream_format; + ldout(cct, 10) << "write_head " << last_written << dendl; + + // Avoid persisting bad pointers in case of bugs + ceph_assert(last_written.write_pos >= 
last_written.expire_pos); + ceph_assert(last_written.expire_pos >= last_written.trimmed_pos); + + last_wrote_head = ceph::real_clock::now(); + + bufferlist bl; + encode(last_written, bl); + SnapContext snapc; + + object_t oid = file_object_t(ino, 0); + object_locator_t oloc(pg_pool); + objecter->write_full(oid, oloc, snapc, bl, ceph::real_clock::now(), 0, + wrap_finisher(new C_WriteHead( + this, last_written, + wrap_finisher(oncommit))), + 0, 0, write_iohint); +} + +void Journaler::_finish_write_head(int r, Header &wrote, + C_OnFinisher *oncommit) +{ + lock_guard l(lock); + + if (r < 0) { + lderr(cct) << "_finish_write_head got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + ceph_assert(!readonly); + ldout(cct, 10) << "_finish_write_head " << wrote << dendl; + last_committed = wrote; + if (oncommit) { + oncommit->complete(r); + } + + _trim(); // trim? +} + + +/***************** WRITING *******************/ + +class Journaler::C_Flush : public Context { + Journaler *ls; + uint64_t start; + ceph::real_time stamp; +public: + C_Flush(Journaler *l, int64_t s, ceph::real_time st) + : ls(l), start(s), stamp(st) {} + void finish(int r) override { + ls->_finish_flush(r, start, stamp); + } +}; + +void Journaler::_finish_flush(int r, uint64_t start, ceph::real_time stamp) +{ + lock_guard l(lock); + ceph_assert(!readonly); + + if (r < 0) { + lderr(cct) << "_finish_flush got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(start < flush_pos); + + // calc latency? 
+ if (logger) { + ceph::timespan lat = ceph::real_clock::now() - stamp; + logger->tinc(logger_key_lat, lat); + } + + // adjust safe_pos + auto it = pending_safe.find(start); + ceph_assert(it != pending_safe.end()); + uint64_t min_next_safe_pos = pending_safe.begin()->second; + pending_safe.erase(it); + if (pending_safe.empty()) + safe_pos = next_safe_pos; + else + safe_pos = min_next_safe_pos; + + ldout(cct, 10) << "_finish_flush safe from " << start + << ", pending_safe " << pending_safe + << ", (prezeroing/prezero)/write/flush/safe positions now " + << "(" << prezeroing_pos << "/" << prezero_pos << ")/" + << write_pos << "/" << flush_pos << "/" << safe_pos + << dendl; + + // kick waiters <= safe_pos + if (!waitfor_safe.empty()) { + list<Context*> ls; + while (!waitfor_safe.empty()) { + auto it = waitfor_safe.begin(); + if (it->first > safe_pos) + break; + ls.splice(ls.end(), it->second); + waitfor_safe.erase(it); + } + finish_contexts(cct, ls); + } +} + + + +uint64_t Journaler::append_entry(bufferlist& bl) +{ + unique_lock l(lock); + + ceph_assert(!readonly); + uint32_t s = bl.length(); + + // append + size_t delta = bl.length() + journal_stream.get_envelope_size(); + // write_buf space is nearly full + if (!write_buf_throttle.get_or_fail(delta)) { + l.unlock(); + ldout(cct, 10) << "write_buf_throttle wait, delta " << delta << dendl; + write_buf_throttle.get(delta); + l.lock(); + } + ldout(cct, 20) << "write_buf_throttle get, delta " << delta << dendl; + size_t wrote = journal_stream.write(bl, &write_buf, write_pos); + ldout(cct, 10) << "append_entry len " << s << " to " << write_pos << "~" + << wrote << dendl; + write_pos += wrote; + + // flush previous object? 
+ uint64_t su = get_layout_period(); + ceph_assert(su > 0); + uint64_t write_off = write_pos % su; + uint64_t write_obj = write_pos / su; + uint64_t flush_obj = flush_pos / su; + if (write_obj != flush_obj) { + ldout(cct, 10) << " flushing completed object(s) (su " << su << " wro " + << write_obj << " flo " << flush_obj << ")" << dendl; + _do_flush(write_buf.length() - write_off); + + // if _do_flush() skips flushing some data, it does do a best effort to + // update next_safe_pos. + if (write_buf.length() > 0 && + write_buf.length() <= wrote) { // the unflushed data are within this entry + // set next_safe_pos to end of previous entry + next_safe_pos = write_pos - wrote; + } + } + + return write_pos; +} + + +void Journaler::_do_flush(unsigned amount) +{ + if (is_stopping()) + return; + if (write_pos == flush_pos) + return; + ceph_assert(write_pos > flush_pos); + ceph_assert(!readonly); + + // flush + uint64_t len = write_pos - flush_pos; + ceph_assert(len == write_buf.length()); + if (amount && amount < len) + len = amount; + + // zero at least two full periods ahead. this ensures + // that the next object will not exist. 
+ uint64_t period = get_layout_period(); + if (flush_pos + len + 2*period > prezero_pos) { + _issue_prezero(); + + int64_t newlen = prezero_pos - flush_pos - period; + if (newlen <= 0) { + ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len + << " already too close to prezero_pos " << prezero_pos + << ", zeroing first" << dendl; + waiting_for_zero_pos = flush_pos + len; + return; + } + if (static_cast<uint64_t>(newlen) < len) { + ldout(cct, 10) << "_do_flush wanted to do " << flush_pos << "~" << len + << " but hit prezero_pos " << prezero_pos + << ", will do " << flush_pos << "~" << newlen << dendl; + waiting_for_zero_pos = flush_pos + len; + len = newlen; + } + } + ldout(cct, 10) << "_do_flush flushing " << flush_pos << "~" << len << dendl; + + // submit write for anything pending + // flush _start_ pos to _finish_flush + ceph::real_time now = ceph::real_clock::now(); + SnapContext snapc; + + Context *onsafe = new C_Flush(this, flush_pos, now); // on COMMIT + pending_safe[flush_pos] = next_safe_pos; + + bufferlist write_bl; + + // adjust pointers + if (len == write_buf.length()) { + write_bl.swap(write_buf); + next_safe_pos = write_pos; + } else { + write_buf.splice(0, len, &write_bl); + // Keys of waitfor_safe map are journal entry boundaries. + // Try finding a journal entry that we are actually flushing + // and set next_safe_pos to end of it. This is best effort. + // The one we found may not be the lastest flushing entry. 
+ auto p = waitfor_safe.lower_bound(flush_pos + len); + if (p != waitfor_safe.end()) { + if (p->first > flush_pos + len && p != waitfor_safe.begin()) + --p; + if (p->first <= flush_pos + len && p->first > next_safe_pos) + next_safe_pos = p->first; + } + } + + filer.write(ino, &layout, snapc, + flush_pos, len, write_bl, ceph::real_clock::now(), + 0, + wrap_finisher(onsafe), write_iohint); + + flush_pos += len; + ceph_assert(write_buf.length() == write_pos - flush_pos); + write_buf_throttle.put(len); + ldout(cct, 20) << "write_buf_throttle put, len " << len << dendl; + + ldout(cct, 10) + << "_do_flush (prezeroing/prezero)/write/flush/safe pointers now at " + << "(" << prezeroing_pos << "/" << prezero_pos << ")/" << write_pos + << "/" << flush_pos << "/" << safe_pos << dendl; + + _issue_prezero(); +} + + +void Journaler::wait_for_flush(Context *onsafe) +{ + lock_guard l(lock); + if (is_stopping()) { + if (onsafe) + onsafe->complete(-EAGAIN); + return; + } + _wait_for_flush(onsafe); +} + +void Journaler::_wait_for_flush(Context *onsafe) +{ + ceph_assert(!readonly); + + // all flushed and safe? 
+ if (write_pos == safe_pos) { + ceph_assert(write_buf.length() == 0); + ldout(cct, 10) + << "flush nothing to flush, (prezeroing/prezero)/write/flush/safe " + "pointers at " << "(" << prezeroing_pos << "/" << prezero_pos << ")/" + << write_pos << "/" << flush_pos << "/" << safe_pos << dendl; + if (onsafe) { + finisher->queue(onsafe, 0); + } + return; + } + + // queue waiter + if (onsafe) { + waitfor_safe[write_pos].push_back(wrap_finisher(onsafe)); + } +} + +void Journaler::flush(Context *onsafe) +{ + lock_guard l(lock); + if (is_stopping()) { + if (onsafe) + onsafe->complete(-EAGAIN); + return; + } + _flush(wrap_finisher(onsafe)); +} + +void Journaler::_flush(C_OnFinisher *onsafe) +{ + ceph_assert(!readonly); + + if (write_pos == flush_pos) { + ceph_assert(write_buf.length() == 0); + ldout(cct, 10) << "flush nothing to flush, (prezeroing/prezero)/write/" + "flush/safe pointers at " << "(" << prezeroing_pos << "/" << prezero_pos + << ")/" << write_pos << "/" << flush_pos << "/" << safe_pos + << dendl; + if (onsafe) { + onsafe->complete(0); + } + } else { + _do_flush(); + _wait_for_flush(onsafe); + } + + // write head? + if (_write_head_needed()) { + _write_head(); + } +} + +bool Journaler::_write_head_needed() +{ + return last_wrote_head + seconds(cct->_conf.get_val<int64_t>("journaler_write_head_interval")) + < ceph::real_clock::now(); +} + + +/*************** prezeroing ******************/ + +struct C_Journaler_Prezero : public Context { + Journaler *journaler; + uint64_t from, len; + C_Journaler_Prezero(Journaler *j, uint64_t f, uint64_t l) + : journaler(j), from(f), len(l) {} + void finish(int r) override { + journaler->_finish_prezero(r, from, len); + } +}; + +void Journaler::_issue_prezero() +{ + ceph_assert(prezeroing_pos >= flush_pos); + + uint64_t num_periods = cct->_conf.get_val<uint64_t>("journaler_prezero_periods"); + /* + * issue zero requests based on write_pos, even though the invariant + * is that we zero ahead of flush_pos. 
+ */ + uint64_t period = get_layout_period(); + uint64_t to = write_pos + period * num_periods + period - 1; + to -= to % period; + + if (prezeroing_pos >= to) { + ldout(cct, 20) << "_issue_prezero target " << to << " <= prezeroing_pos " + << prezeroing_pos << dendl; + return; + } + + while (prezeroing_pos < to) { + uint64_t len; + if (prezeroing_pos % period == 0) { + len = period; + ldout(cct, 10) << "_issue_prezero removing " << prezeroing_pos << "~" + << period << " (full period)" << dendl; + } else { + len = period - (prezeroing_pos % period); + ldout(cct, 10) << "_issue_prezero zeroing " << prezeroing_pos << "~" + << len << " (partial period)" << dendl; + } + SnapContext snapc; + Context *c = wrap_finisher(new C_Journaler_Prezero(this, prezeroing_pos, + len)); + filer.zero(ino, &layout, snapc, prezeroing_pos, len, + ceph::real_clock::now(), 0, c); + prezeroing_pos += len; + } +} + +// Lock cycle because we get called out of objecter callback (holding +// objecter read lock), but there are also cases where we take the journaler +// lock before calling into objecter to do I/O. 
+void Journaler::_finish_prezero(int r, uint64_t start, uint64_t len) +{ + lock_guard l(lock); + + ldout(cct, 10) << "_prezeroed to " << start << "~" << len + << ", prezeroing/prezero was " << prezeroing_pos << "/" + << prezero_pos << ", pending " << pending_zero + << dendl; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "_prezeroed got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(r == 0 || r == -ENOENT); + + if (start == prezero_pos) { + prezero_pos += len; + while (!pending_zero.empty() && + pending_zero.begin().get_start() == prezero_pos) { + interval_set<uint64_t>::iterator b(pending_zero.begin()); + prezero_pos += b.get_len(); + pending_zero.erase(b); + } + + if (waiting_for_zero_pos > flush_pos) { + _do_flush(waiting_for_zero_pos - flush_pos); + } + + if (prezero_pos == prezeroing_pos && + !waitfor_prezero.empty()) { + list<Context*> ls; + ls.swap(waitfor_prezero); + finish_contexts(cct, ls, 0); + } + } else { + pending_zero.insert(start, len); + } + ldout(cct, 10) << "_prezeroed prezeroing/prezero now " << prezeroing_pos + << "/" << prezero_pos + << ", pending " << pending_zero + << dendl; +} + +void Journaler::wait_for_prezero(Context *onfinish) +{ + ceph_assert(onfinish); + lock_guard l(lock); + + if (prezero_pos == prezeroing_pos) { + finisher->queue(onfinish, 0); + return; + } + waitfor_prezero.push_back(wrap_finisher(onfinish)); +} + + +/***************** READING *******************/ + + +class Journaler::C_Read : public Context { + Journaler *ls; + uint64_t offset; + uint64_t length; +public: + bufferlist bl; + C_Read(Journaler *j, uint64_t o, uint64_t l) : ls(j), offset(o), length(l) {} + void finish(int r) override { + ls->_finish_read(r, offset, length, bl); + } +}; + +class Journaler::C_RetryRead : public Context { + Journaler *ls; +public: + explicit C_RetryRead(Journaler *l) : ls(l) {} + + void finish(int r) override { + // Should only be called from waitfor_safe i.e. 
already inside lock + // (ls->lock is locked + ls->_prefetch(); + } +}; + +void Journaler::_finish_read(int r, uint64_t offset, uint64_t length, + bufferlist& bl) +{ + lock_guard l(lock); + + if (r < 0) { + ldout(cct, 0) << "_finish_read got error " << r << dendl; + error = r; + } else { + ldout(cct, 10) << "_finish_read got " << offset << "~" << bl.length() + << dendl; + if (bl.length() < length) { + ldout(cct, 0) << "_finish_read got less than expected (" << length << ")" + << dendl; + error = -EINVAL; + } + } + + if (error) { + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(error); + } + return; + } + + prefetch_buf[offset].swap(bl); + + try { + _assimilate_prefetch(); + } catch (const buffer::error &err) { + lderr(cct) << "_decode error from assimilate_prefetch" << dendl; + error = -EINVAL; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(error); + } + return; + } + _prefetch(); +} + +void Journaler::_assimilate_prefetch() +{ + bool was_readable = readable; + + bool got_any = false; + while (!prefetch_buf.empty()) { + map<uint64_t,bufferlist>::iterator p = prefetch_buf.begin(); + if (p->first != received_pos) { + uint64_t gap = p->first - received_pos; + ldout(cct, 10) << "_assimilate_prefetch gap of " << gap + << " from received_pos " << received_pos + << " to first prefetched buffer " << p->first << dendl; + break; + } + + ldout(cct, 10) << "_assimilate_prefetch " << p->first << "~" + << p->second.length() << dendl; + received_pos += p->second.length(); + read_buf.claim_append(p->second); + ceph_assert(received_pos <= requested_pos); + prefetch_buf.erase(p); + got_any = true; + } + + if (got_any) { + ldout(cct, 10) << "_assimilate_prefetch read_buf now " << read_pos << "~" + << read_buf.length() << ", read pointers read_pos=" << read_pos + << " received_pos=" << received_pos << " requested_pos=" << requested_pos + << dendl; + + // Update readability (this will also hit any decode 
errors resulting + // from bad data) + readable = _is_readable(); + } + + if ((got_any && !was_readable && readable) || read_pos == write_pos) { + // readable! + ldout(cct, 10) << "_finish_read now readable (or at journal end) readable=" + << readable << " read_pos=" << read_pos << " write_pos=" + << write_pos << dendl; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(0); + } + } +} + +void Journaler::_issue_read(uint64_t len) +{ + // stuck at safe_pos? (this is needed if we are reading the tail of + // a journal we are also writing to) + ceph_assert(requested_pos <= safe_pos); + if (requested_pos == safe_pos) { + ldout(cct, 10) << "_issue_read requested_pos = safe_pos = " << safe_pos + << ", waiting" << dendl; + ceph_assert(write_pos > requested_pos); + if (pending_safe.empty()) { + _flush(NULL); + } + + // Make sure keys of waitfor_safe map are journal entry boundaries. + // The key we used here is either next_safe_pos or old value of + // next_safe_pos. next_safe_pos is always set to journal entry + // boundary. + auto p = pending_safe.rbegin(); + if (p != pending_safe.rend()) + waitfor_safe[p->second].push_back(new C_RetryRead(this)); + else + waitfor_safe[next_safe_pos].push_back(new C_RetryRead(this)); + return; + } + + // don't read too much + if (requested_pos + len > safe_pos) { + len = safe_pos - requested_pos; + ldout(cct, 10) << "_issue_read reading only up to safe_pos " << safe_pos + << dendl; + } + + // go. + ldout(cct, 10) << "_issue_read reading " << requested_pos << "~" << len + << ", read pointers read_pos=" << read_pos << " received_pos=" << received_pos + << " requested_pos+len=" << (requested_pos+len) << dendl; + + // step by period (object). _don't_ do a single big filer.read() + // here because it will wait for all object reads to complete before + // giving us back any data. this way we can process whatever bits + // come in that are contiguous. 
+ uint64_t period = get_layout_period(); + while (len > 0) { + uint64_t e = requested_pos + period; + e -= e % period; + uint64_t l = e - requested_pos; + if (l > len) + l = len; + C_Read *c = new C_Read(this, requested_pos, l); + filer.read(ino, &layout, CEPH_NOSNAP, requested_pos, l, &c->bl, 0, + wrap_finisher(c), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + requested_pos += l; + len -= l; + } +} + +void Journaler::_prefetch() +{ + if (is_stopping()) + return; + + ldout(cct, 10) << "_prefetch" << dendl; + // prefetch + uint64_t pf; + if (temp_fetch_len) { + ldout(cct, 10) << "_prefetch temp_fetch_len " << temp_fetch_len << dendl; + pf = temp_fetch_len; + temp_fetch_len = 0; + } else { + pf = fetch_len; + } + + uint64_t raw_target = read_pos + pf; + + // read full log segments, so increase if necessary + uint64_t period = get_layout_period(); + uint64_t remainder = raw_target % period; + uint64_t adjustment = remainder ? period - remainder : 0; + uint64_t target = raw_target + adjustment; + + // don't read past the log tail + if (target > write_pos) + target = write_pos; + + if (requested_pos < target) { + uint64_t len = target - requested_pos; + ldout(cct, 10) << "_prefetch " << pf << " requested_pos " << requested_pos + << " < target " << target << " (" << raw_target + << "), prefetching " << len << dendl; + + if (pending_safe.empty() && write_pos > safe_pos) { + // If we are reading and writing the journal, then we may need + // to issue a flush if one isn't already in progress. + // Avoid doing a flush every time so that if we do write/read/write/read + // we don't end up flushing after every write. + ldout(cct, 10) << "_prefetch: requested_pos=" << requested_pos + << ", read_pos=" << read_pos + << ", write_pos=" << write_pos + << ", safe_pos=" << safe_pos << dendl; + _do_flush(); + } + + _issue_read(len); + } +} + + +/* + * _is_readable() - return true if next entry is ready. + */ +bool Journaler::_is_readable() +{ + // anything to read? 
+ if (read_pos == write_pos) + return false; + + // Check if the retrieve bytestream has enough for an entry + uint64_t need; + if (journal_stream.readable(read_buf, &need)) { + return true; + } + + ldout (cct, 10) << "_is_readable read_buf.length() == " << read_buf.length() + << ", but need " << need << " for next entry; fetch_len is " + << fetch_len << dendl; + + // partial fragment at the end? + if (received_pos == write_pos) { + ldout(cct, 10) << "is_readable() detected partial entry at tail, " + "adjusting write_pos to " << read_pos << dendl; + + // adjust write_pos + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = read_pos; + ceph_assert(write_buf.length() == 0); + ceph_assert(waitfor_safe.empty()); + + // reset read state + requested_pos = received_pos = read_pos; + read_buf.clear(); + + // FIXME: truncate on disk? + + return false; + } + + if (need > fetch_len) { + temp_fetch_len = need; + ldout(cct, 10) << "_is_readable noting temp_fetch_len " << temp_fetch_len + << dendl; + } + + ldout(cct, 10) << "_is_readable: not readable, returning false" << dendl; + return false; +} + +/* + * is_readable() - kickstart prefetch, too + */ +bool Journaler::is_readable() +{ + lock_guard l(lock); + + if (error != 0) { + return false; + } + + bool r = readable; + _prefetch(); + return r; +} + +class Journaler::C_EraseFinish : public Context { + Journaler *journaler; + C_OnFinisher *completion; + public: + C_EraseFinish(Journaler *j, C_OnFinisher *c) : journaler(j), completion(c) {} + void finish(int r) override { + journaler->_finish_erase(r, completion); + } +}; + +/** + * Entirely erase the journal, including header. For use when you + * have already made a copy of the journal somewhere else. 
+ */ +void Journaler::erase(Context *completion) +{ + lock_guard l(lock); + + // Async delete the journal data + uint64_t first = trimmed_pos / get_layout_period(); + uint64_t num = (write_pos - trimmed_pos) / get_layout_period() + 2; + filer.purge_range(ino, &layout, SnapContext(), first, num, + ceph::real_clock::now(), 0, + wrap_finisher(new C_EraseFinish( + this, wrap_finisher(completion)))); + + // We will not start the operation to delete the header until + // _finish_erase has seen the data deletion succeed: otherwise if + // there was an error deleting data we might prematurely delete the + // header thereby lose our reference to the data. +} + +void Journaler::_finish_erase(int data_result, C_OnFinisher *completion) +{ + lock_guard l(lock); + if (is_stopping()) { + completion->complete(-EAGAIN); + return; + } + + if (data_result == 0) { + // Async delete the journal header + filer.purge_range(ino, &layout, SnapContext(), 0, 1, + ceph::real_clock::now(), + 0, wrap_finisher(completion)); + } else { + lderr(cct) << "Failed to delete journal " << ino << " data: " + << cpp_strerror(data_result) << dendl; + completion->complete(data_result); + } +} + +/* try_read_entry(bl) + * read entry into bl if it's ready. + * otherwise, do nothing. 
+ */ +bool Journaler::try_read_entry(bufferlist& bl) +{ + lock_guard l(lock); + + if (!readable) { + ldout(cct, 10) << "try_read_entry at " << read_pos << " not readable" + << dendl; + return false; + } + + uint64_t start_ptr; + size_t consumed; + try { + consumed = journal_stream.read(read_buf, &bl, &start_ptr); + if (stream_format >= JOURNAL_FORMAT_RESILIENT) { + ceph_assert(start_ptr == read_pos); + } + } catch (const buffer::error &e) { + lderr(cct) << __func__ << ": decode error from journal_stream" << dendl; + error = -EINVAL; + return false; + } + + ldout(cct, 10) << "try_read_entry at " << read_pos << " read " + << read_pos << "~" << consumed << " (have " + << read_buf.length() << ")" << dendl; + + read_pos += consumed; + try { + // We were readable, we might not be any more + readable = _is_readable(); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << ": decode error from _is_readable" << dendl; + error = -EINVAL; + return false; + } + + // prefetch? + _prefetch(); + + // If bufferlist consists of discontiguous memory, decoding types whose + // denc_traits needs contiguous memory is inefficient. 
The bufferlist may + // get copied to temporary memory multiple times (copy_shallow() in + // src/include/denc.h actually does deep copy) + if (bl.get_num_buffers() > 1) + bl.rebuild(); + return true; +} + +void Journaler::wait_for_readable(Context *onreadable) +{ + lock_guard l(lock); + if (is_stopping()) { + finisher->queue(onreadable, -EAGAIN); + return; + } + + ceph_assert(on_readable == 0); + if (!readable) { + ldout(cct, 10) << "wait_for_readable at " << read_pos << " onreadable " + << onreadable << dendl; + on_readable = wrap_finisher(onreadable); + } else { + // race with OSD reply + finisher->queue(onreadable, 0); + } +} + +bool Journaler::have_waiter() const +{ + return on_readable != nullptr; +} + + + + +/***************** TRIMMING *******************/ + + +class Journaler::C_Trim : public Context { + Journaler *ls; + uint64_t to; +public: + C_Trim(Journaler *l, int64_t t) : ls(l), to(t) {} + void finish(int r) override { + ls->_finish_trim(r, to); + } +}; + +void Journaler::trim() +{ + lock_guard l(lock); + _trim(); +} + +void Journaler::_trim() +{ + if (is_stopping()) + return; + + ceph_assert(!readonly); + uint64_t period = get_layout_period(); + uint64_t trim_to = last_committed.expire_pos; + trim_to -= trim_to % period; + ldout(cct, 10) << "trim last_commited head was " << last_committed + << ", can trim to " << trim_to + << dendl; + if (trim_to == 0 || trim_to == trimming_pos) { + ldout(cct, 10) << "trim already trimmed/trimming to " + << trimmed_pos << "/" << trimming_pos << dendl; + return; + } + + if (trimming_pos > trimmed_pos) { + ldout(cct, 10) << "trim already trimming atm, try again later. 
" + "trimmed/trimming is " << trimmed_pos << "/" << trimming_pos << dendl; + return; + } + + // trim + ceph_assert(trim_to <= write_pos); + ceph_assert(trim_to <= expire_pos); + ceph_assert(trim_to > trimming_pos); + ldout(cct, 10) << "trim trimming to " << trim_to + << ", trimmed/trimming/expire are " + << trimmed_pos << "/" << trimming_pos << "/" << expire_pos + << dendl; + + // delete range of objects + uint64_t first = trimming_pos / period; + uint64_t num = (trim_to - trimming_pos) / period; + SnapContext snapc; + filer.purge_range(ino, &layout, snapc, first, num, + ceph::real_clock::now(), 0, + wrap_finisher(new C_Trim(this, trim_to))); + trimming_pos = trim_to; +} + +void Journaler::_finish_trim(int r, uint64_t to) +{ + lock_guard l(lock); + + ceph_assert(!readonly); + ldout(cct, 10) << "_finish_trim trimmed_pos was " << trimmed_pos + << ", trimmed/trimming/expire now " + << to << "/" << trimming_pos << "/" << expire_pos + << dendl; + if (r < 0 && r != -ENOENT) { + lderr(cct) << "_finish_trim got " << cpp_strerror(r) << dendl; + handle_write_error(r); + return; + } + + ceph_assert(r >= 0 || r == -ENOENT); + + ceph_assert(to <= trimming_pos); + ceph_assert(to > trimmed_pos); + trimmed_pos = to; +} + +void Journaler::handle_write_error(int r) +{ + // lock is locked + + lderr(cct) << "handle_write_error " << cpp_strerror(r) << dendl; + if (on_write_error) { + on_write_error->complete(r); + on_write_error = NULL; + called_write_error = true; + } else if (called_write_error) { + /* We don't call error handler more than once, subsequent errors + * are dropped -- this is okay as long as the error handler does + * something dramatic like respawn */ + lderr(cct) << __func__ << ": multiple write errors, handler already called" + << dendl; + } else { + ceph_abort_msg("unhandled write error"); + } +} + + +/** + * Test whether the 'read_buf' byte stream has enough data to read + * an entry + * + * sets 'next_envelope_size' to the number of bytes needed to advance (enough 
+ * to get the next header if header was unavailable, or enough to get the whole + * next entry if the header was available but the body wasn't). + */ +bool JournalStream::readable(bufferlist &read_buf, uint64_t *need) const +{ + ceph_assert(need != NULL); + + uint32_t entry_size = 0; + uint64_t entry_sentinel = 0; + auto p = read_buf.cbegin(); + + // Do we have enough data to decode an entry prefix? + if (format >= JOURNAL_FORMAT_RESILIENT) { + *need = sizeof(entry_size) + sizeof(entry_sentinel); + } else { + *need = sizeof(entry_size); + } + if (read_buf.length() >= *need) { + if (format >= JOURNAL_FORMAT_RESILIENT) { + decode(entry_sentinel, p); + if (entry_sentinel != sentinel) { + throw buffer::malformed_input("Invalid sentinel"); + } + } + + decode(entry_size, p); + } else { + return false; + } + + // Do we have enough data to decode an entry prefix, payload and suffix? + if (format >= JOURNAL_FORMAT_RESILIENT) { + *need = JOURNAL_ENVELOPE_RESILIENT + entry_size; + } else { + *need = JOURNAL_ENVELOPE_LEGACY + entry_size; + } + if (read_buf.length() >= *need) { + return true; // No more bytes needed + } + + return false; +} + + +/** + * Consume one entry from a journal byte stream 'from', splicing a + * serialized LogEvent blob into 'entry'. + * + * 'entry' must be non null and point to an empty bufferlist. + * + * 'from' must contain sufficient valid data (i.e. readable is true). + * + * 'start_ptr' will be set to the entry's start pointer, if the collection + * format provides it. It may not be null. + * + * @returns The number of bytes consumed from the `from` byte stream. Note + * that this is not equal to the length of `entry`, which contains + * the inner serialized LogEvent and not the envelope. 
+ */ +size_t JournalStream::read(bufferlist &from, bufferlist *entry, + uint64_t *start_ptr) +{ + ceph_assert(start_ptr != NULL); + ceph_assert(entry != NULL); + ceph_assert(entry->length() == 0); + + uint32_t entry_size = 0; + + // Consume envelope prefix: entry_size and entry_sentinel + auto from_ptr = from.cbegin(); + if (format >= JOURNAL_FORMAT_RESILIENT) { + uint64_t entry_sentinel = 0; + decode(entry_sentinel, from_ptr); + // Assertion instead of clean check because of precondition of this + // fn is that readable() already passed + ceph_assert(entry_sentinel == sentinel); + } + decode(entry_size, from_ptr); + + // Read out the payload + from_ptr.copy(entry_size, *entry); + + // Consume the envelope suffix (start_ptr) + if (format >= JOURNAL_FORMAT_RESILIENT) { + decode(*start_ptr, from_ptr); + } else { + *start_ptr = 0; + } + + // Trim the input buffer to discard the bytes we have consumed + from.splice(0, from_ptr.get_off()); + + return from_ptr.get_off(); +} + + +/** + * Append one entry + */ +size_t JournalStream::write(bufferlist &entry, bufferlist *to, + uint64_t const &start_ptr) +{ + ceph_assert(to != NULL); + + uint32_t const entry_size = entry.length(); + if (format >= JOURNAL_FORMAT_RESILIENT) { + encode(sentinel, *to); + } + encode(entry_size, *to); + to->claim_append(entry); + if (format >= JOURNAL_FORMAT_RESILIENT) { + encode(start_ptr, *to); + } + + if (format >= JOURNAL_FORMAT_RESILIENT) { + return JOURNAL_ENVELOPE_RESILIENT + entry_size; + } else { + return JOURNAL_ENVELOPE_LEGACY + entry_size; + } +} + +/** + * set write error callback + * + * Set a callback/context to trigger if we get a write error from + * the objecter. This may be from an explicit request (e.g., flush) + * or something async the journaler did on its own (e.g., journal + * header update). + * + * It is only used once; if the caller continues to use the + * Journaler and wants to hear about errors, it needs to reset the + * error_handler. 
+ * + * @param c callback/context to trigger on error + */ +void Journaler::set_write_error_handler(Context *c) { + lock_guard l(lock); + ceph_assert(!on_write_error); + on_write_error = wrap_finisher(c); + called_write_error = false; +} + + +/** + * Wrap a context in a C_OnFinisher, if it is non-NULL + * + * Utility function to avoid lots of error-prone and verbose + * NULL checking on contexts passed in. + */ +C_OnFinisher *Journaler::wrap_finisher(Context *c) +{ + if (c != NULL) { + return new C_OnFinisher(c, finisher); + } else { + return NULL; + } +} + +void Journaler::shutdown() +{ + lock_guard l(lock); + + ldout(cct, 1) << __func__ << dendl; + + state = STATE_STOPPING; + readable = false; + + // Kick out anyone reading from journal + error = -EAGAIN; + if (on_readable) { + C_OnFinisher *f = on_readable; + on_readable = 0; + f->complete(-EAGAIN); + } + + list<Context*> ls; + ls.swap(waitfor_recover); + finish_contexts(cct, ls, -ESHUTDOWN); + + std::map<uint64_t, std::list<Context*> >::iterator i; + for (i = waitfor_safe.begin(); i != waitfor_safe.end(); ++i) { + finish_contexts(cct, i->second, -EAGAIN); + } + waitfor_safe.clear(); +} + diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h new file mode 100644 index 00000000..e3cd9e6c --- /dev/null +++ b/src/osdc/Journaler.h @@ -0,0 +1,540 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +/* Journaler + * + * This class stripes a serial log over objects on the store. 
 * Four
 * logical pointers:
 *
 *  write_pos - where we're writing new entries
 *  unused_field - where we're reading old entries
 *  expire_pos - what is deemed "old" by user
 *  trimmed_pos - where we're expiring old items
 *
 *  trimmed_pos <= expire_pos <= unused_field <= write_pos.
 *
 * Often, unused_field <= write_pos (as with MDS log).  During
 * recovery, write_pos is undefined until the end of the log is
 * discovered.
 *
 * A "head" struct at the beginning of the log is used to store
 * metadata at regular intervals.  The basic invariants include:
 *
 *   head.unused_field <= unused_field -- the head may "lag", since
 *                                        it's updated lazily.
 *   head.write_pos  <= write_pos
 *   head.expire_pos <= expire_pos
 *   head.trimmed_pos <= trimmed_pos
 *
 * More significantly,
 *
 *   head.expire_pos >= trimmed_pos -- this ensures we can find the
 *                                     "beginning" of the log as last
 *                                     recorded, before it is trimmed.
 *                                     trimming will block until a
 *                                     sufficiently current expire_pos
 *                                     is committed.
 *
 * To recover log state, we simply start at the last write_pos in the
 * head, and probe the object sequence sizes until we read the end.
 *
 * Head struct is stored in the first object.  Actual journal starts
 * after layout.period() bytes.
+ * + */ + +#ifndef CEPH_JOURNALER_H +#define CEPH_JOURNALER_H + +#include <list> +#include <map> + +#include "Objecter.h" +#include "Filer.h" + +#include "common/Timer.h" +#include "common/Throttle.h" + +class CephContext; +class Context; +class PerfCounters; +class Finisher; +class C_OnFinisher; + +typedef __u8 stream_format_t; + +// Legacy envelope is leading uint32_t size +enum StreamFormat { + JOURNAL_FORMAT_LEGACY = 0, + JOURNAL_FORMAT_RESILIENT = 1, + // Insert new formats here, before COUNT + JOURNAL_FORMAT_COUNT +}; + +// Highest journal format version that we support +#define JOURNAL_FORMAT_MAX (JOURNAL_FORMAT_COUNT - 1) + +// Legacy envelope is leading uint32_t size +#define JOURNAL_ENVELOPE_LEGACY (sizeof(uint32_t)) + +// Resilient envelope is leading uint64_t sentinel, uint32_t size, +// trailing uint64_t start_ptr +#define JOURNAL_ENVELOPE_RESILIENT (sizeof(uint32_t) + sizeof(uint64_t) + \ + sizeof(uint64_t)) + +/** + * Represents a collection of entries serialized in a byte stream. + * + * Each entry consists of: + * - a blob (used by the next level up as a serialized LogEvent) + * - a uint64_t (used by the next level up as a pointer to the start + * of the entry in the collection bytestream) + */ +class JournalStream +{ + stream_format_t format; + + public: + JournalStream(stream_format_t format_) : format(format_) {} + + void set_format(stream_format_t format_) {format = format_;} + + bool readable(bufferlist &bl, uint64_t *need) const; + size_t read(bufferlist &from, bufferlist *to, uint64_t *start_ptr); + size_t write(bufferlist &entry, bufferlist *to, uint64_t const &start_ptr); + size_t get_envelope_size() const { + if (format >= JOURNAL_FORMAT_RESILIENT) { + return JOURNAL_ENVELOPE_RESILIENT; + } else { + return JOURNAL_ENVELOPE_LEGACY; + } + } + + // A magic number for the start of journal entries, so that we can + // identify them in damaged journals. 
+ static const uint64_t sentinel = 0x3141592653589793; +}; + + +class Journaler { +public: + // this goes at the head of the log "file". + class Header { + public: + uint64_t trimmed_pos; + uint64_t expire_pos; + uint64_t unused_field; + uint64_t write_pos; + string magic; + file_layout_t layout; //< The mapping from byte stream offsets + // to RADOS objects + stream_format_t stream_format; //< The encoding of LogEvents + // within the journal byte stream + + Header(const char *m="") : + trimmed_pos(0), expire_pos(0), unused_field(0), write_pos(0), magic(m), + stream_format(-1) { + } + + void encode(bufferlist &bl) const { + ENCODE_START(2, 2, bl); + encode(magic, bl); + encode(trimmed_pos, bl); + encode(expire_pos, bl); + encode(unused_field, bl); + encode(write_pos, bl); + encode(layout, bl, 0); // encode in legacy format + encode(stream_format, bl); + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator &bl) { + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + decode(magic, bl); + decode(trimmed_pos, bl); + decode(expire_pos, bl); + decode(unused_field, bl); + decode(write_pos, bl); + decode(layout, bl); + if (struct_v > 1) { + decode(stream_format, bl); + } else { + stream_format = JOURNAL_FORMAT_LEGACY; + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->open_object_section("journal_header"); + { + f->dump_string("magic", magic); + f->dump_unsigned("write_pos", write_pos); + f->dump_unsigned("expire_pos", expire_pos); + f->dump_unsigned("trimmed_pos", trimmed_pos); + f->dump_unsigned("stream_format", stream_format); + f->dump_object("layout", layout); + } + f->close_section(); // journal_header + } + + static void generate_test_instances(list<Header*> &ls) + { + ls.push_back(new Header()); + + ls.push_back(new Header()); + ls.back()->trimmed_pos = 1; + ls.back()->expire_pos = 2; + ls.back()->unused_field = 3; + ls.back()->write_pos = 4; + ls.back()->magic = "magique"; + + ls.push_back(new Header()); + ls.back()->stream_format = 
JOURNAL_FORMAT_RESILIENT; + } + }; + WRITE_CLASS_ENCODER(Header) + + uint32_t get_stream_format() const { + return stream_format; + } + + Header last_committed; + +private: + // me + CephContext *cct; + std::mutex lock; + const std::string name; + typedef std::lock_guard<std::mutex> lock_guard; + typedef std::unique_lock<std::mutex> unique_lock; + Finisher *finisher; + Header last_written; + inodeno_t ino; + int64_t pg_pool; + bool readonly; + file_layout_t layout; + uint32_t stream_format; + JournalStream journal_stream; + + const char *magic; + Objecter *objecter; + Filer filer; + + PerfCounters *logger; + int logger_key_lat; + + class C_DelayFlush; + C_DelayFlush *delay_flush_event; + /* + * Do a flush as a result of a C_DelayFlush context. + */ + void _do_delayed_flush() + { + ceph_assert(delay_flush_event != NULL); + lock_guard l(lock); + delay_flush_event = NULL; + _do_flush(); + } + + // my state + static const int STATE_UNDEF = 0; + static const int STATE_READHEAD = 1; + static const int STATE_PROBING = 2; + static const int STATE_ACTIVE = 3; + static const int STATE_REREADHEAD = 4; + static const int STATE_REPROBING = 5; + static const int STATE_STOPPING = 6; + + int state; + int error; + + void _write_head(Context *oncommit=NULL); + void _wait_for_flush(Context *onsafe); + void _trim(); + + // header + ceph::real_time last_wrote_head; + void _finish_write_head(int r, Header &wrote, C_OnFinisher *oncommit); + class C_WriteHead; + friend class C_WriteHead; + + void _reread_head(Context *onfinish); + void _set_layout(file_layout_t const *l); + list<Context*> waitfor_recover; + void _read_head(Context *on_finish, bufferlist *bl); + void _finish_read_head(int r, bufferlist& bl); + void _finish_reread_head(int r, bufferlist& bl, Context *finish); + void _probe(Context *finish, uint64_t *end); + void _finish_probe_end(int r, uint64_t end); + void _reprobe(C_OnFinisher *onfinish); + void _finish_reprobe(int r, uint64_t end, C_OnFinisher *onfinish); + void 
_finish_reread_head_and_probe(int r, C_OnFinisher *onfinish); + class C_ReadHead; + friend class C_ReadHead; + class C_ProbeEnd; + friend class C_ProbeEnd; + class C_RereadHead; + friend class C_RereadHead; + class C_ReProbe; + friend class C_ReProbe; + class C_RereadHeadProbe; + friend class C_RereadHeadProbe; + + // writer + uint64_t prezeroing_pos; + uint64_t prezero_pos; ///< we zero journal space ahead of write_pos to + // avoid problems with tail probing + uint64_t write_pos; ///< logical write position, where next entry + // will go + uint64_t flush_pos; ///< where we will flush. if + /// write_pos>flush_pos, we're buffering writes. + uint64_t safe_pos; ///< what has been committed safely to disk. + + uint64_t next_safe_pos; /// start position of the first entry that isn't + /// being fully flushed. If we don't flush any + // partial entry, it's equal to flush_pos. + + bufferlist write_buf; ///< write buffer. flush_pos + + /// write_buf.length() == write_pos. + + // protect write_buf from bufferlist _len overflow + Throttle write_buf_throttle; + + uint64_t waiting_for_zero_pos; + interval_set<uint64_t> pending_zero; // non-contig bits we've zeroed + list<Context*> waitfor_prezero; + + std::map<uint64_t, uint64_t> pending_safe; // flush_pos -> safe_pos + // when safe through given offset + std::map<uint64_t, std::list<Context*> > waitfor_safe; + + void _flush(C_OnFinisher *onsafe); + void _do_flush(unsigned amount=0); + void _finish_flush(int r, uint64_t start, ceph::real_time stamp); + class C_Flush; + friend class C_Flush; + + // reader + uint64_t read_pos; // logical read position, where next entry starts. + uint64_t requested_pos; // what we've requested from OSD. + uint64_t received_pos; // what we've received from OSD. + // read buffer. unused_field + read_buf.length() == prefetch_pos. 
+ bufferlist read_buf; + + map<uint64_t,bufferlist> prefetch_buf; + + uint64_t fetch_len; // how much to read at a time + uint64_t temp_fetch_len; + + // for wait_for_readable() + C_OnFinisher *on_readable; + C_OnFinisher *on_write_error; + bool called_write_error; + + // read completion callback + void _finish_read(int r, uint64_t offset, uint64_t length, bufferlist &bl); + void _finish_retry_read(int r); + void _assimilate_prefetch(); + void _issue_read(uint64_t len); // read some more + void _prefetch(); // maybe read ahead + class C_Read; + friend class C_Read; + class C_RetryRead; + friend class C_RetryRead; + + // trimmer + uint64_t expire_pos; // what we're allowed to trim to + uint64_t trimming_pos; // what we've requested to trim through + uint64_t trimmed_pos; // what has been trimmed + + bool readable; + + void _finish_trim(int r, uint64_t to); + class C_Trim; + friend class C_Trim; + + void _issue_prezero(); + void _finish_prezero(int r, uint64_t from, uint64_t len); + friend struct C_Journaler_Prezero; + + // only init_headers when following or first reading off-disk + void init_headers(Header& h) { + ceph_assert(readonly || + state == STATE_READHEAD || + state == STATE_REREADHEAD); + last_written = last_committed = h; + } + + /** + * handle a write error + * + * called when we get an objecter error on a write. 
+ * + * @param r error code + */ + void handle_write_error(int r); + + bool _is_readable(); + + void _finish_erase(int data_result, C_OnFinisher *completion); + class C_EraseFinish; + friend class C_EraseFinish; + + C_OnFinisher *wrap_finisher(Context *c); + + uint32_t write_iohint; // the fadvise flags for write op, see + // CEPH_OSD_OP_FADIVSE_* + +public: + Journaler(const std::string &name_, inodeno_t ino_, int64_t pool, + const char *mag, Objecter *obj, PerfCounters *l, int lkey, Finisher *f) : + last_committed(mag), + cct(obj->cct), name(name_), finisher(f), last_written(mag), + ino(ino_), pg_pool(pool), readonly(true), + stream_format(-1), journal_stream(-1), + magic(mag), + objecter(obj), filer(objecter, f), logger(l), logger_key_lat(lkey), + delay_flush_event(0), + state(STATE_UNDEF), error(0), + prezeroing_pos(0), prezero_pos(0), write_pos(0), flush_pos(0), + safe_pos(0), next_safe_pos(0), + write_buf_throttle(cct, "write_buf_throttle", UINT_MAX - (UINT_MAX >> 3)), + waiting_for_zero_pos(0), + read_pos(0), requested_pos(0), received_pos(0), + fetch_len(0), temp_fetch_len(0), + on_readable(0), on_write_error(NULL), called_write_error(false), + expire_pos(0), trimming_pos(0), trimmed_pos(0), readable(false), + write_iohint(0) + { + } + + /* reset + * + * NOTE: we assume the caller knows/has ensured that any objects in + * our sequence do not exist.. e.g. after a MKFS. this is _not_ an + * "erase" method. 
+ */ + void reset() { + lock_guard l(lock); + ceph_assert(state == STATE_ACTIVE); + + readonly = true; + delay_flush_event = NULL; + state = STATE_UNDEF; + error = 0; + prezeroing_pos = 0; + prezero_pos = 0; + write_pos = 0; + flush_pos = 0; + safe_pos = 0; + next_safe_pos = 0; + read_pos = 0; + requested_pos = 0; + received_pos = 0; + fetch_len = 0; + ceph_assert(!on_readable); + expire_pos = 0; + trimming_pos = 0; + trimmed_pos = 0; + waiting_for_zero_pos = 0; + } + + // Asynchronous operations + // ======================= + void erase(Context *completion); + void create(file_layout_t *layout, stream_format_t const sf); + void recover(Context *onfinish); + void reread_head(Context *onfinish); + void reread_head_and_probe(Context *onfinish); + void write_head(Context *onsave=0); + void wait_for_flush(Context *onsafe = 0); + void flush(Context *onsafe = 0); + void wait_for_readable(Context *onfinish); + bool have_waiter() const; + void wait_for_prezero(Context *onfinish); + + // Synchronous setters + // =================== + void set_layout(file_layout_t const *l); + void set_readonly(); + void set_writeable(); + void set_write_pos(uint64_t p) { + lock_guard l(lock); + prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = p; + } + void set_read_pos(uint64_t p) { + lock_guard l(lock); + // we can't cope w/ in-progress read right now. 
+ ceph_assert(requested_pos == received_pos); + read_pos = requested_pos = received_pos = p; + read_buf.clear(); + } + uint64_t append_entry(bufferlist& bl); + void set_expire_pos(uint64_t ep) { + lock_guard l(lock); + expire_pos = ep; + } + void set_trimmed_pos(uint64_t p) { + lock_guard l(lock); + trimming_pos = trimmed_pos = p; + } + + bool _write_head_needed(); + bool write_head_needed() { + lock_guard l(lock); + return _write_head_needed(); + } + + + void trim(); + void trim_tail() { + lock_guard l(lock); + + ceph_assert(!readonly); + _issue_prezero(); + } + + void set_write_error_handler(Context *c); + + void set_write_iohint(uint32_t iohint_flags) { + write_iohint = iohint_flags; + } + /** + * Cause any ongoing waits to error out with -EAGAIN, set error + * to -EAGAIN. + */ + void shutdown(); +public: + + // Synchronous getters + // =================== + // TODO: need some locks on reads for true safety + uint64_t get_layout_period() const { + return layout.get_period(); + } + file_layout_t& get_layout() { return layout; } + bool is_active() { return state == STATE_ACTIVE; } + bool is_stopping() { return state == STATE_STOPPING; } + int get_error() { return error; } + bool is_readonly() { return readonly; } + bool is_readable(); + bool try_read_entry(bufferlist& bl); + uint64_t get_write_pos() const { return write_pos; } + uint64_t get_write_safe_pos() const { return safe_pos; } + uint64_t get_read_pos() const { return read_pos; } + uint64_t get_expire_pos() const { return expire_pos; } + uint64_t get_trimmed_pos() const { return trimmed_pos; } +}; +WRITE_CLASS_ENCODER(Journaler::Header) + +#endif diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc new file mode 100644 index 00000000..c326a02a --- /dev/null +++ b/src/osdc/ObjectCacher.cc @@ -0,0 +1,2800 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <limits.h> + +#include "msg/Messenger.h" +#include "ObjectCacher.h" +#include 
"WritebackHandler.h" +#include "common/errno.h" +#include "common/perf_counters.h" + +#include "include/ceph_assert.h" + +#define MAX_FLUSH_UNDER_LOCK 20 ///< max bh's we start writeback on +#define BUFFER_MEMORY_WEIGHT CEPH_PAGE_SHIFT // memory usage of BufferHead, count in (1<<n) + +using std::chrono::seconds; + /// while holding the lock + +/*** ObjectCacher::BufferHead ***/ + + +/*** ObjectCacher::Object ***/ + +#define dout_subsys ceph_subsys_objectcacher +#undef dout_prefix +#define dout_prefix *_dout << "objectcacher.object(" << oid << ") " + + + +class ObjectCacher::C_ReadFinish : public Context { + ObjectCacher *oc; + int64_t poolid; + sobject_t oid; + loff_t start; + uint64_t length; + xlist<C_ReadFinish*>::item set_item; + bool trust_enoent; + ceph_tid_t tid; + ZTracer::Trace trace; + +public: + bufferlist bl; + C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s, + uint64_t l, const ZTracer::Trace &trace) : + oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l), + set_item(this), trust_enoent(true), + tid(t), trace(trace) { + ob->reads.push_back(&set_item); + } + + void finish(int r) override { + oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent); + trace.event("finish"); + + // object destructor clears the list + if (set_item.is_on_list()) + set_item.remove_myself(); + } + + void distrust_enoent() { + trust_enoent = false; + } +}; + +class ObjectCacher::C_RetryRead : public Context { + ObjectCacher *oc; + OSDRead *rd; + ObjectSet *oset; + Context *onfinish; + ZTracer::Trace trace; +public: + C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c, + const ZTracer::Trace &trace) + : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) { + } + void finish(int r) override { + if (r >= 0) { + r = oc->_readx(rd, oset, onfinish, false, &trace); + } + + if (r == 0) { + // read is still in-progress + return; + } + + trace.event("finish"); + if (onfinish) { + onfinish->complete(r); + } + } +}; + 
// Split 'left' at absolute offset 'off', producing and returning a new
// BufferHead covering [off, left->end()).  State, tids, snap context and
// buffered data are divided between the two pieces, and any read waiters
// registered at or past 'off' are migrated to the right piece.
ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left,
						      loff_t off)
{
  ceph_assert(oc->lock.is_locked());
  ldout(oc->cct, 20) << "split " << *left << " at " << off << dendl;

  // split off right
  ObjectCacher::BufferHead *right = new BufferHead(this);

  //inherit and if later access, this auto clean.
  right->set_dontneed(left->get_dontneed());
  right->set_nocache(left->get_nocache());

  right->last_write_tid = left->last_write_tid;
  right->last_read_tid = left->last_read_tid;
  right->set_state(left->get_state());
  right->snapc = left->snapc;
  right->set_journal_tid(left->journal_tid);

  loff_t newleftlen = off - left->start();
  right->set_start(off);
  right->set_length(left->length() - newleftlen);

  // shorten left (stats are adjusted around the length change)
  oc->bh_stat_sub(left);
  left->set_length(newleftlen);
  oc->bh_stat_add(left);

  // add right
  oc->bh_add(this, right);

  // split buffers too; bl may be empty (e.g. a zero/missing bh)
  bufferlist bl;
  bl.claim(left->bl);
  if (bl.length()) {
    ceph_assert(bl.length() == (left->length() + right->length()));
    right->bl.substr_of(bl, left->length(), right->length());
    left->bl.substr_of(bl, 0, left->length());
  }

  // move read waiters whose offset now falls inside the right piece
  if (!left->waitfor_read.empty()) {
    map<loff_t, list<Context*> >::iterator start_remove
      = left->waitfor_read.begin();
    while (start_remove != left->waitfor_read.end() &&
	   start_remove->first < right->start())
      ++start_remove;
    for (map<loff_t, list<Context*> >::iterator p = start_remove;
	 p != left->waitfor_read.end(); ++p) {
      ldout(oc->cct, 20) << "split moving waiters at byte " << p->first
			 << " to right bh" << dendl;
      right->waitfor_read[p->first].swap( p->second );
      ceph_assert(p->second.empty());
    }
    left->waitfor_read.erase(start_remove, left->waitfor_read.end());
  }

  ldout(oc->cct, 20) << "split left is " << *left << dendl;
  ldout(oc->cct, 20) << "split right is " << *right << dendl;
  return right;
}


// Absorb 'right' into its immediate left neighbor 'left': lengths,
// buffered data, write tids and read waiters are merged, then 'right'
// is removed from the cache and deleted.
void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
{
  ceph_assert(oc->lock.is_locked());

  ldout(oc->cct, 10) << "merge_left " << *left << " + " << *right << dendl;
  // keep whichever journal tid is set; clear right's so the removal
  // below does not trigger journal bookkeeping for it
  if (left->get_journal_tid() == 0) {
    left->set_journal_tid(right->get_journal_tid());
  }
  right->set_journal_tid(0);

  oc->bh_remove(this, right);
  oc->bh_stat_sub(left);
  left->set_length(left->length() + right->length());
  oc->bh_stat_add(left);

  // data
  left->bl.claim_append(right->bl);

  // version
  // note: this is sorta busted, but should only be used for dirty buffers
  left->last_write_tid = std::max( left->last_write_tid, right->last_write_tid );
  left->last_write = std::max( left->last_write, right->last_write );

  // dontneed/nocache survive only if both halves had them set
  left->set_dontneed(right->get_dontneed() ? left->get_dontneed() : false);
  left->set_nocache(right->get_nocache() ? left->get_nocache() : false);

  // waiters
  for (map<loff_t, list<Context*> >::iterator p = right->waitfor_read.begin();
       p != right->waitfor_read.end();
       ++p)
    left->waitfor_read[p->first].splice(left->waitfor_read[p->first].begin(),
					p->second );

  // hose right
  delete right;

  ldout(oc->cct, 10) << "merge_left result " << *left << dendl;
}

// Two bufferheads may merge only if they are adjacent, share state and
// journal-compatibility, and (for in-flight writes) the same write tid.
bool ObjectCacher::Object::can_merge_bh(BufferHead *left, BufferHead *right)
{
  if (left->end() != right->start() ||
      left->get_state() != right->get_state() ||
      !left->can_merge_journal(right))
    return false;
  if (left->is_tx() && left->last_write_tid != right->last_write_tid)
    return false;
  return true;
}

// Opportunistically merge 'bh' with its left and/or right neighbor,
// then rebuild its buffer if merging left it fragmented.
void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
{
  ceph_assert(oc->lock.is_locked());
  ldout(oc->cct, 10) << "try_merge_bh " << *bh << dendl;

  // do not merge rx buffers; last_read_tid may not match
  if (bh->is_rx())
    return;

  // to the left?
  map<loff_t,BufferHead*>::iterator p = data.find(bh->start());
  ceph_assert(p->second == bh);
  if (p != data.begin()) {
    --p;
    if (can_merge_bh(p->second, bh)) {
      merge_left(p->second, bh);
      bh = p->second;
    } else {
      ++p;
    }
  }
  // to the right?
  ceph_assert(p->second == bh);
  ++p;
  if (p != data.end() && can_merge_bh(bh, p->second))
    merge_left(bh, p->second);

  maybe_rebuild_buffer(bh);
}

// Rebuild (defragment) bh's bufferlist when it is split across several
// buffers and the wasted space is both a majority of its length and
// larger than one cache page.
void ObjectCacher::Object::maybe_rebuild_buffer(BufferHead *bh)
{
  auto& bl = bh->bl;
  if (bl.get_num_buffers() <= 1)
    return;

  auto wasted = bl.get_wasted_space();
  if (wasted * 2 > bl.length() &&
      wasted > (1U << BUFFER_MEMORY_WEIGHT))
    bl.rebuild();
}

/*
 * count bytes we have cached in given range
 * (returns true only if [cur, cur+left) is fully covered by
 * bufferheads with no gaps)
 */
bool ObjectCacher::Object::is_cached(loff_t cur, loff_t left) const
{
  ceph_assert(oc->lock.is_locked());
  map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(cur);
  while (left > 0) {
    if (p == data.end())
      return false;

    if (p->first <= cur) {
      // have part of it
      loff_t lenfromcur = std::min(p->second->end() - cur, left);
      cur += lenfromcur;
      left -= lenfromcur;
      ++p;
      continue;
    } else if (p->first > cur) {
      // gap
      return false;
    } else
      ceph_abort();
  }

  return true;
}

/*
 * all cached data in this range[off, off+len]
 * (true if the object's entire cached extent fits within the range,
 * or trivially if nothing is cached)
 */
bool ObjectCacher::Object::include_all_cached_data(loff_t off, loff_t len)
{
  ceph_assert(oc->lock.is_locked());
  if (data.empty())
    return true;
  map<loff_t, BufferHead*>::iterator first = data.begin();
  map<loff_t, BufferHead*>::reverse_iterator last = data.rbegin();
  if (first->second->start() >= off && last->second->end() <= (off + len))
    return true;
  else
    return false;
}

/*
 * map a range of bytes into buffer_heads.
 * - create missing buffer_heads as necessary.
 */
// Partition the extent into hits (readable now), missing (need a read),
// rx (read already in flight) and errors, creating bufferheads for any
// uncovered byte ranges.  If the object is known 'complete', uncovered
// ranges are known-zero and count as hits.  Always returns 0.
int ObjectCacher::Object::map_read(ObjectExtent &ex,
                                   map<loff_t, BufferHead*>& hits,
                                   map<loff_t, BufferHead*>& missing,
                                   map<loff_t, BufferHead*>& rx,
				   map<loff_t, BufferHead*>& errors)
{
  ceph_assert(oc->lock.is_locked());
  ldout(oc->cct, 10) << "map_read " << ex.oid << " "
		     << ex.offset << "~" << ex.length << dendl;

  loff_t cur = ex.offset;
  loff_t left = ex.length;

  map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(ex.offset);
  while (left > 0) {
    // at end?
    if (p == data.end()) {
      // rest is a miss.
      BufferHead *n = new BufferHead(this);
      n->set_start(cur);
      n->set_length(left);
      oc->bh_add(this, n);
      if (complete) {
	// object is complete: anything not in cache must be zeros
	oc->mark_zero(n);
	hits[cur] = n;
	ldout(oc->cct, 20) << "map_read miss+complete+zero " << left << " left, " << *n << dendl;
      } else {
	missing[cur] = n;
	ldout(oc->cct, 20) << "map_read miss " << left << " left, " << *n << dendl;
      }
      cur += left;
      ceph_assert(cur == (loff_t)ex.offset + (loff_t)ex.length);
      break;  // no more.
    }

    if (p->first <= cur) {
      // have it (or part of it)
      BufferHead *e = p->second;

      if (e->is_clean() ||
	  e->is_dirty() ||
	  e->is_tx() ||
	  e->is_zero()) {
	hits[cur] = e;     // readable!
	ldout(oc->cct, 20) << "map_read hit " << *e << dendl;
      } else if (e->is_rx()) {
	rx[cur] = e;       // missing, not readable.
	ldout(oc->cct, 20) << "map_read rx " << *e << dendl;
      } else if (e->is_error()) {
	errors[cur] = e;
	ldout(oc->cct, 20) << "map_read error " << *e << dendl;
      } else {
	ceph_abort();
      }

      loff_t lenfromcur = std::min(e->end() - cur, left);
      cur += lenfromcur;
      left -= lenfromcur;
      ++p;
      continue;  // more?

    } else if (p->first > cur) {
      // gap.. miss
      loff_t next = p->first;
      BufferHead *n = new BufferHead(this);
      loff_t len = std::min(next - cur, left);
      n->set_start(cur);
      n->set_length(len);
      oc->bh_add(this,n);
      if (complete) {
	oc->mark_zero(n);
	hits[cur] = n;
	ldout(oc->cct, 20) << "map_read gap+complete+zero " << *n << dendl;
      } else {
	missing[cur] = n;
	ldout(oc->cct, 20) << "map_read gap " << *n << dendl;
      }
      cur += std::min(left, n->length());
      left -= std::min(left, n->length());
      continue;    // more?
    } else {
      ceph_abort();
    }
  }
  return 0;
}

// Debug-only consistency walk over this object's bufferhead map:
// asserts that map keys match bh starts, bhs do not overlap, and every
// read waiter lies within its bh's byte range.
void ObjectCacher::Object::audit_buffers()
{
  loff_t offset = 0;  // end of the previous bh; used to detect overlap
  for (map<loff_t, BufferHead*>::const_iterator it = data.begin();
       it != data.end(); ++it) {
    if (it->first != it->second->start()) {
      lderr(oc->cct) << "AUDIT FAILURE: map position " << it->first
		     << " does not match bh start position: "
		     << *it->second << dendl;
      ceph_assert(it->first == it->second->start());
    }
    if (it->first < offset) {
      lderr(oc->cct) << "AUDIT FAILURE: " << it->first << " " << *it->second
		     << " overlaps with previous bh " << *((--it)->second)
		     << dendl;
      ceph_assert(it->first >= offset);
    }
    BufferHead *bh = it->second;
    map<loff_t, list<Context*> >::const_iterator w_it;
    for (w_it = bh->waitfor_read.begin();
	 w_it != bh->waitfor_read.end(); ++w_it) {
      if (w_it->first < bh->start() ||
	  w_it->first >= bh->start() + bh->length()) {
	lderr(oc->cct) << "AUDIT FAILURE: waiter at " << w_it->first
		       << " is not within bh " << *bh << dendl;
	ceph_assert(w_it->first >= bh->start());
	ceph_assert(w_it->first < bh->start() + bh->length());
      }
    }
    offset = it->first + it->second->length();
  }
}

/*
 * map a range of extents on an object's buffer cache.
 * - combine any bh's we're writing into one
 * - break up bufferheads that don't fall completely within the range
 * //no! - return a bh that includes the write.  may also include
 * other dirty data to left and/or right.
+ */ +ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex, + ceph_tid_t tid) +{ + ceph_assert(oc->lock.is_locked()); + BufferHead *final = 0; + + ldout(oc->cct, 10) << "map_write oex " << ex.oid + << " " << ex.offset << "~" << ex.length << dendl; + + loff_t cur = ex.offset; + loff_t left = ex.length; + + map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(ex.offset); + while (left > 0) { + loff_t max = left; + + // at end ? + if (p == data.end()) { + if (final == NULL) { + final = new BufferHead(this); + replace_journal_tid(final, tid); + final->set_start( cur ); + final->set_length( max ); + oc->bh_add(this, final); + ldout(oc->cct, 10) << "map_write adding trailing bh " << *final << dendl; + } else { + oc->bh_stat_sub(final); + final->set_length(final->length() + max); + oc->bh_stat_add(final); + } + left -= max; + cur += max; + continue; + } + + ldout(oc->cct, 10) << "cur is " << cur << ", p is " << *p->second << dendl; + //oc->verify_stats(); + + if (p->first <= cur) { + BufferHead *bh = p->second; + ldout(oc->cct, 10) << "map_write bh " << *bh << " intersected" << dendl; + + if (p->first < cur) { + ceph_assert(final == 0); + if (cur + max >= bh->end()) { + // we want right bit (one splice) + final = split(bh, cur); // just split it, take right half. + maybe_rebuild_buffer(bh); + replace_journal_tid(final, tid); + ++p; + ceph_assert(p->second == final); + } else { + // we want middle bit (two splices) + final = split(bh, cur); + maybe_rebuild_buffer(bh); + ++p; + ceph_assert(p->second == final); + auto right = split(final, cur+max); + maybe_rebuild_buffer(right); + replace_journal_tid(final, tid); + } + } else { + ceph_assert(p->first == cur); + if (bh->length() <= max) { + // whole bufferhead, piece of cake. 
+ } else { + // we want left bit (one splice) + auto right = split(bh, cur + max); // just split + maybe_rebuild_buffer(right); + } + if (final) { + oc->mark_dirty(bh); + oc->mark_dirty(final); + --p; // move iterator back to final + ceph_assert(p->second == final); + replace_journal_tid(bh, tid); + merge_left(final, bh); + } else { + final = bh; + replace_journal_tid(final, tid); + } + } + + // keep going. + loff_t lenfromcur = final->end() - cur; + cur += lenfromcur; + left -= lenfromcur; + ++p; + continue; + } else { + // gap! + loff_t next = p->first; + loff_t glen = std::min(next - cur, max); + ldout(oc->cct, 10) << "map_write gap " << cur << "~" << glen << dendl; + if (final) { + oc->bh_stat_sub(final); + final->set_length(final->length() + glen); + oc->bh_stat_add(final); + } else { + final = new BufferHead(this); + replace_journal_tid(final, tid); + final->set_start( cur ); + final->set_length( glen ); + oc->bh_add(this, final); + } + + cur += glen; + left -= glen; + continue; // more? + } + } + + // set version + ceph_assert(final); + ceph_assert(final->get_journal_tid() == tid); + ldout(oc->cct, 10) << "map_write final is " << *final << dendl; + + return final; +} + +void ObjectCacher::Object::replace_journal_tid(BufferHead *bh, + ceph_tid_t tid) { + ceph_tid_t bh_tid = bh->get_journal_tid(); + + ceph_assert(tid == 0 || bh_tid <= tid); + if (bh_tid != 0 && bh_tid != tid) { + // inform journal that it should not expect a writeback from this extent + oc->writeback_handler.overwrite_extent(get_oid(), bh->start(), + bh->length(), bh_tid, tid); + } + bh->set_journal_tid(tid); +} + +void ObjectCacher::Object::truncate(loff_t s) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 10) << "truncate " << *this << " to " << s << dendl; + + std::list<Context*> waiting_for_read; + while (!data.empty()) { + BufferHead *bh = data.rbegin()->second; + if (bh->end() <= s) + break; + + // split bh at truncation point? 
+ if (bh->start() < s) { + split(bh, s); + maybe_rebuild_buffer(bh); + continue; + } + + // remove bh entirely + ceph_assert(bh->start() >= s); + for ([[maybe_unused]] auto& [off, ctxs] : bh->waitfor_read) { + waiting_for_read.splice(waiting_for_read.end(), ctxs); + } + bh->waitfor_read.clear(); + replace_journal_tid(bh, 0); + oc->bh_remove(this, bh); + delete bh; + } + if (!waiting_for_read.empty()) { + ldout(oc->cct, 10) << "restarting reads post-truncate" << dendl; + } + finish_contexts(oc->cct, waiting_for_read, 0); +} + +void ObjectCacher::Object::discard(loff_t off, loff_t len, + C_GatherBuilder* commit_gather) +{ + ceph_assert(oc->lock.is_locked()); + ldout(oc->cct, 10) << "discard " << *this << " " << off << "~" << len + << dendl; + + if (!exists) { + ldout(oc->cct, 10) << " setting exists on " << *this << dendl; + exists = true; + } + if (complete) { + ldout(oc->cct, 10) << " clearing complete on " << *this << dendl; + complete = false; + } + + std::list<Context*> waiting_for_read; + map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(off); + while (p != data.end()) { + BufferHead *bh = p->second; + if (bh->start() >= off + len) + break; + + // split bh at truncation point? 
+ if (bh->start() < off) { + split(bh, off); + maybe_rebuild_buffer(bh); + ++p; + continue; + } + + ceph_assert(bh->start() >= off); + if (bh->end() > off + len) { + auto right = split(bh, off + len); + maybe_rebuild_buffer(right); + } + + ++p; + ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl; + replace_journal_tid(bh, 0); + + if (bh->is_tx() && commit_gather != nullptr) { + // wait for the writeback to commit + waitfor_commit[bh->last_write_tid].emplace_back(commit_gather->new_sub()); + } else if (bh->is_rx()) { + // cannot remove bh with in-flight read, but we can ensure the + // read won't overwrite the discard + bh->last_read_tid = ++oc->last_read_tid; + bh->bl.clear(); + bh->set_nocache(true); + oc->mark_zero(bh); + // we should mark all Rx bh to zero + continue; + } else { + for ([[maybe_unused]] auto& [off, ctxs] : bh->waitfor_read) { + waiting_for_read.splice(waiting_for_read.end(), ctxs); + } + bh->waitfor_read.clear(); + } + + oc->bh_remove(this, bh); + delete bh; + } + if (!waiting_for_read.empty()) { + ldout(oc->cct, 10) << "restarting reads post-discard" << dendl; + } + finish_contexts(oc->cct, waiting_for_read, 0); /* restart reads */ +} + + + +/*** ObjectCacher ***/ + +#undef dout_prefix +#define dout_prefix *_dout << "objectcacher " + + +ObjectCacher::ObjectCacher(CephContext *cct_, string name, + WritebackHandler& wb, Mutex& l, + flush_set_callback_t flush_callback, + void *flush_callback_arg, uint64_t max_bytes, + uint64_t max_objects, uint64_t max_dirty, + uint64_t target_dirty, double max_dirty_age, + bool block_writes_upfront) + : perfcounter(NULL), + cct(cct_), writeback_handler(wb), name(name), lock(l), + max_dirty(max_dirty), target_dirty(target_dirty), + max_size(max_bytes), max_objects(max_objects), + max_dirty_age(ceph::make_timespan(max_dirty_age)), + block_writes_upfront(block_writes_upfront), + trace_endpoint("ObjectCacher"), + flush_set_callback(flush_callback), + flush_set_callback_arg(flush_callback_arg), + 
last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
+ stat_clean(0), stat_zero(0), stat_dirty(0), stat_rx(0), stat_tx(0),
+ stat_missing(0), stat_error(0), stat_dirty_waiting(0),
+ stat_nr_dirty_waiters(0), reads_outstanding(0)
+{
+ // register perf counters and start the finisher thread before any I/O;
+ // scattered writes are only used when the backend supports them.
+ perf_start();
+ finisher.start();
+ scattered_write = writeback_handler.can_scattered_write();
+}
+
+// Destructor. The caller must have already flushed and released every
+// object and buffer: we stop the worker threads and then assert that
+// all per-pool object maps, LRUs and the dirty/tx set are empty.
+ObjectCacher::~ObjectCacher()
+{
+ finisher.stop();
+ perf_stop();
+ // we should be empty.
+ for (vector<ceph::unordered_map<sobject_t, Object *> >::iterator i
+ = objects.begin();
+ i != objects.end();
+ ++i)
+ ceph_assert(i->empty());
+ ceph_assert(bh_lru_rest.lru_get_size() == 0);
+ ceph_assert(bh_lru_dirty.lru_get_size() == 0);
+ ceph_assert(ob_lru.lru_get_size() == 0);
+ ceph_assert(dirty_or_tx_bh.empty());
+}
+
+// Build and register this cacher's perf counters ("objectcacher-<name>")
+// with the CephContext's collection.
+void ObjectCacher::perf_start()
+{
+ string n = "objectcacher-" + name;
+ PerfCountersBuilder plb(cct, n, l_objectcacher_first, l_objectcacher_last);
+
+ plb.add_u64_counter(l_objectcacher_cache_ops_hit,
+ "cache_ops_hit", "Hit operations");
+ plb.add_u64_counter(l_objectcacher_cache_ops_miss,
+ "cache_ops_miss", "Miss operations");
+ plb.add_u64_counter(l_objectcacher_cache_bytes_hit,
+ "cache_bytes_hit", "Hit data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_objectcacher_cache_bytes_miss,
+ "cache_bytes_miss", "Miss data", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_objectcacher_data_read,
+ "data_read", "Read data");
+ plb.add_u64_counter(l_objectcacher_data_written,
+ "data_written", "Data written to cache");
+ plb.add_u64_counter(l_objectcacher_data_flushed,
+ "data_flushed", "Data flushed");
+ plb.add_u64_counter(l_objectcacher_overwritten_in_flush,
+ "data_overwritten_while_flushing",
+ "Data overwritten while flushing");
+ plb.add_u64_counter(l_objectcacher_write_ops_blocked, "write_ops_blocked",
+ "Write operations, delayed due to dirty limits");
+ plb.add_u64_counter(l_objectcacher_write_bytes_blocked,
+ "write_bytes_blocked",
+ "Write data blocked on dirty limit", 
NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_time(l_objectcacher_write_time_blocked, "write_time_blocked",
+ "Time spent blocking a write due to dirty limits");
+
+ perfcounter = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perfcounter);
+}
+
+// Unregister and free the perf counters created by perf_start().
+void ObjectCacher::perf_stop()
+{
+ ceph_assert(perfcounter);
+ cct->get_perfcounters_collection()->remove(perfcounter);
+ delete perfcounter;
+}
+
+/* private */
+// Look up the cached Object for (oid, pool), creating it if absent.
+// On a hit the object_no / truncate_size / truncate_seq are refreshed
+// from the caller's view; on a miss a new Object is inserted at the
+// top of the object LRU. Caller must hold the cacher lock.
+ObjectCacher::Object *ObjectCacher::get_object(sobject_t oid,
+ uint64_t object_no,
+ ObjectSet *oset,
+ object_locator_t &l,
+ uint64_t truncate_size,
+ uint64_t truncate_seq)
+{
+ // XXX: Add handling of nspace in object_locator_t in cache
+ ceph_assert(lock.is_locked());
+ // have it?
+ if ((uint32_t)l.pool < objects.size()) {
+ if (objects[l.pool].count(oid)) {
+ Object *o = objects[l.pool][oid];
+ o->object_no = object_no;
+ o->truncate_size = truncate_size;
+ o->truncate_seq = truncate_seq;
+ return o;
+ }
+ } else {
+ // grow the per-pool vector so l.pool is a valid index
+ objects.resize(l.pool+1);
+ }
+
+ // create it.
+ Object *o = new Object(this, oid, object_no, oset, l, truncate_size,
+ truncate_seq);
+ objects[l.pool][oid] = o;
+ ob_lru.lru_insert_top(o);
+ return o;
+}
+
+// Drop a fully-clean, unreferenced Object from the cache (see can_close()).
+void ObjectCacher::close_object(Object *ob)
+{
+ ceph_assert(lock.is_locked());
+ ldout(cct, 10) << "close_object " << *ob << dendl;
+ ceph_assert(ob->can_close());
+
+ // ok! 
+ ob_lru.lru_remove(ob);
+ objects[ob->oloc.pool].erase(ob->get_soid());
+ ob->set_item.remove_myself();
+ delete ob;
+}
+
+// Issue an async read for one BufferHead: mark it rx, stamp it with a
+// fresh last_read_tid (so stale replies can be detected), and hand the
+// I/O to the writeback handler with a C_ReadFinish completion that will
+// call bh_read_finish(). Caller must hold the cacher lock.
+void ObjectCacher::bh_read(BufferHead *bh, int op_flags,
+ const ZTracer::Trace &parent_trace)
+{
+ ceph_assert(lock.is_locked());
+ ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads "
+ << reads_outstanding << dendl;
+
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_read " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
+ mark_rx(bh);
+ bh->last_read_tid = ++last_read_tid;
+
+ // finisher
+ C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
+ bh->start(), bh->length(), trace);
+ // go
+ writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(),
+ bh->ob->get_oloc(), bh->start(), bh->length(),
+ bh->ob->get_snap(), &onfinish->bl,
+ bh->ob->truncate_size, bh->ob->truncate_seq,
+ op_flags, trace, onfinish);
+
+ ++reads_outstanding;
+}
+
+// Completion path for bh_read(): apply the returned data (or error /
+// ENOENT) to the matching rx BufferHeads of the object, waking any
+// readers waiting on them. r == -ENOENT is only trusted when
+// trust_enoent is set (see the reordering note below).
+void ObjectCacher::bh_read_finish(int64_t poolid, sobject_t oid,
+ ceph_tid_t tid, loff_t start,
+ uint64_t length, bufferlist &bl, int r,
+ bool trust_enoent)
+{
+ ceph_assert(lock.is_locked());
+ ldout(cct, 7) << "bh_read_finish "
+ << oid
+ << " tid " << tid
+ << " " << start << "~" << length
+ << " (bl is " << bl.length() << ")"
+ << " returned " << r
+ << " outstanding reads " << reads_outstanding
+ << dendl;
+
+ if (r >= 0 && bl.length() < length) {
+ // short read: the missing tail is logically zero
+ ldout(cct, 7) << "bh_read_finish " << oid << " padding " << start << "~"
+ << length << " with " << length - bl.length() << " bytes of zeroes"
+ << dendl;
+ bl.append_zero(length - bl.length());
+ }
+
+ list<Context*> ls;
+ int err = 0;
+
+ if (objects[poolid].count(oid) == 0) {
+ ldout(cct, 7) << "bh_read_finish no object cache" << dendl;
+ } else {
+ Object *ob = objects[poolid][oid];
+
+ if (r == -ENOENT && !ob->complete) {
+ // wake up *all* rx waiters, or else we risk reordering
+ // identical reads. e.g. 
+ // read 1~1 + // reply to unrelated 3~1 -> !exists + // read 1~1 -> immediate ENOENT + // reply to first 1~1 -> ooo ENOENT + bool allzero = true; + for (map<loff_t, BufferHead*>::iterator p = ob->data.begin(); + p != ob->data.end(); ++p) { + BufferHead *bh = p->second; + for (map<loff_t, list<Context*> >::iterator p + = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + ++p) + ls.splice(ls.end(), p->second); + bh->waitfor_read.clear(); + if (!bh->is_zero() && !bh->is_rx()) + allzero = false; + } + + // just pass through and retry all waiters if we don't trust + // -ENOENT for this read + if (trust_enoent) { + ldout(cct, 7) + << "bh_read_finish ENOENT, marking complete and !exists on " << *ob + << dendl; + ob->complete = true; + ob->exists = false; + + /* If all the bhs are effectively zero, get rid of them. All + * the waiters will be retried and get -ENOENT immediately, so + * it's safe to clean up the unneeded bh's now. Since we know + * it's safe to remove them now, do so, so they aren't hanging + *around waiting for more -ENOENTs from rados while the cache + * is being shut down. + * + * Only do this when all the bhs are rx or clean, to match the + * condition in _readx(). If there are any non-rx or non-clean + * bhs, _readx() will wait for the final result instead of + * returning -ENOENT immediately. + */ + if (allzero) { + ldout(cct, 10) + << "bh_read_finish ENOENT and allzero, getting rid of " + << "bhs for " << *ob << dendl; + map<loff_t, BufferHead*>::iterator p = ob->data.begin(); + while (p != ob->data.end()) { + BufferHead *bh = p->second; + // current iterator will be invalidated by bh_remove() + ++p; + bh_remove(ob, bh); + delete bh; + } + } + } + } + + // apply to bh's! 
+ loff_t opos = start; + while (true) { + map<loff_t, BufferHead*>::const_iterator p = ob->data_lower_bound(opos); + if (p == ob->data.end()) + break; + if (opos >= start+(loff_t)length) { + ldout(cct, 20) << "break due to opos " << opos << " >= start+length " + << start << "+" << length << "=" << start+(loff_t)length + << dendl; + break; + } + + BufferHead *bh = p->second; + ldout(cct, 20) << "checking bh " << *bh << dendl; + + // finishers? + for (map<loff_t, list<Context*> >::iterator it + = bh->waitfor_read.begin(); + it != bh->waitfor_read.end(); + ++it) + ls.splice(ls.end(), it->second); + bh->waitfor_read.clear(); + + if (bh->start() > opos) { + ldout(cct, 1) << "bh_read_finish skipping gap " + << opos << "~" << bh->start() - opos + << dendl; + opos = bh->start(); + continue; + } + + if (!bh->is_rx()) { + ldout(cct, 10) << "bh_read_finish skipping non-rx " << *bh << dendl; + opos = bh->end(); + continue; + } + + if (bh->last_read_tid != tid) { + ldout(cct, 10) << "bh_read_finish bh->last_read_tid " + << bh->last_read_tid << " != tid " << tid + << ", skipping" << dendl; + opos = bh->end(); + continue; + } + + ceph_assert(opos >= bh->start()); + ceph_assert(bh->start() == opos); // we don't merge rx bh's... yet! + ceph_assert(bh->length() <= start+(loff_t)length-opos); + + if (bh->error < 0) + err = bh->error; + + opos = bh->end(); + + if (r == -ENOENT) { + if (trust_enoent) { + ldout(cct, 10) << "bh_read_finish removing " << *bh << dendl; + bh_remove(ob, bh); + delete bh; + } else { + ldout(cct, 10) << "skipping unstrusted -ENOENT and will retry for " + << *bh << dendl; + } + continue; + } + + if (r < 0) { + bh->error = r; + mark_error(bh); + } else { + bh->bl.substr_of(bl, + bh->start() - start, + bh->length()); + mark_clean(bh); + } + + ldout(cct, 10) << "bh_read_finish read " << *bh << dendl; + + ob->try_merge_bh(bh); + } + } + + // called with lock held. 
+ ldout(cct, 20) << "finishing waiters " << ls << dendl;
+
+ finish_contexts(cct, ls, err);
+ retry_waiting_reads();
+
+ --reads_outstanding;
+ read_cond.Signal();
+}
+
+// For scattered writeback: starting from bh, collect the adjacent dirty
+// buffers of the same object (scanning forward then backward through the
+// ordered dirty_or_tx set) whose last_write is at or before cutoff, stop
+// when the optional count/byte budgets are exceeded, decrement those
+// budgets, and submit the batch via bh_write_scattered().
+void ObjectCacher::bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff,
+ int64_t *max_amount, int *max_count)
+{
+ list<BufferHead*> blist;
+
+ int count = 0;
+ int64_t total_len = 0;
+ set<BufferHead*, BufferHead::ptr_lt>::iterator it = dirty_or_tx_bh.find(bh);
+ ceph_assert(it != dirty_or_tx_bh.end());
+ for (set<BufferHead*, BufferHead::ptr_lt>::iterator p = it;
+ p != dirty_or_tx_bh.end();
+ ++p) {
+ BufferHead *obh = *p;
+ if (obh->ob != bh->ob)
+ break;
+ if (obh->is_dirty() && obh->last_write <= cutoff) {
+ blist.push_back(obh);
+ ++count;
+ total_len += obh->length();
+ if ((max_count && count > *max_count) ||
+ (max_amount && total_len > *max_amount))
+ break;
+ }
+ }
+
+ // now scan backward from bh for older adjacent dirty buffers
+ while (it != dirty_or_tx_bh.begin()) {
+ --it;
+ BufferHead *obh = *it;
+ if (obh->ob != bh->ob)
+ break;
+ if (obh->is_dirty() && obh->last_write <= cutoff) {
+ blist.push_front(obh);
+ ++count;
+ total_len += obh->length();
+ if ((max_count && count > *max_count) ||
+ (max_amount && total_len > *max_amount))
+ break;
+ }
+ }
+ if (max_count)
+ *max_count -= count;
+ if (max_amount)
+ *max_amount -= total_len;
+
+ bh_write_scattered(blist);
+}
+
+// Completion context for bh_write()/bh_write_scattered(): forwards the
+// write result (with the range list and issuing tid) to bh_write_commit().
+class ObjectCacher::C_WriteCommit : public Context {
+ ObjectCacher *oc;
+ int64_t poolid;
+ sobject_t oid;
+ vector<pair<loff_t, uint64_t> > ranges;
+ ZTracer::Trace trace;
+public:
+ ceph_tid_t tid = 0;
+ C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s,
+ uint64_t l, const ZTracer::Trace &trace) :
+ oc(c), poolid(_poolid), oid(o), trace(trace) {
+ ranges.push_back(make_pair(s, l));
+ }
+ C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o,
+ vector<pair<loff_t, uint64_t> >& _ranges) :
+ oc(c), poolid(_poolid), oid(o), tid(0) {
+ ranges.swap(_ranges);
+ }
+ void finish(int r) override {
+ oc->bh_write_commit(poolid, oid, ranges, tid, r);
+ 
trace.event("finish");
+ }
+};
+// Flush a batch of dirty BufferHeads of ONE object with a single
+// scattered write: gather ranges and data, take the newest snapc and
+// last_write across the batch, submit via the writeback handler, and
+// mark every buffer tx with the issued tid. Caller holds the lock.
+void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
+{
+ ceph_assert(lock.is_locked());
+
+ Object *ob = blist.front()->ob;
+ ob->get();
+
+ ceph::real_time last_write;
+ SnapContext snapc;
+ vector<pair<loff_t, uint64_t> > ranges;
+ vector<pair<uint64_t, bufferlist> > io_vec;
+
+ ranges.reserve(blist.size());
+ io_vec.reserve(blist.size());
+
+ uint64_t total_len = 0;
+ for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) {
+ BufferHead *bh = *p;
+ ldout(cct, 7) << "bh_write_scattered " << *bh << dendl;
+ ceph_assert(bh->ob == ob);
+ ceph_assert(bh->bl.length() == bh->length());
+ ranges.push_back(pair<loff_t, uint64_t>(bh->start(), bh->length()));
+
+ int n = io_vec.size();
+ io_vec.resize(n + 1);
+ io_vec[n].first = bh->start();
+ io_vec[n].second = bh->bl;
+
+ total_len += bh->length();
+ if (bh->snapc.seq > snapc.seq)
+ snapc = bh->snapc;
+ if (bh->last_write > last_write)
+ last_write = bh->last_write;
+ }
+
+ C_WriteCommit *oncommit = new C_WriteCommit(this, ob->oloc.pool, ob->get_soid(), ranges);
+
+ ceph_tid_t tid = writeback_handler.write(ob->get_oid(), ob->get_oloc(),
+ io_vec, snapc, last_write,
+ ob->truncate_size, ob->truncate_seq,
+ oncommit);
+ oncommit->tid = tid;
+ ob->last_write_tid = tid;
+ for (list<BufferHead*>::iterator p = blist.begin(); p != blist.end(); ++p) {
+ BufferHead *bh = *p;
+ bh->last_write_tid = tid;
+ mark_tx(bh);
+ }
+
+ if (perfcounter)
+ perfcounter->inc(l_objectcacher_data_flushed, total_len);
+}
+
+// Flush a single dirty BufferHead: submit its data to the writeback
+// handler and mark it tx; C_WriteCommit calls bh_write_commit() on ack.
+void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace)
+{
+ ceph_assert(lock.is_locked());
+ ldout(cct, 7) << "bh_write " << *bh << dendl;
+
+ bh->ob->get();
+
+ ZTracer::Trace trace;
+ if (parent_trace.valid()) {
+ trace.init("", &trace_endpoint, &parent_trace);
+ trace.copy_name("bh_write " + bh->ob->get_oid().name);
+ trace.event("start");
+ }
+
+ // finishers
+ C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool,
+ 
bh->ob->get_soid(), bh->start(), + bh->length(), trace); + // go + ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(), + bh->ob->get_oloc(), + bh->start(), bh->length(), + bh->snapc, bh->bl, bh->last_write, + bh->ob->truncate_size, + bh->ob->truncate_seq, + bh->journal_tid, trace, oncommit); + ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl; + + // set bh last_write_tid + oncommit->tid = tid; + bh->ob->last_write_tid = tid; + bh->last_write_tid = tid; + + if (perfcounter) { + perfcounter->inc(l_objectcacher_data_flushed, bh->length()); + } + + mark_tx(bh); +} + +void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, + vector<pair<loff_t, uint64_t> >& ranges, + ceph_tid_t tid, int r) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 7) << "bh_write_commit " << oid << " tid " << tid + << " ranges " << ranges << " returned " << r << dendl; + + if (objects[poolid].count(oid) == 0) { + ldout(cct, 7) << "bh_write_commit no object cache" << dendl; + return; + } + + Object *ob = objects[poolid][oid]; + int was_dirty_or_tx = ob->oset->dirty_or_tx; + + for (vector<pair<loff_t, uint64_t> >::iterator p = ranges.begin(); + p != ranges.end(); + ++p) { + loff_t start = p->first; + uint64_t length = p->second; + if (!ob->exists) { + ldout(cct, 10) << "bh_write_commit marking exists on " << *ob << dendl; + ob->exists = true; + + if (writeback_handler.may_copy_on_write(ob->get_oid(), start, length, + ob->get_snap())) { + ldout(cct, 10) << "bh_write_commit may copy on write, clearing " + "complete on " << *ob << dendl; + ob->complete = false; + } + } + + vector<pair<loff_t, BufferHead*>> hit; + // apply to bh's! 
+ for (map<loff_t, BufferHead*>::const_iterator p = ob->data_lower_bound(start); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + + if (bh->start() >= start+(loff_t)length) + break; + + // make sure bh is tx + if (!bh->is_tx()) { + ldout(cct, 10) << "bh_write_commit skipping non-tx " << *bh << dendl; + continue; + } + + // make sure bh tid matches + if (bh->last_write_tid != tid) { + ceph_assert(bh->last_write_tid > tid); + ldout(cct, 10) << "bh_write_commit newer tid on " << *bh << dendl; + continue; + } + + // we don't merge tx buffers. tx buffer should be within the range + ceph_assert(bh->start() >= start); + ceph_assert(bh->end() <= start+(loff_t)length); + + if (r >= 0) { + // ok! mark bh clean and error-free + mark_clean(bh); + bh->set_journal_tid(0); + if (bh->get_nocache()) + bh_lru_rest.lru_bottouch(bh); + hit.push_back(make_pair(bh->start(), bh)); + ldout(cct, 10) << "bh_write_commit clean " << *bh << dendl; + } else { + mark_dirty(bh); + ldout(cct, 10) << "bh_write_commit marking dirty again due to error " + << *bh << " r = " << r << " " << cpp_strerror(-r) + << dendl; + } + } + + for (auto& p : hit) { + //p.second maybe merged and deleted in merge_left + if (ob->data.count(p.first)) + ob->try_merge_bh(p.second); + } + } + + // update last_commit. + ceph_assert(ob->last_commit_tid < tid); + ob->last_commit_tid = tid; + + // waiters? + list<Context*> ls; + if (ob->waitfor_commit.count(tid)) { + ls.splice(ls.begin(), ob->waitfor_commit[tid]); + ob->waitfor_commit.erase(tid); + } + + // is the entire object set now clean and fully committed? 
+ ObjectSet *oset = ob->oset; + ob->put(); + + if (flush_set_callback && + was_dirty_or_tx > 0 && + oset->dirty_or_tx == 0) { // nothing dirty/tx + flush_set_callback(flush_set_callback_arg, oset); + } + + if (!ls.empty()) + finish_contexts(cct, ls, r); +} + +void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount) +{ + ceph_assert(trace != nullptr); + ceph_assert(lock.is_locked()); + ceph::real_time cutoff = ceph::real_clock::now(); + + ldout(cct, 10) << "flush " << amount << dendl; + + /* + * NOTE: we aren't actually pulling things off the LRU here, just + * looking at the tail item. Then we call bh_write, which moves it + * to the other LRU, so that we can call + * lru_dirty.lru_get_next_expire() again. + */ + int64_t left = amount; + while (amount == 0 || left > 0) { + BufferHead *bh = static_cast<BufferHead*>( + bh_lru_dirty.lru_get_next_expire()); + if (!bh) break; + if (bh->last_write > cutoff) break; + + if (scattered_write) { + bh_write_adjacencies(bh, cutoff, amount > 0 ? 
&left : NULL, NULL);
+ } else {
+ left -= bh->length();
+ bh_write(bh, *trace);
+ }
+ }
+}
+
+
+// Evict clean state to respect the cache limits: first expire clean
+// (or zero/error) buffers from the rest-LRU while either the clean byte
+// count exceeds max_size or the clean-buffer count exceeds its derived
+// cap, then close whole objects while the object LRU exceeds
+// max_objects. Caller must hold the cacher lock.
+void ObjectCacher::trim()
+{
+ ceph_assert(lock.is_locked());
+ ldout(cct, 10) << "trim start: bytes: max " << max_size << " clean "
+ << get_stat_clean() << ", objects: max " << max_objects
+ << " current " << ob_lru.lru_get_size() << dendl;
+
+ uint64_t max_clean_bh = max_size >> BUFFER_MEMORY_WEIGHT;
+ uint64_t nr_clean_bh = bh_lru_rest.lru_get_size() - bh_lru_rest.lru_get_num_pinned();
+ while (get_stat_clean() > 0 &&
+ ((uint64_t)get_stat_clean() > max_size ||
+ nr_clean_bh > max_clean_bh)) {
+ BufferHead *bh = static_cast<BufferHead*>(bh_lru_rest.lru_expire());
+ if (!bh)
+ break;
+
+ ldout(cct, 10) << "trim trimming " << *bh << dendl;
+ ceph_assert(bh->is_clean() || bh->is_zero() || bh->is_error());
+
+ Object *ob = bh->ob;
+ bh_remove(ob, bh);
+ delete bh;
+
+ --nr_clean_bh;
+
+ if (ob->complete) {
+ // dropping cached data invalidates the "fully cached" marker
+ ldout(cct, 10) << "trim clearing complete on " << *ob << dendl;
+ ob->complete = false;
+ }
+ }
+
+ while (ob_lru.lru_get_size() > max_objects) {
+ Object *ob = static_cast<Object*>(ob_lru.lru_expire());
+ if (!ob)
+ break;
+
+ ldout(cct, 10) << "trim trimming " << *ob << dendl;
+ close_object(ob);
+ }
+
+ ldout(cct, 10) << "trim finish: max " << max_size << " clean "
+ << get_stat_clean() << ", objects: max " << max_objects
+ << " current " << ob_lru.lru_get_size() << dendl;
+}
+
+
+
+/* public */
+
+// Return true only if every extent is already fully present in the
+// cache (no I/O is issued). Caller must hold the cacher lock.
+bool ObjectCacher::is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
+ snapid_t snapid)
+{
+ ceph_assert(lock.is_locked());
+ for (vector<ObjectExtent>::iterator ex_it = extents.begin();
+ ex_it != extents.end();
+ ++ex_it) {
+ ldout(cct, 10) << "is_cached " << *ex_it << dendl;
+
+ // get Object cache
+ sobject_t soid(ex_it->oid, snapid);
+ Object *o = get_object_maybe(soid, ex_it->oloc);
+ if (!o)
+ return false;
+ if (!o->is_cached(ex_it->offset, ex_it->length))
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * returns # bytes read (if in cache). 
onfinish is untouched (caller
+ * must delete it)
+ * returns 0 if doing async read
+ */
+// Public read entry point: wraps _readx() with an optional blkin trace.
+// Returns >0 bytes (cache hit, onfinish NOT consumed), 0 (async read in
+// flight, onfinish will be completed later), or <0 on error.
+int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ ZTracer::Trace *parent_trace)
+{
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("read", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
+ int r =_readx(rd, oset, onfinish, true, &trace);
+ if (r < 0) {
+ trace.event("finish");
+ }
+ return r;
+}
+
+// Core read path; external_call distinguishes the first call from
+// internal retries (C_RetryRead), which affects error-buffer handling
+// and perf accounting. Caller must hold the cacher lock.
+int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
+ bool external_call, ZTracer::Trace *trace)
+{
+ ceph_assert(trace != nullptr);
+ ceph_assert(lock.is_locked());
+ bool success = true;
+ int error = 0;
+ uint64_t bytes_in_cache = 0;
+ uint64_t bytes_not_in_cache = 0;
+ uint64_t total_bytes_read = 0;
+ map<uint64_t, bufferlist> stripe_map; // final buffer offset -> substring
+ bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ bool nocache = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+
+ /*
+ * WARNING: we can only meaningfully return ENOENT if the read request
+ * passed in a single ObjectExtent. Any caller who wants ENOENT instead of
+ * zeroed buffers needs to feed single extents into readx().
+ */
+ ceph_assert(!oset->return_enoent || rd->extents.size() == 1);
+
+ for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin();
+ ex_it != rd->extents.end();
+ ++ex_it) {
+ ldout(cct, 10) << "readx " << *ex_it << dendl;
+
+ total_bytes_read += ex_it->length;
+
+ // get Object cache
+ sobject_t soid(ex_it->oid, rd->snap);
+ Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
+ ex_it->truncate_size, oset->truncate_seq);
+ if (external_call)
+ touch_ob(o);
+
+ // does not exist and no hits?
+ if (oset->return_enoent && !o->exists) {
+ ldout(cct, 10) << "readx object !exists, 1 extent..." << dendl;
+
+ // should we worry about COW underneath us? 
+ if (writeback_handler.may_copy_on_write(soid.oid, ex_it->offset, + ex_it->length, soid.snap)) { + ldout(cct, 20) << "readx may copy on write" << dendl; + bool wait = false; + list<BufferHead*> blist; + for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin(); + bh_it != o->data.end(); + ++bh_it) { + BufferHead *bh = bh_it->second; + if (bh->is_dirty() || bh->is_tx()) { + ldout(cct, 10) << "readx flushing " << *bh << dendl; + wait = true; + if (bh->is_dirty()) { + if (scattered_write) + blist.push_back(bh); + else + bh_write(bh, *trace); + } + } + } + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + if (wait) { + ldout(cct, 10) << "readx waiting on tid " << o->last_write_tid + << " on " << *o << dendl; + o->waitfor_commit[o->last_write_tid].push_back( + new C_RetryRead(this,rd, oset, onfinish, *trace)); + // FIXME: perfcounter! + return 0; + } + } + + // can we return ENOENT? + bool allzero = true; + for (map<loff_t, BufferHead*>::iterator bh_it = o->data.begin(); + bh_it != o->data.end(); + ++bh_it) { + ldout(cct, 20) << "readx ob has bh " << *bh_it->second << dendl; + if (!bh_it->second->is_zero() && !bh_it->second->is_rx()) { + allzero = false; + break; + } + } + if (allzero) { + ldout(cct, 10) << "readx ob has all zero|rx, returning ENOENT" + << dendl; + delete rd; + if (dontneed) + bottouch_ob(o); + return -ENOENT; + } + } + + // map extent into bufferheads + map<loff_t, BufferHead*> hits, missing, rx, errors; + o->map_read(*ex_it, hits, missing, rx, errors); + if (external_call) { + // retry reading error buffers + missing.insert(errors.begin(), errors.end()); + } else { + // some reads had errors, fail later so completions + // are cleaned up properly + // TODO: make read path not call _readx for every completion + hits.insert(errors.begin(), errors.end()); + } + + if (!missing.empty() || !rx.empty()) { + // read missing + map<loff_t, BufferHead*>::iterator last = missing.end(); + for (map<loff_t, BufferHead*>::iterator bh_it = 
missing.begin(); + bh_it != missing.end(); + ++bh_it) { + uint64_t rx_bytes = static_cast<uint64_t>( + stat_rx + bh_it->second->length()); + bytes_not_in_cache += bh_it->second->length(); + if (!waitfor_read.empty() || (stat_rx > 0 && rx_bytes > max_size)) { + // cache is full with concurrent reads -- wait for rx's to complete + // to constrain memory growth (especially during copy-ups) + if (success) { + ldout(cct, 10) << "readx missed, waiting on cache to complete " + << waitfor_read.size() << " blocked reads, " + << (std::max(rx_bytes, max_size) - max_size) + << " read bytes" << dendl; + waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish, + *trace)); + } + + bh_remove(o, bh_it->second); + delete bh_it->second; + } else { + bh_it->second->set_nocache(nocache); + bh_read(bh_it->second, rd->fadvise_flags, *trace); + if ((success && onfinish) || last != missing.end()) + last = bh_it; + } + success = false; + } + + //add wait in last bh avoid wakeup early. Because read is order + if (last != missing.end()) { + ldout(cct, 10) << "readx missed, waiting on " << *last->second + << " off " << last->first << dendl; + last->second->waitfor_read[last->first].push_back( + new C_RetryRead(this, rd, oset, onfinish, *trace) ); + + } + + // bump rx + for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin(); + bh_it != rx.end(); + ++bh_it) { + touch_bh(bh_it->second); // bump in lru, so we don't lose it. 
+ if (success && onfinish) { + ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second + << " off " << bh_it->first << dendl; + bh_it->second->waitfor_read[bh_it->first].push_back( + new C_RetryRead(this, rd, oset, onfinish, *trace) ); + } + bytes_not_in_cache += bh_it->second->length(); + success = false; + } + + for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + bh_it != hits.end(); ++bh_it) + //bump in lru, so we don't lose it when later read + touch_bh(bh_it->second); + + } else { + ceph_assert(!hits.empty()); + + // make a plain list + for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + bh_it != hits.end(); + ++bh_it) { + BufferHead *bh = bh_it->second; + ldout(cct, 10) << "readx hit bh " << *bh << dendl; + if (bh->is_error() && bh->error) + error = bh->error; + bytes_in_cache += bh->length(); + + if (bh->get_nocache() && bh->is_clean()) + bh_lru_rest.lru_bottouch(bh); + else + touch_bh(bh); + //must be after touch_bh because touch_bh set dontneed false + if (dontneed && + ((loff_t)ex_it->offset <= bh->start() && + (bh->end() <=(loff_t)(ex_it->offset + ex_it->length)))) { + bh->set_dontneed(true); //if dirty + if (bh->is_clean()) + bh_lru_rest.lru_bottouch(bh); + } + } + + if (!error) { + // create reverse map of buffer offset -> object for the + // eventual result. 
this is over a single ObjectExtent, so we + // know that + // - the bh's are contiguous + // - the buffer frags need not be (and almost certainly aren't) + loff_t opos = ex_it->offset; + map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); + ceph_assert(bh_it->second->start() <= opos); + uint64_t bhoff = opos - bh_it->second->start(); + vector<pair<uint64_t,uint64_t> >::iterator f_it + = ex_it->buffer_extents.begin(); + uint64_t foff = 0; + while (1) { + BufferHead *bh = bh_it->second; + ceph_assert(opos == (loff_t)(bh->start() + bhoff)); + + uint64_t len = std::min(f_it->second - foff, bh->length() - bhoff); + ldout(cct, 10) << "readx rmap opos " << opos << ": " << *bh << " +" + << bhoff << " frag " << f_it->first << "~" + << f_it->second << " +" << foff << "~" << len + << dendl; + + bufferlist bit; + // put substr here first, since substr_of clobbers, and we + // may get multiple bh's at this stripe_map position + if (bh->is_zero()) { + stripe_map[f_it->first].append_zero(len); + } else { + bit.substr_of(bh->bl, + opos - bh->start(), + len); + stripe_map[f_it->first].claim_append(bit); + } + + opos += len; + bhoff += len; + foff += len; + if (opos == bh->end()) { + ++bh_it; + bhoff = 0; + } + if (foff == f_it->second) { + ++f_it; + foff = 0; + } + if (bh_it == hits.end()) break; + if (f_it == ex_it->buffer_extents.end()) + break; + } + ceph_assert(f_it == ex_it->buffer_extents.end()); + ceph_assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length); + } + + if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length)) + bottouch_ob(o); + } + } + + if (!success) { + if (perfcounter && external_call) { + perfcounter->inc(l_objectcacher_data_read, total_bytes_read); + perfcounter->inc(l_objectcacher_cache_bytes_miss, bytes_not_in_cache); + perfcounter->inc(l_objectcacher_cache_ops_miss); + } + if (onfinish) { + ldout(cct, 20) << "readx defer " << rd << dendl; + } else { + ldout(cct, 20) << "readx drop " << rd << " (no complete, but no waiter)" + 
<< dendl; + delete rd; + } + return 0; // wait! + } + if (perfcounter && external_call) { + perfcounter->inc(l_objectcacher_data_read, total_bytes_read); + perfcounter->inc(l_objectcacher_cache_bytes_hit, bytes_in_cache); + perfcounter->inc(l_objectcacher_cache_ops_hit); + } + + // no misses... success! do the read. + ldout(cct, 10) << "readx has all buffers" << dendl; + + // ok, assemble into result buffer. + uint64_t pos = 0; + if (rd->bl && !error) { + rd->bl->clear(); + for (map<uint64_t,bufferlist>::iterator i = stripe_map.begin(); + i != stripe_map.end(); + ++i) { + ceph_assert(pos == i->first); + ldout(cct, 10) << "readx adding buffer len " << i->second.length() + << " at " << pos << dendl; + pos += i->second.length(); + rd->bl->claim_append(i->second); + ceph_assert(rd->bl->length() == pos); + } + ldout(cct, 10) << "readx result is " << rd->bl->length() << dendl; + } else if (!error) { + ldout(cct, 10) << "readx no bufferlist ptr (readahead?), done." << dendl; + map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin(); + pos = i->first + i->second.length(); + } + + // done with read. + int ret = error ? 
error : pos;
+ ldout(cct, 20) << "readx done " << rd << " " << ret << dendl;
+ ceph_assert(pos <= (uint64_t) INT_MAX);
+
+ delete rd;
+
+ trim();
+
+ return ret;
+}
+
+// Re-drive reads that were parked in waitfor_read because the cache was
+// full of in-flight rx data. Stop early if a retried read re-blocks
+// (waitfor_read becomes non-empty again) and requeue the remainder in
+// their original order.
+void ObjectCacher::retry_waiting_reads()
+{
+ list<Context *> ls;
+ ls.swap(waitfor_read);
+
+ while (!ls.empty() && waitfor_read.empty()) {
+ Context *ctx = ls.front();
+ ls.pop_front();
+ ctx->complete(0);
+ }
+ waitfor_read.splice(waitfor_read.end(), ls);
+}
+
+// Public write entry point: copy the write's data into dirty
+// BufferHeads (one per extent), then apply dirty-limit throttling via
+// _wait_for_write(). Caller must hold the cacher lock.
+int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
+ ZTracer::Trace *parent_trace)
+{
+ ceph_assert(lock.is_locked());
+ ceph::real_time now = ceph::real_clock::now();
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_in_flush = 0;
+ bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
+ bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+
+ ZTracer::Trace trace;
+ if (parent_trace != nullptr) {
+ trace.init("write", &trace_endpoint, parent_trace);
+ trace.event("start");
+ }
+
+ list<Context*> wait_for_reads;
+ for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
+ ex_it != wr->extents.end();
+ ++ex_it) {
+ // get object cache
+ sobject_t soid(ex_it->oid, CEPH_NOSNAP);
+ Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
+ ex_it->truncate_size, oset->truncate_seq);
+
+ // map it all into a single bufferhead. 
+ BufferHead *bh = o->map_write(*ex_it, wr->journal_tid); + bool missing = bh->is_missing(); + bh->snapc = wr->snapc; + + // readers that need to be woken up due to an overwrite + for (auto& [_, wait_for_read] : bh->waitfor_read) { + wait_for_reads.splice(wait_for_reads.end(), wait_for_read); + } + bh->waitfor_read.clear(); + + bytes_written += ex_it->length; + if (bh->is_tx()) { + bytes_written_in_flush += ex_it->length; + } + + // adjust buffer pointers (ie "copy" data into my cache) + // this is over a single ObjectExtent, so we know that + // - there is one contiguous bh + // - the buffer frags need not be (and almost certainly aren't) + // note: i assume striping is monotonic... no jumps backwards, ever! + loff_t opos = ex_it->offset; + for (vector<pair<uint64_t, uint64_t> >::iterator f_it + = ex_it->buffer_extents.begin(); + f_it != ex_it->buffer_extents.end(); + ++f_it) { + ldout(cct, 10) << "writex writing " << f_it->first << "~" + << f_it->second << " into " << *bh << " at " << opos + << dendl; + uint64_t bhoff = opos - bh->start(); + ceph_assert(f_it->second <= bh->length() - bhoff); + + // get the frag we're mapping in + bufferlist frag; + frag.substr_of(wr->bl, f_it->first, f_it->second); + + // keep anything left of bhoff + if (!bhoff) + bh->bl.swap(frag); + else + bh->bl.claim_append(frag); + + opos += f_it->second; + } + + // ok, now bh is dirty. 
+ mark_dirty(bh);
+ if (dontneed)
+ bh->set_dontneed(true);
+ else if (nocache && missing)
+ bh->set_nocache(true);
+ else
+ touch_bh(bh);
+
+ bh->last_write = now;
+
+ o->try_merge_bh(bh);
+ }
+
+ if (perfcounter) {
+ perfcounter->inc(l_objectcacher_data_written, bytes_written);
+ if (bytes_written_in_flush) {
+ perfcounter->inc(l_objectcacher_overwritten_in_flush,
+ bytes_written_in_flush);
+ }
+ }
+
+ int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace);
+ delete wr;
+
+ finish_contexts(cct, wait_for_reads, 0);
+
+ //verify_stats();
+ trim();
+ return r;
+}
+
+// Finisher-queued context used when writes are not blocked up front:
+// performs the dirty-limit wait (maybe_wait_for_writeback) off the
+// writer's thread, then completes the caller's onfinish.
+class ObjectCacher::C_WaitForWrite : public Context {
+public:
+ C_WaitForWrite(ObjectCacher *oc, uint64_t len,
+ const ZTracer::Trace &trace, Context *onfinish) :
+ m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {}
+ void finish(int r) override;
+private:
+ ObjectCacher *m_oc;
+ uint64_t m_len;
+ ZTracer::Trace m_trace;
+ Context *m_onfinish;
+};
+
+void ObjectCacher::C_WaitForWrite::finish(int r)
+{
+ std::lock_guard l(m_oc->lock);
+ m_oc->maybe_wait_for_writeback(m_len, &m_trace);
+ m_onfinish->complete(r);
+}
+
+// Block the caller (len bytes pending) while dirty+tx state exceeds the
+// configured limits, nudging the flusher and accounting the wait in the
+// dirty_waiting stats. Caller must hold the cacher lock.
+void ObjectCacher::maybe_wait_for_writeback(uint64_t len,
+ ZTracer::Trace *trace)
+{
+ ceph_assert(lock.is_locked());
+ ceph::mono_time start = ceph::mono_clock::now();
+ int blocked = 0;
+ // wait for writeback?
+ // - wait for dirty and tx bytes (relative to the max_dirty threshold)
+ // - do not wait for bytes other waiters are waiting on. this means that
+ // threads do not wait for each other. this effectively allows the cache
+ // size to balloon proportional to the data that is in flight. 
+ + uint64_t max_dirty_bh = max_dirty >> BUFFER_MEMORY_WEIGHT; + while (get_stat_dirty() + get_stat_tx() > 0 && + (((uint64_t)(get_stat_dirty() + get_stat_tx()) >= + max_dirty + get_stat_dirty_waiting()) || + (dirty_or_tx_bh.size() >= + max_dirty_bh + get_stat_nr_dirty_waiters()))) { + + if (blocked == 0) { + trace->event("start wait for writeback"); + } + ldout(cct, 10) << __func__ << " waiting for dirty|tx " + << (get_stat_dirty() + get_stat_tx()) << " >= max " + << max_dirty << " + dirty_waiting " + << get_stat_dirty_waiting() << dendl; + flusher_cond.Signal(); + stat_dirty_waiting += len; + ++stat_nr_dirty_waiters; + stat_cond.Wait(lock); + stat_dirty_waiting -= len; + --stat_nr_dirty_waiters; + ++blocked; + ldout(cct, 10) << __func__ << " woke up" << dendl; + } + if (blocked > 0) { + trace->event("finish wait for writeback"); + } + if (blocked && perfcounter) { + perfcounter->inc(l_objectcacher_write_ops_blocked); + perfcounter->inc(l_objectcacher_write_bytes_blocked, len); + ceph::timespan blocked = ceph::mono_clock::now() - start; + perfcounter->tinc(l_objectcacher_write_time_blocked, blocked); + } +} + +// blocking wait for write. +int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, + ZTracer::Trace *trace, Context *onfreespace) +{ + ceph_assert(lock.is_locked()); + ceph_assert(trace != nullptr); + int ret = 0; + + if (max_dirty > 0 && !(wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_FUA)) { + if (block_writes_upfront) { + maybe_wait_for_writeback(len, trace); + if (onfreespace) + onfreespace->complete(0); + } else { + ceph_assert(onfreespace); + finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace)); + } + } else { + // write-thru! flush what we just wrote. + Cond cond; + bool done = false; + Context *fin = block_writes_upfront ? 
+ new C_Cond(&cond, &done, &ret) : onfreespace; + ceph_assert(fin); + bool flushed = flush_set(oset, wr->extents, trace, fin); + ceph_assert(!flushed); // we just dirtied it, and didn't drop our lock! + ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len + << " bytes" << dendl; + if (block_writes_upfront) { + while (!done) + cond.Wait(lock); + ldout(cct, 10) << "wait_for_write woke up, ret " << ret << dendl; + if (onfreespace) + onfreespace->complete(ret); + } + } + + // start writeback anyway? + if (get_stat_dirty() > 0 && (uint64_t) get_stat_dirty() > target_dirty) { + ldout(cct, 10) << "wait_for_write " << get_stat_dirty() << " > target " + << target_dirty << ", nudging flusher" << dendl; + flusher_cond.Signal(); + } + return ret; +} + +void ObjectCacher::flusher_entry() +{ + ldout(cct, 10) << "flusher start" << dendl; + lock.Lock(); + while (!flusher_stop) { + loff_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + + get_stat_dirty(); + ldout(cct, 11) << "flusher " + << all << " / " << max_size << ": " + << get_stat_tx() << " tx, " + << get_stat_rx() << " rx, " + << get_stat_clean() << " clean, " + << get_stat_dirty() << " dirty (" + << target_dirty << " target, " + << max_dirty << " max)" + << dendl; + loff_t actual = get_stat_dirty() + get_stat_dirty_waiting(); + + ZTracer::Trace trace; + if (cct->_conf->osdc_blkin_trace_all) { + trace.init("flusher", &trace_endpoint); + trace.event("start"); + } + + if (actual > 0 && (uint64_t) actual > target_dirty) { + // flush some dirty pages + ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + " + << get_stat_dirty_waiting() << " dirty_waiting > target " + << target_dirty << ", flushing some dirty bhs" << dendl; + flush(&trace, actual - target_dirty); + } else { + // check tail of lru for old dirty items + ceph::real_time cutoff = ceph::real_clock::now(); + cutoff -= max_dirty_age; + BufferHead *bh = 0; + int max = MAX_FLUSH_UNDER_LOCK; + while ((bh = 
static_cast<BufferHead*>(bh_lru_dirty. + lru_get_next_expire())) != 0 && + bh->last_write <= cutoff && + max > 0) { + ldout(cct, 10) << "flusher flushing aged dirty bh " << *bh << dendl; + if (scattered_write) { + bh_write_adjacencies(bh, cutoff, NULL, &max); + } else { + bh_write(bh, trace); + --max; + } + } + if (!max) { + // back off the lock to avoid starving other threads + trace.event("backoff"); + lock.Unlock(); + lock.Lock(); + continue; + } + } + + trace.event("finish"); + if (flusher_stop) + break; + + flusher_cond.WaitInterval(lock, seconds(1)); + } + + /* Wait for reads to finish. This is only possible if handling + * -ENOENT made some read completions finish before their rados read + * came back. If we don't wait for them, and destroy the cache, when + * the rados reads do come back their callback will try to access the + * no-longer-valid ObjectCacher. + */ + while (reads_outstanding > 0) { + ldout(cct, 10) << "Waiting for all reads to complete. Number left: " + << reads_outstanding << dendl; + read_cond.Wait(lock); + } + + lock.Unlock(); + ldout(cct, 10) << "flusher finish" << dendl; +} + + +// ------------------------------------------------- + +bool ObjectCacher::set_is_empty(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if (oset->objects.empty()) + return true; + + for (xlist<Object*>::iterator p = oset->objects.begin(); !p.end(); ++p) + if (!(*p)->is_empty()) + return false; + + return true; +} + +bool ObjectCacher::set_is_cached(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if (oset->objects.empty()) + return false; + + for (xlist<Object*>::iterator p = oset->objects.begin(); + !p.end(); ++p) { + Object *ob = *p; + for (map<loff_t,BufferHead*>::iterator q = ob->data.begin(); + q != ob->data.end(); + ++q) { + BufferHead *bh = q->second; + if (!bh->is_dirty() && !bh->is_tx()) + return true; + } + } + + return false; +} + +bool ObjectCacher::set_is_dirty_or_committing(ObjectSet *oset) +{ + ceph_assert(lock.is_locked()); + if 
(oset->objects.empty()) + return false; + + for (xlist<Object*>::iterator i = oset->objects.begin(); + !i.end(); ++i) { + Object *ob = *i; + + for (map<loff_t,BufferHead*>::iterator p = ob->data.begin(); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + if (bh->is_dirty() || bh->is_tx()) + return true; + } + } + + return false; +} + + +// purge. non-blocking. violently removes dirty buffers from cache. +void ObjectCacher::purge(Object *ob) +{ + ceph_assert(lock.is_locked()); + ldout(cct, 10) << "purge " << *ob << dendl; + + ob->truncate(0); +} + + +// flush. non-blocking. no callback. +// true if clean, already flushed. +// false if we wrote something. +// be sloppy about the ranges and flush any buffer it touches +bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length, + ZTracer::Trace *trace) +{ + ceph_assert(trace != nullptr); + ceph_assert(lock.is_locked()); + list<BufferHead*> blist; + bool clean = true; + ldout(cct, 10) << "flush " << *ob << " " << offset << "~" << length << dendl; + for (map<loff_t,BufferHead*>::const_iterator p = ob->data_lower_bound(offset); + p != ob->data.end(); + ++p) { + BufferHead *bh = p->second; + ldout(cct, 20) << "flush " << *bh << dendl; + if (length && bh->start() > offset+length) { + break; + } + if (bh->is_tx()) { + clean = false; + continue; + } + if (!bh->is_dirty()) { + continue; + } + + if (scattered_write) + blist.push_back(bh); + else + bh_write(bh, *trace); + clean = false; + } + if (scattered_write && !blist.empty()) + bh_write_scattered(blist); + + return clean; +} + +bool ObjectCacher::_flush_set_finish(C_GatherBuilder *gather, + Context *onfinish) +{ + ceph_assert(lock.is_locked()); + if (gather->has_subs()) { + gather->set_finisher(onfinish); + gather->activate(); + return false; + } + + ldout(cct, 10) << "flush_set has no dirty|tx bhs" << dendl; + onfinish->complete(0); + return true; +} + +// flush. non-blocking, takes callback. 
// flush everything dirty/tx in an ObjectSet.  non-blocking, takes callback.
// returns true if already flushed (callback completed inline with 0).
bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
{
  ceph_assert(lock.is_locked());
  ceph_assert(onfinish != NULL);
  if (oset->objects.empty()) {
    ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
    onfinish->complete(0);
    return true;
  }

  ldout(cct, 10) << "flush_set " << oset << dendl;

  // we'll need to wait for all objects to flush!
  C_GatherBuilder gather(cct);
  set<Object*> waitfor_commit;

  list<BufferHead*> blist;
  Object *last_ob = NULL;
  set<BufferHead*, BufferHead::ptr_lt>::const_iterator it, p, q;

  // Buffer heads in dirty_or_tx_bh are sorted in ObjectSet/Object/offset
  // order. But items in oset->objects are not sorted. So the iterator can
  // point to any buffer head in the ObjectSet
  BufferHead key(*oset->objects.begin());
  it = dirty_or_tx_bh.lower_bound(&key);
  p = q = it;

  // lower_bound() may land in the middle of this oset's run of buffer
  // heads, so we scan forward from 'it' first, then (if possible)
  // backwards from just before 'it' until we leave the oset.
  bool backwards = true;
  if (it != dirty_or_tx_bh.begin())
    --it;
  else
    backwards = false;

  // forward pass.  NOTE(review): 'q' is advanced before *p is touched,
  // presumably so the loop survives any re-bucketing of the current bh
  // inside dirty_or_tx_bh by the write path — confirm against bh_write().
  for (; p != dirty_or_tx_bh.end(); p = q) {
    ++q;
    BufferHead *bh = *p;
    if (bh->ob->oset != oset)
      break;
    waitfor_commit.insert(bh->ob);
    if (bh->is_dirty()) {
      if (scattered_write) {
        // batch adjacent bhs of the same object into one scattered write
        if (last_ob != bh->ob) {
          if (!blist.empty()) {
            bh_write_scattered(blist);
            blist.clear();
          }
          last_ob = bh->ob;
        }
        blist.push_back(bh);
      } else {
        bh_write(bh, {});
      }
    }
  }

  // backward pass: walk from just before the lower_bound position toward
  // begin(), stopping when we leave this oset or hit the front of the set.
  if (backwards) {
    for(p = q = it; true; p = q) {
      if (q != dirty_or_tx_bh.begin())
        --q;
      else
        backwards = false;   // 'p' is the last element to process
      BufferHead *bh = *p;
      if (bh->ob->oset != oset)
        break;
      waitfor_commit.insert(bh->ob);
      if (bh->is_dirty()) {
        if (scattered_write) {
          if (last_ob != bh->ob) {
            if (!blist.empty()) {
              bh_write_scattered(blist);
              blist.clear();
            }
            last_ob = bh->ob;
          }
          // push_front keeps blist in ascending offset order while
          // iterating backwards
          blist.push_front(bh);
        } else {
          bh_write(bh, {});
        }
      }
      if (!backwards)
        break;
    }
  }

  // flush any batched leftovers
  if (scattered_write && !blist.empty())
    bh_write_scattered(blist);

  // register a gather sub for the last write tid of every touched object
  for (set<Object*>::iterator i = waitfor_commit.begin();
       i != waitfor_commit.end(); ++i) {
    Object *ob = *i;

    // we'll need to gather...
    ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
                   << ob->last_write_tid << " on " << *ob << dendl;
    ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
  }

  return _flush_set_finish(&gather, onfinish);
}

// flush a specific list of extents.  non-blocking, takes callback.
// returns true if already flushed
bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
                             ZTracer::Trace *trace, Context *onfinish)
{
  ceph_assert(lock.is_locked());
  ceph_assert(trace != nullptr);
  ceph_assert(onfinish != NULL);
  if (oset->objects.empty()) {
    ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
    onfinish->complete(0);
    return true;
  }

  ldout(cct, 10) << "flush_set " << oset << " on " << exv.size()
                 << " ObjectExtents" << dendl;

  // we'll need to wait for all objects to flush!
  C_GatherBuilder gather(cct);

  for (vector<ObjectExtent>::iterator p = exv.begin();
       p != exv.end();
       ++p) {
    ObjectExtent &ex = *p;
    sobject_t soid(ex.oid, CEPH_NOSNAP);
    // extents for objects we don't currently cache are trivially "flushed"
    if (objects[oset->poolid].count(soid) == 0)
      continue;
    Object *ob = objects[oset->poolid][soid];

    ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid
                   << " " << ob << dendl;

    if (!flush(ob, ex.offset, ex.length, trace)) {
      // we'll need to gather...
      ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
                     << ob->last_write_tid << " on " << *ob << dendl;
      ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
    }
  }

  return _flush_set_finish(&gather, onfinish);
}

// flush all dirty data.  non-blocking, takes callback.
// returns true if already flushed
bool ObjectCacher::flush_all(Context *onfinish)
{
  ceph_assert(lock.is_locked());
  ceph_assert(onfinish != NULL);

  ldout(cct, 10) << "flush_all " << dendl;

  // we'll need to wait for all objects to flush!
  C_GatherBuilder gather(cct);
  set<Object*> waitfor_commit;

  list<BufferHead*> blist;
  Object *last_ob = NULL;
  set<BufferHead*, BufferHead::ptr_lt>::iterator next, it;
  next = it = dirty_or_tx_bh.begin();
  // 'next' is advanced before *it is processed so the loop tolerates the
  // write path re-bucketing the current bh within dirty_or_tx_bh
  // (presumably via bh state changes — confirm against bh_set_state()).
  while (it != dirty_or_tx_bh.end()) {
    ++next;
    BufferHead *bh = *it;
    waitfor_commit.insert(bh->ob);

    if (bh->is_dirty()) {
      if (scattered_write) {
        // batch adjacent bhs of the same object into one scattered write
        if (last_ob != bh->ob) {
          if (!blist.empty()) {
            bh_write_scattered(blist);
            blist.clear();
          }
          last_ob = bh->ob;
        }
        blist.push_back(bh);
      } else {
        bh_write(bh, {});
      }
    }

    it = next;
  }

  if (scattered_write && !blist.empty())
    bh_write_scattered(blist);

  for (set<Object*>::iterator i = waitfor_commit.begin();
       i != waitfor_commit.end();
       ++i) {
    Object *ob = *i;

    // we'll need to gather...
    ldout(cct, 10) << "flush_all will wait for ack tid "
                   << ob->last_write_tid << " on " << *ob << dendl;
    ob->waitfor_commit[ob->last_write_tid].push_back(gather.new_sub());
  }

  return _flush_set_finish(&gather, onfinish);
}

// violently drop all buffers (including dirty ones) for every object in
// the set; see purge().  Fires the flush callback if the set was dirty.
void ObjectCacher::purge_set(ObjectSet *oset)
{
  ceph_assert(lock.is_locked());
  if (oset->objects.empty()) {
    ldout(cct, 10) << "purge_set on " << oset << " dne" << dendl;
    return;
  }

  ldout(cct, 10) << "purge_set " << oset << dendl;
  const bool were_dirty = oset->dirty_or_tx > 0;

  for (xlist<Object*>::iterator i = oset->objects.begin();
       !i.end(); ++i) {
    Object *ob = *i;
    purge(ob);
  }

  // Although we have purged rather than flushed, caller should still
  // drop any resources associate with dirty data.
  ceph_assert(oset->dirty_or_tx == 0);
  if (flush_set_callback && were_dirty) {
    flush_set_callback(flush_set_callback_arg, oset);
  }
}


// drop all clean (clean/zero/error) buffers from one object; returns the
// number of bytes that were NOT clean and therefore not released.
loff_t ObjectCacher::release(Object *ob)
{
  ceph_assert(lock.is_locked());
  list<BufferHead*> clean;
  loff_t o_unclean = 0;

  // pass 1: classify buffers; deleting while iterating would invalidate p
  for (map<loff_t,BufferHead*>::iterator p = ob->data.begin();
       p != ob->data.end();
       ++p) {
    BufferHead *bh = p->second;
    if (bh->is_clean() || bh->is_zero() || bh->is_error())
      clean.push_back(bh);
    else
      o_unclean += bh->length();
  }

  // pass 2: actually remove the clean buffers
  for (list<BufferHead*>::iterator p = clean.begin();
       p != clean.end();
       ++p) {
    bh_remove(ob, *p);
    delete *p;
  }

  if (ob->can_close()) {
    ldout(cct, 10) << "release trimming " << *ob << dendl;
    close_object(ob);
    ceph_assert(o_unclean == 0);
    return 0;
  }

  // object survives with partial data: it can no longer claim to be
  // complete, and we must assume it exists on the OSD
  if (ob->complete) {
    ldout(cct, 10) << "release clearing complete on " << *ob << dendl;
    ob->complete = false;
  }
  if (!ob->exists) {
    ldout(cct, 10) << "release setting exists on " << *ob << dendl;
    ob->exists = true;
  }

  return o_unclean;
}

// release() every object in the set.
loff_t ObjectCacher::release_set(ObjectSet *oset)
{
  ceph_assert(lock.is_locked());
  // return # bytes not clean (and thus not released).
  loff_t unclean = 0;

  if (oset->objects.empty()) {
    ldout(cct, 10) << "release_set on " << oset << " dne" << dendl;
    return 0;
  }

  ldout(cct, 10) << "release_set " << oset << dendl;

  // grab the next iterator before release(), which may close (and unlink)
  // the current object
  xlist<Object*>::iterator q;
  for (xlist<Object*>::iterator p = oset->objects.begin();
       !p.end(); ) {
    q = p;
    ++q;
    Object *ob = *p;

    loff_t o_unclean = release(ob);
    unclean += o_unclean;

    if (o_unclean)
      ldout(cct, 10) << "release_set " << oset << " " << *ob
                     << " has " << o_unclean << " bytes left"
                     << dendl;
    p = q;
  }

  if (unclean) {
    ldout(cct, 10) << "release_set " << oset
                   << ", " << unclean << " bytes left" << dendl;
  }

  return unclean;
}


// release() every object in every pool; returns total unclean bytes left.
uint64_t ObjectCacher::release_all()
{
  ceph_assert(lock.is_locked());
  ldout(cct, 10) << "release_all" << dendl;
  uint64_t unclean = 0;

  vector<ceph::unordered_map<sobject_t, Object*> >::iterator i
    = objects.begin();
  while (i != objects.end()) {
    ceph::unordered_map<sobject_t, Object*>::iterator p = i->begin();
    while (p != i->end()) {
      // save the successor first: release() may close and erase *p
      ceph::unordered_map<sobject_t, Object*>::iterator n = p;
      ++n;

      Object *ob = p->second;

      loff_t o_unclean = release(ob);
      unclean += o_unclean;

      if (o_unclean)
        ldout(cct, 10) << "release_all " << *ob
                       << " has " << o_unclean << " bytes left"
                       << dendl;
      p = n;
    }
    ++i;
  }

  if (unclean) {
    ldout(cct, 10) << "release_all unclean " << unclean << " bytes left"
                   << dendl;
  }

  return unclean;
}

// forget cached non-existence: mark every object as existing/incomplete
// and tell in-flight reads to stop trusting -ENOENT results.
void ObjectCacher::clear_nonexistence(ObjectSet *oset)
{
  ceph_assert(lock.is_locked());
  ldout(cct, 10) << "clear_nonexistence() " << oset << dendl;

  for (xlist<Object*>::iterator p = oset->objects.begin();
       !p.end(); ++p) {
    Object *ob = *p;
    if (!ob->exists) {
      ldout(cct, 10) << " setting exists and complete on " << *ob << dendl;
      ob->exists = true;
      ob->complete = false;
    }
    for (xlist<C_ReadFinish*>::iterator q = ob->reads.begin();
         !q.end(); ++q) {
      C_ReadFinish *comp = *q;
      comp->distrust_enoent();
    }
  }
}

/**
 * discard object extents from an ObjectSet by removing the objects in
 * exls from the in-memory oset.
 */
void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls)
{
  ceph_assert(lock.is_locked());
  bool was_dirty = oset->dirty_or_tx > 0;

  _discard(oset, exls, nullptr);
  _discard_finish(oset, was_dirty, nullptr);
}

/**
 * discard object extents from an ObjectSet by removing the objects in
 * exls from the in-memory oset. If the bh is in TX state, the discard
 * will wait for the write to commit prior to invoking on_finish.
 */
void ObjectCacher::discard_writeback(ObjectSet *oset,
                                     const vector<ObjectExtent>& exls,
                                     Context* on_finish)
{
  ceph_assert(lock.is_locked());
  bool was_dirty = oset->dirty_or_tx > 0;

  C_GatherBuilder gather(cct);
  _discard(oset, exls, &gather);

  if (gather.has_subs()) {
    // some bhs are in TX: defer the callback (and possibly the flush
    // notification) until those writes commit
    bool flushed = was_dirty && oset->dirty_or_tx == 0;
    gather.set_finisher(new FunctionContext(
      [this, oset, flushed, on_finish](int) {
        ceph_assert(lock.is_locked());
        if (flushed && flush_set_callback)
          flush_set_callback(flush_set_callback_arg, oset);
        if (on_finish)
          on_finish->complete(0);
      }));
    gather.activate();
    return;
  }

  // nothing in flight; finish synchronously
  _discard_finish(oset, was_dirty, on_finish);
}

// helper: discard each extent from its cached object (if we have one).
// gather, when non-null, collects subs for in-TX buffers.
void ObjectCacher::_discard(ObjectSet *oset, const vector<ObjectExtent>& exls,
                            C_GatherBuilder* gather)
{
  if (oset->objects.empty()) {
    ldout(cct, 10) << __func__ << " on " << oset << " dne" << dendl;
    return;
  }

  ldout(cct, 10) << __func__ << " " << oset << dendl;

  for (auto& ex : exls) {
    ldout(cct, 10) << __func__ << " " << oset << " ex " << ex << dendl;
    sobject_t soid(ex.oid, CEPH_NOSNAP);
    if (objects[oset->poolid].count(soid) == 0)
      continue;
    Object *ob = objects[oset->poolid][soid];

    ob->discard(ex.offset, ex.length, gather);
  }
}

// helper: post-discard notification — fire the flush callback if the
// discard removed the last dirty/tx data, then complete on_finish.
void ObjectCacher::_discard_finish(ObjectSet *oset, bool was_dirty,
                                   Context* on_finish)
{
  ceph_assert(lock.is_locked());

  // did we truncate off dirty data?
  if (flush_set_callback && was_dirty && oset->dirty_or_tx == 0) {
    flush_set_callback(flush_set_callback_arg, oset);
  }

  // notify that in-flight writeback has completed
  if (on_finish != nullptr) {
    on_finish->complete(0);
  }
}

// debug: recount per-state byte totals from scratch and assert they match
// the incrementally-maintained stat_* counters.
void ObjectCacher::verify_stats() const
{
  ceph_assert(lock.is_locked());
  ldout(cct, 10) << "verify_stats" << dendl;

  loff_t clean = 0, zero = 0, dirty = 0, rx = 0, tx = 0, missing = 0,
    error = 0;
  for (vector<ceph::unordered_map<sobject_t, Object*> >::const_iterator i
         = objects.begin();
       i != objects.end();
       ++i) {
    for (ceph::unordered_map<sobject_t, Object*>::const_iterator p
           = i->begin();
         p != i->end();
         ++p) {
      Object *ob = p->second;
      for (map<loff_t, BufferHead*>::const_iterator q = ob->data.begin();
           q != ob->data.end();
           ++q) {
        BufferHead *bh = q->second;
        switch (bh->get_state()) {
        case BufferHead::STATE_MISSING:
          missing += bh->length();
          break;
        case BufferHead::STATE_CLEAN:
          clean += bh->length();
          break;
        case BufferHead::STATE_ZERO:
          zero += bh->length();
          break;
        case BufferHead::STATE_DIRTY:
          dirty += bh->length();
          break;
        case BufferHead::STATE_TX:
          tx += bh->length();
          break;
        case BufferHead::STATE_RX:
          rx += bh->length();
          break;
        case BufferHead::STATE_ERROR:
          error += bh->length();
          break;
        default:
          ceph_abort();
        }
      }
    }
  }

  ldout(cct, 10) << " clean " << clean << " rx " << rx << " tx " << tx
                 << " dirty " << dirty << " missing " << missing
                 << " error " << error << dendl;
  ceph_assert(clean == stat_clean);
  ceph_assert(rx == stat_rx);
  ceph_assert(tx == stat_tx);
  ceph_assert(dirty == stat_dirty);
  ceph_assert(missing == stat_missing);
  ceph_assert(zero == stat_zero);
  ceph_assert(error == stat_error);
}

// account a buffer head's bytes into the stat counter for its state.
// DIRTY/TX also contribute to the per-object and per-oset dirty_or_tx.
void ObjectCacher::bh_stat_add(BufferHead *bh)
{
  ceph_assert(lock.is_locked());
  switch (bh->get_state()) {
  case BufferHead::STATE_MISSING:
    stat_missing += bh->length();
    break;
  case BufferHead::STATE_CLEAN:
    stat_clean += bh->length();
    break;
  case BufferHead::STATE_ZERO:
    stat_zero += bh->length();
    break;
  case BufferHead::STATE_DIRTY:
    stat_dirty += bh->length();
    bh->ob->dirty_or_tx += bh->length();
    bh->ob->oset->dirty_or_tx += bh->length();
    break;
  case BufferHead::STATE_TX:
    stat_tx += bh->length();
    bh->ob->dirty_or_tx += bh->length();
    bh->ob->oset->dirty_or_tx += bh->length();
    break;
  case BufferHead::STATE_RX:
    stat_rx += bh->length();
    break;
  case BufferHead::STATE_ERROR:
    stat_error += bh->length();
    break;
  default:
    ceph_abort_msg("bh_stat_add: invalid bufferhead state");
  }
  // wake any writers blocked in maybe_wait_for_writeback()
  if (get_stat_dirty_waiting() > 0)
    stat_cond.Signal();
}

// inverse of bh_stat_add: remove the bh's bytes from its state's counter.
void ObjectCacher::bh_stat_sub(BufferHead *bh)
{
  ceph_assert(lock.is_locked());
  switch (bh->get_state()) {
  case BufferHead::STATE_MISSING:
    stat_missing -= bh->length();
    break;
  case BufferHead::STATE_CLEAN:
    stat_clean -= bh->length();
    break;
  case BufferHead::STATE_ZERO:
    stat_zero -= bh->length();
    break;
  case BufferHead::STATE_DIRTY:
    stat_dirty -= bh->length();
    bh->ob->dirty_or_tx -= bh->length();
    bh->ob->oset->dirty_or_tx -= bh->length();
    break;
  case BufferHead::STATE_TX:
    stat_tx -= bh->length();
    bh->ob->dirty_or_tx -= bh->length();
    bh->ob->oset->dirty_or_tx -= bh->length();
    break;
  case BufferHead::STATE_RX:
    stat_rx -= bh->length();
    break;
  case BufferHead::STATE_ERROR:
    stat_error -= bh->length();
    break;
  default:
    ceph_abort_msg("bh_stat_sub: invalid bufferhead state");
  }
}

// transition a buffer head between states, keeping the LRU lists, the
// dirty_or_tx_bh set, and the stat counters consistent with the new state.
void ObjectCacher::bh_set_state(BufferHead *bh, int s)
{
  ceph_assert(lock.is_locked());
  int state = bh->get_state();
  // move between lru lists?
  if (s == BufferHead::STATE_DIRTY && state != BufferHead::STATE_DIRTY) {
    bh_lru_rest.lru_remove(bh);
    bh_lru_dirty.lru_insert_top(bh);
  } else if (s != BufferHead::STATE_DIRTY && state == BufferHead::STATE_DIRTY) {
    bh_lru_dirty.lru_remove(bh);
    if (bh->get_dontneed())
      bh_lru_rest.lru_insert_bot(bh);
    else
      bh_lru_rest.lru_insert_top(bh);
  }

  // dirty_or_tx_bh membership tracks "is DIRTY or TX"; only entering or
  // leaving that group changes membership (DIRTY<->TX is a no-op here)
  if ((s == BufferHead::STATE_TX ||
       s == BufferHead::STATE_DIRTY) &&
      state != BufferHead::STATE_TX &&
      state != BufferHead::STATE_DIRTY) {
    dirty_or_tx_bh.insert(bh);
  } else if ((state == BufferHead::STATE_TX ||
              state == BufferHead::STATE_DIRTY) &&
             s != BufferHead::STATE_TX &&
             s != BufferHead::STATE_DIRTY) {
    dirty_or_tx_bh.erase(bh);
  }

  // leaving ERROR clears the stored error code
  if (s != BufferHead::STATE_ERROR &&
      state == BufferHead::STATE_ERROR) {
    bh->error = 0;
  }

  // set state
  bh_stat_sub(bh);
  bh->set_state(s);
  bh_stat_add(bh);
}

// register a new buffer head with its object and all cache-wide indexes.
void ObjectCacher::bh_add(Object *ob, BufferHead *bh)
{
  ceph_assert(lock.is_locked());
  ldout(cct, 30) << "bh_add " << *ob << " " << *bh << dendl;
  ob->add_bh(bh);
  if (bh->is_dirty()) {
    bh_lru_dirty.lru_insert_top(bh);
    dirty_or_tx_bh.insert(bh);
  } else {
    if (bh->get_dontneed())
      bh_lru_rest.lru_insert_bot(bh);
    else
      bh_lru_rest.lru_insert_top(bh);
  }

  if (bh->is_tx()) {
    dirty_or_tx_bh.insert(bh);
  }
  bh_stat_add(bh);
}

// unregister a buffer head from its object and all cache-wide indexes;
// caller must have detached any journal tid first.
void ObjectCacher::bh_remove(Object *ob, BufferHead *bh)
{
  ceph_assert(lock.is_locked());
  ceph_assert(bh->get_journal_tid() == 0);
  ldout(cct, 30) << "bh_remove " << *ob << " " << *bh << dendl;
  ob->remove_bh(bh);
  if (bh->is_dirty()) {
    bh_lru_dirty.lru_remove(bh);
    dirty_or_tx_bh.erase(bh);
  } else {
    bh_lru_rest.lru_remove(bh);
  }

  if (bh->is_tx()) {
    dirty_or_tx_bh.erase(bh);
  }
  bh_stat_sub(bh);
  // removing dirty bytes may unblock writers waiting on the dirty limit
  if (get_stat_dirty_waiting() > 0)
    stat_cond.Signal();
}

diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
new file mode 100644
index 00000000..a976f082
--- /dev/null
+++ b/src/osdc/ObjectCacher.h
@@ -0,0
+1,774 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OBJECTCACHER_H +#define CEPH_OBJECTCACHER_H + +#include "include/types.h" +#include "include/lru.h" +#include "include/Context.h" +#include "include/xlist.h" + +#include "common/Cond.h" +#include "common/Finisher.h" +#include "common/Thread.h" +#include "common/zipkin_trace.h" + +#include "Objecter.h" +#include "Striper.h" + +class CephContext; +class WritebackHandler; +class PerfCounters; + +enum { + l_objectcacher_first = 25000, + + l_objectcacher_cache_ops_hit, // ops we satisfy completely from cache + l_objectcacher_cache_ops_miss, // ops we don't satisfy completely from cache + + l_objectcacher_cache_bytes_hit, // bytes read directly from cache + + l_objectcacher_cache_bytes_miss, // bytes we couldn't read directly + + // from cache + + l_objectcacher_data_read, // total bytes read out + l_objectcacher_data_written, // bytes written to cache + l_objectcacher_data_flushed, // bytes flushed to WritebackHandler + l_objectcacher_overwritten_in_flush, // bytes overwritten while + // flushing is in progress + + l_objectcacher_write_ops_blocked, // total write ops we delayed due + // to dirty limits + l_objectcacher_write_bytes_blocked, // total number of write bytes + // we delayed due to dirty + // limits + l_objectcacher_write_time_blocked, // total time in seconds spent + // blocking a write due to dirty + // limits + + l_objectcacher_last, +}; + +class ObjectCacher { + PerfCounters *perfcounter; + public: + CephContext *cct; + class Object; + struct ObjectSet; + class C_ReadFinish; + + typedef void (*flush_set_callback_t) (void *p, ObjectSet *oset); + + // read scatter/gather + struct OSDRead { + vector<ObjectExtent> extents; + snapid_t snap; + bufferlist *bl; + int fadvise_flags; + OSDRead(snapid_t s, bufferlist *b, int f) + : snap(s), bl(b), fadvise_flags(f) {} + }; + + OSDRead *prepare_read(snapid_t snap, bufferlist *b, int f) const { + 
return new OSDRead(snap, b, f); + } + + // write scatter/gather + struct OSDWrite { + vector<ObjectExtent> extents; + SnapContext snapc; + bufferlist bl; + ceph::real_time mtime; + int fadvise_flags; + ceph_tid_t journal_tid; + OSDWrite(const SnapContext& sc, const bufferlist& b, ceph::real_time mt, + int f, ceph_tid_t _journal_tid) + : snapc(sc), bl(b), mtime(mt), fadvise_flags(f), + journal_tid(_journal_tid) {} + }; + + OSDWrite *prepare_write(const SnapContext& sc, + const bufferlist &b, + ceph::real_time mt, + int f, + ceph_tid_t journal_tid) const { + return new OSDWrite(sc, b, mt, f, journal_tid); + } + + + + // ******* BufferHead ********* + class BufferHead : public LRUObject { + public: + // states + static const int STATE_MISSING = 0; + static const int STATE_CLEAN = 1; + static const int STATE_ZERO = 2; // NOTE: these are *clean* zeros + static const int STATE_DIRTY = 3; + static const int STATE_RX = 4; + static const int STATE_TX = 5; + static const int STATE_ERROR = 6; // a read error occurred + + private: + // my fields + int state; + int ref; + struct { + loff_t start, length; // bh extent in object + } ex; + bool dontneed; //indicate bh don't need by anyone + bool nocache; //indicate bh don't need by this caller + + public: + Object *ob; + bufferlist bl; + ceph_tid_t last_write_tid; // version of bh (if non-zero) + ceph_tid_t last_read_tid; // tid of last read op (if any) + ceph::real_time last_write; + SnapContext snapc; + ceph_tid_t journal_tid; + int error; // holds return value for failed reads + + map<loff_t, list<Context*> > waitfor_read; + + // cons + explicit BufferHead(Object *o) : + state(STATE_MISSING), + ref(0), + dontneed(false), + nocache(false), + ob(o), + last_write_tid(0), + last_read_tid(0), + journal_tid(0), + error(0) { + ex.start = ex.length = 0; + } + + // extent + loff_t start() const { return ex.start; } + void set_start(loff_t s) { ex.start = s; } + loff_t length() const { return ex.length; } + void set_length(loff_t l) { 
ex.length = l; } + loff_t end() const { return ex.start + ex.length; } + loff_t last() const { return end() - 1; } + + // states + void set_state(int s) { + if (s == STATE_RX || s == STATE_TX) get(); + if (state == STATE_RX || state == STATE_TX) put(); + state = s; + } + int get_state() const { return state; } + + inline ceph_tid_t get_journal_tid() const { + return journal_tid; + } + inline void set_journal_tid(ceph_tid_t _journal_tid) { + journal_tid = _journal_tid; + } + + bool is_missing() const { return state == STATE_MISSING; } + bool is_dirty() const { return state == STATE_DIRTY; } + bool is_clean() const { return state == STATE_CLEAN; } + bool is_zero() const { return state == STATE_ZERO; } + bool is_tx() const { return state == STATE_TX; } + bool is_rx() const { return state == STATE_RX; } + bool is_error() const { return state == STATE_ERROR; } + + // reference counting + int get() { + ceph_assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + ceph_assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + + void set_dontneed(bool v) { + dontneed = v; + } + bool get_dontneed() const { + return dontneed; + } + + void set_nocache(bool v) { + nocache = v; + } + bool get_nocache() const { + return nocache; + } + + inline bool can_merge_journal(BufferHead *bh) const { + return (get_journal_tid() == bh->get_journal_tid()); + } + + struct ptr_lt { + bool operator()(const BufferHead* l, const BufferHead* r) const { + const Object *lob = l->ob; + const Object *rob = r->ob; + const ObjectSet *loset = lob->oset; + const ObjectSet *roset = rob->oset; + if (loset != roset) + return loset < roset; + if (lob != rob) + return lob < rob; + if (l->start() != r->start()) + return l->start() < r->start(); + return l < r; + } + }; + }; + + // ******* Object ********* + class Object : public LRUObject { + private: + // ObjectCacher::Object fields + int ref; + ObjectCacher *oc; + sobject_t oid; + friend struct ObjectSet; + + public: + 
uint64_t object_no; + ObjectSet *oset; + xlist<Object*>::item set_item; + object_locator_t oloc; + uint64_t truncate_size, truncate_seq; + + bool complete; + bool exists; + + map<loff_t, BufferHead*> data; + + ceph_tid_t last_write_tid; // version of bh (if non-zero) + ceph_tid_t last_commit_tid; // last update committed. + + int dirty_or_tx; + + map< ceph_tid_t, list<Context*> > waitfor_commit; + xlist<C_ReadFinish*> reads; + + Object(const Object&) = delete; + Object& operator=(const Object&) = delete; + + Object(ObjectCacher *_oc, sobject_t o, uint64_t ono, ObjectSet *os, + object_locator_t& l, uint64_t ts, uint64_t tq) : + ref(0), + oc(_oc), + oid(o), object_no(ono), oset(os), set_item(this), oloc(l), + truncate_size(ts), truncate_seq(tq), + complete(false), exists(true), + last_write_tid(0), last_commit_tid(0), + dirty_or_tx(0) { + // add to set + os->objects.push_back(&set_item); + } + ~Object() { + reads.clear(); + ceph_assert(ref == 0); + ceph_assert(data.empty()); + ceph_assert(dirty_or_tx == 0); + set_item.remove_myself(); + } + + sobject_t get_soid() const { return oid; } + object_t get_oid() { return oid.oid; } + snapid_t get_snap() { return oid.snap; } + ObjectSet *get_object_set() const { return oset; } + string get_namespace() { return oloc.nspace; } + uint64_t get_object_number() const { return object_no; } + + const object_locator_t& get_oloc() const { return oloc; } + void set_object_locator(object_locator_t& l) { oloc = l; } + + bool can_close() const { + if (lru_is_expireable()) { + ceph_assert(data.empty()); + ceph_assert(waitfor_commit.empty()); + return true; + } + return false; + } + + /** + * Check buffers and waiters for consistency + * - no overlapping buffers + * - index in map matches BH + * - waiters fall within BH + */ + void audit_buffers(); + + /** + * find first buffer that includes or follows an offset + * + * @param offset object byte offset + * @return iterator pointing to buffer, or data.end() + */ + 
map<loff_t,BufferHead*>::const_iterator data_lower_bound(loff_t offset) const { + map<loff_t,BufferHead*>::const_iterator p = data.lower_bound(offset); + if (p != data.begin() && + (p == data.end() || p->first > offset)) { + --p; // might overlap! + if (p->first + p->second->length() <= offset) + ++p; // doesn't overlap. + } + return p; + } + + // bh + // add to my map + void add_bh(BufferHead *bh) { + if (data.empty()) + get(); + ceph_assert(data.count(bh->start()) == 0); + data[bh->start()] = bh; + } + void remove_bh(BufferHead *bh) { + ceph_assert(data.count(bh->start())); + data.erase(bh->start()); + if (data.empty()) + put(); + } + + bool is_empty() const { return data.empty(); } + + // mid-level + BufferHead *split(BufferHead *bh, loff_t off); + void merge_left(BufferHead *left, BufferHead *right); + bool can_merge_bh(BufferHead *left, BufferHead *right); + void try_merge_bh(BufferHead *bh); + void maybe_rebuild_buffer(BufferHead *bh); + + bool is_cached(loff_t off, loff_t len) const; + bool include_all_cached_data(loff_t off, loff_t len); + int map_read(ObjectExtent &ex, + map<loff_t, BufferHead*>& hits, + map<loff_t, BufferHead*>& missing, + map<loff_t, BufferHead*>& rx, + map<loff_t, BufferHead*>& errors); + BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid); + + void replace_journal_tid(BufferHead *bh, ceph_tid_t tid); + void truncate(loff_t s); + void discard(loff_t off, loff_t len, C_GatherBuilder* commit_gather); + + // reference counting + int get() { + ceph_assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + ceph_assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + }; + + + struct ObjectSet { + void *parent; + + inodeno_t ino; + uint64_t truncate_seq, truncate_size; + + int64_t poolid; + xlist<Object*> objects; + + int dirty_or_tx; + bool return_enoent; + + ObjectSet(void *p, int64_t _poolid, inodeno_t i) + : parent(p), ino(i), truncate_seq(0), + truncate_size(0), poolid(_poolid), 
dirty_or_tx(0), + return_enoent(false) {} + + }; + + + // ******* ObjectCacher ********* + // ObjectCacher fields + private: + WritebackHandler& writeback_handler; + bool scattered_write; + + string name; + Mutex& lock; + + uint64_t max_dirty, target_dirty, max_size, max_objects; + ceph::timespan max_dirty_age; + bool block_writes_upfront; + + ZTracer::Endpoint trace_endpoint; + + flush_set_callback_t flush_set_callback; + void *flush_set_callback_arg; + + // indexed by pool_id + vector<ceph::unordered_map<sobject_t, Object*> > objects; + + list<Context*> waitfor_read; + + ceph_tid_t last_read_tid; + + set<BufferHead*, BufferHead::ptr_lt> dirty_or_tx_bh; + LRU bh_lru_dirty, bh_lru_rest; + LRU ob_lru; + + Cond flusher_cond; + bool flusher_stop; + void flusher_entry(); + class FlusherThread : public Thread { + ObjectCacher *oc; + public: + explicit FlusherThread(ObjectCacher *o) : oc(o) {} + void *entry() override { + oc->flusher_entry(); + return 0; + } + } flusher_thread; + + Finisher finisher; + + // objects + Object *get_object_maybe(sobject_t oid, object_locator_t &l) { + // have it? 
+ if (((uint32_t)l.pool < objects.size()) && + (objects[l.pool].count(oid))) + return objects[l.pool][oid]; + return NULL; + } + + Object *get_object(sobject_t oid, uint64_t object_no, ObjectSet *oset, + object_locator_t &l, uint64_t truncate_size, + uint64_t truncate_seq); + void close_object(Object *ob); + + // bh stats + Cond stat_cond; + + loff_t stat_clean; + loff_t stat_zero; + loff_t stat_dirty; + loff_t stat_rx; + loff_t stat_tx; + loff_t stat_missing; + loff_t stat_error; + loff_t stat_dirty_waiting; // bytes that writers are waiting on to write + + size_t stat_nr_dirty_waiters; + + void verify_stats() const; + + void bh_stat_add(BufferHead *bh); + void bh_stat_sub(BufferHead *bh); + loff_t get_stat_tx() const { return stat_tx; } + loff_t get_stat_rx() const { return stat_rx; } + loff_t get_stat_dirty() const { return stat_dirty; } + loff_t get_stat_clean() const { return stat_clean; } + loff_t get_stat_zero() const { return stat_zero; } + loff_t get_stat_dirty_waiting() const { return stat_dirty_waiting; } + size_t get_stat_nr_dirty_waiters() const { return stat_nr_dirty_waiters; } + + void touch_bh(BufferHead *bh) { + if (bh->is_dirty()) + bh_lru_dirty.lru_touch(bh); + else + bh_lru_rest.lru_touch(bh); + + bh->set_dontneed(false); + bh->set_nocache(false); + touch_ob(bh->ob); + } + void touch_ob(Object *ob) { + ob_lru.lru_touch(ob); + } + void bottouch_ob(Object *ob) { + ob_lru.lru_bottouch(ob); + } + + // bh states + void bh_set_state(BufferHead *bh, int s); + void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { + bh_set_state(bh2, bh1->get_state()); + } + + void mark_missing(BufferHead *bh) { + bh_set_state(bh,BufferHead::STATE_MISSING); + } + void mark_clean(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_CLEAN); + } + void mark_zero(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_ZERO); + } + void mark_rx(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_RX); + } + void mark_tx(BufferHead *bh) { + bh_set_state(bh, 
BufferHead::STATE_TX); } + void mark_error(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_ERROR); + } + void mark_dirty(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_DIRTY); + bh_lru_dirty.lru_touch(bh); + //bh->set_dirty_stamp(ceph_clock_now()); + } + + void bh_add(Object *ob, BufferHead *bh); + void bh_remove(Object *ob, BufferHead *bh); + + // io + void bh_read(BufferHead *bh, int op_flags, + const ZTracer::Trace &parent_trace); + void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace); + void bh_write_scattered(list<BufferHead*>& blist); + void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff, + int64_t *amount, int *max_count); + + void trim(); + void flush(ZTracer::Trace *trace, loff_t amount=0); + + /** + * flush a range of buffers + * + * Flush any buffers that intersect the specified extent. If len==0, + * flush *all* buffers for the object. + * + * @param o object + * @param off start offset + * @param len extent length, or 0 for entire object + * @return true if object was already clean/flushed. 
+ */ + bool flush(Object *o, loff_t off, loff_t len, + ZTracer::Trace *trace); + loff_t release(Object *o); + void purge(Object *o); + + int64_t reads_outstanding; + Cond read_cond; + + int _readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + bool external_call, ZTracer::Trace *trace); + void retry_waiting_reads(); + + public: + void bh_read_finish(int64_t poolid, sobject_t oid, ceph_tid_t tid, + loff_t offset, uint64_t length, + bufferlist &bl, int r, + bool trust_enoent); + void bh_write_commit(int64_t poolid, sobject_t oid, + vector<pair<loff_t, uint64_t> >& ranges, + ceph_tid_t t, int r); + + class C_WriteCommit; + class C_WaitForWrite; + + void perf_start(); + void perf_stop(); + + + + ObjectCacher(CephContext *cct_, string name, WritebackHandler& wb, Mutex& l, + flush_set_callback_t flush_callback, + void *flush_callback_arg, + uint64_t max_bytes, uint64_t max_objects, + uint64_t max_dirty, uint64_t target_dirty, double max_age, + bool block_writes_upfront); + ~ObjectCacher(); + + void start() { + flusher_thread.create("flusher"); + } + void stop() { + ceph_assert(flusher_thread.is_started()); + lock.Lock(); // hmm.. watch out for deadlock! + flusher_stop = true; + flusher_cond.Signal(); + lock.Unlock(); + flusher_thread.join(); + } + + + class C_RetryRead; + + + // non-blocking. async. 
+ + /** + * @note total read size must be <= INT_MAX, since + * the return value is total bytes read + */ + int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, + ZTracer::Trace *parent_trace = nullptr); + int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, + ZTracer::Trace *parent_trace = nullptr); + bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents, + snapid_t snapid); + +private: + // write blocking + int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, + ZTracer::Trace *trace, Context *onfreespace); + void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace); + bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish); + + void _discard(ObjectSet *oset, const vector<ObjectExtent>& exls, + C_GatherBuilder* gather); + void _discard_finish(ObjectSet *oset, bool was_dirty, Context* on_finish); + +public: + bool set_is_empty(ObjectSet *oset); + bool set_is_cached(ObjectSet *oset); + bool set_is_dirty_or_committing(ObjectSet *oset); + + bool flush_set(ObjectSet *oset, Context *onfinish=0); + bool flush_set(ObjectSet *oset, vector<ObjectExtent>& ex, + ZTracer::Trace *trace, Context *onfinish = 0); + bool flush_all(Context *onfinish = 0); + + void purge_set(ObjectSet *oset); + + // returns # of bytes not released (ie non-clean) + loff_t release_set(ObjectSet *oset); + uint64_t release_all(); + + void discard_set(ObjectSet *oset, const vector<ObjectExtent>& ex); + void discard_writeback(ObjectSet *oset, const vector<ObjectExtent>& ex, + Context* on_finish); + + /** + * Retry any in-flight reads that get -ENOENT instead of marking + * them zero, and get rid of any cached -ENOENTs. + * After this is called and the cache's lock is unlocked, + * any new requests will treat -ENOENT normally. 
+ */ + void clear_nonexistence(ObjectSet *oset); + + + // cache sizes + void set_max_dirty(uint64_t v) { + max_dirty = v; + } + void set_target_dirty(int64_t v) { + target_dirty = v; + } + void set_max_size(int64_t v) { + max_size = v; + } + void set_max_dirty_age(double a) { + max_dirty_age = make_timespan(a); + } + void set_max_objects(int64_t v) { + max_objects = v; + } + + + // file functions + + /*** async+caching (non-blocking) file interface ***/ + int file_is_cached(ObjectSet *oset, file_layout_t *layout, + snapid_t snapid, loff_t offset, uint64_t len) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, extents); + return is_cached(oset, extents, snapid); + } + + int file_read(ObjectSet *oset, file_layout_t *layout, snapid_t snapid, + loff_t offset, uint64_t len, bufferlist *bl, int flags, + Context *onfinish) { + OSDRead *rd = prepare_read(snapid, bl, flags); + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, rd->extents); + return readx(rd, oset, onfinish); + } + + int file_write(ObjectSet *oset, file_layout_t *layout, + const SnapContext& snapc, loff_t offset, uint64_t len, + bufferlist& bl, ceph::real_time mtime, int flags) { + OSDWrite *wr = prepare_write(snapc, bl, mtime, flags, 0); + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, wr->extents); + return writex(wr, oset, NULL); + } + + bool file_flush(ObjectSet *oset, file_layout_t *layout, + const SnapContext& snapc, loff_t offset, uint64_t len, + Context *onfinish) { + vector<ObjectExtent> extents; + Striper::file_to_extents(cct, oset->ino, layout, offset, len, + oset->truncate_size, extents); + ZTracer::Trace trace; + return flush_set(oset, extents, &trace, onfinish); + } +}; + + +inline ostream& operator<<(ostream &out, const ObjectCacher::BufferHead &bh) +{ + out << "bh[ " << &bh << " " + << bh.start() << "~" << bh.length() + << " " << bh.ob + << " (" << 
bh.bl.length() << ")" + << " v " << bh.last_write_tid; + if (bh.get_journal_tid() != 0) { + out << " j " << bh.get_journal_tid(); + } + if (bh.is_tx()) out << " tx"; + if (bh.is_rx()) out << " rx"; + if (bh.is_dirty()) out << " dirty"; + if (bh.is_clean()) out << " clean"; + if (bh.is_zero()) out << " zero"; + if (bh.is_missing()) out << " missing"; + if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; + if (bh.error) out << " error=" << bh.error; + out << "]"; + out << " waiters = {"; + for (map<loff_t, list<Context*> >::const_iterator it + = bh.waitfor_read.begin(); + it != bh.waitfor_read.end(); ++it) { + out << " " << it->first << "->["; + for (list<Context*>::const_iterator lit = it->second.begin(); + lit != it->second.end(); ++lit) { + out << *lit << ", "; + } + out << "]"; + } + out << "}"; + return out; +} + +inline ostream& operator<<(ostream &out, const ObjectCacher::ObjectSet &os) +{ + return out << "objectset[" << os.ino + << " ts " << os.truncate_seq << "/" << os.truncate_size + << " objects " << os.objects.size() + << " dirty_or_tx " << os.dirty_or_tx + << "]"; +} + +inline ostream& operator<<(ostream &out, const ObjectCacher::Object &ob) +{ + out << "object[" + << ob.get_soid() << " oset " << ob.oset << dec + << " wr " << ob.last_write_tid << "/" << ob.last_commit_tid; + + if (ob.complete) + out << " COMPLETE"; + if (!ob.exists) + out << " !EXISTS"; + + out << "]"; + return out; +} + +#endif diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc new file mode 100644 index 00000000..bc39114a --- /dev/null +++ b/src/osdc/Objecter.cc @@ -0,0 +1,5285 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + 
* Foundation. See file COPYING. + * + */ + +#include "Objecter.h" +#include "osd/OSDMap.h" +#include "Filer.h" + +#include "mon/MonClient.h" + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "messages/MPing.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDMap.h" + +#include "messages/MPoolOp.h" +#include "messages/MPoolOpReply.h" + +#include "messages/MGetPoolStats.h" +#include "messages/MGetPoolStatsReply.h" +#include "messages/MStatfs.h" +#include "messages/MStatfsReply.h" + +#include "messages/MMonCommand.h" + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" + +#include "messages/MWatchNotify.h" + +#include <errno.h> + +#include "common/config.h" +#include "common/perf_counters.h" +#include "common/scrub_types.h" +#include "include/str_list.h" +#include "common/errno.h" +#include "common/EventTrace.h" + +using ceph::real_time; +using ceph::real_clock; + +using ceph::mono_clock; +using ceph::mono_time; + +using ceph::timespan; + + +#define dout_subsys ceph_subsys_objecter +#undef dout_prefix +#define dout_prefix *_dout << messenger->get_myname() << ".objecter " + + +enum { + l_osdc_first = 123200, + l_osdc_op_active, + l_osdc_op_laggy, + l_osdc_op_send, + l_osdc_op_send_bytes, + l_osdc_op_resend, + l_osdc_op_reply, + + l_osdc_op, + l_osdc_op_r, + l_osdc_op_w, + l_osdc_op_rmw, + l_osdc_op_pg, + + l_osdc_osdop_stat, + l_osdc_osdop_create, + l_osdc_osdop_read, + l_osdc_osdop_write, + l_osdc_osdop_writefull, + l_osdc_osdop_writesame, + l_osdc_osdop_append, + l_osdc_osdop_zero, + l_osdc_osdop_truncate, + l_osdc_osdop_delete, + l_osdc_osdop_mapext, + l_osdc_osdop_sparse_read, + l_osdc_osdop_clonerange, + l_osdc_osdop_getxattr, + l_osdc_osdop_setxattr, + l_osdc_osdop_cmpxattr, + l_osdc_osdop_rmxattr, + l_osdc_osdop_resetxattrs, + l_osdc_osdop_call, + l_osdc_osdop_watch, + l_osdc_osdop_notify, + l_osdc_osdop_src_cmpxattr, + l_osdc_osdop_pgls, + 
l_osdc_osdop_pgls_filter, + l_osdc_osdop_other, + + l_osdc_linger_active, + l_osdc_linger_send, + l_osdc_linger_resend, + l_osdc_linger_ping, + + l_osdc_poolop_active, + l_osdc_poolop_send, + l_osdc_poolop_resend, + + l_osdc_poolstat_active, + l_osdc_poolstat_send, + l_osdc_poolstat_resend, + + l_osdc_statfs_active, + l_osdc_statfs_send, + l_osdc_statfs_resend, + + l_osdc_command_active, + l_osdc_command_send, + l_osdc_command_resend, + + l_osdc_map_epoch, + l_osdc_map_full, + l_osdc_map_inc, + + l_osdc_osd_sessions, + l_osdc_osd_session_open, + l_osdc_osd_session_close, + l_osdc_osd_laggy, + + l_osdc_osdop_omap_wr, + l_osdc_osdop_omap_rd, + l_osdc_osdop_omap_del, + + l_osdc_last, +}; + + +// config obs ---------------------------- + +static const char *config_keys[] = { + "crush_location", + NULL +}; + +class Objecter::RequestStateHook : public AdminSocketHook { + Objecter *m_objecter; +public: + explicit RequestStateHook(Objecter *objecter); + bool call(std::string_view command, const cmdmap_t& cmdmap, + std::string_view format, bufferlist& out) override; +}; + +/** + * This is a more limited form of C_Contexts, but that requires + * a ceph_context which we don't have here. 
+ */ +class ObjectOperation::C_TwoContexts : public Context { + Context *first; + Context *second; +public: + C_TwoContexts(Context *first, Context *second) : + first(first), second(second) {} + void finish(int r) override { + first->complete(r); + second->complete(r); + first = NULL; + second = NULL; + } + + ~C_TwoContexts() override { + delete first; + delete second; + } +}; + +void ObjectOperation::add_handler(Context *extra) { + size_t last = out_handler.size() - 1; + Context *orig = out_handler[last]; + if (orig) { + Context *wrapper = new C_TwoContexts(orig, extra); + out_handler[last] = wrapper; + } else { + out_handler[last] = extra; + } +} + +Objecter::OSDSession::unique_completion_lock Objecter::OSDSession::get_lock( + object_t& oid) +{ + if (oid.name.empty()) + return unique_completion_lock(); + + static constexpr uint32_t HASH_PRIME = 1021; + uint32_t h = ceph_str_hash_linux(oid.name.c_str(), oid.name.size()) + % HASH_PRIME; + + return unique_completion_lock(completion_locks[h % num_locks], + std::defer_lock); +} + +const char** Objecter::get_tracked_conf_keys() const +{ + return config_keys; +} + + +void Objecter::handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) +{ + if (changed.count("crush_location")) { + update_crush_location(); + } +} + +void Objecter::update_crush_location() +{ + unique_lock wl(rwlock); + crush_location = cct->crush_location.get_location(); +} + +// messages ------------------------------ + +/* + * initialize only internal data structures, don't initiate cluster interaction + */ +void Objecter::init() +{ + ceph_assert(!initialized); + + if (!logger) { + PerfCountersBuilder pcb(cct, "objecter", l_osdc_first, l_osdc_last); + + pcb.add_u64(l_osdc_op_active, "op_active", "Operations active", "actv", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64(l_osdc_op_laggy, "op_laggy", "Laggy operations"); + pcb.add_u64_counter(l_osdc_op_send, "op_send", "Sent operations"); + 
pcb.add_u64_counter(l_osdc_op_send_bytes, "op_send_bytes", "Sent data", NULL, 0, unit_t(UNIT_BYTES)); + pcb.add_u64_counter(l_osdc_op_resend, "op_resend", "Resent operations"); + pcb.add_u64_counter(l_osdc_op_reply, "op_reply", "Operation reply"); + + pcb.add_u64_counter(l_osdc_op, "op", "Operations"); + pcb.add_u64_counter(l_osdc_op_r, "op_r", "Read operations", "rd", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64_counter(l_osdc_op_w, "op_w", "Write operations", "wr", + PerfCountersBuilder::PRIO_CRITICAL); + pcb.add_u64_counter(l_osdc_op_rmw, "op_rmw", "Read-modify-write operations", + "rdwr", PerfCountersBuilder::PRIO_INTERESTING); + pcb.add_u64_counter(l_osdc_op_pg, "op_pg", "PG operation"); + + pcb.add_u64_counter(l_osdc_osdop_stat, "osdop_stat", "Stat operations"); + pcb.add_u64_counter(l_osdc_osdop_create, "osdop_create", + "Create object operations"); + pcb.add_u64_counter(l_osdc_osdop_read, "osdop_read", "Read operations"); + pcb.add_u64_counter(l_osdc_osdop_write, "osdop_write", "Write operations"); + pcb.add_u64_counter(l_osdc_osdop_writefull, "osdop_writefull", + "Write full object operations"); + pcb.add_u64_counter(l_osdc_osdop_writesame, "osdop_writesame", + "Write same operations"); + pcb.add_u64_counter(l_osdc_osdop_append, "osdop_append", + "Append operation"); + pcb.add_u64_counter(l_osdc_osdop_zero, "osdop_zero", + "Set object to zero operations"); + pcb.add_u64_counter(l_osdc_osdop_truncate, "osdop_truncate", + "Truncate object operations"); + pcb.add_u64_counter(l_osdc_osdop_delete, "osdop_delete", + "Delete object operations"); + pcb.add_u64_counter(l_osdc_osdop_mapext, "osdop_mapext", + "Map extent operations"); + pcb.add_u64_counter(l_osdc_osdop_sparse_read, "osdop_sparse_read", + "Sparse read operations"); + pcb.add_u64_counter(l_osdc_osdop_clonerange, "osdop_clonerange", + "Clone range operations"); + pcb.add_u64_counter(l_osdc_osdop_getxattr, "osdop_getxattr", + "Get xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_setxattr, 
"osdop_setxattr", + "Set xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_cmpxattr, "osdop_cmpxattr", + "Xattr comparison operations"); + pcb.add_u64_counter(l_osdc_osdop_rmxattr, "osdop_rmxattr", + "Remove xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_resetxattrs, "osdop_resetxattrs", + "Reset xattr operations"); + pcb.add_u64_counter(l_osdc_osdop_call, "osdop_call", + "Call (execute) operations"); + pcb.add_u64_counter(l_osdc_osdop_watch, "osdop_watch", + "Watch by object operations"); + pcb.add_u64_counter(l_osdc_osdop_notify, "osdop_notify", + "Notify about object operations"); + pcb.add_u64_counter(l_osdc_osdop_src_cmpxattr, "osdop_src_cmpxattr", + "Extended attribute comparison in multi operations"); + pcb.add_u64_counter(l_osdc_osdop_pgls, "osdop_pgls"); + pcb.add_u64_counter(l_osdc_osdop_pgls_filter, "osdop_pgls_filter"); + pcb.add_u64_counter(l_osdc_osdop_other, "osdop_other", "Other operations"); + + pcb.add_u64(l_osdc_linger_active, "linger_active", + "Active lingering operations"); + pcb.add_u64_counter(l_osdc_linger_send, "linger_send", + "Sent lingering operations"); + pcb.add_u64_counter(l_osdc_linger_resend, "linger_resend", + "Resent lingering operations"); + pcb.add_u64_counter(l_osdc_linger_ping, "linger_ping", + "Sent pings to lingering operations"); + + pcb.add_u64(l_osdc_poolop_active, "poolop_active", + "Active pool operations"); + pcb.add_u64_counter(l_osdc_poolop_send, "poolop_send", + "Sent pool operations"); + pcb.add_u64_counter(l_osdc_poolop_resend, "poolop_resend", + "Resent pool operations"); + + pcb.add_u64(l_osdc_poolstat_active, "poolstat_active", + "Active get pool stat operations"); + pcb.add_u64_counter(l_osdc_poolstat_send, "poolstat_send", + "Pool stat operations sent"); + pcb.add_u64_counter(l_osdc_poolstat_resend, "poolstat_resend", + "Resent pool stats"); + + pcb.add_u64(l_osdc_statfs_active, "statfs_active", "Statfs operations"); + pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send", "Sent FS stats"); + 
pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend", + "Resent FS stats"); + + pcb.add_u64(l_osdc_command_active, "command_active", "Active commands"); + pcb.add_u64_counter(l_osdc_command_send, "command_send", + "Sent commands"); + pcb.add_u64_counter(l_osdc_command_resend, "command_resend", + "Resent commands"); + + pcb.add_u64(l_osdc_map_epoch, "map_epoch", "OSD map epoch"); + pcb.add_u64_counter(l_osdc_map_full, "map_full", + "Full OSD maps received"); + pcb.add_u64_counter(l_osdc_map_inc, "map_inc", + "Incremental OSD maps received"); + + pcb.add_u64(l_osdc_osd_sessions, "osd_sessions", + "Open sessions"); // open sessions + pcb.add_u64_counter(l_osdc_osd_session_open, "osd_session_open", + "Sessions opened"); + pcb.add_u64_counter(l_osdc_osd_session_close, "osd_session_close", + "Sessions closed"); + pcb.add_u64(l_osdc_osd_laggy, "osd_laggy", "Laggy OSD sessions"); + + pcb.add_u64_counter(l_osdc_osdop_omap_wr, "omap_wr", + "OSD OMAP write operations"); + pcb.add_u64_counter(l_osdc_osdop_omap_rd, "omap_rd", + "OSD OMAP read operations"); + pcb.add_u64_counter(l_osdc_osdop_omap_del, "omap_del", + "OSD OMAP delete operations"); + + logger = pcb.create_perf_counters(); + cct->get_perfcounters_collection()->add(logger); + } + + m_request_state_hook = new RequestStateHook(this); + AdminSocket* admin_socket = cct->get_admin_socket(); + int ret = admin_socket->register_command("objecter_requests", + "objecter_requests", + m_request_state_hook, + "show in-progress osd requests"); + + /* Don't warn on EEXIST, happens if multiple ceph clients + * are instantiated from one process */ + if (ret < 0 && ret != -EEXIST) { + lderr(cct) << "error registering admin socket command: " + << cpp_strerror(ret) << dendl; + } + + update_crush_location(); + + cct->_conf.add_observer(this); + + initialized = true; +} + +/* + * ok, cluster interaction can happen + */ +void Objecter::start(const OSDMap* o) +{ + shared_lock rl(rwlock); + + start_tick(); + if (o) { + 
osdmap->deepish_copy_from(*o); + } else if (osdmap->get_epoch() == 0) { + _maybe_request_map(); + } +} + +void Objecter::shutdown() +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + initialized = false; + + wl.unlock(); + cct->_conf.remove_observer(this); + wl.lock(); + + map<int,OSDSession*>::iterator p; + while (!osd_sessions.empty()) { + p = osd_sessions.begin(); + close_session(p->second); + } + + while(!check_latest_map_lingers.empty()) { + map<uint64_t, LingerOp*>::iterator i = check_latest_map_lingers.begin(); + i->second->put(); + check_latest_map_lingers.erase(i->first); + } + + while(!check_latest_map_ops.empty()) { + map<ceph_tid_t, Op*>::iterator i = check_latest_map_ops.begin(); + i->second->put(); + check_latest_map_ops.erase(i->first); + } + + while(!check_latest_map_commands.empty()) { + map<ceph_tid_t, CommandOp*>::iterator i + = check_latest_map_commands.begin(); + i->second->put(); + check_latest_map_commands.erase(i->first); + } + + while(!poolstat_ops.empty()) { + map<ceph_tid_t,PoolStatOp*>::iterator i = poolstat_ops.begin(); + delete i->second; + poolstat_ops.erase(i->first); + } + + while(!statfs_ops.empty()) { + map<ceph_tid_t, StatfsOp*>::iterator i = statfs_ops.begin(); + delete i->second; + statfs_ops.erase(i->first); + } + + while(!pool_ops.empty()) { + map<ceph_tid_t, PoolOp*>::iterator i = pool_ops.begin(); + delete i->second; + pool_ops.erase(i->first); + } + + ldout(cct, 20) << __func__ << " clearing up homeless session..." 
<< dendl; + while(!homeless_session->linger_ops.empty()) { + std::map<uint64_t, LingerOp*>::iterator i + = homeless_session->linger_ops.begin(); + ldout(cct, 10) << " linger_op " << i->first << dendl; + LingerOp *lop = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_linger_op_remove(homeless_session, lop); + } + linger_ops.erase(lop->linger_id); + linger_ops_set.erase(lop); + lop->put(); + } + + while(!homeless_session->ops.empty()) { + std::map<ceph_tid_t, Op*>::iterator i = homeless_session->ops.begin(); + ldout(cct, 10) << " op " << i->first << dendl; + Op *op = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_op_remove(homeless_session, op); + } + op->put(); + } + + while(!homeless_session->command_ops.empty()) { + std::map<ceph_tid_t, CommandOp*>::iterator i + = homeless_session->command_ops.begin(); + ldout(cct, 10) << " command_op " << i->first << dendl; + CommandOp *cop = i->second; + { + OSDSession::unique_lock swl(homeless_session->lock); + _session_command_op_remove(homeless_session, cop); + } + cop->put(); + } + + if (tick_event) { + if (timer.cancel_event(tick_event)) { + ldout(cct, 10) << " successfully canceled tick" << dendl; + } + tick_event = 0; + } + + if (logger) { + cct->get_perfcounters_collection()->remove(logger); + delete logger; + logger = NULL; + } + + // Let go of Objecter write lock so timer thread can shutdown + wl.unlock(); + + // Outside of lock to avoid cycle WRT calls to RequestStateHook + // This is safe because we guarantee no concurrent calls to + // shutdown() with the ::initialized check at start. 
+ if (m_request_state_hook) { + AdminSocket* admin_socket = cct->get_admin_socket(); + admin_socket->unregister_command("objecter_requests"); + delete m_request_state_hook; + m_request_state_hook = NULL; + } +} + +void Objecter::_send_linger(LingerOp *info, + shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + vector<OSDOp> opv; + Context *oncommit = NULL; + LingerOp::shared_lock watchl(info->watch_lock); + bufferlist *poutbl = NULL; + if (info->registered && info->is_watch) { + ldout(cct, 15) << "send_linger " << info->linger_id << " reconnect" + << dendl; + opv.push_back(OSDOp()); + opv.back().op.op = CEPH_OSD_OP_WATCH; + opv.back().op.watch.cookie = info->get_cookie(); + opv.back().op.watch.op = CEPH_OSD_WATCH_OP_RECONNECT; + opv.back().op.watch.gen = ++info->register_gen; + oncommit = new C_Linger_Reconnect(this, info); + } else { + ldout(cct, 15) << "send_linger " << info->linger_id << " register" + << dendl; + opv = info->ops; + C_Linger_Commit *c = new C_Linger_Commit(this, info); + if (!info->is_watch) { + info->notify_id = 0; + poutbl = &c->outbl; + } + oncommit = c; + } + watchl.unlock(); + Op *o = new Op(info->target.base_oid, info->target.base_oloc, + opv, info->target.flags | CEPH_OSD_FLAG_READ, + oncommit, info->pobjver); + o->outbl = poutbl; + o->snapid = info->snap; + o->snapc = info->snapc; + o->mtime = info->mtime; + + o->target = info->target; + o->tid = ++last_tid; + + // do not resend this; we will send a new op to reregister + o->should_resend = false; + o->ctx_budgeted = true; + + if (info->register_tid) { + // repeat send. cancel old registration op, if any. 
+ OSDSession::unique_lock sl(info->session->lock); + if (info->session->ops.count(info->register_tid)) { + Op *o = info->session->ops[info->register_tid]; + _op_cancel_map_check(o); + _cancel_linger_op(o); + } + sl.unlock(); + } + + _op_submit_with_budget(o, sul, &info->register_tid, &info->ctx_budget); + + logger->inc(l_osdc_linger_send); +} + +void Objecter::_linger_commit(LingerOp *info, int r, bufferlist& outbl) +{ + LingerOp::unique_lock wl(info->watch_lock); + ldout(cct, 10) << "_linger_commit " << info->linger_id << dendl; + if (info->on_reg_commit) { + info->on_reg_commit->complete(r); + info->on_reg_commit = NULL; + } + if (r < 0 && info->on_notify_finish) { + info->on_notify_finish->complete(r); + info->on_notify_finish = nullptr; + } + + // only tell the user the first time we do this + info->registered = true; + info->pobjver = NULL; + + if (!info->is_watch) { + // make note of the notify_id + auto p = outbl.cbegin(); + try { + decode(info->notify_id, p); + ldout(cct, 10) << "_linger_commit notify_id=" << info->notify_id + << dendl; + } + catch (buffer::error& e) { + } + } +} + +struct C_DoWatchError : public Context { + Objecter *objecter; + Objecter::LingerOp *info; + int err; + C_DoWatchError(Objecter *o, Objecter::LingerOp *i, int r) + : objecter(o), info(i), err(r) { + info->get(); + info->_queued_async(); + } + void finish(int r) override { + Objecter::unique_lock wl(objecter->rwlock); + bool canceled = info->canceled; + wl.unlock(); + + if (!canceled) { + info->watch_context->handle_error(info->get_cookie(), err); + } + + info->finished_async(); + info->put(); + } +}; + +int Objecter::_normalize_watch_error(int r) +{ + // translate ENOENT -> ENOTCONN so that a delete->disconnection + // notification and a failure to reconnect because we raced with + // the delete appear the same to the user. 
+ if (r == -ENOENT) + r = -ENOTCONN; + return r; +} + +void Objecter::_linger_reconnect(LingerOp *info, int r) +{ + ldout(cct, 10) << __func__ << " " << info->linger_id << " = " << r + << " (last_error " << info->last_error << ")" << dendl; + if (r < 0) { + LingerOp::unique_lock wl(info->watch_lock); + if (!info->last_error) { + r = _normalize_watch_error(r); + info->last_error = r; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, r)); + } + } + wl.unlock(); + } +} + +void Objecter::_send_linger_ping(LingerOp *info) +{ + // rwlock is locked unique + // info->session->lock is locked + + if (cct->_conf->objecter_inject_no_watch_ping) { + ldout(cct, 10) << __func__ << " " << info->linger_id << " SKIPPING" + << dendl; + return; + } + if (osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + ldout(cct, 10) << __func__ << " PAUSERD" << dendl; + return; + } + + ceph::coarse_mono_time now = ceph::coarse_mono_clock::now(); + ldout(cct, 10) << __func__ << " " << info->linger_id << " now " << now + << dendl; + + vector<OSDOp> opv(1); + opv[0].op.op = CEPH_OSD_OP_WATCH; + opv[0].op.watch.cookie = info->get_cookie(); + opv[0].op.watch.op = CEPH_OSD_WATCH_OP_PING; + opv[0].op.watch.gen = info->register_gen; + C_Linger_Ping *onack = new C_Linger_Ping(this, info); + Op *o = new Op(info->target.base_oid, info->target.base_oloc, + opv, info->target.flags | CEPH_OSD_FLAG_READ, + onack, NULL, NULL); + o->target = info->target; + o->should_resend = false; + _send_op_account(o); + o->tid = ++last_tid; + _session_op_assign(info->session, o); + _send_op(o); + info->ping_tid = o->tid; + + onack->sent = now; + logger->inc(l_osdc_linger_ping); +} + +void Objecter::_linger_ping(LingerOp *info, int r, ceph::coarse_mono_time sent, + uint32_t register_gen) +{ + LingerOp::unique_lock l(info->watch_lock); + ldout(cct, 10) << __func__ << " " << info->linger_id + << " sent " << sent << " gen " << register_gen << " = " << r + << " (last_error " << info->last_error + << " 
register_gen " << info->register_gen << ")" << dendl; + if (info->register_gen == register_gen) { + if (r == 0) { + info->watch_valid_thru = sent; + } else if (r < 0 && !info->last_error) { + r = _normalize_watch_error(r); + info->last_error = r; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, r)); + } + } + } else { + ldout(cct, 20) << " ignoring old gen" << dendl; + } +} + +int Objecter::linger_check(LingerOp *info) +{ + LingerOp::shared_lock l(info->watch_lock); + + ceph::coarse_mono_time stamp = info->watch_valid_thru; + if (!info->watch_pending_async.empty()) + stamp = std::min(info->watch_valid_thru, info->watch_pending_async.front()); + auto age = ceph::coarse_mono_clock::now() - stamp; + + ldout(cct, 10) << __func__ << " " << info->linger_id + << " err " << info->last_error + << " age " << age << dendl; + if (info->last_error) + return info->last_error; + // return a safe upper bound (we are truncating to ms) + return + 1 + std::chrono::duration_cast<std::chrono::milliseconds>(age).count(); +} + +void Objecter::linger_cancel(LingerOp *info) +{ + unique_lock wl(rwlock); + _linger_cancel(info); + info->put(); +} + +void Objecter::_linger_cancel(LingerOp *info) +{ + // rwlock is locked unique + ldout(cct, 20) << __func__ << " linger_id=" << info->linger_id << dendl; + if (!info->canceled) { + OSDSession *s = info->session; + OSDSession::unique_lock sl(s->lock); + _session_linger_op_remove(s, info); + sl.unlock(); + + linger_ops.erase(info->linger_id); + linger_ops_set.erase(info); + ceph_assert(linger_ops.size() == linger_ops_set.size()); + + info->canceled = true; + info->put(); + + logger->dec(l_osdc_linger_active); + } +} + + + +Objecter::LingerOp *Objecter::linger_register(const object_t& oid, + const object_locator_t& oloc, + int flags) +{ + LingerOp *info = new LingerOp(this); + info->target.base_oid = oid; + info->target.base_oloc = oloc; + if (info->target.base_oloc.key == oid) + info->target.base_oloc.key.clear(); + 
info->target.flags = flags; + info->watch_valid_thru = ceph::coarse_mono_clock::now(); + + unique_lock l(rwlock); + + // Acquire linger ID + info->linger_id = ++max_linger_id; + ldout(cct, 10) << __func__ << " info " << info + << " linger_id " << info->linger_id + << " cookie " << info->get_cookie() + << dendl; + linger_ops[info->linger_id] = info; + linger_ops_set.insert(info); + ceph_assert(linger_ops.size() == linger_ops_set.size()); + + info->get(); // for the caller + return info; +} + +ceph_tid_t Objecter::linger_watch(LingerOp *info, + ObjectOperation& op, + const SnapContext& snapc, + real_time mtime, + bufferlist& inbl, + Context *oncommit, + version_t *objver) +{ + info->is_watch = true; + info->snapc = snapc; + info->mtime = mtime; + info->target.flags |= CEPH_OSD_FLAG_WRITE; + info->ops = op.ops; + info->inbl = inbl; + info->poutbl = NULL; + info->pobjver = objver; + info->on_reg_commit = oncommit; + + info->ctx_budget = take_linger_budget(info); + + shunique_lock sul(rwlock, ceph::acquire_unique); + _linger_submit(info, sul); + logger->inc(l_osdc_linger_active); + + return info->linger_id; +} + +ceph_tid_t Objecter::linger_notify(LingerOp *info, + ObjectOperation& op, + snapid_t snap, bufferlist& inbl, + bufferlist *poutbl, + Context *onfinish, + version_t *objver) +{ + info->snap = snap; + info->target.flags |= CEPH_OSD_FLAG_READ; + info->ops = op.ops; + info->inbl = inbl; + info->poutbl = poutbl; + info->pobjver = objver; + info->on_reg_commit = onfinish; + + info->ctx_budget = take_linger_budget(info); + + shunique_lock sul(rwlock, ceph::acquire_unique); + _linger_submit(info, sul); + logger->inc(l_osdc_linger_active); + + return info->linger_id; +} + +void Objecter::_linger_submit(LingerOp *info, shunique_lock& sul) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + ceph_assert(info->linger_id); + ceph_assert(info->ctx_budget != -1); // caller needs to have taken budget already! 
+ + // Populate Op::target + OSDSession *s = NULL; + _calc_target(&info->target, nullptr); + + // Create LingerOp<->OSDSession relation + int r = _get_session(info->target.osd, &s, sul); + ceph_assert(r == 0); + OSDSession::unique_lock sl(s->lock); + _session_linger_op_assign(s, info); + sl.unlock(); + put_session(s); + + _send_linger(info, sul); +} + +struct C_DoWatchNotify : public Context { + Objecter *objecter; + Objecter::LingerOp *info; + MWatchNotify *msg; + C_DoWatchNotify(Objecter *o, Objecter::LingerOp *i, MWatchNotify *m) + : objecter(o), info(i), msg(m) { + info->get(); + info->_queued_async(); + msg->get(); + } + void finish(int r) override { + objecter->_do_watch_notify(info, msg); + } +}; + +void Objecter::handle_watch_notify(MWatchNotify *m) +{ + shared_lock l(rwlock); + if (!initialized) { + return; + } + + LingerOp *info = reinterpret_cast<LingerOp*>(m->cookie); + if (linger_ops_set.count(info) == 0) { + ldout(cct, 7) << __func__ << " cookie " << m->cookie << " dne" << dendl; + return; + } + LingerOp::unique_lock wl(info->watch_lock); + if (m->opcode == CEPH_WATCH_EVENT_DISCONNECT) { + if (!info->last_error) { + info->last_error = -ENOTCONN; + if (info->watch_context) { + finisher->queue(new C_DoWatchError(this, info, -ENOTCONN)); + } + } + } else if (!info->is_watch) { + // we have CEPH_WATCH_EVENT_NOTIFY_COMPLETE; we can do this inline + // since we know the only user (librados) is safe to call in + // fast-dispatch context + if (info->notify_id && + info->notify_id != m->notify_id) { + ldout(cct, 10) << __func__ << " reply notify " << m->notify_id + << " != " << info->notify_id << ", ignoring" << dendl; + } else if (info->on_notify_finish) { + info->notify_result_bl->claim(m->get_data()); + info->on_notify_finish->complete(m->return_code); + + // if we race with reconnect we might get a second notify; only + // notify the caller once! 
+ info->on_notify_finish = NULL; + } + } else { + finisher->queue(new C_DoWatchNotify(this, info, m)); + } +} + +void Objecter::_do_watch_notify(LingerOp *info, MWatchNotify *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + + shared_lock l(rwlock); + ceph_assert(initialized); + + if (info->canceled) { + l.unlock(); + goto out; + } + + // notify completion? + ceph_assert(info->is_watch); + ceph_assert(info->watch_context); + ceph_assert(m->opcode != CEPH_WATCH_EVENT_DISCONNECT); + + l.unlock(); + + switch (m->opcode) { + case CEPH_WATCH_EVENT_NOTIFY: + info->watch_context->handle_notify(m->notify_id, m->cookie, + m->notifier_gid, m->bl); + break; + } + + out: + info->finished_async(); + info->put(); + m->put(); +} + +bool Objecter::ms_dispatch(Message *m) +{ + ldout(cct, 10) << __func__ << " " << cct << " " << *m << dendl; + switch (m->get_type()) { + // these we exlusively handle + case CEPH_MSG_OSD_OPREPLY: + handle_osd_op_reply(static_cast<MOSDOpReply*>(m)); + return true; + + case CEPH_MSG_OSD_BACKOFF: + handle_osd_backoff(static_cast<MOSDBackoff*>(m)); + return true; + + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(static_cast<MWatchNotify*>(m)); + m->put(); + return true; + + case MSG_COMMAND_REPLY: + if (m->get_source().type() == CEPH_ENTITY_TYPE_OSD) { + handle_command_reply(static_cast<MCommandReply*>(m)); + return true; + } else { + return false; + } + + case MSG_GETPOOLSTATSREPLY: + handle_get_pool_stats_reply(static_cast<MGetPoolStatsReply*>(m)); + return true; + + case CEPH_MSG_POOLOP_REPLY: + handle_pool_op_reply(static_cast<MPoolOpReply*>(m)); + return true; + + case CEPH_MSG_STATFS_REPLY: + handle_fs_stats_reply(static_cast<MStatfsReply*>(m)); + return true; + + // these we give others a chance to inspect + + // MDS, OSD + case CEPH_MSG_OSD_MAP: + handle_osd_map(static_cast<MOSDMap*>(m)); + return false; + } + return false; +} + +void Objecter::_scan_requests( + OSDSession *s, + bool skipped_map, + bool cluster_full, + map<int64_t, 
bool> *pool_full_map, + map<ceph_tid_t, Op*>& need_resend, + list<LingerOp*>& need_resend_linger, + map<ceph_tid_t, CommandOp*>& need_resend_command, + shunique_lock& sul, + const mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps) +{ + ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock); + + list<LingerOp*> unregister_lingers; + + OSDSession::unique_lock sl(s->lock); + + // check for changed linger mappings (_before_ regular ops) + map<ceph_tid_t,LingerOp*>::iterator lp = s->linger_ops.begin(); + while (lp != s->linger_ops.end()) { + LingerOp *op = lp->second; + ceph_assert(op->session == s); + // check_linger_pool_dne() may touch linger_ops; prevent iterator + // invalidation + ++lp; + ldout(cct, 10) << " checking linger op " << op->linger_id << dendl; + bool unregister, force_resend_writes = cluster_full; + int r = _recalc_linger_op_target(op, sul); + if (pool_full_map) + force_resend_writes = force_resend_writes || + (*pool_full_map)[op->target.base_oloc.pool]; + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + if (!skipped_map && !force_resend_writes) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + need_resend_linger.push_back(op); + _linger_cancel_map_check(op); + break; + case RECALC_OP_TARGET_POOL_DNE: + _check_linger_pool_dne(op, &unregister); + if (unregister) { + ldout(cct, 10) << " need to unregister linger op " + << op->linger_id << dendl; + op->get(); + unregister_lingers.push_back(op); + } + break; + } + } + + // check for changed request mappings + map<ceph_tid_t,Op*>::iterator p = s->ops.begin(); + while (p != s->ops.end()) { + Op *op = p->second; + ++p; // check_op_pool_dne() may touch ops; prevent iterator invalidation + ldout(cct, 10) << " checking op " << op->tid << dendl; + _prune_snapc(osdmap->get_new_removed_snaps(), op); + if (skipped_map) { + _prune_snapc(*gap_removed_snaps, op); + } + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes 
|| + (*pool_full_map)[op->target.base_oloc.pool]; + int r = _calc_target(&op->target, + op->session ? op->session->con.get() : nullptr); + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + if (!skipped_map && !(force_resend_writes && op->respects_full())) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + _session_op_remove(op->session, op); + need_resend[op->tid] = op; + _op_cancel_map_check(op); + break; + case RECALC_OP_TARGET_POOL_DNE: + _check_op_pool_dne(op, &sl); + break; + } + } + + // commands + map<ceph_tid_t,CommandOp*>::iterator cp = s->command_ops.begin(); + while (cp != s->command_ops.end()) { + CommandOp *c = cp->second; + ++cp; + ldout(cct, 10) << " checking command " << c->tid << dendl; + bool force_resend_writes = cluster_full; + if (pool_full_map) + force_resend_writes = force_resend_writes || + (*pool_full_map)[c->target_pg.pool()]; + int r = _calc_command_target(c, sul); + switch (r) { + case RECALC_OP_TARGET_NO_ACTION: + // resend if skipped map; otherwise do nothing. 
+ if (!skipped_map && !force_resend_writes) + break; + // -- fall-thru -- + case RECALC_OP_TARGET_NEED_RESEND: + need_resend_command[c->tid] = c; + _session_command_op_remove(c->session, c); + _command_cancel_map_check(c); + break; + case RECALC_OP_TARGET_POOL_DNE: + case RECALC_OP_TARGET_OSD_DNE: + case RECALC_OP_TARGET_OSD_DOWN: + _check_command_map_dne(c); + break; + } + } + + sl.unlock(); + + for (list<LingerOp*>::iterator iter = unregister_lingers.begin(); + iter != unregister_lingers.end(); + ++iter) { + _linger_cancel(*iter); + (*iter)->put(); + } +} + +void Objecter::handle_osd_map(MOSDMap *m) +{ + shunique_lock sul(rwlock, acquire_unique); + if (!initialized) + return; + + ceph_assert(osdmap); + + if (m->fsid != monc->get_fsid()) { + ldout(cct, 0) << "handle_osd_map fsid " << m->fsid + << " != " << monc->get_fsid() << dendl; + return; + } + + bool was_pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool cluster_full = _osdmap_full_flag(); + bool was_pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || cluster_full || + _osdmap_has_pool_full(); + map<int64_t, bool> pool_full_map; + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) + pool_full_map[it->first] = _osdmap_pool_full(it->second); + + + list<LingerOp*> need_resend_linger; + map<ceph_tid_t, Op*> need_resend; + map<ceph_tid_t, CommandOp*> need_resend_command; + + if (m->get_last() <= osdmap->get_epoch()) { + ldout(cct, 3) << "handle_osd_map ignoring epochs [" + << m->get_first() << "," << m->get_last() + << "] <= " << osdmap->get_epoch() << dendl; + } else { + ldout(cct, 3) << "handle_osd_map got epochs [" + << m->get_first() << "," << m->get_last() + << "] > " << osdmap->get_epoch() << dendl; + + if (osdmap->get_epoch()) { + bool skipped_map = false; + // we want incrementals + for (epoch_t e = osdmap->get_epoch() + 1; + e <= m->get_last(); + e++) { + + if (osdmap->get_epoch() == e-1 && + m->incremental_maps.count(e)) { + 
ldout(cct, 3) << "handle_osd_map decoding incremental epoch " << e + << dendl; + OSDMap::Incremental inc(m->incremental_maps[e]); + osdmap->apply_incremental(inc); + + emit_blacklist_events(inc); + + logger->inc(l_osdc_map_inc); + } + else if (m->maps.count(e)) { + ldout(cct, 3) << "handle_osd_map decoding full epoch " << e << dendl; + OSDMap *new_osdmap = new OSDMap(); + new_osdmap->decode(m->maps[e]); + + emit_blacklist_events(*osdmap, *new_osdmap); + + osdmap = new_osdmap; + + logger->inc(l_osdc_map_full); + } + else { + if (e >= m->get_oldest()) { + ldout(cct, 3) << "handle_osd_map requesting missing epoch " + << osdmap->get_epoch()+1 << dendl; + _maybe_request_map(); + break; + } + ldout(cct, 3) << "handle_osd_map missing epoch " + << osdmap->get_epoch()+1 + << ", jumping to " << m->get_oldest() << dendl; + e = m->get_oldest() - 1; + skipped_map = true; + continue; + } + logger->set(l_osdc_map_epoch, osdmap->get_epoch()); + + cluster_full = cluster_full || _osdmap_full_flag(); + update_pool_full_map(pool_full_map); + + // check all outstanding requests on every epoch + for (auto& i : need_resend) { + _prune_snapc(osdmap->get_new_removed_snaps(), i.second); + if (skipped_map) { + _prune_snapc(m->gap_removed_snaps, i.second); + } + } + _scan_requests(homeless_session, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command, sul, + &m->gap_removed_snaps); + for (map<int,OSDSession*>::iterator p = osd_sessions.begin(); + p != osd_sessions.end(); ) { + OSDSession *s = p->second; + _scan_requests(s, skipped_map, cluster_full, + &pool_full_map, need_resend, + need_resend_linger, need_resend_command, sul, + &m->gap_removed_snaps); + ++p; + // osd down or addr change? + if (!osdmap->is_up(s->osd) || + (s->con && + s->con->get_peer_addrs() != osdmap->get_addrs(s->osd))) { + close_session(s); + } + } + + ceph_assert(e == osdmap->get_epoch()); + } + + } else { + // first map. we want the full thing. 
+ if (m->maps.count(m->get_last())) { + for (map<int,OSDSession*>::iterator p = osd_sessions.begin(); + p != osd_sessions.end(); ++p) { + OSDSession *s = p->second; + _scan_requests(s, false, false, NULL, need_resend, + need_resend_linger, need_resend_command, sul, + nullptr); + } + ldout(cct, 3) << "handle_osd_map decoding full epoch " + << m->get_last() << dendl; + osdmap->decode(m->maps[m->get_last()]); + + _scan_requests(homeless_session, false, false, NULL, + need_resend, need_resend_linger, + need_resend_command, sul, nullptr); + } else { + ldout(cct, 3) << "handle_osd_map hmm, i want a full map, requesting" + << dendl; + monc->sub_want("osdmap", 0, CEPH_SUBSCRIBE_ONETIME); + monc->renew_subs(); + } + } + } + + // make sure need_resend targets reflect latest map + for (auto p = need_resend.begin(); p != need_resend.end(); ) { + Op *op = p->second; + if (op->target.epoch < osdmap->get_epoch()) { + ldout(cct, 10) << __func__ << " checking op " << p->first << dendl; + int r = _calc_target(&op->target, nullptr); + if (r == RECALC_OP_TARGET_POOL_DNE) { + p = need_resend.erase(p); + _check_op_pool_dne(op, nullptr); + } else { + ++p; + } + } else { + ++p; + } + } + + bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || _osdmap_full_flag() + || _osdmap_has_pool_full(); + + // was/is paused? 
+ if (was_pauserd || was_pausewr || pauserd || pausewr || + osdmap->get_epoch() < epoch_barrier) { + _maybe_request_map(); + } + + // resend requests + for (map<ceph_tid_t, Op*>::iterator p = need_resend.begin(); + p != need_resend.end(); ++p) { + Op *op = p->second; + OSDSession *s = op->session; + bool mapped_session = false; + if (!s) { + int r = _map_session(&op->target, &s, sul); + ceph_assert(r == 0); + mapped_session = true; + } else { + get_session(s); + } + OSDSession::unique_lock sl(s->lock); + if (mapped_session) { + _session_op_assign(s, op); + } + if (op->should_resend) { + if (!op->session->is_homeless() && !op->target.paused) { + logger->inc(l_osdc_op_resend); + _send_op(op); + } + } else { + _op_cancel_map_check(op); + _cancel_linger_op(op); + } + sl.unlock(); + put_session(s); + } + for (list<LingerOp*>::iterator p = need_resend_linger.begin(); + p != need_resend_linger.end(); ++p) { + LingerOp *op = *p; + ceph_assert(op->session); + if (!op->session->is_homeless()) { + logger->inc(l_osdc_linger_resend); + _send_linger(op, sul); + } + } + for (map<ceph_tid_t,CommandOp*>::iterator p = need_resend_command.begin(); + p != need_resend_command.end(); ++p) { + CommandOp *c = p->second; + if (c->target.osd >= 0) { + _assign_command_session(c, sul); + if (c->session && !c->session->is_homeless()) { + _send_command(c); + } + } + } + + _dump_active(); + + // finish any Contexts that were waiting on a map update + map<epoch_t,list< pair< Context*, int > > >::iterator p = + waiting_for_map.begin(); + while (p != waiting_for_map.end() && + p->first <= osdmap->get_epoch()) { + //go through the list and call the onfinish methods + for (list<pair<Context*, int> >::iterator i = p->second.begin(); + i != p->second.end(); ++i) { + i->first->complete(i->second); + } + waiting_for_map.erase(p++); + } + + monc->sub_got("osdmap", osdmap->get_epoch()); + + if (!waiting_for_map.empty()) { + _maybe_request_map(); + } +} + +void Objecter::enable_blacklist_events() +{ + 
unique_lock wl(rwlock); + + blacklist_events_enabled = true; +} + +void Objecter::consume_blacklist_events(std::set<entity_addr_t> *events) +{ + unique_lock wl(rwlock); + + if (events->empty()) { + events->swap(blacklist_events); + } else { + for (const auto &i : blacklist_events) { + events->insert(i); + } + blacklist_events.clear(); + } +} + +void Objecter::emit_blacklist_events(const OSDMap::Incremental &inc) +{ + if (!blacklist_events_enabled) { + return; + } + + for (const auto &i : inc.new_blacklist) { + blacklist_events.insert(i.first); + } +} + +void Objecter::emit_blacklist_events(const OSDMap &old_osd_map, + const OSDMap &new_osd_map) +{ + if (!blacklist_events_enabled) { + return; + } + + std::set<entity_addr_t> old_set; + std::set<entity_addr_t> new_set; + + old_osd_map.get_blacklist(&old_set); + new_osd_map.get_blacklist(&new_set); + + std::set<entity_addr_t> delta_set; + std::set_difference( + new_set.begin(), new_set.end(), old_set.begin(), old_set.end(), + std::inserter(delta_set, delta_set.begin())); + blacklist_events.insert(delta_set.begin(), delta_set.end()); +} + +// op pool check + +void Objecter::C_Op_Map_Latest::finish(int r) +{ + if (r == -EAGAIN || r == -ECANCELED) + return; + + lgeneric_subdout(objecter->cct, objecter, 10) + << "op_map_latest r=" << r << " tid=" << tid + << " latest " << latest << dendl; + + Objecter::unique_lock wl(objecter->rwlock); + + map<ceph_tid_t, Op*>::iterator iter = + objecter->check_latest_map_ops.find(tid); + if (iter == objecter->check_latest_map_ops.end()) { + lgeneric_subdout(objecter->cct, objecter, 10) + << "op_map_latest op "<< tid << " not found" << dendl; + return; + } + + Op *op = iter->second; + objecter->check_latest_map_ops.erase(iter); + + lgeneric_subdout(objecter->cct, objecter, 20) + << "op_map_latest op "<< op << dendl; + + if (op->map_dne_bound == 0) + op->map_dne_bound = latest; + + OSDSession::unique_lock sl(op->session->lock, defer_lock); + objecter->_check_op_pool_dne(op, &sl); + + 
op->put(); +} + +int Objecter::pool_snap_by_name(int64_t poolid, const char *snap_name, + snapid_t *snap) const +{ + shared_lock rl(rwlock); + + auto& pools = osdmap->get_pools(); + auto iter = pools.find(poolid); + if (iter == pools.end()) { + return -ENOENT; + } + const pg_pool_t& pg_pool = iter->second; + for (auto p = pg_pool.snaps.begin(); + p != pg_pool.snaps.end(); + ++p) { + if (p->second.name == snap_name) { + *snap = p->first; + return 0; + } + } + return -ENOENT; +} + +int Objecter::pool_snap_get_info(int64_t poolid, snapid_t snap, + pool_snap_info_t *info) const +{ + shared_lock rl(rwlock); + + auto& pools = osdmap->get_pools(); + auto iter = pools.find(poolid); + if (iter == pools.end()) { + return -ENOENT; + } + const pg_pool_t& pg_pool = iter->second; + auto p = pg_pool.snaps.find(snap); + if (p == pg_pool.snaps.end()) + return -ENOENT; + *info = p->second; + + return 0; +} + +int Objecter::pool_snap_list(int64_t poolid, vector<uint64_t> *snaps) +{ + shared_lock rl(rwlock); + + const pg_pool_t *pi = osdmap->get_pg_pool(poolid); + if (!pi) + return -ENOENT; + for (map<snapid_t,pool_snap_info_t>::const_iterator p = pi->snaps.begin(); + p != pi->snaps.end(); + ++p) { + snaps->push_back(p->first); + } + return 0; +} + +// sl may be unlocked. +void Objecter::_check_op_pool_dne(Op *op, unique_lock *sl) +{ + // rwlock is locked unique + + if (op->target.pool_ever_existed) { + // the pool previously existed and now it does not, which means it + // was deleted. 
+ op->map_dne_bound = osdmap->get_epoch(); + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " pool previously exists but now does not" + << dendl; + } else { + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " current " << osdmap->get_epoch() + << " map_dne_bound " << op->map_dne_bound + << dendl; + } + if (op->map_dne_bound > 0) { + if (osdmap->get_epoch() >= op->map_dne_bound) { + // we had a new enough map + ldout(cct, 10) << "check_op_pool_dne tid " << op->tid + << " concluding pool " << op->target.base_pgid.pool() + << " dne" << dendl; + if (op->onfinish) { + num_in_flight--; + op->onfinish->complete(-ENOENT); + } + + OSDSession *s = op->session; + if (s) { + ceph_assert(s != NULL); + ceph_assert(sl->mutex() == &s->lock); + bool session_locked = sl->owns_lock(); + if (!session_locked) { + sl->lock(); + } + _finish_op(op, 0); + if (!session_locked) { + sl->unlock(); + } + } else { + _finish_op(op, 0); // no session + } + } + } else { + _send_op_map_check(op); + } +} + +void Objecter::_send_op_map_check(Op *op) +{ + // rwlock is locked unique + // ask the monitor + if (check_latest_map_ops.count(op->tid) == 0) { + op->get(); + check_latest_map_ops[op->tid] = op; + C_Op_Map_Latest *c = new C_Op_Map_Latest(this, op->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +void Objecter::_op_cancel_map_check(Op *op) +{ + // rwlock is locked unique + map<ceph_tid_t, Op*>::iterator iter = + check_latest_map_ops.find(op->tid); + if (iter != check_latest_map_ops.end()) { + Op *op = iter->second; + op->put(); + check_latest_map_ops.erase(iter); + } +} + +// linger pool check + +void Objecter::C_Linger_Map_Latest::finish(int r) +{ + if (r == -EAGAIN || r == -ECANCELED) { + // ignore callback; we will retry in resend_mon_ops() + return; + } + + unique_lock wl(objecter->rwlock); + + map<uint64_t, LingerOp*>::iterator iter = + objecter->check_latest_map_lingers.find(linger_id); + if (iter == objecter->check_latest_map_lingers.end()) { + 
return; + } + + LingerOp *op = iter->second; + objecter->check_latest_map_lingers.erase(iter); + + if (op->map_dne_bound == 0) + op->map_dne_bound = latest; + + bool unregister; + objecter->_check_linger_pool_dne(op, &unregister); + + if (unregister) { + objecter->_linger_cancel(op); + } + + op->put(); +} + +void Objecter::_check_linger_pool_dne(LingerOp *op, bool *need_unregister) +{ + // rwlock is locked unique + + *need_unregister = false; + + if (op->register_gen > 0) { + ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id + << " pool previously existed but now does not" + << dendl; + op->map_dne_bound = osdmap->get_epoch(); + } else { + ldout(cct, 10) << "_check_linger_pool_dne linger_id " << op->linger_id + << " current " << osdmap->get_epoch() + << " map_dne_bound " << op->map_dne_bound + << dendl; + } + if (op->map_dne_bound > 0) { + if (osdmap->get_epoch() >= op->map_dne_bound) { + LingerOp::unique_lock wl{op->watch_lock}; + if (op->on_reg_commit) { + op->on_reg_commit->complete(-ENOENT); + op->on_reg_commit = nullptr; + } + if (op->on_notify_finish) { + op->on_notify_finish->complete(-ENOENT); + op->on_notify_finish = nullptr; + } + *need_unregister = true; + } + } else { + _send_linger_map_check(op); + } +} + +void Objecter::_send_linger_map_check(LingerOp *op) +{ + // ask the monitor + if (check_latest_map_lingers.count(op->linger_id) == 0) { + op->get(); + check_latest_map_lingers[op->linger_id] = op; + C_Linger_Map_Latest *c = new C_Linger_Map_Latest(this, op->linger_id); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +void Objecter::_linger_cancel_map_check(LingerOp *op) +{ + // rwlock is locked unique + + map<uint64_t, LingerOp*>::iterator iter = + check_latest_map_lingers.find(op->linger_id); + if (iter != check_latest_map_lingers.end()) { + LingerOp *op = iter->second; + op->put(); + check_latest_map_lingers.erase(iter); + } +} + +// command pool check + +void Objecter::C_Command_Map_Latest::finish(int r) +{ + if 
(r == -EAGAIN || r == -ECANCELED) { + // ignore callback; we will retry in resend_mon_ops() + return; + } + + unique_lock wl(objecter->rwlock); + + map<uint64_t, CommandOp*>::iterator iter = + objecter->check_latest_map_commands.find(tid); + if (iter == objecter->check_latest_map_commands.end()) { + return; + } + + CommandOp *c = iter->second; + objecter->check_latest_map_commands.erase(iter); + + if (c->map_dne_bound == 0) + c->map_dne_bound = latest; + + OSDSession::unique_lock sul(c->session->lock); + objecter->_check_command_map_dne(c); + sul.unlock(); + + c->put(); +} + +void Objecter::_check_command_map_dne(CommandOp *c) +{ + // rwlock is locked unique + // session is locked unique + + ldout(cct, 10) << "_check_command_map_dne tid " << c->tid + << " current " << osdmap->get_epoch() + << " map_dne_bound " << c->map_dne_bound + << dendl; + if (c->map_dne_bound > 0) { + if (osdmap->get_epoch() >= c->map_dne_bound) { + _finish_command(c, c->map_check_error, c->map_check_error_str); + } + } else { + _send_command_map_check(c); + } +} + +void Objecter::_send_command_map_check(CommandOp *c) +{ + // rwlock is locked unique + // session is locked unique + + // ask the monitor + if (check_latest_map_commands.count(c->tid) == 0) { + c->get(); + check_latest_map_commands[c->tid] = c; + C_Command_Map_Latest *f = new C_Command_Map_Latest(this, c->tid); + monc->get_version("osdmap", &f->latest, NULL, f); + } +} + +void Objecter::_command_cancel_map_check(CommandOp *c) +{ + // rwlock is locked uniqe + + map<uint64_t, CommandOp*>::iterator iter = + check_latest_map_commands.find(c->tid); + if (iter != check_latest_map_commands.end()) { + CommandOp *c = iter->second; + c->put(); + check_latest_map_commands.erase(iter); + } +} + + +/** + * Look up OSDSession by OSD id. + * + * @returns 0 on success, or -EAGAIN if the lock context requires + * promotion to write. 
+ */ +int Objecter::_get_session(int osd, OSDSession **session, shunique_lock& sul) +{ + ceph_assert(sul && sul.mutex() == &rwlock); + + if (osd < 0) { + *session = homeless_session; + ldout(cct, 20) << __func__ << " osd=" << osd << " returning homeless" + << dendl; + return 0; + } + + map<int,OSDSession*>::iterator p = osd_sessions.find(osd); + if (p != osd_sessions.end()) { + OSDSession *s = p->second; + s->get(); + *session = s; + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " " + << s->get_nref() << dendl; + return 0; + } + if (!sul.owns_lock()) { + return -EAGAIN; + } + OSDSession *s = new OSDSession(cct, osd); + osd_sessions[osd] = s; + s->con = messenger->connect_to_osd(osdmap->get_addrs(osd)); + s->con->set_priv(RefCountedPtr{s}); + logger->inc(l_osdc_osd_session_open); + logger->set(l_osdc_osd_sessions, osd_sessions.size()); + s->get(); + *session = s; + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " " + << s->get_nref() << dendl; + return 0; +} + +void Objecter::put_session(Objecter::OSDSession *s) +{ + if (s && !s->is_homeless()) { + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " " + << s->get_nref() << dendl; + s->put(); + } +} + +void Objecter::get_session(Objecter::OSDSession *s) +{ + ceph_assert(s != NULL); + + if (!s->is_homeless()) { + ldout(cct, 20) << __func__ << " s=" << s << " osd=" << s->osd << " " + << s->get_nref() << dendl; + s->get(); + } +} + +void Objecter::_reopen_session(OSDSession *s) +{ + // rwlock is locked unique + // s->lock is locked + + auto addrs = osdmap->get_addrs(s->osd); + ldout(cct, 10) << "reopen_session osd." 
<< s->osd << " session, addr now " + << addrs << dendl; + if (s->con) { + s->con->set_priv(NULL); + s->con->mark_down(); + logger->inc(l_osdc_osd_session_close); + } + s->con = messenger->connect_to_osd(addrs); + s->con->set_priv(RefCountedPtr{s}); + s->incarnation++; + logger->inc(l_osdc_osd_session_open); +} + +void Objecter::close_session(OSDSession *s) +{ + // rwlock is locked unique + + ldout(cct, 10) << "close_session for osd." << s->osd << dendl; + if (s->con) { + s->con->set_priv(NULL); + s->con->mark_down(); + logger->inc(l_osdc_osd_session_close); + } + OSDSession::unique_lock sl(s->lock); + + std::list<LingerOp*> homeless_lingers; + std::list<CommandOp*> homeless_commands; + std::list<Op*> homeless_ops; + + while (!s->linger_ops.empty()) { + std::map<uint64_t, LingerOp*>::iterator i = s->linger_ops.begin(); + ldout(cct, 10) << " linger_op " << i->first << dendl; + homeless_lingers.push_back(i->second); + _session_linger_op_remove(s, i->second); + } + + while (!s->ops.empty()) { + std::map<ceph_tid_t, Op*>::iterator i = s->ops.begin(); + ldout(cct, 10) << " op " << i->first << dendl; + homeless_ops.push_back(i->second); + _session_op_remove(s, i->second); + } + + while (!s->command_ops.empty()) { + std::map<ceph_tid_t, CommandOp*>::iterator i = s->command_ops.begin(); + ldout(cct, 10) << " command_op " << i->first << dendl; + homeless_commands.push_back(i->second); + _session_command_op_remove(s, i->second); + } + + osd_sessions.erase(s->osd); + sl.unlock(); + put_session(s); + + // Assign any leftover ops to the homeless session + { + OSDSession::unique_lock hsl(homeless_session->lock); + for (std::list<LingerOp*>::iterator i = homeless_lingers.begin(); + i != homeless_lingers.end(); ++i) { + _session_linger_op_assign(homeless_session, *i); + } + for (std::list<Op*>::iterator i = homeless_ops.begin(); + i != homeless_ops.end(); ++i) { + _session_op_assign(homeless_session, *i); + } + for (std::list<CommandOp*>::iterator i = homeless_commands.begin(); + i 
!= homeless_commands.end(); ++i) { + _session_command_op_assign(homeless_session, *i); + } + } + + logger->set(l_osdc_osd_sessions, osd_sessions.size()); +} + +void Objecter::wait_for_osd_map() +{ + unique_lock l(rwlock); + if (osdmap->get_epoch()) { + l.unlock(); + return; + } + + // Leave this since it goes with C_SafeCond + Mutex lock(""); + Cond cond; + bool done; + lock.Lock(); + C_SafeCond *context = new C_SafeCond(&lock, &cond, &done, NULL); + waiting_for_map[0].push_back(pair<Context*, int>(context, 0)); + l.unlock(); + while (!done) + cond.Wait(lock); + lock.Unlock(); +} + +struct C_Objecter_GetVersion : public Context { + Objecter *objecter; + uint64_t oldest, newest; + Context *fin; + C_Objecter_GetVersion(Objecter *o, Context *c) + : objecter(o), oldest(0), newest(0), fin(c) {} + void finish(int r) override { + if (r >= 0) { + objecter->get_latest_version(oldest, newest, fin); + } else if (r == -EAGAIN) { // try again as instructed + objecter->wait_for_latest_osdmap(fin); + } else { + // it doesn't return any other error codes! 
+ ceph_abort(); + } + } +}; + +void Objecter::wait_for_latest_osdmap(Context *fin) +{ + ldout(cct, 10) << __func__ << dendl; + C_Objecter_GetVersion *c = new C_Objecter_GetVersion(this, fin); + monc->get_version("osdmap", &c->newest, &c->oldest, c); +} + +void Objecter::get_latest_version(epoch_t oldest, epoch_t newest, Context *fin) +{ + unique_lock wl(rwlock); + if (osdmap->get_epoch() >= newest) { + ldout(cct, 10) << __func__ << " latest " << newest << ", have it" << dendl; + wl.unlock(); + if (fin) + fin->complete(0); + return; + } + + ldout(cct, 10) << __func__ << " latest " << newest << ", waiting" << dendl; + _wait_for_new_map(fin, newest, 0); +} + +void Objecter::maybe_request_map() +{ + shared_lock rl(rwlock); + _maybe_request_map(); +} + +void Objecter::_maybe_request_map() +{ + // rwlock is locked + int flag = 0; + if (_osdmap_full_flag() + || osdmap->test_flag(CEPH_OSDMAP_PAUSERD) + || osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + ldout(cct, 10) << "_maybe_request_map subscribing (continuous) to next " + "osd map (FULL flag is set)" << dendl; + } else { + ldout(cct, 10) + << "_maybe_request_map subscribing (onetime) to next osd map" << dendl; + flag = CEPH_SUBSCRIBE_ONETIME; + } + epoch_t epoch = osdmap->get_epoch() ? osdmap->get_epoch()+1 : 0; + if (monc->sub_want("osdmap", epoch, flag)) { + monc->renew_subs(); + } +} + +void Objecter::_wait_for_new_map(Context *c, epoch_t epoch, int err) +{ + // rwlock is locked unique + waiting_for_map[epoch].push_back(pair<Context *, int>(c, err)); + _maybe_request_map(); +} + + +/** + * Use this together with wait_for_map: this is a pre-check to avoid + * allocating a Context for wait_for_map if we can see that we + * definitely already have the epoch. + * + * This does *not* replace the need to handle the return value of + * wait_for_map: just because we don't have it in this pre-check + * doesn't mean we won't have it when calling back into wait_for_map, + * since the objecter lock is dropped in between. 
+ */ +bool Objecter::have_map(const epoch_t epoch) +{ + shared_lock rl(rwlock); + if (osdmap->get_epoch() >= epoch) { + return true; + } else { + return false; + } +} + +bool Objecter::wait_for_map(epoch_t epoch, Context *c, int err) +{ + unique_lock wl(rwlock); + if (osdmap->get_epoch() >= epoch) { + return true; + } + _wait_for_new_map(c, epoch, err); + return false; +} + +void Objecter::_kick_requests(OSDSession *session, + map<uint64_t, LingerOp *>& lresend) +{ + // rwlock is locked unique + + // clear backoffs + session->backoffs.clear(); + session->backoffs_by_id.clear(); + + // resend ops + map<ceph_tid_t,Op*> resend; // resend in tid order + for (map<ceph_tid_t, Op*>::iterator p = session->ops.begin(); + p != session->ops.end();) { + Op *op = p->second; + ++p; + if (op->should_resend) { + if (!op->target.paused) + resend[op->tid] = op; + } else { + _op_cancel_map_check(op); + _cancel_linger_op(op); + } + } + + logger->inc(l_osdc_op_resend, resend.size()); + while (!resend.empty()) { + _send_op(resend.begin()->second); + resend.erase(resend.begin()); + } + + // resend lingers + logger->inc(l_osdc_linger_resend, session->linger_ops.size()); + for (map<ceph_tid_t, LingerOp*>::iterator j = session->linger_ops.begin(); + j != session->linger_ops.end(); ++j) { + LingerOp *op = j->second; + op->get(); + ceph_assert(lresend.count(j->first) == 0); + lresend[j->first] = op; + } + + // resend commands + logger->inc(l_osdc_command_resend, session->command_ops.size()); + map<uint64_t,CommandOp*> cresend; // resend in order + for (map<ceph_tid_t, CommandOp*>::iterator k = session->command_ops.begin(); + k != session->command_ops.end(); ++k) { + cresend[k->first] = k->second; + } + while (!cresend.empty()) { + _send_command(cresend.begin()->second); + cresend.erase(cresend.begin()); + } +} + +void Objecter::_linger_ops_resend(map<uint64_t, LingerOp *>& lresend, + unique_lock& ul) +{ + ceph_assert(ul.owns_lock()); + shunique_lock sul(std::move(ul)); + while 
(!lresend.empty()) { + LingerOp *op = lresend.begin()->second; + if (!op->canceled) { + _send_linger(op, sul); + } + op->put(); + lresend.erase(lresend.begin()); + } + ul = sul.release_to_unique(); +} + +void Objecter::start_tick() +{ + ceph_assert(tick_event == 0); + tick_event = + timer.add_event(ceph::make_timespan(cct->_conf->objecter_tick_interval), + &Objecter::tick, this); +} + +void Objecter::tick() +{ + shared_lock rl(rwlock); + + ldout(cct, 10) << "tick" << dendl; + + // we are only called by C_Tick + tick_event = 0; + + if (!initialized) { + // we raced with shutdown + ldout(cct, 10) << __func__ << " raced with shutdown" << dendl; + return; + } + + set<OSDSession*> toping; + + + // look for laggy requests + auto cutoff = ceph::coarse_mono_clock::now(); + cutoff -= ceph::make_timespan(cct->_conf->objecter_timeout); // timeout + + unsigned laggy_ops = 0; + + for (map<int,OSDSession*>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::lock_guard l(s->lock); + bool found = false; + for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin(); + p != s->ops.end(); + ++p) { + Op *op = p->second; + ceph_assert(op->session); + if (op->stamp < cutoff) { + ldout(cct, 2) << " tid " << p->first << " on osd." << op->session->osd + << " is laggy" << dendl; + found = true; + ++laggy_ops; + } + } + for (map<uint64_t,LingerOp*>::iterator p = s->linger_ops.begin(); + p != s->linger_ops.end(); + ++p) { + LingerOp *op = p->second; + LingerOp::unique_lock wl(op->watch_lock); + ceph_assert(op->session); + ldout(cct, 10) << " pinging osd that serves lingering tid " << p->first + << " (osd." 
<< op->session->osd << ")" << dendl; + found = true; + if (op->is_watch && op->registered && !op->last_error) + _send_linger_ping(op); + } + for (map<uint64_t,CommandOp*>::iterator p = s->command_ops.begin(); + p != s->command_ops.end(); + ++p) { + CommandOp *op = p->second; + ceph_assert(op->session); + ldout(cct, 10) << " pinging osd that serves command tid " << p->first + << " (osd." << op->session->osd << ")" << dendl; + found = true; + } + if (found) + toping.insert(s); + } + if (num_homeless_ops || !toping.empty()) { + _maybe_request_map(); + } + + logger->set(l_osdc_op_laggy, laggy_ops); + logger->set(l_osdc_osd_laggy, toping.size()); + + if (!toping.empty()) { + // send a ping to these osds, to ensure we detect any session resets + // (osd reply message policy is lossy) + for (set<OSDSession*>::const_iterator i = toping.begin(); + i != toping.end(); + ++i) { + (*i)->con->send_message(new MPing); + } + } + + // Make sure we don't reschedule if we wake up after shutdown + if (initialized) { + tick_event = timer.reschedule_me(ceph::make_timespan( + cct->_conf->objecter_tick_interval)); + } +} + +void Objecter::resend_mon_ops() +{ + unique_lock wl(rwlock); + + ldout(cct, 10) << "resend_mon_ops" << dendl; + + for (map<ceph_tid_t,PoolStatOp*>::iterator p = poolstat_ops.begin(); + p != poolstat_ops.end(); + ++p) { + _poolstat_submit(p->second); + logger->inc(l_osdc_poolstat_resend); + } + + for (map<ceph_tid_t,StatfsOp*>::iterator p = statfs_ops.begin(); + p != statfs_ops.end(); + ++p) { + _fs_stats_submit(p->second); + logger->inc(l_osdc_statfs_resend); + } + + for (map<ceph_tid_t,PoolOp*>::iterator p = pool_ops.begin(); + p != pool_ops.end(); + ++p) { + _pool_op_submit(p->second); + logger->inc(l_osdc_poolop_resend); + } + + for (map<ceph_tid_t, Op*>::iterator p = check_latest_map_ops.begin(); + p != check_latest_map_ops.end(); + ++p) { + C_Op_Map_Latest *c = new C_Op_Map_Latest(this, p->second->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } + + 
for (map<uint64_t, LingerOp*>::iterator p = check_latest_map_lingers.begin(); + p != check_latest_map_lingers.end(); + ++p) { + C_Linger_Map_Latest *c + = new C_Linger_Map_Latest(this, p->second->linger_id); + monc->get_version("osdmap", &c->latest, NULL, c); + } + + for (map<uint64_t, CommandOp*>::iterator p + = check_latest_map_commands.begin(); + p != check_latest_map_commands.end(); + ++p) { + C_Command_Map_Latest *c = new C_Command_Map_Latest(this, p->second->tid); + monc->get_version("osdmap", &c->latest, NULL, c); + } +} + +// read | write --------------------------- + +void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget) +{ + shunique_lock rl(rwlock, ceph::acquire_shared); + ceph_tid_t tid = 0; + if (!ptid) + ptid = &tid; + op->trace.event("op submit"); + _op_submit_with_budget(op, rl, ptid, ctx_budget); +} + +void Objecter::_op_submit_with_budget(Op *op, shunique_lock& sul, + ceph_tid_t *ptid, + int *ctx_budget) +{ + ceph_assert(initialized); + + ceph_assert(op->ops.size() == op->out_bl.size()); + ceph_assert(op->ops.size() == op->out_rval.size()); + ceph_assert(op->ops.size() == op->out_handler.size()); + + // throttle. before we look at any state, because + // _take_op_budget() may drop our lock while it blocks. 
+ if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) { + int op_budget = _take_op_budget(op, sul); + // take and pass out the budget for the first OP + // in the context session + if (ctx_budget && (*ctx_budget == -1)) { + *ctx_budget = op_budget; + } + } + + if (osd_timeout > timespan(0)) { + if (op->tid == 0) + op->tid = ++last_tid; + auto tid = op->tid; + op->ontimeout = timer.add_event(osd_timeout, + [this, tid]() { + op_cancel(tid, -ETIMEDOUT); }); + } + + _op_submit(op, sul, ptid); +} + +void Objecter::_send_op_account(Op *op) +{ + inflight_ops++; + + // add to gather set(s) + if (op->onfinish) { + num_in_flight++; + } else { + ldout(cct, 20) << " note: not requesting reply" << dendl; + } + + logger->inc(l_osdc_op_active); + logger->inc(l_osdc_op); + + if ((op->target.flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)) == + (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) + logger->inc(l_osdc_op_rmw); + else if (op->target.flags & CEPH_OSD_FLAG_WRITE) + logger->inc(l_osdc_op_w); + else if (op->target.flags & CEPH_OSD_FLAG_READ) + logger->inc(l_osdc_op_r); + + if (op->target.flags & CEPH_OSD_FLAG_PGOP) + logger->inc(l_osdc_op_pg); + + for (vector<OSDOp>::iterator p = op->ops.begin(); p != op->ops.end(); ++p) { + int code = l_osdc_osdop_other; + switch (p->op.op) { + case CEPH_OSD_OP_STAT: code = l_osdc_osdop_stat; break; + case CEPH_OSD_OP_CREATE: code = l_osdc_osdop_create; break; + case CEPH_OSD_OP_READ: code = l_osdc_osdop_read; break; + case CEPH_OSD_OP_WRITE: code = l_osdc_osdop_write; break; + case CEPH_OSD_OP_WRITEFULL: code = l_osdc_osdop_writefull; break; + case CEPH_OSD_OP_WRITESAME: code = l_osdc_osdop_writesame; break; + case CEPH_OSD_OP_APPEND: code = l_osdc_osdop_append; break; + case CEPH_OSD_OP_ZERO: code = l_osdc_osdop_zero; break; + case CEPH_OSD_OP_TRUNCATE: code = l_osdc_osdop_truncate; break; + case CEPH_OSD_OP_DELETE: code = l_osdc_osdop_delete; break; + case CEPH_OSD_OP_MAPEXT: code = l_osdc_osdop_mapext; break; + case 
CEPH_OSD_OP_SPARSE_READ: code = l_osdc_osdop_sparse_read; break; + case CEPH_OSD_OP_GETXATTR: code = l_osdc_osdop_getxattr; break; + case CEPH_OSD_OP_SETXATTR: code = l_osdc_osdop_setxattr; break; + case CEPH_OSD_OP_CMPXATTR: code = l_osdc_osdop_cmpxattr; break; + case CEPH_OSD_OP_RMXATTR: code = l_osdc_osdop_rmxattr; break; + case CEPH_OSD_OP_RESETXATTRS: code = l_osdc_osdop_resetxattrs; break; + + // OMAP read operations + case CEPH_OSD_OP_OMAPGETVALS: + case CEPH_OSD_OP_OMAPGETKEYS: + case CEPH_OSD_OP_OMAPGETHEADER: + case CEPH_OSD_OP_OMAPGETVALSBYKEYS: + case CEPH_OSD_OP_OMAP_CMP: code = l_osdc_osdop_omap_rd; break; + + // OMAP write operations + case CEPH_OSD_OP_OMAPSETVALS: + case CEPH_OSD_OP_OMAPSETHEADER: code = l_osdc_osdop_omap_wr; break; + + // OMAP del operations + case CEPH_OSD_OP_OMAPCLEAR: + case CEPH_OSD_OP_OMAPRMKEYS: code = l_osdc_osdop_omap_del; break; + + case CEPH_OSD_OP_CALL: code = l_osdc_osdop_call; break; + case CEPH_OSD_OP_WATCH: code = l_osdc_osdop_watch; break; + case CEPH_OSD_OP_NOTIFY: code = l_osdc_osdop_notify; break; + } + if (code) + logger->inc(code); + } +} + +void Objecter::_op_submit(Op *op, shunique_lock& sul, ceph_tid_t *ptid) +{ + // rwlock is locked + + ldout(cct, 10) << __func__ << " op " << op << dendl; + + // pick target + ceph_assert(op->session == NULL); + OSDSession *s = NULL; + + bool check_for_latest_map = _calc_target(&op->target, nullptr) + == RECALC_OP_TARGET_POOL_DNE; + + // Try to get a session, including a retry if we need to take write lock + int r = _get_session(op->target.osd, &s, sul); + if (r == -EAGAIN || + (check_for_latest_map && sul.owns_lock_shared()) || + cct->_conf->objecter_debug_inject_relock_delay) { + epoch_t orig_epoch = osdmap->get_epoch(); + sul.unlock(); + if (cct->_conf->objecter_debug_inject_relock_delay) { + sleep(1); + } + sul.lock(); + if (orig_epoch != osdmap->get_epoch()) { + // map changed; recalculate mapping + ldout(cct, 10) << __func__ << " relock raced with osdmap, recalc 
target" + << dendl; + check_for_latest_map = _calc_target(&op->target, nullptr) + == RECALC_OP_TARGET_POOL_DNE; + if (s) { + put_session(s); + s = NULL; + r = -EAGAIN; + } + } + } + if (r == -EAGAIN) { + ceph_assert(s == NULL); + r = _get_session(op->target.osd, &s, sul); + } + ceph_assert(r == 0); + ceph_assert(s); // may be homeless + + _send_op_account(op); + + // send? + + ceph_assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)); + + if (osdmap_full_try) { + op->target.flags |= CEPH_OSD_FLAG_FULL_TRY; + } + + bool need_send = false; + + if (osdmap->get_epoch() < epoch_barrier) { + ldout(cct, 10) << " barrier, paused " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if ((op->target.flags & CEPH_OSD_FLAG_WRITE) && + osdmap->test_flag(CEPH_OSDMAP_PAUSEWR)) { + ldout(cct, 10) << " paused modify " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if ((op->target.flags & CEPH_OSD_FLAG_READ) && + osdmap->test_flag(CEPH_OSDMAP_PAUSERD)) { + ldout(cct, 10) << " paused read " << op << " tid " << op->tid + << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if (op->respects_full() && + (_osdmap_full_flag() || + _osdmap_pool_full(op->target.base_oloc.pool))) { + ldout(cct, 0) << " FULL, paused modify " << op << " tid " + << op->tid << dendl; + op->target.paused = true; + _maybe_request_map(); + } else if (!s->is_homeless()) { + need_send = true; + } else { + _maybe_request_map(); + } + + OSDSession::unique_lock sl(s->lock); + if (op->tid == 0) + op->tid = ++last_tid; + + ldout(cct, 10) << "_op_submit oid " << op->target.base_oid + << " '" << op->target.base_oloc << "' '" + << op->target.target_oloc << "' " << op->ops << " tid " + << op->tid << " osd." << (!s->is_homeless() ? 
s->osd : -1) + << dendl; + + _session_op_assign(s, op); + + if (need_send) { + _send_op(op); + } + + // Last chance to touch Op here, after giving up session lock it can + // be freed at any time by response handler. + ceph_tid_t tid = op->tid; + if (check_for_latest_map) { + _send_op_map_check(op); + } + if (ptid) + *ptid = tid; + op = NULL; + + sl.unlock(); + put_session(s); + + ldout(cct, 5) << num_in_flight << " in flight" << dendl; +} + +int Objecter::op_cancel(OSDSession *s, ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + OSDSession::unique_lock sl(s->lock); + + map<ceph_tid_t, Op*>::iterator p = s->ops.find(tid); + if (p == s->ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne in session " + << s->osd << dendl; + return -ENOENT; + } + +#if 0 + if (s->con) { + ldout(cct, 20) << " revoking rx buffer for " << tid + << " on " << s->con << dendl; + s->con->revoke_rx_buffer(tid); + } +#endif + + ldout(cct, 10) << __func__ << " tid " << tid << " in session " << s->osd + << dendl; + Op *op = p->second; + if (op->onfinish) { + num_in_flight--; + op->onfinish->complete(r); + op->onfinish = NULL; + } + _op_cancel_map_check(op); + _finish_op(op, r); + sl.unlock(); + + return 0; +} + +int Objecter::op_cancel(ceph_tid_t tid, int r) +{ + int ret = 0; + + unique_lock wl(rwlock); + ret = _op_cancel(tid, r); + + return ret; +} + +int Objecter::op_cancel(const vector<ceph_tid_t>& tids, int r) +{ + unique_lock wl(rwlock); + ldout(cct,10) << __func__ << " " << tids << dendl; + for (auto tid : tids) { + _op_cancel(tid, r); + } + return 0; +} + +int Objecter::_op_cancel(ceph_tid_t tid, int r) +{ + int ret = 0; + + ldout(cct, 5) << __func__ << ": cancelling tid " << tid << " r=" << r + << dendl; + +start: + + for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + if (s->ops.find(tid) != s->ops.end()) { + sl.unlock(); + ret = 
op_cancel(s, tid, r); + if (ret == -ENOENT) { + /* oh no! raced, maybe tid moved to another session, restarting */ + goto start; + } + return ret; + } + } + + ldout(cct, 5) << __func__ << ": tid " << tid + << " not found in live sessions" << dendl; + + // Handle case where the op is in homeless session + OSDSession::shared_lock sl(homeless_session->lock); + if (homeless_session->ops.find(tid) != homeless_session->ops.end()) { + sl.unlock(); + ret = op_cancel(homeless_session, tid, r); + if (ret == -ENOENT) { + /* oh no! raced, maybe tid moved to another session, restarting */ + goto start; + } else { + return ret; + } + } else { + sl.unlock(); + } + + ldout(cct, 5) << __func__ << ": tid " << tid + << " not found in homeless session" << dendl; + + return ret; +} + + +epoch_t Objecter::op_cancel_writes(int r, int64_t pool) +{ + unique_lock wl(rwlock); + + std::vector<ceph_tid_t> to_cancel; + bool found = false; + + for (map<int, OSDSession *>::iterator siter = osd_sessions.begin(); + siter != osd_sessions.end(); ++siter) { + OSDSession *s = siter->second; + OSDSession::shared_lock sl(s->lock); + for (map<ceph_tid_t, Op*>::iterator op_i = s->ops.begin(); + op_i != s->ops.end(); ++op_i) { + if (op_i->second->target.flags & CEPH_OSD_FLAG_WRITE + && (pool == -1 || op_i->second->target.target_oloc.pool == pool)) { + to_cancel.push_back(op_i->first); + } + } + sl.unlock(); + + for (std::vector<ceph_tid_t>::iterator titer = to_cancel.begin(); + titer != to_cancel.end(); + ++titer) { + int cancel_result = op_cancel(s, *titer, r); + // We hold rwlock across search and cancellation, so cancels + // should always succeed + ceph_assert(cancel_result == 0); + } + if (!found && to_cancel.size()) + found = true; + to_cancel.clear(); + } + + const epoch_t epoch = osdmap->get_epoch(); + + wl.unlock(); + + if (found) { + return epoch; + } else { + return -1; + } +} + +bool Objecter::is_pg_changed( + int oldprimary, + const vector<int>& oldacting, + int newprimary, + const vector<int>& 
newacting, + bool any_change) +{ + if (OSDMap::primary_changed( + oldprimary, + oldacting, + newprimary, + newacting)) + return true; + if (any_change && oldacting != newacting) + return true; + return false; // same primary (tho replicas may have changed) +} + +bool Objecter::target_should_be_paused(op_target_t *t) +{ + const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); + bool pauserd = osdmap->test_flag(CEPH_OSDMAP_PAUSERD); + bool pausewr = osdmap->test_flag(CEPH_OSDMAP_PAUSEWR) || + _osdmap_full_flag() || _osdmap_pool_full(*pi); + + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr) || + (osdmap->get_epoch() < epoch_barrier); +} + +/** + * Locking public accessor for _osdmap_full_flag + */ +bool Objecter::osdmap_full_flag() const +{ + shared_lock rl(rwlock); + + return _osdmap_full_flag(); +} + +bool Objecter::osdmap_pool_full(const int64_t pool_id) const +{ + shared_lock rl(rwlock); + + if (_osdmap_full_flag()) { + return true; + } + + return _osdmap_pool_full(pool_id); +} + +bool Objecter::_osdmap_pool_full(const int64_t pool_id) const +{ + const pg_pool_t *pool = osdmap->get_pg_pool(pool_id); + if (pool == NULL) { + ldout(cct, 4) << __func__ << ": DNE pool " << pool_id << dendl; + return false; + } + + return _osdmap_pool_full(*pool); +} + +bool Objecter::_osdmap_has_pool_full() const +{ + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) { + if (_osdmap_pool_full(it->second)) + return true; + } + return false; +} + +bool Objecter::_osdmap_pool_full(const pg_pool_t &p) const +{ + return p.has_flag(pg_pool_t::FLAG_FULL) && honor_osdmap_full; +} + +/** + * Wrapper around osdmap->test_flag for special handling of the FULL flag. 
+ */ +bool Objecter::_osdmap_full_flag() const +{ + // Ignore the FULL flag if the caller does not have honor_osdmap_full + return osdmap->test_flag(CEPH_OSDMAP_FULL) && honor_osdmap_full; +} + +void Objecter::update_pool_full_map(map<int64_t, bool>& pool_full_map) +{ + for (map<int64_t, pg_pool_t>::const_iterator it + = osdmap->get_pools().begin(); + it != osdmap->get_pools().end(); ++it) { + if (pool_full_map.find(it->first) == pool_full_map.end()) { + pool_full_map[it->first] = _osdmap_pool_full(it->second); + } else { + pool_full_map[it->first] = _osdmap_pool_full(it->second) || + pool_full_map[it->first]; + } + } +} + +int64_t Objecter::get_object_hash_position(int64_t pool, const string& key, + const string& ns) +{ + shared_lock rl(rwlock); + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -ENOENT; + return p->hash_key(key, ns); +} + +int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key, + const string& ns) +{ + shared_lock rl(rwlock); + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -ENOENT; + return p->raw_hash_to_pg(p->hash_key(key, ns)); +} + +void Objecter::_prune_snapc( + const mempool::osdmap::map<int64_t, + OSDMap::snap_interval_set_t>& new_removed_snaps, + Op *op) +{ + bool match = false; + auto i = new_removed_snaps.find(op->target.base_pgid.pool()); + if (i != new_removed_snaps.end()) { + for (auto s : op->snapc.snaps) { + if (i->second.contains(s)) { + match = true; + break; + } + } + if (match) { + vector<snapid_t> new_snaps; + for (auto s : op->snapc.snaps) { + if (!i->second.contains(s)) { + new_snaps.push_back(s); + } + } + op->snapc.snaps.swap(new_snaps); + ldout(cct,10) << __func__ << " op " << op->tid << " snapc " << op->snapc + << " (was " << new_snaps << ")" << dendl; + } + } +} + +int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) +{ + // rwlock is locked + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & 
CEPH_OSD_FLAG_WRITE; + t->epoch = osdmap->get_epoch(); + ldout(cct,20) << __func__ << " epoch " << t->epoch + << " base " << t->base_oid << " " << t->base_oloc + << " precalc_pgid " << (int)t->precalc_pgid + << " pgid " << t->base_pgid + << (is_read ? " is_read" : "") + << (is_write ? " is_write" : "") + << dendl; + + const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool); + if (!pi) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + ldout(cct,30) << __func__ << " base pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; + + bool force_resend = false; + if (osdmap->get_epoch() == pi->last_force_op_resend) { + if (t->last_force_resend < pi->last_force_op_resend) { + t->last_force_resend = pi->last_force_op_resend; + force_resend = true; + } else if (t->last_force_resend == 0) { + force_resend = true; + } + } + + // apply tiering + t->target_oid = t->base_oid; + t->target_oloc = t->base_oloc; + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (is_read && pi->has_read_tier()) + t->target_oloc.pool = pi->read_tier; + if (is_write && pi->has_write_tier()) + t->target_oloc.pool = pi->write_tier; + pi = osdmap->get_pg_pool(t->target_oloc.pool); + if (!pi) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + } + + pg_t pgid; + if (t->precalc_pgid) { + ceph_assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY); + ceph_assert(t->base_oid.name.empty()); // make sure this is a pg op + ceph_assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool()); + pgid = t->base_pgid; + } else { + int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc, + pgid); + if (ret == -ENOENT) { + t->osd = -1; + return RECALC_OP_TARGET_POOL_DNE; + } + } + ldout(cct,20) << __func__ << " target " << t->target_oid << " " + << t->target_oloc << " -> pgid " << pgid << dendl; + ldout(cct,30) << __func__ << " target pi " << pi + << " pg_num " << pi->get_pg_num() << dendl; + t->pool_ever_existed = true; + + int size = pi->size; + int min_size = pi->min_size; + unsigned 
pg_num = pi->get_pg_num(); + unsigned pg_num_pending = pi->get_pg_num_pending(); + int up_primary, acting_primary; + vector<int> up, acting; + osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary, + &acting, &acting_primary); + bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES); + unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask); + pg_t prev_pgid(prev_seed, pgid.pool()); + if (any_change && PastIntervals::is_new_interval( + t->acting_primary, + acting_primary, + t->acting, + acting, + t->up_primary, + up_primary, + t->up, + up, + t->size, + size, + t->min_size, + min_size, + t->pg_num, + pg_num, + t->pg_num_pending, + pg_num_pending, + t->sort_bitwise, + sort_bitwise, + t->recovery_deletes, + recovery_deletes, + prev_pgid)) { + force_resend = true; + } + + bool unpaused = false; + bool should_be_paused = target_should_be_paused(t); + if (t->paused && !should_be_paused) { + unpaused = true; + } + t->paused = should_be_paused; + + bool legacy_change = + t->pgid != pgid || + is_pg_changed( + t->acting_primary, t->acting, acting_primary, acting, + t->used_replica || any_change); + bool split_or_merge = false; + if (t->pg_num) { + split_or_merge = + prev_pgid.is_split(t->pg_num, pg_num, nullptr) || + prev_pgid.is_merge_source(t->pg_num, pg_num, nullptr) || + prev_pgid.is_merge_target(t->pg_num, pg_num); + } + + if (legacy_change || split_or_merge || force_resend) { + t->pgid = pgid; + t->acting = acting; + t->acting_primary = acting_primary; + t->up_primary = up_primary; + t->up = up; + t->size = size; + t->min_size = min_size; + t->pg_num = pg_num; + t->pg_num_mask = pi->get_pg_num_mask(); + t->pg_num_pending = pg_num_pending; + osdmap->get_primary_shard( + pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()), + &t->actual_pgid); + t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; + ldout(cct, 10) << __func__ << " " 
+ << " raw pgid " << pgid << " -> actual " << t->actual_pgid + << " acting " << acting + << " primary " << acting_primary << dendl; + t->used_replica = false; + if (acting_primary == -1) { + t->osd = -1; + } else { + int osd; + bool read = is_read && !is_write; + if (read && (t->flags & CEPH_OSD_FLAG_BALANCE_READS)) { + int p = rand() % acting.size(); + if (p) + t->used_replica = true; + osd = acting[p]; + ldout(cct, 10) << " chose random osd." << osd << " of " << acting + << dendl; + } else if (read && (t->flags & CEPH_OSD_FLAG_LOCALIZE_READS) && + acting.size() > 1) { + // look for a local replica. prefer the primary if the + // distance is the same. + int best = -1; + int best_locality = 0; + for (unsigned i = 0; i < acting.size(); ++i) { + int locality = osdmap->crush->get_common_ancestor_distance( + cct, acting[i], crush_location); + ldout(cct, 20) << __func__ << " localize: rank " << i + << " osd." << acting[i] + << " locality " << locality << dendl; + if (i == 0 || + (locality >= 0 && best_locality >= 0 && + locality < best_locality) || + (best_locality < 0 && locality >= 0)) { + best = i; + best_locality = locality; + if (i) + t->used_replica = true; + } + } + ceph_assert(best >= 0); + osd = acting[best]; + } else { + osd = acting_primary; + } + t->osd = osd; + } + } + if (legacy_change || unpaused || force_resend) { + return RECALC_OP_TARGET_NEED_RESEND; + } + if (split_or_merge && + (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS || + HAVE_FEATURE(osdmap->get_xinfo(acting_primary).features, + RESEND_ON_SPLIT))) { + return RECALC_OP_TARGET_NEED_RESEND; + } + return RECALC_OP_TARGET_NO_ACTION; +} + +int Objecter::_map_session(op_target_t *target, OSDSession **s, + shunique_lock& sul) +{ + _calc_target(target, nullptr); + return _get_session(target->osd, s, sul); +} + +void Objecter::_session_op_assign(OSDSession *to, Op *op) +{ + // to->lock is locked + ceph_assert(op->session == NULL); + ceph_assert(op->tid); + + get_session(to); + op->session = to; 
+ to->ops[op->tid] = op; + + if (to->is_homeless()) { + num_homeless_ops++; + } + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl; +} + +void Objecter::_session_op_remove(OSDSession *from, Op *op) +{ + ceph_assert(op->session == from); + // from->lock is locked + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->ops.erase(op->tid); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl; +} + +void Objecter::_session_linger_op_assign(OSDSession *to, LingerOp *op) +{ + // to lock is locked unique + ceph_assert(op->session == NULL); + + if (to->is_homeless()) { + num_homeless_ops++; + } + + get_session(to); + op->session = to; + to->linger_ops[op->linger_id] = op; + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->linger_id + << dendl; +} + +void Objecter::_session_linger_op_remove(OSDSession *from, LingerOp *op) +{ + ceph_assert(from == op->session); + // from->lock is locked unique + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->linger_ops.erase(op->linger_id); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->linger_id + << dendl; +} + +void Objecter::_session_command_op_remove(OSDSession *from, CommandOp *op) +{ + ceph_assert(from == op->session); + // from->lock is locked + + if (from->is_homeless()) { + num_homeless_ops--; + } + + from->command_ops.erase(op->tid); + put_session(from); + op->session = NULL; + + ldout(cct, 15) << __func__ << " " << from->osd << " " << op->tid << dendl; +} + +void Objecter::_session_command_op_assign(OSDSession *to, CommandOp *op) +{ + // to->lock is locked + ceph_assert(op->session == NULL); + ceph_assert(op->tid); + + if (to->is_homeless()) { + num_homeless_ops++; + } + + get_session(to); + op->session = to; + to->command_ops[op->tid] = op; + + ldout(cct, 15) << __func__ << " " << to->osd << " " << op->tid << dendl; +} + +int 
Objecter::_recalc_linger_op_target(LingerOp *linger_op, + shunique_lock& sul) +{ + // rwlock is locked unique + + int r = _calc_target(&linger_op->target, nullptr, true); + if (r == RECALC_OP_TARGET_NEED_RESEND) { + ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id + << " pgid " << linger_op->target.pgid + << " acting " << linger_op->target.acting << dendl; + + OSDSession *s = NULL; + r = _get_session(linger_op->target.osd, &s, sul); + ceph_assert(r == 0); + + if (linger_op->session != s) { + // NB locking two sessions (s and linger_op->session) at the + // same time here is only safe because we are the only one that + // takes two, and we are holding rwlock for write. Disable + // lockdep because it doesn't know that. + OSDSession::unique_lock sl(s->lock); + _session_linger_op_remove(linger_op->session, linger_op); + _session_linger_op_assign(s, linger_op); + } + + put_session(s); + return RECALC_OP_TARGET_NEED_RESEND; + } + return r; +} + +void Objecter::_cancel_linger_op(Op *op) +{ + ldout(cct, 15) << "cancel_op " << op->tid << dendl; + + ceph_assert(!op->should_resend); + if (op->onfinish) { + delete op->onfinish; + num_in_flight--; + } + + _finish_op(op, 0); +} + +void Objecter::_finish_op(Op *op, int r) +{ + ldout(cct, 15) << __func__ << " " << op->tid << dendl; + + // op->session->lock is locked unique or op->session is null + + if (!op->ctx_budgeted && op->budget >= 0) { + put_op_budget_bytes(op->budget); + op->budget = -1; + } + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + if (op->session) { + _session_op_remove(op->session, op); + } + + logger->dec(l_osdc_op_active); + + ceph_assert(check_latest_map_ops.find(op->tid) == check_latest_map_ops.end()); + + inflight_ops--; + + op->put(); +} + +MOSDOp *Objecter::_prepare_osd_op(Op *op) +{ + // rwlock is locked + + int flags = op->target.flags; + flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + + // Nothing checks this any longer, but needed for compatibility with 
+ // pre-luminous osds + flags |= CEPH_OSD_FLAG_ONDISK; + + if (!honor_osdmap_full) + flags |= CEPH_OSD_FLAG_FULL_FORCE; + + op->target.paused = false; + op->stamp = ceph::coarse_mono_clock::now(); + + hobject_t hobj = op->target.get_hobj(); + MOSDOp *m = new MOSDOp(client_inc, op->tid, + hobj, op->target.actual_pgid, + osdmap->get_epoch(), + flags, op->features); + + m->set_snapid(op->snapid); + m->set_snap_seq(op->snapc.seq); + m->set_snaps(op->snapc.snaps); + + m->ops = op->ops; + m->set_mtime(op->mtime); + m->set_retry_attempt(op->attempts++); + + if (!op->trace.valid() && cct->_conf->osdc_blkin_trace_all) { + op->trace.init("op", &trace_endpoint); + } + + if (op->priority) + m->set_priority(op->priority); + else + m->set_priority(cct->_conf->osd_client_op_priority); + + if (op->reqid != osd_reqid_t()) { + m->set_reqid(op->reqid); + } + + logger->inc(l_osdc_op_send); + ssize_t sum = 0; + for (unsigned i = 0; i < m->ops.size(); i++) { + sum += m->ops[i].indata.length(); + } + logger->inc(l_osdc_op_send_bytes, sum); + + return m; +} + +void Objecter::_send_op(Op *op) +{ + // rwlock is locked + // op->session->lock is locked + + // backoff? + auto p = op->session->backoffs.find(op->target.actual_pgid); + if (p != op->session->backoffs.end()) { + hobject_t hoid = op->target.get_hobj(); + auto q = p->second.lower_bound(hoid); + if (q != p->second.begin()) { + --q; + if (hoid >= q->second.end) { + ++q; + } + } + if (q != p->second.end()) { + ldout(cct, 20) << __func__ << " ? 
" << q->first << " [" << q->second.begin + << "," << q->second.end << ")" << dendl; + int r = cmp(hoid, q->second.begin); + if (r == 0 || (r > 0 && hoid < q->second.end)) { + ldout(cct, 10) << __func__ << " backoff " << op->target.actual_pgid + << " id " << q->second.id << " on " << hoid + << ", queuing " << op << " tid " << op->tid << dendl; + return; + } + } + } + + ceph_assert(op->tid > 0); + MOSDOp *m = _prepare_osd_op(op); + + if (op->target.actual_pgid != m->get_spg()) { + ldout(cct, 10) << __func__ << " " << op->tid << " pgid change from " + << m->get_spg() << " to " << op->target.actual_pgid + << ", updating and reencoding" << dendl; + m->set_spg(op->target.actual_pgid); + m->clear_payload(); // reencode + } + + ldout(cct, 15) << "_send_op " << op->tid << " to " + << op->target.actual_pgid << " on osd." << op->session->osd + << dendl; + + ConnectionRef con = op->session->con; + ceph_assert(con); + +#if 0 + // preallocated rx buffer? + if (op->con) { + ldout(cct, 20) << " revoking rx buffer for " << op->tid << " on " + << op->con << dendl; + op->con->revoke_rx_buffer(op->tid); + } + if (op->outbl && + op->ontimeout == 0 && // only post rx_buffer if no timeout; see #9582 + op->outbl->length()) { + op->outbl->invalidate_crc(); // messenger writes through c_str() + ldout(cct, 20) << " posting rx buffer for " << op->tid << " on " << con + << dendl; + op->con = con; + op->con->post_rx_buffer(op->tid, *op->outbl); + } +#endif + + op->incarnation = op->session->incarnation; + + if (op->trace.valid()) { + m->trace.init("op msg", nullptr, &op->trace); + } + op->session->con->send_message(m); +} + +int Objecter::calc_op_budget(const vector<OSDOp>& ops) +{ + int op_budget = 0; + for (vector<OSDOp>::const_iterator i = ops.begin(); + i != ops.end(); + ++i) { + if (i->op.op & CEPH_OSD_OP_MODE_WR) { + op_budget += i->indata.length(); + } else if (ceph_osd_op_mode_read(i->op.op)) { + if (ceph_osd_op_uses_extent(i->op.op)) { + if ((int64_t)i->op.extent.length > 0) + 
op_budget += (int64_t)i->op.extent.length; + } else if (ceph_osd_op_type_attr(i->op.op)) { + op_budget += i->op.xattr.name_len + i->op.xattr.value_len; + } + } + } + return op_budget; +} + +void Objecter::_throttle_op(Op *op, + shunique_lock& sul, + int op_budget) +{ + ceph_assert(sul && sul.mutex() == &rwlock); + bool locked_for_write = sul.owns_lock(); + + if (!op_budget) + op_budget = calc_op_budget(op->ops); + if (!op_throttle_bytes.get_or_fail(op_budget)) { //couldn't take right now + sul.unlock(); + op_throttle_bytes.get(op_budget); + if (locked_for_write) + sul.lock(); + else + sul.lock_shared(); + } + if (!op_throttle_ops.get_or_fail(1)) { //couldn't take right now + sul.unlock(); + op_throttle_ops.get(1); + if (locked_for_write) + sul.lock(); + else + sul.lock_shared(); + } +} + +int Objecter::take_linger_budget(LingerOp *info) +{ + return 1; +} + +/* This function DOES put the passed message before returning */ +void Objecter::handle_osd_op_reply(MOSDOpReply *m) +{ + ldout(cct, 10) << "in handle_osd_op_reply" << dendl; + + // get pio + ceph_tid_t tid = m->get_tid(); + + shunique_lock sul(rwlock, ceph::acquire_shared); + if (!initialized) { + m->put(); + return; + } + + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto s = static_cast<OSDSession*>(priv.get()); + if (!s || s->con != con) { + ldout(cct, 7) << __func__ << " no session on con " << con << dendl; + m->put(); + return; + } + + OSDSession::unique_lock sl(s->lock); + + map<ceph_tid_t, Op *>::iterator iter = s->ops.find(tid); + if (iter == s->ops.end()) { + ldout(cct, 7) << "handle_osd_op_reply " << tid + << (m->is_ondisk() ? " ondisk" : (m->is_onnvram() ? + " onnvram" : " ack")) + << " ... stray" << dendl; + sl.unlock(); + m->put(); + return; + } + + ldout(cct, 7) << "handle_osd_op_reply " << tid + << (m->is_ondisk() ? " ondisk" : + (m->is_onnvram() ? 
" onnvram" : " ack")) + << " uv " << m->get_user_version() + << " in " << m->get_pg() + << " attempt " << m->get_retry_attempt() + << dendl; + Op *op = iter->second; + op->trace.event("osd op reply"); + + if (retry_writes_after_first_reply && op->attempts == 1 && + (op->target.flags & CEPH_OSD_FLAG_WRITE)) { + ldout(cct, 7) << "retrying write after first reply: " << tid << dendl; + if (op->onfinish) { + num_in_flight--; + } + _session_op_remove(s, op); + sl.unlock(); + + _op_submit(op, sul, NULL); + m->put(); + return; + } + + if (m->get_retry_attempt() >= 0) { + if (m->get_retry_attempt() != (op->attempts - 1)) { + ldout(cct, 7) << " ignoring reply from attempt " + << m->get_retry_attempt() + << " from " << m->get_source_inst() + << "; last attempt " << (op->attempts - 1) << " sent to " + << op->session->con->get_peer_addr() << dendl; + m->put(); + sl.unlock(); + return; + } + } else { + // we don't know the request attempt because the server is old, so + // just accept this one. we may do ACK callbacks we shouldn't + // have, but that is better than doing callbacks out of order. 
+ } + + Context *onfinish = 0; + + int rc = m->get_result(); + + if (m->is_redirect_reply()) { + ldout(cct, 5) << " got redirect reply; redirecting" << dendl; + if (op->onfinish) + num_in_flight--; + _session_op_remove(s, op); + sl.unlock(); + + // FIXME: two redirects could race and reorder + + op->tid = 0; + m->get_redirect().combine_with_locator(op->target.target_oloc, + op->target.target_oid.name); + op->target.flags |= (CEPH_OSD_FLAG_REDIRECTED | + CEPH_OSD_FLAG_IGNORE_CACHE | + CEPH_OSD_FLAG_IGNORE_OVERLAY); + _op_submit(op, sul, NULL); + m->put(); + return; + } + + if (rc == -EAGAIN) { + ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl; + if (op->onfinish) + num_in_flight--; + _session_op_remove(s, op); + sl.unlock(); + + op->tid = 0; + op->target.flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + op->target.pgid = pg_t(); + _op_submit(op, sul, NULL); + m->put(); + return; + } + + sul.unlock(); + + if (op->objver) + *op->objver = m->get_user_version(); + if (op->reply_epoch) + *op->reply_epoch = m->get_map_epoch(); + if (op->data_offset) + *op->data_offset = m->get_header().data_off; + + // got data? + if (op->outbl) { +#if 0 + if (op->con) + op->con->revoke_rx_buffer(op->tid); +#endif + auto& bl = m->get_data(); + if (op->outbl->length() == bl.length() && + bl.get_num_buffers() <= 1) { + // this is here to keep previous users to *relied* on getting data + // read into existing buffers happy. Notably, + // libradosstriper::RadosStriperImpl::aio_read(). 
+ ldout(cct,10) << __func__ << " copying resulting " << bl.length() + << " into existing buffer of length " << op->outbl->length() + << dendl; + bufferlist t; + t.claim(*op->outbl); + t.invalidate_crc(); // we're overwriting the raw buffers via c_str() + bl.copy(0, bl.length(), t.c_str()); + op->outbl->substr_of(t, 0, bl.length()); + } else { + m->claim_data(*op->outbl); + } + op->outbl = 0; + } + + // per-op result demuxing + vector<OSDOp> out_ops; + m->claim_ops(out_ops); + + if (out_ops.size() != op->ops.size()) + ldout(cct, 0) << "WARNING: tid " << op->tid << " reply ops " << out_ops + << " != request ops " << op->ops + << " from " << m->get_source_inst() << dendl; + + vector<bufferlist*>::iterator pb = op->out_bl.begin(); + vector<int*>::iterator pr = op->out_rval.begin(); + vector<Context*>::iterator ph = op->out_handler.begin(); + ceph_assert(op->out_bl.size() == op->out_rval.size()); + ceph_assert(op->out_bl.size() == op->out_handler.size()); + vector<OSDOp>::iterator p = out_ops.begin(); + for (unsigned i = 0; + p != out_ops.end() && pb != op->out_bl.end(); + ++i, ++p, ++pb, ++pr, ++ph) { + ldout(cct, 10) << " op " << i << " rval " << p->rval + << " len " << p->outdata.length() << dendl; + if (*pb) + **pb = p->outdata; + // set rval before running handlers so that handlers + // can change it if e.g. decoding fails + if (*pr) + **pr = ceph_to_hostos_errno(p->rval); + if (*ph) { + ldout(cct, 10) << " op " << i << " handler " << *ph << dendl; + (*ph)->complete(ceph_to_hostos_errno(p->rval)); + *ph = NULL; + } + } + + // NOTE: we assume that since we only request ONDISK ever we will + // only ever get back one (type of) ack ever. 
+ + if (op->onfinish) { + num_in_flight--; + onfinish = op->onfinish; + op->onfinish = NULL; + } + logger->inc(l_osdc_op_reply); + + /* get it before we call _finish_op() */ + auto completion_lock = s->get_lock(op->target.base_oid); + + ldout(cct, 15) << "handle_osd_op_reply completed tid " << tid << dendl; + _finish_op(op, 0); + + ldout(cct, 5) << num_in_flight << " in flight" << dendl; + + // serialize completions + if (completion_lock.mutex()) { + completion_lock.lock(); + } + sl.unlock(); + + // do callbacks + if (onfinish) { + onfinish->complete(rc); + } + if (completion_lock.mutex()) { + completion_lock.unlock(); + } + + m->put(); +} + +void Objecter::handle_osd_backoff(MOSDBackoff *m) +{ + ldout(cct, 10) << __func__ << " " << *m << dendl; + shunique_lock sul(rwlock, ceph::acquire_shared); + if (!initialized) { + m->put(); + return; + } + + ConnectionRef con = m->get_connection(); + auto priv = con->get_priv(); + auto s = static_cast<OSDSession*>(priv.get()); + if (!s || s->con != con) { + ldout(cct, 7) << __func__ << " no session on con " << con << dendl; + m->put(); + return; + } + + get_session(s); + + OSDSession::unique_lock sl(s->lock); + + switch (m->op) { + case CEPH_OSD_BACKOFF_OP_BLOCK: + { + // register + OSDBackoff& b = s->backoffs[m->pgid][m->begin]; + s->backoffs_by_id.insert(make_pair(m->id, &b)); + b.pgid = m->pgid; + b.id = m->id; + b.begin = m->begin; + b.end = m->end; + + // ack with original backoff's epoch so that the osd can discard this if + // there was a pg split. 
+ Message *r = new MOSDBackoff(m->pgid, + m->map_epoch, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK, + m->id, m->begin, m->end); + // this priority must match the MOSDOps from _prepare_osd_op + r->set_priority(cct->_conf->osd_client_op_priority); + con->send_message(r); + } + break; + + case CEPH_OSD_BACKOFF_OP_UNBLOCK: + { + auto p = s->backoffs_by_id.find(m->id); + if (p != s->backoffs_by_id.end()) { + OSDBackoff *b = p->second; + if (b->begin != m->begin && + b->end != m->end) { + lderr(cct) << __func__ << " got " << m->pgid << " id " << m->id + << " unblock on [" + << m->begin << "," << m->end << ") but backoff is [" + << b->begin << "," << b->end << ")" << dendl; + // hrmpf, unblock it anyway. + } + ldout(cct, 10) << __func__ << " unblock backoff " << b->pgid + << " id " << b->id + << " [" << b->begin << "," << b->end + << ")" << dendl; + auto spgp = s->backoffs.find(b->pgid); + ceph_assert(spgp != s->backoffs.end()); + spgp->second.erase(b->begin); + if (spgp->second.empty()) { + s->backoffs.erase(spgp); + } + s->backoffs_by_id.erase(p); + + // check for any ops to resend + for (auto& q : s->ops) { + if (q.second->target.actual_pgid == m->pgid) { + int r = q.second->target.contained_by(m->begin, m->end); + ldout(cct, 20) << __func__ << " contained_by " << r << " on " + << q.second->target.get_hobj() << dendl; + if (r) { + _send_op(q.second); + } + } + } + } else { + lderr(cct) << __func__ << " " << m->pgid << " id " << m->id + << " unblock on [" + << m->begin << "," << m->end << ") but backoff dne" << dendl; + } + } + break; + + default: + ldout(cct, 10) << __func__ << " unrecognized op " << (int)m->op << dendl; + } + + sul.unlock(); + sl.unlock(); + + m->put(); + put_session(s); +} + +uint32_t Objecter::list_nobjects_seek(NListContext *list_context, + uint32_t pos) +{ + shared_lock rl(rwlock); + list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP, + pos, list_context->pool_id, string()); + ldout(cct, 10) << __func__ << " " << list_context + << " pos " << pos 
<< " -> " << list_context->pos << dendl; + pg_t actual = osdmap->raw_pg_to_pg(pg_t(pos, list_context->pool_id)); + list_context->current_pg = actual.ps(); + list_context->at_end_of_pool = false; + return pos; +} + +uint32_t Objecter::list_nobjects_seek(NListContext *list_context, + const hobject_t& cursor) +{ + shared_lock rl(rwlock); + ldout(cct, 10) << "list_nobjects_seek " << list_context << dendl; + list_context->pos = cursor; + list_context->at_end_of_pool = false; + pg_t actual = osdmap->raw_pg_to_pg(pg_t(cursor.get_hash(), list_context->pool_id)); + list_context->current_pg = actual.ps(); + list_context->sort_bitwise = true; + return list_context->current_pg; +} + +void Objecter::list_nobjects_get_cursor(NListContext *list_context, + hobject_t *cursor) +{ + shared_lock rl(rwlock); + if (list_context->list.empty()) { + *cursor = list_context->pos; + } else { + const librados::ListObjectImpl& entry = list_context->list.front(); + const string *key = (entry.locator.empty() ? &entry.oid : &entry.locator); + uint32_t h = osdmap->get_pg_pool(list_context->pool_id)->hash_key(*key, entry.nspace); + *cursor = hobject_t(entry.oid, entry.locator, list_context->pool_snap_seq, h, list_context->pool_id, entry.nspace); + } +} + +void Objecter::list_nobjects(NListContext *list_context, Context *onfinish) +{ + ldout(cct, 10) << __func__ << " pool_id " << list_context->pool_id + << " pool_snap_seq " << list_context->pool_snap_seq + << " max_entries " << list_context->max_entries + << " list_context " << list_context + << " onfinish " << onfinish + << " current_pg " << list_context->current_pg + << " pos " << list_context->pos << dendl; + + shared_lock rl(rwlock); + const pg_pool_t *pool = osdmap->get_pg_pool(list_context->pool_id); + if (!pool) { // pool is gone + rl.unlock(); + put_nlist_context_budget(list_context); + onfinish->complete(-ENOENT); + return; + } + int pg_num = pool->get_pg_num(); + bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); + + if 
(list_context->pos.is_min()) { + list_context->starting_pg_num = 0; + list_context->sort_bitwise = sort_bitwise; + list_context->starting_pg_num = pg_num; + } + if (list_context->sort_bitwise != sort_bitwise) { + list_context->pos = hobject_t( + object_t(), string(), CEPH_NOSNAP, + list_context->current_pg, list_context->pool_id, string()); + list_context->sort_bitwise = sort_bitwise; + ldout(cct, 10) << " hobject sort order changed, restarting this pg at " + << list_context->pos << dendl; + } + if (list_context->starting_pg_num != pg_num) { + if (!sort_bitwise) { + // start reading from the beginning; the pgs have changed + ldout(cct, 10) << " pg_num changed; restarting with " << pg_num << dendl; + list_context->pos = collection_list_handle_t(); + } + list_context->starting_pg_num = pg_num; + } + + if (list_context->pos.is_max()) { + ldout(cct, 20) << __func__ << " end of pool, list " + << list_context->list << dendl; + if (list_context->list.empty()) { + list_context->at_end_of_pool = true; + } + // release the listing context's budget once all + // OPs (in the session) are finished + put_nlist_context_budget(list_context); + onfinish->complete(0); + return; + } + + ObjectOperation op; + op.pg_nls(list_context->max_entries, list_context->filter, + list_context->pos, osdmap->get_epoch()); + list_context->bl.clear(); + C_NList *onack = new C_NList(list_context, onfinish, this); + object_locator_t oloc(list_context->pool_id, list_context->nspace); + + // note current_pg in case we don't have (or lose) SORTBITWISE + list_context->current_pg = pool->raw_hash_to_pg(list_context->pos.get_hash()); + rl.unlock(); + + pg_read(list_context->current_pg, oloc, op, + &list_context->bl, 0, onack, &onack->epoch, + &list_context->ctx_budget); +} + +void Objecter::_nlist_reply(NListContext *list_context, int r, + Context *final_finish, epoch_t reply_epoch) +{ + ldout(cct, 10) << __func__ << " " << list_context << dendl; + + auto iter = list_context->bl.cbegin(); + 
pg_nls_response_t response; + bufferlist extra_info; + decode(response, iter); + if (!iter.end()) { + decode(extra_info, iter); + } + + // if the osd returns 1 (newer code), or handle MAX, it means we + // hit the end of the pg. + if ((response.handle.is_max() || r == 1) && + !list_context->sort_bitwise) { + // legacy OSD and !sortbitwise, figure out the next PG on our own + ++list_context->current_pg; + if (list_context->current_pg == list_context->starting_pg_num) { + // end of pool + list_context->pos = hobject_t::get_max(); + } else { + // next pg + list_context->pos = hobject_t(object_t(), string(), CEPH_NOSNAP, + list_context->current_pg, + list_context->pool_id, string()); + } + } else { + list_context->pos = response.handle; + } + + int response_size = response.entries.size(); + ldout(cct, 20) << " response.entries.size " << response_size + << ", response.entries " << response.entries + << ", handle " << response.handle + << ", tentative new pos " << list_context->pos << dendl; + list_context->extra_info.append(extra_info); + if (response_size) { + list_context->list.splice(list_context->list.end(), response.entries); + } + + if (list_context->list.size() >= list_context->max_entries) { + ldout(cct, 20) << " hit max, returning results so far, " + << list_context->list << dendl; + // release the listing context's budget once all + // OPs (in the session) are finished + put_nlist_context_budget(list_context); + final_finish->complete(0); + return; + } + + // continue! 
+ list_nobjects(list_context, final_finish); +} + +void Objecter::put_nlist_context_budget(NListContext *list_context) +{ + if (list_context->ctx_budget >= 0) { + ldout(cct, 10) << " release listing context's budget " << + list_context->ctx_budget << dendl; + put_op_budget_bytes(list_context->ctx_budget); + list_context->ctx_budget = -1; + } +} + +// snapshots + +int Objecter::create_pool_snap(int64_t pool, string& snap_name, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "create_pool_snap; pool: " << pool << "; snap: " + << snap_name << dendl; + + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -EINVAL; + if (p->snap_exists(snap_name.c_str())) + return -EEXIST; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->name = snap_name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_CREATE_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +struct C_SelfmanagedSnap : public Context { + bufferlist bl; + snapid_t *psnapid; + Context *fin; + C_SelfmanagedSnap(snapid_t *ps, Context *f) : psnapid(ps), fin(f) {} + void finish(int r) override { + if (r == 0) { + try { + auto p = bl.cbegin(); + decode(*psnapid, p); + } catch (buffer::error&) { + r = -EIO; + } + } + fin->complete(r); + } +}; + +int Objecter::allocate_selfmanaged_snap(int64_t pool, snapid_t *psnapid, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "allocate_selfmanaged_snap; pool: " << pool << dendl; + PoolOp *op = new PoolOp; + if (!op) return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + C_SelfmanagedSnap *fin = new C_SelfmanagedSnap(psnapid, onfinish); + op->onfinish = fin; + op->blp = &fin->bl; + op->pool_op = POOL_OP_CREATE_UNMANAGED_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + return 0; +} + +int Objecter::delete_pool_snap(int64_t pool, string& snap_name, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool_snap; 
pool: " << pool << "; snap: " + << snap_name << dendl; + + const pg_pool_t *p = osdmap->get_pg_pool(pool); + if (!p) + return -EINVAL; + if (!p->snap_exists(snap_name.c_str())) + return -ENOENT; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->name = snap_name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE_SNAP; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +int Objecter::delete_selfmanaged_snap(int64_t pool, snapid_t snap, + Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_selfmanaged_snap; pool: " << pool << "; snap: " + << snap << dendl; + PoolOp *op = new PoolOp; + if (!op) return -ENOMEM; + op->tid = ++last_tid; + op->pool = pool; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE_UNMANAGED_SNAP; + op->snapid = snap; + pool_ops[op->tid] = op; + + pool_op_submit(op); + + return 0; +} + +int Objecter::create_pool(string& name, Context *onfinish, + int crush_rule) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "create_pool name=" << name << dendl; + + if (osdmap->lookup_pg_pool_name(name) >= 0) + return -EEXIST; + + PoolOp *op = new PoolOp; + if (!op) + return -ENOMEM; + op->tid = ++last_tid; + op->pool = 0; + op->name = name; + op->onfinish = onfinish; + op->pool_op = POOL_OP_CREATE; + pool_ops[op->tid] = op; + op->crush_rule = crush_rule; + + pool_op_submit(op); + + return 0; +} + +int Objecter::delete_pool(int64_t pool, Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool " << pool << dendl; + + if (!osdmap->have_pg_pool(pool)) + return -ENOENT; + + _do_delete_pool(pool, onfinish); + return 0; +} + +int Objecter::delete_pool(const string &pool_name, Context *onfinish) +{ + unique_lock wl(rwlock); + ldout(cct, 10) << "delete_pool " << pool_name << dendl; + + int64_t pool = osdmap->lookup_pg_pool_name(pool_name); + if (pool < 0) + return pool; + + _do_delete_pool(pool, onfinish); + return 0; +} + +void 
Objecter::_do_delete_pool(int64_t pool, Context *onfinish) +{ + PoolOp *op = new PoolOp; + op->tid = ++last_tid; + op->pool = pool; + op->name = "delete"; + op->onfinish = onfinish; + op->pool_op = POOL_OP_DELETE; + pool_ops[op->tid] = op; + pool_op_submit(op); +} + +void Objecter::pool_op_submit(PoolOp *op) +{ + // rwlock is locked + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + pool_op_cancel(op->tid, -ETIMEDOUT); }); + } + _pool_op_submit(op); +} + +void Objecter::_pool_op_submit(PoolOp *op) +{ + // rwlock is locked unique + + ldout(cct, 10) << "pool_op_submit " << op->tid << dendl; + MPoolOp *m = new MPoolOp(monc->get_fsid(), op->tid, op->pool, + op->name, op->pool_op, + last_seen_osdmap_version); + if (op->snapid) m->snapid = op->snapid; + if (op->crush_rule) m->crush_rule = op->crush_rule; + monc->send_mon_message(m); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_poolop_send); +} + +/** + * Handle a reply to a PoolOp message. Check that we sent the message + * and give the caller responsibility for the returned bufferlist. + * Then either call the finisher or stash the PoolOp, depending on if we + * have a new enough map. + * Lastly, clean up the message and PoolOp. 
+ */ +void Objecter::handle_pool_op_reply(MPoolOpReply *m) +{ + FUNCTRACE(cct); + shunique_lock sul(rwlock, acquire_shared); + if (!initialized) { + sul.unlock(); + m->put(); + return; + } + + ldout(cct, 10) << "handle_pool_op_reply " << *m << dendl; + ceph_tid_t tid = m->get_tid(); + map<ceph_tid_t, PoolOp *>::iterator iter = pool_ops.find(tid); + if (iter != pool_ops.end()) { + PoolOp *op = iter->second; + ldout(cct, 10) << "have request " << tid << " at " << op << " Op: " + << ceph_pool_op_name(op->pool_op) << dendl; + if (op->blp) + op->blp->claim(m->response_data); + if (m->version > last_seen_osdmap_version) + last_seen_osdmap_version = m->version; + if (osdmap->get_epoch() < m->epoch) { + sul.unlock(); + sul.lock(); + // recheck op existence since we have let go of rwlock + // (for promotion) above. + iter = pool_ops.find(tid); + if (iter == pool_ops.end()) + goto done; // op is gone. + if (osdmap->get_epoch() < m->epoch) { + ldout(cct, 20) << "waiting for client to reach epoch " << m->epoch + << " before calling back" << dendl; + _wait_for_new_map(op->onfinish, m->epoch, m->replyCode); + } else { + // map epoch changed, probably because a MOSDMap message + // sneaked in. Do caller-specified callback now or else + // we lose it forever. + ceph_assert(op->onfinish); + op->onfinish->complete(m->replyCode); + } + } else { + ceph_assert(op->onfinish); + op->onfinish->complete(m->replyCode); + } + op->onfinish = NULL; + if (!sul.owns_lock()) { + sul.unlock(); + sul.lock(); + } + iter = pool_ops.find(tid); + if (iter != pool_ops.end()) { + _finish_pool_op(op, 0); + } + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + +done: + // Not strictly necessary, since we'll release it on return. 
+ sul.unlock(); + + ldout(cct, 10) << "done" << dendl; + m->put(); +} + +int Objecter::pool_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, PoolOp*>::iterator it = pool_ops.find(tid); + if (it == pool_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + PoolOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + + _finish_pool_op(op, r); + return 0; +} + +void Objecter::_finish_pool_op(PoolOp *op, int r) +{ + // rwlock is locked unique + pool_ops.erase(op->tid); + logger->set(l_osdc_poolop_active, pool_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) { + timer.cancel_event(op->ontimeout); + } + + delete op; +} + +// pool stats + +void Objecter::get_pool_stats(list<string>& pools, + map<string,pool_stat_t> *result, + bool *per_pool, + Context *onfinish) +{ + ldout(cct, 10) << "get_pool_stats " << pools << dendl; + + PoolStatOp *op = new PoolStatOp; + op->tid = ++last_tid; + op->pools = pools; + op->pool_stats = result; + op->per_pool = per_pool; + op->onfinish = onfinish; + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + pool_stat_op_cancel(op->tid, + -ETIMEDOUT); }); + } else { + op->ontimeout = 0; + } + + unique_lock wl(rwlock); + + poolstat_ops[op->tid] = op; + + logger->set(l_osdc_poolstat_active, poolstat_ops.size()); + + _poolstat_submit(op); +} + +void Objecter::_poolstat_submit(PoolStatOp *op) +{ + ldout(cct, 10) << "_poolstat_submit " << op->tid << dendl; + monc->send_mon_message(new MGetPoolStats(monc->get_fsid(), op->tid, + op->pools, + last_seen_pgmap_version)); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_poolstat_send); +} + +void Objecter::handle_get_pool_stats_reply(MGetPoolStatsReply *m) +{ + ldout(cct, 10) << "handle_get_pool_stats_reply " << *m << dendl; + ceph_tid_t tid = 
m->get_tid(); + + unique_lock wl(rwlock); + if (!initialized) { + m->put(); + return; + } + + map<ceph_tid_t, PoolStatOp *>::iterator iter = poolstat_ops.find(tid); + if (iter != poolstat_ops.end()) { + PoolStatOp *op = poolstat_ops[tid]; + ldout(cct, 10) << "have request " << tid << " at " << op << dendl; + *op->pool_stats = m->pool_stats; + *op->per_pool = m->per_pool; + if (m->version > last_seen_pgmap_version) { + last_seen_pgmap_version = m->version; + } + op->onfinish->complete(0); + _finish_pool_stat_op(op, 0); + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + ldout(cct, 10) << "done" << dendl; + m->put(); +} + +int Objecter::pool_stat_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, PoolStatOp*>::iterator it = poolstat_ops.find(tid); + if (it == poolstat_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + PoolStatOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + _finish_pool_stat_op(op, r); + return 0; +} + +void Objecter::_finish_pool_stat_op(PoolStatOp *op, int r) +{ + // rwlock is locked unique + + poolstat_ops.erase(op->tid); + logger->set(l_osdc_poolstat_active, poolstat_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + delete op; +} + +void Objecter::get_fs_stats(ceph_statfs& result, + boost::optional<int64_t> data_pool, + Context *onfinish) +{ + ldout(cct, 10) << "get_fs_stats" << dendl; + unique_lock l(rwlock); + + StatfsOp *op = new StatfsOp; + op->tid = ++last_tid; + op->stats = &result; + op->data_pool = data_pool; + op->onfinish = onfinish; + if (mon_timeout > timespan(0)) { + op->ontimeout = timer.add_event(mon_timeout, + [this, op]() { + statfs_op_cancel(op->tid, + -ETIMEDOUT); }); + } else { + op->ontimeout = 0; + } + statfs_ops[op->tid] = op; + + logger->set(l_osdc_statfs_active, 
statfs_ops.size()); + + _fs_stats_submit(op); +} + +void Objecter::_fs_stats_submit(StatfsOp *op) +{ + // rwlock is locked unique + + ldout(cct, 10) << "fs_stats_submit" << op->tid << dendl; + monc->send_mon_message(new MStatfs(monc->get_fsid(), op->tid, + op->data_pool, + last_seen_pgmap_version)); + op->last_submit = ceph::coarse_mono_clock::now(); + + logger->inc(l_osdc_statfs_send); +} + +void Objecter::handle_fs_stats_reply(MStatfsReply *m) +{ + unique_lock wl(rwlock); + if (!initialized) { + m->put(); + return; + } + + ldout(cct, 10) << "handle_fs_stats_reply " << *m << dendl; + ceph_tid_t tid = m->get_tid(); + + if (statfs_ops.count(tid)) { + StatfsOp *op = statfs_ops[tid]; + ldout(cct, 10) << "have request " << tid << " at " << op << dendl; + *(op->stats) = m->h.st; + if (m->h.version > last_seen_pgmap_version) + last_seen_pgmap_version = m->h.version; + op->onfinish->complete(0); + _finish_statfs_op(op, 0); + } else { + ldout(cct, 10) << "unknown request " << tid << dendl; + } + m->put(); + ldout(cct, 10) << "done" << dendl; +} + +int Objecter::statfs_op_cancel(ceph_tid_t tid, int r) +{ + ceph_assert(initialized); + + unique_lock wl(rwlock); + + map<ceph_tid_t, StatfsOp*>::iterator it = statfs_ops.find(tid); + if (it == statfs_ops.end()) { + ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl; + return -ENOENT; + } + + ldout(cct, 10) << __func__ << " tid " << tid << dendl; + + StatfsOp *op = it->second; + if (op->onfinish) + op->onfinish->complete(r); + _finish_statfs_op(op, r); + return 0; +} + +void Objecter::_finish_statfs_op(StatfsOp *op, int r) +{ + // rwlock is locked unique + + statfs_ops.erase(op->tid); + logger->set(l_osdc_statfs_active, statfs_ops.size()); + + if (op->ontimeout && r != -ETIMEDOUT) + timer.cancel_event(op->ontimeout); + + delete op; +} + +// scatter/gather + +void Objecter::_sg_read_finish(vector<ObjectExtent>& extents, + vector<bufferlist>& resultbl, + bufferlist *bl, Context *onfinish) +{ + // all done + ldout(cct, 
15) << "_sg_read_finish" << dendl; + + if (extents.size() > 1) { + Striper::StripedReadResult r; + vector<bufferlist>::iterator bit = resultbl.begin(); + for (vector<ObjectExtent>::iterator eit = extents.begin(); + eit != extents.end(); + ++eit, ++bit) { + r.add_partial_result(cct, *bit, eit->buffer_extents); + } + bl->clear(); + r.assemble_result(cct, *bl, false); + } else { + ldout(cct, 15) << " only one frag" << dendl; + bl->claim(resultbl[0]); + } + + // done + uint64_t bytes_read = bl->length(); + ldout(cct, 7) << "_sg_read_finish " << bytes_read << " bytes" << dendl; + + if (onfinish) { + onfinish->complete(bytes_read);// > 0 ? bytes_read:m->get_result()); + } +} + + +void Objecter::ms_handle_connect(Connection *con) +{ + ldout(cct, 10) << "ms_handle_connect " << con << dendl; + if (!initialized) + return; + + if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) + resend_mon_ops(); +} + +bool Objecter::ms_handle_reset(Connection *con) +{ + if (!initialized) + return false; + if (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD) { + unique_lock wl(rwlock); + + auto priv = con->get_priv(); + auto session = static_cast<OSDSession*>(priv.get()); + if (session) { + ldout(cct, 1) << "ms_handle_reset " << con << " session " << session + << " osd." << session->osd << dendl; + // the session maybe had been closed if new osdmap just handled + // says the osd down + if (!(initialized && osdmap->is_up(session->osd))) { + ldout(cct, 1) << "ms_handle_reset aborted,initialized=" << initialized << dendl; + wl.unlock(); + return false; + } + map<uint64_t, LingerOp *> lresend; + OSDSession::unique_lock sl(session->lock); + _reopen_session(session); + _kick_requests(session, lresend); + sl.unlock(); + _linger_ops_resend(lresend, wl); + wl.unlock(); + maybe_request_map(); + } + return true; + } + return false; +} + +void Objecter::ms_handle_remote_reset(Connection *con) +{ + /* + * treat these the same. 
 */
  ms_handle_reset(con);
}

// Messenger callback: the peer refused the connection.  Only logs when
// the address maps to a known OSD; returns false (no special handling).
bool Objecter::ms_handle_refused(Connection *con)
{
  // just log for now
  if (osdmap && (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD)) {
    int osd = osdmap->identify_osd(con->get_peer_addr());
    if (osd >= 0) {
      ldout(cct, 1) << "ms_handle_refused on osd." << osd << dendl;
    }
  }
  return false;
}

// Messenger callback: build an authorizer for an outgoing connection.
// Monitor connections need none; others get one from the mon client.
bool Objecter::ms_get_authorizer(int dest_type,
				 AuthAuthorizer **authorizer)
{
  if (!initialized)
    return false;
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monc->build_authorizer(dest_type);
  return *authorizer != NULL;
}

// Emit this op target's addressing state (pg, osd, oids, locators, and
// the paused/used_replica/precalc_pgid flags) into a Formatter.
void Objecter::op_target_t::dump(Formatter *f) const
{
  f->dump_stream("pg") << pgid;
  f->dump_int("osd", osd);
  f->dump_stream("object_id") << base_oid;
  f->dump_stream("object_locator") << base_oloc;
  f->dump_stream("target_object_id") << target_oid;
  f->dump_stream("target_object_locator") << target_oloc;
  f->dump_int("paused", (int)paused);
  f->dump_int("used_replica", (int)used_replica);
  f->dump_int("precalc_pgid", (int)precalc_pgid);
}

// Log (level 20) every in-flight op on one session.
void Objecter::_dump_active(OSDSession *s)
{
  for (map<ceph_tid_t,Op*>::iterator p = s->ops.begin();
       p != s->ops.end();
       ++p) {
    Op *op = p->second;
    ldout(cct, 20) << op->tid << "\t" << op->target.pgid
		   << "\tosd." << (op->session ? op->session->osd : -1)
		   << "\t" << op->target.base_oid
		   << "\t" << op->ops << dendl;
  }
}

// Log every in-flight op across all sessions, including homeless ops
// (ops with no current OSD target).
void Objecter::_dump_active()
{
  ldout(cct, 20) << "dump_active .. " << num_homeless_ops << " homeless"
		 << dendl;
  for (map<int, OSDSession *>::iterator siter = osd_sessions.begin();
       siter != osd_sessions.end(); ++siter) {
    OSDSession *s = siter->second;
    OSDSession::shared_lock sl(s->lock);
    _dump_active(s);
    sl.unlock();
  }
  _dump_active(homeless_session);
}

// Public wrapper: takes the objecter read lock, then dumps.
void Objecter::dump_active()
{
  shared_lock rl(rwlock);
  _dump_active();
  rl.unlock();
}

// Dump every category of outstanding request into a Formatter
// (admin-socket support).
void Objecter::dump_requests(Formatter *fmt)
{
  // Read-lock on Objecter held here
  fmt->open_object_section("requests");
  dump_ops(fmt);
  dump_linger_ops(fmt);
  dump_pool_ops(fmt);
  dump_pool_stat_ops(fmt);
  dump_statfs_ops(fmt);
  dump_command_ops(fmt);
  fmt->close_section(); // requests object
}

// Dump each op on one session: tid, target, timing, attempts, snap
// context, and the individual OSD ops it carries.
void Objecter::_dump_ops(const OSDSession *s, Formatter *fmt)
{
  for (map<ceph_tid_t,Op*>::const_iterator p = s->ops.begin();
       p != s->ops.end();
       ++p) {
    Op *op = p->second;
    // age since last send, in seconds
    auto age = std::chrono::duration<double>(coarse_mono_clock::now() - op->stamp);
    fmt->open_object_section("op");
    fmt->dump_unsigned("tid", op->tid);
    op->target.dump(fmt);
    fmt->dump_stream("last_sent") << op->stamp;
    fmt->dump_float("age", age.count());
    fmt->dump_int("attempts", op->attempts);
    fmt->dump_stream("snapid") << op->snapid;
    fmt->dump_stream("snap_context") << op->snapc;
    fmt->dump_stream("mtime") << op->mtime;

    fmt->open_array_section("osd_ops");
    for (vector<OSDOp>::const_iterator it = op->ops.begin();
	 it != op->ops.end();
	 ++it) {
      fmt->dump_stream("osd_op") << *it;
    }
    fmt->close_section(); // osd_ops array

    fmt->close_section(); // op object
  }
}

// Dump ops from every session (per-session read lock) plus homeless ops.
void Objecter::dump_ops(Formatter *fmt)
{
  // Read-lock on Objecter held
  fmt->open_array_section("ops");
  for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin();
       siter != osd_sessions.end(); ++siter) {
    OSDSession *s = siter->second;
    OSDSession::shared_lock sl(s->lock);
    _dump_ops(s, fmt);
    sl.unlock();
  }
  _dump_ops(homeless_session, fmt);
  fmt->close_section(); // ops array
}

// Dump each watch/notify linger op on one session.
void Objecter::_dump_linger_ops(const OSDSession *s, Formatter *fmt)
{
  for (map<uint64_t, LingerOp*>::const_iterator p = s->linger_ops.begin();
       p != s->linger_ops.end();
       ++p) {
    LingerOp *op = p->second;
    fmt->open_object_section("linger_op");
    fmt->dump_unsigned("linger_id", op->linger_id);
    op->target.dump(fmt);
    fmt->dump_stream("snapid") << op->snap;
    fmt->dump_stream("registered") << op->registered;
    fmt->close_section(); // linger_op object
  }
}

// Dump linger ops from every session plus the homeless session.
void Objecter::dump_linger_ops(Formatter *fmt)
{
  // We have a read-lock on the objecter
  fmt->open_array_section("linger_ops");
  for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin();
       siter != osd_sessions.end(); ++siter) {
    OSDSession *s = siter->second;
    OSDSession::shared_lock sl(s->lock);
    _dump_linger_ops(s, fmt);
    sl.unlock();
  }
  _dump_linger_ops(homeless_session, fmt);
  fmt->close_section(); // linger_ops array
}

// Dump each OSD command op on one session: id, osd, command words, and
// either the target osd or the target pg.
void Objecter::_dump_command_ops(const OSDSession *s, Formatter *fmt)
{
  for (map<uint64_t, CommandOp*>::const_iterator p = s->command_ops.begin();
       p != s->command_ops.end();
       ++p) {
    CommandOp *op = p->second;
    fmt->open_object_section("command_op");
    fmt->dump_unsigned("command_id", op->tid);
    fmt->dump_int("osd", op->session ? op->session->osd : -1);
    fmt->open_array_section("command");
    for (vector<string>::const_iterator q = op->cmd.begin();
	 q != op->cmd.end(); ++q)
      fmt->dump_string("word", *q);
    fmt->close_section();
    if (op->target_osd >= 0)
      fmt->dump_int("target_osd", op->target_osd);
    else
      fmt->dump_stream("target_pg") << op->target_pg;
    fmt->close_section(); // command_op object
  }
}

// Dump command ops from every session plus the homeless session.
void Objecter::dump_command_ops(Formatter *fmt)
{
  // We have a read-lock on the Objecter here
  fmt->open_array_section("command_ops");
  for (map<int, OSDSession *>::const_iterator siter = osd_sessions.begin();
       siter != osd_sessions.end(); ++siter) {
    OSDSession *s = siter->second;
    OSDSession::shared_lock sl(s->lock);
    _dump_command_ops(s, fmt);
    sl.unlock();
  }
  _dump_command_ops(homeless_session, fmt);
  fmt->close_section(); // command_ops array
}

// Dump outstanding pool mutation ops (create/delete/snap etc.).
void Objecter::dump_pool_ops(Formatter *fmt) const
{
  fmt->open_array_section("pool_ops");
  for (map<ceph_tid_t, PoolOp*>::const_iterator p = pool_ops.begin();
       p != pool_ops.end();
       ++p) {
    PoolOp *op = p->second;
    fmt->open_object_section("pool_op");
    fmt->dump_unsigned("tid", op->tid);
    fmt->dump_int("pool", op->pool);
    fmt->dump_string("name", op->name);
    fmt->dump_int("operation_type", op->pool_op);
    fmt->dump_unsigned("crush_rule", op->crush_rule);
    fmt->dump_stream("snapid") << op->snapid;
    fmt->dump_stream("last_sent") << op->last_submit;
    fmt->close_section(); // pool_op object
  }
  fmt->close_section(); // pool_ops array
}

// Dump outstanding pool-stat requests and the pools each one queries.
void Objecter::dump_pool_stat_ops(Formatter *fmt) const
{
  fmt->open_array_section("pool_stat_ops");
  for (map<ceph_tid_t, PoolStatOp*>::const_iterator p = poolstat_ops.begin();
       p != poolstat_ops.end();
       ++p) {
    PoolStatOp *op = p->second;
    fmt->open_object_section("pool_stat_op");
    fmt->dump_unsigned("tid", op->tid);
    fmt->dump_stream("last_sent") << op->last_submit;

    fmt->open_array_section("pools");
    for (list<string>::const_iterator it =
	   op->pools.begin();
	 it != op->pools.end();
	 ++it) {
      fmt->dump_string("pool", *it);
    }
    fmt->close_section(); // pools array

    fmt->close_section(); // pool_stat_op object
  }
  fmt->close_section(); // pool_stat_ops array
}

// Dump outstanding statfs requests (tid and last submit time).
void Objecter::dump_statfs_ops(Formatter *fmt) const
{
  fmt->open_array_section("statfs_ops");
  for (map<ceph_tid_t, StatfsOp*>::const_iterator p = statfs_ops.begin();
       p != statfs_ops.end();
       ++p) {
    StatfsOp *op = p->second;
    fmt->open_object_section("statfs_op");
    fmt->dump_unsigned("tid", op->tid);
    fmt->dump_stream("last_sent") << op->last_submit;
    fmt->close_section(); // statfs_op object
  }
  fmt->close_section(); // statfs_ops array
}

// Admin-socket hook that exposes the objecter's outstanding requests.
Objecter::RequestStateHook::RequestStateHook(Objecter *objecter) :
  m_objecter(objecter)
{
}

// Admin-socket entry point: format all outstanding requests into `out`
// using the requested format (default json-pretty).
bool Objecter::RequestStateHook::call(std::string_view command,
				      const cmdmap_t& cmdmap,
				      std::string_view format,
				      bufferlist& out)
{
  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
  shared_lock rl(m_objecter->rwlock);
  m_objecter->dump_requests(f);
  f->flush(out);
  delete f;
  return true;
}

// Ask the monitors to add (set=true) or remove (set=false) this
// client's address from the OSD blacklist, via a mon command.
void Objecter::blacklist_self(bool set)
{
  ldout(cct, 10) << "blacklist_self " << (set ? "add" : "rm") << dendl;

  vector<string> cmd;
  cmd.push_back("{\"prefix\":\"osd blacklist\", ");
  if (set)
    cmd.push_back("\"blacklistop\":\"add\",");
  else
    cmd.push_back("\"blacklistop\":\"rm\",");
  stringstream ss;
  // this is somewhat imprecise in that we are blacklisting our first addr only
  ss << messenger->get_myaddrs().front().get_legacy_str();
  cmd.push_back("\"addr\":\"" + ss.str() + "\"");

  MMonCommand *m = new MMonCommand(monc->get_fsid());
  m->cmd = cmd;

  monc->send_mon_message(m);
}

// commands

// Handle an OSD's MCommandReply: locate the matching CommandOp on the
// connection's session, verify the reply came over the op's current
// connection, capture the output data, and finish the op.  Always
// consumes (puts) the message.
void Objecter::handle_command_reply(MCommandReply *m)
{
  unique_lock wl(rwlock);
  if (!initialized) {
    m->put();
    return;
  }

  ConnectionRef con = m->get_connection();
  auto priv = con->get_priv();
  auto s = static_cast<OSDSession*>(priv.get());
  if (!s || s->con != con) {
    ldout(cct, 7) << __func__ << " no session on con " << con << dendl;
    m->put();
    return;
  }

  OSDSession::shared_lock sl(s->lock);
  map<ceph_tid_t,CommandOp*>::iterator p = s->command_ops.find(m->get_tid());
  if (p == s->command_ops.end()) {
    ldout(cct, 10) << "handle_command_reply tid " << m->get_tid()
		   << " not found" << dendl;
    m->put();
    sl.unlock();
    return;
  }

  CommandOp *c = p->second;
  if (!c->session ||
      m->get_connection() != c->session->con) {
    // stale reply from a previous connection; a resend is/was in flight
    ldout(cct, 10) << "handle_command_reply tid " << m->get_tid()
		   << " got reply from wrong connection "
		   << m->get_connection() << " " << m->get_source_inst()
		   << dendl;
    m->put();
    sl.unlock();
    return;
  }
  if (c->poutbl) {
    c->poutbl->claim(m->get_data());
  }

  sl.unlock();

  // re-take the session lock exclusively to finish (and remove) the op
  OSDSession::unique_lock sul(s->lock);
  _finish_command(c, m->r, m->rs);
  sul.unlock();

  m->put();
}

// Submit an OSD command: assign a tid, park it on the homeless session,
// compute its target, and either send it or wait for a usable osdmap.
void Objecter::submit_command(CommandOp *c, ceph_tid_t *ptid)
{
  shunique_lock sul(rwlock, ceph::acquire_unique);

  ceph_tid_t tid = ++last_tid;
  ldout(cct, 10) << "_submit_command " << tid << " " << c->cmd << dendl;
  c->tid = tid;

  {
    OSDSession::unique_lock
      hs_wl(homeless_session->lock);
    _session_command_op_assign(homeless_session, c);
  }

  _calc_command_target(c, sul);
  _assign_command_session(c, sul);
  if (osd_timeout > timespan(0)) {
    // arm a timer that cancels the command with -ETIMEDOUT
    c->ontimeout = timer.add_event(osd_timeout,
				   [this, c, tid]() {
				     command_op_cancel(c->session, tid,
						       -ETIMEDOUT); });
  }

  if (!c->session->is_homeless()) {
    _send_command(c);
  } else {
    // no usable target yet; ask for a newer osdmap
    _maybe_request_map();
  }
  if (c->map_check_error)
    _send_command_map_check(c);
  *ptid = tid;

  logger->inc(l_osdc_command_active);
}

// Work out which OSD a command should go to.  For an explicit
// target_osd, validate it exists and is up; otherwise compute the
// target from the pg via _calc_target().  Records any map-related
// error in c->map_check_error{,_str} and returns a RECALC_OP_TARGET_*
// code indicating whether a resend/new session is needed.
int Objecter::_calc_command_target(CommandOp *c, shunique_lock& sul)
{
  ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);

  c->map_check_error = 0;

  // ignore overlays, just like we do with pg ops
  c->target.flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;

  if (c->target_osd >= 0) {
    if (!osdmap->exists(c->target_osd)) {
      c->map_check_error = -ENOENT;
      c->map_check_error_str = "osd dne";
      c->target.osd = -1;
      return RECALC_OP_TARGET_OSD_DNE;
    }
    if (osdmap->is_down(c->target_osd)) {
      c->map_check_error = -ENXIO;
      c->map_check_error_str = "osd down";
      c->target.osd = -1;
      return RECALC_OP_TARGET_OSD_DOWN;
    }
    c->target.osd = c->target_osd;
  } else {
    int ret = _calc_target(&(c->target), nullptr, true);
    if (ret == RECALC_OP_TARGET_POOL_DNE) {
      c->map_check_error = -ENOENT;
      c->map_check_error_str = "pool dne";
      c->target.osd = -1;
      return ret;
    } else if (ret == RECALC_OP_TARGET_OSD_DOWN) {
      c->map_check_error = -ENXIO;
      c->map_check_error_str = "osd down";
      c->target.osd = -1;
      return ret;
    }
  }

  OSDSession *s;
  int r = _get_session(c->target.osd, &s, sul);
  ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */

  if (c->session != s) {
    put_session(s);
    return RECALC_OP_TARGET_NEED_RESEND;
  }

  put_session(s);

  ldout(cct, 20) << "_recalc_command_target " << c->tid << " no change, "
		 << c->session << dendl;

  return RECALC_OP_TARGET_NO_ACTION;
}

// Move the command onto the session for its computed target, removing
// it from its previous session (possibly the homeless one) first.
// _get_session takes a ref; put_session drops our temporary one.
void Objecter::_assign_command_session(CommandOp *c,
				       shunique_lock& sul)
{
  ceph_assert(sul.owns_lock() && sul.mutex() == &rwlock);

  OSDSession *s;
  int r = _get_session(c->target.osd, &s, sul);
  ceph_assert(r != -EAGAIN); /* shouldn't happen as we're holding the write lock */

  if (c->session != s) {
    if (c->session) {
      OSDSession *cs = c->session;
      OSDSession::unique_lock csl(cs->lock);
      _session_command_op_remove(c->session, c);
      csl.unlock();
    }
    OSDSession::unique_lock sl(s->lock);
    _session_command_op_assign(s, c);
  }

  put_session(s);
}

// Wrap the command in an MCommand and send it over the session's
// connection.  Requires an established session.
void Objecter::_send_command(CommandOp *c)
{
  ldout(cct, 10) << "_send_command " << c->tid << dendl;
  ceph_assert(c->session);
  ceph_assert(c->session->con);
  MCommand *m = new MCommand(monc->monmap.fsid);
  m->cmd = c->cmd;
  m->set_data(c->inbl);
  m->set_tid(c->tid);
  c->session->con->send_message(m);
  logger->inc(l_osdc_command_send);
}

// Cancel an in-flight command on session s, finishing it with r.
// Returns 0 on success, -ENOENT if tid is not pending on that session.
int Objecter::command_op_cancel(OSDSession *s, ceph_tid_t tid, int r)
{
  ceph_assert(initialized);

  unique_lock wl(rwlock);

  map<ceph_tid_t, CommandOp*>::iterator it = s->command_ops.find(tid);
  if (it == s->command_ops.end()) {
    ldout(cct, 10) << __func__ << " tid " << tid << " dne" << dendl;
    return -ENOENT;
  }

  ldout(cct, 10) << __func__ << " tid " << tid << dendl;

  CommandOp *op = it->second;
  _command_cancel_map_check(op);
  OSDSession::unique_lock sl(op->session->lock);
  _finish_command(op, r, "");
  sl.unlock();
  return 0;
}

// Complete a command op: deliver status string and result to the
// caller, cancel its timeout (unless finishing due to timeout),
// detach it from its session, and drop our ref.
void Objecter::_finish_command(CommandOp *c, int r, string rs)
{
  // rwlock is locked unique
  // session lock is locked

  ldout(cct, 10) << "_finish_command " << c->tid << " = " << r << " "
		 << rs << dendl;
  if (c->prs)
    *c->prs = rs;
  if (c->onfinish)
    c->onfinish->complete(r);

  if (c->ontimeout && r != -ETIMEDOUT)
    timer.cancel_event(c->ontimeout);

  _session_command_op_remove(c->session, c);

  c->put();

  logger->dec(l_osdc_command_active);
}

Objecter::OSDSession::~OSDSession()
{
  // Caller is responsible for re-assigning or
  // destroying any ops that were assigned to us
  ceph_assert(ops.empty());
  ceph_assert(linger_ops.empty());
  ceph_assert(command_ops.empty());
}

// Destructor: asserts that shutdown left no outstanding state behind
// (all ops retired, all sessions closed, hooks and perf counters torn
// down) before freeing the osdmap and the homeless session.
Objecter::~Objecter()
{
  delete osdmap;

  ceph_assert(homeless_session->get_nref() == 1);
  ceph_assert(num_homeless_ops == 0);
  homeless_session->put();

  ceph_assert(osd_sessions.empty());
  ceph_assert(poolstat_ops.empty());
  ceph_assert(statfs_ops.empty());
  ceph_assert(pool_ops.empty());
  ceph_assert(waiting_for_map.empty());
  ceph_assert(linger_ops.empty());
  ceph_assert(check_latest_map_lingers.empty());
  ceph_assert(check_latest_map_ops.empty());
  ceph_assert(check_latest_map_commands.empty());

  ceph_assert(!m_request_state_hook);
  ceph_assert(!logger);
}

/**
 * Wait until this OSD map epoch is received before
 * sending any more operations to OSDs.  Use this
 * when it is known that the client can't trust
 * anything from before this epoch (e.g. due to
 * client blacklist at this epoch).
 */
void Objecter::set_epoch_barrier(epoch_t epoch)
{
  unique_lock wl(rwlock);

  ldout(cct, 7) << __func__ << ": barrier " << epoch << " (was "
		<< epoch_barrier << ") current epoch " << osdmap->get_epoch()
		<< dendl;
  // the barrier only ever moves forward
  if (epoch > epoch_barrier) {
    epoch_barrier = epoch;
    _maybe_request_map();
  }
}



// Sentinel cursors for object enumeration: begin is the zero hobject,
// end is the maximal hobject.
hobject_t Objecter::enumerate_objects_begin()
{
  return hobject_t();
}

hobject_t Objecter::enumerate_objects_end()
{
  return hobject_t::get_max();
}

// Completion context for one enumerate_objects() round trip; forwards
// the raw reply buffer and op metadata to _enumerate_reply().
struct C_EnumerateReply : public Context {
  bufferlist bl;

  Objecter *objecter;
  hobject_t *next;
  std::list<librados::ListObjectImpl> *result;
  const hobject_t end;
  const int64_t pool_id;
  Context *on_finish;

  epoch_t epoch;
  int budget;

  C_EnumerateReply(Objecter *objecter_, hobject_t *next_,
      std::list<librados::ListObjectImpl> *result_,
      const hobject_t end_, const int64_t pool_id_, Context *on_finish_) :
    objecter(objecter_), next(next_), result(result_),
    end(end_), pool_id(pool_id_), on_finish(on_finish_),
    epoch(0), budget(-1)
  {}

  void finish(int r) override {
    objecter->_enumerate_reply(
      bl, r, end, pool_id, budget, epoch, result, next, on_finish);
  }
};

// List up to `max` objects in [start, end) of a pool/namespace via a
// PGNLS op.  Validates arguments and cluster flags, then issues an
// async pg_read; results are delivered through C_EnumerateReply /
// _enumerate_reply to `result`, `next` and `on_finish`.
void Objecter::enumerate_objects(
  int64_t pool_id,
  const std::string &ns,
  const hobject_t &start,
  const hobject_t &end,
  const uint32_t max,
  const bufferlist &filter_bl,
  std::list<librados::ListObjectImpl> *result,
  hobject_t *next,
  Context *on_finish)
{
  ceph_assert(result);

  if (!end.is_max() && start > end) {
    lderr(cct) << __func__ << ": start " << start << " > end " << end << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  if (max < 1) {
    lderr(cct) << __func__ << ": result size may not be zero" << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  // starting at the max cursor means enumeration is already complete
  if (start.is_max()) {
    on_finish->complete(0);
    return;
  }

  shared_lock rl(rwlock);
  ceph_assert(osdmap->get_epoch());
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    rl.unlock();
    lderr(cct) <<
      __func__ << ": SORTBITWISE cluster flag not set" << dendl;
    on_finish->complete(-EOPNOTSUPP);
    return;
  }
  const pg_pool_t *p = osdmap->get_pg_pool(pool_id);
  if (!p) {
    lderr(cct) << __func__ << ": pool " << pool_id << " DNE in osd epoch "
	       << osdmap->get_epoch() << dendl;
    rl.unlock();
    on_finish->complete(-ENOENT);
    return;
  } else {
    rl.unlock();
  }

  ldout(cct, 20) << __func__ << ": start=" << start << " end=" << end << dendl;

  // Stash completion state
  C_EnumerateReply *on_ack = new C_EnumerateReply(
    this, next, result, end, pool_id, on_finish);

  ObjectOperation op;
  op.pg_nls(max, filter_bl, start, 0);

  // Issue.  See you later in _enumerate_reply
  object_locator_t oloc(pool_id, ns);
  pg_read(start.get_hash(), oloc, op,
	  &on_ack->bl, 0, on_ack, &on_ack->epoch, &on_ack->budget);
}

// Completion path for enumerate_objects(): return any throttle budget,
// decode the PGNLS response, clamp the continuation cursor to `end`,
// drop any entries that hash past `end`, merge the survivors into
// `result`, and complete `on_finish`.
void Objecter::_enumerate_reply(
  bufferlist &bl,
  int r,
  const hobject_t &end,
  const int64_t pool_id,
  int budget,
  epoch_t reply_epoch,
  std::list<librados::ListObjectImpl> *result,
  hobject_t *next,
  Context *on_finish)
{
  if (budget >= 0) {
    put_op_budget_bytes(budget);
  }

  if (r < 0) {
    ldout(cct, 4) << __func__ << ": remote error " << r << dendl;
    on_finish->complete(r);
    return;
  }

  ceph_assert(next != NULL);

  // Decode the results
  auto iter = bl.cbegin();
  pg_nls_response_t response;

  // XXX extra_info doesn't seem used anywhere?
  bufferlist extra_info;
  decode(response, iter);
  if (!iter.end()) {
    decode(extra_info, iter);
  }

  ldout(cct, 10) << __func__ << ": got " << response.entries.size()
		 << " handle " << response.handle
		 << " reply_epoch " << reply_epoch << dendl;
  ldout(cct, 20) << __func__ << ": response.entries.size "
		 << response.entries.size() << ", response.entries "
		 << response.entries << dendl;
  if (response.handle <= end) {
    *next = response.handle;
  } else {
    ldout(cct, 10) << __func__ << ": adjusted next down to end " << end
		   << dendl;
    *next = end;

    // drop anything after 'end'
    shared_lock rl(rwlock);
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      // pool is gone, drop any results which are now meaningless.
      rl.unlock();
      on_finish->complete(-ENOENT);
      return;
    }
    while (!response.entries.empty()) {
      // rebuild the hobject for the last entry to compare against end;
      // hash on the locator when one is set, else on the oid
      uint32_t hash = response.entries.back().locator.empty() ?
	pool->hash_key(response.entries.back().oid,
		       response.entries.back().nspace) :
	pool->hash_key(response.entries.back().locator,
		       response.entries.back().nspace);
      hobject_t last(response.entries.back().oid,
		     response.entries.back().locator,
		     CEPH_NOSNAP,
		     hash,
		     pool_id,
		     response.entries.back().nspace);
      if (last < end)
	break;
      ldout(cct, 20) << __func__ << " dropping item " << last
		     << " >= end " << end << dendl;
      response.entries.pop_back();
    }
    rl.unlock();
  }
  if (!response.entries.empty()) {
    result->merge(response.entries);
  }

  // release the listing context's budget once all
  // OPs (in the session) are finished
#if 0
  put_nlist_context_budget(list_context);
#endif
  on_finish->complete(r);
  return;
}

namespace {
  using namespace librados;

  // Decode each bufferlist in bls into a T and append it to items.
  template <typename T>
  void do_decode(std::vector<T>& items, std::vector<bufferlist>& bls)
  {
    for (auto bl : bls) {
      auto p = bl.cbegin();
      T t;
      decode(t, p);
      items.push_back(t);
    }
  }

  // Completion context for a SCRUBLS op: decodes a scrub_ls_result_t
  // out of bl into either `objects` or `snapsets` (exactly one is set).
  struct C_ObjectOperation_scrub_ls : public Context {
    bufferlist bl;
    uint32_t *interval;
    std::vector<inconsistent_obj_t> *objects = nullptr;
    std::vector<inconsistent_snapset_t> *snapsets = nullptr;
    int *rval;

    C_ObjectOperation_scrub_ls(uint32_t *interval,
			       std::vector<inconsistent_obj_t> *objects,
			       int *rval)
      : interval(interval), objects(objects), rval(rval) {}
    C_ObjectOperation_scrub_ls(uint32_t *interval,
			       std::vector<inconsistent_snapset_t> *snapsets,
			       int *rval)
      : interval(interval), snapsets(snapsets), rval(rval) {}
    void finish(int r) override {
      if (r < 0 && r != -EAGAIN) {
	if (rval)
	  *rval = r;
	return;
      }

      if (rval)
	*rval = 0;

      try {
	decode();
      } catch (buffer::error&) {
	if (rval)
	  *rval = -EIO;
      }
    }
  private:
    void decode() {
      scrub_ls_result_t result;
      auto p = bl.cbegin();
      result.decode(p);
      *interval = result.interval;
      if (objects) {
	do_decode(*objects, result.vals);
      } else {
	do_decode(*snapsets, result.vals);
      }
    }
  };

  // Append a CEPH_OSD_OP_SCRUBLS pg op carrying `arg` and wire up the
  // decoding context for its output.
  template <typename T>
  void do_scrub_ls(::ObjectOperation *op,
		   const scrub_ls_arg_t& arg,
		   std::vector<T> *items,
		   uint32_t *interval,
		   int *rval)
  {
    OSDOp& osd_op = op->add_op(CEPH_OSD_OP_SCRUBLS);
    op->flags |= CEPH_OSD_FLAG_PGOP;
    ceph_assert(interval);
    arg.encode(osd_op.indata);
    unsigned p = op->ops.size() - 1;
    auto *h = new C_ObjectOperation_scrub_ls{interval, items, rval};
    op->out_handler[p] = h;
    op->out_bl[p] = &h->bl;
    op->out_rval[p] = rval;
  }
}

// List inconsistent objects found by scrub (type 0 query).
void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after,
				 uint64_t max_to_get,
				 std::vector<librados::inconsistent_obj_t> *objects,
				 uint32_t *interval,
				 int *rval)
{
  scrub_ls_arg_t arg = {*interval, 0, start_after, max_to_get};
  do_scrub_ls(this, arg, objects, interval, rval);
}

// List inconsistent snapsets found by scrub (type 1 query).
void ::ObjectOperation::scrub_ls(const librados::object_id_t& start_after,
				 uint64_t max_to_get,
				 std::vector<librados::inconsistent_snapset_t> *snapsets,
				 uint32_t *interval,
				 int *rval)
{
  scrub_ls_arg_t arg = {*interval, 1, start_after, max_to_get};
do_scrub_ls(this, arg, snapsets, interval, rval); +} diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h new file mode 100644 index 00000000..ca8d85f7 --- /dev/null +++ b/src/osdc/Objecter.h @@ -0,0 +1,3067 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OBJECTER_H +#define CEPH_OBJECTER_H + +#include <condition_variable> +#include <list> +#include <map> +#include <mutex> +#include <memory> +#include <sstream> +#include <type_traits> + +#include <boost/thread/shared_mutex.hpp> + +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/types.h" +#include "include/rados/rados_types.hpp" + +#include "common/admin_socket.h" +#include "common/ceph_time.h" +#include "common/ceph_timer.h" +#include "common/config_obs.h" +#include "common/shunique_lock.h" +#include "common/zipkin_trace.h" +#include "common/Finisher.h" +#include "common/Throttle.h" + +#include "messages/MOSDOp.h" +#include "msg/Dispatcher.h" +#include "osd/OSDMap.h" + + +class Context; +class Messenger; +class OSDMap; +class MonClient; +class Message; +class Finisher; + +class MPoolOpReply; + +class MGetPoolStatsReply; +class MStatfsReply; +class MCommandReply; +class MWatchNotify; + +class PerfCounters; + +// ----------------------------------------- + +struct ObjectOperation { + vector<OSDOp> ops; + int flags; + int priority; + + vector<bufferlist*> out_bl; + vector<Context*> out_handler; + vector<int*> out_rval; + + ObjectOperation() : flags(0), priority(0) {} + ~ObjectOperation() { + while (!out_handler.empty()) { + delete out_handler.back(); + out_handler.pop_back(); + } + } 
+ + size_t size() { + return ops.size(); + } + + void set_last_op_flags(int flags) { + ceph_assert(!ops.empty()); + ops.rbegin()->op.flags = flags; + } + + class C_TwoContexts; + /** + * Add a callback to run when this operation completes, + * after any other callbacks for it. + */ + void add_handler(Context *extra); + + OSDOp& add_op(int op) { + int s = ops.size(); + ops.resize(s+1); + ops[s].op.op = op; + out_bl.resize(s+1); + out_bl[s] = NULL; + out_handler.resize(s+1); + out_handler[s] = NULL; + out_rval.resize(s+1); + out_rval[s] = NULL; + return ops[s]; + } + void add_data(int op, uint64_t off, uint64_t len, bufferlist& bl) { + OSDOp& osd_op = add_op(op); + osd_op.op.extent.offset = off; + osd_op.op.extent.length = len; + osd_op.indata.claim_append(bl); + } + void add_writesame(int op, uint64_t off, uint64_t write_len, + bufferlist& bl) { + OSDOp& osd_op = add_op(op); + osd_op.op.writesame.offset = off; + osd_op.op.writesame.length = write_len; + osd_op.op.writesame.data_length = bl.length(); + osd_op.indata.claim_append(bl); + } + void add_xattr(int op, const char *name, const bufferlist& data) { + OSDOp& osd_op = add_op(op); + osd_op.op.xattr.name_len = (name ? strlen(name) : 0); + osd_op.op.xattr.value_len = data.length(); + if (name) + osd_op.indata.append(name, osd_op.op.xattr.name_len); + osd_op.indata.append(data); + } + void add_xattr_cmp(int op, const char *name, uint8_t cmp_op, + uint8_t cmp_mode, const bufferlist& data) { + OSDOp& osd_op = add_op(op); + osd_op.op.xattr.name_len = (name ? 
strlen(name) : 0); + osd_op.op.xattr.value_len = data.length(); + osd_op.op.xattr.cmp_op = cmp_op; + osd_op.op.xattr.cmp_mode = cmp_mode; + if (name) + osd_op.indata.append(name, osd_op.op.xattr.name_len); + osd_op.indata.append(data); + } + void add_call(int op, const char *cname, const char *method, + bufferlist &indata, + bufferlist *outbl, Context *ctx, int *prval) { + OSDOp& osd_op = add_op(op); + + unsigned p = ops.size() - 1; + out_handler[p] = ctx; + out_bl[p] = outbl; + out_rval[p] = prval; + + osd_op.op.cls.class_len = strlen(cname); + osd_op.op.cls.method_len = strlen(method); + osd_op.op.cls.indata_len = indata.length(); + osd_op.indata.append(cname, osd_op.op.cls.class_len); + osd_op.indata.append(method, osd_op.op.cls.method_len); + osd_op.indata.append(indata); + } + void add_pgls(int op, uint64_t count, collection_list_handle_t cookie, + epoch_t start_epoch) { + OSDOp& osd_op = add_op(op); + osd_op.op.pgls.count = count; + osd_op.op.pgls.start_epoch = start_epoch; + encode(cookie, osd_op.indata); + } + void add_pgls_filter(int op, uint64_t count, const bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + OSDOp& osd_op = add_op(op); + osd_op.op.pgls.count = count; + osd_op.op.pgls.start_epoch = start_epoch; + string cname = "pg"; + string mname = "filter"; + encode(cname, osd_op.indata); + encode(mname, osd_op.indata); + osd_op.indata.append(filter); + encode(cookie, osd_op.indata); + } + void add_alloc_hint(int op, uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) { + OSDOp& osd_op = add_op(op); + osd_op.op.alloc_hint.expected_object_size = expected_object_size; + osd_op.op.alloc_hint.expected_write_size = expected_write_size; + osd_op.op.alloc_hint.flags = flags; + } + + // ------ + + // pg + void pg_ls(uint64_t count, bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + if (filter.length() == 0) + add_pgls(CEPH_OSD_OP_PGLS, count, cookie, start_epoch); + else + 
add_pgls_filter(CEPH_OSD_OP_PGLS_FILTER, count, filter, cookie, + start_epoch); + flags |= CEPH_OSD_FLAG_PGOP; + } + + void pg_nls(uint64_t count, const bufferlist& filter, + collection_list_handle_t cookie, epoch_t start_epoch) { + if (filter.length() == 0) + add_pgls(CEPH_OSD_OP_PGNLS, count, cookie, start_epoch); + else + add_pgls_filter(CEPH_OSD_OP_PGNLS_FILTER, count, filter, cookie, + start_epoch); + flags |= CEPH_OSD_FLAG_PGOP; + } + + void scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_obj_t> *objects, + uint32_t *interval, + int *rval); + void scrub_ls(const librados::object_id_t& start_after, + uint64_t max_to_get, + std::vector<librados::inconsistent_snapset_t> *objects, + uint32_t *interval, + int *rval); + + void create(bool excl) { + OSDOp& o = add_op(CEPH_OSD_OP_CREATE); + o.op.flags = (excl ? CEPH_OSD_OP_FLAG_EXCL : 0); + } + + struct C_ObjectOperation_stat : public Context { + bufferlist bl; + uint64_t *psize; + ceph::real_time *pmtime; + time_t *ptime; + struct timespec *pts; + int *prval; + C_ObjectOperation_stat(uint64_t *ps, ceph::real_time *pm, time_t *pt, struct timespec *_pts, + int *prval) + : psize(ps), pmtime(pm), ptime(pt), pts(_pts), prval(prval) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + uint64_t size; + ceph::real_time mtime; + decode(size, p); + decode(mtime, p); + if (psize) + *psize = size; + if (pmtime) + *pmtime = mtime; + if (ptime) + *ptime = ceph::real_clock::to_time_t(mtime); + if (pts) + *pts = ceph::real_clock::to_timespec(mtime); + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + void stat(uint64_t *psize, ceph::real_time *pmtime, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, pmtime, NULL, NULL, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void stat(uint64_t 
*psize, time_t *ptime, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, NULL, ptime, NULL, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void stat(uint64_t *psize, struct timespec *pts, int *prval) { + add_op(CEPH_OSD_OP_STAT); + unsigned p = ops.size() - 1; + C_ObjectOperation_stat *h = new C_ObjectOperation_stat(psize, NULL, NULL, pts, + prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + // object cmpext + struct C_ObjectOperation_cmpext : public Context { + int *prval; + explicit C_ObjectOperation_cmpext(int *prval) + : prval(prval) {} + + void finish(int r) { + if (prval) + *prval = r; + } + }; + + void cmpext(uint64_t off, bufferlist& cmp_bl, int *prval) { + add_data(CEPH_OSD_OP_CMPEXT, off, cmp_bl.length(), cmp_bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval); + out_handler[p] = h; + out_rval[p] = prval; + } + + // Used by C API + void cmpext(uint64_t off, uint64_t cmp_len, const char *cmp_buf, int *prval) { + bufferlist cmp_bl; + cmp_bl.append(cmp_buf, cmp_len); + add_data(CEPH_OSD_OP_CMPEXT, off, cmp_len, cmp_bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_cmpext *h = new C_ObjectOperation_cmpext(prval); + out_handler[p] = h; + out_rval[p] = prval; + } + + void read(uint64_t off, uint64_t len, bufferlist *pbl, int *prval, + Context* ctx) { + bufferlist bl; + add_data(CEPH_OSD_OP_READ, off, len, bl); + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + out_handler[p] = ctx; + } + + struct C_ObjectOperation_sparse_read : public Context { + bufferlist bl; + bufferlist *data_bl; + std::map<uint64_t, uint64_t> *extents; + int *prval; + C_ObjectOperation_sparse_read(bufferlist *data_bl, + std::map<uint64_t, uint64_t> *extents, + int *prval) + : data_bl(data_bl), extents(extents), prval(prval) {} + void finish(int r) override { 
+ auto iter = bl.cbegin(); + if (r >= 0) { + // NOTE: it's possible the sub-op has not been executed but the result + // code remains zeroed. Avoid the costly exception handling on a + // potential IO path. + if (bl.length() > 0) { + try { + decode(*extents, iter); + decode(*data_bl, iter); + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } else if (prval) { + *prval = -EIO; + } + } + } + }; + void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m, + bufferlist *data_bl, int *prval) { + bufferlist bl; + add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl); + unsigned p = ops.size() - 1; + C_ObjectOperation_sparse_read *h = + new C_ObjectOperation_sparse_read(data_bl, m, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + out_rval[p] = prval; + } + void write(uint64_t off, bufferlist& bl, + uint64_t truncate_size, + uint32_t truncate_seq) { + add_data(CEPH_OSD_OP_WRITE, off, bl.length(), bl); + OSDOp& o = *ops.rbegin(); + o.op.extent.truncate_size = truncate_size; + o.op.extent.truncate_seq = truncate_seq; + } + void write(uint64_t off, bufferlist& bl) { + write(off, bl, 0, 0); + } + void write_full(bufferlist& bl) { + add_data(CEPH_OSD_OP_WRITEFULL, 0, bl.length(), bl); + } + void writesame(uint64_t off, uint64_t write_len, bufferlist& bl) { + add_writesame(CEPH_OSD_OP_WRITESAME, off, write_len, bl); + } + void append(bufferlist& bl) { + add_data(CEPH_OSD_OP_APPEND, 0, bl.length(), bl); + } + void zero(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_ZERO, off, len, bl); + } + void truncate(uint64_t off) { + bufferlist bl; + add_data(CEPH_OSD_OP_TRUNCATE, off, 0, bl); + } + void remove() { + bufferlist bl; + add_data(CEPH_OSD_OP_DELETE, 0, 0, bl); + } + void mapext(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_MAPEXT, off, len, bl); + } + void sparse_read(uint64_t off, uint64_t len) { + bufferlist bl; + add_data(CEPH_OSD_OP_SPARSE_READ, off, len, bl); + } + + void checksum(uint8_t 
type, const bufferlist &init_value_bl, + uint64_t off, uint64_t len, size_t chunk_size, + bufferlist *pbl, int *prval, Context *ctx) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_CHECKSUM); + osd_op.op.checksum.offset = off; + osd_op.op.checksum.length = len; + osd_op.op.checksum.type = type; + osd_op.op.checksum.chunk_size = chunk_size; + osd_op.indata.append(init_value_bl); + + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + out_handler[p] = ctx; + } + + // object attrs + void getxattr(const char *name, bufferlist *pbl, int *prval) { + bufferlist bl; + add_xattr(CEPH_OSD_OP_GETXATTR, name, bl); + unsigned p = ops.size() - 1; + out_bl[p] = pbl; + out_rval[p] = prval; + } + struct C_ObjectOperation_decodevals : public Context { + uint64_t max_entries; + bufferlist bl; + std::map<std::string,bufferlist> *pattrs; + bool *ptruncated; + int *prval; + C_ObjectOperation_decodevals(uint64_t m, std::map<std::string,bufferlist> *pa, + bool *pt, int *pr) + : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr) { + if (ptruncated) { + *ptruncated = false; + } + } + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + if (pattrs) + decode(*pattrs, p); + if (ptruncated) { + std::map<std::string,bufferlist> ignore; + if (!pattrs) { + decode(ignore, p); + pattrs = &ignore; + } + if (!p.end()) { + decode(*ptruncated, p); + } else { + // the OSD did not provide this. 
since old OSDs do not + // enfoce omap result limits either, we can infer it from + // the size of the result + *ptruncated = (pattrs->size() == max_entries); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct C_ObjectOperation_decodekeys : public Context { + uint64_t max_entries; + bufferlist bl; + std::set<std::string> *pattrs; + bool *ptruncated; + int *prval; + C_ObjectOperation_decodekeys(uint64_t m, std::set<std::string> *pa, bool *pt, + int *pr) + : max_entries(m), pattrs(pa), ptruncated(pt), prval(pr) { + if (ptruncated) { + *ptruncated = false; + } + } + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + if (pattrs) + decode(*pattrs, p); + if (ptruncated) { + std::set<std::string> ignore; + if (!pattrs) { + decode(ignore, p); + pattrs = &ignore; + } + if (!p.end()) { + decode(*ptruncated, p); + } else { + // the OSD did not provide this. since old OSDs do not + // enforce omap result limits either, we can infer it from + // the size of the result + *ptruncated = (pattrs->size() == max_entries); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct C_ObjectOperation_decodewatchers : public Context { + bufferlist bl; + list<obj_watch_t> *pwatchers; + int *prval; + C_ObjectOperation_decodewatchers(list<obj_watch_t> *pw, int *pr) + : pwatchers(pw), prval(pr) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + obj_list_watch_response_t resp; + decode(resp, p); + if (pwatchers) { + for (list<watch_item_t>::iterator i = resp.entries.begin() ; + i != resp.entries.end() ; ++i) { + obj_watch_t ow; + string sa = i->addr.get_legacy_str(); + strncpy(ow.addr, sa.c_str(), 256); + ow.watcher_id = i->name.num(); + ow.cookie = i->cookie; + ow.timeout_seconds = i->timeout_seconds; + pwatchers->push_back(ow); + } + } + } + catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + struct 
C_ObjectOperation_decodesnaps : public Context { + bufferlist bl; + librados::snap_set_t *psnaps; + int *prval; + C_ObjectOperation_decodesnaps(librados::snap_set_t *ps, int *pr) + : psnaps(ps), prval(pr) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + try { + obj_list_snap_response_t resp; + decode(resp, p); + if (psnaps) { + psnaps->clones.clear(); + for (vector<clone_info>::iterator ci = resp.clones.begin(); + ci != resp.clones.end(); + ++ci) { + librados::clone_info_t clone; + + clone.cloneid = ci->cloneid; + clone.snaps.reserve(ci->snaps.size()); + clone.snaps.insert(clone.snaps.end(), ci->snaps.begin(), + ci->snaps.end()); + clone.overlap = ci->overlap; + clone.size = ci->size; + + psnaps->clones.push_back(clone); + } + psnaps->seq = resp.seq; + } + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + } + }; + void getxattrs(std::map<std::string,bufferlist> *pattrs, int *prval) { + add_op(CEPH_OSD_OP_GETXATTRS); + if (pattrs || prval) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h + = new C_ObjectOperation_decodevals(0, pattrs, nullptr, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + void setxattr(const char *name, const bufferlist& bl) { + add_xattr(CEPH_OSD_OP_SETXATTR, name, bl); + } + void setxattr(const char *name, const string& s) { + bufferlist bl; + bl.append(s); + add_xattr(CEPH_OSD_OP_SETXATTR, name, bl); + } + void cmpxattr(const char *name, uint8_t cmp_op, uint8_t cmp_mode, + const bufferlist& bl) { + add_xattr_cmp(CEPH_OSD_OP_CMPXATTR, name, cmp_op, cmp_mode, bl); + } + void rmxattr(const char *name) { + bufferlist bl; + add_xattr(CEPH_OSD_OP_RMXATTR, name, bl); + } + void setxattrs(map<string, bufferlist>& attrs) { + bufferlist bl; + encode(attrs, bl); + add_xattr(CEPH_OSD_OP_RESETXATTRS, 0, bl.length()); + } + void resetxattrs(const char *prefix, map<string, bufferlist>& attrs) { + bufferlist bl; + encode(attrs, bl); + 
add_xattr(CEPH_OSD_OP_RESETXATTRS, prefix, bl); + } + + // trivialmap + void tmap_update(bufferlist& bl) { + add_data(CEPH_OSD_OP_TMAPUP, 0, 0, bl); + } + + // objectmap + void omap_get_keys(const string &start_after, + uint64_t max_to_get, + std::set<std::string> *out_set, + bool *ptruncated, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETKEYS); + bufferlist bl; + encode(start_after, bl); + encode(max_to_get, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || ptruncated || out_set) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodekeys *h = + new C_ObjectOperation_decodekeys(max_to_get, out_set, ptruncated, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void omap_get_vals(const string &start_after, + const string &filter_prefix, + uint64_t max_to_get, + std::map<std::string, bufferlist> *out_set, + bool *ptruncated, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALS); + bufferlist bl; + encode(start_after, bl); + encode(max_to_get, bl); + encode(filter_prefix, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || out_set || ptruncated) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h = + new C_ObjectOperation_decodevals(max_to_get, out_set, ptruncated, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void omap_get_vals_by_keys(const std::set<std::string> &to_get, + std::map<std::string, bufferlist> *out_set, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAPGETVALSBYKEYS); + bufferlist bl; + encode(to_get, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval || out_set) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodevals *h = + new C_ObjectOperation_decodevals(0, out_set, nullptr, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = 
prval; + } + } + + void omap_cmp(const std::map<std::string, pair<bufferlist,int> > &assertions, + int *prval) { + OSDOp &op = add_op(CEPH_OSD_OP_OMAP_CMP); + bufferlist bl; + encode(assertions, bl); + op.op.extent.offset = 0; + op.op.extent.length = bl.length(); + op.indata.claim_append(bl); + if (prval) { + unsigned p = ops.size() - 1; + out_rval[p] = prval; + } + } + + struct C_ObjectOperation_copyget : public Context { + bufferlist bl; + object_copy_cursor_t *cursor; + uint64_t *out_size; + ceph::real_time *out_mtime; + std::map<std::string,bufferlist> *out_attrs; + bufferlist *out_data, *out_omap_header, *out_omap_data; + vector<snapid_t> *out_snaps; + snapid_t *out_snap_seq; + uint32_t *out_flags; + uint32_t *out_data_digest; + uint32_t *out_omap_digest; + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *out_reqids; + mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes; + uint64_t *out_truncate_seq; + uint64_t *out_truncate_size; + int *prval; + C_ObjectOperation_copyget(object_copy_cursor_t *c, + uint64_t *s, + ceph::real_time *m, + std::map<std::string,bufferlist> *a, + bufferlist *d, bufferlist *oh, + bufferlist *o, + std::vector<snapid_t> *osnaps, + snapid_t *osnap_seq, + uint32_t *flags, + uint32_t *dd, + uint32_t *od, + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *oreqids, + mempool::osd_pglog::map<uint32_t, int> *oreqid_return_codes, + uint64_t *otseq, + uint64_t *otsize, + int *r) + : cursor(c), + out_size(s), out_mtime(m), + out_attrs(a), out_data(d), out_omap_header(oh), + out_omap_data(o), out_snaps(osnaps), out_snap_seq(osnap_seq), + out_flags(flags), out_data_digest(dd), out_omap_digest(od), + out_reqids(oreqids), + out_reqid_return_codes(oreqid_return_codes), + out_truncate_seq(otseq), + out_truncate_size(otsize), + prval(r) {} + void finish(int r) override { + // reqids are copied on ENOENT + if (r < 0 && r != -ENOENT) + return; + try { + auto p = bl.cbegin(); + object_copy_data_t copy_reply; + 
decode(copy_reply, p); + if (r == -ENOENT) { + if (out_reqids) + *out_reqids = copy_reply.reqids; + return; + } + if (out_size) + *out_size = copy_reply.size; + if (out_mtime) + *out_mtime = ceph::real_clock::from_ceph_timespec(copy_reply.mtime); + if (out_attrs) + *out_attrs = copy_reply.attrs; + if (out_data) + out_data->claim_append(copy_reply.data); + if (out_omap_header) + out_omap_header->claim_append(copy_reply.omap_header); + if (out_omap_data) + *out_omap_data = copy_reply.omap_data; + if (out_snaps) + *out_snaps = copy_reply.snaps; + if (out_snap_seq) + *out_snap_seq = copy_reply.snap_seq; + if (out_flags) + *out_flags = copy_reply.flags; + if (out_data_digest) + *out_data_digest = copy_reply.data_digest; + if (out_omap_digest) + *out_omap_digest = copy_reply.omap_digest; + if (out_reqids) + *out_reqids = copy_reply.reqids; + if (out_reqid_return_codes) + *out_reqid_return_codes = copy_reply.reqid_return_codes; + if (out_truncate_seq) + *out_truncate_seq = copy_reply.truncate_seq; + if (out_truncate_size) + *out_truncate_size = copy_reply.truncate_size; + *cursor = copy_reply.cursor; + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + }; + + void copy_get(object_copy_cursor_t *cursor, + uint64_t max, + uint64_t *out_size, + ceph::real_time *out_mtime, + std::map<std::string,bufferlist> *out_attrs, + bufferlist *out_data, + bufferlist *out_omap_header, + bufferlist *out_omap_data, + vector<snapid_t> *out_snaps, + snapid_t *out_snap_seq, + uint32_t *out_flags, + uint32_t *out_data_digest, + uint32_t *out_omap_digest, + mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *out_reqids, + mempool::osd_pglog::map<uint32_t, int> *out_reqid_return_codes, + uint64_t *truncate_seq, + uint64_t *truncate_size, + int *prval) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_GET); + osd_op.op.copy_get.max = max; + encode(*cursor, osd_op.indata); + encode(max, osd_op.indata); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + 
C_ObjectOperation_copyget *h = + new C_ObjectOperation_copyget(cursor, out_size, out_mtime, + out_attrs, out_data, out_omap_header, + out_omap_data, out_snaps, out_snap_seq, + out_flags, out_data_digest, + out_omap_digest, out_reqids, + out_reqid_return_codes, truncate_seq, + truncate_size, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + void undirty() { + add_op(CEPH_OSD_OP_UNDIRTY); + } + + struct C_ObjectOperation_isdirty : public Context { + bufferlist bl; + bool *pisdirty; + int *prval; + C_ObjectOperation_isdirty(bool *p, int *r) + : pisdirty(p), prval(r) {} + void finish(int r) override { + if (r < 0) + return; + try { + auto p = bl.cbegin(); + bool isdirty; + decode(isdirty, p); + if (pisdirty) + *pisdirty = isdirty; + } catch (buffer::error& e) { + if (prval) + *prval = -EIO; + } + } + }; + + void is_dirty(bool *pisdirty, int *prval) { + add_op(CEPH_OSD_OP_ISDIRTY); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_isdirty *h = + new C_ObjectOperation_isdirty(pisdirty, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + struct C_ObjectOperation_hit_set_ls : public Context { + bufferlist bl; + std::list< std::pair<time_t, time_t> > *ptls; + std::list< std::pair<ceph::real_time, ceph::real_time> > *putls; + int *prval; + C_ObjectOperation_hit_set_ls(std::list< std::pair<time_t, time_t> > *t, + std::list< std::pair<ceph::real_time, + ceph::real_time> > *ut, + int *r) + : ptls(t), putls(ut), prval(r) {} + void finish(int r) override { + if (r < 0) + return; + try { + auto p = bl.cbegin(); + std::list< std::pair<ceph::real_time, ceph::real_time> > ls; + decode(ls, p); + if (ptls) { + ptls->clear(); + for (auto p = ls.begin(); p != ls.end(); ++p) + // round initial timestamp up to the next full second to + // keep this a valid interval. + ptls->push_back( + make_pair(ceph::real_clock::to_time_t( + ceph::ceil(p->first, + // Sadly, no time literals until C++14. 
+ std::chrono::seconds(1))), + ceph::real_clock::to_time_t(p->second))); + } + if (putls) + putls->swap(ls); + } catch (buffer::error& e) { + r = -EIO; + } + if (prval) + *prval = r; + } + }; + + /** + * list available HitSets. + * + * We will get back a list of time intervals. Note that the most + * recent range may have an empty end timestamp if it is still + * accumulating. + * + * @param pls [out] list of time intervals + * @param prval [out] return value + */ + void hit_set_ls(std::list< std::pair<time_t, time_t> > *pls, int *prval) { + add_op(CEPH_OSD_OP_PG_HITSET_LS); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_hit_set_ls *h = + new C_ObjectOperation_hit_set_ls(pls, NULL, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + void hit_set_ls(std::list<std::pair<ceph::real_time, ceph::real_time> > *pls, + int *prval) { + add_op(CEPH_OSD_OP_PG_HITSET_LS); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + C_ObjectOperation_hit_set_ls *h = + new C_ObjectOperation_hit_set_ls(NULL, pls, prval); + out_bl[p] = &h->bl; + out_handler[p] = h; + } + + /** + * get HitSet + * + * Return an encoded HitSet that includes the provided time + * interval. 
+ * + * @param stamp [in] timestamp + * @param pbl [out] target buffer for encoded HitSet + * @param prval [out] return value + */ + void hit_set_get(ceph::real_time stamp, bufferlist *pbl, int *prval) { + OSDOp& op = add_op(CEPH_OSD_OP_PG_HITSET_GET); + op.op.hit_set_get.stamp = ceph::real_clock::to_ceph_timespec(stamp); + unsigned p = ops.size() - 1; + out_rval[p] = prval; + out_bl[p] = pbl; + } + + void omap_get_header(bufferlist *bl, int *prval) { + add_op(CEPH_OSD_OP_OMAPGETHEADER); + unsigned p = ops.size() - 1; + out_bl[p] = bl; + out_rval[p] = prval; + } + + void omap_set(const map<string, bufferlist> &map) { + bufferlist bl; + encode(map, bl); + add_data(CEPH_OSD_OP_OMAPSETVALS, 0, bl.length(), bl); + } + + void omap_set_header(bufferlist &bl) { + add_data(CEPH_OSD_OP_OMAPSETHEADER, 0, bl.length(), bl); + } + + void omap_clear() { + add_op(CEPH_OSD_OP_OMAPCLEAR); + } + + void omap_rm_keys(const std::set<std::string> &to_remove) { + bufferlist bl; + encode(to_remove, bl); + add_data(CEPH_OSD_OP_OMAPRMKEYS, 0, bl.length(), bl); + } + + // object classes + void call(const char *cname, const char *method, bufferlist &indata) { + add_call(CEPH_OSD_OP_CALL, cname, method, indata, NULL, NULL, NULL); + } + + void call(const char *cname, const char *method, bufferlist &indata, + bufferlist *outdata, Context *ctx, int *prval) { + add_call(CEPH_OSD_OP_CALL, cname, method, indata, outdata, ctx, prval); + } + + // watch/notify + void watch(uint64_t cookie, __u8 op, uint32_t timeout = 0) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_WATCH); + osd_op.op.watch.cookie = cookie; + osd_op.op.watch.op = op; + osd_op.op.watch.timeout = timeout; + } + + void notify(uint64_t cookie, uint32_t prot_ver, uint32_t timeout, + bufferlist &bl, bufferlist *inbl) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY); + osd_op.op.notify.cookie = cookie; + encode(prot_ver, *inbl); + encode(timeout, *inbl); + encode(bl, *inbl); + osd_op.indata.append(*inbl); + } + + void notify_ack(uint64_t notify_id, 
uint64_t cookie, + bufferlist& reply_bl) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY_ACK); + bufferlist bl; + encode(notify_id, bl); + encode(cookie, bl); + encode(reply_bl, bl); + osd_op.indata.append(bl); + } + + void list_watchers(list<obj_watch_t> *out, + int *prval) { + (void)add_op(CEPH_OSD_OP_LIST_WATCHERS); + if (prval || out) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodewatchers *h = + new C_ObjectOperation_decodewatchers(out, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void list_snaps(librados::snap_set_t *out, int *prval) { + (void)add_op(CEPH_OSD_OP_LIST_SNAPS); + if (prval || out) { + unsigned p = ops.size() - 1; + C_ObjectOperation_decodesnaps *h = + new C_ObjectOperation_decodesnaps(out, prval); + out_handler[p] = h; + out_bl[p] = &h->bl; + out_rval[p] = prval; + } + } + + void assert_version(uint64_t ver) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_ASSERT_VER); + osd_op.op.assert_ver.ver = ver; + } + + void cmpxattr(const char *name, const bufferlist& val, + int op, int mode) { + add_xattr(CEPH_OSD_OP_CMPXATTR, name, val); + OSDOp& o = *ops.rbegin(); + o.op.xattr.cmp_op = op; + o.op.xattr.cmp_mode = mode; + } + + void rollback(uint64_t snapid) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_ROLLBACK); + osd_op.op.snap.snapid = snapid; + } + + void copy_from(object_t src, snapid_t snapid, object_locator_t src_oloc, + version_t src_version, unsigned flags, + unsigned src_fadvise_flags) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_COPY_FROM); + osd_op.op.copy_from.snapid = snapid; + osd_op.op.copy_from.src_version = src_version; + osd_op.op.copy_from.flags = flags; + osd_op.op.copy_from.src_fadvise_flags = src_fadvise_flags; + encode(src, osd_op.indata); + encode(src_oloc, osd_op.indata); + } + + /** + * writeback content to backing tier + * + * If object is marked dirty in the cache tier, write back content + * to backing tier. If the object is clean this is a no-op. 
+ * + * If writeback races with an update, the update will block. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_flush() { + add_op(CEPH_OSD_OP_CACHE_FLUSH); + } + + /** + * writeback content to backing tier + * + * If object is marked dirty in the cache tier, write back content + * to backing tier. If the object is clean this is a no-op. + * + * If writeback races with an update, return EAGAIN. Requires that + * the SKIPRWLOCKS flag be set. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_try_flush() { + add_op(CEPH_OSD_OP_CACHE_TRY_FLUSH); + } + + /** + * evict object from cache tier + * + * If object is marked clean, remove the object from the cache tier. + * Otherwise, return EBUSY. + * + * use with IGNORE_CACHE to avoid triggering promote. + */ + void cache_evict() { + add_op(CEPH_OSD_OP_CACHE_EVICT); + } + + /* + * Extensible tier + */ + void set_redirect(object_t tgt, snapid_t snapid, object_locator_t tgt_oloc, + version_t tgt_version, int flag) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_REDIRECT); + osd_op.op.copy_from.snapid = snapid; + osd_op.op.copy_from.src_version = tgt_version; + encode(tgt, osd_op.indata); + encode(tgt_oloc, osd_op.indata); + set_last_op_flags(flag); + } + + void set_chunk(uint64_t src_offset, uint64_t src_length, object_locator_t tgt_oloc, + object_t tgt_oid, uint64_t tgt_offset, int flag) { + OSDOp& osd_op = add_op(CEPH_OSD_OP_SET_CHUNK); + encode(src_offset, osd_op.indata); + encode(src_length, osd_op.indata); + encode(tgt_oloc, osd_op.indata); + encode(tgt_oid, osd_op.indata); + encode(tgt_offset, osd_op.indata); + set_last_op_flags(flag); + } + + void tier_promote() { + add_op(CEPH_OSD_OP_TIER_PROMOTE); + } + + void unset_manifest() { + add_op(CEPH_OSD_OP_UNSET_MANIFEST); + } + + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags) { + add_alloc_hint(CEPH_OSD_OP_SETALLOCHINT, expected_object_size, + expected_write_size, 
flags); + + // CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + // not worth a feature bit. Set FAILOK per-op flag to make + // sure older osds don't trip over an unsupported opcode. + set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + void dup(vector<OSDOp>& sops) { + ops = sops; + out_bl.resize(sops.size()); + out_handler.resize(sops.size()); + out_rval.resize(sops.size()); + for (uint32_t i = 0; i < sops.size(); i++) { + out_bl[i] = &sops[i].outdata; + out_handler[i] = NULL; + out_rval[i] = &sops[i].rval; + } + } + + /** + * Pin/unpin an object in cache tier + */ + void cache_pin() { + add_op(CEPH_OSD_OP_CACHE_PIN); + } + + void cache_unpin() { + add_op(CEPH_OSD_OP_CACHE_UNPIN); + } +}; + + +// ---------------- + + +class Objecter : public md_config_obs_t, public Dispatcher { +public: + // config observer bits + const char** get_tracked_conf_keys() const override; + void handle_conf_change(const ConfigProxy& conf, + const std::set <std::string> &changed) override; + +public: + Messenger *messenger; + MonClient *monc; + Finisher *finisher; + ZTracer::Endpoint trace_endpoint; +private: + OSDMap *osdmap; +public: + using Dispatcher::cct; + std::multimap<string,string> crush_location; + + std::atomic<bool> initialized{false}; + +private: + std::atomic<uint64_t> last_tid{0}; + std::atomic<unsigned> inflight_ops{0}; + std::atomic<int> client_inc{-1}; + uint64_t max_linger_id; + std::atomic<unsigned> num_in_flight{0}; + std::atomic<int> global_op_flags{0}; // flags which are applied to each IO op + bool keep_balanced_budget; + bool honor_osdmap_full; + bool osdmap_full_try; + + // If this is true, accumulate a set of blacklisted entities + // to be drained by consume_blacklist_events. 
+ bool blacklist_events_enabled; + std::set<entity_addr_t> blacklist_events; + +public: + void maybe_request_map(); + + void enable_blacklist_events(); +private: + + void _maybe_request_map(); + + version_t last_seen_osdmap_version; + version_t last_seen_pgmap_version; + + mutable std::shared_mutex rwlock; + using lock_guard = std::lock_guard<decltype(rwlock)>; + using unique_lock = std::unique_lock<decltype(rwlock)>; + using shared_lock = boost::shared_lock<decltype(rwlock)>; + using shunique_lock = ceph::shunique_lock<decltype(rwlock)>; + ceph::timer<ceph::coarse_mono_clock> timer; + + PerfCounters *logger; + + uint64_t tick_event; + + void start_tick(); + void tick(); + void update_crush_location(); + + class RequestStateHook; + + RequestStateHook *m_request_state_hook; + +public: + /*** track pending operations ***/ + // read + + struct OSDSession; + + struct op_target_t { + int flags = 0; + + epoch_t epoch = 0; ///< latest epoch we calculated the mapping + + object_t base_oid; + object_locator_t base_oloc; + object_t target_oid; + object_locator_t target_oloc; + + ///< true if we are directed at base_pgid, not base_oid + bool precalc_pgid = false; + + ///< true if we have ever mapped to a valid pool + bool pool_ever_existed = false; + + ///< explcit pg target, if any + pg_t base_pgid; + + pg_t pgid; ///< last (raw) pg we mapped to + spg_t actual_pgid; ///< last (actual) spg_t we mapped to + unsigned pg_num = 0; ///< last pg_num we mapped to + unsigned pg_num_mask = 0; ///< last pg_num_mask we mapped to + unsigned pg_num_pending = 0; ///< last pg_num we mapped to + vector<int> up; ///< set of up osds for last pg we mapped to + vector<int> acting; ///< set of acting osds for last pg we mapped to + int up_primary = -1; ///< last up_primary we mapped to + int acting_primary = -1; ///< last acting_primary we mapped to + int size = -1; ///< the size of the pool when were were last mapped + int min_size = -1; ///< the min size of the pool when were were last mapped + 
bool sort_bitwise = false; ///< whether the hobject_t sort order is bitwise + bool recovery_deletes = false; ///< whether the deletes are performed during recovery instead of peering + + bool used_replica = false; + bool paused = false; + + int osd = -1; ///< the final target osd, or -1 + + epoch_t last_force_resend = 0; + + op_target_t(object_t oid, object_locator_t oloc, int flags) + : flags(flags), + base_oid(oid), + base_oloc(oloc) + {} + + explicit op_target_t(pg_t pgid) + : base_oloc(pgid.pool(), pgid.ps()), + precalc_pgid(true), + base_pgid(pgid) + {} + + op_target_t() = default; + + hobject_t get_hobj() { + return hobject_t(target_oid, + target_oloc.key, + CEPH_NOSNAP, + target_oloc.hash >= 0 ? target_oloc.hash : pgid.ps(), + target_oloc.pool, + target_oloc.nspace); + } + + bool contained_by(const hobject_t& begin, const hobject_t& end) { + hobject_t h = get_hobj(); + int r = cmp(h, begin); + return r == 0 || (r > 0 && h < end); + } + + void dump(Formatter *f) const; + }; + + struct Op : public RefCountedObject { + OSDSession *session; + int incarnation; + + op_target_t target; + + ConnectionRef con; // for rx buffer only + uint64_t features; // explicitly specified op features + + vector<OSDOp> ops; + + snapid_t snapid; + SnapContext snapc; + ceph::real_time mtime; + + bufferlist *outbl; + vector<bufferlist*> out_bl; + vector<Context*> out_handler; + vector<int*> out_rval; + + int priority; + Context *onfinish; + uint64_t ontimeout; + + ceph_tid_t tid; + int attempts; + + version_t *objver; + epoch_t *reply_epoch; + + ceph::coarse_mono_time stamp; + + epoch_t map_dne_bound; + + int budget; + + /// true if we should resend this message on failure + bool should_resend; + + /// true if the throttle budget is get/put on a series of OPs, + /// instead of per OP basis, when this flag is set, the budget is + /// acquired before sending the very first OP of the series and + /// released upon receiving the last OP reply. 
+ bool ctx_budgeted; + + int *data_offset; + + osd_reqid_t reqid; // explicitly setting reqid + ZTracer::Trace trace; + + Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op, + int f, Context *fin, version_t *ov, int *offset = NULL, + ZTracer::Trace *parent_trace = nullptr) : + session(NULL), incarnation(0), + target(o, ol, f), + con(NULL), + features(CEPH_FEATURES_SUPPORTED_DEFAULT), + snapid(CEPH_NOSNAP), + outbl(NULL), + priority(0), + onfinish(fin), + ontimeout(0), + tid(0), + attempts(0), + objver(ov), + reply_epoch(NULL), + map_dne_bound(0), + budget(-1), + should_resend(true), + ctx_budgeted(false), + data_offset(offset) { + ops.swap(op); + + /* initialize out_* to match op vector */ + out_bl.resize(ops.size()); + out_rval.resize(ops.size()); + out_handler.resize(ops.size()); + for (unsigned i = 0; i < ops.size(); i++) { + out_bl[i] = NULL; + out_handler[i] = NULL; + out_rval[i] = NULL; + } + + if (target.base_oloc.key == o) + target.base_oloc.key.clear(); + + if (parent_trace && parent_trace->valid()) { + trace.init("op", nullptr, parent_trace); + trace.event("start"); + } + } + + bool operator<(const Op& other) const { + return tid < other.tid; + } + + bool respects_full() const { + return + (target.flags & (CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_RWORDERED)) && + !(target.flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)); + } + + private: + ~Op() override { + while (!out_handler.empty()) { + delete out_handler.back(); + out_handler.pop_back(); + } + trace.event("finish"); + } + }; + + struct C_Op_Map_Latest : public Context { + Objecter *objecter; + ceph_tid_t tid; + version_t latest; + C_Op_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t), + latest(0) {} + void finish(int r) override; + }; + + struct C_Command_Map_Latest : public Context { + Objecter *objecter; + uint64_t tid; + version_t latest; + C_Command_Map_Latest(Objecter *o, ceph_tid_t t) : objecter(o), tid(t), + latest(0) {} + void finish(int r) override; + }; + 
+ struct C_Stat : public Context { + bufferlist bl; + uint64_t *psize; + ceph::real_time *pmtime; + Context *fin; + C_Stat(uint64_t *ps, ceph::real_time *pm, Context *c) : + psize(ps), pmtime(pm), fin(c) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + uint64_t s; + ceph::real_time m; + decode(s, p); + decode(m, p); + if (psize) + *psize = s; + if (pmtime) + *pmtime = m; + } + fin->complete(r); + } + }; + + struct C_GetAttrs : public Context { + bufferlist bl; + map<string,bufferlist>& attrset; + Context *fin; + C_GetAttrs(map<string, bufferlist>& set, Context *c) : attrset(set), + fin(c) {} + void finish(int r) override { + if (r >= 0) { + auto p = bl.cbegin(); + decode(attrset, p); + } + fin->complete(r); + } + }; + + + // Pools and statistics + struct NListContext { + collection_list_handle_t pos; + + // these are for !sortbitwise compat only + int current_pg = 0; + int starting_pg_num = 0; + bool sort_bitwise = false; + + bool at_end_of_pool = false; ///< publicly visible end flag + + int64_t pool_id = -1; + int pool_snap_seq = 0; + uint64_t max_entries = 0; + string nspace; + + bufferlist bl; // raw data read to here + std::list<librados::ListObjectImpl> list; + + bufferlist filter; + + bufferlist extra_info; + + // The budget associated with this context, once it is set (>= 0), + // the budget is not get/released on OP basis, instead the budget + // is acquired before sending the first OP and released upon receiving + // the last op reply. 
+ int ctx_budget = -1; + + bool at_end() const { + return at_end_of_pool; + } + + uint32_t get_pg_hash_position() const { + return pos.get_hash(); + } + }; + + struct C_NList : public Context { + NListContext *list_context; + Context *final_finish; + Objecter *objecter; + epoch_t epoch; + C_NList(NListContext *lc, Context * finish, Objecter *ob) : + list_context(lc), final_finish(finish), objecter(ob), epoch(0) {} + void finish(int r) override { + if (r >= 0) { + objecter->_nlist_reply(list_context, r, final_finish, epoch); + } else { + final_finish->complete(r); + } + } + }; + + struct PoolStatOp { + ceph_tid_t tid; + list<string> pools; + + map<string,pool_stat_t> *pool_stats; + bool *per_pool; + Context *onfinish; + uint64_t ontimeout; + + ceph::coarse_mono_time last_submit; + }; + + struct StatfsOp { + ceph_tid_t tid; + struct ceph_statfs *stats; + boost::optional<int64_t> data_pool; + Context *onfinish; + uint64_t ontimeout; + + ceph::coarse_mono_time last_submit; + }; + + struct PoolOp { + ceph_tid_t tid; + int64_t pool; + string name; + Context *onfinish; + uint64_t ontimeout; + int pool_op; + int16_t crush_rule; + snapid_t snapid; + bufferlist *blp; + + ceph::coarse_mono_time last_submit; + PoolOp() : tid(0), pool(0), onfinish(NULL), ontimeout(0), pool_op(0), + crush_rule(0), snapid(0), blp(NULL) {} + }; + + // -- osd commands -- + struct CommandOp : public RefCountedObject { + OSDSession *session = nullptr; + ceph_tid_t tid = 0; + vector<string> cmd; + bufferlist inbl; + bufferlist *poutbl = nullptr; + string *prs = nullptr; + + // target_osd == -1 means target_pg is valid + const int target_osd = -1; + const pg_t target_pg; + + op_target_t target; + + epoch_t map_dne_bound = 0; + int map_check_error = 0; // error to return if map check fails + const char *map_check_error_str = nullptr; + + Context *onfinish = nullptr; + uint64_t ontimeout = 0; + ceph::coarse_mono_time last_submit; + + CommandOp( + int target_osd, + const vector<string> &cmd, + bufferlist 
inbl, + bufferlist *poutbl, + string *prs, + Context *onfinish) + : cmd(cmd), + inbl(inbl), + poutbl(poutbl), + prs(prs), + target_osd(target_osd), + onfinish(onfinish) {} + + CommandOp( + pg_t pgid, + const vector<string> &cmd, + bufferlist inbl, + bufferlist *poutbl, + string *prs, + Context *onfinish) + : cmd(cmd), + inbl(inbl), + poutbl(poutbl), + prs(prs), + target_pg(pgid), + target(pgid), + onfinish(onfinish) {} + + }; + + void submit_command(CommandOp *c, ceph_tid_t *ptid); + int _calc_command_target(CommandOp *c, shunique_lock &sul); + void _assign_command_session(CommandOp *c, shunique_lock &sul); + void _send_command(CommandOp *c); + int command_op_cancel(OSDSession *s, ceph_tid_t tid, int r); + void _finish_command(CommandOp *c, int r, string rs); + void handle_command_reply(MCommandReply *m); + + + // -- lingering ops -- + + struct WatchContext { + // this simply mirrors librados WatchCtx2 + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + virtual void handle_error(uint64_t cookie, int err) = 0; + virtual ~WatchContext() {} + }; + + struct LingerOp : public RefCountedObject { + uint64_t linger_id; + + op_target_t target; + + snapid_t snap; + SnapContext snapc; + ceph::real_time mtime; + + vector<OSDOp> ops; + bufferlist inbl; + bufferlist *poutbl; + version_t *pobjver; + + bool is_watch; + ceph::coarse_mono_time watch_valid_thru; ///< send time for last acked ping + int last_error; ///< error from last failed ping|reconnect, if any + std::shared_mutex watch_lock; + using lock_guard = std::unique_lock<decltype(watch_lock)>; + using unique_lock = std::unique_lock<decltype(watch_lock)>; + using shared_lock = boost::shared_lock<decltype(watch_lock)>; + using shunique_lock = ceph::shunique_lock<decltype(watch_lock)>; + + // queue of pending async operations, with the timestamp of + // when they were queued. 
+ list<ceph::coarse_mono_time> watch_pending_async; + + uint32_t register_gen; + bool registered; + bool canceled; + Context *on_reg_commit; + + // we trigger these from an async finisher + Context *on_notify_finish; + bufferlist *notify_result_bl; + uint64_t notify_id; + + WatchContext *watch_context; + + OSDSession *session; + + Objecter *objecter; + int ctx_budget; + ceph_tid_t register_tid; + ceph_tid_t ping_tid; + epoch_t map_dne_bound; + + void _queued_async() { + // watch_lock ust be locked unique + watch_pending_async.push_back(ceph::coarse_mono_clock::now()); + } + void finished_async() { + unique_lock l(watch_lock); + ceph_assert(!watch_pending_async.empty()); + watch_pending_async.pop_front(); + } + + explicit LingerOp(Objecter *o) : linger_id(0), + target(object_t(), object_locator_t(), 0), + snap(CEPH_NOSNAP), poutbl(NULL), pobjver(NULL), + is_watch(false), last_error(0), + register_gen(0), + registered(false), + canceled(false), + on_reg_commit(NULL), + on_notify_finish(NULL), + notify_result_bl(NULL), + notify_id(0), + watch_context(NULL), + session(NULL), + objecter(o), + ctx_budget(-1), + register_tid(0), + ping_tid(0), + map_dne_bound(0) {} + + const LingerOp &operator=(const LingerOp& r) = delete; + LingerOp(const LingerOp& o) = delete; + + uint64_t get_cookie() { + return reinterpret_cast<uint64_t>(this); + } + + private: + ~LingerOp() override { + delete watch_context; + } + }; + + struct C_Linger_Commit : public Context { + Objecter *objecter; + LingerOp *info; + bufferlist outbl; // used for notify only + C_Linger_Commit(Objecter *o, LingerOp *l) : objecter(o), info(l) { + info->get(); + } + ~C_Linger_Commit() override { + info->put(); + } + void finish(int r) override { + objecter->_linger_commit(info, r, outbl); + } + }; + + struct C_Linger_Reconnect : public Context { + Objecter *objecter; + LingerOp *info; + C_Linger_Reconnect(Objecter *o, LingerOp *l) : objecter(o), info(l) { + info->get(); + } + ~C_Linger_Reconnect() override { + 
      info->put();
    }
    void finish(int r) override {
      objecter->_linger_reconnect(info, r);
    }
  };

  // Completion for a linger (watch) ping.  Forwards the result, the send
  // timestamp and the registration generation to _linger_ping.  Holds a
  // ref on the LingerOp for the lifetime of the context (get in ctor,
  // put in dtor).
  struct C_Linger_Ping : public Context {
    Objecter *objecter;
    LingerOp *info;
    // Time the ping was sent; presumably filled in by the submitter
    // before the op is issued -- confirm against _send_linger_ping.
    ceph::coarse_mono_time sent;
    // Snapshot of info->register_gen at construction, so a ping that
    // completes after a re-registration can be recognized as stale --
    // NOTE(review): confirm in _linger_ping.
    uint32_t register_gen;
    C_Linger_Ping(Objecter *o, LingerOp *l)
      : objecter(o), info(l), register_gen(info->register_gen) {
      info->get();
    }
    ~C_Linger_Ping() override {
      info->put();
    }
    void finish(int r) override {
      objecter->_linger_ping(info, r, sent, register_gen);
    }
  };

  // Completion for a "get latest osdmap version" check on behalf of a
  // linger op; finish() is defined out of line (in Objecter.cc).
  struct C_Linger_Map_Latest : public Context {
    Objecter *objecter;
    uint64_t linger_id;   // which linger op triggered the map check
    version_t latest;     // latest map version reported; starts at 0
    C_Linger_Map_Latest(Objecter *o, uint64_t id) :
      objecter(o), linger_id(id), latest(0) {}
    void finish(int r) override;
  };

  // -- osd sessions --

  // One OSD-issued backoff covering the hobject range [begin, end) of a
  // PG -- NOTE(review): range inclusivity not visible here, confirm
  // against MOSDBackoff handling.
  struct OSDBackoff {
    spg_t pgid;
    uint64_t id;
    hobject_t begin, end;
  };

  // Per-OSD state: the connection to that OSD plus every op currently
  // mapped to it.  Refcounted; the destructor is defined out of line.
  struct OSDSession : public RefCountedObject {
    // Guards the session's op/backoff maps below.
    std::shared_mutex lock;
    using lock_guard = std::lock_guard<decltype(lock)>;
    using unique_lock = std::unique_lock<decltype(lock)>;
    using shared_lock = boost::shared_lock<decltype(lock)>;
    using shunique_lock = ceph::shunique_lock<decltype(lock)>;

    // pending ops, keyed by tid (linger ops by linger_id)
    map<ceph_tid_t,Op*> ops;
    map<uint64_t, LingerOp*> linger_ops;
    map<ceph_tid_t,CommandOp*> command_ops;

    // backoffs received from this OSD, indexed by pg/object and by id
    map<spg_t,map<hobject_t,OSDBackoff>> backoffs;
    map<uint64_t,OSDBackoff*> backoffs_by_id;

    int osd;              // osd number; -1 == homeless (no target OSD)
    int incarnation;
    ConnectionRef con;
    // num_locks is declared before completion_locks so the array size is
    // initialized before the array itself is allocated.
    int num_locks;
    std::unique_ptr<std::mutex[]> completion_locks;
    using unique_completion_lock = std::unique_lock<
      decltype(completion_locks)::element_type>;


    OSDSession(CephContext *cct, int o) :
      osd(o), incarnation(0), con(NULL),
      num_locks(cct->_conf->objecter_completion_locks_per_session),
      completion_locks(new std::mutex[num_locks]) {}

    ~OSDSession() override;

    bool is_homeless() { return (osd == -1); }

    // Pick one of the num_locks completion locks for this object --
    // presumably selected by hashing the object name; confirm against
    // the out-of-line definition.
    unique_completion_lock get_lock(object_t& oid);
  };
  // All live sessions, keyed by osd number.
  map<int,OSDSession*> osd_sessions;

  bool
osdmap_full_flag() const; + bool osdmap_pool_full(const int64_t pool_id) const; + + private: + + /** + * Test pg_pool_t::FLAG_FULL on a pool + * + * @return true if the pool exists and has the flag set, or + * the global full flag is set, else false + */ + bool _osdmap_pool_full(const int64_t pool_id) const; + bool _osdmap_pool_full(const pg_pool_t &p) const; + void update_pool_full_map(map<int64_t, bool>& pool_full_map); + + map<uint64_t, LingerOp*> linger_ops; + // we use this just to confirm a cookie is valid before dereferencing the ptr + set<LingerOp*> linger_ops_set; + + map<ceph_tid_t,PoolStatOp*> poolstat_ops; + map<ceph_tid_t,StatfsOp*> statfs_ops; + map<ceph_tid_t,PoolOp*> pool_ops; + std::atomic<unsigned> num_homeless_ops{0}; + + OSDSession *homeless_session; + + // ops waiting for an osdmap with a new pool or confirmation that + // the pool does not exist (may be expanded to other uses later) + map<uint64_t, LingerOp*> check_latest_map_lingers; + map<ceph_tid_t, Op*> check_latest_map_ops; + map<ceph_tid_t, CommandOp*> check_latest_map_commands; + + map<epoch_t,list< pair<Context*, int> > > waiting_for_map; + + ceph::timespan mon_timeout; + ceph::timespan osd_timeout; + + MOSDOp *_prepare_osd_op(Op *op); + void _send_op(Op *op); + void _send_op_account(Op *op); + void _cancel_linger_op(Op *op); + void _finish_op(Op *op, int r); + static bool is_pg_changed( + int oldprimary, + const vector<int>& oldacting, + int newprimary, + const vector<int>& newacting, + bool any_change=false); + enum recalc_op_target_result { + RECALC_OP_TARGET_NO_ACTION = 0, + RECALC_OP_TARGET_NEED_RESEND, + RECALC_OP_TARGET_POOL_DNE, + RECALC_OP_TARGET_OSD_DNE, + RECALC_OP_TARGET_OSD_DOWN, + }; + bool _osdmap_full_flag() const; + bool _osdmap_has_pool_full() const; + void _prune_snapc( + const mempool::osdmap::map<int64_t, OSDMap::snap_interval_set_t>& new_removed_snaps, + Op *op); + + bool target_should_be_paused(op_target_t *op); + int _calc_target(op_target_t *t, Connection *con, 
+ bool any_change = false); + int _map_session(op_target_t *op, OSDSession **s, + shunique_lock& lc); + + void _session_op_assign(OSDSession *s, Op *op); + void _session_op_remove(OSDSession *s, Op *op); + void _session_linger_op_assign(OSDSession *to, LingerOp *op); + void _session_linger_op_remove(OSDSession *from, LingerOp *op); + void _session_command_op_assign(OSDSession *to, CommandOp *op); + void _session_command_op_remove(OSDSession *from, CommandOp *op); + + int _assign_op_target_session(Op *op, shunique_lock& lc, + bool src_session_locked, + bool dst_session_locked); + int _recalc_linger_op_target(LingerOp *op, shunique_lock& lc); + + void _linger_submit(LingerOp *info, shunique_lock& sul); + void _send_linger(LingerOp *info, shunique_lock& sul); + void _linger_commit(LingerOp *info, int r, bufferlist& outbl); + void _linger_reconnect(LingerOp *info, int r); + void _send_linger_ping(LingerOp *info); + void _linger_ping(LingerOp *info, int r, ceph::coarse_mono_time sent, + uint32_t register_gen); + int _normalize_watch_error(int r); + + friend class C_DoWatchError; +public: + void linger_callback_flush(Context *ctx) { + finisher->queue(ctx); + } + +private: + void _check_op_pool_dne(Op *op, unique_lock *sl); + void _send_op_map_check(Op *op); + void _op_cancel_map_check(Op *op); + void _check_linger_pool_dne(LingerOp *op, bool *need_unregister); + void _send_linger_map_check(LingerOp *op); + void _linger_cancel_map_check(LingerOp *op); + void _check_command_map_dne(CommandOp *op); + void _send_command_map_check(CommandOp *op); + void _command_cancel_map_check(CommandOp *op); + + void _kick_requests(OSDSession *session, map<uint64_t, LingerOp *>& lresend); + void _linger_ops_resend(map<uint64_t, LingerOp *>& lresend, unique_lock& ul); + + int _get_session(int osd, OSDSession **session, shunique_lock& sul); + void put_session(OSDSession *s); + void get_session(OSDSession *s); + void _reopen_session(OSDSession *session); + void close_session(OSDSession 
*session); + + void _nlist_reply(NListContext *list_context, int r, Context *final_finish, + epoch_t reply_epoch); + + void resend_mon_ops(); + + /** + * handle a budget for in-flight ops + * budget is taken whenever an op goes into the ops map + * and returned whenever an op is removed from the map + * If throttle_op needs to throttle it will unlock client_lock. + */ + int calc_op_budget(const vector<OSDOp>& ops); + void _throttle_op(Op *op, shunique_lock& sul, int op_size = 0); + int _take_op_budget(Op *op, shunique_lock& sul) { + ceph_assert(sul && sul.mutex() == &rwlock); + int op_budget = calc_op_budget(op->ops); + if (keep_balanced_budget) { + _throttle_op(op, sul, op_budget); + } else { // update take_linger_budget to match this! + op_throttle_bytes.take(op_budget); + op_throttle_ops.take(1); + } + op->budget = op_budget; + return op_budget; + } + int take_linger_budget(LingerOp *info); + friend class WatchContext; // to invoke put_up_budget_bytes + void put_op_budget_bytes(int op_budget) { + ceph_assert(op_budget >= 0); + op_throttle_bytes.put(op_budget); + op_throttle_ops.put(1); + } + void put_nlist_context_budget(NListContext *list_context); + Throttle op_throttle_bytes, op_throttle_ops; + + public: + Objecter(CephContext *cct_, Messenger *m, MonClient *mc, + Finisher *fin, + double mon_timeout, + double osd_timeout) : + Dispatcher(cct_), messenger(m), monc(mc), finisher(fin), + trace_endpoint("0.0.0.0", 0, "Objecter"), + osdmap(new OSDMap), + max_linger_id(0), + keep_balanced_budget(false), honor_osdmap_full(true), osdmap_full_try(false), + blacklist_events_enabled(false), + last_seen_osdmap_version(0), last_seen_pgmap_version(0), + logger(NULL), tick_event(0), m_request_state_hook(NULL), + homeless_session(new OSDSession(cct, -1)), + mon_timeout(ceph::make_timespan(mon_timeout)), + osd_timeout(ceph::make_timespan(osd_timeout)), + op_throttle_bytes(cct, "objecter_bytes", + cct->_conf->objecter_inflight_op_bytes), + op_throttle_ops(cct, "objecter_ops", 
cct->_conf->objecter_inflight_ops), + epoch_barrier(0), + retry_writes_after_first_reply(cct->_conf->objecter_retry_writes_after_first_reply) + { } + ~Objecter() override; + + void init(); + void start(const OSDMap *o = nullptr); + void shutdown(); + + // These two templates replace osdmap_(get)|(put)_read. Simply wrap + // whatever functionality you want to use the OSDMap in a lambda like: + // + // with_osdmap([](const OSDMap& o) { o.do_stuff(); }); + // + // or + // + // auto t = with_osdmap([&](const OSDMap& o) { return o.lookup_stuff(x); }); + // + // Do not call into something that will try to lock the OSDMap from + // here or you will have great woe and misery. + + template<typename Callback, typename...Args> + auto with_osdmap(Callback&& cb, Args&&... args) const -> + decltype(cb(*osdmap, std::forward<Args>(args)...)) { + shared_lock l(rwlock); + return std::forward<Callback>(cb)(*osdmap, std::forward<Args>(args)...); + } + + + /** + * Tell the objecter to throttle outgoing ops according to its + * budget (in _conf). If you do this, ops can block, in + * which case it will unlock client_lock and sleep until + * incoming messages reduce the used budget low enough for + * the ops to continue going; then it will lock client_lock again. 
+ */ + void set_balanced_budget() { keep_balanced_budget = true; } + void unset_balanced_budget() { keep_balanced_budget = false; } + + void set_honor_osdmap_full() { honor_osdmap_full = true; } + void unset_honor_osdmap_full() { honor_osdmap_full = false; } + + void set_osdmap_full_try() { osdmap_full_try = true; } + void unset_osdmap_full_try() { osdmap_full_try = false; } + + void _scan_requests( + OSDSession *s, + bool skipped_map, + bool cluster_full, + map<int64_t, bool> *pool_full_map, + map<ceph_tid_t, Op*>& need_resend, + list<LingerOp*>& need_resend_linger, + map<ceph_tid_t, CommandOp*>& need_resend_command, + shunique_lock& sul, + const mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps); + + int64_t get_object_hash_position(int64_t pool, const string& key, + const string& ns); + int64_t get_object_pg_hash_position(int64_t pool, const string& key, + const string& ns); + + // messages + public: + bool ms_dispatch(Message *m) override; + bool ms_can_fast_dispatch_any() const override { + return true; + } + bool ms_can_fast_dispatch(const Message *m) const override { + switch (m->get_type()) { + case CEPH_MSG_OSD_OPREPLY: + case CEPH_MSG_WATCH_NOTIFY: + return true; + default: + return false; + } + } + void ms_fast_dispatch(Message *m) override { + if (!ms_dispatch(m)) { + m->put(); + } + } + + void handle_osd_op_reply(class MOSDOpReply *m); + void handle_osd_backoff(class MOSDBackoff *m); + void handle_watch_notify(class MWatchNotify *m); + void handle_osd_map(class MOSDMap *m); + void wait_for_osd_map(); + + /** + * Get list of entities blacklisted since this was last called, + * and reset the list. + * + * Uses a std::set because typical use case is to compare some + * other list of clients to see which overlap with the blacklisted + * addrs. 
+ * + */ + void consume_blacklist_events(std::set<entity_addr_t> *events); + + int pool_snap_by_name(int64_t poolid, + const char *snap_name, + snapid_t *snap) const; + int pool_snap_get_info(int64_t poolid, snapid_t snap, + pool_snap_info_t *info) const; + int pool_snap_list(int64_t poolid, vector<uint64_t> *snaps); +private: + + void emit_blacklist_events(const OSDMap::Incremental &inc); + void emit_blacklist_events(const OSDMap &old_osd_map, + const OSDMap &new_osd_map); + + // low-level + void _op_submit(Op *op, shunique_lock& lc, ceph_tid_t *ptid); + void _op_submit_with_budget(Op *op, shunique_lock& lc, + ceph_tid_t *ptid, + int *ctx_budget = NULL); + // public interface +public: + void op_submit(Op *op, ceph_tid_t *ptid = NULL, int *ctx_budget = NULL); + bool is_active() { + shared_lock l(rwlock); + return !((!inflight_ops) && linger_ops.empty() && + poolstat_ops.empty() && statfs_ops.empty()); + } + + /** + * Output in-flight requests + */ + void _dump_active(OSDSession *s); + void _dump_active(); + void dump_active(); + void dump_requests(Formatter *fmt); + void _dump_ops(const OSDSession *s, Formatter *fmt); + void dump_ops(Formatter *fmt); + void _dump_linger_ops(const OSDSession *s, Formatter *fmt); + void dump_linger_ops(Formatter *fmt); + void _dump_command_ops(const OSDSession *s, Formatter *fmt); + void dump_command_ops(Formatter *fmt); + void dump_pool_ops(Formatter *fmt) const; + void dump_pool_stat_ops(Formatter *fmt) const; + void dump_statfs_ops(Formatter *fmt) const; + + int get_client_incarnation() const { return client_inc; } + void set_client_incarnation(int inc) { client_inc = inc; } + + bool have_map(epoch_t epoch); + /// wait for epoch; true if we already have it + bool wait_for_map(epoch_t epoch, Context *c, int err=0); + void _wait_for_new_map(Context *c, epoch_t epoch, int err=0); + void wait_for_latest_osdmap(Context *fin); + void get_latest_version(epoch_t oldest, epoch_t neweset, Context *fin); + + /** Get the current set of global 
op flags */ + int get_global_op_flags() const { return global_op_flags; } + /** Add a flag to the global op flags, not really atomic operation */ + void add_global_op_flags(int flag) { + global_op_flags.fetch_or(flag); + } + /** Clear the passed flags from the global op flag set */ + void clear_global_op_flag(int flags) { + global_op_flags.fetch_and(~flags); + } + + /// cancel an in-progress request with the given return code +private: + int op_cancel(OSDSession *s, ceph_tid_t tid, int r); + int _op_cancel(ceph_tid_t tid, int r); +public: + int op_cancel(ceph_tid_t tid, int r); + int op_cancel(const vector<ceph_tid_t>& tidls, int r); + + /** + * Any write op which is in progress at the start of this call shall no + * longer be in progress when this call ends. Operations started after the + * start of this call may still be in progress when this call ends. + * + * @return the latest possible epoch in which a cancelled op could have + * existed, or -1 if nothing was cancelled. + */ + epoch_t op_cancel_writes(int r, int64_t pool=-1); + + // commands + void osd_command(int osd, const std::vector<string>& cmd, + const bufferlist& inbl, ceph_tid_t *ptid, + bufferlist *poutbl, string *prs, Context *onfinish) { + ceph_assert(osd >= 0); + CommandOp *c = new CommandOp( + osd, + cmd, + inbl, + poutbl, + prs, + onfinish); + submit_command(c, ptid); + } + void pg_command(pg_t pgid, const vector<string>& cmd, + const bufferlist& inbl, ceph_tid_t *ptid, + bufferlist *poutbl, string *prs, Context *onfinish) { + CommandOp *c = new CommandOp( + pgid, + cmd, + inbl, + poutbl, + prs, + onfinish); + submit_command(c, ptid); + } + + // mid-level helpers + Op *prepare_mutate_op( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, const SnapContext& snapc, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + osd_reqid_t reqid = osd_reqid_t(), + ZTracer::Trace *parent_trace = nullptr) { + Op *o = new Op(oid, oloc, op.ops, flags | 
global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver, nullptr, parent_trace); + o->priority = op.priority; + o->mtime = mtime; + o->snapc = snapc; + o->out_rval.swap(op.out_rval); + o->reqid = reqid; + return o; + } + ceph_tid_t mutate( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, const SnapContext& snapc, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + osd_reqid_t reqid = osd_reqid_t()) { + Op *o = prepare_mutate_op(oid, oloc, op, snapc, mtime, flags, + oncommit, objver, reqid); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_read_op( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, + snapid_t snapid, bufferlist *pbl, int flags, + Context *onack, version_t *objver = NULL, + int *data_offset = NULL, + uint64_t features = 0, + ZTracer::Trace *parent_trace = nullptr) { + Op *o = new Op(oid, oloc, op.ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onack, objver, data_offset, parent_trace); + o->priority = op.priority; + o->snapid = snapid; + o->outbl = pbl; + if (!o->outbl && op.size() == 1 && op.out_bl[0]->length()) + o->outbl = op.out_bl[0]; + o->out_bl.swap(op.out_bl); + o->out_handler.swap(op.out_handler); + o->out_rval.swap(op.out_rval); + return o; + } + ceph_tid_t read( + const object_t& oid, const object_locator_t& oloc, + ObjectOperation& op, + snapid_t snapid, bufferlist *pbl, int flags, + Context *onack, version_t *objver = NULL, + int *data_offset = NULL, + uint64_t features = 0) { + Op *o = prepare_read_op(oid, oloc, op, snapid, pbl, flags, onack, objver, + data_offset); + if (features) + o->features = features; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_pg_read_op( + uint32_t hash, object_locator_t oloc, + ObjectOperation& op, bufferlist *pbl, int flags, + Context *onack, epoch_t *reply_epoch, + int *ctx_budget) { + Op *o = new Op(object_t(), oloc, + op.ops, + flags | global_op_flags | 
CEPH_OSD_FLAG_READ | + CEPH_OSD_FLAG_IGNORE_OVERLAY, + onack, NULL); + o->target.precalc_pgid = true; + o->target.base_pgid = pg_t(hash, oloc.pool); + o->priority = op.priority; + o->snapid = CEPH_NOSNAP; + o->outbl = pbl; + o->out_bl.swap(op.out_bl); + o->out_handler.swap(op.out_handler); + o->out_rval.swap(op.out_rval); + o->reply_epoch = reply_epoch; + if (ctx_budget) { + // budget is tracked by listing context + o->ctx_budgeted = true; + } + return o; + } + ceph_tid_t pg_read( + uint32_t hash, object_locator_t oloc, + ObjectOperation& op, bufferlist *pbl, int flags, + Context *onack, epoch_t *reply_epoch, + int *ctx_budget) { + Op *o = prepare_pg_read_op(hash, oloc, op, pbl, flags, + onack, reply_epoch, ctx_budget); + ceph_tid_t tid; + op_submit(o, &tid, ctx_budget); + return tid; + } + + // caller owns a ref + LingerOp *linger_register(const object_t& oid, const object_locator_t& oloc, + int flags); + ceph_tid_t linger_watch(LingerOp *info, + ObjectOperation& op, + const SnapContext& snapc, ceph::real_time mtime, + bufferlist& inbl, + Context *onfinish, + version_t *objver); + ceph_tid_t linger_notify(LingerOp *info, + ObjectOperation& op, + snapid_t snap, bufferlist& inbl, + bufferlist *poutbl, + Context *onack, + version_t *objver); + int linger_check(LingerOp *info); + void linger_cancel(LingerOp *info); // releases a reference + void _linger_cancel(LingerOp *info); + + void _do_watch_notify(LingerOp *info, MWatchNotify *m); + + /** + * set up initial ops in the op vector, and allocate a final op slot. + * + * The caller is responsible for filling in the final ops_count ops. 
+ * + * @param ops op vector + * @param ops_count number of final ops the caller will fill in + * @param extra_ops pointer to [array of] initial op[s] + * @return index of final op (for caller to fill in) + */ + int init_ops(vector<OSDOp>& ops, int ops_count, ObjectOperation *extra_ops) { + int i; + int extra = 0; + + if (extra_ops) + extra = extra_ops->ops.size(); + + ops.resize(ops_count + extra); + + for (i=0; i<extra; i++) { + ops[i] = extra_ops->ops[i]; + } + + return i; + } + + + // high-level helpers + Op *prepare_stat_op( + const object_t& oid, const object_locator_t& oloc, + snapid_t snap, uint64_t *psize, ceph::real_time *pmtime, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_STAT; + C_Stat *fin = new C_Stat(psize, pmtime, onfinish); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, fin, objver); + o->snapid = snap; + o->outbl = &fin->bl; + return o; + } + ceph_tid_t stat( + const object_t& oid, const object_locator_t& oloc, + snapid_t snap, uint64_t *psize, ceph::real_time *pmtime, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + Op *o = prepare_stat_op(oid, oloc, snap, psize, pmtime, flags, + onfinish, objver, extra_ops); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + Op *prepare_read_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0, + ZTracer::Trace *parent_trace = nullptr) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_READ; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].op.flags = op_flags; + Op 
*o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver, nullptr, parent_trace); + o->snapid = snap; + o->outbl = pbl; + return o; + } + ceph_tid_t read( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_read_op(oid, oloc, off, len, snap, pbl, flags, + onfinish, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + Op *prepare_cmpext_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, bufferlist &cmp_bl, + snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_CMPEXT; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = cmp_bl.length(); + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].indata = cmp_bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + return o; + } + + ceph_tid_t cmpext( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, bufferlist &cmp_bl, + snapid_t snap, int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_cmpext_op(oid, oloc, off, cmp_bl, snap, + flags, onfinish, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t read_trunc(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, + bufferlist *pbl, int flags, uint64_t trunc_size, + __u32 trunc_seq, Context *onfinish, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + 
int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_READ; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = trunc_size; + ops[i].op.extent.truncate_seq = trunc_seq; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t mapext(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, snapid_t snap, bufferlist *pbl, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_MAPEXT; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t getxattr(const object_t& oid, const object_locator_t& oloc, + const char *name, snapid_t snap, bufferlist *pbl, int flags, + Context *onfinish, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_GETXATTR; + ops[i].op.xattr.name_len = (name ? 
strlen(name) : 0); + ops[i].op.xattr.value_len = 0; + if (name) + ops[i].indata.append(name, ops[i].op.xattr.name_len); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver); + o->snapid = snap; + o->outbl = pbl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t getxattrs(const object_t& oid, const object_locator_t& oloc, + snapid_t snap, map<string,bufferlist>& attrset, + int flags, Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_GETXATTRS; + C_GetAttrs *fin = new C_GetAttrs(attrset, onfinish); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_READ, fin, objver); + o->snapid = snap; + o->outbl = &fin->bl; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t read_full(const object_t& oid, const object_locator_t& oloc, + snapid_t snap, bufferlist *pbl, int flags, + Context *onfinish, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + return read(oid, oloc, 0, 0, snap, pbl, flags | global_op_flags | + CEPH_OSD_FLAG_READ, onfinish, objver, extra_ops); + } + + + // writes + ceph_tid_t _modify(const object_t& oid, const object_locator_t& oloc, + vector<OSDOp>& ops, ceph::real_time mtime, + const SnapContext& snapc, int flags, + Context *oncommit, + version_t *objver = NULL) { + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_write_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0, + ZTracer::Trace *parent_trace = nullptr) { + vector<OSDOp> 
ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_WRITE; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].indata = bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver, + nullptr, parent_trace); + o->mtime = mtime; + o->snapc = snapc; + return o; + } + ceph_tid_t write( + const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_write_op(oid, oloc, off, len, snapc, bl, mtime, flags, + oncommit, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_append_op( + const object_t& oid, const object_locator_t& oloc, + uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_APPEND; + ops[i].op.extent.offset = 0; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = 0; + ops[i].op.extent.truncate_seq = 0; + ops[i].indata = bl; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + return o; + } + ceph_tid_t append( + const object_t& oid, const object_locator_t& oloc, + uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + Op *o = prepare_append_op(oid, oloc, len, snapc, bl, mtime, flags, + oncommit, objver, extra_ops); + ceph_tid_t tid; + op_submit(o, 
&tid); + return tid; + } + ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, int flags, + uint64_t trunc_size, __u32 trunc_seq, + Context *oncommit, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_WRITE; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + ops[i].op.extent.truncate_size = trunc_size; + ops[i].op.extent.truncate_seq = trunc_seq; + ops[i].indata = bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_write_full_op( + const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_WRITEFULL; + ops[i].op.extent.offset = 0; + ops[i].op.extent.length = bl.length(); + ops[i].indata = bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + return o; + } + ceph_tid_t write_full( + const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + Op *o = prepare_write_full_op(oid, oloc, snapc, bl, mtime, flags, + oncommit, objver, extra_ops, op_flags); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_writesame_op( + const 
object_t& oid, const object_locator_t& oloc, + uint64_t write_len, uint64_t off, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_WRITESAME; + ops[i].op.writesame.offset = off; + ops[i].op.writesame.length = write_len; + ops[i].op.writesame.data_length = bl.length(); + ops[i].indata = bl; + ops[i].op.flags = op_flags; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + return o; + } + ceph_tid_t writesame( + const object_t& oid, const object_locator_t& oloc, + uint64_t write_len, uint64_t off, + const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, int flags, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL, int op_flags = 0) { + + Op *o = prepare_writesame_op(oid, oloc, write_len, off, snapc, bl, + mtime, flags, oncommit, objver, + extra_ops, op_flags); + + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t trunc(const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, ceph::real_time mtime, int flags, + uint64_t trunc_size, __u32 trunc_seq, + Context *oncommit, version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_TRUNCATE; + ops[i].op.extent.offset = trunc_size; + ops[i].op.extent.truncate_size = trunc_size; + ops[i].op.extent.truncate_seq = trunc_seq; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t zero(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, const 
SnapContext& snapc, + ceph::real_time mtime, int flags, Context *oncommit, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_ZERO; + ops[i].op.extent.offset = off; + ops[i].op.extent.length = len; + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t rollback_object(const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, snapid_t snapid, + ceph::real_time mtime, Context *oncommit, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_ROLLBACK; + ops[i].op.snap.snapid = snapid; + Op *o = new Op(oid, oloc, ops, CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t create(const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, ceph::real_time mtime, int global_flags, + int create_flags, Context *oncommit, + version_t *objver = NULL, + ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_CREATE; + ops[i].op.flags = create_flags; + Op *o = new Op(oid, oloc, ops, global_flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + Op *prepare_remove_op( + const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_DELETE; + Op *o = new Op(oid, oloc, ops, flags | 
global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + return o; + } + ceph_tid_t remove( + const object_t& oid, const object_locator_t& oloc, + const SnapContext& snapc, ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + Op *o = prepare_remove_op(oid, oloc, snapc, mtime, flags, + oncommit, objver, extra_ops); + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + ceph_tid_t setxattr(const object_t& oid, const object_locator_t& oloc, + const char *name, const SnapContext& snapc, const bufferlist &bl, + ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_SETXATTR; + ops[i].op.xattr.name_len = (name ? strlen(name) : 0); + ops[i].op.xattr.value_len = bl.length(); + if (name) + ops[i].indata.append(name, ops[i].op.xattr.name_len); + ops[i].indata.append(bl); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + ceph_tid_t removexattr(const object_t& oid, const object_locator_t& oloc, + const char *name, const SnapContext& snapc, + ceph::real_time mtime, int flags, + Context *oncommit, + version_t *objver = NULL, ObjectOperation *extra_ops = NULL) { + vector<OSDOp> ops; + int i = init_ops(ops, 1, extra_ops); + ops[i].op.op = CEPH_OSD_OP_RMXATTR; + ops[i].op.xattr.name_len = (name ? 
strlen(name) : 0); + ops[i].op.xattr.value_len = 0; + if (name) + ops[i].indata.append(name, ops[i].op.xattr.name_len); + Op *o = new Op(oid, oloc, ops, flags | global_op_flags | + CEPH_OSD_FLAG_WRITE, oncommit, objver); + o->mtime = mtime; + o->snapc = snapc; + ceph_tid_t tid; + op_submit(o, &tid); + return tid; + } + + void list_nobjects(NListContext *p, Context *onfinish); + uint32_t list_nobjects_seek(NListContext *p, uint32_t pos); + uint32_t list_nobjects_seek(NListContext *list_context, const hobject_t& c); + void list_nobjects_get_cursor(NListContext *list_context, hobject_t *c); + + hobject_t enumerate_objects_begin(); + hobject_t enumerate_objects_end(); + //hobject_t enumerate_objects_begin(int n, int m); + void enumerate_objects( + int64_t pool_id, + const std::string &ns, + const hobject_t &start, + const hobject_t &end, + const uint32_t max, + const bufferlist &filter_bl, + std::list<librados::ListObjectImpl> *result, + hobject_t *next, + Context *on_finish); + + void _enumerate_reply( + bufferlist &bl, + int r, + const hobject_t &end, + const int64_t pool_id, + int budget, + epoch_t reply_epoch, + std::list<librados::ListObjectImpl> *result, + hobject_t *next, + Context *on_finish); + friend class C_EnumerateReply; + + // ------------------------- + // pool ops +private: + void pool_op_submit(PoolOp *op); + void _pool_op_submit(PoolOp *op); + void _finish_pool_op(PoolOp *op, int r); + void _do_delete_pool(int64_t pool, Context *onfinish); +public: + int create_pool_snap(int64_t pool, string& snapName, Context *onfinish); + int allocate_selfmanaged_snap(int64_t pool, snapid_t *psnapid, + Context *onfinish); + int delete_pool_snap(int64_t pool, string& snapName, Context *onfinish); + int delete_selfmanaged_snap(int64_t pool, snapid_t snap, Context *onfinish); + + int create_pool(string& name, Context *onfinish, + int crush_rule=-1); + int delete_pool(int64_t pool, Context *onfinish); + int delete_pool(const string& name, Context *onfinish); + + void 
handle_pool_op_reply(MPoolOpReply *m); + int pool_op_cancel(ceph_tid_t tid, int r); + + // -------------------------- + // pool stats +private: + void _poolstat_submit(PoolStatOp *op); +public: + void handle_get_pool_stats_reply(MGetPoolStatsReply *m); + void get_pool_stats(list<string>& pools, map<string,pool_stat_t> *result, + bool *per_pool, + Context *onfinish); + int pool_stat_op_cancel(ceph_tid_t tid, int r); + void _finish_pool_stat_op(PoolStatOp *op, int r); + + // --------------------------- + // df stats +private: + void _fs_stats_submit(StatfsOp *op); +public: + void handle_fs_stats_reply(MStatfsReply *m); + void get_fs_stats(struct ceph_statfs& result, boost::optional<int64_t> poolid, + Context *onfinish); + int statfs_op_cancel(ceph_tid_t tid, int r); + void _finish_statfs_op(StatfsOp *op, int r); + + // --------------------------- + // some scatter/gather hackery + + void _sg_read_finish(vector<ObjectExtent>& extents, + vector<bufferlist>& resultbl, + bufferlist *bl, Context *onfinish); + + struct C_SGRead : public Context { + Objecter *objecter; + vector<ObjectExtent> extents; + vector<bufferlist> resultbl; + bufferlist *bl; + Context *onfinish; + C_SGRead(Objecter *ob, + vector<ObjectExtent>& e, vector<bufferlist>& r, bufferlist *b, + Context *c) : + objecter(ob), bl(b), onfinish(c) { + extents.swap(e); + resultbl.swap(r); + } + void finish(int r) override { + objecter->_sg_read_finish(extents, resultbl, bl, onfinish); + } + }; + + void sg_read_trunc(vector<ObjectExtent>& extents, snapid_t snap, + bufferlist *bl, int flags, uint64_t trunc_size, + __u32 trunc_seq, Context *onfinish, int op_flags = 0) { + if (extents.size() == 1) { + read_trunc(extents[0].oid, extents[0].oloc, extents[0].offset, + extents[0].length, snap, bl, flags, extents[0].truncate_size, + trunc_seq, onfinish, 0, 0, op_flags); + } else { + C_GatherBuilder gather(cct); + vector<bufferlist> resultbl(extents.size()); + int i=0; + for (vector<ObjectExtent>::iterator p = 
extents.begin(); + p != extents.end(); + ++p) { + read_trunc(p->oid, p->oloc, p->offset, p->length, snap, &resultbl[i++], + flags, p->truncate_size, trunc_seq, gather.new_sub(), + 0, 0, op_flags); + } + gather.set_finisher(new C_SGRead(this, extents, resultbl, bl, onfinish)); + gather.activate(); + } + } + + void sg_read(vector<ObjectExtent>& extents, snapid_t snap, bufferlist *bl, + int flags, Context *onfinish, int op_flags = 0) { + sg_read_trunc(extents, snap, bl, flags, 0, 0, onfinish, op_flags); + } + + void sg_write_trunc(vector<ObjectExtent>& extents, const SnapContext& snapc, + const bufferlist& bl, ceph::real_time mtime, int flags, + uint64_t trunc_size, __u32 trunc_seq, + Context *oncommit, int op_flags = 0) { + if (extents.size() == 1) { + write_trunc(extents[0].oid, extents[0].oloc, extents[0].offset, + extents[0].length, snapc, bl, mtime, flags, + extents[0].truncate_size, trunc_seq, oncommit, + 0, 0, op_flags); + } else { + C_GatherBuilder gcom(cct, oncommit); + for (vector<ObjectExtent>::iterator p = extents.begin(); + p != extents.end(); + ++p) { + bufferlist cur; + for (vector<pair<uint64_t,uint64_t> >::iterator bit + = p->buffer_extents.begin(); + bit != p->buffer_extents.end(); + ++bit) + bl.copy(bit->first, bit->second, cur); + ceph_assert(cur.length() == p->length); + write_trunc(p->oid, p->oloc, p->offset, p->length, + snapc, cur, mtime, flags, p->truncate_size, trunc_seq, + oncommit ? 
gcom.new_sub():0, + 0, 0, op_flags); + } + gcom.activate(); + } + } + + void sg_write(vector<ObjectExtent>& extents, const SnapContext& snapc, + const bufferlist& bl, ceph::real_time mtime, int flags, + Context *oncommit, int op_flags = 0) { + sg_write_trunc(extents, snapc, bl, mtime, flags, 0, 0, oncommit, + op_flags); + } + + void ms_handle_connect(Connection *con) override; + bool ms_handle_reset(Connection *con) override; + void ms_handle_remote_reset(Connection *con) override; + bool ms_handle_refused(Connection *con) override; + bool ms_get_authorizer(int dest_type, + AuthAuthorizer **authorizer) override; + + void blacklist_self(bool set); + +private: + epoch_t epoch_barrier; + bool retry_writes_after_first_reply; +public: + void set_epoch_barrier(epoch_t epoch); + + PerfCounters *get_logger() { + return logger; + } +}; + +#endif diff --git a/src/osdc/Striper.cc b/src/osdc/Striper.cc new file mode 100644 index 00000000..3286b012 --- /dev/null +++ b/src/osdc/Striper.cc @@ -0,0 +1,411 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Inktank + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "Striper.h" + +#include "include/types.h" +#include "include/buffer.h" +#include "osd/OSDMap.h" + +#include "common/config.h" +#include "common/debug.h" + +#define dout_subsys ceph_subsys_striper +#undef dout_prefix +#define dout_prefix *_dout << "striper " + + +void Striper::file_to_extents(CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + vector<ObjectExtent>& extents, + uint64_t buffer_offset) +{ + map<object_t,vector<ObjectExtent> > object_extents; + file_to_extents(cct, object_format, layout, offset, len, trunc_size, + object_extents, buffer_offset); + assimilate_extents(object_extents, extents); +} + +void Striper::file_to_extents( + CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + map<object_t,vector<ObjectExtent> >& object_extents, + uint64_t buffer_offset) +{ + ldout(cct, 10) << "file_to_extents " << offset << "~" << len + << " format " << object_format + << dendl; + ceph_assert(len > 0); + + /* + * we want only one extent per object! this means that each extent + * we read may map into different bits of the final read + * buffer.. 
hence ObjectExtent.buffer_extents + */ + + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + if (stripe_count == 1) { + ldout(cct, 20) << " sc is one, reset su to os" << dendl; + su = object_size; + } + uint64_t stripes_per_object = object_size / su; + ldout(cct, 20) << " su " << su << " sc " << stripe_count << " os " + << object_size << " stripes_per_object " << stripes_per_object + << dendl; + + uint64_t cur = offset; + uint64_t left = len; + while (left > 0) { + // layout into objects + uint64_t blockno = cur / su; // which block + // which horizontal stripe (Y) + uint64_t stripeno = blockno / stripe_count; + // which object in the object set (X) + uint64_t stripepos = blockno % stripe_count; + // which object set + uint64_t objectsetno = stripeno / stripes_per_object; + // object id + uint64_t objectno = objectsetno * stripe_count + stripepos; + + // find oid, extent + char buf[strlen(object_format) + 32]; + snprintf(buf, sizeof(buf), object_format, (long long unsigned)objectno); + object_t oid = buf; + + // map range into object + uint64_t block_start = (stripeno % stripes_per_object) * su; + uint64_t block_off = cur % su; + uint64_t max = su - block_off; + + uint64_t x_offset = block_start + block_off; + uint64_t x_len; + if (left > max) + x_len = max; + else + x_len = left; + + ldout(cct, 20) << " off " << cur << " blockno " << blockno << " stripeno " + << stripeno << " stripepos " << stripepos << " objectsetno " + << objectsetno << " objectno " << objectno + << " block_start " << block_start << " block_off " + << block_off << " " << x_offset << "~" << x_len + << dendl; + + ObjectExtent *ex = 0; + vector<ObjectExtent>& exv = object_extents[oid]; + if (exv.empty() || exv.back().offset + exv.back().length != x_offset) { + exv.resize(exv.size() + 1); + ex = &exv.back(); + ex->oid = oid; + ex->objectno = objectno; + ex->oloc = 
OSDMap::file_to_object_locator(*layout); + + ex->offset = x_offset; + ex->length = x_len; + ex->truncate_size = object_truncate_size(cct, layout, objectno, + trunc_size); + + ldout(cct, 20) << " added new " << *ex << dendl; + } else { + // add to extent + ex = &exv.back(); + ldout(cct, 20) << " adding in to " << *ex << dendl; + ex->length += x_len; + } + ex->buffer_extents.push_back(make_pair(cur - offset + buffer_offset, + x_len)); + + ldout(cct, 15) << "file_to_extents " << *ex << " in " << ex->oloc + << dendl; + // ldout(cct, 0) << "map: ino " << ino << " oid " << ex.oid << " osd " + // << ex.osd << " offset " << ex.offset << " len " << ex.len + // << " ... left " << left << dendl; + + left -= x_len; + cur += x_len; + } +} + +void Striper::assimilate_extents( + map<object_t,vector<ObjectExtent> >& object_extents, + vector<ObjectExtent>& extents) +{ + // make final list + for (map<object_t, vector<ObjectExtent> >::iterator it + = object_extents.begin(); + it != object_extents.end(); + ++it) { + for (vector<ObjectExtent>::iterator p = it->second.begin(); + p != it->second.end(); + ++p) { + extents.push_back(*p); + } + } +} + +void Striper::extent_to_file(CephContext *cct, file_layout_t *layout, + uint64_t objectno, uint64_t off, uint64_t len, + vector<pair<uint64_t, uint64_t> >& extents) +{ + ldout(cct, 10) << "extent_to_file " << objectno << " " << off << "~" + << len << dendl; + + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + uint64_t stripes_per_object = object_size / su; + ldout(cct, 20) << " stripes_per_object " << stripes_per_object << dendl; + + uint64_t off_in_block = off % su; + + extents.reserve(len / su + 1); + + while (len > 0) { + uint64_t stripepos = objectno % stripe_count; + uint64_t objectsetno = objectno / stripe_count; + uint64_t stripeno = off / su + objectsetno * stripes_per_object; + uint64_t blockno = stripeno * stripe_count + 
stripepos; + uint64_t extent_off = blockno * su + off_in_block; + uint64_t extent_len = std::min(len, su - off_in_block); + extents.push_back(make_pair(extent_off, extent_len)); + + ldout(cct, 20) << " object " << off << "~" << extent_len + << " -> file " << extent_off << "~" << extent_len + << dendl; + + off_in_block = 0; + off += extent_len; + len -= extent_len; + } +} + +uint64_t Striper::object_truncate_size(CephContext *cct, + const file_layout_t *layout, + uint64_t objectno, uint64_t trunc_size) +{ + uint64_t obj_trunc_size; + if (trunc_size == 0 || trunc_size == (uint64_t)-1) { + obj_trunc_size = trunc_size; + } else { + __u32 object_size = layout->object_size; + __u32 su = layout->stripe_unit; + __u32 stripe_count = layout->stripe_count; + ceph_assert(object_size >= su); + uint64_t stripes_per_object = object_size / su; + + uint64_t objectsetno = objectno / stripe_count; + uint64_t trunc_objectsetno = trunc_size / object_size / stripe_count; + if (objectsetno > trunc_objectsetno) + obj_trunc_size = 0; + else if (objectsetno < trunc_objectsetno) + obj_trunc_size = object_size; + else { + uint64_t trunc_blockno = trunc_size / su; + uint64_t trunc_stripeno = trunc_blockno / stripe_count; + uint64_t trunc_stripepos = trunc_blockno % stripe_count; + uint64_t trunc_objectno = trunc_objectsetno * stripe_count + + trunc_stripepos; + if (objectno < trunc_objectno) + obj_trunc_size = ((trunc_stripeno % stripes_per_object) + 1) * su; + else if (objectno > trunc_objectno) + obj_trunc_size = (trunc_stripeno % stripes_per_object) * su; + else + obj_trunc_size = (trunc_stripeno % stripes_per_object) * su + + (trunc_size % su); + } + } + ldout(cct, 20) << "object_truncate_size " << objectno << " " + << trunc_size << "->" << obj_trunc_size << dendl; + return obj_trunc_size; +} + +uint64_t Striper::get_num_objects(const file_layout_t& layout, + uint64_t size) +{ + __u32 stripe_unit = layout.stripe_unit; + __u32 stripe_count = layout.stripe_count; + uint64_t period = 
layout.get_period(); + uint64_t num_periods = (size + period - 1) / period; + uint64_t remainder_bytes = size % period; + uint64_t remainder_objs = 0; + if ((remainder_bytes > 0) && (remainder_bytes < (uint64_t)stripe_count + * stripe_unit)) + remainder_objs = stripe_count - ((remainder_bytes + stripe_unit - 1) + / stripe_unit); + return num_periods * stripe_count - remainder_objs; +} + +// StripedReadResult + +void Striper::StripedReadResult::add_partial_result( + CephContext *cct, bufferlist& bl, + const vector<pair<uint64_t,uint64_t> >& buffer_extents) +{ + ldout(cct, 10) << "add_partial_result(" << this << ") " << bl.length() + << " to " << buffer_extents << dendl; + for (vector<pair<uint64_t,uint64_t> >::const_iterator p + = buffer_extents.begin(); + p != buffer_extents.end(); + ++p) { + pair<bufferlist, uint64_t>& r = partial[p->first]; + size_t actual = std::min<uint64_t>(bl.length(), p->second); + bl.splice(0, actual, &r.first); + r.second = p->second; + total_intended_len += r.second; + } +} + +void Striper::StripedReadResult::add_partial_sparse_result( + CephContext *cct, bufferlist& bl, const map<uint64_t, uint64_t>& bl_map, + uint64_t bl_off, const vector<pair<uint64_t,uint64_t> >& buffer_extents) +{ + ldout(cct, 10) << "add_partial_sparse_result(" << this << ") " << bl.length() + << " covering " << bl_map << " (offset " << bl_off << ")" + << " to " << buffer_extents << dendl; + map<uint64_t, uint64_t>::const_iterator s = bl_map.begin(); + for (vector<pair<uint64_t,uint64_t> >::const_iterator p + = buffer_extents.begin(); + p != buffer_extents.end(); + ++p) { + uint64_t tofs = p->first; + size_t tlen = p->second; + ldout(cct, 30) << " be " << tofs << "~" << tlen << dendl; + while (tlen > 0) { + ldout(cct, 20) << " t " << tofs << "~" << tlen + << " bl has " << bl.length() + << " off " << bl_off + << dendl; + if (s == bl_map.end()) { + ldout(cct, 20) << " s at end" << dendl; + pair<bufferlist, uint64_t>& r = partial[tofs]; + r.second = tlen; + 
total_intended_len += r.second; + break; + } + + ldout(cct, 30) << " s " << s->first << "~" << s->second << dendl; + + // skip zero-length extent + if (s->second == 0) { + ldout(cct, 30) << " s len 0, skipping" << dendl; + ++s; + continue; + } + + if (s->first > bl_off) { + // gap in sparse read result + pair<bufferlist, uint64_t>& r = partial[tofs]; + size_t gap = std::min<size_t>(s->first - bl_off, tlen); + ldout(cct, 20) << " s gap " << gap << ", skipping" << dendl; + r.second = gap; + total_intended_len += r.second; + bl_off += gap; + tofs += gap; + tlen -= gap; + if (tlen == 0) { + continue; + } + } + + ceph_assert(s->first <= bl_off); + size_t left = (s->first + s->second) - bl_off; + size_t actual = std::min(left, tlen); + + if (actual > 0) { + ldout(cct, 20) << " s has " << actual << ", copying" << dendl; + pair<bufferlist, uint64_t>& r = partial[tofs]; + bl.splice(0, actual, &r.first); + r.second = actual; + total_intended_len += r.second; + bl_off += actual; + tofs += actual; + tlen -= actual; + } + if (actual == left) { + ldout(cct, 30) << " s advancing" << dendl; + ++s; + } + } + } +} + +void Striper::StripedReadResult::assemble_result(CephContext *cct, + bufferlist& bl, + bool zero_tail) +{ + ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail + << dendl; + size_t zeros = 0; // zeros preceding current position + for (auto& p : partial) { + size_t got = p.second.first.length(); + size_t expect = p.second.second; + if (got) { + if (zeros) { + bl.append_zero(zeros); + zeros = 0; + } + bl.claim_append(p.second.first); + } + zeros += expect - got; + } + if (zero_tail && zeros) { + bl.append_zero(zeros); + } + partial.clear(); +} + +void Striper::StripedReadResult::assemble_result(CephContext *cct, char *buffer, size_t length) +{ + + ceph_assert(buffer && length == total_intended_len); + + map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin(); + if (p == partial.rend()) + return; + + uint64_t curr = length; + 
uint64_t end = p->first + p->second.second; + while (p != partial.rend()) { + // sanity check + ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second + << " " << p->second.first.length() << " bytes" + << dendl; + ceph_assert(p->first == end - p->second.second); + end = p->first; + + size_t len = p->second.first.length(); + ceph_assert(curr >= p->second.second); + curr -= p->second.second; + if (len < p->second.second) { + if (len) + p->second.first.copy(0, len, buffer + curr); + // FIPS zeroization audit 20191117: this memset is not security related. + memset(buffer + curr + len, 0, p->second.second - len); + } else { + p->second.first.copy(0, len, buffer + curr); + } + ++p; + } + partial.clear(); + ceph_assert(curr == 0); +} + diff --git a/src/osdc/Striper.h b/src/osdc/Striper.h new file mode 100644 index 00000000..6d110e95 --- /dev/null +++ b/src/osdc/Striper.h @@ -0,0 +1,113 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_STRIPER_H +#define CEPH_STRIPER_H + +#include "include/types.h" +#include "osd/osd_types.h" + +class CephContext; + +//namespace ceph { + + class Striper { + public: + /* + * map (ino, layout, offset, len) to a (list of) ObjectExtents (byte + * ranges in objects on (primary) osds) + */ + static void file_to_extents(CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + map<object_t, vector<ObjectExtent> >& extents, + uint64_t buffer_offset=0); + + static void file_to_extents(CephContext *cct, const char *object_format, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + vector<ObjectExtent>& extents, + uint64_t buffer_offset=0); + + static void file_to_extents(CephContext *cct, inodeno_t ino, + const file_layout_t *layout, + uint64_t offset, uint64_t len, + uint64_t trunc_size, + vector<ObjectExtent>& extents) { + // generate prefix/format + char buf[32]; + snprintf(buf, sizeof(buf), "%llx.%%08llx", (long long unsigned)ino); + + file_to_extents(cct, buf, layout, offset, len, trunc_size, extents); + } + + static void assimilate_extents( + map<object_t, vector<ObjectExtent> >& object_extents, + vector<ObjectExtent>& extents); + + /** + * reverse map an object extent to file extents + */ + static void extent_to_file(CephContext *cct, file_layout_t *layout, + uint64_t objectno, uint64_t off, uint64_t len, + vector<pair<uint64_t, uint64_t> >& extents); + + static uint64_t object_truncate_size( + CephContext *cct, const file_layout_t *layout, + uint64_t objectno, uint64_t trunc_size); + + static uint64_t get_num_objects(const file_layout_t& layout, + uint64_t size); + /* + * helper to assemble a striped result + */ + class StripedReadResult { + // offset -> (data, intended length) + map<uint64_t, pair<bufferlist, uint64_t> > partial; + uint64_t total_intended_len = 0; //sum of partial.second.second + + public: + void 
add_partial_result( + CephContext *cct, bufferlist& bl, + const vector<pair<uint64_t,uint64_t> >& buffer_extents); + /** + * add sparse read into results + * + * @param bl buffer + * @param bl_map map of which logical source extents this covers + * @param bl_off logical buffer offset (e.g., first bl_map key + * if the buffer is not sparse) + * @param buffer_extents output buffer extents the data maps to + */ + void add_partial_sparse_result( + CephContext *cct, bufferlist& bl, + const map<uint64_t, uint64_t>& bl_map, uint64_t bl_off, + const vector<pair<uint64_t,uint64_t> >& buffer_extents); + + void assemble_result(CephContext *cct, bufferlist& bl, bool zero_tail); + + /** + * @buffer copy read data into buffer + * @len the length of buffer + */ + void assemble_result(CephContext *cct, char *buffer, size_t len); + }; + + }; + +//}; + +#endif diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h new file mode 100644 index 00000000..ef3b7f6e --- /dev/null +++ b/src/osdc/WritebackHandler.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OSDC_WRITEBACKHANDLER_H +#define CEPH_OSDC_WRITEBACKHANDLER_H + +#include "include/Context.h" +#include "include/types.h" +#include "common/zipkin_trace.h" +#include "osd/osd_types.h" + +class WritebackHandler { + public: + WritebackHandler() {} + virtual ~WritebackHandler() {} + + virtual void read(const object_t& oid, uint64_t object_no, + const object_locator_t& oloc, uint64_t off, uint64_t len, + snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, Context *onfinish) = 0; + /** + * check if a given extent read result may change due to a write + * + * Check if the content we see at the given read offset may change + * due to a write to this object. 
+ * + * @param oid object + * @param read_off read offset + * @param read_len read length + * @param snapid read snapid + */ + virtual bool may_copy_on_write(const object_t& oid, uint64_t read_off, + uint64_t read_len, snapid_t snapid) = 0; + virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + uint64_t off, uint64_t len, + const SnapContext& snapc, + const bufferlist &bl, ceph::real_time mtime, + uint64_t trunc_size, __u32 trunc_seq, + ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) = 0; + + virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len, + ceph_tid_t original_journal_tid, + ceph_tid_t new_journal_tid) {} + + virtual bool can_scattered_write() { return false; } + virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, + vector<pair<uint64_t, bufferlist> >& io_vec, + const SnapContext& snapc, ceph::real_time mtime, + uint64_t trunc_size, __u32 trunc_seq, + Context *oncommit) { + return 0; + } +}; + +#endif |